Consistent use of context to specify number of threads. (#8733)

- Use context in all tests. - Use context in R. - Use context in C API DMatrix initialization. (0 threads is used as dft).
2023-01-30 15:25:31 +08:00
parent 21a28f2cc5
commit 3760cede0f
24 changed files with 212 additions and 152 deletions
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1,4 +1,6 @@
-// Copyright (c) 2014-2022 by Contributors
+/**
+ * Copyright 2014-2023 by XGBoost Contributors
+ */
 #include <rabit/c_api.h>

 #include <cstring>
@@ -279,7 +281,7 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy
  auto jconfig = Json::Load(StringView{config});
  auto missing = GetMissing(jconfig);
  std::string cache = RequiredArg<String>(jconfig, "cache_prefix", __func__);
-  auto n_threads = OptionalArg<Integer, int64_t>(jconfig, "nthread", common::OmpGetNumThreads(0));
+  auto n_threads = OptionalArg<Integer, int64_t>(jconfig, "nthread", 0);

  xgboost_CHECK_C_ARG_PTR(next);
  xgboost_CHECK_C_ARG_PTR(reset);
@@ -319,7 +321,7 @@ XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHand
  xgboost_CHECK_C_ARG_PTR(config);
  auto jconfig = Json::Load(StringView{config});
  auto missing = GetMissing(jconfig);
-  auto n_threads = OptionalArg<Integer, int64_t>(jconfig, "nthread", common::OmpGetNumThreads(0));
+  auto n_threads = OptionalArg<Integer, int64_t>(jconfig, "nthread", 0);
  auto max_bin = OptionalArg<Integer, int64_t>(jconfig, "max_bin", 256);

  xgboost_CHECK_C_ARG_PTR(next);
@@ -420,7 +422,7 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char
  xgboost_CHECK_C_ARG_PTR(c_json_config);
  auto config = Json::Load(StringView{c_json_config});
  float missing = GetMissing(config);
-  auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", common::OmpGetNumThreads(0));
+  auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", 0);
  xgboost_CHECK_C_ARG_PTR(out);
  *out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
  API_END();
@@ -435,10 +437,9 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data,
  xgboost_CHECK_C_ARG_PTR(c_json_config);
  auto config = Json::Load(StringView{c_json_config});
  float missing = GetMissing(config);
-  auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", common::OmpGetNumThreads(0));
+  auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", 0);
  xgboost_CHECK_C_ARG_PTR(out);
-  *out =
-      new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
+  *out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
  API_END();
 }

@@ -506,8 +507,7 @@ XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char c
  auto jconfig = Json::Load(StringView{config});
  auto missing = GetMissing(jconfig);
  auto n_batches = RequiredArg<Integer>(jconfig, "nbatch", __func__);
-  auto n_threads =
-      OptionalArg<Integer, std::int64_t>(jconfig, "nthread", common::OmpGetNumThreads(0));
+  auto n_threads = OptionalArg<Integer, std::int64_t>(jconfig, "nthread", 0);
  data::RecordBatchesIterAdapter adapter(next, n_batches);
  xgboost_CHECK_C_ARG_PTR(out);
  *out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@@ -1,4 +1,6 @@
-// Copyright (c) 2019-2022 by Contributors
+/**
+ * Copyright 2019-2023 by XGBoost Contributors
+ */
 #include "../common/threading_utils.h"
 #include "../data/device_adapter.cuh"
 #include "../data/proxy_dmatrix.h"
@@ -68,8 +70,7 @@ XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *data,
  auto config = Json::Load(StringView{c_json_config});

  float missing = GetMissing(config);
-  auto n_threads =
-      OptionalArg<Integer, std::int64_t>(config, "nthread", common::OmpGetNumThreads(0));
+  auto n_threads = OptionalArg<Integer, std::int64_t>(config, "nthread", 0);
  data::CudfAdapter adapter(json_str);
  *out =
      new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
@@ -83,8 +84,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
  std::string json_str{data};
  auto config = Json::Load(StringView{c_json_config});
  float missing = GetMissing(config);
-  auto n_threads =
-      OptionalArg<Integer, std::int64_t>(config, "nthread", common::OmpGetNumThreads(0));
+  auto n_threads = OptionalArg<Integer, std::int64_t>(config, "nthread", 0);
  data::CupyAdapter adapter(json_str);
  *out =
      new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
--- a/src/common/threading_utils.cc
+++ b/src/common/threading_utils.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2023 by XGBoost Contributors
 */
 #include "threading_utils.h"

@@ -10,14 +10,6 @@

 namespace xgboost {
 namespace common {
-/**
- * \brief Get thread limit from CFS
- *
- * Modified from
- * github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
- *
- * MIT License: Copyright (c) 2016 Domagoj Šarić
- */
 int32_t GetCfsCPUCount() noexcept {
 #if defined(__linux__)
  // https://bugs.openjdk.java.net/browse/JDK-8146115
@@ -47,5 +39,20 @@ int32_t GetCfsCPUCount() noexcept {
 #endif  //  defined(__linux__)
  return -1;
 }
+
+std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
+  // Don't use parallel if we are in a parallel region.
+  if (omp_in_parallel()) {
+    return 1;
+  }
+  // If -1 or 0 is specified by the user, we default to maximum number of threads.
+  if (n_threads <= 0) {
+    n_threads = std::min(omp_get_num_procs(), omp_get_max_threads());
+  }
+  // Honor the openmp thread limit, which can be set via environment variable.
+  n_threads = std::min(n_threads, OmpGetThreadLimit());
+  n_threads = std::max(n_threads, 1);
+  return n_threads;
+}
 }  // namespace common
 }  // namespace xgboost
--- a/src/common/threading_utils.h
+++ b/src/common/threading_utils.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019-2022 by XGBoost Contributors
+/**
+ * Copyright 2019-2023 by XGBoost Contributors
 */
 #ifndef XGBOOST_COMMON_THREADING_UTILS_H_
 #define XGBOOST_COMMON_THREADING_UTILS_H_
@@ -231,23 +231,28 @@ void ParallelFor(Index size, int32_t n_threads, Func fn) {
  ParallelFor(size, n_threads, Sched::Static(), fn);
 }

-inline int32_t OmpGetThreadLimit() {
-  int32_t limit = omp_get_thread_limit();
+inline std::int32_t OmpGetThreadLimit() {
+  std::int32_t limit = omp_get_thread_limit();
  CHECK_GE(limit, 1) << "Invalid thread limit for OpenMP.";
  return limit;
 }

-int32_t GetCfsCPUCount() noexcept;
-
-inline int32_t OmpGetNumThreads(int32_t n_threads) {
-  if (n_threads <= 0) {
-    n_threads = std::min(omp_get_num_procs(), omp_get_max_threads());
-  }
-  n_threads = std::min(n_threads, OmpGetThreadLimit());
-  n_threads = std::max(n_threads, 1);
-  return n_threads;
-}
+/**
+ * \brief Get thread limit from CFS.
+ *
+ *   This function has non-trivial overhead and should not be called repeatly.
+ *
+ * Modified from
+ * github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
+ *
+ * MIT License: Copyright (c) 2016 Domagoj Šarić
+ */
+std::int32_t GetCfsCPUCount() noexcept;

+/**
+ * \brief Get the number of available threads based on n_threads specified by users.
+ */
+std::int32_t OmpGetNumThreads(std::int32_t n_threads);

 /*!
 * \brief A C-style array with in-stack allocation. As long as the array is smaller than
--- a/src/context.cc
+++ b/src/context.cc
@@ -53,9 +53,6 @@ void Context::ConfigureGpuId(bool require_gpu) {
 }

 std::int32_t Context::Threads() const {
-  if (omp_in_parallel()) {
-    return 1;
-  }
  auto n_threads = common::OmpGetNumThreads(nthread);
  if (cfs_cpu_count_ > 0) {
    n_threads = std::min(n_threads, cfs_cpu_count_);
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2015-2022 by XGBoost Contributors
+/**
+ * Copyright 2015-2023 by XGBoost Contributors
 * \file data.cc
 */
 #include "xgboost/data.h"
@@ -27,6 +27,7 @@
 #include "sparse_page_writer.h"
 #include "validation.h"
 #include "xgboost/c_api.h"
+#include "xgboost/context.h"
 #include "xgboost/host_device_vector.h"
 #include "xgboost/learner.h"
 #include "xgboost/logging.h"
@@ -850,7 +851,8 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
      std::unique_ptr<dmlc::Parser<uint32_t>> parser(
          dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str()));
      data::FileAdapter adapter(parser.get());
-      dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1, cache_file);
+      dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
+                             cache_file);
    } else {
      data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart),
                              file_format};