Define the new device parameter. (#9362)

2023-07-13 19:30:25 +08:00
parent 2d0cd2817e
commit 04aff3af8e
63 changed files with 827 additions and 477 deletions
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@@ -117,10 +117,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
                          RequiredArg<Integer>(config, "iteration_begin", __func__),
                          RequiredArg<Integer>(config, "iteration_end", __func__));
  CHECK(p_predt);
-  if (learner->Ctx()->IsCPU()) {
-    // Prediction using DMatrix as fallback.
-    CHECK(p_predt->HostCanRead() && !p_predt->DeviceCanRead());
-  } else {
+  if (learner->Ctx()->IsCUDA()) {
    CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
  }
  p_predt->SetDevice(proxy->DeviceIdx());
--- a/src/common/error_msg.cc
+++ b/src/common/error_msg.cc
@@ -3,23 +3,18 @@
 */
 #include "error_msg.h"

+#include "../collective/communicator-inl.h"  // for GetRank
 #include "xgboost/logging.h"

 namespace xgboost::error {
 void WarnDeprecatedGPUHist() {
-  bool static thread_local logged{false};
-  if (logged) {
-    return;
-  }
  auto msg =
      "The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` "
      R"(parameter to CUDA instead.

    E.g. tree_method = "hist", device = "CUDA"
-
 )";
  LOG(WARNING) << msg;
-  logged = true;
 }

 void WarnManualUpdater() {
@@ -33,4 +28,23 @@ void WarnManualUpdater() {
         "behavior. For common uses, we recommend using `tree_method` parameter instead.";
  logged = true;
 }
+
+// Warn that `gpu_id` is superseded by `device`; each thread emits the message at most once.
+void WarnDeprecatedGPUId() {
+  thread_local bool already_warned{false};
+  if (!already_warned) {
+    LOG(WARNING) << "`gpu_id` is deprecated in favor of the new `device` parameter: "
+                 << "device = cpu/cuda/cuda:0";
+    already_warned = true;
+  }
+}
+
+// Report an empty dataset together with this worker's rank, at most once per thread.
+void WarnEmptyDataset() {
+  thread_local bool already_warned{false};
+  if (!already_warned) {
+    LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
+    already_warned = true;
+  }
+}
 }  // namespace xgboost::error
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -82,5 +82,9 @@ inline void WarnOldSerialization() {
 void WarnDeprecatedGPUHist();

 void WarnManualUpdater();
+
+void WarnDeprecatedGPUId();
+
+void WarnEmptyDataset();
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
--- a/src/context.cc
+++ b/src/context.cc
@@ -3,53 +3,201 @@
 *
 * \brief Context object used for controlling runtime parameters.
 */
-#include <xgboost/context.h>
+#include "xgboost/context.h"

-#include "common/common.h"  // AssertGPUSupport
+#include <algorithm>  // for find_if
+#include <charconv>   // for from_chars
+#include <iterator>   // for distance
+#include <optional>   // for optional
+#include <regex>      // for regex_replace, regex_match
+
+#include "common/common.h"     // AssertGPUSupport
+#include "common/error_msg.h"  // WarnDeprecatedGPUId
 #include "common/threading_utils.h"
+#include "xgboost/string_view.h"

 namespace xgboost {

 DMLC_REGISTER_PARAMETER(Context);

-std::int32_t constexpr Context::kCpuId;
+bst_d_ordinal_t constexpr Context::kCpuId;
 std::int64_t constexpr Context::kDefaultSeed;

 Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}

-void Context::ConfigureGpuId(bool require_gpu) {
-#if defined(XGBOOST_USE_CUDA)
-  if (gpu_id == kCpuId) {  // 0. User didn't specify the `gpu_id'
-    if (require_gpu) {     // 1. `tree_method' or `predictor' or both are using
-                           // GPU.
-      // 2. Use device 0 as default.
-      this->UpdateAllowUnknown(Args{{"gpu_id", "0"}});
-    }
-  }
+namespace {
+inline constexpr char const* kDevice = "device";

-  // 3. When booster is loaded from a memory image (Python pickle or R
-  // raw model), number of available GPUs could be different.  Wrap around it.
-  int32_t n_gpus = common::AllVisibleGPUs();
-  if (n_gpus == 0) {
-    if (gpu_id != kCpuId) {
-      LOG(WARNING) << "No visible GPU is found, setting `gpu_id` to -1";
-    }
-    this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
-  } else if (fail_on_invalid_gpu_id) {
-    CHECK(gpu_id == kCpuId || gpu_id < n_gpus)
-        << "Only " << n_gpus << " GPUs are visible, gpu_id " << gpu_id << " is invalid.";
-  } else if (gpu_id != kCpuId && gpu_id >= n_gpus) {
-    LOG(WARNING) << "Only " << n_gpus << " GPUs are visible, setting `gpu_id` to "
-                 << gpu_id % n_gpus;
-    this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(gpu_id % n_gpus)}});
-  }
+#if !defined(XGBOOST_USE_CUDA)
+// Build without CUDA support: whatever device is requested, the result is the CPU.
+[[nodiscard]] DeviceOrd CUDAOrdinal(DeviceOrd /*device*/, bool /*fail_on_invalid*/) {
+  return DeviceOrd::CPU();
+}
 #else
-  // Just set it to CPU, don't think about it.
-  this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
-  (void)(require_gpu);
-#endif  // defined(XGBOOST_USE_CUDA)
+// Reconcile the requested device with the GPUs visible to this process. A CUDA ordinal that is
+// out of range either fails a check or is wrapped around, depending on `fail_on_invalid`.
+[[nodiscard]] DeviceOrd CUDAOrdinal(DeviceOrd device, bool fail_on_invalid) {
+  // The number of available GPUs can differ from the one seen when the booster was saved to a
+  // memory image (Python pickle or R raw model), hence the wrap-around below.
+  std::int32_t const n_gpus = common::AllVisibleGPUs();
+
+  if (n_gpus == 0) {
+    if (device.IsCUDA()) {
+      LOG(WARNING) << "No visible GPU is found, setting device to CPU.";
+    }
+    device = DeviceOrd::CPU();
+  } else {
+    if (fail_on_invalid) {
+      CHECK(device.IsCPU() || device.ordinal < n_gpus)
+          << "Only " << n_gpus << " GPUs are visible, ordinal " << device.ordinal
+          << " is invalid.";
+    } else if (device.IsCUDA() && device.ordinal >= n_gpus) {
+      device.ordinal %= n_gpus;
+      LOG(WARNING) << "Only " << n_gpus << " GPUs are visible, setting device ordinal to "
+                   << device.ordinal;
+    }
+  }
+
+  // Only a CUDA result needs to be made the current device.
+  if (device.IsCUDA()) {
+    common::SetDevice(device.ordinal);
+  }
+  return device;
+}
+#endif  //  !defined(XGBOOST_USE_CUDA)
+
+/**
+ * \brief Parse a device ordinal or the legacy `gpu_id` value from text.
+ *
+ * The accepted form is an optional leading '-' followed by one or more decimal digits and
+ * nothing else.
+ *
+ * \param ordinal Text to parse.
+ *
+ * \return The parsed integer, or std::nullopt when the text is empty, consists of a lone '-',
+ *         contains a non-digit character, or is rejected by std::from_chars (e.g. the value
+ *         does not fit in a 32-bit integer).
+ */
+[[nodiscard]] std::optional<std::int32_t> ParseInt(StringView ordinal) {
+  // Some basic checks to ensure valid `gpu_id` and device ordinal instead of directly parsing and
+  // letting go of unknown characters.
+  if (ordinal.empty()) {
+    return std::nullopt;
+  }
+
+  std::size_t offset{0};
+  if (ordinal[0] == '-') {
+    offset = 1;
+  }
+  if (ordinal.size() <= offset) {
+    return std::nullopt;
+  }
+
+  // std::isdigit has undefined behaviour for negative `char` values (any non-ASCII byte where
+  // char is signed), so convert to unsigned char before classifying.
+  bool valid = std::all_of(ordinal.cbegin() + offset, ordinal.cend(), [](auto c) {
+    return std::isdigit(static_cast<unsigned char>(c)) != 0;
+  });
+  if (!valid) {
+    return std::nullopt;
+  }
+
+  std::int32_t parsed_id{Context::kCpuId};
+  auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id);
+  if (res.ec != std::errc()) {
+    return std::nullopt;
+  }
+
+  return parsed_id;
+}
+
+/**
+ * \brief Convert the textual `device` parameter into a device ordinal.
+ *
+ * \param input                  Value of the `device` parameter. The accepted spellings are
+ *                               listed in the error message below; `gpu` is an alias of `cuda`.
+ * \param fail_on_invalid_gpu_id Forwarded to CUDAOrdinal.
+ *
+ * \return The device returned by CUDAOrdinal for the parsed input. Malformed input is reported
+ *         through LOG(FATAL) / CHECK.
+ */
+[[nodiscard]] DeviceOrd MakeDeviceOrd(std::string const& input, bool fail_on_invalid_gpu_id) {
+  StringView msg{R"(Invalid argument for `device`. Expected to be one of the following:
+- cpu
+- cuda
+- cuda:<device ordinal>  # e.g. cuda:0
+- gpu
+- gpu:<device ordinal>   # e.g. gpu:0
+)"};
+  auto fatal = [&] { LOG(FATAL) << msg << "Got: `" << input << "`."; };
+
+#if defined(__MINGW32__)
+  // mingw hangs on regex using rtools 430. Basic checks only.
+  CHECK_GE(input.size(), 3) << msg;
+  auto substr = input.substr(0, 3);
+  bool valid = substr == "cpu" || substr == "cud" || substr == "gpu";
+  CHECK(valid) << msg;
+#else
+  std::regex pattern{"gpu(:[0-9]+)?|cuda(:[0-9]+)?|cpu"};
+  if (!std::regex_match(input, pattern)) {
+    fatal();
+  }
+#endif  // defined(__MINGW32__)
+
+  // Handle the alias: `gpu` is rewritten to the CUDA symbol.
+  std::string s_device = std::regex_replace(input, std::regex{"gpu"}, DeviceSym::CUDA());
+
+  auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':');
+  DeviceOrd device;
+  device.ordinal = Context::InvalidOrdinal();  // mark it invalid for check.
+  if (split_it == s_device.cend()) {
+    // no ordinal.
+    if (s_device == DeviceSym::CPU()) {
+      device = DeviceOrd::CPU();
+    } else if (s_device == DeviceSym::CUDA()) {
+      device = DeviceOrd::CUDA(0);  // use 0 as default;
+    } else {
+      fatal();
+    }
+  } else {
+    // must be CUDA when ordinal is specified. The full regex above already guarantees it, but
+    // the MinGW branch only looks at the first three characters, so check the name explicitly
+    // to avoid turning inputs such as `cpu:0` into a CUDA device.
+    if (std::string{s_device.cbegin(), split_it} != DeviceSym::CUDA()) {
+      fatal();
+    }
+    // +1 for colon
+    std::size_t offset = std::distance(s_device.cbegin(), split_it) + 1;
+    // substr
+    StringView s_ordinal = {s_device.data() + offset, s_device.size() - offset};
+    if (s_ordinal.empty()) {
+      fatal();
+    }
+    auto opt_id = ParseInt(s_ordinal);
+    // ParseInt accepts a leading minus sign, which is not a valid CUDA ordinal.
+    if (!opt_id.has_value() || opt_id.value() < 0) {
+      fatal();
+    }
+    CHECK_LE(opt_id.value(), std::numeric_limits<bst_d_ordinal_t>::max())
+        << "Ordinal value too large.";
+    device = DeviceOrd::CUDA(opt_id.value());
+  }
+
+  if (device.ordinal < Context::kCpuId) {
+    fatal();
+  }
+  device = CUDAOrdinal(device, fail_on_invalid_gpu_id);
+
+  return device;
+}
+}  // namespace
+
+// Switch a CPU context over to the CUDA device symbol when the caller requires a GPU.
+void Context::ConfigureGpuId(bool require_gpu) {
+  bool const on_cpu = this->IsCPU();
+  if (!on_cpu || !require_gpu) {
+    return;
+  }
+  this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});
+}
+
+// Resolve the device from the user supplied arguments. The deprecated `gpu_id` is translated
+// into a `device` update; otherwise the `device` string is parsed and stored.
+void Context::SetDeviceOrdinal(Args const& kwargs) {
+  auto const find_key = [&kwargs](char const* key) {
+    return std::find_if(kwargs.cbegin(), kwargs.cend(),
+                        [key](auto const& kv) { return kv.first == key; });
+  };
+  auto const gpu_id_it = find_key("gpu_id");
+  bool const has_gpu_id = gpu_id_it != kwargs.cend();
+  auto const device_it = find_key(kDevice);
+  bool const has_device = device_it != kwargs.cend();
+  if (has_device && has_gpu_id) {
+    LOG(FATAL) << "Both `device` and `gpu_id` are specified. Use `device` instead.";
+  }
+
+  if (has_gpu_id) {
+    // Compatible with XGBoost < 2.0.0
+    error::WarnDeprecatedGPUId();
+    auto const parsed = ParseInt(StringView{gpu_id_it->second});
+    CHECK(parsed.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second;
+    // Anything above the CPU id selects that CUDA ordinal, everything else means CPU.
+    auto const name = parsed.value() > Context::kCpuId ? DeviceOrd::CUDA(parsed.value()).Name()
+                                                       : DeviceOrd::CPU().Name();
+    this->UpdateAllowUnknown(Args{{kDevice, name}});
+    return;
+  }
+
+  auto resolved = MakeDeviceOrd(this->device, this->fail_on_invalid_gpu_id);
+
+  if (!has_device) {
+    CHECK_EQ(resolved.ordinal, this->device_.ordinal);  // unchanged
+  }
+  this->SetDevice(resolved);
+
+  // Sanity check: the stored ordinal has to agree with the device kind.
+  if (this->IsCPU()) {
+    CHECK_EQ(this->device_.ordinal, kCpuId);
+  } else {
+    CHECK_GT(this->device_.ordinal, kCpuId);
+  }
+}

 std::int32_t Context::Threads() const {
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -33,10 +33,11 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
  bool valid = iter.Next();
  CHECK(valid) << "Iterative DMatrix must have at least 1 batch.";

-  auto d = MakeProxy(proxy_)->DeviceIdx();
+  auto pctx = MakeProxy(proxy_)->Ctx();

  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
+  ctx.UpdateAllowUnknown(
+      Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
  // hardcoded parameter.
  BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};

--- a/src/data/proxy_dmatrix.cc
+++ b/src/data/proxy_dmatrix.cc
@@ -54,6 +54,7 @@ std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
    p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing);
  }

+  CHECK(p_fmat) << "Failed to fallback.";
  return p_fmat;
 }
 }  // namespace xgboost::data
--- a/src/data/proxy_dmatrix.cu
+++ b/src/data/proxy_dmatrix.cu
@@ -7,28 +7,31 @@

 namespace xgboost::data {
 void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
-  std::shared_ptr<data::CudfAdapter> adapter{new CudfAdapter{interface_str}};
-  auto const& value = adapter->Value();
+  auto adapter{std::make_shared<CudfAdapter>(interface_str)};
  this->batch_ = adapter;
-  ctx_.gpu_id = adapter->DeviceIdx();
  this->Info().num_col_ = adapter->NumColumns();
  this->Info().num_row_ = adapter->NumRows();
-  if (ctx_.gpu_id < 0) {
+  if (adapter->DeviceIdx() < 0) {
+    // empty data
    CHECK_EQ(this->Info().num_row_, 0);
-    ctx_.gpu_id = dh::CurrentDevice();
+    ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
+    return;
  }
+  ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
 }

 void DMatrixProxy::FromCudaArray(StringView interface_str) {
-  std::shared_ptr<CupyAdapter> adapter(new CupyAdapter{StringView{interface_str}});
+  auto adapter(std::make_shared<CupyAdapter>(StringView{interface_str}));
  this->batch_ = adapter;
-  ctx_.gpu_id = adapter->DeviceIdx();
  this->Info().num_col_ = adapter->NumColumns();
  this->Info().num_row_ = adapter->NumRows();
-  if (ctx_.gpu_id < 0) {
+  if (adapter->DeviceIdx() < 0) {
+    // empty data
    CHECK_EQ(this->Info().num_row_, 0);
-    ctx_.gpu_id = dh::CurrentDevice();
+    ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
+    return;
  }
+  ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
 }

 namespace cuda_impl {
--- a/src/data/simple_dmatrix.cu
+++ b/src/data/simple_dmatrix.cu
@@ -27,7 +27,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
  dh::safe_cuda(cudaSetDevice(device));

  Context ctx;
-  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(device)}});
+  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});

  CHECK(adapter->NumRows() != kAdapterUnknownSize);
  CHECK(adapter->NumColumns() != kAdapterUnknownSize);
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -84,6 +84,25 @@ bool UpdatersMatched(std::vector<std::string> updater_seq,
                      return name == up->Name();
                    });
 }
+
+// Warn, once per thread, that prediction falls back to the DMatrix path because the booster
+// and the input data report different devices.
+void MismatchedDevices(Context const* booster, Context const* data) {
+  thread_local bool warned{false};
+  if (!warned) {
+    LOG(WARNING)
+        << "Falling back to prediction using DMatrix due to mismatched devices. This might "
+           "lead to higher memory usage and slower performance. XGBoost is running on: "
+        << booster->DeviceName() << ", while the input data is on: " << data->DeviceName()
+        << ".\n"
+        << R"(Potential solutions:
+- Use a data structure that matches the device ordinal in the booster.
+- Set the device for booster before call to inplace_predict.

+This warning will only be shown once, and subsequent warnings made by the current thread will be
+suppressed.
+)";
+    warned = true;
+  }
+}
 }  // namespace

 void GBTree::Configure(Args const& cfg) {
@@ -208,6 +227,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
  bst_target_t const n_groups = model_.learner_model_param->OutputLength();
  monitor_.Start("BoostNewTrees");

+  predt->predictions.SetDevice(ctx_->Ordinal());
  auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_,
                                    model_.learner_model_param->OutputLength());
  CHECK_NE(n_groups, 0);
@@ -521,18 +541,6 @@ void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds,
  }
 }

-namespace {
-inline void MismatchedDevices(Context const* booster, Context const* data) {
-  LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost "
-               << "is running on: " << booster->DeviceName()
-               << ", while the input data is on: " << data->DeviceName() << ".\n"
-               << R"(Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
-)";
-}
-};  // namespace
-
 void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
                          bst_layer_t layer_begin, bst_layer_t layer_end) {
  // dispatch to const function.
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -40,7 +40,7 @@
 #include "common/api_entry.h"             // for XGBAPIThreadLocalEntry
 #include "common/charconv.h"              // for to_chars, to_chars_result, NumericLimits, from_...
 #include "common/common.h"                // for ToString, Split
-#include "common/error_msg.h"             // for MaxFeatureSize, WarnOldSerialization
+#include "common/error_msg.h"             // for MaxFeatureSize, WarnOldSerialization, ...
 #include "common/io.h"                    // for PeekableInStream, ReadAll, FixedSizeStream, Mem...
 #include "common/observer.h"              // for TrainingObserver
 #include "common/random.h"                // for GlobalRandom
@@ -711,6 +711,7 @@ class LearnerConfiguration : public Learner {
    // FIXME(trivialfis): Make eval_metric a training parameter.
    keys.emplace_back(kEvalMetric);
    keys.emplace_back("num_output_group");
+    keys.emplace_back("gpu_id");  // deprecated param.

    std::sort(keys.begin(), keys.end());

@@ -1340,10 +1341,9 @@ class LearnerImpl : public LearnerIO {
  }

  void Predict(std::shared_ptr<DMatrix> data, bool output_margin,
-               HostDeviceVector<bst_float> *out_preds, unsigned layer_begin,
-               unsigned layer_end, bool training,
-               bool pred_leaf, bool pred_contribs, bool approx_contribs,
-               bool pred_interactions) override {
+               HostDeviceVector<bst_float>* out_preds, bst_layer_t layer_begin,
+               bst_layer_t layer_end, bool training, bool pred_leaf, bool pred_contribs,
+               bool approx_contribs, bool pred_interactions) override {
    int multiple_predictions = static_cast<int>(pred_leaf) +
                               static_cast<int>(pred_interactions) +
                               static_cast<int>(pred_contribs);
@@ -1391,15 +1391,16 @@ class LearnerImpl : public LearnerIO {
  }

  void InplacePredict(std::shared_ptr<DMatrix> p_m, PredictionType type, float missing,
-                      HostDeviceVector<bst_float>** out_preds, uint32_t iteration_begin,
-                      uint32_t iteration_end) override {
+                      HostDeviceVector<float>** out_preds, bst_layer_t iteration_begin,
+                      bst_layer_t iteration_end) override {
    this->Configure();
    this->CheckModelInitialized();

    auto& out_predictions = this->GetThreadLocal().prediction_entry;
-    out_predictions.version = 0;
+    out_predictions.Reset();

    this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end);
+
    if (type == PredictionType::kValue) {
      obj_->PredTransform(&out_predictions.predictions);
    } else if (type == PredictionType::kMargin) {
@@ -1454,7 +1455,7 @@ class LearnerImpl : public LearnerIO {
    }

    if (p_fmat->Info().num_row_ == 0) {
-      LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
+      error::WarnEmptyDataset();
    }
  }