Define the new device parameter. (#9362)

This commit is contained in:
Jiaming Yuan
2023-07-13 19:30:25 +08:00
committed by GitHub
parent 2d0cd2817e
commit 04aff3af8e
63 changed files with 827 additions and 477 deletions

View File

@@ -117,10 +117,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
RequiredArg<Integer>(config, "iteration_begin", __func__),
RequiredArg<Integer>(config, "iteration_end", __func__));
CHECK(p_predt);
if (learner->Ctx()->IsCPU()) {
// Prediction using DMatrix as fallback.
CHECK(p_predt->HostCanRead() && !p_predt->DeviceCanRead());
} else {
if (learner->Ctx()->IsCUDA()) {
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
}
p_predt->SetDevice(proxy->DeviceIdx());

View File

@@ -3,23 +3,18 @@
*/
#include "error_msg.h"
#include "../collective/communicator-inl.h" // for GetRank
#include "xgboost/logging.h"
namespace xgboost::error {
void WarnDeprecatedGPUHist() {
bool static thread_local logged{false};
if (logged) {
return;
}
auto msg =
"The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` "
R"(parameter to CUDA instead.
E.g. tree_method = "hist", device = "CUDA"
)";
LOG(WARNING) << msg;
logged = true;
}
void WarnManualUpdater() {
@@ -33,4 +28,23 @@ void WarnManualUpdater() {
"behavior. For common uses, we recommend using `tree_method` parameter instead.";
logged = true;
}
void WarnDeprecatedGPUId() {
static thread_local bool logged{false};
if (logged) {
return;
}
LOG(WARNING) << "`gpu_id` is deprecated in favor of the new `device` parameter: "
<< "device = cpu/cuda/cuda:0";
logged = true;
}
void WarnEmptyDataset() {
static thread_local bool logged{false};
if (logged) {
return;
}
LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
logged = true;
}
} // namespace xgboost::error

View File

@@ -82,5 +82,9 @@ inline void WarnOldSerialization() {
void WarnDeprecatedGPUHist();
void WarnManualUpdater();
void WarnDeprecatedGPUId();
void WarnEmptyDataset();
} // namespace xgboost::error
#endif // XGBOOST_COMMON_ERROR_MSG_H_

View File

@@ -3,53 +3,201 @@
*
* \brief Context object used for controlling runtime parameters.
*/
#include <xgboost/context.h>
#include "xgboost/context.h"
#include "common/common.h" // AssertGPUSupport
#include <algorithm> // for find_if
#include <charconv> // for from_chars
#include <iterator> // for distance
#include <optional> // for optional
#include <regex> // for regex_replace, regex_match
#include "common/common.h" // AssertGPUSupport
#include "common/error_msg.h" // WarnDeprecatedGPUId
#include "common/threading_utils.h"
#include "xgboost/string_view.h"
namespace xgboost {
DMLC_REGISTER_PARAMETER(Context);
std::int32_t constexpr Context::kCpuId;
bst_d_ordinal_t constexpr Context::kCpuId;
std::int64_t constexpr Context::kDefaultSeed;
Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
void Context::ConfigureGpuId(bool require_gpu) {
#if defined(XGBOOST_USE_CUDA)
if (gpu_id == kCpuId) { // 0. User didn't specify the `gpu_id'
if (require_gpu) { // 1. `tree_method' or `predictor' or both are using
// GPU.
// 2. Use device 0 as default.
this->UpdateAllowUnknown(Args{{"gpu_id", "0"}});
}
}
namespace {
inline constexpr char const* kDevice = "device";
// 3. When booster is loaded from a memory image (Python pickle or R
// raw model), number of available GPUs could be different. Wrap around it.
int32_t n_gpus = common::AllVisibleGPUs();
if (n_gpus == 0) {
if (gpu_id != kCpuId) {
LOG(WARNING) << "No visible GPU is found, setting `gpu_id` to -1";
}
this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
} else if (fail_on_invalid_gpu_id) {
CHECK(gpu_id == kCpuId || gpu_id < n_gpus)
<< "Only " << n_gpus << " GPUs are visible, gpu_id " << gpu_id << " is invalid.";
} else if (gpu_id != kCpuId && gpu_id >= n_gpus) {
LOG(WARNING) << "Only " << n_gpus << " GPUs are visible, setting `gpu_id` to "
<< gpu_id % n_gpus;
this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(gpu_id % n_gpus)}});
}
#if !defined(XGBOOST_USE_CUDA)
DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
device = DeviceOrd::CPU();
return device;
}
#else
// Just set it to CPU, don't think about it.
this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
(void)(require_gpu);
#endif // defined(XGBOOST_USE_CUDA)
// Check CUDA on the current device, wrap the ordinal if necessary.
[[nodiscard]] DeviceOrd CUDAOrdinal(DeviceOrd device, bool fail_on_invalid) {
// When booster is loaded from a memory image (Python pickle or R raw model), number of
// available GPUs could be different. Wrap around it.
std::int32_t n_visible = common::AllVisibleGPUs();
if (n_visible == 0) {
if (device.IsCUDA()) {
LOG(WARNING) << "No visible GPU is found, setting device to CPU.";
}
device = DeviceOrd::CPU();
} else if (fail_on_invalid) {
CHECK(device.IsCPU() || device.ordinal < n_visible)
<< "Only " << n_visible << " GPUs are visible, ordinal " << device.ordinal
<< " is invalid.";
} else if (device.IsCUDA() && device.ordinal >= n_visible) {
device.ordinal = device.ordinal % n_visible;
LOG(WARNING) << "Only " << n_visible << " GPUs are visible, setting device ordinal to "
<< device.ordinal;
}
common::SetDevice(this->gpu_id);
if (device.IsCUDA()) {
common::SetDevice(device.ordinal);
}
return device;
}
#endif // !defined(XGBOOST_USE_CUDA)
[[nodiscard]] std::optional<std::int32_t> ParseInt(StringView ordinal) {
// Some basic checks to ensure valid `gpu_id` and device ordinal instead of directly parsing and
// letting go of unknown characters.
if (ordinal.empty()) {
return std::nullopt;
}
std::size_t offset{0};
if (ordinal[0] == '-') {
offset = 1;
}
if (ordinal.size() <= offset) {
return std::nullopt;
}
bool valid = std::all_of(ordinal.cbegin() + offset, ordinal.cend(),
[](auto c) { return std::isdigit(c); });
if (!valid) {
return std::nullopt;
}
std::int32_t parsed_id{Context::kCpuId};
auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id);
if (res.ec != std::errc()) {
return std::nullopt;
}
return parsed_id;
}
[[nodiscard]] DeviceOrd MakeDeviceOrd(std::string const& input, bool fail_on_invalid_gpu_id) {
StringView msg{R"(Invalid argument for `device`. Expected to be one of the following:
- cpu
- cuda
- cuda:<device ordinal> # e.g. cuda:0
- gpu
- gpu:<device ordinal> # e.g. gpu:0
)"};
auto fatal = [&] { LOG(FATAL) << msg << "Got: `" << input << "`."; };
#if defined(__MINGW32__)
// mingw hangs on regex using rtools 430. Basic checks only.
CHECK_GE(input.size(), 3) << msg;
auto substr = input.substr(0, 3);
bool valid = substr == "cpu" || substr == "cud" || substr == "gpu";
CHECK(valid) << msg;
#else
std::regex pattern{"gpu(:[0-9]+)?|cuda(:[0-9]+)?|cpu"};
if (!std::regex_match(input, pattern)) {
fatal();
}
#endif // defined(__MINGW32__)
// handle alias
std::string s_device = std::regex_replace(input, std::regex{"gpu"}, DeviceSym::CUDA());
auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':');
DeviceOrd device;
device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check.
if (split_it == s_device.cend()) {
// no ordinal.
if (s_device == DeviceSym::CPU()) {
device = DeviceOrd::CPU();
} else if (s_device == DeviceSym::CUDA()) {
device = DeviceOrd::CUDA(0); // use 0 as default;
} else {
fatal();
}
} else {
// must be CUDA when ordinal is specifed.
// +1 for colon
std::size_t offset = std::distance(s_device.cbegin(), split_it) + 1;
// substr
StringView s_ordinal = {s_device.data() + offset, s_device.size() - offset};
if (s_ordinal.empty()) {
fatal();
}
auto opt_id = ParseInt(s_ordinal);
if (!opt_id.has_value()) {
fatal();
}
CHECK_LE(opt_id.value(), std::numeric_limits<bst_d_ordinal_t>::max())
<< "Ordinal value too large.";
device = DeviceOrd::CUDA(opt_id.value());
}
if (device.ordinal < Context::kCpuId) {
fatal();
}
device = CUDAOrdinal(device, fail_on_invalid_gpu_id);
return device;
}
} // namespace
void Context::ConfigureGpuId(bool require_gpu) {
if (this->IsCPU() && require_gpu) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});
}
}
void Context::SetDeviceOrdinal(Args const& kwargs) {
auto gpu_id_it = std::find_if(kwargs.cbegin(), kwargs.cend(),
[](auto const& p) { return p.first == "gpu_id"; });
auto has_gpu_id = gpu_id_it != kwargs.cend();
auto device_it = std::find_if(kwargs.cbegin(), kwargs.cend(),
[](auto const& p) { return p.first == kDevice; });
auto has_device = device_it != kwargs.cend();
if (has_device && has_gpu_id) {
LOG(FATAL) << "Both `device` and `gpu_id` are specified. Use `device` instead.";
}
if (has_gpu_id) {
// Compatible with XGBoost < 2.0.0
error::WarnDeprecatedGPUId();
auto opt_id = ParseInt(StringView{gpu_id_it->second});
CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second;
if (opt_id.value() > Context::kCpuId) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}});
} else {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}});
}
return;
}
auto new_d = MakeDeviceOrd(this->device, this->fail_on_invalid_gpu_id);
if (!has_device) {
CHECK_EQ(new_d.ordinal, this->device_.ordinal); // unchanged
}
this->SetDevice(new_d);
if (this->IsCPU()) {
CHECK_EQ(this->device_.ordinal, kCpuId);
} else {
CHECK_GT(this->device_.ordinal, kCpuId);
}
}
std::int32_t Context::Threads() const {

View File

@@ -33,10 +33,11 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
bool valid = iter.Next();
CHECK(valid) << "Iterative DMatrix must have at least 1 batch.";
auto d = MakeProxy(proxy_)->DeviceIdx();
auto pctx = MakeProxy(proxy_)->Ctx();
Context ctx;
ctx.UpdateAllowUnknown(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
ctx.UpdateAllowUnknown(
Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
// hardcoded parameter.
BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};

View File

@@ -54,6 +54,7 @@ std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing);
}
CHECK(p_fmat) << "Failed to fallback.";
return p_fmat;
}
} // namespace xgboost::data

View File

@@ -7,28 +7,31 @@
namespace xgboost::data {
void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
std::shared_ptr<data::CudfAdapter> adapter{new CudfAdapter{interface_str}};
auto const& value = adapter->Value();
auto adapter{std::make_shared<CudfAdapter>(interface_str)};
this->batch_ = adapter;
ctx_.gpu_id = adapter->DeviceIdx();
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (ctx_.gpu_id < 0) {
if (adapter->DeviceIdx() < 0) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_.gpu_id = dh::CurrentDevice();
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
}
void DMatrixProxy::FromCudaArray(StringView interface_str) {
std::shared_ptr<CupyAdapter> adapter(new CupyAdapter{StringView{interface_str}});
auto adapter(std::make_shared<CupyAdapter>(StringView{interface_str}));
this->batch_ = adapter;
ctx_.gpu_id = adapter->DeviceIdx();
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (ctx_.gpu_id < 0) {
if (adapter->DeviceIdx() < 0) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_.gpu_id = dh::CurrentDevice();
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
}
namespace cuda_impl {

View File

@@ -27,7 +27,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
dh::safe_cuda(cudaSetDevice(device));
Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(device)}});
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
CHECK(adapter->NumRows() != kAdapterUnknownSize);
CHECK(adapter->NumColumns() != kAdapterUnknownSize);

View File

@@ -84,6 +84,25 @@ bool UpdatersMatched(std::vector<std::string> updater_seq,
return name == up->Name();
});
}
void MismatchedDevices(Context const* booster, Context const* data) {
bool thread_local static logged{false};
if (logged) {
return;
}
LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. This might "
"lead to higher memory usage and slower performance. XGBoost is running on: "
<< booster->DeviceName() << ", while the input data is on: " << data->DeviceName()
<< ".\n"
<< R"(Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
This warning will only be shown once, and subsequent warnings made by the current thread will be
suppressed.
)";
logged = true;
}
} // namespace
void GBTree::Configure(Args const& cfg) {
@@ -208,6 +227,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
bst_target_t const n_groups = model_.learner_model_param->OutputLength();
monitor_.Start("BoostNewTrees");
predt->predictions.SetDevice(ctx_->Ordinal());
auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_,
model_.learner_model_param->OutputLength());
CHECK_NE(n_groups, 0);
@@ -521,18 +541,6 @@ void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds,
}
}
namespace {
inline void MismatchedDevices(Context const* booster, Context const* data) {
LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost "
<< "is running on: " << booster->DeviceName()
<< ", while the input data is on: " << data->DeviceName() << ".\n"
<< R"(Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
)";
}
}; // namespace
void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
bst_layer_t layer_begin, bst_layer_t layer_end) {
// dispatch to const function.

View File

@@ -40,7 +40,7 @@
#include "common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "common/charconv.h" // for to_chars, to_chars_result, NumericLimits, from_...
#include "common/common.h" // for ToString, Split
#include "common/error_msg.h" // for MaxFeatureSize, WarnOldSerialization
#include "common/error_msg.h" // for MaxFeatureSize, WarnOldSerialization, ...
#include "common/io.h" // for PeekableInStream, ReadAll, FixedSizeStream, Mem...
#include "common/observer.h" // for TrainingObserver
#include "common/random.h" // for GlobalRandom
@@ -711,6 +711,7 @@ class LearnerConfiguration : public Learner {
// FIXME(trivialfis): Make eval_metric a training parameter.
keys.emplace_back(kEvalMetric);
keys.emplace_back("num_output_group");
keys.emplace_back("gpu_id"); // deprecated param.
std::sort(keys.begin(), keys.end());
@@ -1340,10 +1341,9 @@ class LearnerImpl : public LearnerIO {
}
void Predict(std::shared_ptr<DMatrix> data, bool output_margin,
HostDeviceVector<bst_float> *out_preds, unsigned layer_begin,
unsigned layer_end, bool training,
bool pred_leaf, bool pred_contribs, bool approx_contribs,
bool pred_interactions) override {
HostDeviceVector<bst_float>* out_preds, bst_layer_t layer_begin,
bst_layer_t layer_end, bool training, bool pred_leaf, bool pred_contribs,
bool approx_contribs, bool pred_interactions) override {
int multiple_predictions = static_cast<int>(pred_leaf) +
static_cast<int>(pred_interactions) +
static_cast<int>(pred_contribs);
@@ -1391,15 +1391,16 @@ class LearnerImpl : public LearnerIO {
}
void InplacePredict(std::shared_ptr<DMatrix> p_m, PredictionType type, float missing,
HostDeviceVector<bst_float>** out_preds, uint32_t iteration_begin,
uint32_t iteration_end) override {
HostDeviceVector<float>** out_preds, bst_layer_t iteration_begin,
bst_layer_t iteration_end) override {
this->Configure();
this->CheckModelInitialized();
auto& out_predictions = this->GetThreadLocal().prediction_entry;
out_predictions.version = 0;
out_predictions.Reset();
this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end);
if (type == PredictionType::kValue) {
obj_->PredTransform(&out_predictions.predictions);
} else if (type == PredictionType::kMargin) {
@@ -1454,7 +1455,7 @@ class LearnerImpl : public LearnerIO {
}
if (p_fmat->Info().num_row_ == 0) {
LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
error::WarnEmptyDataset();
}
}