Add `fail_on_invalid_gpu_id` to GenericParameter.

Summary of what the hunks below do:
  * generic_parameters.h: declare a new boolean parameter
    `fail_on_invalid_gpu_id` (default false) next to `gpu_id`.
  * learner.cc: in GenericParameter::ConfigureGpuId, when the flag is set,
    CHECK that `gpu_id` is either kCpuId or below the visible GPU count, so an
    out-of-range ordinal raises an error. The existing branch that logs a
    warning about `gpu_id % n_gpus` is only reached when the flag is unset.
  * gpu_predictor.cu: ~GPUPredictor only calls cudaSetDevice when `gpu_id` is
    non-negative and below common::AllVisibleGPUs().
  * test_gpu_basic_models.py: cover both the default path and the failing path.

NOTE(review): this copy of the patch had its line breaks and indentation
collapsed; the leading whitespace inside each hunk is reconstructed from
convention and should be confirmed against the target files before applying.
NOTE(review): angle-bracketed text appears to be missing from two context
lines (the bare `#include` and `XGBoostParameter` without a template
argument) - presumably stripped in transit; restore them from the target file.

diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h
index b94e4fb21..629275cf5 100644
--- a/include/xgboost/generic_parameters.h
+++ b/include/xgboost/generic_parameters.h
@@ -11,6 +11,7 @@
 #include
 
 namespace xgboost {
+
 struct GenericParameter : public XGBoostParameter {
   // Constant representing the device ID of CPU.
   static int32_t constexpr kCpuId = -1;
@@ -26,6 +27,8 @@ struct GenericParameter : public XGBoostParameter {
   int nthread;
   // primary device, -1 means no gpu.
   int gpu_id;
+  // fail when gpu_id is invalid
+  bool fail_on_invalid_gpu_id {false};
   // gpu page size in external memory mode, 0 means using the default.
   size_t gpu_page_size;
   bool enable_experimental_json_serialization {true};
@@ -64,6 +67,9 @@ struct GenericParameter : public XGBoostParameter {
         .set_default(-1)
         .set_lower_bound(-1)
         .describe("The primary GPU device ordinal.");
+    DMLC_DECLARE_FIELD(fail_on_invalid_gpu_id)
+        .set_default(false)
+        .describe("Fail with error when gpu_id is invalid.");
     DMLC_DECLARE_FIELD(gpu_page_size)
         .set_default(0)
         .set_lower_bound(0)
diff --git a/src/learner.cc b/src/learner.cc
index 4e75dd7ea..75b25154a 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -222,6 +222,10 @@ void GenericParameter::ConfigureGpuId(bool require_gpu) {
       LOG(WARNING) << "No visible GPU is found, setting `gpu_id` to -1";
     }
     this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
+  } else if (fail_on_invalid_gpu_id) {
+    CHECK(gpu_id == kCpuId || gpu_id < n_gpus)
+      << "Only " << n_gpus << " GPUs are visible, gpu_id "
+      << gpu_id << " is invalid.";
   } else if (gpu_id != kCpuId && gpu_id >= n_gpus) {
     LOG(WARNING) << "Only " << n_gpus << " GPUs are visible, setting `gpu_id` to "
                  << gpu_id % n_gpus;
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index 8bbd814bc..66ff4c7e6 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -580,7 +580,7 @@ class GPUPredictor : public xgboost::Predictor {
       Predictor::Predictor{generic_param} {}
 
   ~GPUPredictor() override {
-    if (generic_param_->gpu_id >= 0) {
+    if (generic_param_->gpu_id >= 0 && generic_param_->gpu_id < common::AllVisibleGPUs()) {
       dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
     }
   }
diff --git a/tests/python-gpu/test_gpu_basic_models.py b/tests/python-gpu/test_gpu_basic_models.py
index 2b641c94b..c5ee9ef59 100644
--- a/tests/python-gpu/test_gpu_basic_models.py
+++ b/tests/python-gpu/test_gpu_basic_models.py
@@ -52,3 +52,17 @@ class TestGPUBasicModels:
 
         model_0, model_1 = self.run_cls(X, y, False)
         assert model_0 != model_1
+
+    def test_invalid_gpu_id(self):
+        X = np.random.randn(10, 5) * 1e4
+        y = np.random.randint(0, 2, size=10) * 1e4
+        # should pass with invalid gpu id
+        cls1 = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=9999)
+        cls1.fit(X, y)
+        # should throw error with fail_on_invalid_gpu_id enabled
+        cls2 = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=9999, fail_on_invalid_gpu_id=True)
+        try:
+            cls2.fit(X, y)
+            assert False, "Should have failed with fail_on_invalid_gpu_id enabled"
+        except xgb.core.XGBoostError as err:
+            assert "gpu_id 9999 is invalid" in str(err)