From ebc86a3afa9d645b9e5f84eb4d839f9331e5927b Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 7 Jan 2020 11:17:31 +0800 Subject: [PATCH] Disable parameter validation for Scikit-Learn interface. (#5167) * Disable parameter validation for now. Scikit-Learn passes all parameters down to XGBoost, whether they are used or not. * Add option `validate_parameters`. --- doc/parameter.rst | 14 +++++++++---- include/xgboost/generic_parameters.h | 4 ++++ python-package/xgboost/core.py | 4 ++++ python-package/xgboost/sklearn.py | 8 ++++++- src/learner.cc | 30 ++++++++++++++++++--------- tests/cpp/common/test_monitor.cc | 2 -- tests/cpp/test_learner.cc | 4 +++- tests/cpp/test_serialization.cc | 2 +- tests/python-gpu/test_gpu_pickling.py | 4 ++++ 9 files changed, 53 insertions(+), 19 deletions(-) diff --git a/doc/parameter.rst b/doc/parameter.rst index 9f0cd1f6c..d0ee4f20d 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -29,10 +29,16 @@ General Parameters * ``verbosity`` [default=1] - - Verbosity of printing messages. Valid values are 0 (silent), - 1 (warning), 2 (info), 3 (debug). Sometimes XGBoost tries to change - configurations based on heuristics, which is displayed as warning message. - If there's unexpected behaviour, please try to increase value of verbosity. + - Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3 + (debug). Sometimes XGBoost tries to change configurations based on heuristics, which + is displayed as warning message. If there's unexpected behaviour, please try to + increase value of verbosity. + +* ``validate_parameters`` [default to false, except for Python ``train`` function] + + - When set to True, XGBoost will perform validation of input parameters to check whether + a parameter is used or not. The feature is still experimental. It's expected to have + some false positives, especially when used with Scikit-Learn interface. 
* ``nthread`` [default to maximum number of threads available if not set] diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h index 90dcc0f1c..b01995a7a 100644 --- a/include/xgboost/generic_parameters.h +++ b/include/xgboost/generic_parameters.h @@ -28,6 +28,7 @@ struct GenericParameter : public XGBoostParameter { // gpu page size in external memory mode, 0 means using the default. size_t gpu_page_size; bool enable_experimental_json_serialization {false}; + bool validate_parameters {false}; void CheckDeprecated() { if (this->n_gpus != 0) { @@ -70,6 +71,9 @@ struct GenericParameter : public XGBoostParameter { .set_default(false) .describe("Enable using JSON for memory serialization (Python Pickle, " "rabit checkpoints etc.)."); + DMLC_DECLARE_FIELD(validate_parameters) + .set_default(false) + .describe("Enable to check whether parameters are used or not."); DMLC_DECLARE_FIELD(n_gpus) .set_default(0) .set_range(0, 1) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index bd3e601c3..d16dc67a6 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1070,6 +1070,10 @@ class Booster(object): self.handle = ctypes.c_void_p() _check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)), ctypes.byref(self.handle))) + + if isinstance(params, dict) and \ + 'validate_parameters' not in params.keys(): + params['validate_parameters'] = 1 self.set_param(params or {}) if (params is not None) and ('booster' in params): self.booster = params['booster'] diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 943dc051c..f46601d8e 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -224,7 +224,8 @@ class XGBModel(XGBModelBase): def get_params(self, deep=False): """Get parameters.""" params = super(XGBModel, self).get_params(deep=deep) - if isinstance(self.kwargs, dict): # if kwargs is a dict, update params accordingly + # 
if kwargs is a dict, update params accordingly + if isinstance(self.kwargs, dict): params.update(self.kwargs) if params['missing'] is np.nan: params['missing'] = None # sklearn doesn't handle nan. see #4725 @@ -233,6 +234,11 @@ class XGBModel(XGBModelBase): if isinstance(params['random_state'], np.random.RandomState): params['random_state'] = params['random_state'].randint( np.iinfo(np.int32).max) + # Parameter validation is not working with Scikit-Learn interface, as + # it passes all parameters into XGBoost core, whether they are used or + # not. + if 'validate_parameters' not in params.keys(): + params['validate_parameters'] = False return params def get_xgb_params(self): diff --git a/src/learner.cc b/src/learner.cc index dca4d3939..3d2fdbe3e 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -230,7 +230,10 @@ class LearnerImpl : public Learner { obj_->ProbToMargin(mparam_.base_score)); this->need_configuration_ = false; - this->ValidateParameters(); + if (generic_parameters_.validate_parameters) { + this->ValidateParameters(); + } + // FIXME(trivialfis): Clear the cache once binary IO is gone. monitor_.Stop("Configure"); } @@ -266,19 +269,20 @@ class LearnerImpl : public Learner { } } } + auto learner_model_param = mparam_.ToJson(); + for (auto const& kv : get(learner_model_param)) { + keys.emplace_back(kv.first); + } + keys.emplace_back(kEvalMetric); + keys.emplace_back("verbosity"); + keys.emplace_back("num_output_group"); std::sort(keys.begin(), keys.end()); std::vector provided; for (auto const &kv : cfg_) { - // `num_feature` and `num_class` are automatically added due to legacy reason. - // `verbosity` in logger is not saved, we should move it into generic_param_. // FIXME(trivialfis): Make eval_metric a training parameter. 
- if (kv.first != "num_feature" && kv.first != "verbosity" && - kv.first != "num_class" && kv.first != "num_output_group" && - kv.first != kEvalMetric) { - provided.push_back(kv.first); - } + provided.push_back(kv.first); } std::sort(provided.begin(), provided.end()); @@ -287,12 +291,18 @@ class LearnerImpl : public Learner { keys.end(), std::back_inserter(diff)); if (diff.size() != 0) { std::stringstream ss; - ss << "Parameters: { "; + ss << "\nParameters: { "; for (size_t i = 0; i < diff.size() - 1; ++i) { ss << diff[i] << ", "; } ss << diff.back(); - ss << " } are not used."; + ss << R"W( } might not be used. + + This may not be accurate due to some parameters are only used in language bindings but + passed down to XGBoost core. Or some parameters are not used but slip through this + verification. Please open an issue if you find above cases. + +)W"; LOG(WARNING) << ss.str(); } } diff --git a/tests/cpp/common/test_monitor.cc b/tests/cpp/common/test_monitor.cc index c0aaead33..bc918af64 100644 --- a/tests/cpp/common/test_monitor.cc +++ b/tests/cpp/common/test_monitor.cc @@ -30,8 +30,6 @@ TEST(Monitor, Logging) { run_monitor(); output = testing::internal::GetCapturedStderr(); ASSERT_EQ(output.size(), 0); - - ConsoleLogger::Configure(Args{{"verbosity", "1"}}); } } // namespace common } // namespace xgboost diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 0fab2c7bf..8200578f6 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -31,12 +31,14 @@ TEST(Learner, Basic) { } TEST(Learner, ParameterValidation) { + ConsoleLogger::Configure({{"verbosity", "2"}}); size_t constexpr kRows = 1; size_t constexpr kCols = 1; auto pp_mat = CreateDMatrix(kRows, kCols, 0); auto& p_mat = *pp_mat; auto learner = std::unique_ptr(Learner::Create({p_mat})); + learner->SetParam("validate_parameters", "1"); learner->SetParam("Knock Knock", "Who's there?"); learner->SetParam("Silence", "...."); learner->SetParam("tree_method", "exact"); @@ -45,7 
+47,7 @@ TEST(Learner, ParameterValidation) { learner->Configure(); std::string output = testing::internal::GetCapturedStderr(); - ASSERT_TRUE(output.find("Parameters: { Knock Knock, Silence } are not used.") != std::string::npos); + ASSERT_TRUE(output.find("Parameters: { Knock Knock, Silence }") != std::string::npos); delete pp_mat; } diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc index 97eef789e..f2bc39267 100644 --- a/tests/cpp/test_serialization.cc +++ b/tests/cpp/test_serialization.cc @@ -334,7 +334,7 @@ TEST_F(SerializationTest, ConfigurationCount) { } ASSERT_EQ(occureences, 2); - xgboost::ConsoleLogger::Configure({{"verbosity", "1"}}); + xgboost::ConsoleLogger::Configure({{"verbosity", "2"}}); } TEST_F(SerializationTest, GPU_CoordDescent) { diff --git a/tests/python-gpu/test_gpu_pickling.py b/tests/python-gpu/test_gpu_pickling.py index 7c9c9bdb4..969735fed 100644 --- a/tests/python-gpu/test_gpu_pickling.py +++ b/tests/python-gpu/test_gpu_pickling.py @@ -92,6 +92,8 @@ class TestPickling(unittest.TestCase): status = subprocess.call(args, env=env) assert status == 0 + os.remove(model_path) + def test_pickled_predictor(self): x, y = build_dataset() train_x = xgb.DMatrix(x, label=y) @@ -129,6 +131,8 @@ class TestPickling(unittest.TestCase): status = subprocess.call(args, env=env) assert status == 0 + os.remove(model_path) + def test_predict_sklearn_pickle(self): x, y = build_dataset()