Disable parameter validation for Scikit-Learn interface. (#5167)

* Disable parameter validation for now.

Scikit-Learn passes all parameters down to XGBoost, whether they are used or
not.

* Add option `validate_parameters`.
This commit is contained in:
Jiaming Yuan 2020-01-07 11:17:31 +08:00 committed by GitHub
parent 2b9a62a806
commit ebc86a3afa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 53 additions and 19 deletions

View File

@ -29,10 +29,16 @@ General Parameters
* ``verbosity`` [default=1]
- Verbosity of printing messages. Valid values are 0 (silent),
1 (warning), 2 (info), 3 (debug). Sometimes XGBoost tries to change
configurations based on heuristics, which is displayed as warning message.
If there's unexpected behaviour, please try to increase value of verbosity.
- Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3
(debug). Sometimes XGBoost tries to change configurations based on heuristics, which
is displayed as a warning message. If there's unexpected behaviour, please try to
increase value of verbosity.
* ``validate_parameters`` [default to false, except for Python ``train`` function]
- When set to True, XGBoost will perform validation of input parameters to check whether
a parameter is used or not. The feature is still experimental. It's expected to have
some false positives, especially when used with the Scikit-Learn interface.
* ``nthread`` [default to maximum number of threads available if not set]

View File

@ -28,6 +28,7 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
// gpu page size in external memory mode, 0 means using the default.
size_t gpu_page_size;
bool enable_experimental_json_serialization {false};
bool validate_parameters {false};
void CheckDeprecated() {
if (this->n_gpus != 0) {
@ -70,6 +71,9 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
.set_default(false)
.describe("Enable using JSON for memory serialization (Python Pickle, "
"rabit checkpoints etc.).");
DMLC_DECLARE_FIELD(validate_parameters)
.set_default(false)
.describe("Enable to check whether parameters are used or not.");
DMLC_DECLARE_FIELD(n_gpus)
.set_default(0)
.set_range(0, 1)

View File

@ -1070,6 +1070,10 @@ class Booster(object):
self.handle = ctypes.c_void_p()
_check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)),
ctypes.byref(self.handle)))
if isinstance(params, dict) and \
'validate_parameters' not in params.keys():
params['validate_parameters'] = 1
self.set_param(params or {})
if (params is not None) and ('booster' in params):
self.booster = params['booster']

View File

@ -224,7 +224,8 @@ class XGBModel(XGBModelBase):
def get_params(self, deep=False):
"""Get parameters."""
params = super(XGBModel, self).get_params(deep=deep)
if isinstance(self.kwargs, dict): # if kwargs is a dict, update params accordingly
# if kwargs is a dict, update params accordingly
if isinstance(self.kwargs, dict):
params.update(self.kwargs)
if params['missing'] is np.nan:
params['missing'] = None # sklearn doesn't handle nan. see #4725
@ -233,6 +234,11 @@ class XGBModel(XGBModelBase):
if isinstance(params['random_state'], np.random.RandomState):
params['random_state'] = params['random_state'].randint(
np.iinfo(np.int32).max)
# Parameter validation is not working with the Scikit-Learn interface, as
# it passes all parameters into XGBoost core, whether they are used or
# not.
if 'validate_parameters' not in params.keys():
params['validate_parameters'] = False
return params
def get_xgb_params(self):

View File

@ -230,7 +230,10 @@ class LearnerImpl : public Learner {
obj_->ProbToMargin(mparam_.base_score));
this->need_configuration_ = false;
if (generic_parameters_.validate_parameters) {
this->ValidateParameters();
}
// FIXME(trivialfis): Clear the cache once binary IO is gone.
monitor_.Stop("Configure");
}
@ -266,20 +269,21 @@ class LearnerImpl : public Learner {
}
}
}
auto learner_model_param = mparam_.ToJson();
for (auto const& kv : get<Object>(learner_model_param)) {
keys.emplace_back(kv.first);
}
keys.emplace_back(kEvalMetric);
keys.emplace_back("verbosity");
keys.emplace_back("num_output_group");
std::sort(keys.begin(), keys.end());
std::vector<std::string> provided;
for (auto const &kv : cfg_) {
// `num_feature` and `num_class` are automatically added due to legacy reasons.
// `verbosity` in logger is not saved, we should move it into generic_param_.
// FIXME(trivialfis): Make eval_metric a training parameter.
if (kv.first != "num_feature" && kv.first != "verbosity" &&
kv.first != "num_class" && kv.first != "num_output_group" &&
kv.first != kEvalMetric) {
provided.push_back(kv.first);
}
}
std::sort(provided.begin(), provided.end());
std::vector<std::string> diff;
@ -287,12 +291,18 @@ class LearnerImpl : public Learner {
keys.end(), std::back_inserter(diff));
if (diff.size() != 0) {
std::stringstream ss;
ss << "Parameters: { ";
ss << "\nParameters: { ";
for (size_t i = 0; i < diff.size() - 1; ++i) {
ss << diff[i] << ", ";
}
ss << diff.back();
ss << " } are not used.";
ss << R"W( } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
)W";
LOG(WARNING) << ss.str();
}
}

View File

@ -30,8 +30,6 @@ TEST(Monitor, Logging) {
run_monitor();
output = testing::internal::GetCapturedStderr();
ASSERT_EQ(output.size(), 0);
ConsoleLogger::Configure(Args{{"verbosity", "1"}});
}
} // namespace common
} // namespace xgboost

View File

@ -31,12 +31,14 @@ TEST(Learner, Basic) {
}
TEST(Learner, ParameterValidation) {
ConsoleLogger::Configure({{"verbosity", "2"}});
size_t constexpr kRows = 1;
size_t constexpr kCols = 1;
auto pp_mat = CreateDMatrix(kRows, kCols, 0);
auto& p_mat = *pp_mat;
auto learner = std::unique_ptr<Learner>(Learner::Create({p_mat}));
learner->SetParam("validate_parameters", "1");
learner->SetParam("Knock Knock", "Who's there?");
learner->SetParam("Silence", "....");
learner->SetParam("tree_method", "exact");
@ -45,7 +47,7 @@ TEST(Learner, ParameterValidation) {
learner->Configure();
std::string output = testing::internal::GetCapturedStderr();
ASSERT_TRUE(output.find("Parameters: { Knock Knock, Silence } are not used.") != std::string::npos);
ASSERT_TRUE(output.find("Parameters: { Knock Knock, Silence }") != std::string::npos);
delete pp_mat;
}

View File

@ -334,7 +334,7 @@ TEST_F(SerializationTest, ConfigurationCount) {
}
ASSERT_EQ(occureences, 2);
xgboost::ConsoleLogger::Configure({{"verbosity", "1"}});
xgboost::ConsoleLogger::Configure({{"verbosity", "2"}});
}
TEST_F(SerializationTest, GPU_CoordDescent) {

View File

@ -92,6 +92,8 @@ class TestPickling(unittest.TestCase):
status = subprocess.call(args, env=env)
assert status == 0
os.remove(model_path)
def test_pickled_predictor(self):
x, y = build_dataset()
train_x = xgb.DMatrix(x, label=y)
@ -129,6 +131,8 @@ class TestPickling(unittest.TestCase):
status = subprocess.call(args, env=env)
assert status == 0
os.remove(model_path)
def test_predict_sklearn_pickle(self):
x, y = build_dataset()