Disable parameter validation for Scikit-Learn interface. (#5167)
* Disable parameter validation for now. Scikit-Learn passes all parameters down to XGBoost, whether they are used or not. * Add option `validate_parameters`.
This commit is contained in:
parent
2b9a62a806
commit
ebc86a3afa
@ -29,10 +29,16 @@ General Parameters
|
|||||||
|
|
||||||
* ``verbosity`` [default=1]
|
* ``verbosity`` [default=1]
|
||||||
|
|
||||||
- Verbosity of printing messages. Valid values are 0 (silent),
|
- Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3
|
||||||
1 (warning), 2 (info), 3 (debug). Sometimes XGBoost tries to change
|
(debug). Sometimes XGBoost tries to change configurations based on heuristics, which
|
||||||
configurations based on heuristics, which is displayed as warning message.
|
is displayed as warning message. If there's unexpected behaviour, please try to
|
||||||
If there's unexpected behaviour, please try to increase value of verbosity.
|
increase value of verbosity.
|
||||||
|
|
||||||
|
* ``validate_parameters`` [default to false, except for Python ``train`` function]
|
||||||
|
|
||||||
|
- When set to True, XGBoost will perform validation of input parameters to check whether
|
||||||
|
a parameter is used or not. The feature is still experimental. It's expected to have
|
||||||
|
some false positives, especially when used with Scikit-Learn interface.
|
||||||
|
|
||||||
* ``nthread`` [default to maximum number of threads available if not set]
|
* ``nthread`` [default to maximum number of threads available if not set]
|
||||||
|
|
||||||
|
|||||||
@ -28,6 +28,7 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
|
|||||||
// gpu page size in external memory mode, 0 means using the default.
|
// gpu page size in external memory mode, 0 means using the default.
|
||||||
size_t gpu_page_size;
|
size_t gpu_page_size;
|
||||||
bool enable_experimental_json_serialization {false};
|
bool enable_experimental_json_serialization {false};
|
||||||
|
bool validate_parameters {false};
|
||||||
|
|
||||||
void CheckDeprecated() {
|
void CheckDeprecated() {
|
||||||
if (this->n_gpus != 0) {
|
if (this->n_gpus != 0) {
|
||||||
@ -70,6 +71,9 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
|
|||||||
.set_default(false)
|
.set_default(false)
|
||||||
.describe("Enable using JSON for memory serialization (Python Pickle, "
|
.describe("Enable using JSON for memory serialization (Python Pickle, "
|
||||||
"rabit checkpoints etc.).");
|
"rabit checkpoints etc.).");
|
||||||
|
DMLC_DECLARE_FIELD(validate_parameters)
|
||||||
|
.set_default(false)
|
||||||
|
.describe("Enable to check whether parameters are used or not.");
|
||||||
DMLC_DECLARE_FIELD(n_gpus)
|
DMLC_DECLARE_FIELD(n_gpus)
|
||||||
.set_default(0)
|
.set_default(0)
|
||||||
.set_range(0, 1)
|
.set_range(0, 1)
|
||||||
|
|||||||
@ -1070,6 +1070,10 @@ class Booster(object):
|
|||||||
self.handle = ctypes.c_void_p()
|
self.handle = ctypes.c_void_p()
|
||||||
_check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)),
|
_check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)),
|
||||||
ctypes.byref(self.handle)))
|
ctypes.byref(self.handle)))
|
||||||
|
|
||||||
|
if isinstance(params, dict) and \
|
||||||
|
'validate_parameters' not in params.keys():
|
||||||
|
params['validate_parameters'] = 1
|
||||||
self.set_param(params or {})
|
self.set_param(params or {})
|
||||||
if (params is not None) and ('booster' in params):
|
if (params is not None) and ('booster' in params):
|
||||||
self.booster = params['booster']
|
self.booster = params['booster']
|
||||||
|
|||||||
@ -224,7 +224,8 @@ class XGBModel(XGBModelBase):
|
|||||||
def get_params(self, deep=False):
|
def get_params(self, deep=False):
|
||||||
"""Get parameters."""
|
"""Get parameters."""
|
||||||
params = super(XGBModel, self).get_params(deep=deep)
|
params = super(XGBModel, self).get_params(deep=deep)
|
||||||
if isinstance(self.kwargs, dict): # if kwargs is a dict, update params accordingly
|
# if kwargs is a dict, update params accordingly
|
||||||
|
if isinstance(self.kwargs, dict):
|
||||||
params.update(self.kwargs)
|
params.update(self.kwargs)
|
||||||
if params['missing'] is np.nan:
|
if params['missing'] is np.nan:
|
||||||
params['missing'] = None # sklearn doesn't handle nan. see #4725
|
params['missing'] = None # sklearn doesn't handle nan. see #4725
|
||||||
@ -233,6 +234,11 @@ class XGBModel(XGBModelBase):
|
|||||||
if isinstance(params['random_state'], np.random.RandomState):
|
if isinstance(params['random_state'], np.random.RandomState):
|
||||||
params['random_state'] = params['random_state'].randint(
|
params['random_state'] = params['random_state'].randint(
|
||||||
np.iinfo(np.int32).max)
|
np.iinfo(np.int32).max)
|
||||||
|
# Parameter validation is not working with Scikit-Learn interface, as
|
||||||
|
# it passes all paraemters into XGBoost core, whether they are used or
|
||||||
|
# not.
|
||||||
|
if 'validate_parameters' not in params.keys():
|
||||||
|
params['validate_parameters'] = False
|
||||||
return params
|
return params
|
||||||
|
|
||||||
def get_xgb_params(self):
|
def get_xgb_params(self):
|
||||||
|
|||||||
@ -230,7 +230,10 @@ class LearnerImpl : public Learner {
|
|||||||
obj_->ProbToMargin(mparam_.base_score));
|
obj_->ProbToMargin(mparam_.base_score));
|
||||||
|
|
||||||
this->need_configuration_ = false;
|
this->need_configuration_ = false;
|
||||||
|
if (generic_parameters_.validate_parameters) {
|
||||||
this->ValidateParameters();
|
this->ValidateParameters();
|
||||||
|
}
|
||||||
|
|
||||||
// FIXME(trivialfis): Clear the cache once binary IO is gone.
|
// FIXME(trivialfis): Clear the cache once binary IO is gone.
|
||||||
monitor_.Stop("Configure");
|
monitor_.Stop("Configure");
|
||||||
}
|
}
|
||||||
@ -266,20 +269,21 @@ class LearnerImpl : public Learner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
auto learner_model_param = mparam_.ToJson();
|
||||||
|
for (auto const& kv : get<Object>(learner_model_param)) {
|
||||||
|
keys.emplace_back(kv.first);
|
||||||
|
}
|
||||||
|
keys.emplace_back(kEvalMetric);
|
||||||
|
keys.emplace_back("verbosity");
|
||||||
|
keys.emplace_back("num_output_group");
|
||||||
|
|
||||||
std::sort(keys.begin(), keys.end());
|
std::sort(keys.begin(), keys.end());
|
||||||
|
|
||||||
std::vector<std::string> provided;
|
std::vector<std::string> provided;
|
||||||
for (auto const &kv : cfg_) {
|
for (auto const &kv : cfg_) {
|
||||||
// `num_feature` and `num_class` are automatically added due to legacy reason.
|
|
||||||
// `verbosity` in logger is not saved, we should move it into generic_param_.
|
|
||||||
// FIXME(trivialfis): Make eval_metric a training parameter.
|
// FIXME(trivialfis): Make eval_metric a training parameter.
|
||||||
if (kv.first != "num_feature" && kv.first != "verbosity" &&
|
|
||||||
kv.first != "num_class" && kv.first != "num_output_group" &&
|
|
||||||
kv.first != kEvalMetric) {
|
|
||||||
provided.push_back(kv.first);
|
provided.push_back(kv.first);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
std::sort(provided.begin(), provided.end());
|
std::sort(provided.begin(), provided.end());
|
||||||
|
|
||||||
std::vector<std::string> diff;
|
std::vector<std::string> diff;
|
||||||
@ -287,12 +291,18 @@ class LearnerImpl : public Learner {
|
|||||||
keys.end(), std::back_inserter(diff));
|
keys.end(), std::back_inserter(diff));
|
||||||
if (diff.size() != 0) {
|
if (diff.size() != 0) {
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << "Parameters: { ";
|
ss << "\nParameters: { ";
|
||||||
for (size_t i = 0; i < diff.size() - 1; ++i) {
|
for (size_t i = 0; i < diff.size() - 1; ++i) {
|
||||||
ss << diff[i] << ", ";
|
ss << diff[i] << ", ";
|
||||||
}
|
}
|
||||||
ss << diff.back();
|
ss << diff.back();
|
||||||
ss << " } are not used.";
|
ss << R"W( } might not be used.
|
||||||
|
|
||||||
|
This may not be accurate due to some parameters are only used in language bindings but
|
||||||
|
passed down to XGBoost core. Or some parameters are not used but slip through this
|
||||||
|
verification. Please open an issue if you find above cases.
|
||||||
|
|
||||||
|
)W";
|
||||||
LOG(WARNING) << ss.str();
|
LOG(WARNING) << ss.str();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -30,8 +30,6 @@ TEST(Monitor, Logging) {
|
|||||||
run_monitor();
|
run_monitor();
|
||||||
output = testing::internal::GetCapturedStderr();
|
output = testing::internal::GetCapturedStderr();
|
||||||
ASSERT_EQ(output.size(), 0);
|
ASSERT_EQ(output.size(), 0);
|
||||||
|
|
||||||
ConsoleLogger::Configure(Args{{"verbosity", "1"}});
|
|
||||||
}
|
}
|
||||||
} // namespace common
|
} // namespace common
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|||||||
@ -31,12 +31,14 @@ TEST(Learner, Basic) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(Learner, ParameterValidation) {
|
TEST(Learner, ParameterValidation) {
|
||||||
|
ConsoleLogger::Configure({{"verbosity", "2"}});
|
||||||
size_t constexpr kRows = 1;
|
size_t constexpr kRows = 1;
|
||||||
size_t constexpr kCols = 1;
|
size_t constexpr kCols = 1;
|
||||||
auto pp_mat = CreateDMatrix(kRows, kCols, 0);
|
auto pp_mat = CreateDMatrix(kRows, kCols, 0);
|
||||||
auto& p_mat = *pp_mat;
|
auto& p_mat = *pp_mat;
|
||||||
|
|
||||||
auto learner = std::unique_ptr<Learner>(Learner::Create({p_mat}));
|
auto learner = std::unique_ptr<Learner>(Learner::Create({p_mat}));
|
||||||
|
learner->SetParam("validate_parameters", "1");
|
||||||
learner->SetParam("Knock Knock", "Who's there?");
|
learner->SetParam("Knock Knock", "Who's there?");
|
||||||
learner->SetParam("Silence", "....");
|
learner->SetParam("Silence", "....");
|
||||||
learner->SetParam("tree_method", "exact");
|
learner->SetParam("tree_method", "exact");
|
||||||
@ -45,7 +47,7 @@ TEST(Learner, ParameterValidation) {
|
|||||||
learner->Configure();
|
learner->Configure();
|
||||||
std::string output = testing::internal::GetCapturedStderr();
|
std::string output = testing::internal::GetCapturedStderr();
|
||||||
|
|
||||||
ASSERT_TRUE(output.find("Parameters: { Knock Knock, Silence } are not used.") != std::string::npos);
|
ASSERT_TRUE(output.find("Parameters: { Knock Knock, Silence }") != std::string::npos);
|
||||||
delete pp_mat;
|
delete pp_mat;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -334,7 +334,7 @@ TEST_F(SerializationTest, ConfigurationCount) {
|
|||||||
}
|
}
|
||||||
ASSERT_EQ(occureences, 2);
|
ASSERT_EQ(occureences, 2);
|
||||||
|
|
||||||
xgboost::ConsoleLogger::Configure({{"verbosity", "1"}});
|
xgboost::ConsoleLogger::Configure({{"verbosity", "2"}});
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(SerializationTest, GPU_CoordDescent) {
|
TEST_F(SerializationTest, GPU_CoordDescent) {
|
||||||
|
|||||||
@ -92,6 +92,8 @@ class TestPickling(unittest.TestCase):
|
|||||||
status = subprocess.call(args, env=env)
|
status = subprocess.call(args, env=env)
|
||||||
assert status == 0
|
assert status == 0
|
||||||
|
|
||||||
|
os.remove(model_path)
|
||||||
|
|
||||||
def test_pickled_predictor(self):
|
def test_pickled_predictor(self):
|
||||||
x, y = build_dataset()
|
x, y = build_dataset()
|
||||||
train_x = xgb.DMatrix(x, label=y)
|
train_x = xgb.DMatrix(x, label=y)
|
||||||
@ -129,6 +131,8 @@ class TestPickling(unittest.TestCase):
|
|||||||
status = subprocess.call(args, env=env)
|
status = subprocess.call(args, env=env)
|
||||||
assert status == 0
|
assert status == 0
|
||||||
|
|
||||||
|
os.remove(model_path)
|
||||||
|
|
||||||
def test_predict_sklearn_pickle(self):
|
def test_predict_sklearn_pickle(self):
|
||||||
x, y = build_dataset()
|
x, y = build_dataset()
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user