Deprecate use_label_encoder in XGBClassifier. (#7822)

* Deprecate `use_label_encoder` in XGBClassifier.

* The encoder itself has already been removed; this change prepares for removing the `use_label_encoder` indicator as well.
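For downstream users, the practical effect is that `use_label_encoder=False` no longer needs to be passed at all; the argument appears to be kept only for backward compatibility, and passing any explicit value now produces a deprecation warning (see the `XGBClassifier.__init__` hunk below). A minimal migration sketch; the dataset and hyperparameter values here are illustrative, not taken from this commit:

    import xgboost as xgb
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True)

    # Preferred after this change: simply omit `use_label_encoder`.
    clf = xgb.XGBClassifier(n_estimators=8, eval_metric="logloss")
    clf.fit(X, y)

    # Still accepted for now, but the constructor emits a deprecation warning.
    legacy = xgb.XGBClassifier(n_estimators=8, use_label_encoder=False)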
Jiaming Yuan authored on 2022-04-21 13:14:02 +08:00 (committed by GitHub)
parent 5815df4c46
commit 52d4eda786
8 changed files with 21 additions and 41 deletions

@@ -63,7 +63,6 @@ def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]:
 params = {
     "tree_method": "gpu_hist",
-    "use_label_encoder": False,
     "n_estimators": 32,
     "colsample_bylevel": 0.7,
 }

@@ -14,13 +14,13 @@ def training_continuation(tmpdir: str, use_pickle: bool) -> None:
     """Basic training continuation."""
     # Train 128 iterations in 1 session
     X, y = load_breast_cancer(return_X_y=True)
-    clf = xgboost.XGBClassifier(n_estimators=128, use_label_encoder=False)
+    clf = xgboost.XGBClassifier(n_estimators=128)
     clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
     print("Total boosted rounds:", clf.get_booster().num_boosted_rounds())
     # Train 128 iterations in 2 sessions, with the first one runs for 32 iterations and
     # the second one runs for 96 iterations
-    clf = xgboost.XGBClassifier(n_estimators=32, use_label_encoder=False)
+    clf = xgboost.XGBClassifier(n_estimators=32)
     clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
     assert clf.get_booster().num_boosted_rounds() == 32
@@ -54,14 +54,14 @@ def training_continuation_early_stop(tmpdir: str, use_pickle: bool) -> None:
     n_estimators = 512
     X, y = load_breast_cancer(return_X_y=True)
-    clf = xgboost.XGBClassifier(n_estimators=n_estimators, use_label_encoder=False)
+    clf = xgboost.XGBClassifier(n_estimators=n_estimators)
     clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss", callbacks=[early_stop])
     print("Total boosted rounds:", clf.get_booster().num_boosted_rounds())
     best = clf.best_iteration
     # Train 512 iterations in 2 sessions, with the first one runs for 128 iterations and
     # the second one runs until early stop.
-    clf = xgboost.XGBClassifier(n_estimators=128, use_label_encoder=False)
+    clf = xgboost.XGBClassifier(n_estimators=128)
     # Reinitialize the early stop callback
     early_stop = xgboost.callback.EarlyStopping(
         rounds=early_stopping_rounds, save_best=True
@@ -79,15 +79,13 @@ def training_continuation_early_stop(tmpdir: str, use_pickle: bool) -> None:
     else:
         path = os.path.join(tmpdir, "model-first-128.json")
         clf.save_model(path)
-        loaded = xgboost.XGBClassifier(use_label_encoder=False)
+        loaded = xgboost.XGBClassifier()
         loaded.load_model(path)

     early_stop = xgboost.callback.EarlyStopping(
         rounds=early_stopping_rounds, save_best=True
     )
-    clf = xgboost.XGBClassifier(
-        n_estimators=n_estimators - 128, use_label_encoder=False
-    )
+    clf = xgboost.XGBClassifier(n_estimators=n_estimators - 128)
     clf.fit(
         X,
         y,

@@ -35,7 +35,7 @@ def native_interface():
 def sklearn_interface():
     X_train, y_train = load_svmlight_file(train)
     X_test, y_test = load_svmlight_file(test)
-    clf = xgb.XGBClassifier(n_estimators=3, max_depth=2, eta=1, use_label_encoder=False)
+    clf = xgb.XGBClassifier(n_estimators=3, max_depth=2, eta=1)
     clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
     assert clf.n_classes_ == 2

@@ -36,9 +36,7 @@ parameter ``enable_categorical``:
 .. code:: python

   # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
-  clf = xgb.XGBClassifier(
-      tree_method="gpu_hist", enable_categorical=True, use_label_encoder=False
-  )
+  clf = xgb.XGBClassifier(tree_method="gpu_hist", enable_categorical=True)
   # X is the dataframe we created in previous snippet
   clf.fit(X, y)
   # Must use JSON/UBJSON for serialization, otherwise the information is lost.

@@ -1304,13 +1304,15 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         self,
         *,
         objective: _SklObjective = "binary:logistic",
-        use_label_encoder: bool = False,
+        use_label_encoder: Optional[bool] = None,
         **kwargs: Any
     ) -> None:
         # must match the parameters for `get_params`
         self.use_label_encoder = use_label_encoder
         if use_label_encoder is True:
             raise ValueError("Label encoder was removed in 1.6.")
+        if use_label_encoder is not None:
+            warnings.warn("`use_label_encoder` is deprecated in 2.0.0.")
         super().__init__(objective=objective, **kwargs)

     @_deprecate_positional_args
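The key behavioural detail in the hunk above is the default changing from `False` to `None`: the constructor can now distinguish "argument not given" from an explicit value, warns only in the latter case, and still raises for `True` because the encoder itself was removed in 1.6. A rough caller-side sketch of that behaviour (it assumes the warning is issued with the default `UserWarning` category, which the diff does not specify):

    import warnings
    import xgboost as xgb

    # Default (None): constructs silently, no deprecation warning.
    xgb.XGBClassifier(n_estimators=2)

    # Explicit False: accepted, but the constructor now reports the deprecation.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        xgb.XGBClassifier(n_estimators=2, use_label_encoder=False)
    assert any("use_label_encoder" in str(w.message) for w in caught)

    # Explicit True: still an immediate error, since the encoder was removed in 1.6.
    try:
        xgb.XGBClassifier(n_estimators=2, use_label_encoder=True)
    except ValueError as err:
        print(err)  # "Label encoder was removed in 1.6."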

@@ -152,16 +152,16 @@ class TestTrainingContinuation:
     def test_changed_parameter(self):
         from sklearn.datasets import load_breast_cancer
         X, y = load_breast_cancer(return_X_y=True)
-        clf = xgb.XGBClassifier(n_estimators=2, use_label_encoder=False)
+        clf = xgb.XGBClassifier(n_estimators=2)
         clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
         assert tm.non_increasing(clf.evals_result()["validation_0"]["logloss"])
         with tempfile.TemporaryDirectory() as tmpdir:
             clf.save_model(os.path.join(tmpdir, "clf.json"))
-            loaded = xgb.XGBClassifier(use_label_encoder=False)
+            loaded = xgb.XGBClassifier()
             loaded.load_model(os.path.join(tmpdir, "clf.json"))
-            clf = xgb.XGBClassifier(n_estimators=2, use_label_encoder=False)
+            clf = xgb.XGBClassifier(n_estimators=2)
             # change metric to error
             clf.fit(X, y, eval_set=[(X, y)], eval_metric="error")
             assert tm.non_increasing(clf.evals_result()["validation_0"]["error"])

@@ -777,9 +777,7 @@ def run_empty_dmatrix_auc(client: "Client", tree_method: str, n_workers: int) ->
     valid_X = dd.from_array(valid_X_, chunksize=n_samples)
     valid_y = dd.from_array(valid_y_, chunksize=n_samples)
-    cls = xgb.dask.DaskXGBClassifier(
-        tree_method=tree_method, n_estimators=2, use_label_encoder=False
-    )
+    cls = xgb.dask.DaskXGBClassifier(tree_method=tree_method, n_estimators=2)
     cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)])
     # multiclass
@@ -808,9 +806,7 @@ def run_empty_dmatrix_auc(client: "Client", tree_method: str, n_workers: int) ->
     valid_X = dd.from_array(valid_X_, chunksize=n_samples)
     valid_y = dd.from_array(valid_y_, chunksize=n_samples)
-    cls = xgb.dask.DaskXGBClassifier(
-        tree_method=tree_method, n_estimators=2, use_label_encoder=False
-    )
+    cls = xgb.dask.DaskXGBClassifier(tree_method=tree_method, n_estimators=2)
     cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)])
@@ -837,14 +833,10 @@ def run_auc(client: "Client", tree_method: str) -> None:
     valid_X = dd.from_array(valid_X_, chunksize=10)
     valid_y = dd.from_array(valid_y_, chunksize=10)
-    cls = xgb.XGBClassifier(
-        tree_method=tree_method, n_estimators=2, use_label_encoder=False
-    )
+    cls = xgb.XGBClassifier(tree_method=tree_method, n_estimators=2)
     cls.fit(X_, y_, eval_metric="auc", eval_set=[(valid_X_, valid_y_)])
-    dcls = xgb.dask.DaskXGBClassifier(
-        tree_method=tree_method, n_estimators=2, use_label_encoder=False
-    )
+    dcls = xgb.dask.DaskXGBClassifier(tree_method=tree_method, n_estimators=2)
     dcls.fit(X, y, eval_metric="auc", eval_set=[(valid_X, valid_y)])
     approx = dcls.evals_result()["validation_0"]["auc"]
@@ -1693,7 +1685,6 @@ def test_parallel_submits(client: "Client") -> None:
             verbosity=1,
             n_estimators=i + 1,
             eval_metric="merror",
-            use_label_encoder=False,
         )
         f = client.submit(cls.fit, X, y, pure=False)
         futures.append(f)
@@ -1786,7 +1777,6 @@ def test_parallel_submit_multi_clients() -> None:
             verbosity=1,
             n_estimators=i + 1,
             eval_metric="merror",
-            use_label_encoder=False,
         )
         f = client.submit(cls.fit, X, y, pure=False)
         futures.append((client, f))

@@ -774,13 +774,12 @@ def save_load_model(model_path):
     X = digits['data']
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf.split(X, y):
-        xgb_model = xgb.XGBClassifier(use_label_encoder=False).fit(X[train_index], y[train_index])
+        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
         xgb_model.save_model(model_path)
         xgb_model = xgb.XGBClassifier()
         xgb_model.load_model(model_path)
-        assert xgb_model.use_label_encoder is False
         assert isinstance(xgb_model.classes_, np.ndarray)
         assert isinstance(xgb_model._Booster, xgb.Booster)
@@ -972,8 +971,8 @@ def test_deprecate_position_arg():
     model.fit(X, y, w)
     with pytest.warns(FutureWarning):
-        xgb.XGBClassifier(1, use_label_encoder=False)
-    model = xgb.XGBClassifier(n_estimators=1, use_label_encoder=False)
+        xgb.XGBClassifier(1)
+    model = xgb.XGBClassifier(n_estimators=1)
     with pytest.warns(FutureWarning):
         model.fit(X, y, w)
@@ -990,9 +989,6 @@ def test_deprecate_position_arg():
     with pytest.warns(FutureWarning):
         model.fit(X, y, w)
-    with pytest.raises(ValueError):
-        xgb.XGBRFClassifier(1, use_label_encoder=True)
     model = xgb.XGBRFClassifier(n_estimators=1)
     with pytest.warns(FutureWarning):
         model.fit(X, y, w)
@@ -1334,7 +1330,6 @@ def test_evaluation_metric():
     X, y = load_digits(n_class=10, return_X_y=True)
     clf = xgb.XGBClassifier(
-        use_label_encoder=False,
         tree_method="hist",
         eval_metric=merror,
         n_estimators=16,
@@ -1344,7 +1339,6 @@ def test_evaluation_metric():
     custom = clf.evals_result()
     clf = xgb.XGBClassifier(
-        use_label_encoder=False,
         tree_method="hist",
         eval_metric="merror",
         n_estimators=16,
@@ -1360,7 +1354,6 @@ def test_evaluation_metric():
     )
     clf = xgb.XGBRFClassifier(
-        use_label_encoder=False,
         tree_method="hist", n_estimators=16,
         objective=tm.softprob_obj(10),
         eval_metric=merror,