Deprecate use_label_encoder in XGBClassifier. (#7822)
* Deprecate `use_label_encoder` in XGBClassifier.
* We have removed the encoder; now prepare to remove the indicator.
parent 5815df4c46
commit 52d4eda786
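For orientation before the hunks: after this change `use_label_encoder` is still accepted as a keyword argument, but it defaults to `None`, any explicit value triggers a deprecation warning, and `True` is rejected outright because the encoder itself was already removed. The following is a minimal sketch of the resulting behaviour, not code from the patch; the breast-cancer dataset and the `n_estimators=4` value are arbitrary choices for illustration, and the eager validation in `__init__` is taken from the `sklearn.py` hunk further down.

    import warnings
    import xgboost as xgb
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True)

    # Preferred usage after this change: simply omit the flag.
    clf = xgb.XGBClassifier(n_estimators=4)
    clf.fit(X, y)

    # Passing any explicit value still works but emits a deprecation warning.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        xgb.XGBClassifier(use_label_encoder=False)
    assert any("use_label_encoder" in str(w.message) for w in caught)

    # Asking for the removed label encoder is rejected with a ValueError.
    try:
        xgb.XGBClassifier(use_label_encoder=True)
    except ValueError:
        pass

In other words, callers that still pass `use_label_encoder=False` keep working for now and only need to drop the argument to silence the warning.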
@@ -63,7 +63,6 @@ def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]:

     params = {
         "tree_method": "gpu_hist",
-        "use_label_encoder": False,
         "n_estimators": 32,
         "colsample_bylevel": 0.7,
     }
@@ -14,13 +14,13 @@ def training_continuation(tmpdir: str, use_pickle: bool) -> None:
     """Basic training continuation."""
     # Train 128 iterations in 1 session
     X, y = load_breast_cancer(return_X_y=True)
-    clf = xgboost.XGBClassifier(n_estimators=128, use_label_encoder=False)
+    clf = xgboost.XGBClassifier(n_estimators=128)
     clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
     print("Total boosted rounds:", clf.get_booster().num_boosted_rounds())

     # Train 128 iterations in 2 sessions, with the first one runs for 32 iterations and
     # the second one runs for 96 iterations
-    clf = xgboost.XGBClassifier(n_estimators=32, use_label_encoder=False)
+    clf = xgboost.XGBClassifier(n_estimators=32)
     clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
     assert clf.get_booster().num_boosted_rounds() == 32

@@ -54,14 +54,14 @@ def training_continuation_early_stop(tmpdir: str, use_pickle: bool) -> None:
     n_estimators = 512

     X, y = load_breast_cancer(return_X_y=True)
-    clf = xgboost.XGBClassifier(n_estimators=n_estimators, use_label_encoder=False)
+    clf = xgboost.XGBClassifier(n_estimators=n_estimators)
     clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss", callbacks=[early_stop])
     print("Total boosted rounds:", clf.get_booster().num_boosted_rounds())
     best = clf.best_iteration

     # Train 512 iterations in 2 sessions, with the first one runs for 128 iterations and
     # the second one runs until early stop.
-    clf = xgboost.XGBClassifier(n_estimators=128, use_label_encoder=False)
+    clf = xgboost.XGBClassifier(n_estimators=128)
     # Reinitialize the early stop callback
     early_stop = xgboost.callback.EarlyStopping(
         rounds=early_stopping_rounds, save_best=True
@@ -79,15 +79,13 @@ def training_continuation_early_stop(tmpdir: str, use_pickle: bool) -> None:
     else:
         path = os.path.join(tmpdir, "model-first-128.json")
         clf.save_model(path)
-        loaded = xgboost.XGBClassifier(use_label_encoder=False)
+        loaded = xgboost.XGBClassifier()
         loaded.load_model(path)

     early_stop = xgboost.callback.EarlyStopping(
         rounds=early_stopping_rounds, save_best=True
     )
-    clf = xgboost.XGBClassifier(
-        n_estimators=n_estimators - 128, use_label_encoder=False
-    )
+    clf = xgboost.XGBClassifier(n_estimators=n_estimators - 128)
     clf.fit(
         X,
         y,
@@ -35,7 +35,7 @@ def native_interface():
 def sklearn_interface():
     X_train, y_train = load_svmlight_file(train)
     X_test, y_test = load_svmlight_file(test)
-    clf = xgb.XGBClassifier(n_estimators=3, max_depth=2, eta=1, use_label_encoder=False)
+    clf = xgb.XGBClassifier(n_estimators=3, max_depth=2, eta=1)
     clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
     assert clf.n_classes_ == 2

@@ -36,9 +36,7 @@ parameter ``enable_categorical``:
 .. code:: python

   # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
-  clf = xgb.XGBClassifier(
-      tree_method="gpu_hist", enable_categorical=True, use_label_encoder=False
-  )
+  clf = xgb.XGBClassifier(tree_method="gpu_hist", enable_categorical=True)
   # X is the dataframe we created in previous snippet
   clf.fit(X, y)
   # Must use JSON/UBJSON for serialization, otherwise the information is lost.
@@ -1304,13 +1304,15 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         self,
         *,
         objective: _SklObjective = "binary:logistic",
-        use_label_encoder: bool = False,
+        use_label_encoder: Optional[bool] = None,
         **kwargs: Any
     ) -> None:
         # must match the parameters for `get_params`
         self.use_label_encoder = use_label_encoder
         if use_label_encoder is True:
             raise ValueError("Label encoder was removed in 1.6.")
+        if use_label_encoder is not None:
+            warnings.warn("`use_label_encoder` is deprecated in 2.0.0.")
         super().__init__(objective=objective, **kwargs)

     @_deprecate_positional_args
@@ -152,16 +152,16 @@ class TestTrainingContinuation:
     def test_changed_parameter(self):
         from sklearn.datasets import load_breast_cancer
         X, y = load_breast_cancer(return_X_y=True)
-        clf = xgb.XGBClassifier(n_estimators=2, use_label_encoder=False)
+        clf = xgb.XGBClassifier(n_estimators=2)
         clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
         assert tm.non_increasing(clf.evals_result()["validation_0"]["logloss"])

         with tempfile.TemporaryDirectory() as tmpdir:
             clf.save_model(os.path.join(tmpdir, "clf.json"))
-            loaded = xgb.XGBClassifier(use_label_encoder=False)
+            loaded = xgb.XGBClassifier()
             loaded.load_model(os.path.join(tmpdir, "clf.json"))

-        clf = xgb.XGBClassifier(n_estimators=2, use_label_encoder=False)
+        clf = xgb.XGBClassifier(n_estimators=2)
         # change metric to error
         clf.fit(X, y, eval_set=[(X, y)], eval_metric="error")
         assert tm.non_increasing(clf.evals_result()["validation_0"]["error"])
@@ -777,9 +777,7 @@ def run_empty_dmatrix_auc(client: "Client", tree_method: str, n_workers: int) ->
     valid_X = dd.from_array(valid_X_, chunksize=n_samples)
     valid_y = dd.from_array(valid_y_, chunksize=n_samples)

-    cls = xgb.dask.DaskXGBClassifier(
-        tree_method=tree_method, n_estimators=2, use_label_encoder=False
-    )
+    cls = xgb.dask.DaskXGBClassifier(tree_method=tree_method, n_estimators=2)
    cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)])

     # multiclass
@@ -808,9 +806,7 @@ def run_empty_dmatrix_auc(client: "Client", tree_method: str, n_workers: int) ->
     valid_X = dd.from_array(valid_X_, chunksize=n_samples)
     valid_y = dd.from_array(valid_y_, chunksize=n_samples)

-    cls = xgb.dask.DaskXGBClassifier(
-        tree_method=tree_method, n_estimators=2, use_label_encoder=False
-    )
+    cls = xgb.dask.DaskXGBClassifier(tree_method=tree_method, n_estimators=2)
     cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)])

@@ -837,14 +833,10 @@ def run_auc(client: "Client", tree_method: str) -> None:
     valid_X = dd.from_array(valid_X_, chunksize=10)
     valid_y = dd.from_array(valid_y_, chunksize=10)

-    cls = xgb.XGBClassifier(
-        tree_method=tree_method, n_estimators=2, use_label_encoder=False
-    )
+    cls = xgb.XGBClassifier(tree_method=tree_method, n_estimators=2)
     cls.fit(X_, y_, eval_metric="auc", eval_set=[(valid_X_, valid_y_)])

-    dcls = xgb.dask.DaskXGBClassifier(
-        tree_method=tree_method, n_estimators=2, use_label_encoder=False
-    )
+    dcls = xgb.dask.DaskXGBClassifier(tree_method=tree_method, n_estimators=2)
     dcls.fit(X, y, eval_metric="auc", eval_set=[(valid_X, valid_y)])

     approx = dcls.evals_result()["validation_0"]["auc"]
@@ -1693,7 +1685,6 @@ def test_parallel_submits(client: "Client") -> None:
             verbosity=1,
             n_estimators=i + 1,
             eval_metric="merror",
-            use_label_encoder=False,
         )
         f = client.submit(cls.fit, X, y, pure=False)
         futures.append(f)
@@ -1786,7 +1777,6 @@ def test_parallel_submit_multi_clients() -> None:
             verbosity=1,
             n_estimators=i + 1,
             eval_metric="merror",
-            use_label_encoder=False,
         )
         f = client.submit(cls.fit, X, y, pure=False)
         futures.append((client, f))
@@ -774,13 +774,12 @@ def save_load_model(model_path):
     X = digits['data']
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf.split(X, y):
-        xgb_model = xgb.XGBClassifier(use_label_encoder=False).fit(X[train_index], y[train_index])
+        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
         xgb_model.save_model(model_path)

         xgb_model = xgb.XGBClassifier()
         xgb_model.load_model(model_path)

-        assert xgb_model.use_label_encoder is False
         assert isinstance(xgb_model.classes_, np.ndarray)
         assert isinstance(xgb_model._Booster, xgb.Booster)

@@ -972,8 +971,8 @@ def test_deprecate_position_arg():
         model.fit(X, y, w)

     with pytest.warns(FutureWarning):
-        xgb.XGBClassifier(1, use_label_encoder=False)
-    model = xgb.XGBClassifier(n_estimators=1, use_label_encoder=False)
+        xgb.XGBClassifier(1)
+    model = xgb.XGBClassifier(n_estimators=1)
     with pytest.warns(FutureWarning):
         model.fit(X, y, w)

@@ -990,9 +989,6 @@ def test_deprecate_position_arg():
     with pytest.warns(FutureWarning):
         model.fit(X, y, w)

-    with pytest.raises(ValueError):
-        xgb.XGBRFClassifier(1, use_label_encoder=True)
-
     model = xgb.XGBRFClassifier(n_estimators=1)
     with pytest.warns(FutureWarning):
         model.fit(X, y, w)
@@ -1334,7 +1330,6 @@ def test_evaluation_metric():
     X, y = load_digits(n_class=10, return_X_y=True)

     clf = xgb.XGBClassifier(
-        use_label_encoder=False,
         tree_method="hist",
         eval_metric=merror,
         n_estimators=16,
@@ -1344,7 +1339,6 @@ def test_evaluation_metric():
     custom = clf.evals_result()

     clf = xgb.XGBClassifier(
-        use_label_encoder=False,
         tree_method="hist",
         eval_metric="merror",
         n_estimators=16,
@@ -1360,7 +1354,6 @@ def test_evaluation_metric():
     )

     clf = xgb.XGBRFClassifier(
-        use_label_encoder=False,
         tree_method="hist", n_estimators=16,
         objective=tm.softprob_obj(10),
         eval_metric=merror,