merge v2.0.3 from upstream

Hui Liu
2024-01-25 07:40:06 -08:00
31 changed files with 542 additions and 163 deletions

View File

@@ -7,7 +7,7 @@ build-backend = "packager.pep517"
 [project]
 name = "xgboost"
-version = "2.0.1"
+version = "2.0.3"
 authors = [
     { name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" },
     { name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }

View File

@@ -1 +1 @@
-2.0.1
+2.0.3

View File

@@ -206,6 +206,7 @@ def _load_lib() -> ctypes.CDLL:
             lib = ctypes.cdll.LoadLibrary(lib_path)
             setattr(lib, "path", os.path.normpath(lib_path))
             lib_success = True
+            break
         except OSError as e:
             os_error_list.append(str(e))
             continue
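
Note: the added break stops the shared-library search at the first candidate that loads; before, the loop kept going after a success, so a later failing path could still append spurious entries to os_error_list. A minimal sketch of the first-match pattern (illustrative names, not the exact core.py code):

import ctypes
import os

def load_first_available(lib_paths):
    """Return the first shared library that loads, else raise with all errors."""
    errors = []
    for lib_path in lib_paths:
        try:
            lib = ctypes.cdll.LoadLibrary(lib_path)
            setattr(lib, "path", os.path.normpath(lib_path))
            return lib  # first success wins, like the added `break` above
        except OSError as exc:
            errors.append(str(exc))
    raise RuntimeError("no loadable library candidate:\n" + "\n".join(errors))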

View File

@@ -78,7 +78,6 @@ from .data import _is_cudf_ser, _is_cupy_array
 from .sklearn import (
     XGBClassifier,
     XGBClassifierBase,
-    XGBClassifierMixIn,
     XGBModel,
     XGBRanker,
     XGBRankerMixIn,
@@ -1854,7 +1853,7 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
     "Implementation of the scikit-learn API for XGBoost classification.",
     ["estimators", "model"],
 )
-class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBase):
+class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
     # pylint: disable=missing-class-docstring
     async def _fit_async(
         self,
@@ -2036,10 +2035,6 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBase):
         preds = da.map_blocks(_argmax, pred_probs, drop_axis=1)
         return preds

-    def load_model(self, fname: ModelIn) -> None:
-        super().load_model(fname)
-        self._load_model_attributes(self.get_booster())

 @xgboost_model_doc(
     """Implementation of the Scikit-Learn API for XGBoost Ranking.

View File

@@ -43,19 +43,6 @@ from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array, _is_pandas_df
 from .training import train

-class XGBClassifierMixIn:  # pylint: disable=too-few-public-methods
-    """MixIn for classification."""
-
-    def __init__(self, *args: Any, **kwargs: Any) -> None:
-        super().__init__(*args, **kwargs)
-
-    def _load_model_attributes(self, booster: Booster) -> None:
-        config = json.loads(booster.save_config())
-        self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"])
-        # binary classification is treated as regression in XGBoost.
-        self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_

 class XGBRankerMixIn:  # pylint: disable=too-few-public-methods
     """MixIn for ranking, defines the _estimator_type usually defined in scikit-learn
     base classes.
@@ -845,21 +832,38 @@ class XGBModel(XGBModelBase):
         self.get_booster().load_model(fname)
         meta_str = self.get_booster().attr("scikit_learn")
-        if meta_str is None:
-            return
-
-        meta = json.loads(meta_str)
-        t = meta.get("_estimator_type", None)
-        if t is not None and t != self._get_type():
-            raise TypeError(
-                "Loading an estimator with different type. Expecting: "
-                f"{self._get_type()}, got: {t}"
-            )
-        self.feature_types = self.get_booster().feature_types
+        if meta_str is not None:
+            meta = json.loads(meta_str)
+            t = meta.get("_estimator_type", None)
+            if t is not None and t != self._get_type():
+                raise TypeError(
+                    "Loading an estimator with different type. Expecting: "
+                    f"{self._get_type()}, got: {t}"
+                )
+
         self.get_booster().set_attr(scikit_learn=None)
+        config = json.loads(self.get_booster().save_config())
+        self._load_model_attributes(config)

     load_model.__doc__ = f"""{Booster.load_model.__doc__}"""

+    def _load_model_attributes(self, config: dict) -> None:
+        """Load model attributes without hyper-parameters."""
+        from sklearn.base import is_classifier
+
+        booster = self.get_booster()
+
+        self.objective = config["learner"]["objective"]["name"]
+        self.booster = config["learner"]["gradient_booster"]["name"]
+        self.base_score = config["learner"]["learner_model_param"]["base_score"]
+        self.feature_types = booster.feature_types
+
+        if is_classifier(self):
+            self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"])
+            # binary classification is treated as regression in XGBoost.
+            self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_
+
     # pylint: disable=too-many-branches
     def _configure_fit(
         self,
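
With this hunk, scikit-learn attributes (objective, booster, base_score, feature_types, and n_classes_ for classifiers) are rebuilt from the booster's saved config on every load_model call, which is why the classifier-specific load_model overrides below can be deleted. A minimal round-trip sketch of the resulting behavior ("clf.json" is an illustrative file name):

import numpy as np
import xgboost as xgb

X = np.random.default_rng(0).normal(size=(100, 4))
y = (X[:, 0] > 0).astype(int)
xgb.XGBClassifier(n_estimators=2).fit(X, y).save_model("clf.json")

clf = xgb.XGBClassifier()
clf.load_model("clf.json")
# Restored from the saved config by XGBModel._load_model_attributes,
# with no XGBClassifier-specific override needed:
print(clf.n_classes_, clf.objective)  # -> 2 binary:logistic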
@@ -1409,7 +1413,7 @@ def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) -> PredtT:
         Number of boosting rounds.
 """,
 )
-class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
+class XGBClassifier(XGBModel, XGBClassifierBase):
     # pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes
     @_deprecate_positional_args
     def __init__(
@@ -1637,10 +1641,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
     def classes_(self) -> np.ndarray:
         return np.arange(self.n_classes_)

-    def load_model(self, fname: ModelIn) -> None:
-        super().load_model(fname)
-        self._load_model_attributes(self.get_booster())

 @xgboost_model_doc(
     "scikit-learn API for XGBoost random forest classification.",
@@ -2093,7 +2093,17 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
         """
         X, qid = _get_qid(X, None)
-        Xyq = DMatrix(X, y, qid=qid)
+        # fixme(jiamingy): base margin and group weight is not yet supported. We might
+        # need to make extra special fields in the dataframe.
+        Xyq = DMatrix(
+            X,
+            y,
+            qid=qid,
+            missing=self.missing,
+            enable_categorical=self.enable_categorical,
+            nthread=self.n_jobs,
+            feature_types=self.feature_types,
+        )
         if callable(self.eval_metric):
            metric = ltr_metric_decorator(self.eval_metric, self.n_jobs)
            result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric)
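
Because the evaluation DMatrix now carries the estimator's missing, enable_categorical, n_jobs, and feature_types settings, XGBRanker.score accepts the same categorical input that fit does (previously it built a bare DMatrix and failed on categorical frames). A small sketch of the behavior this enables, with synthetic data and illustrative sizes:

import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.default_rng(0)
X = pd.DataFrame(
    {
        "f0": rng.normal(size=120),
        "cat": pd.Categorical(rng.choice(list("abc"), size=120)),
        "qid": np.sort(rng.choice(3, size=120)),
    }
)
y = rng.integers(0, 4, size=120)  # relevance labels

ltr = xgb.XGBRanker(enable_categorical=True, n_estimators=4)
ltr.fit(X, y)           # the special "qid" column supplies the group ids
print(ltr.score(X, y))  # eval DMatrix now inherits enable_categorical=True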

View File

@@ -75,3 +75,28 @@ def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:
     with pytest.raises(ValueError, match="Either `group` or `qid`."):
         ranker.fit(df, y, eval_set=[(X, y)])

+
+def run_ranking_categorical(device: str) -> None:
+    """Test LTR with categorical features."""
+    from sklearn.model_selection import cross_val_score
+
+    X, y = tm.make_categorical(
+        n_samples=512, n_features=10, n_categories=3, onehot=False
+    )
+    rng = np.random.default_rng(1994)
+    qid = rng.choice(3, size=y.shape[0])
+    qid = np.sort(qid)
+    X["qid"] = qid
+
+    ltr = xgb.XGBRanker(enable_categorical=True, device=device)
+    ltr.fit(X, y)
+    score = ltr.score(X, y)
+    assert score > 0.9
+
+    ltr = xgb.XGBRanker(enable_categorical=True, device=device)
+    # test using the score function inside sklearn.
+    scores = cross_val_score(ltr, X, y)
+    for s in scores:
+        assert s > 0.7