initial merge

2023-03-25 04:31:55 +01:00
parent d97be6f396 cff50fe3ef
commit 7fbc561e17
146 changed files with 6730 additions and 4082 deletions
--- a/python-package/xgboost/callback.py
+++ b/python-package/xgboost/callback.py
@@ -324,7 +324,7 @@ class EarlyStopping(TrainingCallback):

            es = xgboost.callback.EarlyStopping(
                rounds=2,
-                abs_tol=1e-3,
+                min_delta=1e-3,
                save_best=True,
                maximize=False,
                data_name="validation_0",
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -312,6 +312,19 @@ __model_doc = f"""
        needs to be set to have categorical feature support. See :doc:`Categorical Data
        </tutorials/categorical>` and :ref:`cat-param` for details.

+    multi_strategy : Optional[str]
+
+        .. versionadded:: 2.0.0
+
+        .. note:: This parameter is a work in progress.
+
+        The strategy used for training multi-target models, including multi-target
+        regression and multi-class classification. See :doc:`/tutorials/multioutput` for
+        more information.
+
+        - ``one_output_per_tree``: One model for each target.
+        - ``multi_output_tree``:  Use multi-target trees.
+
    eval_metric : Optional[Union[str, List[str], Callable]]

        .. versionadded:: 1.6.0
@@ -355,18 +368,21 @@ __model_doc = f"""

        .. versionadded:: 1.6.0

-        Activates early stopping. Validation metric needs to improve at least once in
-        every **early_stopping_rounds** round(s) to continue training.  Requires at least
-        one item in **eval_set** in :py:meth:`fit`.
+        - Activates early stopping. Validation metric needs to improve at least once in
+          every **early_stopping_rounds** round(s) to continue training.  Requires at
+          least one item in **eval_set** in :py:meth:`fit`.

-        The method returns the model from the last iteration (not the best one).  If
-        there's more than one item in **eval_set**, the last entry will be used for early
-        stopping.  If there's more than one metric in **eval_metric**, the last metric
-        will be used for early stopping.
+        - The method returns the model from the last iteration, not the best one. Use a
+          callback :py:class:`xgboost.callback.EarlyStopping` if returning the best
+          model is preferred.

-        If early stopping occurs, the model will have three additional fields:
-        :py:attr:`best_score`, :py:attr:`best_iteration` and
-        :py:attr:`best_ntree_limit`.
+        - If there's more than one item in **eval_set**, the last entry will be used for
+          early stopping.  If there's more than one metric in **eval_metric**, the last
+          metric will be used for early stopping.
+
+        - If early stopping occurs, the model will have three additional fields:
+          :py:attr:`best_score`, :py:attr:`best_iteration` and
+          :py:attr:`best_ntree_limit`.

        .. note::

@@ -466,7 +482,9 @@ Parameters
        doc.extend([get_doc(i) for i in items])
        if end_note:
            doc.append(end_note)
-        full_doc = [header + "\n\n"]
+        full_doc = [
+            header + "\nSee :doc:`/python/sklearn_estimator` for more information.\n"
+        ]
        full_doc.extend(doc)
        cls.__doc__ = "".join(full_doc)
        return cls
@@ -624,6 +642,7 @@ class XGBModel(XGBModelBase):
        feature_types: Optional[FeatureTypes] = None,
        max_cat_to_onehot: Optional[int] = None,
        max_cat_threshold: Optional[int] = None,
+        multi_strategy: Optional[str] = None,
        eval_metric: Optional[Union[str, List[str], Callable]] = None,
        early_stopping_rounds: Optional[int] = None,
        callbacks: Optional[List[TrainingCallback]] = None,
@@ -670,6 +689,7 @@ class XGBModel(XGBModelBase):
        self.feature_types = feature_types
        self.max_cat_to_onehot = max_cat_to_onehot
        self.max_cat_threshold = max_cat_threshold
+        self.multi_strategy = multi_strategy
        self.eval_metric = eval_metric
        self.early_stopping_rounds = early_stopping_rounds
        self.callbacks = callbacks
@@ -1131,10 +1151,10 @@ class XGBModel(XGBModelBase):
        base_margin: Optional[ArrayLike] = None,
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> ArrayLike:
-        """Predict with `X`.  If the model is trained with early stopping, then `best_iteration`
-        is used automatically.  For tree models, when data is on GPU, like cupy array or
-        cuDF dataframe and `predictor` is not specified, the prediction is run on GPU
-        automatically, otherwise it will run on CPU.
+        """Predict with `X`.  If the model is trained with early stopping, then
+        :py:attr:`best_iteration` is used automatically.  For tree models, when data is
+        on GPU, like cupy array or cuDF dataframe and `predictor` is not specified, the
+        prediction is run on GPU automatically, otherwise it will run on CPU.

        .. note:: This function is only thread safe for `gbtree` and `dart`.

@@ -1209,8 +1229,8 @@ class XGBModel(XGBModelBase):
        ntree_limit: int = 0,
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> np.ndarray:
-        """Return the predicted leaf every tree for each sample. If the model is trained with
-        early stopping, then `best_iteration` is used automatically.
+        """Return the predicted leaf of every tree for each sample. If the model is
+        trained with early stopping, :py:attr:`best_iteration` is used automatically.

        Parameters
        ----------
@@ -1620,7 +1640,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
        base_margin: Optional[ArrayLike] = None,
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> np.ndarray:
-        """Predict the probability of each `X` example being of a given class.
+        """Predict the probability of each `X` example being of a given class. If the
+        model is trained with early stopping, then :py:attr:`best_iteration` is used
+        automatically.

        .. note:: This function is only thread safe for `gbtree` and `dart`.

@@ -1646,6 +1668,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
        prediction :
            a numpy array of shape array-like of shape (n_samples, n_classes) with the
            probability of each data example being of a given class.
+
        """
        # custom obj:      Do nothing as we don't know what to do.
        # softprob:        Do nothing, output is proba.
@@ -2107,11 +2130,13 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
        return super().apply(X, ntree_limit, iteration_range)

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
-        """Evaluate score for data using the last evaluation metric.
+        """Evaluate score for data using the last evaluation metric. If the model is
+        trained with early stopping, then :py:attr:`best_iteration` is used
+        automatically.

        Parameters
        ----------
-        X : pd.DataFrame|cudf.DataFrame
+        X : Union[pd.DataFrame, cudf.DataFrame]
          Feature matrix. A DataFrame with a special `qid` column.

        y :
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -10,7 +10,6 @@ import os
 import platform
 import socket
 import sys
-import zipfile
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from io import StringIO
@@ -28,7 +27,6 @@ from typing import (
    TypedDict,
    Union,
 )
-from urllib import request

 import numpy as np
 import pytest
@@ -37,6 +35,13 @@ from scipy import sparse
 import xgboost as xgb
 from xgboost.core import ArrayLike
 from xgboost.sklearn import SklObjective
+from xgboost.testing.data import (
+    get_california_housing,
+    get_cancer,
+    get_digits,
+    get_sparse,
+    memory,
+)

 hypothesis = pytest.importorskip("hypothesis")

@@ -44,13 +49,8 @@ hypothesis = pytest.importorskip("hypothesis")
 from hypothesis import strategies
 from hypothesis.extra.numpy import arrays

-joblib = pytest.importorskip("joblib")
 datasets = pytest.importorskip("sklearn.datasets")

-Memory = joblib.Memory
-
-memory = Memory("./cachedir", verbose=0)
-
 PytestSkip = TypedDict("PytestSkip", {"condition": bool, "reason": str})


@@ -352,137 +352,6 @@ class TestDataset:
        return self.name


-@memory.cache
-def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
-    data = datasets.fetch_california_housing()
-    return data.data, data.target
-
-
-@memory.cache
-def get_digits() -> Tuple[np.ndarray, np.ndarray]:
-    data = datasets.load_digits()
-    return data.data, data.target
-
-
-@memory.cache
-def get_cancer() -> Tuple[np.ndarray, np.ndarray]:
-    return datasets.load_breast_cancer(return_X_y=True)
-
-
-@memory.cache
-def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
-    rng = np.random.RandomState(199)
-    n = 2000
-    sparsity = 0.75
-    X, y = datasets.make_regression(n, random_state=rng)
-    flag = rng.binomial(1, sparsity, X.shape)
-    for i in range(X.shape[0]):
-        for j in range(X.shape[1]):
-            if flag[i, j]:
-                X[i, j] = np.nan
-    return X, y
-
-
-@memory.cache
-def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
-    """
-    Number of samples: 1460
-    Number of features: 20
-    Number of categorical features: 10
-    Number of numerical features: 10
-    """
-    from sklearn.datasets import fetch_openml
-
-    X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
-
-    categorical_columns_subset: List[str] = [
-        "BldgType",  # 5 cats, no nan
-        "GarageFinish",  # 3 cats, nan
-        "LotConfig",  # 5 cats, no nan
-        "Functional",  # 7 cats, no nan
-        "MasVnrType",  # 4 cats, nan
-        "HouseStyle",  # 8 cats, no nan
-        "FireplaceQu",  # 5 cats, nan
-        "ExterCond",  # 5 cats, no nan
-        "ExterQual",  # 4 cats, no nan
-        "PoolQC",  # 3 cats, nan
-    ]
-
-    numerical_columns_subset: List[str] = [
-        "3SsnPorch",
-        "Fireplaces",
-        "BsmtHalfBath",
-        "HalfBath",
-        "GarageCars",
-        "TotRmsAbvGrd",
-        "BsmtFinSF1",
-        "BsmtFinSF2",
-        "GrLivArea",
-        "ScreenPorch",
-    ]
-
-    X = X[categorical_columns_subset + numerical_columns_subset]
-    X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
-    return X, y
-
-
-@memory.cache
-def get_mq2008(
-    dpath: str,
-) -> Tuple[
-    sparse.csr_matrix,
-    np.ndarray,
-    np.ndarray,
-    sparse.csr_matrix,
-    np.ndarray,
-    np.ndarray,
-    sparse.csr_matrix,
-    np.ndarray,
-    np.ndarray,
-]:
-    from sklearn.datasets import load_svmlight_files
-
-    src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip"
-    target = dpath + "/MQ2008.zip"
-    if not os.path.exists(target):
-        request.urlretrieve(url=src, filename=target)
-
-    with zipfile.ZipFile(target, "r") as f:
-        f.extractall(path=dpath)
-
-    (
-        x_train,
-        y_train,
-        qid_train,
-        x_test,
-        y_test,
-        qid_test,
-        x_valid,
-        y_valid,
-        qid_valid,
-    ) = load_svmlight_files(
-        (
-            dpath + "MQ2008/Fold1/train.txt",
-            dpath + "MQ2008/Fold1/test.txt",
-            dpath + "MQ2008/Fold1/vali.txt",
-        ),
-        query_id=True,
-        zero_based=False,
-    )
-
-    return (
-        x_train,
-        y_train,
-        qid_train,
-        x_test,
-        y_test,
-        qid_test,
-        x_valid,
-        y_valid,
-        qid_valid,
-    )
-
-
 # pylint: disable=too-many-arguments,too-many-locals
@memory.cache
 def make_categorical(
@@ -737,20 +606,7 @@ _unweighted_datasets_strategy = strategies.sampled_from(
        TestDataset(
            "calif_housing-l1", get_california_housing, "reg:absoluteerror", "mae"
        ),
-        TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
        TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
-        TestDataset(
-            "mtreg",
-            lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
-            "reg:squarederror",
-            "rmse",
-        ),
-        TestDataset(
-            "mtreg-l1",
-            lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
-            "reg:absoluteerror",
-            "mae",
-        ),
        TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"),
        TestDataset("sparse-l1", get_sparse, "reg:absoluteerror", "mae"),
        TestDataset(
@@ -763,37 +619,71 @@ _unweighted_datasets_strategy = strategies.sampled_from(
 )


-@strategies.composite
-def _dataset_weight_margin(draw: Callable) -> TestDataset:
-    data: TestDataset = draw(_unweighted_datasets_strategy)
-    if draw(strategies.booleans()):
-        data.w = draw(
-            arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))
-        )
-    if draw(strategies.booleans()):
-        num_class = 1
-        if data.objective == "multi:softmax":
-            num_class = int(np.max(data.y) + 1)
-        elif data.name.startswith("mtreg"):
-            num_class = data.y.shape[1]
+def make_datasets_with_margin(
+    unweighted_strategy: strategies.SearchStrategy,
+) -> Callable:
+    """Factory function for creating strategies that generate datasets with weight and
+    base margin.

-        data.margin = draw(
-            arrays(
-                np.float64,
-                (data.y.shape[0] * num_class),
-                elements=strategies.floats(0.5, 1.0),
+    """
+
+    @strategies.composite
+    def weight_margin(draw: Callable) -> TestDataset:
+        data: TestDataset = draw(unweighted_strategy)
+        if draw(strategies.booleans()):
+            data.w = draw(
+                arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))
            )
-        )
-        assert data.margin is not None
-        if num_class != 1:
-            data.margin = data.margin.reshape(data.y.shape[0], num_class)
+        if draw(strategies.booleans()):
+            num_class = 1
+            if data.objective == "multi:softmax":
+                num_class = int(np.max(data.y) + 1)
+            elif data.name.startswith("mtreg"):
+                num_class = data.y.shape[1]

-    return data
+            data.margin = draw(
+                arrays(
+                    np.float64,
+                    (data.y.shape[0] * num_class),
+                    elements=strategies.floats(0.5, 1.0),
+                )
+            )
+            assert data.margin is not None
+            if num_class != 1:
+                data.margin = data.margin.reshape(data.y.shape[0], num_class)
+
+        return data
+
+    return weight_margin


-# A strategy for drawing from a set of example datasets
-# May add random weights to the dataset
-dataset_strategy = _dataset_weight_margin()
+# A strategy for drawing from a set of example datasets. May add random weights to the
+# dataset
+dataset_strategy = make_datasets_with_margin(_unweighted_datasets_strategy)()
+
+
+_unweighted_multi_datasets_strategy = strategies.sampled_from(
+    [
+        TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
+        TestDataset(
+            "mtreg",
+            lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
+            "reg:squarederror",
+            "rmse",
+        ),
+        TestDataset(
+            "mtreg-l1",
+            lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
+            "reg:absoluteerror",
+            "mae",
+        ),
+    ]
+)
+
+# A strategy for drawing from a set of multi-target/multi-class datasets.
+multi_dataset_strategy = make_datasets_with_margin(
+    _unweighted_multi_datasets_strategy
+)()


 def non_increasing(L: Sequence[float], tolerance: float = 1e-4) -> bool:
--- a/python-package/xgboost/testing/data.py
+++ b/python-package/xgboost/testing/data.py
@@ -1,10 +1,20 @@
 """Utilities for data generation."""
-from typing import Any, Generator, Tuple, Union
+import os
+import zipfile
+from typing import Any, Generator, List, Tuple, Union
+from urllib import request

 import numpy as np
+import pytest
+from numpy.random import Generator as RNG
+from scipy import sparse

+import xgboost
 from xgboost.data import pandas_pyarrow_mapper

+joblib = pytest.importorskip("joblib")
+memory = joblib.Memory("./cachedir", verbose=0)
+

 def np_dtypes(
    n_samples: int, n_features: int
@@ -179,3 +189,154 @@ def pd_arrow_dtypes() -> Generator:
        dtype=pd.ArrowDtype(pa.bool_()),
    )
    yield orig, df
+
+
+def check_inf(rng: RNG) -> None:
+    """Validate there's no inf in X."""
+    X = rng.random(size=32).reshape(8, 4)
+    y = rng.random(size=8)
+    X[5, 2] = np.inf
+
+    with pytest.raises(ValueError, match="Input data contains `inf`"):
+        xgboost.QuantileDMatrix(X, y)
+
+    with pytest.raises(ValueError, match="Input data contains `inf`"):
+        xgboost.DMatrix(X, y)
+
+
+@memory.cache
+def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
+    """Fetch the California housing dataset from sklearn."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    data = datasets.fetch_california_housing()
+    return data.data, data.target
+
+
+@memory.cache
+def get_digits() -> Tuple[np.ndarray, np.ndarray]:
+    """Fetch the digits dataset from sklearn."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    data = datasets.load_digits()
+    return data.data, data.target
+
+
+@memory.cache
+def get_cancer() -> Tuple[np.ndarray, np.ndarray]:
+    """Fetch the breast cancer dataset from sklearn."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    return datasets.load_breast_cancer(return_X_y=True)
+
+
+@memory.cache
+def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
+    """Generate a sparse dataset."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    rng = np.random.RandomState(199)
+    n = 2000
+    sparsity = 0.75
+    X, y = datasets.make_regression(n, random_state=rng)
+    flag = rng.binomial(1, sparsity, X.shape)
+    for i in range(X.shape[0]):
+        for j in range(X.shape[1]):
+            if flag[i, j]:
+                X[i, j] = np.nan
+    return X, y
+
+
+@memory.cache
+def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Number of samples: 1460
+    Number of features: 20
+    Number of categorical features: 10
+    Number of numerical features: 10
+    """
+    datasets = pytest.importorskip("sklearn.datasets")
+    X, y = datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
+
+    categorical_columns_subset: List[str] = [
+        "BldgType",  # 5 cats, no nan
+        "GarageFinish",  # 3 cats, nan
+        "LotConfig",  # 5 cats, no nan
+        "Functional",  # 7 cats, no nan
+        "MasVnrType",  # 4 cats, nan
+        "HouseStyle",  # 8 cats, no nan
+        "FireplaceQu",  # 5 cats, nan
+        "ExterCond",  # 5 cats, no nan
+        "ExterQual",  # 4 cats, no nan
+        "PoolQC",  # 3 cats, nan
+    ]
+
+    numerical_columns_subset: List[str] = [
+        "3SsnPorch",
+        "Fireplaces",
+        "BsmtHalfBath",
+        "HalfBath",
+        "GarageCars",
+        "TotRmsAbvGrd",
+        "BsmtFinSF1",
+        "BsmtFinSF2",
+        "GrLivArea",
+        "ScreenPorch",
+    ]
+
+    X = X[categorical_columns_subset + numerical_columns_subset]
+    X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
+    return X, y
+
+
+@memory.cache
+def get_mq2008(
+    dpath: str,
+) -> Tuple[
+    sparse.csr_matrix,
+    np.ndarray,
+    np.ndarray,
+    sparse.csr_matrix,
+    np.ndarray,
+    np.ndarray,
+    sparse.csr_matrix,
+    np.ndarray,
+    np.ndarray,
+]:
+    """Fetch the mq2008 dataset."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip"
+    target = os.path.join(dpath, "MQ2008.zip")
+    if not os.path.exists(target):
+        request.urlretrieve(url=src, filename=target)
+
+    with zipfile.ZipFile(target, "r") as f:
+        f.extractall(path=dpath)
+
+    (
+        x_train,
+        y_train,
+        qid_train,
+        x_test,
+        y_test,
+        qid_test,
+        x_valid,
+        y_valid,
+        qid_valid,
+    ) = datasets.load_svmlight_files(
+        (
+            os.path.join(dpath, "MQ2008/Fold1/train.txt"),
+            os.path.join(dpath, "MQ2008/Fold1/test.txt"),
+            os.path.join(dpath, "MQ2008/Fold1/vali.txt"),
+        ),
+        query_id=True,
+        zero_based=False,
+    )
+
+    return (
+        x_train,
+        y_train,
+        qid_train,
+        x_test,
+        y_test,
+        qid_test,
+        x_valid,
+        y_valid,
+        qid_valid,
+    )
--- a/python-package/xgboost/testing/params.py
+++ b/python-package/xgboost/testing/params.py
@@ -4,8 +4,8 @@ from typing import cast

 import pytest

-hypothesis = pytest.importorskip("hypothesis")
-from hypothesis import strategies  # pylint:disable=wrong-import-position
+strategies = pytest.importorskip("hypothesis.strategies")
+

 exact_parameter_strategy = strategies.fixed_dictionaries(
    {
@@ -41,6 +41,26 @@ hist_parameter_strategy = strategies.fixed_dictionaries(
    and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide")
 )

+hist_multi_parameter_strategy = strategies.fixed_dictionaries(
+    {
+        "max_depth": strategies.integers(1, 11),
+        "max_leaves": strategies.integers(0, 1024),
+        "max_bin": strategies.integers(2, 512),
+        "multi_strategy": strategies.sampled_from(
+            ["multi_output_tree", "one_output_per_tree"]
+        ),
+        "grow_policy": strategies.sampled_from(["lossguide", "depthwise"]),
+        "min_child_weight": strategies.floats(0.5, 2.0),
+        # We cannot enable subsampling as the training loss can increase
+        # 'subsample': strategies.floats(0.5, 1.0),
+        "colsample_bytree": strategies.floats(0.5, 1.0),
+        "colsample_bylevel": strategies.floats(0.5, 1.0),
+    }
+).filter(
+    lambda x: (cast(int, x["max_depth"]) > 0 or cast(int, x["max_leaves"]) > 0)
+    and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide")
+)
+
 cat_parameter_strategy = strategies.fixed_dictionaries(
    {
        "max_cat_to_onehot": strategies.integers(1, 128),
--- a/python-package/xgboost/testing/ranking.py
+++ b/python-package/xgboost/testing/ranking.py
@@ -48,7 +48,12 @@ def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:
    def neg_mse(*args: Any, **kwargs: Any) -> float:
        return -float(mean_squared_error(*args, **kwargs))

-    ranker = xgb.XGBRanker(n_estimators=3, eval_metric=neg_mse, tree_method=tree_method)
+    ranker = xgb.XGBRanker(
+        n_estimators=3,
+        eval_metric=neg_mse,
+        tree_method=tree_method,
+        disable_default_eval_metric=True,
+    )
    ranker.fit(df, y, eval_set=[(valid_df, y)])
    score = ranker.score(valid_df, y)
    assert np.isclose(score, ranker.evals_result()["validation_0"]["neg_mse"][-1])