initial merge
This commit is contained in:
@@ -324,7 +324,7 @@ class EarlyStopping(TrainingCallback):
|
||||
|
||||
es = xgboost.callback.EarlyStopping(
|
||||
rounds=2,
|
||||
abs_tol=1e-3,
|
||||
min_delta=1e-3,
|
||||
save_best=True,
|
||||
maximize=False,
|
||||
data_name="validation_0",
|
||||
|
||||
@@ -312,6 +312,19 @@ __model_doc = f"""
|
||||
needs to be set to have categorical feature support. See :doc:`Categorical Data
|
||||
</tutorials/categorical>` and :ref:`cat-param` for details.
|
||||
|
||||
multi_strategy : Optional[str]
|
||||
|
||||
.. versionadded:: 2.0.0
|
||||
|
||||
.. note:: This parameter is a work in progress.
|
||||
|
||||
The strategy used for training multi-target models, including multi-target
|
||||
regression and multi-class classification. See :doc:`/tutorials/multioutput` for
|
||||
more information.
|
||||
|
||||
- ``one_output_per_tree``: One model for each target.
|
||||
- ``multi_output_tree``: Use multi-target trees.
|
||||
|
||||
eval_metric : Optional[Union[str, List[str], Callable]]
|
||||
|
||||
.. versionadded:: 1.6.0
|
||||
@@ -355,18 +368,21 @@ __model_doc = f"""
|
||||
|
||||
.. versionadded:: 1.6.0
|
||||
|
||||
Activates early stopping. Validation metric needs to improve at least once in
|
||||
every **early_stopping_rounds** round(s) to continue training. Requires at least
|
||||
one item in **eval_set** in :py:meth:`fit`.
|
||||
- Activates early stopping. Validation metric needs to improve at least once in
|
||||
every **early_stopping_rounds** round(s) to continue training. Requires at
|
||||
least one item in **eval_set** in :py:meth:`fit`.
|
||||
|
||||
The method returns the model from the last iteration (not the best one). If
|
||||
there's more than one item in **eval_set**, the last entry will be used for early
|
||||
stopping. If there's more than one metric in **eval_metric**, the last metric
|
||||
will be used for early stopping.
|
||||
- The method returns the model from the last iteration, not the best one. Use a
|
||||
callback :py:class:`xgboost.callback.EarlyStopping` if returning the best
|
||||
model is preferred.
|
||||
|
||||
If early stopping occurs, the model will have three additional fields:
|
||||
:py:attr:`best_score`, :py:attr:`best_iteration` and
|
||||
:py:attr:`best_ntree_limit`.
|
||||
- If there's more than one item in **eval_set**, the last entry will be used for
|
||||
early stopping. If there's more than one metric in **eval_metric**, the last
|
||||
metric will be used for early stopping.
|
||||
|
||||
- If early stopping occurs, the model will have three additional fields:
|
||||
:py:attr:`best_score`, :py:attr:`best_iteration` and
|
||||
:py:attr:`best_ntree_limit`.
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -466,7 +482,9 @@ Parameters
|
||||
doc.extend([get_doc(i) for i in items])
|
||||
if end_note:
|
||||
doc.append(end_note)
|
||||
full_doc = [header + "\n\n"]
|
||||
full_doc = [
|
||||
header + "\nSee :doc:`/python/sklearn_estimator` for more information.\n"
|
||||
]
|
||||
full_doc.extend(doc)
|
||||
cls.__doc__ = "".join(full_doc)
|
||||
return cls
|
||||
@@ -624,6 +642,7 @@ class XGBModel(XGBModelBase):
|
||||
feature_types: Optional[FeatureTypes] = None,
|
||||
max_cat_to_onehot: Optional[int] = None,
|
||||
max_cat_threshold: Optional[int] = None,
|
||||
multi_strategy: Optional[str] = None,
|
||||
eval_metric: Optional[Union[str, List[str], Callable]] = None,
|
||||
early_stopping_rounds: Optional[int] = None,
|
||||
callbacks: Optional[List[TrainingCallback]] = None,
|
||||
@@ -670,6 +689,7 @@ class XGBModel(XGBModelBase):
|
||||
self.feature_types = feature_types
|
||||
self.max_cat_to_onehot = max_cat_to_onehot
|
||||
self.max_cat_threshold = max_cat_threshold
|
||||
self.multi_strategy = multi_strategy
|
||||
self.eval_metric = eval_metric
|
||||
self.early_stopping_rounds = early_stopping_rounds
|
||||
self.callbacks = callbacks
|
||||
@@ -1131,10 +1151,10 @@ class XGBModel(XGBModelBase):
|
||||
base_margin: Optional[ArrayLike] = None,
|
||||
iteration_range: Optional[Tuple[int, int]] = None,
|
||||
) -> ArrayLike:
|
||||
"""Predict with `X`. If the model is trained with early stopping, then `best_iteration`
|
||||
is used automatically. For tree models, when data is on GPU, like cupy array or
|
||||
cuDF dataframe and `predictor` is not specified, the prediction is run on GPU
|
||||
automatically, otherwise it will run on CPU.
|
||||
"""Predict with `X`. If the model is trained with early stopping, then
|
||||
:py:attr:`best_iteration` is used automatically. For tree models, when data is
|
||||
on GPU, like cupy array or cuDF dataframe and `predictor` is not specified, the
|
||||
prediction is run on GPU automatically, otherwise it will run on CPU.
|
||||
|
||||
.. note:: This function is only thread safe for `gbtree` and `dart`.
|
||||
|
||||
@@ -1209,8 +1229,8 @@ class XGBModel(XGBModelBase):
|
||||
ntree_limit: int = 0,
|
||||
iteration_range: Optional[Tuple[int, int]] = None,
|
||||
) -> np.ndarray:
|
||||
"""Return the predicted leaf every tree for each sample. If the model is trained with
|
||||
early stopping, then `best_iteration` is used automatically.
|
||||
"""Return the predicted leaf every tree for each sample. If the model is trained
|
||||
with early stopping, then :py:attr:`best_iteration` is used automatically.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -1620,7 +1640,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
|
||||
base_margin: Optional[ArrayLike] = None,
|
||||
iteration_range: Optional[Tuple[int, int]] = None,
|
||||
) -> np.ndarray:
|
||||
"""Predict the probability of each `X` example being of a given class.
|
||||
"""Predict the probability of each `X` example being of a given class. If the
|
||||
model is trained with early stopping, then :py:attr:`best_iteration` is used
|
||||
automatically.
|
||||
|
||||
.. note:: This function is only thread safe for `gbtree` and `dart`.
|
||||
|
||||
@@ -1646,6 +1668,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
|
||||
prediction :
|
||||
a numpy array of shape (n_samples, n_classes) with the
|
||||
probability of each data example being of a given class.
|
||||
|
||||
"""
|
||||
# custom obj: Do nothing as we don't know what to do.
|
||||
# softprob: Do nothing, output is proba.
|
||||
@@ -2107,11 +2130,13 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
|
||||
return super().apply(X, ntree_limit, iteration_range)
|
||||
|
||||
def score(self, X: ArrayLike, y: ArrayLike) -> float:
|
||||
"""Evaluate score for data using the last evaluation metric.
|
||||
"""Evaluate score for data using the last evaluation metric. If the model is
|
||||
trained with early stopping, then :py:attr:`best_iteration` is used
|
||||
automatically.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : pd.DataFrame|cudf.DataFrame
|
||||
X : Union[pd.DataFrame, cudf.DataFrame]
|
||||
Feature matrix. A DataFrame with a special `qid` column.
|
||||
|
||||
y :
|
||||
|
||||
@@ -10,7 +10,6 @@ import os
|
||||
import platform
|
||||
import socket
|
||||
import sys
|
||||
import zipfile
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from contextlib import contextmanager
|
||||
from io import StringIO
|
||||
@@ -28,7 +27,6 @@ from typing import (
|
||||
TypedDict,
|
||||
Union,
|
||||
)
|
||||
from urllib import request
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
@@ -37,6 +35,13 @@ from scipy import sparse
|
||||
import xgboost as xgb
|
||||
from xgboost.core import ArrayLike
|
||||
from xgboost.sklearn import SklObjective
|
||||
from xgboost.testing.data import (
|
||||
get_california_housing,
|
||||
get_cancer,
|
||||
get_digits,
|
||||
get_sparse,
|
||||
memory,
|
||||
)
|
||||
|
||||
hypothesis = pytest.importorskip("hypothesis")
|
||||
|
||||
@@ -44,13 +49,8 @@ hypothesis = pytest.importorskip("hypothesis")
|
||||
from hypothesis import strategies
|
||||
from hypothesis.extra.numpy import arrays
|
||||
|
||||
joblib = pytest.importorskip("joblib")
|
||||
datasets = pytest.importorskip("sklearn.datasets")
|
||||
|
||||
Memory = joblib.Memory
|
||||
|
||||
memory = Memory("./cachedir", verbose=0)
|
||||
|
||||
PytestSkip = TypedDict("PytestSkip", {"condition": bool, "reason": str})
|
||||
|
||||
|
||||
@@ -352,137 +352,6 @@ class TestDataset:
|
||||
return self.name
|
||||
|
||||
|
||||
@memory.cache
|
||||
def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
|
||||
data = datasets.fetch_california_housing()
|
||||
return data.data, data.target
|
||||
|
||||
|
||||
@memory.cache
|
||||
def get_digits() -> Tuple[np.ndarray, np.ndarray]:
|
||||
data = datasets.load_digits()
|
||||
return data.data, data.target
|
||||
|
||||
|
||||
@memory.cache
|
||||
def get_cancer() -> Tuple[np.ndarray, np.ndarray]:
|
||||
return datasets.load_breast_cancer(return_X_y=True)
|
||||
|
||||
|
||||
@memory.cache
|
||||
def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
|
||||
rng = np.random.RandomState(199)
|
||||
n = 2000
|
||||
sparsity = 0.75
|
||||
X, y = datasets.make_regression(n, random_state=rng)
|
||||
flag = rng.binomial(1, sparsity, X.shape)
|
||||
for i in range(X.shape[0]):
|
||||
for j in range(X.shape[1]):
|
||||
if flag[i, j]:
|
||||
X[i, j] = np.nan
|
||||
return X, y
|
||||
|
||||
|
||||
@memory.cache
|
||||
def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
|
||||
"""
|
||||
Number of samples: 1460
|
||||
Number of features: 20
|
||||
Number of categorical features: 10
|
||||
Number of numerical features: 10
|
||||
"""
|
||||
from sklearn.datasets import fetch_openml
|
||||
|
||||
X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
|
||||
|
||||
categorical_columns_subset: List[str] = [
|
||||
"BldgType", # 5 cats, no nan
|
||||
"GarageFinish", # 3 cats, nan
|
||||
"LotConfig", # 5 cats, no nan
|
||||
"Functional", # 7 cats, no nan
|
||||
"MasVnrType", # 4 cats, nan
|
||||
"HouseStyle", # 8 cats, no nan
|
||||
"FireplaceQu", # 5 cats, nan
|
||||
"ExterCond", # 5 cats, no nan
|
||||
"ExterQual", # 4 cats, no nan
|
||||
"PoolQC", # 3 cats, nan
|
||||
]
|
||||
|
||||
numerical_columns_subset: List[str] = [
|
||||
"3SsnPorch",
|
||||
"Fireplaces",
|
||||
"BsmtHalfBath",
|
||||
"HalfBath",
|
||||
"GarageCars",
|
||||
"TotRmsAbvGrd",
|
||||
"BsmtFinSF1",
|
||||
"BsmtFinSF2",
|
||||
"GrLivArea",
|
||||
"ScreenPorch",
|
||||
]
|
||||
|
||||
X = X[categorical_columns_subset + numerical_columns_subset]
|
||||
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
|
||||
return X, y
|
||||
|
||||
|
||||
@memory.cache
|
||||
def get_mq2008(
|
||||
dpath: str,
|
||||
) -> Tuple[
|
||||
sparse.csr_matrix,
|
||||
np.ndarray,
|
||||
np.ndarray,
|
||||
sparse.csr_matrix,
|
||||
np.ndarray,
|
||||
np.ndarray,
|
||||
sparse.csr_matrix,
|
||||
np.ndarray,
|
||||
np.ndarray,
|
||||
]:
|
||||
from sklearn.datasets import load_svmlight_files
|
||||
|
||||
src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip"
|
||||
target = dpath + "/MQ2008.zip"
|
||||
if not os.path.exists(target):
|
||||
request.urlretrieve(url=src, filename=target)
|
||||
|
||||
with zipfile.ZipFile(target, "r") as f:
|
||||
f.extractall(path=dpath)
|
||||
|
||||
(
|
||||
x_train,
|
||||
y_train,
|
||||
qid_train,
|
||||
x_test,
|
||||
y_test,
|
||||
qid_test,
|
||||
x_valid,
|
||||
y_valid,
|
||||
qid_valid,
|
||||
) = load_svmlight_files(
|
||||
(
|
||||
dpath + "MQ2008/Fold1/train.txt",
|
||||
dpath + "MQ2008/Fold1/test.txt",
|
||||
dpath + "MQ2008/Fold1/vali.txt",
|
||||
),
|
||||
query_id=True,
|
||||
zero_based=False,
|
||||
)
|
||||
|
||||
return (
|
||||
x_train,
|
||||
y_train,
|
||||
qid_train,
|
||||
x_test,
|
||||
y_test,
|
||||
qid_test,
|
||||
x_valid,
|
||||
y_valid,
|
||||
qid_valid,
|
||||
)
|
||||
|
||||
|
||||
# pylint: disable=too-many-arguments,too-many-locals
|
||||
@memory.cache
|
||||
def make_categorical(
|
||||
@@ -737,20 +606,7 @@ _unweighted_datasets_strategy = strategies.sampled_from(
|
||||
TestDataset(
|
||||
"calif_housing-l1", get_california_housing, "reg:absoluteerror", "mae"
|
||||
),
|
||||
TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
|
||||
TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
|
||||
TestDataset(
|
||||
"mtreg",
|
||||
lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
|
||||
"reg:squarederror",
|
||||
"rmse",
|
||||
),
|
||||
TestDataset(
|
||||
"mtreg-l1",
|
||||
lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
|
||||
"reg:absoluteerror",
|
||||
"mae",
|
||||
),
|
||||
TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"),
|
||||
TestDataset("sparse-l1", get_sparse, "reg:absoluteerror", "mae"),
|
||||
TestDataset(
|
||||
@@ -763,37 +619,71 @@ _unweighted_datasets_strategy = strategies.sampled_from(
|
||||
)
|
||||
|
||||
|
||||
@strategies.composite
|
||||
def _dataset_weight_margin(draw: Callable) -> TestDataset:
|
||||
data: TestDataset = draw(_unweighted_datasets_strategy)
|
||||
if draw(strategies.booleans()):
|
||||
data.w = draw(
|
||||
arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))
|
||||
)
|
||||
if draw(strategies.booleans()):
|
||||
num_class = 1
|
||||
if data.objective == "multi:softmax":
|
||||
num_class = int(np.max(data.y) + 1)
|
||||
elif data.name.startswith("mtreg"):
|
||||
num_class = data.y.shape[1]
|
||||
def make_datasets_with_margin(
|
||||
unweighted_strategy: strategies.SearchStrategy,
|
||||
) -> Callable:
|
||||
"""Factory function for creating strategies that generates datasets with weight and
|
||||
base margin.
|
||||
|
||||
data.margin = draw(
|
||||
arrays(
|
||||
np.float64,
|
||||
(data.y.shape[0] * num_class),
|
||||
elements=strategies.floats(0.5, 1.0),
|
||||
"""
|
||||
|
||||
@strategies.composite
|
||||
def weight_margin(draw: Callable) -> TestDataset:
|
||||
data: TestDataset = draw(unweighted_strategy)
|
||||
if draw(strategies.booleans()):
|
||||
data.w = draw(
|
||||
arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))
|
||||
)
|
||||
)
|
||||
assert data.margin is not None
|
||||
if num_class != 1:
|
||||
data.margin = data.margin.reshape(data.y.shape[0], num_class)
|
||||
if draw(strategies.booleans()):
|
||||
num_class = 1
|
||||
if data.objective == "multi:softmax":
|
||||
num_class = int(np.max(data.y) + 1)
|
||||
elif data.name.startswith("mtreg"):
|
||||
num_class = data.y.shape[1]
|
||||
|
||||
return data
|
||||
data.margin = draw(
|
||||
arrays(
|
||||
np.float64,
|
||||
(data.y.shape[0] * num_class),
|
||||
elements=strategies.floats(0.5, 1.0),
|
||||
)
|
||||
)
|
||||
assert data.margin is not None
|
||||
if num_class != 1:
|
||||
data.margin = data.margin.reshape(data.y.shape[0], num_class)
|
||||
|
||||
return data
|
||||
|
||||
return weight_margin
|
||||
|
||||
|
||||
# A strategy for drawing from a set of example datasets
|
||||
# May add random weights to the dataset
|
||||
dataset_strategy = _dataset_weight_margin()
|
||||
# A strategy for drawing from a set of example datasets. May add random weights to the
|
||||
# dataset
|
||||
dataset_strategy = make_datasets_with_margin(_unweighted_datasets_strategy)()
|
||||
|
||||
|
||||
_unweighted_multi_datasets_strategy = strategies.sampled_from(
|
||||
[
|
||||
TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
|
||||
TestDataset(
|
||||
"mtreg",
|
||||
lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
|
||||
"reg:squarederror",
|
||||
"rmse",
|
||||
),
|
||||
TestDataset(
|
||||
"mtreg-l1",
|
||||
lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
|
||||
"reg:absoluteerror",
|
||||
"mae",
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
# A strategy for drawing from a set of multi-target/multi-class datasets.
|
||||
multi_dataset_strategy = make_datasets_with_margin(
|
||||
_unweighted_multi_datasets_strategy
|
||||
)()
|
||||
|
||||
|
||||
def non_increasing(L: Sequence[float], tolerance: float = 1e-4) -> bool:
|
||||
|
||||
@@ -1,10 +1,20 @@
|
||||
"""Utilities for data generation."""
|
||||
from typing import Any, Generator, Tuple, Union
|
||||
import os
|
||||
import zipfile
|
||||
from typing import Any, Generator, List, Tuple, Union
|
||||
from urllib import request
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.random import Generator as RNG
|
||||
from scipy import sparse
|
||||
|
||||
import xgboost
|
||||
from xgboost.data import pandas_pyarrow_mapper
|
||||
|
||||
joblib = pytest.importorskip("joblib")
|
||||
memory = joblib.Memory("./cachedir", verbose=0)
|
||||
|
||||
|
||||
def np_dtypes(
|
||||
n_samples: int, n_features: int
|
||||
@@ -179,3 +189,154 @@ def pd_arrow_dtypes() -> Generator:
|
||||
dtype=pd.ArrowDtype(pa.bool_()),
|
||||
)
|
||||
yield orig, df
|
||||
|
||||
|
||||
def check_inf(rng: RNG) -> None:
    """Check that the matrix constructors reject input containing ``inf``.

    An 8x4 random feature matrix is built from `rng`, one entry is overwritten with
    ``np.inf``, and both ``QuantileDMatrix`` and ``DMatrix`` are expected to raise a
    ``ValueError`` mentioning the ``inf`` value.

    Parameters
    ----------
    rng :
        Random generator used to draw the features and the labels.
    """
    features = rng.random(size=32).reshape(8, 4)
    labels = rng.random(size=8)
    features[5, 2] = np.inf

    # QuantileDMatrix is checked first, then DMatrix.
    for make_matrix in (xgboost.QuantileDMatrix, xgboost.DMatrix):
        with pytest.raises(ValueError, match="Input data contains `inf`"):
            make_matrix(features, labels)
|
||||
|
||||
|
||||
@memory.cache
def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
    """Load the California housing dataset through scikit-learn.

    ``sklearn.datasets`` is imported with ``pytest.importorskip``, so the running test
    is skipped when scikit-learn is unavailable.

    Returns
    -------
    The ``data`` and ``target`` attributes of the fetched dataset, in that order.
    """
    sk_datasets = pytest.importorskip("sklearn.datasets")
    housing = sk_datasets.fetch_california_housing()
    features, target = housing.data, housing.target
    return features, target
|
||||
|
||||
|
||||
@memory.cache
def get_digits() -> Tuple[np.ndarray, np.ndarray]:
    """Load the digits classification dataset through scikit-learn.

    ``sklearn.datasets`` is imported with ``pytest.importorskip``, so the running test
    is skipped when scikit-learn is unavailable.

    Returns
    -------
    The ``data`` and ``target`` attributes of the loaded dataset, in that order.
    """
    sk_datasets = pytest.importorskip("sklearn.datasets")
    digits = sk_datasets.load_digits()
    features, target = digits.data, digits.target
    return features, target
|
||||
|
||||
|
||||
@memory.cache
def get_cancer() -> Tuple[np.ndarray, np.ndarray]:
    """Load the breast cancer dataset through scikit-learn.

    ``sklearn.datasets`` is imported with ``pytest.importorskip``, so the running test
    is skipped when scikit-learn is unavailable.

    Returns
    -------
    Whatever ``load_breast_cancer(return_X_y=True)`` yields, passed through unchanged.
    """
    sk_datasets = pytest.importorskip("sklearn.datasets")
    features_and_labels = sk_datasets.load_breast_cancer(return_X_y=True)
    return features_and_labels
|
||||
|
||||
|
||||
@memory.cache
def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
    """Generate a regression dataset in which most feature values are missing.

    A 2000-sample regression problem is created with
    ``sklearn.datasets.make_regression`` using a ``RandomState`` seeded with 199.
    Every feature cell is then independently replaced by ``np.nan`` with probability
    0.75, drawing from the same random state.

    ``sklearn.datasets`` is imported with ``pytest.importorskip``, so the running test
    is skipped when scikit-learn is unavailable.

    Returns
    -------
    The feature matrix (containing NaN for the dropped cells) and the target vector.
    """
    datasets = pytest.importorskip("sklearn.datasets")
    rng = np.random.RandomState(199)
    n_samples = 2000
    sparsity = 0.75
    X, y = datasets.make_regression(n_samples, random_state=rng)
    # One Bernoulli(sparsity) draw per cell; a 1 marks the cell as missing.  A single
    # boolean-mask assignment writes the same cells as a per-element Python loop.
    missing = rng.binomial(1, sparsity, X.shape).astype(bool)
    X[missing] = np.nan
    return X, y
|
||||
|
||||
|
||||
@memory.cache
def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
    """Fetch a 20-column subset of the Ames housing dataset from OpenML.

    Number of samples: 1460
    Number of features: 20
    Number of categorical features: 10
    Number of numerical features: 10

    ``sklearn.datasets`` is imported with ``pytest.importorskip``, so the running test
    is skipped when scikit-learn is unavailable.

    Returns
    -------
    The selected feature columns, with the categorical ones converted to the
    ``category`` dtype, and the target.

    NOTE(review): the data is requested with ``as_frame=True``, so the returned
    objects are presumably pandas containers rather than the ``np.ndarray`` named in
    the annotation -- confirm before relying on the annotation.
    """
    datasets = pytest.importorskip("sklearn.datasets")
    X, y = datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)

    categorical_columns_subset: List[str] = [
        "BldgType",  # 5 cats, no nan
        "GarageFinish",  # 3 cats, nan
        "LotConfig",  # 5 cats, no nan
        "Functional",  # 7 cats, no nan
        "MasVnrType",  # 4 cats, nan
        "HouseStyle",  # 8 cats, no nan
        "FireplaceQu",  # 5 cats, nan
        "ExterCond",  # 5 cats, no nan
        "ExterQual",  # 4 cats, no nan
        "PoolQC",  # 3 cats, nan
    ]

    numerical_columns_subset: List[str] = [
        "3SsnPorch",
        "Fireplaces",
        "BsmtHalfBath",
        "HalfBath",
        "GarageCars",
        "TotRmsAbvGrd",
        "BsmtFinSF1",
        "BsmtFinSF2",
        "GrLivArea",
        "ScreenPorch",
    ]

    # Take an explicit copy of the column subset: assigning into a frame obtained by
    # indexing another frame is the chained-assignment pattern pandas warns about.
    X = X[categorical_columns_subset + numerical_columns_subset].copy()
    X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
    return X, y
|
||||
|
||||
|
||||
@memory.cache
def get_mq2008(
    dpath: str,
) -> Tuple[
    sparse.csr_matrix,
    np.ndarray,
    np.ndarray,
    sparse.csr_matrix,
    np.ndarray,
    np.ndarray,
    sparse.csr_matrix,
    np.ndarray,
    np.ndarray,
]:
    """Download, unpack and load fold 1 of the MQ2008 dataset.

    The zip archive is downloaded into `dpath` only when it is not already there; it
    is extracted into `dpath` on every call.  The train, test and validation files of
    ``Fold1`` are then read with ``sklearn.datasets.load_svmlight_files``.

    ``sklearn.datasets`` is imported with ``pytest.importorskip``, so the running test
    is skipped when scikit-learn is unavailable.

    Parameters
    ----------
    dpath :
        Directory that holds the archive and receives the extracted files.

    Returns
    -------
    Nine values: ``(x, y, qid)`` for the train split, then for the test split, then
    for the validation split.
    """
    datasets = pytest.importorskip("sklearn.datasets")
    archive_url = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip"
    archive_path = os.path.join(dpath, "MQ2008.zip")
    if not os.path.exists(archive_path):
        request.urlretrieve(url=archive_url, filename=archive_path)

    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(path=dpath)

    # Order matters: the loader returns the splits in the order the files are given.
    fold_files = (
        os.path.join(dpath, "MQ2008/Fold1/train.txt"),
        os.path.join(dpath, "MQ2008/Fold1/test.txt"),
        os.path.join(dpath, "MQ2008/Fold1/vali.txt"),
    )
    (
        x_train,
        y_train,
        qid_train,
        x_test,
        y_test,
        qid_test,
        x_valid,
        y_valid,
        qid_valid,
    ) = datasets.load_svmlight_files(fold_files, query_id=True, zero_based=False)

    train_split = (x_train, y_train, qid_train)
    test_split = (x_test, y_test, qid_test)
    valid_split = (x_valid, y_valid, qid_valid)
    return train_split + test_split + valid_split
|
||||
|
||||
@@ -4,8 +4,8 @@ from typing import cast
|
||||
|
||||
import pytest
|
||||
|
||||
hypothesis = pytest.importorskip("hypothesis")
|
||||
from hypothesis import strategies # pylint:disable=wrong-import-position
|
||||
strategies = pytest.importorskip("hypothesis.strategies")
|
||||
|
||||
|
||||
exact_parameter_strategy = strategies.fixed_dictionaries(
|
||||
{
|
||||
@@ -41,6 +41,26 @@ hist_parameter_strategy = strategies.fixed_dictionaries(
|
||||
and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide")
|
||||
)
|
||||
|
||||
# Search space for the hist tree method when training multi-target / multi-class
# models; ``multi_strategy`` is sampled alongside the usual tree parameters.
hist_multi_parameter_strategy = strategies.fixed_dictionaries(
    {
        "max_depth": strategies.integers(1, 11),
        "max_leaves": strategies.integers(0, 1024),
        "max_bin": strategies.integers(2, 512),
        "multi_strategy": strategies.sampled_from(
            ["multi_output_tree", "one_output_per_tree"]
        ),
        "grow_policy": strategies.sampled_from(["lossguide", "depthwise"]),
        "min_child_weight": strategies.floats(0.5, 2.0),
        # We cannot enable subsampling as the training loss can increase
        # 'subsample': strategies.floats(0.5, 1.0),
        "colsample_bytree": strategies.floats(0.5, 1.0),
        "colsample_bylevel": strategies.floats(0.5, 1.0),
    }
).filter(
    # Keep a draw when the depth is bounded, or when the tree is bounded by leaf count
    # and grown loss-guided.
    lambda params: cast(int, params["max_depth"]) > 0
    or (
        cast(int, params["max_leaves"]) > 0
        and params["grow_policy"] == "lossguide"
    )
)
|
||||
|
||||
cat_parameter_strategy = strategies.fixed_dictionaries(
|
||||
{
|
||||
"max_cat_to_onehot": strategies.integers(1, 128),
|
||||
|
||||
@@ -48,7 +48,12 @@ def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:
|
||||
def neg_mse(*args: Any, **kwargs: Any) -> float:
|
||||
return -float(mean_squared_error(*args, **kwargs))
|
||||
|
||||
ranker = xgb.XGBRanker(n_estimators=3, eval_metric=neg_mse, tree_method=tree_method)
|
||||
ranker = xgb.XGBRanker(
|
||||
n_estimators=3,
|
||||
eval_metric=neg_mse,
|
||||
tree_method=tree_method,
|
||||
disable_default_eval_metric=True,
|
||||
)
|
||||
ranker.fit(df, y, eval_set=[(valid_df, y)])
|
||||
score = ranker.score(valid_df, y)
|
||||
assert np.isclose(score, ranker.evals_result()["validation_0"]["neg_mse"][-1])
|
||||
|
||||
Reference in New Issue
Block a user