Remove ntree limit in python package. (#8345)

- Remove `ntree_limit`. The parameter has been deprecated since 1.4.0.
- Compatibility with the SHAP package is broken by this change.
This commit is contained in:
Jiaming Yuan 2023-03-31 19:01:55 +08:00 committed by GitHub
parent b647403baa
commit bac22734fb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 284 additions and 357 deletions

View File

@ -23,13 +23,7 @@ from typing import (
import numpy import numpy
from . import collective from . import collective
from .core import ( from .core import Booster, DMatrix, XGBoostError, _parse_eval_str
Booster,
DMatrix,
XGBoostError,
_get_booster_layer_trees,
_parse_eval_str,
)
__all__ = [ __all__ = [
"TrainingCallback", "TrainingCallback",
@ -177,22 +171,14 @@ class CallbackContainer:
assert isinstance(model, Booster), msg assert isinstance(model, Booster), msg
if not self.is_cv: if not self.is_cv:
num_parallel_tree, _ = _get_booster_layer_trees(model)
if model.attr("best_score") is not None: if model.attr("best_score") is not None:
model.best_score = float(cast(str, model.attr("best_score"))) model.best_score = float(cast(str, model.attr("best_score")))
model.best_iteration = int(cast(str, model.attr("best_iteration"))) model.best_iteration = int(cast(str, model.attr("best_iteration")))
# num_class is handled internally
model.set_attr(
best_ntree_limit=str((model.best_iteration + 1) * num_parallel_tree)
)
model.best_ntree_limit = int(cast(str, model.attr("best_ntree_limit")))
else: else:
# Due to compatibility with version older than 1.4, these attributes are # Due to compatibility with version older than 1.4, these attributes are
# added to Python object even if early stopping is not used. # added to Python object even if early stopping is not used.
model.best_iteration = model.num_boosted_rounds() - 1 model.best_iteration = model.num_boosted_rounds() - 1
model.set_attr(best_iteration=str(model.best_iteration)) model.set_attr(best_iteration=str(model.best_iteration))
model.best_ntree_limit = (model.best_iteration + 1) * num_parallel_tree
model.set_attr(best_ntree_limit=str(model.best_ntree_limit))
return model return model

View File

@ -126,25 +126,6 @@ def _parse_eval_str(result: str) -> List[Tuple[str, float]]:
IterRange = TypeVar("IterRange", Optional[Tuple[int, int]], Tuple[int, int]) IterRange = TypeVar("IterRange", Optional[Tuple[int, int]], Tuple[int, int])
def _convert_ntree_limit(
    booster: "Booster", ntree_limit: Optional[int], iteration_range: IterRange
) -> IterRange:
    """Translate the deprecated ``ntree_limit`` argument into an iteration range.

    Parameters
    ----------
    booster :
        Model handed to ``_get_booster_layer_trees`` to look up
        ``num_parallel_tree``.  Only consulted when ``ntree_limit`` is specified.
    ntree_limit :
        Deprecated limit expressed in number of trees.  ``None`` and ``0`` both
        mean "not specified".
    iteration_range :
        Range supplied by the caller.

    Returns
    -------
    The caller's ``iteration_range`` unchanged when ``ntree_limit`` is not
    specified, otherwise ``(0, ntree_limit // num_parallel_tree)``.

    Raises
    ------
    ValueError
        If ``ntree_limit`` is specified and ``iteration_range`` is also given
        with a non-zero end.
    """
    # Nothing to convert: the caller did not use the deprecated parameter.
    if ntree_limit is None or ntree_limit == 0:
        return iteration_range

    warnings.warn(
        "ntree_limit is deprecated, use `iteration_range` or model "
        "slicing instead.",
        UserWarning,
    )
    if iteration_range is not None and iteration_range[1] != 0:
        raise ValueError(
            "Only one of `iteration_range` and `ntree_limit` can be non zero."
        )
    num_parallel_tree, _ = _get_booster_layer_trees(booster)
    # The helper reports 0 for gblinear; clamp to 1 to avoid dividing by zero.
    num_parallel_tree = max(num_parallel_tree, 1)
    # ntree_limit counts trees, iteration_range counts boosting iterations.
    iteration_range = (0, ntree_limit // num_parallel_tree)
    return iteration_range
def _expect(expectations: Sequence[Type], got: Type) -> str: def _expect(expectations: Sequence[Type], got: Type) -> str:
"""Translate input error into string. """Translate input error into string.
@ -1508,41 +1489,6 @@ Objective = Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]]
Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]] Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]]
def _get_booster_layer_trees(model: "Booster") -> Tuple[int, int]:
    """Get number of trees added to booster per-iteration.  This function will be
    removed once `best_ntree_limit` is dropped in favor of `best_iteration`.

    Parameters
    ----------
    model :
        Object whose ``save_config()`` returns the JSON configuration string.

    Returns
    -------
    ``(num_parallel_tree, num_groups)``.  ``num_parallel_tree`` is 0 for
    ``gblinear``; ``num_groups`` is read from the ``num_class`` model parameter.

    Raises
    ------
    ValueError
        If the configured booster name is not ``gblinear``, ``dart`` or ``gbtree``.
    """
    config = json.loads(model.save_config())
    learner = config["learner"]
    gradient_booster = learner["gradient_booster"]
    booster = gradient_booster["name"]

    if booster == "gblinear":
        num_parallel_tree = 0
    elif booster == "dart":
        # dart nests the tree booster configuration under the "gbtree" key.
        num_parallel_tree = int(
            gradient_booster["gbtree"]["gbtree_model_param"]["num_parallel_tree"]
        )
    elif booster == "gbtree":
        try:
            num_parallel_tree = int(
                gradient_booster["gbtree_model_param"]["num_parallel_tree"]
            )
        except KeyError:
            # Fall back to the training parameters when the model parameters do
            # not carry the key (presumably a different config layout -- confirm).
            num_parallel_tree = int(
                gradient_booster["gbtree_train_param"]["num_parallel_tree"]
            )
    else:
        raise ValueError(f"Unknown booster: {booster}")

    num_groups = int(learner["learner_model_param"]["num_class"])
    return num_parallel_tree, num_groups
def _configure_metrics(params: BoosterParam) -> BoosterParam: def _configure_metrics(params: BoosterParam) -> BoosterParam:
if ( if (
isinstance(params, dict) isinstance(params, dict)
@ -1576,11 +1522,11 @@ class Booster:
""" """
Parameters Parameters
---------- ----------
params : dict params :
Parameters for boosters. Parameters for boosters.
cache : list cache :
List of cache items. List of cache items.
model_file : string/os.PathLike/Booster/bytearray model_file :
Path to the model file if it's string or PathLike. Path to the model file if it's string or PathLike.
""" """
cache = cache if cache is not None else [] cache = cache if cache is not None else []
@ -2100,7 +2046,6 @@ class Booster:
self, self,
data: DMatrix, data: DMatrix,
output_margin: bool = False, output_margin: bool = False,
ntree_limit: int = 0,
pred_leaf: bool = False, pred_leaf: bool = False,
pred_contribs: bool = False, pred_contribs: bool = False,
approx_contribs: bool = False, approx_contribs: bool = False,
@ -2127,9 +2072,6 @@ class Booster:
output_margin : output_margin :
Whether to output the raw untransformed margin value. Whether to output the raw untransformed margin value.
ntree_limit :
Deprecated, use `iteration_range` instead.
pred_leaf : pred_leaf :
When this option is on, the output will be a matrix of (nsample, When this option is on, the output will be a matrix of (nsample,
ntrees) with each record indicating the predicted leaf index of ntrees) with each record indicating the predicted leaf index of
@ -2196,7 +2138,6 @@ class Booster:
raise TypeError("Expecting data to be a DMatrix object, got: ", type(data)) raise TypeError("Expecting data to be a DMatrix object, got: ", type(data))
if validate_features: if validate_features:
self._validate_dmatrix_features(data) self._validate_dmatrix_features(data)
iteration_range = _convert_ntree_limit(self, ntree_limit, iteration_range)
args = { args = {
"type": 0, "type": 0,
"training": training, "training": training,
@ -2522,8 +2463,6 @@ class Booster:
self.best_iteration = int(self.attr("best_iteration")) # type: ignore self.best_iteration = int(self.attr("best_iteration")) # type: ignore
if self.attr("best_score") is not None: if self.attr("best_score") is not None:
self.best_score = float(self.attr("best_score")) # type: ignore self.best_score = float(self.attr("best_score")) # type: ignore
if self.attr("best_ntree_limit") is not None:
self.best_ntree_limit = int(self.attr("best_ntree_limit")) # type: ignore
def num_boosted_rounds(self) -> int: def num_boosted_rounds(self) -> int:
"""Get number of boosted rounds. For gblinear this is reset to 0 after """Get number of boosted rounds. For gblinear this is reset to 0 after

View File

@ -1653,14 +1653,11 @@ class DaskScikitLearnBase(XGBModel):
self, self,
X: _DataT, X: _DataT,
output_margin: bool = False, output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True, validate_features: bool = True,
base_margin: Optional[_DaskCollection] = None, base_margin: Optional[_DaskCollection] = None,
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
) -> Any: ) -> Any:
_assert_dask_support() _assert_dask_support()
msg = "`ntree_limit` is not supported on dask, use `iteration_range` instead."
assert ntree_limit is None, msg
return self.client.sync( return self.client.sync(
self._predict_async, self._predict_async,
X, X,
@ -1694,12 +1691,9 @@ class DaskScikitLearnBase(XGBModel):
def apply( def apply(
self, self,
X: _DataT, X: _DataT,
ntree_limit: Optional[int] = None,
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
) -> Any: ) -> Any:
_assert_dask_support() _assert_dask_support()
msg = "`ntree_limit` is not supported on dask, use `iteration_range` instead."
assert ntree_limit is None, msg
return self.client.sync(self._apply_async, X, iteration_range=iteration_range) return self.client.sync(self._apply_async, X, iteration_range=iteration_range)
def __await__(self) -> Awaitable[Any]: def __await__(self) -> Awaitable[Any]:
@ -1993,14 +1987,11 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa
def predict_proba( def predict_proba(
self, self,
X: _DaskCollection, X: _DaskCollection,
ntree_limit: Optional[int] = None,
validate_features: bool = True, validate_features: bool = True,
base_margin: Optional[_DaskCollection] = None, base_margin: Optional[_DaskCollection] = None,
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
) -> Any: ) -> Any:
_assert_dask_support() _assert_dask_support()
msg = "`ntree_limit` is not supported on dask, use `iteration_range` instead."
assert ntree_limit is None, msg
return self._client_sync( return self._client_sync(
self._predict_proba_async, self._predict_proba_async,
X=X, X=X,

View File

@ -36,7 +36,6 @@ from .core import (
Objective, Objective,
QuantileDMatrix, QuantileDMatrix,
XGBoostError, XGBoostError,
_convert_ntree_limit,
_deprecate_positional_args, _deprecate_positional_args,
_parse_eval_str, _parse_eval_str,
) )
@ -391,8 +390,7 @@ __model_doc = f"""
metric will be used for early stopping. metric will be used for early stopping.
- If early stopping occurs, the model will have three additional fields: - If early stopping occurs, the model will have three additional fields:
:py:attr:`best_score`, :py:attr:`best_iteration` and :py:attr:`best_score`, :py:attr:`best_iteration`.
:py:attr:`best_ntree_limit`.
.. note:: .. note::
@ -1117,7 +1115,6 @@ class XGBModel(XGBModelBase):
self, self,
X: ArrayLike, X: ArrayLike,
output_margin: bool = False, output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True, validate_features: bool = True,
base_margin: Optional[ArrayLike] = None, base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
@ -1135,8 +1132,6 @@ class XGBModel(XGBModelBase):
Data to predict with. Data to predict with.
output_margin : output_margin :
Whether to output the raw untransformed margin value. Whether to output the raw untransformed margin value.
ntree_limit :
Deprecated, use `iteration_range` instead.
validate_features : validate_features :
When this is True, validate that the Booster's and data's feature_names are When this is True, validate that the Booster's and data's feature_names are
identical. Otherwise, it is assumed that the feature_names are the same. identical. Otherwise, it is assumed that the feature_names are the same.
@ -1156,9 +1151,6 @@ class XGBModel(XGBModelBase):
""" """
with config_context(verbosity=self.verbosity): with config_context(verbosity=self.verbosity):
iteration_range = _convert_ntree_limit(
self.get_booster(), ntree_limit, iteration_range
)
iteration_range = self._get_iteration_range(iteration_range) iteration_range = self._get_iteration_range(iteration_range)
if self._can_use_inplace_predict(): if self._can_use_inplace_predict():
try: try:
@ -1197,7 +1189,6 @@ class XGBModel(XGBModelBase):
def apply( def apply(
self, self,
X: ArrayLike, X: ArrayLike,
ntree_limit: int = 0,
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray: ) -> np.ndarray:
"""Return the predicted leaf every tree for each sample. If the model is trained """Return the predicted leaf every tree for each sample. If the model is trained
@ -1211,9 +1202,6 @@ class XGBModel(XGBModelBase):
iteration_range : iteration_range :
See :py:meth:`predict`. See :py:meth:`predict`.
ntree_limit :
Deprecated, use ``iteration_range`` instead.
Returns Returns
------- -------
X_leaves : array_like, shape=[n_samples, n_trees] X_leaves : array_like, shape=[n_samples, n_trees]
@ -1223,9 +1211,6 @@ class XGBModel(XGBModelBase):
""" """
with config_context(verbosity=self.verbosity): with config_context(verbosity=self.verbosity):
iteration_range = _convert_ntree_limit(
self.get_booster(), ntree_limit, iteration_range
)
iteration_range = self._get_iteration_range(iteration_range) iteration_range = self._get_iteration_range(iteration_range)
test_dmatrix = DMatrix( test_dmatrix = DMatrix(
X, X,
@ -1309,10 +1294,6 @@ class XGBModel(XGBModelBase):
""" """
return int(self._early_stopping_attr("best_iteration")) return int(self._early_stopping_attr("best_iteration"))
@property
def best_ntree_limit(self) -> int:
return int(self._early_stopping_attr("best_ntree_limit"))
@property @property
def feature_importances_(self) -> np.ndarray: def feature_importances_(self) -> np.ndarray:
"""Feature importances property, return depends on `importance_type` """Feature importances property, return depends on `importance_type`
@ -1562,7 +1543,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
self, self,
X: ArrayLike, X: ArrayLike,
output_margin: bool = False, output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True, validate_features: bool = True,
base_margin: Optional[ArrayLike] = None, base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
@ -1571,7 +1551,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
class_probs = super().predict( class_probs = super().predict(
X=X, X=X,
output_margin=output_margin, output_margin=output_margin,
ntree_limit=ntree_limit,
validate_features=validate_features, validate_features=validate_features,
base_margin=base_margin, base_margin=base_margin,
iteration_range=iteration_range, iteration_range=iteration_range,
@ -1599,7 +1578,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
def predict_proba( def predict_proba(
self, self,
X: ArrayLike, X: ArrayLike,
ntree_limit: Optional[int] = None,
validate_features: bool = True, validate_features: bool = True,
base_margin: Optional[ArrayLike] = None, base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
@ -1614,8 +1592,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
---------- ----------
X : array_like X : array_like
Feature matrix. See :ref:`py-data` for a list of supported types. Feature matrix. See :ref:`py-data` for a list of supported types.
ntree_limit : int
Deprecated, use `iteration_range` instead.
validate_features : bool validate_features : bool
When this is True, validate that the Booster's and data's feature_names are When this is True, validate that the Booster's and data's feature_names are
identical. Otherwise, it is assumed that the feature_names are the same. identical. Otherwise, it is assumed that the feature_names are the same.
@ -1642,7 +1618,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
if self.objective == "multi:softmax": if self.objective == "multi:softmax":
raw_predt = super().predict( raw_predt = super().predict(
X=X, X=X,
ntree_limit=ntree_limit,
validate_features=validate_features, validate_features=validate_features,
base_margin=base_margin, base_margin=base_margin,
iteration_range=iteration_range, iteration_range=iteration_range,
@ -1652,7 +1627,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
return class_prob return class_prob
class_probs = super().predict( class_probs = super().predict(
X=X, X=X,
ntree_limit=ntree_limit,
validate_features=validate_features, validate_features=validate_features,
base_margin=base_margin, base_margin=base_margin,
iteration_range=iteration_range, iteration_range=iteration_range,
@ -2074,7 +2048,6 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
self, self,
X: ArrayLike, X: ArrayLike,
output_margin: bool = False, output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True, validate_features: bool = True,
base_margin: Optional[ArrayLike] = None, base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
@ -2083,20 +2056,18 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
return super().predict( return super().predict(
X, X,
output_margin, output_margin,
ntree_limit,
validate_features, validate_features,
base_margin, base_margin,
iteration_range, iteration_range=iteration_range,
) )
def apply( def apply(
self, self,
X: ArrayLike, X: ArrayLike,
ntree_limit: int = 0,
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
) -> ArrayLike: ) -> ArrayLike:
X, _ = _get_qid(X, None) X, _ = _get_qid(X, None)
return super().apply(X, ntree_limit, iteration_range) return super().apply(X, iteration_range)
def score(self, X: ArrayLike, y: ArrayLike) -> float: def score(self, X: ArrayLike, y: ArrayLike) -> float:
"""Evaluate score for data using the last evaluation metric. If the model is """Evaluate score for data using the last evaluation metric. If the model is

View File

@ -11,7 +11,6 @@ from xgboost import DataIter, DMatrix, QuantileDMatrix, XGBModel
from xgboost.compat import concat from xgboost.compat import concat
from .._typing import ArrayLike from .._typing import ArrayLike
from ..core import _convert_ntree_limit
from .utils import get_logger # type: ignore from .utils import get_logger # type: ignore
@ -343,8 +342,7 @@ def pred_contribs(
strict_shape: bool = False, strict_shape: bool = False,
) -> np.ndarray: ) -> np.ndarray:
"""Predict contributions with data with the full model.""" """Predict contributions with data with the full model."""
iteration_range = _convert_ntree_limit(model.get_booster(), None, None) iteration_range = model._get_iteration_range(None)
iteration_range = model._get_iteration_range(iteration_range)
data_dmatrix = DMatrix( data_dmatrix = DMatrix(
data, data,
base_margin=base_margin, base_margin=base_margin,

View File

@ -31,6 +31,5 @@ dependencies:
- pyspark - pyspark
- cloudpickle - cloudpickle
- pip: - pip:
- shap
- awscli - awscli
- auditwheel - auditwheel

View File

@ -37,7 +37,6 @@ dependencies:
- pyarrow - pyarrow
- protobuf - protobuf
- cloudpickle - cloudpickle
- shap>=0.41
- modin - modin
# TODO: Replace it with pyspark>=3.4 once 3.4 released. # TODO: Replace it with pyspark>=3.4 once 3.4 released.
# - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz # - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz

View File

@ -146,6 +146,7 @@ def main(args: argparse.Namespace) -> None:
"tests/python/test_config.py", "tests/python/test_config.py",
"tests/python/test_data_iterator.py", "tests/python/test_data_iterator.py",
"tests/python/test_dt.py", "tests/python/test_dt.py",
"tests/python/test_predict.py",
"tests/python/test_quantile_dmatrix.py", "tests/python/test_quantile_dmatrix.py",
"tests/python/test_tree_regularization.py", "tests/python/test_tree_regularization.py",
"tests/python-gpu/test_gpu_data_iterator.py", "tests/python-gpu/test_gpu_data_iterator.py",

View File

@ -64,7 +64,7 @@ class TestModels:
num_round = 2 num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist) bst = xgb.train(param, dtrain, num_round, watchlist)
# this is prediction # this is prediction
preds = bst.predict(dtest, ntree_limit=num_round) preds = bst.predict(dtest, iteration_range=(0, num_round))
labels = dtest.get_label() labels = dtest.get_label()
err = sum(1 for i in range(len(preds)) err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
@ -83,7 +83,7 @@ class TestModels:
bst2 = xgb.Booster(params=param, model_file=model_path) bst2 = xgb.Booster(params=param, model_file=model_path)
dtest2 = xgb.DMatrix(dtest_path) dtest2 = xgb.DMatrix(dtest_path)
preds2 = bst2.predict(dtest2, ntree_limit=num_round) preds2 = bst2.predict(dtest2, iteration_range=(0, num_round))
# assert they are the same # assert they are the same
assert np.sum(np.abs(preds2 - preds)) == 0 assert np.sum(np.abs(preds2 - preds)) == 0
@ -96,7 +96,7 @@ class TestModels:
# check whether custom evaluation metrics work # check whether custom evaluation metrics work
bst = xgb.train(param, dtrain, num_round, watchlist, bst = xgb.train(param, dtrain, num_round, watchlist,
feval=my_logloss) feval=my_logloss)
preds3 = bst.predict(dtest, ntree_limit=num_round) preds3 = bst.predict(dtest, iteration_range=(0, num_round))
assert all(preds3 == preds) assert all(preds3 == preds)
# check whether sample_type and normalize_type work # check whether sample_type and normalize_type work
@ -110,7 +110,7 @@ class TestModels:
param['sample_type'] = p[0] param['sample_type'] = p[0]
param['normalize_type'] = p[1] param['normalize_type'] = p[1]
bst = xgb.train(param, dtrain, num_round, watchlist) bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest, ntree_limit=num_round) preds = bst.predict(dtest, iteration_range=(0, num_round))
err = sum(1 for i in range(len(preds)) err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.1 assert err < 0.1
@ -472,8 +472,8 @@ class TestModels:
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
cls = xgb.XGBClassifier(n_estimators=2) cls = xgb.XGBClassifier(n_estimators=2)
cls.fit(X, y, early_stopping_rounds=1, eval_set=[(X, y)]) cls.fit(X, y, early_stopping_rounds=1, eval_set=[(X, y)])
assert cls.get_booster().best_ntree_limit == 2 assert cls.get_booster().best_iteration == cls.n_estimators - 1
assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit assert cls.best_iteration == cls.get_booster().best_iteration
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, "cls.json") path = os.path.join(tmpdir, "cls.json")
@ -481,8 +481,8 @@ class TestModels:
cls = xgb.XGBClassifier(n_estimators=2) cls = xgb.XGBClassifier(n_estimators=2)
cls.load_model(path) cls.load_model(path)
assert cls.get_booster().best_ntree_limit == 2 assert cls.get_booster().best_iteration == cls.n_estimators - 1
assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit assert cls.best_iteration == cls.get_booster().best_iteration
def run_slice( def run_slice(
self, self,

View File

@ -102,7 +102,6 @@ eval[test] = {data_path}
booster.feature_names = None booster.feature_names = None
booster.feature_types = None booster.feature_types = None
booster.set_attr(best_iteration=None) booster.set_attr(best_iteration=None)
booster.set_attr(best_ntree_limit=None)
booster.save_model(model_out_py) booster.save_model(model_out_py)
py_predt = booster.predict(data) py_predt = booster.predict(data)

View File

@ -1,4 +1,4 @@
'''Tests for running inplace prediction.''' """Tests for running inplace prediction."""
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
import numpy as np import numpy as np
@ -17,10 +17,10 @@ def run_threaded_predict(X, rows, predict_func):
per_thread = 20 per_thread = 20
with ThreadPoolExecutor(max_workers=10) as e: with ThreadPoolExecutor(max_workers=10) as e:
for i in range(0, rows, int(rows / per_thread)): for i in range(0, rows, int(rows / per_thread)):
if hasattr(X, 'iloc'): if hasattr(X, "iloc"):
predictor = X.iloc[i:i+per_thread, :] predictor = X.iloc[i : i + per_thread, :]
else: else:
predictor = X[i:i+per_thread, ...] predictor = X[i : i + per_thread, ...]
f = e.submit(predict_func, predictor) f = e.submit(predict_func, predictor)
results.append(f) results.append(f)
@ -61,27 +61,31 @@ def run_predict_leaf(predictor):
validate_leaf_output(leaf, num_parallel_tree) validate_leaf_output(leaf, num_parallel_tree)
ntree_limit = 2 n_iters = 2
sliced = booster.predict( sliced = booster.predict(
m, pred_leaf=True, ntree_limit=num_parallel_tree * ntree_limit, strict_shape=True m,
pred_leaf=True,
iteration_range=(0, n_iters),
strict_shape=True,
) )
first = sliced[0, ...] first = sliced[0, ...]
assert np.prod(first.shape) == classes * num_parallel_tree * ntree_limit assert np.prod(first.shape) == classes * num_parallel_tree * n_iters
# When there's only 1 tree, the output is a 1 dim vector # When there's only 1 tree, the output is a 1 dim vector
booster = xgb.train({"tree_method": "hist"}, num_boost_round=1, dtrain=m) booster = xgb.train({"tree_method": "hist"}, num_boost_round=1, dtrain=m)
assert booster.predict(m, pred_leaf=True).shape == (rows, ) assert booster.predict(m, pred_leaf=True).shape == (rows,)
return leaf return leaf
def test_predict_leaf(): def test_predict_leaf():
run_predict_leaf('cpu_predictor') run_predict_leaf("cpu_predictor")
def test_predict_shape(): def test_predict_shape():
from sklearn.datasets import fetch_california_housing from sklearn.datasets import fetch_california_housing
X, y = fetch_california_housing(return_X_y=True) X, y = fetch_california_housing(return_X_y=True)
reg = xgb.XGBRegressor(n_estimators=1) reg = xgb.XGBRegressor(n_estimators=1)
reg.fit(X, y) reg.fit(X, y)
@ -119,13 +123,14 @@ def test_predict_shape():
class TestInplacePredict: class TestInplacePredict:
'''Tests for running inplace prediction''' """Tests for running inplace prediction"""
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
cls.rows = 1000 cls.rows = 1000
cls.cols = 10 cls.cols = 10
cls.missing = 11 # set to integer for testing cls.missing = 11 # set to integer for testing
cls.rng = np.random.RandomState(1994) cls.rng = np.random.RandomState(1994)
@ -139,7 +144,7 @@ class TestInplacePredict:
cls.test = xgb.DMatrix(cls.X[:10, ...], missing=cls.missing) cls.test = xgb.DMatrix(cls.X[:10, ...], missing=cls.missing)
cls.num_boost_round = 10 cls.num_boost_round = 10
cls.booster = xgb.train({'tree_method': 'hist'}, dtrain, num_boost_round=10) cls.booster = xgb.train({"tree_method": "hist"}, dtrain, num_boost_round=10)
def test_predict(self): def test_predict(self):
booster = self.booster booster = self.booster
@ -162,28 +167,22 @@ class TestInplacePredict:
predt_from_array = booster.inplace_predict( predt_from_array = booster.inplace_predict(
X[:10, ...], iteration_range=(0, 4), missing=self.missing X[:10, ...], iteration_range=(0, 4), missing=self.missing
) )
predt_from_dmatrix = booster.predict(test, ntree_limit=4) predt_from_dmatrix = booster.predict(test, iteration_range=(0, 4))
np.testing.assert_allclose(predt_from_dmatrix, predt_from_array) np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)
with pytest.raises(ValueError):
booster.predict(test, ntree_limit=booster.best_ntree_limit + 1)
with pytest.raises(ValueError): with pytest.raises(ValueError):
booster.predict(test, iteration_range=(0, booster.best_iteration + 2)) booster.predict(test, iteration_range=(0, booster.best_iteration + 2))
default = booster.predict(test) default = booster.predict(test)
range_full = booster.predict(test, iteration_range=(0, self.num_boost_round)) range_full = booster.predict(test, iteration_range=(0, self.num_boost_round))
ntree_full = booster.predict(test, ntree_limit=self.num_boost_round)
np.testing.assert_allclose(range_full, default) np.testing.assert_allclose(range_full, default)
np.testing.assert_allclose(ntree_full, default)
range_full = booster.predict( range_full = booster.predict(
test, iteration_range=(0, booster.best_iteration + 1) test, iteration_range=(0, booster.best_iteration + 1)
) )
ntree_full = booster.predict(test, ntree_limit=booster.best_ntree_limit)
np.testing.assert_allclose(range_full, default) np.testing.assert_allclose(range_full, default)
np.testing.assert_allclose(ntree_full, default)
def predict_dense(x): def predict_dense(x):
inplace_predt = booster.inplace_predict(x) inplace_predt = booster.inplace_predict(x)
@ -251,6 +250,7 @@ class TestInplacePredict:
@pytest.mark.skipif(**tm.no_pandas()) @pytest.mark.skipif(**tm.no_pandas())
def test_pd_dtypes(self) -> None: def test_pd_dtypes(self) -> None:
from pandas.api.types import is_bool_dtype from pandas.api.types import is_bool_dtype
for orig, x in pd_dtypes(): for orig, x in pd_dtypes():
dtypes = orig.dtypes if isinstance(orig, pd.DataFrame) else [orig.dtypes] dtypes = orig.dtypes if isinstance(orig, pd.DataFrame) else [orig.dtypes]
if isinstance(orig, pd.DataFrame) and is_bool_dtype(dtypes[0]): if isinstance(orig, pd.DataFrame) and is_bool_dtype(dtypes[0]):

View File

@ -60,7 +60,7 @@ def test_ranking_with_weighted_data():
assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:])) assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
for i in range(1, 11): for i in range(1, 11):
pred = bst.predict(dtrain, ntree_limit=i) pred = bst.predict(dtrain, iteration_range=(0, i))
# is_sorted[i]: is i-th group correctly sorted by the ranking predictor? # is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
is_sorted = [] is_sorted = []
for k in range(0, 20, 5): for k in range(0, 20, 5):

View File

@ -95,44 +95,39 @@ class TestTrainingContinuation:
res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class)) res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
assert res1 == res2 assert res1 == res2
gbdt_04 = xgb.train(xgb_params_02, dtrain_2class, gbdt_04 = xgb.train(xgb_params_02, dtrain_2class, num_boost_round=3)
num_boost_round=3)
assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration +
1) * self.num_parallel_tree
res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
res2 = mean_squared_error(y_2class, res2 = mean_squared_error(
gbdt_04.predict( y_2class,
dtrain_2class, gbdt_04.predict(
ntree_limit=gbdt_04.best_ntree_limit)) dtrain_2class, iteration_range=(0, gbdt_04.best_iteration + 1)
)
)
assert res1 == res2 assert res1 == res2
gbdt_04 = xgb.train(xgb_params_02, dtrain_2class, gbdt_04 = xgb.train(
num_boost_round=7, xgb_model=gbdt_04) xgb_params_02, dtrain_2class, num_boost_round=7, xgb_model=gbdt_04
assert gbdt_04.best_ntree_limit == ( )
gbdt_04.best_iteration + 1) * self.num_parallel_tree
res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
res2 = mean_squared_error(y_2class, res2 = mean_squared_error(
gbdt_04.predict( y_2class,
dtrain_2class, gbdt_04.predict(
ntree_limit=gbdt_04.best_ntree_limit)) dtrain_2class, iteration_range=(0, gbdt_04.best_iteration + 1)
)
)
assert res1 == res2 assert res1 == res2
gbdt_05 = xgb.train(xgb_params_03, dtrain_5class, gbdt_05 = xgb.train(xgb_params_03, dtrain_5class,
num_boost_round=7) num_boost_round=7)
assert gbdt_05.best_ntree_limit == (
gbdt_05.best_iteration + 1) * self.num_parallel_tree
gbdt_05 = xgb.train(xgb_params_03, gbdt_05 = xgb.train(xgb_params_03,
dtrain_5class, dtrain_5class,
num_boost_round=3, num_boost_round=3,
xgb_model=gbdt_05) xgb_model=gbdt_05)
assert gbdt_05.best_ntree_limit == (
gbdt_05.best_iteration + 1) * self.num_parallel_tree
res1 = gbdt_05.predict(dtrain_5class) res1 = gbdt_05.predict(dtrain_5class)
res2 = gbdt_05.predict(dtrain_5class, res2 = gbdt_05.predict(
ntree_limit=gbdt_05.best_ntree_limit) dtrain_5class, iteration_range=(0, gbdt_05.best_iteration + 1)
)
np.testing.assert_almost_equal(res1, res2) np.testing.assert_almost_equal(res1, res2)
@pytest.mark.skipif(**tm.no_sklearn()) @pytest.mark.skipif(**tm.no_sklearn())

View File

@ -13,9 +13,9 @@ except Exception:
pytestmark = pytest.mark.skipif(shap is None, reason="Requires shap package") pytestmark = pytest.mark.skipif(shap is None, reason="Requires shap package")
# Check integration is not broken from xgboost side # xgboost removed ntree_limit in 2.0, which breaks the SHAP package.
# Changes in binary format may cause problems @pytest.mark.xfail
def test_with_shap(): def test_with_shap() -> None:
from sklearn.datasets import fetch_california_housing from sklearn.datasets import fetch_california_housing
X, y = fetch_california_housing(return_X_y=True) X, y = fetch_california_housing(return_X_y=True)

View File

@ -63,9 +63,15 @@ def test_multiclass_classification(objective):
assert xgb_model.get_booster().num_boosted_rounds() == 100 assert xgb_model.get_booster().num_boosted_rounds() == 100
preds = xgb_model.predict(X[test_index]) preds = xgb_model.predict(X[test_index])
# test other params in XGBClassifier().fit # test other params in XGBClassifier().fit
preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3) preds2 = xgb_model.predict(
preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0) X[test_index], output_margin=True, iteration_range=(0, 1)
preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3) )
preds3 = xgb_model.predict(
X[test_index], output_margin=True, iteration_range=None
)
preds4 = xgb_model.predict(
X[test_index], output_margin=False, iteration_range=(0, 1)
)
labels = y[test_index] labels = y[test_index]
check_pred(preds, labels, output_margin=False) check_pred(preds, labels, output_margin=False)
@ -86,25 +92,21 @@ def test_multiclass_classification(objective):
assert proba.shape[1] == cls.n_classes_ assert proba.shape[1] == cls.n_classes_
def test_best_ntree_limit(): def test_best_iteration():
from sklearn.datasets import load_iris from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
def train(booster, forest): def train(booster: str, forest: Optional[int]) -> None:
rounds = 4 rounds = 4
cls = xgb.XGBClassifier( cls = xgb.XGBClassifier(
n_estimators=rounds, num_parallel_tree=forest, booster=booster n_estimators=rounds, num_parallel_tree=forest, booster=booster
).fit( ).fit(
X, y, eval_set=[(X, y)], early_stopping_rounds=3 X, y, eval_set=[(X, y)], early_stopping_rounds=3
) )
assert cls.best_iteration == rounds - 1
if forest: # best_iteration is used by default, assert that under gblinear it's
assert cls.best_ntree_limit == rounds * forest
else:
assert cls.best_ntree_limit == 0
# best_ntree_limit is used by default, assert that under gblinear it's
# automatically ignored due to being 0. # automatically ignored due to being 0.
cls.predict(X) cls.predict(X)
@ -430,12 +432,15 @@ def test_regression():
preds = xgb_model.predict(X[test_index]) preds = xgb_model.predict(X[test_index])
# test other params in XGBRegressor().fit # test other params in XGBRegressor().fit
preds2 = xgb_model.predict(X[test_index], output_margin=True, preds2 = xgb_model.predict(
ntree_limit=3) X[test_index], output_margin=True, iteration_range=(0, 3)
preds3 = xgb_model.predict(X[test_index], output_margin=True, )
ntree_limit=0) preds3 = xgb_model.predict(
preds4 = xgb_model.predict(X[test_index], output_margin=False, X[test_index], output_margin=True, iteration_range=None
ntree_limit=3) )
preds4 = xgb_model.predict(
X[test_index], output_margin=False, iteration_range=(0, 3)
)
labels = y[test_index] labels = y[test_index]
assert mean_squared_error(preds, labels) < 25 assert mean_squared_error(preds, labels) < 25

View File

@ -169,7 +169,7 @@ def reg_with_weight(
) )
RegData = namedtuple("RegData", ("reg_df_train", "reg_df_test")) RegData = namedtuple("RegData", ("reg_df_train", "reg_df_test", "reg_params"))
@pytest.fixture @pytest.fixture
@ -181,6 +181,13 @@ def reg_data(spark: SparkSession) -> Generator[RegData, None, None]:
predt0 = reg1.predict(X) predt0 = reg1.predict(X)
pred_contrib0: np.ndarray = pred_contribs(reg1, X, None, False) pred_contrib0: np.ndarray = pred_contribs(reg1, X, None, False)
reg_params = {
"max_depth": 5,
"n_estimators": 10,
"iteration_range": [0, 5],
"max_bin": 9,
}
# convert np array to pyspark dataframe # convert np array to pyspark dataframe
reg_df_train_data = [ reg_df_train_data = [
(Vectors.dense(X[0, :]), int(y[0])), (Vectors.dense(X[0, :]), int(y[0])),
@ -188,26 +195,34 @@ def reg_data(spark: SparkSession) -> Generator[RegData, None, None]:
] ]
reg_df_train = spark.createDataFrame(reg_df_train_data, ["features", "label"]) reg_df_train = spark.createDataFrame(reg_df_train_data, ["features", "label"])
reg2 = xgb.XGBRegressor(max_depth=5, n_estimators=10)
reg2.fit(X, y)
predt2 = reg2.predict(X, iteration_range=[0, 5])
# array([0.22185266, 0.77814734], dtype=float32)
reg_df_test = spark.createDataFrame( reg_df_test = spark.createDataFrame(
[ [
( (
Vectors.dense(X[0, :]), Vectors.dense(X[0, :]),
float(predt0[0]), float(predt0[0]),
pred_contrib0[0, :].tolist(), pred_contrib0[0, :].tolist(),
float(predt2[0]),
), ),
( (
Vectors.sparse(3, {1: 1.0, 2: 5.5}), Vectors.sparse(3, {1: 1.0, 2: 5.5}),
float(predt0[1]), float(predt0[1]),
pred_contrib0[1, :].tolist(), pred_contrib0[1, :].tolist(),
float(predt2[1]),
), ),
], ],
[ [
"features", "features",
"expected_prediction", "expected_prediction",
"expected_pred_contribs", "expected_pred_contribs",
"expected_prediction_with_params",
], ],
) )
yield RegData(reg_df_train, reg_df_test) yield RegData(reg_df_train, reg_df_test, reg_params)
MultiClfData = namedtuple("MultiClfData", ("multi_clf_df_train", "multi_clf_df_test")) MultiClfData = namedtuple("MultiClfData", ("multi_clf_df_train", "multi_clf_df_test"))
@ -740,6 +755,76 @@ class TestPySparkLocal:
model = classifier.fit(clf_data.cls_df_train) model = classifier.fit(clf_data.cls_df_train)
model.transform(clf_data.cls_df_test).collect() model.transform(clf_data.cls_df_test).collect()
def test_regressor_model_save_load(self, reg_data: RegData) -> None:
    """Round-trip a fitted regressor model through ``save`` / ``load``.

    The reloaded model must keep the original uid, report every value from
    ``reg_data.reg_params`` and reproduce the predictions stored in the
    ``expected_prediction_with_params`` column of the test frame. Loading
    the saved regressor model through ``SparkXGBClassifierModel`` is
    expected to fail with an ``AssertionError``.
    """
    expected_params = reg_data.reg_params
    with tempfile.TemporaryDirectory() as tmpdir:
        uri = f"file:{tmpdir}"
        fitted = SparkXGBRegressor(**expected_params).fit(reg_data.reg_df_train)
        fitted.save(uri)

        restored = SparkXGBRegressorModel.load(uri)
        assert fitted.uid == restored.uid
        for name, expected in expected_params.items():
            assert restored.getOrDefault(name) == expected

        for row in restored.transform(reg_data.reg_df_test).collect():
            assert np.isclose(
                row.prediction, row.expected_prediction_with_params, atol=1e-3
            )

        # A regressor model on disk must not load as a classifier model.
        with pytest.raises(AssertionError, match="Expected class name"):
            SparkXGBClassifierModel.load(uri)

        assert_model_compatible(fitted, tmpdir)
def test_regressor_with_params(self, reg_data: RegData) -> None:
    """Check that constructor parameters are visible on estimator and model.

    The xgb, fit and predict parameter dictionaries of the estimator, and
    then of the fitted model, are merged and compared against
    ``reg_data.reg_params`` (ignoring ``_non_booster_params``). Finally the
    model's predictions are compared with the
    ``expected_prediction_with_params`` column.
    """
    expected = reg_data.reg_params

    def gathered_params(stage):
        # ``dict(**a, **b, **c)`` is kept deliberately: unlike ``{**a, **b}``
        # it raises when the same key shows up in more than one group.
        return dict(
            **(stage._gen_xgb_params_dict()),
            **(stage._gen_fit_params_dict()),
            **(stage._gen_predict_params_dict()),
        )

    estimator = SparkXGBRegressor(**expected)
    check_sub_dict_match(
        expected, gathered_params(estimator), excluding_keys=_non_booster_params
    )

    fitted = estimator.fit(reg_data.reg_df_train)
    check_sub_dict_match(
        expected, gathered_params(fitted), excluding_keys=_non_booster_params
    )

    for row in fitted.transform(reg_data.reg_df_test).collect():
        assert np.isclose(
            row.prediction, row.expected_prediction_with_params, atol=1e-3
        )
def test_regressor_model_pipeline_save_load(self, reg_data: RegData) -> None:
    """Round-trip a fitted single-stage pipeline through ``save`` / ``load``.

    The regressor is created with defaults and receives
    ``reg_data.reg_params`` through the pipeline copy. After reloading, the
    first stage must report those parameters and the pipeline must reproduce
    the ``expected_prediction_with_params`` column.
    """
    expected = reg_data.reg_params
    with tempfile.TemporaryDirectory() as tmpdir:
        uri = f"file:{tmpdir}"
        estimator = SparkXGBRegressor()
        # Parameters are injected via the pipeline copy, not the constructor.
        pipeline = Pipeline(stages=[estimator]).copy(
            extra=get_params_map(expected, estimator)
        )
        fitted = pipeline.fit(reg_data.reg_df_train)
        fitted.save(uri)

        restored = PipelineModel.load(uri)
        for name, value in expected.items():
            assert restored.stages[0].getOrDefault(name) == value

        for row in restored.transform(reg_data.reg_df_test).collect():
            assert np.isclose(
                row.prediction, row.expected_prediction_with_params, atol=1e-3
            )

        assert_model_compatible(fitted.stages[0], tmpdir)
class XgboostLocalTest(SparkTestCase): class XgboostLocalTest(SparkTestCase):
def setUp(self): def setUp(self):
@ -918,12 +1003,6 @@ class XgboostLocalTest(SparkTestCase):
def get_local_tmp_dir(self): def get_local_tmp_dir(self):
return self.tempdir + str(uuid.uuid4()) return self.tempdir + str(uuid.uuid4())
def assert_model_compatible(self, model: XGBModel, model_path: str):
bst = xgb.Booster()
path = glob.glob(f"{model_path}/**/model/part-00000", recursive=True)[0]
bst.load_model(path)
self.assertEqual(model.get_booster().save_raw("json"), bst.save_raw("json"))
def test_convert_to_sklearn_model_reg(self) -> None: def test_convert_to_sklearn_model_reg(self) -> None:
regressor = SparkXGBRegressor( regressor = SparkXGBRegressor(
n_estimators=200, missing=2.0, max_depth=3, sketch_eps=0.5 n_estimators=200, missing=2.0, max_depth=3, sketch_eps=0.5
@ -1007,80 +1086,6 @@ class XgboostLocalTest(SparkTestCase):
== "float64" == "float64"
) )
def test_regressor_with_params(self):
regressor = SparkXGBRegressor(**self.reg_params)
all_params = dict(
**(regressor._gen_xgb_params_dict()),
**(regressor._gen_fit_params_dict()),
**(regressor._gen_predict_params_dict()),
)
check_sub_dict_match(
self.reg_params, all_params, excluding_keys=_non_booster_params
)
model = regressor.fit(self.reg_df_train)
all_params = dict(
**(model._gen_xgb_params_dict()),
**(model._gen_fit_params_dict()),
**(model._gen_predict_params_dict()),
)
check_sub_dict_match(
self.reg_params, all_params, excluding_keys=_non_booster_params
)
pred_result = model.transform(self.reg_df_test).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction, row.expected_prediction_with_params, atol=1e-3
)
)
def test_regressor_model_save_load(self):
tmp_dir = self.get_local_tmp_dir()
path = "file:" + tmp_dir
regressor = SparkXGBRegressor(**self.reg_params)
model = regressor.fit(self.reg_df_train)
model.save(path)
loaded_model = SparkXGBRegressorModel.load(path)
self.assertEqual(model.uid, loaded_model.uid)
for k, v in self.reg_params.items():
self.assertEqual(loaded_model.getOrDefault(k), v)
pred_result = loaded_model.transform(self.reg_df_test).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction, row.expected_prediction_with_params, atol=1e-3
)
)
with self.assertRaisesRegex(AssertionError, "Expected class name"):
SparkXGBClassifierModel.load(path)
self.assert_model_compatible(model, tmp_dir)
def test_regressor_model_pipeline_save_load(self):
tmp_dir = self.get_local_tmp_dir()
path = "file:" + tmp_dir
regressor = SparkXGBRegressor()
pipeline = Pipeline(stages=[regressor])
pipeline = pipeline.copy(extra=get_params_map(self.reg_params, regressor))
model = pipeline.fit(self.reg_df_train)
model.save(path)
loaded_model = PipelineModel.load(path)
for k, v in self.reg_params.items():
self.assertEqual(loaded_model.stages[0].getOrDefault(k), v)
pred_result = loaded_model.transform(self.reg_df_test).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction, row.expected_prediction_with_params, atol=1e-3
)
)
self.assert_model_compatible(model.stages[0], tmp_dir)
def test_callbacks(self): def test_callbacks(self):
from xgboost.callback import LearningRateScheduler from xgboost.callback import LearningRateScheduler

View File

@ -1,16 +1,24 @@
import json import json
import logging
import os import os
import random import random
import tempfile
import uuid import uuid
from collections import namedtuple
import numpy as np import numpy as np
import pytest import pytest
import xgboost as xgb
from xgboost import testing as tm from xgboost import testing as tm
from xgboost.callback import LearningRateScheduler
pytestmark = pytest.mark.skipif(**tm.no_spark()) pytestmark = pytest.mark.skipif(**tm.no_spark())
from typing import Generator
from pyspark.ml.linalg import Vectors from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
from xgboost.spark.utils import _get_max_num_concurrent_tasks from xgboost.spark.utils import _get_max_num_concurrent_tasks
@ -18,51 +26,119 @@ from xgboost.spark.utils import _get_max_num_concurrent_tasks
from .utils import SparkLocalClusterTestCase from .utils import SparkLocalClusterTestCase
@pytest.fixture
def spark() -> Generator[SparkSession, None, None]:
    """Yield a Spark session configured for a ``local-cluster[2, 2, 1024]`` master.

    After the consuming test is done the session is stopped, followed by its
    Spark context.
    """
    builder = SparkSession.builder.appName("XGBoost PySpark Python API Tests")
    settings = (
        ("spark.master", "local-cluster[2, 2, 1024]"),
        ("spark.python.worker.reuse", "false"),
        ("spark.driver.host", "127.0.0.1"),
        ("spark.task.maxFailures", "1"),
        ("spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled", "false"),
        ("spark.sql.pyspark.jvmStacktrace.enabled", "true"),
        ("spark.cores.max", "4"),
        ("spark.task.cpus", "1"),
        ("spark.executor.cores", "2"),
    )
    for key, value in settings:
        builder.config(key, value)

    logging.getLogger("pyspark").setLevel(logging.INFO)
    session = builder.getOrCreate()
    yield session
    # Teardown: runs once the test that requested the fixture has finished.
    session.stop()
    session.sparkContext.stop()
# Bundle yielded by the ``reg_data`` fixture: the training frame, the test
# frame and the estimator parameters used by the regression tests.
RegData = namedtuple("RegData", ("reg_df_train", "reg_df_test", "reg_params"))
@pytest.fixture
def reg_data(spark: SparkSession) -> Generator[RegData, None, None]:
    """Build the two-row regression data set used by the cluster tests.

    Expected predictions are produced by training plain ``xgb.XGBRegressor``
    models on the same two samples: one with a learning-rate scheduler
    callback, one with ``max_depth=5, n_estimators=10`` predicted over
    ``iteration_range=(0, 5)``. The results are stored as columns of the test
    frame.
    """
    reg_params = {"max_depth": 5, "n_estimators": 10, "iteration_range": (0, 5)}

    X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
    y = np.array([0, 1])

    def custom_lr(boosting_round):
        return 1.0 / (boosting_round + 1)

    # Reference for ``expected_prediction_with_callbacks``.
    scheduled = xgb.XGBRegressor(callbacks=[LearningRateScheduler(custom_lr)])
    scheduled.fit(X, y)
    callback_pred = scheduled.predict(X)
    # array([0.02406833, 0.97593164], dtype=float32)

    # Reference for ``expected_prediction_with_params``.
    truncated = xgb.XGBRegressor(max_depth=5, n_estimators=10)
    truncated.fit(X, y)
    params_pred = truncated.predict(X, iteration_range=(0, 5))
    # array([0.22185263, 0.77814734], dtype=float32)

    train_rows = [
        (Vectors.dense(1.0, 2.0, 3.0), 0),
        (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
    ]
    reg_df_train = spark.createDataFrame(train_rows, ["features", "label"])

    test_rows = [
        (
            Vectors.dense(1.0, 2.0, 3.0),
            0.0,
            float(params_pred[0]),
            float(callback_pred[0]),
        ),
        (
            Vectors.sparse(3, {1: 1.0, 2: 5.5}),
            1.0,
            float(params_pred[1]),
            float(callback_pred[1]),
        ),
    ]
    test_columns = [
        "features",
        "expected_prediction",
        "expected_prediction_with_params",
        "expected_prediction_with_callbacks",
    ]
    reg_df_test = spark.createDataFrame(test_rows, test_columns)

    yield RegData(reg_df_train, reg_df_test, reg_params)
class TestPySparkLocalCluster:
    """Regression tests that run against the ``reg_data`` fixture."""

    def test_regressor_basic_with_params(self, reg_data: RegData) -> None:
        """Fit with ``reg_data.reg_params`` and compare predictions.

        Each predicted value must be close to the
        ``expected_prediction_with_params`` column.
        """
        fitted = SparkXGBRegressor(**reg_data.reg_params).fit(reg_data.reg_df_train)
        for row in fitted.transform(reg_data.reg_df_test).collect():
            assert np.isclose(
                row.prediction, row.expected_prediction_with_params, atol=1e-3
            )

    def test_callbacks(self, reg_data: RegData) -> None:
        """Save and reload an estimator carrying a callback, then fit it.

        Predictions of the reloaded estimator's model must be close to the
        ``expected_prediction_with_callbacks`` column.
        """

        def custom_lr(boosting_round):
            return 1.0 / (boosting_round + 1)

        with tempfile.TemporaryDirectory() as tmpdir:
            target = os.path.join(tmpdir, str(uuid.uuid4()))

            estimator = SparkXGBRegressor(
                callbacks=[LearningRateScheduler(custom_lr)]
            )

            # Test the save/load of the estimator instead of the model, since
            # the callbacks param only exists in the estimator but not in the
            # model.
            estimator.save(target)
            estimator = SparkXGBRegressor.load(target)

            fitted = estimator.fit(reg_data.reg_df_train)
            for row in fitted.transform(reg_data.reg_df_test).collect():
                assert np.isclose(
                    row.prediction, row.expected_prediction_with_callbacks, atol=1e-3
                )
class XgboostLocalClusterTestCase(SparkLocalClusterTestCase): class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
def setUp(self): def setUp(self):
random.seed(2020) random.seed(2020)
self.n_workers = _get_max_num_concurrent_tasks(self.session.sparkContext) self.n_workers = _get_max_num_concurrent_tasks(self.session.sparkContext)
# The following code use xgboost python library to train xgb model and predict.
#
# >>> import numpy as np
# >>> import xgboost
# >>> X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
# >>> y = np.array([0, 1])
# >>> reg1 = xgboost.XGBRegressor()
# >>> reg1.fit(X, y)
# >>> reg1.predict(X)
# array([8.8363886e-04, 9.9911636e-01], dtype=float32)
# >>> def custom_lr(boosting_round, num_boost_round):
# ... return 1.0 / (boosting_round + 1)
# ...
# >>> reg1.fit(X, y, callbacks=[xgboost.callback.reset_learning_rate(custom_lr)])
# >>> reg1.predict(X)
# array([0.02406833, 0.97593164], dtype=float32)
# >>> reg2 = xgboost.XGBRegressor(max_depth=5, n_estimators=10)
# >>> reg2.fit(X, y)
# >>> reg2.predict(X, ntree_limit=5)
# array([0.22185263, 0.77814734], dtype=float32)
self.reg_params = {"max_depth": 5, "n_estimators": 10, "ntree_limit": 5}
self.reg_df_train = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
],
["features", "label"],
)
self.reg_df_test = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0.0, 0.2219, 0.02406),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1.0, 0.7781, 0.9759),
],
[
"features",
"expected_prediction",
"expected_prediction_with_params",
"expected_prediction_with_callbacks",
],
)
# Distributed section # Distributed section
# Binary classification # Binary classification
@ -218,42 +294,6 @@ class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
self.reg_best_score_eval = 5.239e-05 self.reg_best_score_eval = 5.239e-05
self.reg_best_score_weight_and_eval = 4.850e-05 self.reg_best_score_weight_and_eval = 4.850e-05
def test_regressor_basic_with_params(self):
regressor = SparkXGBRegressor(**self.reg_params)
model = regressor.fit(self.reg_df_train)
pred_result = model.transform(self.reg_df_test).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction, row.expected_prediction_with_params, atol=1e-3
)
)
def test_callbacks(self):
from xgboost.callback import LearningRateScheduler
path = os.path.join(self.tempdir, str(uuid.uuid4()))
def custom_learning_rate(boosting_round):
return 1.0 / (boosting_round + 1)
cb = [LearningRateScheduler(custom_learning_rate)]
regressor = SparkXGBRegressor(callbacks=cb)
# Test the save/load of the estimator instead of the model, since
# the callbacks param only exists in the estimator but not in the model
regressor.save(path)
regressor = SparkXGBRegressor.load(path)
model = regressor.fit(self.reg_df_train)
pred_result = model.transform(self.reg_df_test).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction, row.expected_prediction_with_callbacks, atol=1e-3
)
)
def test_classifier_distributed_basic(self): def test_classifier_distributed_basic(self):
classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=100) classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=100)
model = classifier.fit(self.cls_df_train_distributed) model = classifier.fit(self.cls_df_train_distributed)
@ -409,7 +449,6 @@ class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
pred_result = model.transform( pred_result = model.transform(
self.cls_df_test_distributed_lower_estimators self.cls_df_test_distributed_lower_estimators
).collect() ).collect()
print(pred_result)
for row in pred_result: for row in pred_result:
self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3)) self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
self.assertTrue( self.assertTrue(