temp merge, disable 1 line, SetValid

This commit is contained in:
Your Name
2023-10-12 16:16:44 -07:00
492 changed files with 15533 additions and 9376 deletions

View File

@@ -132,8 +132,8 @@ def locate_or_build_libxgboost(
if build_config.use_system_libxgboost:
# Find libxgboost from system prefix
sys_prefix = pathlib.Path(sys.prefix).absolute().resolve()
libxgboost_sys = sys_prefix / "lib" / _lib_name()
sys_base_prefix = pathlib.Path(sys.base_prefix).absolute().resolve()
libxgboost_sys = sys_base_prefix / "lib" / _lib_name()
if not libxgboost_sys.exists():
raise RuntimeError(
f"use_system_libxgboost was specified but {_lib_name()} is "

View File

@@ -7,7 +7,7 @@ build-backend = "packager.pep517"
[project]
name = "xgboost"
version = "2.0.0-dev"
version = "2.1.0-dev"
authors = [
{ name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" },
{ name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }

View File

@@ -1 +1 @@
2.0.0-dev
2.1.0-dev

View File

@@ -4,7 +4,7 @@ Contributors: https://github.com/dmlc/xgboost/blob/master/CONTRIBUTORS.md
"""
from . import tracker # noqa
from . import collective, dask, rabit
from . import collective, dask
from .core import (
Booster,
DataIter,

View File

@@ -8,7 +8,9 @@ from typing import (
Callable,
Dict,
List,
Optional,
Sequence,
Tuple,
Type,
TypeVar,
Union,
@@ -20,8 +22,6 @@ import numpy as np
DataType = Any
# xgboost accepts some other possible types in practice due to historical reason, which is
# lesser tested. For now we encourage users to pass a simple list of string.
FeatureInfo = Sequence[str]
FeatureNames = FeatureInfo
FeatureTypes = FeatureInfo
@@ -97,6 +97,13 @@ else:
ctypes._Pointer,
]
# The second arg is actually Optional[List[cudf.Series]], skipped for easier type check.
# The cudf Series is the obtained cat codes, preserved in the `DataIter` to prevent it
# being freed.
TransformedData = Tuple[
Any, Optional[List], Optional[FeatureNames], Optional[FeatureTypes]
]
# template parameter
_T = TypeVar("_T")
_F = TypeVar("_F", bound=Callable[..., Any])

View File

@@ -134,13 +134,17 @@ class CallbackContainer:
is_cv: bool = False,
) -> None:
self.callbacks = set(callbacks)
if metric is not None:
msg = (
"metric must be callable object for monitoring. For "
+ "builtin metrics, passing them in training parameter"
+ " will invoke monitor automatically."
)
assert callable(metric), msg
for cb in callbacks:
if not isinstance(cb, TrainingCallback):
raise TypeError("callback must be an instance of `TrainingCallback`.")
msg = (
"metric must be callable object for monitoring. For builtin metrics"
", passing them in training parameter invokes monitor automatically."
)
if metric is not None and not callable(metric):
raise TypeError(msg)
self.metric = metric
self.history: TrainingCallback.EvalsLog = collections.OrderedDict()
self._output_margin = output_margin
@@ -170,16 +174,6 @@ class CallbackContainer:
else:
assert isinstance(model, Booster), msg
if not self.is_cv:
if model.attr("best_score") is not None:
model.best_score = float(cast(str, model.attr("best_score")))
model.best_iteration = int(cast(str, model.attr("best_iteration")))
else:
# Due to compatibility with version older than 1.4, these attributes are
# added to Python object even if early stopping is not used.
model.best_iteration = model.num_boosted_rounds() - 1
model.set_attr(best_iteration=str(model.best_iteration))
return model
def before_iteration(
@@ -267,9 +261,14 @@ class LearningRateScheduler(TrainingCallback):
def __init__(
self, learning_rates: Union[Callable[[int], float], Sequence[float]]
) -> None:
assert callable(learning_rates) or isinstance(
if not callable(learning_rates) and not isinstance(
learning_rates, collections.abc.Sequence
)
):
raise TypeError(
"Invalid learning rates, expecting callable or sequence, got: "
f"{type(learning_rates)}"
)
if callable(learning_rates):
self.learning_rates = learning_rates
else:
@@ -302,24 +301,28 @@ class EarlyStopping(TrainingCallback):
save_best :
Whether training should return the best model or the last model.
min_delta :
Minimum absolute change in score to be qualified as an improvement.
.. versionadded:: 1.5.0
.. code-block:: python
Minimum absolute change in score to be qualified as an improvement.
es = xgboost.callback.EarlyStopping(
rounds=2,
min_delta=1e-3,
save_best=True,
maximize=False,
data_name="validation_0",
metric_name="mlogloss",
)
clf = xgboost.XGBClassifier(tree_method="gpu_hist", callbacks=[es])
Examples
--------
X, y = load_digits(return_X_y=True)
clf.fit(X, y, eval_set=[(X, y)])
.. code-block:: python
es = xgboost.callback.EarlyStopping(
rounds=2,
min_delta=1e-3,
save_best=True,
maximize=False,
data_name="validation_0",
metric_name="mlogloss",
)
clf = xgboost.XGBClassifier(tree_method="hist", device="cuda", callbacks=[es])
X, y = load_digits(return_X_y=True)
clf.fit(X, y, eval_set=[(X, y)])
"""
# pylint: disable=too-many-arguments
@@ -363,7 +366,7 @@ class EarlyStopping(TrainingCallback):
return numpy.greater(get_s(new) - self._min_delta, get_s(best))
def minimize(new: _Score, best: _Score) -> bool:
"""New score should be smaller than the old one."""
"""New score should be lesser than the old one."""
return numpy.greater(get_s(best) - self._min_delta, get_s(new))
if self.maximize is None:
@@ -419,38 +422,53 @@ class EarlyStopping(TrainingCallback):
) -> bool:
epoch += self.starting_round # training continuation
msg = "Must have at least 1 validation dataset for early stopping."
assert len(evals_log.keys()) >= 1, msg
data_name = ""
if len(evals_log.keys()) < 1:
raise ValueError(msg)
# Get data name
if self.data:
for d, _ in evals_log.items():
if d == self.data:
data_name = d
if not data_name:
raise ValueError("No dataset named:", self.data)
data_name = self.data
else:
# Use the last one as default.
data_name = list(evals_log.keys())[-1]
assert isinstance(data_name, str) and data_name
if data_name not in evals_log:
raise ValueError(f"No dataset named: {data_name}")
if not isinstance(data_name, str):
raise TypeError(
f"The name of the dataset should be a string. Got: {type(data_name)}"
)
data_log = evals_log[data_name]
# Filter out scores that can not be used for early stopping.
# Get metric name
if self.metric_name:
metric_name = self.metric_name
else:
# Use last metric by default.
assert isinstance(data_log, collections.OrderedDict)
metric_name = list(data_log.keys())[-1]
if metric_name not in data_log:
raise ValueError(f"No metric named: {metric_name}")
# The latest score
score = data_log[metric_name][-1]
return self._update_rounds(score, data_name, metric_name, model, epoch)
def after_training(self, model: _Model) -> _Model:
    """Optionally truncate the trained model to its best iteration.

    Parameters
    ----------
    model :
        The model produced by training.

    Returns
    -------
    The input model unchanged when ``save_best`` is disabled, otherwise the
    slice ``model[: best_iteration + 1]`` carrying the best iteration and
    the best score of the input model.

    Raises
    ------
    XGBoostError
        If reading the best iteration/score or slicing the model raises an
        ``XGBoostError``; it is re-raised with a ``save_best`` specific message.
    """
    if not self.save_best:
        return model

    try:
        best_iteration = model.best_iteration
        best_score = model.best_score
        assert best_iteration is not None and best_score is not None
        model = model[: best_iteration + 1]
        # The slice is a different object; write the values back onto it so
        # they remain available on the returned model.
        model.best_iteration = best_iteration
        model.best_score = best_score
    except XGBoostError as e:
        raise XGBoostError(
            "`save_best` is not applicable to the current booster"
        ) from e
    return model
@@ -462,8 +480,6 @@ class EvaluationMonitor(TrainingCallback):
Parameters
----------
metric :
Extra user defined metric.
rank :
Which worker should be used for printing the result.
period :

View File

@@ -3,11 +3,13 @@
"""Core XGBoost Library."""
import copy
import ctypes
import importlib.util
import json
import os
import re
import sys
import warnings
import weakref
from abc import ABC, abstractmethod
from collections.abc import Mapping
from enum import IntEnum, unique
@@ -50,6 +52,7 @@ from ._typing import (
FeatureTypes,
ModelIn,
NumpyOrCupy,
TransformedData,
c_bst_ulong,
)
from .compat import PANDAS_INSTALLED, DataFrame, py_str
@@ -152,7 +155,11 @@ def _expect(expectations: Sequence[Type], got: Type) -> str:
def _log_callback(msg: bytes) -> None:
    """Redirect logs from native library into Python console.

    Messages containing ``WARNING:`` are emitted through :py:mod:`warnings` as
    ``UserWarning``; every other message is printed.

    Parameters
    ----------
    msg :
        Raw message bytes handed over by the native library.
    """
    # Decode exactly once; the decoded text is used by both branches.
    smsg = py_str(msg)
    if "WARNING:" in smsg:
        warnings.warn(smsg, UserWarning)
        return
    print(smsg)
def _get_log_callback_func() -> Callable:
@@ -228,8 +235,11 @@ Error message(s): {os_error_list}
def parse(ver: str) -> Tuple[int, int, int]:
    """Split a version string into ``(major, minor, patch)`` integers.

    Hand-rolled to avoid a dependency on ``packaging`` (PEP 440).
    """
    # Accepted forms: 2.0.0-dev, 2.0.0, or 2.0.0rc1
    release = ver.partition("-")[0]
    major, minor, patch = release.split(".")
    # A release-candidate suffix is attached to the patch field ("0rc1");
    # keep only the part in front of it.
    patch = patch.partition("rc")[0]
    return int(major), int(minor), int(patch)
libver = _lib_version(lib)
@@ -271,6 +281,44 @@ def _check_call(ret: int) -> None:
raise XGBoostError(py_str(_LIB.XGBGetLastError()))
def _check_distributed_params(kwargs: Dict[str, Any]) -> None:
"""Validate parameters in distributed environments."""
device = kwargs.get("device", None)
if device and not isinstance(device, str):
msg = "Invalid type for the `device` parameter"
msg += _expect((str,), type(device))
raise TypeError(msg)
if device and device.find(":") != -1:
raise ValueError(
"Distributed training doesn't support selecting device ordinal as GPUs are"
" managed by the distributed framework. use `device=cuda` or `device=gpu`"
" instead."
)
if kwargs.get("booster", None) == "gblinear":
raise NotImplementedError(
f"booster `{kwargs['booster']}` is not supported for distributed training."
)
def _validate_feature_info(
feature_info: Sequence[str], n_features: int, name: str
) -> List[str]:
if isinstance(feature_info, str) or not isinstance(feature_info, Sequence):
raise TypeError(
f"Expecting a sequence of strings for {name}, got: {type(feature_info)}"
)
feature_info = list(feature_info)
if len(feature_info) != n_features and n_features != 0:
msg = (
f"{name} must have the same length as the number of data columns, ",
f"expected {n_features}, got {len(feature_info)}",
)
raise ValueError(msg)
return feature_info
def build_info() -> dict:
"""Build information of XGBoost. The returned value format is not stable. Also,
please note that build time dependency is not the same as runtime dependency. For
@@ -381,6 +429,54 @@ def c_array(
return (ctype * len(values))(*values)
def from_array_interface(interface: dict) -> NumpyOrCupy:
    """Convert array interface to numpy or cupy array.

    The buffer described by ``interface`` is copied into a new array. An
    interface containing a ``stream`` entry is treated as a
    ``__cuda_array_interface__`` and handed to cupy; anything else goes to
    numpy.

    Raises
    ------
    ImportError
        If the interface describes a CUDA buffer and ``cupy`` cannot be found.
    """

    class Array:  # pylint: disable=too-few-public-methods
        """Wrapper type for communicating with numpy and cupy."""

        _interface: Optional[dict] = None

        @property
        def __array_interface__(self) -> Optional[dict]:
            return self._interface

        @__array_interface__.setter
        def __array_interface__(self, interface: dict) -> None:
            # Work on a shallow copy so the caller's dict keeps its own values.
            normalized = copy.copy(interface)
            self._interface = normalized
            # converts some fields to tuple as required by numpy
            for key in ("shape", "data"):
                normalized[key] = tuple(normalized[key])
            if normalized.get("strides", None) is not None:
                normalized["strides"] = tuple(normalized["strides"])

        @property
        def __cuda_array_interface__(self) -> Optional[dict]:
            return self.__array_interface__

        @__cuda_array_interface__.setter
        def __cuda_array_interface__(self, interface: dict) -> None:
            self.__array_interface__ = interface

    arr = Array()
    if "stream" not in interface:
        arr.__array_interface__ = interface
        return np.array(arr, copy=True)

    # CUDA stream is presented, this is a __cuda_array_interface__.
    if importlib.util.find_spec("cupy") is None:
        raise ImportError("`cupy` is required for handling CUDA buffer.")

    import cupy as cp  # pylint: disable=import-error

    arr.__cuda_array_interface__ = interface
    return cp.array(arr, copy=True)
def _prediction_output(
shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool
) -> NumpyOrCupy:
@@ -395,13 +491,21 @@ def _prediction_output(
class DataIter(ABC): # pylint: disable=too-many-instance-attributes
"""The interface for user defined data iterator.
"""The interface for user defined data iterator. The iterator facilitates
distributed training, :py:class:`QuantileDMatrix`, and external memory support using
:py:class:`DMatrix`. Most of the time, users don't need to interact with this class
directly.
.. note::
The class caches some intermediate results using the `data` input (predictor
`X`) as key. Don't repeat the `X` for multiple batches with different meta data
(like `label`), make a copy if necessary.
Parameters
----------
cache_prefix :
Prefix to the cache files, only used in external memory. It can be either an
URI or a file path.
Prefix to the cache files, only used in external memory.
release_data :
Whether the iterator should release the data during reset. Set it to True if the
data transformation (converting data to np.float32 type) is expensive.
@@ -419,13 +523,13 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
self._allow_host = True
self._release = release_data
# Stage data in Python until reset or next is called to avoid data being free.
self._temporary_data: Optional[Tuple[Any, Any, Any, Any]] = None
self._input_id: int = 0
self._temporary_data: Optional[TransformedData] = None
self._data_ref: Optional[weakref.ReferenceType] = None
def get_callbacks(
self, allow_host: bool, enable_categorical: bool
) -> Tuple[Callable, Callable]:
"""Get callback functions for iterating in C."""
"""Get callback functions for iterating in C. This is an internal function."""
assert hasattr(self, "cache_prefix"), "__init__ is not called."
self._reset_callback = ctypes.CFUNCTYPE(None, ctypes.c_void_p)(
self._reset_wrapper
@@ -491,8 +595,8 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
@require_keyword_args(True)
def input_data(
data: Any,
*,
data: Any,
feature_names: Optional[FeatureNames] = None,
feature_types: Optional[FeatureTypes] = None,
**kwargs: Any,
@@ -500,7 +604,19 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
from .data import _proxy_transform, dispatch_proxy_set_data
# Reduce the amount of transformation that's needed for QuantileDMatrix.
if self._temporary_data is not None and id(data) == self._input_id:
#
# To construct the QDM, one needs 4 iterations on CPU, or 2 iterations on
# GPU. If the QDM has only one batch of input (most of the cases), we can
# avoid transforming the data repeatedly.
try:
ref = weakref.ref(data)
except TypeError:
ref = None
if (
self._temporary_data is not None
and ref is not None
and ref is self._data_ref
):
new, cat_codes, feature_names, feature_types = self._temporary_data
else:
new, cat_codes, feature_names, feature_types = _proxy_transform(
@@ -517,7 +633,7 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
feature_types=feature_types,
**kwargs,
)
self._input_id = id(data)
self._data_ref = ref
# pylint: disable=not-callable
return self._handle_exception(lambda: self.next(input_data), 0)
@@ -593,6 +709,9 @@ def require_keyword_args(
@wraps(func)
def inner_f(*args: Any, **kwargs: Any) -> _T:
extra_args = len(args) - len(all_args)
if not all_args and extra_args > 0: # keyword argument only
raise TypeError("Keyword argument is required.")
if extra_args > 0:
# ignore first 'self' argument for instance methods
args_msg = [
@@ -1040,7 +1159,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
testing purposes. If this is a quantized DMatrix then quantized values are
returned instead of input values.
.. versionadded:: 1.7.0
.. versionadded:: 1.7.0
"""
indptr = np.empty(self.num_row() + 1, dtype=np.uint64)
@@ -1060,6 +1179,36 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
)
return ret
def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
    """Get quantile cuts for quantization.

    .. versionadded:: 2.0.0

    Returns
    -------
    A pair ``(indptr, data)``. ``indptr`` is a ``uint64`` array with
    ``num_col() + 1`` entries and ``data`` is a ``float32`` array whose size
    equals the last entry of ``indptr``.
    """
    n_features = self.num_col()

    # Output slots; the native call fills each with a JSON document that
    # describes an array interface.
    indptr_json = ctypes.c_char_p()
    data_json = ctypes.c_char_p()
    config = make_jcargs()
    _check_call(
        _LIB.XGDMatrixGetQuantileCut(
            self.handle,
            config,
            ctypes.byref(indptr_json),
            ctypes.byref(data_json),
        )
    )
    assert indptr_json.value is not None
    assert data_json.value is not None

    indptr = from_array_interface(json.loads(indptr_json.value))
    assert indptr.size == n_features + 1
    assert indptr.dtype == np.uint64

    data = from_array_interface(json.loads(data_json.value))
    assert data.size == indptr[-1]
    assert data.dtype == np.float32

    return indptr, data
def num_row(self) -> int:
"""Get the number of rows in the DMatrix."""
ret = c_bst_ulong()
@@ -1117,11 +1266,10 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
@property
def feature_names(self) -> Optional[FeatureNames]:
"""Get feature names (column labels).
"""Labels for features (column labels).
Setting it to ``None`` resets existing feature names.
Returns
-------
feature_names : list or None
"""
length = c_bst_ulong()
sarr = ctypes.POINTER(ctypes.c_char_p)()
@@ -1140,67 +1288,61 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
@feature_names.setter
def feature_names(self, feature_names: Optional[FeatureNames]) -> None:
"""Set feature names (column labels).
Parameters
----------
feature_names : list or None
Labels for features. None will reset existing feature names
"""
if feature_names is not None:
# validate feature name
try:
if not isinstance(feature_names, str):
feature_names = list(feature_names)
else:
feature_names = [feature_names]
except TypeError:
feature_names = [cast(str, feature_names)]
if len(feature_names) != len(set(feature_names)):
raise ValueError("feature_names must be unique")
if len(feature_names) != self.num_col() and self.num_col() != 0:
msg = (
"feature_names must have the same length as data, ",
f"expected {self.num_col()}, got {len(feature_names)}",
)
raise ValueError(msg)
# prohibit to use symbols may affect to parse. e.g. []<
if not all(
isinstance(f, str) and not any(x in f for x in ["[", "]", "<"])
for f in feature_names
):
raise ValueError(
"feature_names must be string, and may not contain [, ] or <"
)
feature_names_bytes = [bytes(f, encoding="utf-8") for f in feature_names]
c_feature_names = (ctypes.c_char_p * len(feature_names_bytes))(
*feature_names_bytes
)
_check_call(
_LIB.XGDMatrixSetStrFeatureInfo(
self.handle,
c_str("feature_name"),
c_feature_names,
c_bst_ulong(len(feature_names)),
)
)
else:
# reset feature_types also
if feature_names is None:
_check_call(
_LIB.XGDMatrixSetStrFeatureInfo(
self.handle, c_str("feature_name"), None, c_bst_ulong(0)
)
)
self.feature_types = None
return
# validate feature name
feature_names = _validate_feature_info(
feature_names, self.num_col(), "feature names"
)
if len(feature_names) != len(set(feature_names)):
values, counts = np.unique(
feature_names,
return_index=False,
return_inverse=False,
return_counts=True,
)
duplicates = [name for name, cnt in zip(values, counts) if cnt > 1]
raise ValueError(
f"feature_names must be unique. Duplicates found: {duplicates}"
)
# prohibit the use of symbols that may affect parsing, e.g. []<
if not all(
isinstance(f, str) and not any(x in f for x in ["[", "]", "<"])
for f in feature_names
):
raise ValueError(
"feature_names must be string, and may not contain [, ] or <"
)
feature_names_bytes = [bytes(f, encoding="utf-8") for f in feature_names]
c_feature_names = (ctypes.c_char_p * len(feature_names_bytes))(
*feature_names_bytes
)
_check_call(
_LIB.XGDMatrixSetStrFeatureInfo(
self.handle,
c_str("feature_name"),
c_feature_names,
c_bst_ulong(len(feature_names)),
)
)
@property
def feature_types(self) -> Optional[FeatureTypes]:
"""Get feature types (column types).
"""Type of features (column types).
This is for displaying the results and categorical data support. See
:py:class:`DMatrix` for details.
Setting it to ``None`` resets existing feature types.
Returns
-------
feature_types : list or None
"""
length = c_bst_ulong()
sarr = ctypes.POINTER(ctypes.c_char_p)()
@@ -1218,57 +1360,32 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
return res
@feature_types.setter
def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None:
"""Set feature types (column types).
This is for displaying the results and categorical data support. See
:py:class:`DMatrix` for details.
Parameters
----------
feature_types :
Labels for features. None will reset existing feature names
"""
# For compatibility reason this function wraps single str input into a list. But
# we should not promote such usage since other than visualization, the field is
# also used for specifying categorical data type.
if feature_types is not None:
if not isinstance(feature_types, (list, str)):
raise TypeError("feature_types must be string or list of strings")
if isinstance(feature_types, str):
# single string will be applied to all columns
feature_types = [feature_types] * self.num_col()
try:
if not isinstance(feature_types, str):
feature_types = list(feature_types)
else:
feature_types = [feature_types]
except TypeError:
feature_types = [cast(str, feature_types)]
feature_types_bytes = [bytes(f, encoding="utf-8") for f in feature_types]
c_feature_types = (ctypes.c_char_p * len(feature_types_bytes))(
*feature_types_bytes
)
_check_call(
_LIB.XGDMatrixSetStrFeatureInfo(
self.handle,
c_str("feature_type"),
c_feature_types,
c_bst_ulong(len(feature_types)),
)
)
if len(feature_types) != self.num_col() and self.num_col() != 0:
msg = "feature_types must have the same length as data"
raise ValueError(msg)
else:
# Reset.
def feature_types(self, feature_types: Optional[FeatureTypes]) -> None:
if feature_types is None:
# Reset
_check_call(
_LIB.XGDMatrixSetStrFeatureInfo(
self.handle, c_str("feature_type"), None, c_bst_ulong(0)
)
)
return
feature_types = _validate_feature_info(
feature_types, self.num_col(), "feature types"
)
feature_types_bytes = [bytes(f, encoding="utf-8") for f in feature_types]
c_feature_types = (ctypes.c_char_p * len(feature_types_bytes))(
*feature_types_bytes
)
_check_call(
_LIB.XGDMatrixSetStrFeatureInfo(
self.handle,
c_str("feature_type"),
c_feature_types,
c_bst_ulong(len(feature_types)),
)
)
class _ProxyDMatrix(DMatrix):
@@ -1318,13 +1435,13 @@ class _ProxyDMatrix(DMatrix):
class QuantileDMatrix(DMatrix):
"""A DMatrix variant that generates quantilized data directly from input for
``hist`` and ``gpu_hist`` tree methods. This DMatrix is primarily designed to save
memory in training by avoiding intermediate storage. Set ``max_bin`` to control the
number of bins during quantisation, which should be consistent with the training
parameter ``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset,
``ref`` should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as
it defeats the purpose of saving memory) constructed from training dataset. See
"""A DMatrix variant that generates quantilized data directly from input for the
``hist`` tree method. This DMatrix is primarily designed to save memory in training
by avoiding intermediate storage. Set ``max_bin`` to control the number of bins
during quantisation, which should be consistent with the training parameter
``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset, ``ref``
should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as it
defeats the purpose of saving memory) constructed from training dataset. See
:py:obj:`xgboost.DMatrix` for documents on meta info.
.. note::
@@ -1372,7 +1489,7 @@ class QuantileDMatrix(DMatrix):
enable_categorical: bool = False,
data_split_mode: DataSplitMode = DataSplitMode.ROW,
) -> None:
self.max_bin: int = max_bin if max_bin is not None else 256
self.max_bin = max_bin
self.missing = missing if missing is not None else np.nan
self.nthread = nthread if nthread is not None else -1
self._silent = silent # unused, kept for compatibility
@@ -1544,7 +1661,7 @@ class Booster:
)
for d in cache:
# Validate feature only after the feature names are saved into booster.
self._validate_dmatrix_features(d)
self._assign_dmatrix_features(d)
if isinstance(model_file, Booster):
assert self.handle is not None
@@ -1667,6 +1784,11 @@ class Booster:
self.__dict__.update(state)
def __getitem__(self, val: Union[int, tuple, slice]) -> "Booster":
"""Get a slice of the tree-based model.
.. versionadded:: 1.3.0
"""
if isinstance(val, int):
val = slice(val, val + 1)
if isinstance(val, tuple):
@@ -1705,6 +1827,11 @@ class Booster:
return sliced
def __iter__(self) -> Generator["Booster", None, None]:
"""Iterator method for getting individual trees.
.. versionadded:: 2.0.0
"""
for i in range(0, self.num_boosted_rounds()):
yield self[i]
@@ -1795,7 +1922,7 @@ class Booster:
attr_names = from_cstr_to_pystr(sarr, length)
return {n: self.attr(n) for n in attr_names}
def set_attr(self, **kwargs: Optional[str]) -> None:
def set_attr(self, **kwargs: Optional[Any]) -> None:
"""Set the attribute of the Booster.
Parameters
@@ -1915,7 +2042,7 @@ class Booster:
"""
if not isinstance(dtrain, DMatrix):
raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
self._validate_dmatrix_features(dtrain)
self._assign_dmatrix_features(dtrain)
if fobj is None:
_check_call(
@@ -1926,12 +2053,14 @@ class Booster:
else:
pred = self.predict(dtrain, output_margin=True, training=True)
grad, hess = fobj(pred, dtrain)
self.boost(dtrain, grad, hess)
self.boost(dtrain, iteration=iteration, grad=grad, hess=hess)
def boost(self, dtrain: DMatrix, grad: np.ndarray, hess: np.ndarray) -> None:
"""Boost the booster for one iteration, with customized gradient
statistics. Like :py:func:`xgboost.Booster.update`, this
function should not be called directly by users.
def boost(
self, dtrain: DMatrix, iteration: int, grad: NumpyOrCupy, hess: NumpyOrCupy
) -> None:
"""Boost the booster for one iteration with customized gradient statistics.
Like :py:func:`xgboost.Booster.update`, this function should not be called
directly by users.
Parameters
----------
@@ -1943,19 +2072,53 @@ class Booster:
The second order of gradient.
"""
if len(grad) != len(hess):
raise ValueError(f"grad / hess length mismatch: {len(grad)} / {len(hess)}")
if not isinstance(dtrain, DMatrix):
raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
self._validate_dmatrix_features(dtrain)
from .data import (
_array_interface,
_cuda_array_interface,
_ensure_np_dtype,
_is_cupy_array,
)
self._assign_dmatrix_features(dtrain)
def is_flatten(array: NumpyOrCupy) -> bool:
return len(array.shape) == 1 or array.shape[1] == 1
def array_interface(array: NumpyOrCupy) -> bytes:
# Can we check for __array_interface__ instead of a specific type instead?
msg = (
"Expecting `np.ndarray` or `cupy.ndarray` for gradient and hessian."
f" Got: {type(array)}"
)
if not isinstance(array, np.ndarray) and not _is_cupy_array(array):
raise TypeError(msg)
n_samples = dtrain.num_row()
if array.shape[0] != n_samples and is_flatten(array):
warnings.warn(
"Since 2.1.0, the shape of the gradient and hessian is required to"
" be (n_samples, n_targets) or (n_samples, n_classes).",
FutureWarning,
)
array = array.reshape(n_samples, array.size // n_samples)
if isinstance(array, np.ndarray):
array, _ = _ensure_np_dtype(array, array.dtype)
interface = _array_interface(array)
elif _is_cupy_array(array):
interface = _cuda_array_interface(array)
else:
raise TypeError(msg)
return interface
_check_call(
_LIB.XGBoosterBoostOneIter(
_LIB.XGBoosterTrainOneIter(
self.handle,
dtrain.handle,
c_array(ctypes.c_float, grad),
c_array(ctypes.c_float, hess),
c_bst_ulong(len(grad)),
iteration,
array_interface(grad),
array_interface(hess),
)
)
@@ -1988,7 +2151,7 @@ class Booster:
raise TypeError(f"expected DMatrix, got {type(d[0]).__name__}")
if not isinstance(d[1], str):
raise TypeError(f"expected string, got {type(d[1]).__name__}")
self._validate_dmatrix_features(d[0])
self._assign_dmatrix_features(d[0])
dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals])
evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
@@ -2040,7 +2203,7 @@ class Booster:
result: str
Evaluation result string.
"""
self._validate_dmatrix_features(data)
self._assign_dmatrix_features(data)
return self.eval_set([(data, name)], iteration)
# pylint: disable=too-many-function-args
@@ -2139,7 +2302,8 @@ class Booster:
if not isinstance(data, DMatrix):
raise TypeError("Expecting data to be a DMatrix object, got: ", type(data))
if validate_features:
self._validate_dmatrix_features(data)
fn = data.feature_names
self._validate_features(fn)
args = {
"type": 0,
"training": training,
@@ -2187,20 +2351,25 @@ class Booster:
base_margin: Any = None,
strict_shape: bool = False,
) -> NumpyOrCupy:
"""Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction
does not cache the prediction result.
"""Run prediction in-place when possible, Unlike :py:meth:`predict` method,
inplace prediction does not cache the prediction result.
Calling only ``inplace_predict`` in multiple threads is safe and lock
free. But the safety does not hold when used in conjunction with other
methods. E.g. you can't train the booster in one thread and perform
prediction in the other.
.. note::
If the device ordinal of the input data doesn't match the one configured for
the booster, data will be copied to the booster device.
.. code-block:: python
booster.set_param({"predictor": "gpu_predictor"})
booster.set_param({"device": "cuda:0"})
booster.inplace_predict(cupy_array)
booster.set_param({"predictor": "cpu_predictor"})
booster.set_param({"device": "cpu"})
booster.inplace_predict(numpy_array)
.. versionadded:: 1.1.0
@@ -2208,9 +2377,7 @@ class Booster:
Parameters
----------
data :
The input data, must not be a view for numpy array. Set
``predictor`` to ``gpu_predictor`` for running prediction on CuPy
array or CuDF DataFrame.
The input data.
iteration_range :
See :py:meth:`predict` for details.
predict_type :
@@ -2233,8 +2400,8 @@ class Booster:
Returns
-------
prediction : numpy.ndarray/cupy.ndarray
The prediction result. When input data is on GPU, prediction
result is stored in a cupy array.
The prediction result. When input data is on GPU, prediction result is
stored in a cupy array.
"""
preds = ctypes.POINTER(ctypes.c_float)()
@@ -2426,8 +2593,7 @@ class Booster:
return ctypes2buffer(cptr, length.value)
def load_model(self, fname: ModelIn) -> None:
"""Load the model from a file or bytearray. Path to file can be local
or as an URI.
"""Load the model from a file or a bytearray.
The model is loaded from XGBoost format which is universal among the various
XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
@@ -2460,10 +2626,35 @@ class Booster:
else:
raise TypeError("Unknown file type: ", fname)
if self.attr("best_iteration") is not None:
self.best_iteration = int(cast(int, self.attr("best_iteration")))
if self.attr("best_score") is not None:
self.best_score = float(cast(float, self.attr("best_score")))
@property
def best_iteration(self) -> int:
    """The best iteration during training.

    Read from the ``best_iteration`` booster attribute; raises
    ``AttributeError`` when that attribute is not set.
    """
    stored = self.attr("best_iteration")
    if stored is None:
        raise AttributeError(
            "`best_iteration` is only defined when early stopping is used."
        )
    return int(stored)

@best_iteration.setter
def best_iteration(self, iteration: int) -> None:
    # Stored as a booster attribute rather than a plain Python attribute.
    self.set_attr(best_iteration=iteration)
@property
def best_score(self) -> float:
    """The best evaluation score during training.

    Read from the ``best_score`` booster attribute; raises ``AttributeError``
    when that attribute is not set.
    """
    best = self.attr("best_score")
    if best is not None:
        return float(best)
    raise AttributeError(
        "`best_score` is only defined when early stopping is used."
    )

@best_score.setter
def best_score(self, score: float) -> None:
    # Annotated as float to agree with the getter's return type.
    self.set_attr(best_score=score)
def num_boosted_rounds(self) -> int:
"""Get number of boosted rounds. For gblinear this is reset to 0 after
@@ -2761,14 +2952,13 @@ class Booster:
# pylint: disable=no-member
return df.sort(["Tree", "Node"]).reset_index(drop=True)
def _validate_dmatrix_features(self, data: DMatrix) -> None:
def _assign_dmatrix_features(self, data: DMatrix) -> None:
if data.num_row() == 0:
return
fn = data.feature_names
ft = data.feature_types
# Be consistent with versions before 1.7, "validate" actually modifies the
# booster.
if self.feature_names is None:
self.feature_names = fn
if self.feature_types is None:

View File

@@ -47,6 +47,7 @@ from typing import (
Callable,
Dict,
Generator,
Iterable,
List,
Optional,
Sequence,
@@ -70,6 +71,7 @@ from .core import (
Metric,
Objective,
QuantileDMatrix,
_check_distributed_params,
_deprecate_positional_args,
_expect,
)
@@ -82,6 +84,7 @@ from .sklearn import (
XGBRanker,
XGBRankerMixIn,
XGBRegressorBase,
_can_use_qdm,
_check_rf_callback,
_cls_predict_proba,
_objective_decorator,
@@ -95,10 +98,12 @@ if TYPE_CHECKING:
import dask
import distributed
from dask import array as da
from dask import bag as db
from dask import dataframe as dd
else:
dd = LazyLoader("dd", globals(), "dask.dataframe")
da = LazyLoader("da", globals(), "dask.array")
db = LazyLoader("db", globals(), "dask.bag")
dask = LazyLoader("dask", globals(), "dask")
distributed = LazyLoader("distributed", globals(), "dask.distributed")
@@ -507,12 +512,10 @@ async def map_worker_partitions(
func: Callable[..., _MapRetT],
*refs: Any,
workers: Sequence[str],
) -> List[_MapRetT]:
) -> _MapRetT:
"""Map a function onto partitions of each worker."""
# Note for function purity:
# XGBoost is deterministic in most of the cases, which means train function is
# supposed to be idempotent. One known exception is gblinear with shotgun updater.
# We haven't been able to do a full verification so here we keep pure to be False.
# XGBoost is sensitive to data partition and uses random number generator.
client = _xgb_get_client(client)
futures = []
for addr in workers:
@@ -524,11 +527,26 @@ async def map_worker_partitions(
else:
args.append(ref)
fut = client.submit(
func, *args, pure=False, workers=[addr], allow_other_workers=False
# turn result into a list for bag construction
lambda *args, **kwargs: [func(*args, **kwargs)],
*args,
pure=False,
workers=[addr],
allow_other_workers=False,
)
futures.append(fut)
results = await client.gather(futures)
return results
def first_valid(results: Iterable[Optional[_MapRetT]]) -> Optional[_MapRetT]:
    """Return the first entry of ``results`` that is not None.

    Returns None when every entry is None or ``results`` is empty.
    """
    return next((item for item in results if item is not None), None)
bag = db.from_delayed(futures)
fut = await bag.reduction(first_valid, first_valid)
result = await client.compute(fut).result()
return result
_DataParts = List[Dict[str, Any]]
@@ -617,14 +635,7 @@ class DaskPartitionIter(DataIter): # pylint: disable=R0902
if self._iter == len(self._data):
# Return 0 when there's no more batch.
return 0
feature_names: Optional[FeatureNames] = None
if self._feature_names:
feature_names = self._feature_names
else:
if hasattr(self.data(), "columns"):
feature_names = self.data().columns.format()
else:
feature_names = None
input_data(
data=self.data(),
label=self._get("_label"),
@@ -634,7 +645,7 @@ class DaskPartitionIter(DataIter): # pylint: disable=R0902
base_margin=self._get("_base_margin"),
label_lower_bound=self._get("_label_lower_bound"),
label_upper_bound=self._get("_label_upper_bound"),
feature_names=feature_names,
feature_names=self._feature_names,
feature_types=self._feature_types,
feature_weights=self._feature_weights,
)
@@ -855,8 +866,6 @@ async def _get_rabit_args(
except Exception: # pylint: disable=broad-except
sched_addr = None
# make sure all workers are online so that we can obtain reliable scheduler_info
await client.wait_for_workers(n_workers) # type: ignore
env = await client.run_on_scheduler(
_start_tracker, n_workers, sched_addr, user_addr
)
@@ -889,27 +898,14 @@ def _get_workers_from_data(
return list(X_worker_map)
def _filter_empty(
    booster: Booster, local_history: TrainingCallback.EvalsLog, is_valid: bool
) -> Optional[TrainReturnT]:
    """Keep the training result only on the lowest-ranked worker with valid data.

    Each worker contributes a flag at its own rank, the flags are summed across
    workers with an allreduce, and the first rank whose flag is set is selected.

    Returns
    -------
    The ``{"booster": ..., "history": ...}`` dictionary on the selected worker, and
    None on every other worker.

    Raises
    ------
    ValueError
        If no worker reports a valid result.
    """
    world_size = collective.get_world_size()
    rank = collective.get_rank()

    # One slot per worker; only this worker's own slot is written locally.
    votes = numpy.zeros(shape=(world_size,), dtype=numpy.int32)
    votes[rank] = int(is_valid)
    valid_mask = collective.allreduce(votes, collective.Op.SUM).astype(bool)

    first_valid_rank = next(
        (idx for idx in range(valid_mask.size) if valid_mask[idx]), None
    )
    if first_valid_rank is None:
        raise ValueError("None of the workers can provide a valid result.")
    if first_valid_rank != rank:
        return None
    return {
        "booster": booster,
        "history": local_history,
    }
async def _check_workers_are_alive(
    workers: List[str], client: "distributed.Client"
) -> None:
    """Verify that every address in ``workers`` is listed by the scheduler.

    Raises
    ------
    RuntimeError
        If one or more of the requested workers is absent from the ``"workers"``
        entry of the scheduler's identity information.
    """
    scheduler_info = await client.scheduler.identity()
    known = scheduler_info["workers"].keys()
    absent = set(workers).difference(known)
    if not absent:
        return
    raise RuntimeError(f"Missing required workers: {absent}")
async def _train_async(
@@ -929,12 +925,9 @@ async def _train_async(
custom_metric: Optional[Metric],
) -> Optional[TrainReturnT]:
workers = _get_workers_from_data(dtrain, evals)
await _check_workers_are_alive(workers, client)
_rabit_args = await _get_rabit_args(len(workers), dconfig, client)
if params.get("booster", None) == "gblinear":
raise NotImplementedError(
f"booster `{params['booster']}` is not yet supported for dask."
)
_check_distributed_params(params)
def dispatched_train(
parameters: Dict,
@@ -997,10 +990,17 @@ async def _train_async(
xgb_model=xgb_model,
callbacks=callbacks,
)
# Don't return the boosters from empty workers. It's quite difficult to
# guarantee everything is in sync in the presence of empty workers,
# especially with complex objectives like quantile.
return _filter_empty(booster, local_history, Xy.num_row() != 0)
# Don't return the boosters from empty workers. It's quite difficult to
# guarantee everything is in sync in the presence of empty workers, especially
# with complex objectives like quantile.
if Xy.num_row() != 0:
ret: Optional[TrainReturnT] = {
"booster": booster,
"history": local_history,
}
else:
ret = None
return ret
async with distributed.MultiLock(workers, client):
if evals is not None:
@@ -1012,7 +1012,7 @@ async def _train_async(
evals_name = []
evals_id = []
results = await map_worker_partitions(
result = await map_worker_partitions(
client,
dispatched_train,
# extra function parameters
@@ -1025,7 +1025,7 @@ async def _train_async(
# workers to be used for training
workers=workers,
)
return list(filter(lambda ret: ret is not None, results))[0]
return result
@_deprecate_positional_args
@@ -1574,7 +1574,7 @@ async def _async_wrap_evaluation_matrices(
"""A switch function for async environment."""
def _dispatch(ref: Optional[DaskDMatrix], **kwargs: Any) -> DaskDMatrix:
if tree_method in ("hist", "gpu_hist"):
if _can_use_qdm(tree_method):
return DaskQuantileDMatrix(
client=client, ref=ref, max_bin=max_bin, **kwargs
)

View File

@@ -5,7 +5,7 @@ import ctypes
import json
import os
import warnings
from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, Union, cast
from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, cast
import numpy as np
@@ -17,6 +17,7 @@ from ._typing import (
FloatCompatible,
NumpyDType,
PandasDType,
TransformedData,
c_bst_ulong,
)
from .compat import DataFrame, lazy_isinstance
@@ -197,6 +198,7 @@ def _from_numpy_array(
nthread: int,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
data_split_mode: DataSplitMode = DataSplitMode.ROW,
) -> DispatchedDataBackendReturnType:
"""Initialize data from a 2-D numpy matrix."""
_check_data_shape(data)
@@ -205,7 +207,11 @@ def _from_numpy_array(
_check_call(
_LIB.XGDMatrixCreateFromDense(
_array_interface(data),
make_jcargs(missing=float(missing), nthread=int(nthread)),
make_jcargs(
missing=float(missing),
nthread=int(nthread),
data_split_mode=int(data_split_mode),
),
ctypes.byref(handle),
)
)
@@ -1046,7 +1052,9 @@ def dispatch_data_backend(
data.tocsr(), missing, threads, feature_names, feature_types
)
if _is_numpy_array(data):
return _from_numpy_array(data, missing, threads, feature_names, feature_types)
return _from_numpy_array(
data, missing, threads, feature_names, feature_types, data_split_mode
)
if _is_uri(data):
return _from_uri(data, missing, feature_names, feature_types, data_split_mode)
if _is_list(data):
@@ -1261,12 +1269,7 @@ def _proxy_transform(
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
enable_categorical: bool,
) -> Tuple[
Union[bool, ctypes.c_void_p, np.ndarray],
Optional[list],
Optional[FeatureNames],
Optional[FeatureTypes],
]:
) -> TransformedData:
if _is_cudf_df(data) or _is_cudf_ser(data):
return _transform_cudf_df(
data, feature_names, feature_types, enable_categorical

View File

@@ -27,7 +27,7 @@ def find_lib_path() -> List[str]:
os.path.join(curr_path, os.path.pardir, os.path.pardir, "lib"),
# use libxgboost from a system prefix, if available. This should be the last
# option.
os.path.join(sys.prefix, "lib"),
os.path.join(sys.base_prefix, "lib"),
]
if sys.platform == "win32":
@@ -62,8 +62,8 @@ def find_lib_path() -> List[str]:
+ ("\n- ".join(dll_path))
+ "\nXGBoost Python package path: "
+ curr_path
+ "\nsys.prefix: "
+ sys.prefix
+ "\nsys.base_prefix: "
+ sys.base_prefix
+ "\nSee: "
+ link
+ " for installing XGBoost."

View File

@@ -1,169 +0,0 @@
"""Compatibility shim for xgboost.rabit; to be removed in 2.0"""
import logging
import warnings
from enum import IntEnum, unique
from typing import Any, Callable, List, Optional, TypeVar
import numpy as np
from . import collective
LOGGER = logging.getLogger("[xgboost.rabit]")
def _deprecation_warning() -> str:
    """Return the message announcing the deprecation of ``xgboost.rabit``."""
    message = (
        "The xgboost.rabit submodule is marked as deprecated in 1.7 and will be removed "
        "in 2.0. Please use xgboost.collective instead."
    )
    return message
def init(args: Optional[List[bytes]] = None) -> None:
    """Initialize the collective communicator from rabit-style arguments.

    A ``FutureWarning`` about the deprecation of this module is always issued.

    Parameters
    ----------
    args :
        Byte strings of the form ``b"key=value"``. Each one is decoded and split on
        ``"="``; entries that do not split into exactly two parts are ignored. The
        remaining pairs are passed to ``collective.init`` as keyword arguments.
    """
    warnings.warn(_deprecation_warning(), FutureWarning)
    pieces = (raw.decode().split("=") for raw in args or [])
    parsed = {kv[0]: kv[1] for kv in pieces if len(kv) == 2}
    collective.init(**parsed)
def finalize() -> None:
    """Finalize the process and notify the tracker that everything is done.

    Thin wrapper around ``collective.finalize``.
    """
    collective.finalize()
def get_rank() -> int:
    """Return the rank of the current process.

    Thin wrapper around ``collective.get_rank``.

    Returns
    -------
    rank : int
        Rank of the current process.
    """
    rank = collective.get_rank()
    return rank
def get_world_size() -> int:
    """Return the total number of workers.

    Thin wrapper around ``collective.get_world_size``.

    Returns
    -------
    n : int
        Total number of processes.
    """
    n_workers = collective.get_world_size()
    return n_workers
def is_distributed() -> int:
    """Report whether rabit is running in distributed mode.

    Returns whatever ``collective.is_distributed`` returns.
    """
    distributed_flag = collective.is_distributed()
    return distributed_flag
def tracker_print(msg: Any) -> None:
    """Print a message to the tracker.

    This can be used to report progress information to the tracker. The message is
    forwarded unchanged to ``collective.communicator_print``.

    Parameters
    ----------
    msg :
        The message to be printed to the tracker.
    """
    collective.communicator_print(msg)
def get_processor_name() -> bytes:
    """Get the processor (host) name as bytes.

    Returns
    -------
    name : bytes
        The value of ``collective.get_processor_name()`` encoded with
        ``str.encode()`` (default encoding).
    """
    name = collective.get_processor_name()
    return name.encode()
T = TypeVar("T") # pylint:disable=invalid-name
def broadcast(data: T, root: int) -> T:
    """Broadcast an object from one node to all other nodes.

    Thin wrapper around ``collective.broadcast``.

    Parameters
    ----------
    data :
        Any object that can be pickled. On ranks other than ``root`` this can be
        None.
    root :
        Rank of the node to broadcast the data from.

    Returns
    -------
    object :
        The result of the broadcast.
    """
    received = collective.broadcast(data, root)
    return received
@unique
class Op(IntEnum):
    """Reduction operations supported by rabit.

    ``allreduce`` in this module converts a member to ``collective.Op`` by value.
    """

    MAX = 0  # maximum
    MIN = 1  # minimum
    SUM = 2  # sum
    OR = 3  # or
def allreduce(  # pylint:disable=invalid-name
    data: np.ndarray, op: Op, prepare_fun: Optional[Callable[[np.ndarray], None]] = None
) -> np.ndarray:
    """Perform an allreduce and return the result.

    Parameters
    ----------
    data :
        Input data.
    op :
        Reduction operator, one of MAX, MIN, SUM or OR. It is converted to
        ``collective.Op`` before the call is forwarded.
    prepare_fun :
        Legacy lazy-preprocessing hook. It is no longer supported and must be left
        as None.

    Returns
    -------
    result :
        The result of allreduce, with the same shape as ``data``.

    Raises
    ------
    ValueError
        If ``prepare_fun`` is not None.

    Notes
    -----
    This function is not thread-safe.
    """
    if prepare_fun is not None:
        raise ValueError("preprocessing function is no longer supported")
    return collective.allreduce(data, collective.Op(op))
def version_number() -> int:
    """Return the version number of the currently stored model.

    Historically this was the number of calls to CheckPoint made so far. This
    shim always reports 0.

    Returns
    -------
    version : int
        Version number of the currently stored model.
    """
    stored_version = 0
    return stored_version
class RabitContext:
    """Context manager that initializes rabit on entry and finalizes it on exit."""

    def __init__(self, args: Optional[List[bytes]] = None) -> None:
        # Keep the rabit arguments for ``__enter__``; a missing value becomes a
        # fresh empty list.
        self.args = [] if args is None else args

    def __enter__(self) -> None:
        init(self.args)
        assert is_distributed()
        # ``init`` already issues a FutureWarning; the notice is logged as well.
        LOGGER.warning(_deprecation_warning())
        LOGGER.debug("-------------- rabit say hello ------------------")

    def __exit__(self, *args: List) -> None:
        finalize()
        LOGGER.debug("--------------- rabit say bye ------------------")

View File

@@ -76,6 +76,10 @@ def _check_rf_callback(
)
def _can_use_qdm(tree_method: Optional[str]) -> bool:
    """Return True when ``tree_method`` is ``hist``, ``gpu_hist``, ``auto`` or None."""
    qdm_tree_methods = ("hist", "gpu_hist", None, "auto")
    return tree_method in qdm_tree_methods
SklObjective = Optional[
Union[str, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]]
]
@@ -226,10 +230,10 @@ __model_doc = f"""
subsample : Optional[float]
Subsample ratio of the training instance.
sampling_method :
Sampling method. Used only by `gpu_hist` tree method.
- `uniform`: select random training instances uniformly.
- `gradient_based` select random training instances with higher probability when
the gradient and hessian are larger. (cf. CatBoost)
Sampling method. Used only by the GPU version of ``hist`` tree method.
- ``uniform``: select random training instances uniformly.
- ``gradient_based`` select random training instances with higher probability
when the gradient and hessian are larger. (cf. CatBoost)
colsample_bytree : Optional[float]
Subsample ratio of columns when constructing each tree.
colsample_bylevel : Optional[float]
@@ -273,13 +277,16 @@ __model_doc = f"""
* For linear model, only "weight" is defined and it's the normalized coefficients
without bias.
gpu_id : Optional[int]
Device ordinal.
device : Optional[str]
.. versionadded:: 2.0.0
Device ordinal, available options are `cpu`, `cuda`, and `gpu`.
validate_parameters : Optional[bool]
Give warnings for unknown parameter.
predictor : Optional[str]
Force XGBoost to use specific predictor, available choices are [cpu_predictor,
gpu_predictor].
enable_categorical : bool
.. versionadded:: 1.5.0
@@ -381,17 +388,21 @@ __model_doc = f"""
every **early_stopping_rounds** round(s) to continue training. Requires at
least one item in **eval_set** in :py:meth:`fit`.
- The method returns the model from the last iteration, not the best one, use a
callback :py:class:`xgboost.callback.EarlyStopping` if returning the best
model is preferred.
- If early stopping occurs, the model will have two additional attributes:
:py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the
:py:meth:`predict` and :py:meth:`apply` methods to determine the optimal
number of trees during inference. If users want to access the full model
(including trees built after early stopping), they can specify the
`iteration_range` in these inference methods. In addition, other utilities
like model plotting can also use the entire model.
- If you prefer to discard the trees after `best_iteration`, consider using the
callback function :py:class:`xgboost.callback.EarlyStopping`.
- If there's more than one item in **eval_set**, the last entry will be used for
early stopping. If there's more than one metric in **eval_metric**, the last
metric will be used for early stopping.
- If early stopping occurs, the model will have three additional fields:
:py:attr:`best_score`, :py:attr:`best_iteration`.
.. note::
This parameter replaces `early_stopping_rounds` in :py:meth:`fit` method.
@@ -646,9 +657,8 @@ class XGBModel(XGBModelBase):
monotone_constraints: Optional[Union[Dict[str, int], str]] = None,
interaction_constraints: Optional[Union[str, Sequence[Sequence[str]]]] = None,
importance_type: Optional[str] = None,
gpu_id: Optional[int] = None,
device: Optional[str] = None,
validate_parameters: Optional[bool] = None,
predictor: Optional[str] = None,
enable_categorical: bool = False,
feature_types: Optional[FeatureTypes] = None,
max_cat_to_onehot: Optional[int] = None,
@@ -693,9 +703,8 @@ class XGBModel(XGBModelBase):
self.monotone_constraints = monotone_constraints
self.interaction_constraints = interaction_constraints
self.importance_type = importance_type
self.gpu_id = gpu_id
self.device = device
self.validate_parameters = validate_parameters
self.predictor = predictor
self.enable_categorical = enable_categorical
self.feature_types = feature_types
self.max_cat_to_onehot = max_cat_to_onehot
@@ -931,8 +940,7 @@ class XGBModel(XGBModelBase):
callbacks = self.callbacks if self.callbacks is not None else callbacks
tree_method = params.get("tree_method", None)
cat_support = {"gpu_hist", "approx", "hist"}
if self.enable_categorical and tree_method not in cat_support:
if self.enable_categorical and tree_method == "exact":
raise ValueError(
"Experimental support for categorical data is not implemented for"
" current tree method yet."
@@ -941,7 +949,7 @@ class XGBModel(XGBModelBase):
def _create_dmatrix(self, ref: Optional[DMatrix], **kwargs: Any) -> DMatrix:
# Use `QuantileDMatrix` to save memory.
if self.tree_method in ("hist", "gpu_hist"):
if _can_use_qdm(self.tree_method) and self.booster != "gblinear":
try:
return QuantileDMatrix(
**kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin
@@ -984,12 +992,12 @@ class XGBModel(XGBModelBase):
X :
Feature matrix. See :ref:`py-data` for a list of supported types.
When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the
When the ``tree_method`` is set to ``hist``, internally, the
:py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix`
for conserving memory. However, this has performance implications when the
device of input data is not matched with algorithm. For instance, if the
input is a numpy array on CPU but ``gpu_hist`` is used for training, then
the data is first processed on CPU then transferred to GPU.
input is a numpy array on CPU but ``cuda`` is used for training, then the
data is first processed on CPU then transferred to GPU.
y :
Labels
sample_weight :
@@ -1002,13 +1010,17 @@ class XGBModel(XGBModelBase):
Validation metrics will help us track the performance of the model.
eval_metric : str, list of str, or callable, optional
.. deprecated:: 1.6.0
Use `eval_metric` in :py:meth:`__init__` or :py:meth:`set_params` instead.
Use `eval_metric` in :py:meth:`__init__` or :py:meth:`set_params` instead.
early_stopping_rounds : int
.. deprecated:: 1.6.0
Use `early_stopping_rounds` in :py:meth:`__init__` or
:py:meth:`set_params` instead.
Use `early_stopping_rounds` in :py:meth:`__init__` or :py:meth:`set_params`
instead.
verbose :
If `verbose` is True and an evaluation set is used, the evaluation metric
measured on the validation set is printed to stdout at each boosting stage.
@@ -1089,12 +1101,7 @@ class XGBModel(XGBModelBase):
return self
def _can_use_inplace_predict(self) -> bool:
# When predictor is explicitly set, using `inplace_predict` might result into
# error with incompatible data type.
# Inplace predict doesn't handle as many data types as DMatrix, but it's
# sufficient for the dask interface, where the input is simpler.
predictor = self.get_xgb_params().get("predictor", None)
if predictor in ("auto", None) and self.booster != "gblinear":
if self.booster != "gblinear":
return True
return False
@@ -1120,9 +1127,9 @@ class XGBModel(XGBModelBase):
iteration_range: Optional[Tuple[int, int]] = None,
) -> ArrayLike:
"""Predict with `X`. If the model is trained with early stopping, then
:py:attr:`best_iteration` is used automatically. For tree models, when data is
on GPU, like cupy array or cuDF dataframe and `predictor` is not specified, the
prediction is run on GPU automatically, otherwise it will run on CPU.
:py:attr:`best_iteration` is used automatically. The estimator uses
`inplace_predict` by default and falls back to using :py:class:`DMatrix` if
devices between the data and the estimator don't match.
.. note:: This function is only thread safe for `gbtree` and `dart`.
@@ -1272,19 +1279,10 @@ class XGBModel(XGBModelBase):
)
return np.array(feature_names)
def _early_stopping_attr(self, attr: str) -> Union[float, int]:
    """Fetch an early-stopping attribute from the underlying booster.

    Parameters
    ----------
    attr :
        Name of the attribute to read from the object returned by
        ``self.get_booster()``.

    Raises
    ------
    AttributeError
        If the booster does not have the requested attribute. The original error
        is chained as the cause.
    """
    booster = self.get_booster()
    try:
        return getattr(booster, attr)
    except AttributeError as e:
        # Fixed typo in the message: "in only defined" -> "is only defined".
        raise AttributeError(
            f"`{attr}` is only defined when early stopping is used."
        ) from e
@property
def best_score(self) -> float:
"""The best score obtained by early stopping."""
return float(self._early_stopping_attr("best_score"))
return self.get_booster().best_score
@property
def best_iteration(self) -> int:
@@ -1292,7 +1290,7 @@ class XGBModel(XGBModelBase):
for instance if the best iteration is the first round, then best_iteration is 0.
"""
return int(self._early_stopping_attr("best_iteration"))
return self.get_booster().best_iteration
@property
def feature_importances_(self) -> np.ndarray:
@@ -1361,25 +1359,25 @@ class XGBModel(XGBModelBase):
@property
def intercept_(self) -> np.ndarray:
"""
Intercept (bias) property
"""Intercept (bias) property
.. note:: Intercept is defined only for linear learners
Intercept (bias) is only defined when the linear model is chosen as base
learner (`booster=gblinear`). It is not defined for other base learner types,
such as tree learners (`booster=gbtree`).
For tree-based model, the returned value is the `base_score`.
Returns
-------
intercept_ : array of shape ``(1,)`` or ``[n_classes]``
"""
if self.get_xgb_params()["booster"] != "gblinear":
raise AttributeError(
f"Intercept (bias) is not defined for Booster type {self.booster}"
)
booster_config = self.get_xgb_params()["booster"]
b = self.get_booster()
return np.array(json.loads(b.get_dump(dump_format="json")[0])["bias"])
if booster_config != "gblinear": # gbtree, dart
config = json.loads(b.save_config())
intercept = config["learner"]["learner_model_param"]["base_score"]
return np.array([float(intercept)], dtype=np.float32)
return np.array(
json.loads(b.get_dump(dump_format="json")[0])["bias"], dtype=np.float32
)
PredtT = TypeVar("PredtT", bound=np.ndarray)
@@ -1584,7 +1582,9 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
) -> np.ndarray:
"""Predict the probability of each `X` example being of a given class. If the
model is trained with early stopping, then :py:attr:`best_iteration` is used
automatically.
automatically. The estimator uses `inplace_predict` by default and falls back to
using :py:class:`DMatrix` if devices between the data and the estimator don't
match.
.. note:: This function is only thread safe for `gbtree` and `dart`.
@@ -1917,12 +1917,12 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
| 1 | :math:`x_{20}` | :math:`x_{21}` |
+-----+----------------+----------------+
When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the
When the ``tree_method`` is set to ``hist``, internally, the
:py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix`
for conserving memory. However, this has performance implications when the
device of input data is not matched with algorithm. For instance, if the
input is a numpy array on CPU but ``gpu_hist`` is used for training, then
the data is first processed on CPU then transferred to GPU.
input is a numpy array on CPU but ``cuda`` is used for training, then the
data is first processed on CPU then transferred to GPU.
y :
Labels
group :

View File

@@ -1,4 +1,4 @@
"""Xgboost pyspark integration submodule for core code."""
"""XGBoost pyspark integration submodule for core code."""
import base64
# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
@@ -60,10 +60,11 @@ from scipy.special import expit, softmax # pylint: disable=no-name-in-module
import xgboost
from xgboost import XGBClassifier
from xgboost.compat import is_cudf_available
from xgboost.core import Booster
from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel
from xgboost.core import Booster, _check_distributed_params
from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm
from xgboost.training import train as worker_train
from .._typing import ArrayLike
from .data import (
_read_csr_matrix_from_unwrapped_spark_vec,
alias,
@@ -92,6 +93,7 @@ from .utils import (
get_class_name,
get_logger,
serialize_booster,
use_cuda,
)
# Put pyspark specific params here, they won't be passed to XGBoost.
@@ -108,13 +110,13 @@ _pyspark_specific_params = [
"arbitrary_params_dict",
"force_repartition",
"num_workers",
"use_gpu",
"feature_names",
"features_cols",
"enable_sparse_data_optim",
"qid_col",
"repartition_random_shuffle",
"pred_contrib_col",
"use_gpu",
]
_non_booster_params = ["missing", "n_estimators", "feature_types", "feature_weights"]
@@ -132,7 +134,7 @@ _pyspark_param_alias_map = {
_inverse_pyspark_param_alias_map = {v: k for k, v in _pyspark_param_alias_map.items()}
_unsupported_xgb_params = [
"gpu_id", # we have "use_gpu" pyspark param instead.
"gpu_id", # we have "device" pyspark param instead.
"enable_categorical", # Use feature_types param to specify categorical feature instead
"use_label_encoder",
"n_jobs", # Do not allow user to set it, will use `spark.task.cpus` value instead.
@@ -197,11 +199,24 @@ class _SparkXGBParams(
"The number of XGBoost workers. Each XGBoost worker corresponds to one spark task.",
TypeConverters.toInt,
)
device = Param(
Params._dummy(),
"device",
(
"The device type for XGBoost executors. Available options are `cpu`,`cuda`"
" and `gpu`. Set `device` to `cuda` or `gpu` if the executors are running "
"on GPU instances. Currently, only one GPU per task is supported."
),
TypeConverters.toString,
)
use_gpu = Param(
Params._dummy(),
"use_gpu",
"A boolean variable. Set use_gpu=true if the executors "
+ "are running on GPU instances. Currently, only one GPU per task is supported.",
(
"Deprecated, use `device` instead. A boolean variable. Set use_gpu=true "
"if the executors are running on GPU instances. Currently, only one GPU per"
" task is supported."
),
TypeConverters.toBoolean,
)
force_repartition = Param(
@@ -335,10 +350,18 @@ class _SparkXGBParams(
f"It cannot be less than 1 [Default is 1]"
)
tree_method = self.getOrDefault(self.getParam("tree_method"))
if tree_method == "exact":
raise ValueError(
"The `exact` tree method is not supported for distributed systems."
)
if self.getOrDefault(self.features_cols):
if not self.getOrDefault(self.use_gpu):
if not use_cuda(self.getOrDefault(self.device)) and not self.getOrDefault(
self.use_gpu
):
raise ValueError(
"features_col param with list value requires enabling use_gpu."
"features_col param with list value requires `device=cuda`."
)
if self.getOrDefault("objective") is not None:
@@ -391,17 +414,7 @@ class _SparkXGBParams(
"`pyspark.ml.linalg.Vector` type."
)
if self.getOrDefault(self.use_gpu):
tree_method = self.getParam("tree_method")
if (
self.getOrDefault(tree_method) is not None
and self.getOrDefault(tree_method) != "gpu_hist"
):
raise ValueError(
f"tree_method should be 'gpu_hist' or None when use_gpu is True,"
f"found {self.getOrDefault(tree_method)}."
)
if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
gpu_per_task = (
_get_spark_session()
.sparkContext.getConf()
@@ -412,35 +425,41 @@ class _SparkXGBParams(
if is_local:
# checking spark local mode.
if gpu_per_task:
if gpu_per_task is not None:
raise RuntimeError(
"The spark cluster does not support gpu configuration for local mode. "
"Please delete spark.executor.resource.gpu.amount and "
"The spark local mode does not support gpu configuration."
"Please remove spark.executor.resource.gpu.amount and "
"spark.task.resource.gpu.amount"
)
# Support GPU training in Spark local mode is just for debugging purposes,
# so it's okay for printing the below warning instead of checking the real
# gpu numbers and raising the exception.
# Supporting GPU training in Spark local mode is just for debugging
# purposes, so it's okay to print the warning below instead of
# checking the real number of GPUs and raising an exception.
get_logger(self.__class__.__name__).warning(
"You enabled use_gpu in spark local mode. Please make sure your local node "
"has at least %d GPUs",
"You have enabled GPU in spark local mode. Please make sure your"
" local node has at least %d GPUs",
self.getOrDefault(self.num_workers),
)
else:
# checking spark non-local mode.
if not gpu_per_task or int(gpu_per_task) < 1:
raise RuntimeError(
"The spark cluster does not have the necessary GPU"
+ "configuration for the spark task. Therefore, we cannot"
+ "run xgboost training using GPU."
)
if gpu_per_task is not None:
if float(gpu_per_task) < 1.0:
raise ValueError(
"XGBoost doesn't support GPU fractional configurations. "
"Please set `spark.task.resource.gpu.amount=spark.executor"
".resource.gpu.amount`"
)
if int(gpu_per_task) > 1:
get_logger(self.__class__.__name__).warning(
"You configured %s GPU cores for each spark task, but in "
"XGBoost training, every Spark task will only use one GPU core.",
gpu_per_task,
if float(gpu_per_task) > 1.0:
get_logger(self.__class__.__name__).warning(
"%s GPUs for each Spark task is configured, but each "
"XGBoost training task uses only 1 GPU.",
gpu_per_task,
)
else:
raise ValueError(
"The `spark.task.resource.gpu.amount` is required for training"
" on GPU."
)
@@ -557,6 +576,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
# they are added in `setParams`.
self._setDefault(
num_workers=1,
device="cpu",
use_gpu=False,
force_repartition=False,
repartition_random_shuffle=False,
@@ -565,9 +585,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
arbitrary_params_dict={},
)
def setParams(
self, **kwargs: Dict[str, Any]
) -> None: # pylint: disable=invalid-name
def setParams(self, **kwargs: Any) -> None: # pylint: disable=invalid-name
"""
Set params for the estimator.
"""
@@ -612,6 +630,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
)
raise ValueError(err_msg)
_extra_params[k] = v
_check_distributed_params(kwargs)
_existing_extra_params = self.getOrDefault(self.arbitrary_params_dict)
self._set(arbitrary_params_dict={**_existing_extra_params, **_extra_params})
@@ -708,9 +728,6 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
# TODO: support "num_parallel_tree" for random forest
params["num_boost_round"] = self.getOrDefault("n_estimators")
if self.getOrDefault(self.use_gpu):
params["tree_method"] = "gpu_hist"
return params
@classmethod
@@ -882,8 +899,9 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
dmatrix_kwargs,
) = self._get_xgb_parameters(dataset)
use_gpu = self.getOrDefault(self.use_gpu)
run_on_gpu = use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(
self.use_gpu
)
is_local = _is_local(_get_spark_session().sparkContext)
num_workers = self.getOrDefault(self.num_workers)
@@ -899,34 +917,30 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
context = BarrierTaskContext.get()
gpu_id = None
use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist")
dev_ordinal = None
use_qdm = _can_use_qdm(booster_params.get("tree_method", None))
if use_gpu:
gpu_id = context.partitionId() if is_local else _get_gpu_id(context)
booster_params["gpu_id"] = gpu_id
if run_on_gpu:
dev_ordinal = (
context.partitionId() if is_local else _get_gpu_id(context)
)
booster_params["device"] = "cuda:" + str(dev_ordinal)
# If cuDF is not installed, use DMatrix instead of QDM, because without
# cuDF, DMatrix performs better than QDM.
# Note: `is_cudf_available` is checked on the Spark worker side because the
# worker might have a different Python environment from the driver side.
use_qdm = use_hist and is_cudf_available()
else:
use_qdm = use_hist
use_qdm = use_qdm and is_cudf_available()
get_logger("XGBoost-PySpark").info(
"Leveraging %s to train with QDM: %s",
booster_params["device"],
"on" if use_qdm else "off",
)
if use_qdm and (booster_params.get("max_bin", None) is not None):
dmatrix_kwargs["max_bin"] = booster_params["max_bin"]
_rabit_args = {}
if context.partitionId() == 0:
get_logger("XGBoostPySpark").debug(
"booster params: %s\n"
"train_call_kwargs_params: %s\n"
"dmatrix_kwargs: %s",
booster_params,
train_call_kwargs_params,
dmatrix_kwargs,
)
_rabit_args = _get_rabit_args(context, num_workers)
worker_message = {
@@ -945,7 +959,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
dtrain, dvalid = create_dmatrix_from_partitions(
pandas_df_iter,
feature_prop.features_cols_names,
gpu_id,
dev_ordinal,
use_qdm,
dmatrix_kwargs,
enable_sparse_data_optim=feature_prop.enable_sparse_data_optim,
@@ -983,7 +997,19 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
)
return ret[0], ret[1]
get_logger("XGBoost-PySpark").info(
"Running xgboost-%s on %s workers with"
"\n\tbooster params: %s"
"\n\ttrain_call_kwargs_params: %s"
"\n\tdmatrix_kwargs: %s",
xgboost._py_version(),
num_workers,
booster_params,
train_call_kwargs_params,
dmatrix_kwargs,
)
(config, booster) = _run_job()
get_logger("XGBoost-PySpark").info("Finished xgboost training!")
result_xgb_model = self._convert_to_sklearn_model(
bytearray(booster, "utf-8"), config
@@ -1092,12 +1118,86 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
)
return features_col, feature_col_names
def _get_pred_contrib_col_name(self) -> Optional[str]:
    """Return the configured `pred_contrib_col` name.

    ``None`` is returned when the param is not defined or is set to an empty
    string.
    """
    # Guard clauses: bail out as soon as the param turns out to be unusable.
    if not self.isDefined(self.pred_contrib_col):
        return None
    if self.getOrDefault(self.pred_contrib_col) == "":
        return None
    return self.getOrDefault(self.pred_contrib_col)
def _out_schema(self) -> Tuple[bool, str]:
    """Describe the output of the prediction user-defined function.

    Returns
    -------
    A pair ``(single_pred, schema)``. ``single_pred`` is true when only the
    prediction itself is produced; ``schema`` is the DDL-formatted type string
    of the user-defined function's return value.
    """
    if self._get_pred_contrib_col_name() is None:
        return True, "double"

    # Contributions were requested: the UDF returns a struct with two fields.
    struct_ddl = f"{pred.prediction} double, {pred.pred_contrib} array<double>"
    return False, struct_ddl
def _get_predict_func(self) -> Callable:
    """Build the prediction function that runs on the executor side.

    Everything the returned function needs is copied into local variables here,
    so the closure refers only to those locals and not to ``self``.
    """
    predict_params = self._gen_predict_params_dict()
    want_contribs = self._get_pred_contrib_col_name() is not None

    def _predict(
        model: XGBModel, X: ArrayLike, base_margin: Optional[ArrayLike]
    ) -> Union[pd.DataFrame, pd.Series]:
        prediction = pd.Series(
            model.predict(
                X,
                base_margin=base_margin,
                validate_features=False,
                **predict_params,
            )
        )
        if not want_contribs:
            # Plain prediction: a single series is enough.
            return prediction

        contribs = pred_contribs(model, X, base_margin)
        return pd.DataFrame(
            data={
                pred.prediction: prediction,
                pred.pred_contrib: pd.Series(list(contribs)),
            }
        )

    return _predict
def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame:
    """Attach the prediction output column(s) to `dataset`.

    Parameters
    ----------
    dataset :
        The data frame being transformed.
    pred_col :
        Column produced by the prediction user-defined function.
    """
    output_name = self.getOrDefault(self.predictionCol)
    is_single, _ = self._out_schema()

    if is_single:
        # A lone prediction is attached directly, unless its column name is empty.
        if not output_name:
            return dataset
        return dataset.withColumn(output_name, pred_col)

    # Struct output: park it in a temporary column, pull the requested fields out
    # of it and drop the temporary column afterwards.
    struct_name = "_prediction_struct"
    result = dataset.withColumn(struct_name, pred_col)
    if output_name:
        result = result.withColumn(
            output_name, getattr(col(struct_name), pred.prediction)
        )

    contrib_name = self._get_pred_contrib_col_name()
    if contrib_name is not None:
        result = result.withColumn(
            contrib_name,
            array_to_vector(getattr(col(struct_name), pred.pred_contrib)),
        )
    return result.drop(struct_name)
def _transform(self, dataset: DataFrame) -> DataFrame:
# pylint: disable=too-many-statements, too-many-locals
# Save xgb_sklearn_model and predict_params to be local variable
# to avoid the `self` object to be pickled to remote.
xgb_sklearn_model = self._xgb_sklearn_model
predict_params = self._gen_predict_params_dict()
has_base_margin = False
if (
@@ -1112,18 +1212,9 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
features_col, feature_col_names = self._get_feature_col(dataset)
enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim)
pred_contrib_col_name = None
if (
self.isDefined(self.pred_contrib_col)
and self.getOrDefault(self.pred_contrib_col) != ""
):
pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
predict_func = self._get_predict_func()
single_pred = True
schema = "double"
if pred_contrib_col_name:
single_pred = False
schema = f"{pred.prediction} double, {pred.pred_contrib} array<double>"
_, schema = self._out_schema()
@pandas_udf(schema) # type: ignore
def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
@@ -1143,48 +1234,14 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
else:
base_margin = None
data = {}
preds = model.predict(
X,
base_margin=base_margin,
validate_features=False,
**predict_params,
)
data[pred.prediction] = pd.Series(preds)
if pred_contrib_col_name:
contribs = pred_contribs(model, X, base_margin)
data[pred.pred_contrib] = pd.Series(list(contribs))
yield pd.DataFrame(data=data)
else:
yield data[pred.prediction]
yield predict_func(model, X, base_margin)
if has_base_margin:
pred_col = predict_udf(struct(*features_col, base_margin_col))
else:
pred_col = predict_udf(struct(*features_col))
prediction_col_name = self.getOrDefault(self.predictionCol)
if single_pred:
dataset = dataset.withColumn(prediction_col_name, pred_col)
else:
pred_struct_col = "_prediction_struct"
dataset = dataset.withColumn(pred_struct_col, pred_col)
dataset = dataset.withColumn(
prediction_col_name, getattr(col(pred_struct_col), pred.prediction)
)
if pred_contrib_col_name:
dataset = dataset.withColumn(
pred_contrib_col_name,
array_to_vector(getattr(col(pred_struct_col), pred.pred_contrib)),
)
dataset = dataset.drop(pred_struct_col)
return dataset
return self._post_transform(dataset, pred_col)
class _ClassificationModel( # pylint: disable=abstract-method
@@ -1196,22 +1253,21 @@ class _ClassificationModel( # pylint: disable=abstract-method
.. Note:: This API is experimental.
"""
def _transform(self, dataset: DataFrame) -> DataFrame:
# pylint: disable=too-many-statements, too-many-locals
# Save xgb_sklearn_model and predict_params to be local variable
# to avoid the `self` object to be pickled to remote.
xgb_sklearn_model = self._xgb_sklearn_model
predict_params = self._gen_predict_params_dict()
def _out_schema(self) -> Tuple[bool, str]:
schema = (
f"{pred.raw_prediction} array<double>, {pred.prediction} double,"
f" {pred.probability} array<double>"
)
if self._get_pred_contrib_col_name() is not None:
# We will force setting strict_shape to True when predicting contribs,
# So, it will also output 3-D shape result.
schema = f"{schema}, {pred.pred_contrib} array<array<double>>"
has_base_margin = False
if (
self.isDefined(self.base_margin_col)
and self.getOrDefault(self.base_margin_col) != ""
):
has_base_margin = True
base_margin_col = col(self.getOrDefault(self.base_margin_col)).alias(
alias.margin
)
return False, schema
def _get_predict_func(self) -> Callable:
predict_params = self._gen_predict_params_dict()
pred_contrib_col_name = self._get_pred_contrib_col_name()
def transform_margin(margins: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
if margins.ndim == 1:
@@ -1226,76 +1282,38 @@ class _ClassificationModel( # pylint: disable=abstract-method
class_probs = softmax(raw_preds, axis=1)
return raw_preds, class_probs
features_col, feature_col_names = self._get_feature_col(dataset)
enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim)
def _predict(
model: XGBModel, X: ArrayLike, base_margin: Optional[np.ndarray]
) -> Union[pd.DataFrame, pd.Series]:
margins = model.predict(
X,
base_margin=base_margin,
output_margin=True,
validate_features=False,
**predict_params,
)
raw_preds, class_probs = transform_margin(margins)
pred_contrib_col_name = None
if (
self.isDefined(self.pred_contrib_col)
and self.getOrDefault(self.pred_contrib_col) != ""
):
pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
# It seems that they use argmax of class probs,
# not of margin to get the prediction (Note: scala implementation)
preds = np.argmax(class_probs, axis=1)
result: Dict[str, pd.Series] = {
pred.raw_prediction: pd.Series(list(raw_preds)),
pred.prediction: pd.Series(preds),
pred.probability: pd.Series(list(class_probs)),
}
schema = (
f"{pred.raw_prediction} array<double>, {pred.prediction} double,"
f" {pred.probability} array<double>"
)
if pred_contrib_col_name:
# We will force setting strict_shape to True when predicting contribs,
# So, it will also output 3-D shape result.
schema = f"{schema}, {pred.pred_contrib} array<array<double>>"
if pred_contrib_col_name is not None:
contribs = pred_contribs(model, X, base_margin, strict_shape=True)
result[pred.pred_contrib] = pd.Series(list(contribs.tolist()))
@pandas_udf(schema) # type: ignore
def predict_udf(
iterator: Iterator[Tuple[pd.Series, ...]]
) -> Iterator[pd.DataFrame]:
assert xgb_sklearn_model is not None
model = xgb_sklearn_model
for data in iterator:
if enable_sparse_data_optim:
X = _read_csr_matrix_from_unwrapped_spark_vec(data)
else:
if feature_col_names is not None:
X = data[feature_col_names] # type: ignore
else:
X = stack_series(data[alias.data])
return pd.DataFrame(data=result)
if has_base_margin:
base_margin = stack_series(data[alias.margin])
else:
base_margin = None
margins = model.predict(
X,
base_margin=base_margin,
output_margin=True,
validate_features=False,
**predict_params,
)
raw_preds, class_probs = transform_margin(margins)
# It seems that they use argmax of class probs,
# not of margin to get the prediction (Note: scala implementation)
preds = np.argmax(class_probs, axis=1)
result: Dict[str, pd.Series] = {
pred.raw_prediction: pd.Series(list(raw_preds)),
pred.prediction: pd.Series(preds),
pred.probability: pd.Series(list(class_probs)),
}
if pred_contrib_col_name:
contribs = pred_contribs(model, X, base_margin, strict_shape=True)
result[pred.pred_contrib] = pd.Series(list(contribs.tolist()))
yield pd.DataFrame(data=result)
if has_base_margin:
pred_struct = predict_udf(struct(*features_col, base_margin_col))
else:
pred_struct = predict_udf(struct(*features_col))
return _predict
def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame:
pred_struct_col = "_prediction_struct"
dataset = dataset.withColumn(pred_struct_col, pred_struct)
dataset = dataset.withColumn(pred_struct_col, pred_col)
raw_prediction_col_name = self.getOrDefault(self.rawPredictionCol)
if raw_prediction_col_name:
@@ -1317,7 +1335,8 @@ class _ClassificationModel( # pylint: disable=abstract-method
array_to_vector(getattr(col(pred_struct_col), pred.probability)),
)
if pred_contrib_col_name:
pred_contrib_col_name = self._get_pred_contrib_col_name()
if pred_contrib_col_name is not None:
dataset = dataset.withColumn(
pred_contrib_col_name,
getattr(col(pred_struct_col), pred.pred_contrib),

View File

@@ -157,7 +157,7 @@ def _read_csr_matrix_from_unwrapped_spark_vec(part: pd.DataFrame) -> csr_matrix:
def make_qdm(
data: Dict[str, List[np.ndarray]],
gpu_id: Optional[int],
dev_ordinal: Optional[int],
meta: Dict[str, Any],
ref: Optional[DMatrix],
params: Dict[str, Any],
@@ -165,7 +165,7 @@ def make_qdm(
"""Handle empty partition for QuantileDMatrix."""
if not data:
return QuantileDMatrix(np.empty((0, 0)), ref=ref)
it = PartIter(data, gpu_id, **meta)
it = PartIter(data, dev_ordinal, **meta)
m = QuantileDMatrix(it, **params, ref=ref)
return m
@@ -173,7 +173,7 @@ def make_qdm(
def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
iterator: Iterator[pd.DataFrame],
feature_cols: Optional[Sequence[str]],
gpu_id: Optional[int],
dev_ordinal: Optional[int],
use_qdm: bool,
kwargs: Dict[str, Any], # use dict to make sure this parameter is passed.
enable_sparse_data_optim: bool,
@@ -187,7 +187,7 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
Pyspark partition iterator.
feature_cols:
A sequence of feature names, used only when rapids plugin is enabled.
gpu_id:
dev_ordinal:
Device ordinal, used when GPU is enabled.
use_qdm :
Whether QuantileDMatrix should be used instead of DMatrix.
@@ -304,13 +304,13 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
if feature_cols is not None and use_qdm:
cache_partitions(iterator, append_fn)
dtrain: DMatrix = make_qdm(train_data, gpu_id, meta, None, params)
dtrain: DMatrix = make_qdm(train_data, dev_ordinal, meta, None, params)
elif feature_cols is not None and not use_qdm:
cache_partitions(iterator, append_fn)
dtrain = make(train_data, kwargs)
elif feature_cols is None and use_qdm:
cache_partitions(iterator, append_fn)
dtrain = make_qdm(train_data, gpu_id, meta, None, params)
dtrain = make_qdm(train_data, dev_ordinal, meta, None, params)
else:
cache_partitions(iterator, append_fn)
dtrain = make(train_data, kwargs)
@@ -324,7 +324,7 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
if has_validation_col:
if use_qdm:
dvalid: Optional[DMatrix] = make_qdm(
valid_data, gpu_id, meta, dtrain, params
valid_data, dev_ordinal, meta, dtrain, params
)
else:
dvalid = make(valid_data, kwargs) if has_validation_col else None

View File

@@ -3,8 +3,8 @@
# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
# pylint: disable=unused-argument, too-many-locals
from typing import Any, Dict, List, Optional, Type, Union
import warnings
from typing import Any, List, Optional, Type, Union
import numpy as np
from pyspark import keyword_only
@@ -77,28 +77,35 @@ def _set_pyspark_xgb_cls_param_attrs(
set_param_attrs(name, param_obj)
def _deprecated_use_gpu() -> None:
warnings.warn(
"`use_gpu` is deprecated since 2.0.0, use `device` instead", FutureWarning
)
class SparkXGBRegressor(_SparkXGBEstimator):
"""
SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression
"""SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression
algorithm based on XGBoost python library, and it can be used in PySpark Pipeline
and PySpark ML meta algorithms like :py:class:`~pyspark.ml.tuning.CrossValidator`/
:py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
:py:class:`~pyspark.ml.classification.OneVsRest`
and PySpark ML meta algorithms like
- :py:class:`~pyspark.ml.tuning.CrossValidator`/
- :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
- :py:class:`~pyspark.ml.classification.OneVsRest`
SparkXGBRegressor automatically supports most of the parameters in
:py:class:`xgboost.XGBRegressor` constructor and most of the parameters used in
:py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` method.
:py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict`
method.
SparkXGBRegressor doesn't support setting `gpu_id` but support another param `use_gpu`,
see doc below for more details.
To enable GPU support, set `device` to `cuda` or `gpu`.
SparkXGBRegressor doesn't support setting `base_margin` explicitly as well, but support
another param called `base_margin_col`. see doc below for more details.
SparkXGBRegressor doesn't support setting `base_margin` explicitly as well, but
support another param called `base_margin_col`. see doc below for more details.
SparkXGBRegressor doesn't support `validate_features` and `output_margin` param.
SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the `nthread`
param for each xgboost worker will be set equal to `spark.task.cpus` config value.
SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the
`nthread` param for each xgboost worker will be set equal to `spark.task.cpus`
config value.
Parameters
@@ -134,8 +141,16 @@ class SparkXGBRegressor(_SparkXGBEstimator):
How many XGBoost workers to be used to train.
Each XGBoost worker corresponds to one spark task.
use_gpu:
Boolean value to specify whether the executors are running on GPU
instances.
.. deprecated:: 2.0.0
Use `device` instead.
device:
.. versionadded:: 2.0.0
Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
force_repartition:
Boolean value to specify if forcing the input dataset to be repartitioned
before XGBoost training.
@@ -194,14 +209,17 @@ class SparkXGBRegressor(_SparkXGBEstimator):
weight_col: Optional[str] = None,
base_margin_col: Optional[str] = None,
num_workers: int = 1,
use_gpu: bool = False,
use_gpu: Optional[bool] = None,
device: Optional[str] = None,
force_repartition: bool = False,
repartition_random_shuffle: bool = False,
enable_sparse_data_optim: bool = False,
**kwargs: Dict[str, Any],
**kwargs: Any,
) -> None:
super().__init__()
input_kwargs = self._input_kwargs
if use_gpu:
_deprecated_use_gpu()
self.setParams(**input_kwargs)
@classmethod
@@ -239,27 +257,29 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
"""SparkXGBClassifier is a PySpark ML estimator. It implements the XGBoost
classification algorithm based on XGBoost python library, and it can be used in
PySpark Pipeline and PySpark ML meta algorithms like
:py:class:`~pyspark.ml.tuning.CrossValidator`/
:py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
:py:class:`~pyspark.ml.classification.OneVsRest`
- :py:class:`~pyspark.ml.tuning.CrossValidator`/
- :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
- :py:class:`~pyspark.ml.classification.OneVsRest`
SparkXGBClassifier automatically supports most of the parameters in
:py:class:`xgboost.XGBClassifier` constructor and most of the parameters used in
:py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` method.
:py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict`
method.
SparkXGBClassifier doesn't support setting `gpu_id` but support another param `use_gpu`,
see doc below for more details.
To enable GPU support, set `device` to `cuda` or `gpu`.
SparkXGBClassifier doesn't support setting `base_margin` explicitly as well, but support
another param called `base_margin_col`. see doc below for more details.
SparkXGBClassifier doesn't support setting `base_margin` explicitly as well, but
support another param called `base_margin_col`. see doc below for more details.
SparkXGBClassifier doesn't support setting `output_margin`, but we can get output margin
from the raw prediction column. See `raw_prediction_col` param doc below for more details.
SparkXGBClassifier doesn't support setting `output_margin`, but we can get output
margin from the raw prediction column. See `raw_prediction_col` param doc below for
more details.
SparkXGBClassifier doesn't support `validate_features` and `output_margin` param.
SparkXGBClassifier doesn't support setting `nthread` xgboost param, instead, the `nthread`
param for each xgboost worker will be set equal to `spark.task.cpus` config value.
SparkXGBClassifier doesn't support setting `nthread` xgboost param, instead, the
`nthread` param for each xgboost worker will be set equal to `spark.task.cpus`
config value.
Parameters
@@ -301,8 +321,16 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
How many XGBoost workers to be used to train.
Each XGBoost worker corresponds to one spark task.
use_gpu:
Boolean value to specify whether the executors are running on GPU
instances.
.. deprecated:: 2.0.0
Use `device` instead.
device:
.. versionadded:: 2.0.0
Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
force_repartition:
Boolean value to specify if forcing the input dataset to be repartitioned
before XGBoost training.
@@ -361,11 +389,12 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
weight_col: Optional[str] = None,
base_margin_col: Optional[str] = None,
num_workers: int = 1,
use_gpu: bool = False,
use_gpu: Optional[bool] = None,
device: Optional[str] = None,
force_repartition: bool = False,
repartition_random_shuffle: bool = False,
enable_sparse_data_optim: bool = False,
**kwargs: Dict[str, Any],
**kwargs: Any,
) -> None:
super().__init__()
# The default 'objective' param value comes from sklearn `XGBClassifier` ctor,
@@ -373,6 +402,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
# binary or multinomial input dataset, and we need to remove the fixed default
# param value as well to avoid causing ambiguity.
input_kwargs = self._input_kwargs
if use_gpu:
_deprecated_use_gpu()
self.setParams(**input_kwargs)
self._setDefault(objective=None)
@@ -423,19 +454,20 @@ class SparkXGBRanker(_SparkXGBEstimator):
:py:class:`xgboost.XGBRanker` constructor and most of the parameters used in
:py:meth:`xgboost.XGBRanker.fit` and :py:meth:`xgboost.XGBRanker.predict` method.
SparkXGBRanker doesn't support setting `gpu_id` but support another param `use_gpu`,
see doc below for more details.
To enable GPU support, set `device` to `cuda` or `gpu`.
SparkXGBRanker doesn't support setting `base_margin` explicitly as well, but support
another param called `base_margin_col`. see doc below for more details.
SparkXGBRanker doesn't support setting `output_margin`, but we can get output margin
from the raw prediction column. See `raw_prediction_col` param doc below for more details.
from the raw prediction column. See `raw_prediction_col` param doc below for more
details.
SparkXGBRanker doesn't support `validate_features` and `output_margin` param.
SparkXGBRanker doesn't support setting `nthread` xgboost param, instead, the `nthread`
param for each xgboost worker will be set equal to `spark.task.cpus` config value.
SparkXGBRanker doesn't support setting `nthread` xgboost param, instead, the
`nthread` param for each xgboost worker will be set equal to `spark.task.cpus`
config value.
Parameters
@@ -468,13 +500,20 @@ class SparkXGBRanker(_SparkXGBEstimator):
:py:class:`xgboost.XGBRanker` fit method.
qid_col:
Query id column name.
num_workers:
How many XGBoost workers to be used to train.
Each XGBoost worker corresponds to one spark task.
use_gpu:
Boolean value to specify whether the executors are running on GPU
instances.
.. deprecated:: 2.0.0
Use `device` instead.
device:
.. versionadded:: 2.0.0
Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
force_repartition:
Boolean value to specify if forcing the input dataset to be repartitioned
before XGBoost training.
@@ -539,14 +578,17 @@ class SparkXGBRanker(_SparkXGBEstimator):
base_margin_col: Optional[str] = None,
qid_col: Optional[str] = None,
num_workers: int = 1,
use_gpu: bool = False,
use_gpu: Optional[bool] = None,
device: Optional[str] = None,
force_repartition: bool = False,
repartition_random_shuffle: bool = False,
enable_sparse_data_optim: bool = False,
**kwargs: Dict[str, Any],
**kwargs: Any,
) -> None:
super().__init__()
input_kwargs = self._input_kwargs
if use_gpu:
_deprecated_use_gpu()
self.setParams(**input_kwargs)
@classmethod

View File

@@ -7,7 +7,7 @@ import os
import sys
import uuid
from threading import Thread
from typing import Any, Callable, Dict, Set, Type
from typing import Any, Callable, Dict, Optional, Set, Type
import pyspark
from pyspark import BarrierTaskContext, SparkContext, SparkFiles
@@ -104,6 +104,10 @@ def get_logger(name: str, level: str = "INFO") -> logging.Logger:
# If the logger is configured, skip the configure
if not logger.handlers and not logging.getLogger().handlers:
handler = logging.StreamHandler(sys.stderr)
formatter = logging.Formatter(
"%(asctime)s %(levelname)s %(name)s: %(funcName)s %(message)s"
)
handler.setFormatter(formatter)
logger.addHandler(handler)
return logger
@@ -186,3 +190,8 @@ def deserialize_booster(model: str) -> Booster:
f.write(model)
booster.load_model(tmp_file_name)
return booster
def use_cuda(device: Optional[str]) -> bool:
    """Whether xgboost is using CUDA workers.

    Only the exact names ``"cuda"`` and ``"gpu"`` count; anything else,
    including ``None``, does not.
    """
    return device == "cuda" or device == "gpu"

View File

@@ -25,6 +25,7 @@ from typing import (
Set,
Tuple,
TypedDict,
TypeVar,
Union,
)
@@ -93,6 +94,10 @@ def no_ipv6() -> PytestSkip:
return {"condition": not has_ipv6(), "reason": "IPv6 is required to be enabled."}
def not_linux() -> PytestSkip:
    """Skip condition that holds when the current platform is not Linux."""
    is_other_platform = system() != "Linux"
    return {"condition": is_other_platform, "reason": "Linux is required."}
def no_ubjson() -> PytestSkip:
return no_mod("ubjson")
@@ -198,20 +203,20 @@ class IteratorForTest(xgb.core.DataIter):
X: Sequence,
y: Sequence,
w: Optional[Sequence],
cache: Optional[str] = "./",
cache: Optional[str],
) -> None:
assert len(X) == len(y)
self.X = X
self.y = y
self.w = w
self.it = 0
super().__init__(cache)
super().__init__(cache_prefix=cache)
def next(self, input_data: Callable) -> int:
if self.it == len(self.X):
return 0
with pytest.raises(TypeError, match="keyword args"):
with pytest.raises(TypeError, match="Keyword argument"):
input_data(self.X[self.it], self.y[self.it], None)
# Use copy to make sure the iterator doesn't hold a reference to the data.
@@ -229,7 +234,7 @@ class IteratorForTest(xgb.core.DataIter):
def as_arrays(
self,
) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, ArrayLike]:
) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, Optional[ArrayLike]]:
if isinstance(self.X[0], sparse.csr_matrix):
X = sparse.vstack(self.X, format="csr")
else:
@@ -243,7 +248,12 @@ class IteratorForTest(xgb.core.DataIter):
def make_batches(
n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
n_samples_per_batch: int,
n_features: int,
n_batches: int,
use_cupy: bool = False,
*,
vary_size: bool = False,
) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
X = []
y = []
@@ -254,16 +264,25 @@ def make_batches(
rng = cupy.random.RandomState(1994)
else:
rng = np.random.RandomState(1994)
for _ in range(n_batches):
_X = rng.randn(n_samples_per_batch, n_features)
_y = rng.randn(n_samples_per_batch)
_w = rng.uniform(low=0, high=1, size=n_samples_per_batch)
for i in range(n_batches):
n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch
_X = rng.randn(n_samples, n_features)
_y = rng.randn(n_samples)
_w = rng.uniform(low=0, high=1, size=n_samples)
X.append(_X)
y.append(_y)
w.append(_w)
return X, y, w
def make_regression(
    n_samples: int, n_features: int, use_cupy: bool
) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
    """Make a simple regression dataset.

    The data is a single batch from `make_batches`, returned as the tuple
    ``(X, y, w)``.
    """
    features, labels, weights = make_batches(
        n_samples, n_features, n_batches=1, use_cupy=use_cupy
    )
    # Exactly one batch was requested, so unwrap the one-element lists.
    return features[0], labels[0], weights[0]
def make_batches_sparse(
n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]:
@@ -347,7 +366,9 @@ class TestDataset:
if w is not None:
weight.append(w)
it = IteratorForTest(predictor, response, weight if weight else None)
it = IteratorForTest(
predictor, response, weight if weight else None, cache="cache"
)
return xgb.DMatrix(it)
def __repr__(self) -> str:
@@ -709,6 +730,9 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
)
# Type variable constrained to either `xgb.Booster` or `xgb.XGBModel`.
M = TypeVar("M", xgb.Booster, xgb.XGBModel)
def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
"""Evaluation metric for xgb.train"""
label = dtrain.get_label()
@@ -743,13 +767,31 @@ def softmax(x: np.ndarray) -> np.ndarray:
return e / np.sum(e)
def softprob_obj(classes: int) -> SklObjective:
def softprob_obj(
classes: int, use_cupy: bool = False, order: str = "C", gdtype: str = "float32"
) -> SklObjective:
"""Custom softprob objective for testing.
Parameters
----------
use_cupy :
Whether the objective should return cupy arrays.
order :
The order of gradient matrices. "C" or "F".
gdtype :
DType for gradient. Hessian is not set. This is for testing asymmetric types.
"""
if use_cupy:
import cupy as backend
else:
backend = np
def objective(
labels: np.ndarray, predt: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
labels: backend.ndarray, predt: backend.ndarray
) -> Tuple[backend.ndarray, backend.ndarray]:
rows = labels.shape[0]
grad = np.zeros((rows, classes), dtype=float)
hess = np.zeros((rows, classes), dtype=float)
grad = backend.zeros((rows, classes), dtype=np.float32)
hess = backend.zeros((rows, classes), dtype=np.float32)
eps = 1e-6
for r in range(predt.shape[0]):
target = labels[r]
@@ -761,8 +803,10 @@ def softprob_obj(classes: int) -> SklObjective:
grad[r, c] = g
hess[r, c] = h
grad = grad.reshape((rows * classes, 1))
hess = hess.reshape((rows * classes, 1))
grad = grad.reshape((rows, classes))
hess = hess.reshape((rows, classes))
grad = backend.require(grad, requirements=order, dtype=gdtype)
hess = backend.require(hess, requirements=order)
return grad, hess
return objective

View File

@@ -0,0 +1,34 @@
"""Tests related to the `DataIter` interface."""
import numpy as np
import xgboost
from xgboost import testing as tm
def run_mixed_sparsity(device: str) -> None:
    """Check QDM with mixed batches.

    A `QuantileDMatrix` built batch by batch through `IteratorForTest` is
    compared against one built from the same data concatenated into single
    arrays.

    Parameters
    ----------
    device :
        Device name; anything starting with ``"cuda"`` converts the batches to
        cupy arrays before they are fed to the iterator.
    """
    X_0, y_0, _ = tm.make_regression(128, 16, False)
    on_cuda = device.startswith("cuda")
    # NOTE(review): the last flag presumably asks for dense output -- confirm
    # against `make_sparse_regression`.
    X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, on_cuda)
    X_2, y_2 = tm.make_sparse_regression(512, 16, 0.9, True)
    labels = [y_0, y_1, y_2]

    batches = [X_0, X_1, X_2]
    if on_cuda:
        import cupy as cp  # pylint: disable=import-error

        batches = [cp.array(batch) for batch in batches]

    Xy_0 = xgboost.QuantileDMatrix(tm.IteratorForTest(batches, labels, None, None))

    # Reference matrix: regenerate the middle batch, then concatenate everything.
    X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, True)
    X_arr = np.concatenate([X_0, X_1, X_2], axis=0)
    y_arr = np.concatenate([y_0, y_1, y_2], axis=0)
    Xy_1 = xgboost.QuantileDMatrix(X_arr, y_arr)

    assert tm.predictor_equal(Xy_0, Xy_1)

View File

@@ -41,6 +41,10 @@ hist_parameter_strategy = strategies.fixed_dictionaries(
and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide")
)
# Strategy yielding a dict with a single `max_cached_hist_node` entry, sampled
# from a small set of values ranging from 1 up to 2**31.
hist_cache_strategy = strategies.fixed_dictionaries(
{"max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])}
)
hist_multi_parameter_strategy = strategies.fixed_dictionaries(
{
"max_depth": strategies.integers(1, 11),

View File

@@ -1,7 +1,7 @@
"""Tests for updaters."""
import json
from functools import partial, update_wrapper
from typing import Dict
from typing import Any, Dict, List
import numpy as np
@@ -159,3 +159,238 @@ def check_quantile_loss(tree_method: str, weighted: bool) -> None:
for i in range(alpha.shape[0]):
np.testing.assert_allclose(predts[:, i], predt_multi[:, i])
def check_cut(
    n_entries: int, indptr: np.ndarray, data: np.ndarray, dtypes: Any
) -> None:
    """Check the cut values.

    Parameters
    ----------
    n_entries :
        Expected total number of cut values.
    indptr :
        ``uint64`` offsets; feature ``k`` owns ``data[indptr[k]:indptr[k + 1]]``.
    data :
        Cut values of all features, stored back to back.
    dtypes :
        Per-feature dtypes, indexed by feature position.

    Raises
    ------
    AssertionError
        If the sizes disagree, `indptr` is not ``uint64``, the values of a feature
        are not strictly increasing, or the values of a categorical feature do not
        advance by exactly one.
    """
    from pandas.api.types import is_categorical_dtype

    assert data.shape[0] == indptr[-1]
    assert data.shape[0] == n_entries

    assert indptr.dtype == np.uint64
    for fidx in range(indptr.size - 1):
        beg, end = int(indptr[fidx]), int(indptr[fidx + 1])
        # Compare each value with its predecessor inside the same feature only.
        for pos in range(beg + 1, end):
            previous, current = data[pos - 1], data[pos]
            assert current > previous
            if is_categorical_dtype(dtypes[fidx]):
                assert current == previous + 1
def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
    """Check `get_quantile_cut` on numerical, categorical and mixed inputs.

    For every kind of input the cuts are read from a `QuantileDMatrix` directly
    and from a `DMatrix` after training on it, then validated with `check_cut`.

    Parameters
    ----------
    tree_method :
        Tree method passed to `xgb.train` before the cuts of a `DMatrix` are read.
    use_cupy :
        Whether to generate cupy inputs and convert the categorical frame to cudf.
    """
    from pandas.api.types import is_categorical_dtype

    n_samples = 1024
    n_features = 14
    max_bin = 16
    dtypes = [np.float32] * n_features
    train_params = {"tree_method": tree_method, "max_bin": max_bin}

    # numerical: every feature is expected to hold `max_bin + 1` entries.
    n_numerical_entries = (max_bin + 1) * n_features
    X, y, w = tm.make_regression(n_samples, n_features, use_cupy=use_cupy)
    # - qdm
    Xyw: xgb.DMatrix = xgb.QuantileDMatrix(X, y, weight=w, max_bin=max_bin)
    indptr, data = Xyw.get_quantile_cut()
    check_cut(n_numerical_entries, indptr, data, dtypes)
    # - dm
    Xyw = xgb.DMatrix(X, y, weight=w)
    xgb.train(train_params, Xyw)
    indptr, data = Xyw.get_quantile_cut()
    check_cut(n_numerical_entries, indptr, data, dtypes)
    # - ext mem
    n_batches = 3
    n_samples_per_batch = 256
    it = tm.IteratorForTest(
        *tm.make_batches(n_samples_per_batch, n_features, n_batches, use_cupy),
        cache="cache",
    )
    Xy: xgb.DMatrix = xgb.DMatrix(it)
    # Train on and query the iterator-backed matrix itself. Using `Xyw` here would
    # merely repeat the in-memory check above and leave this path untested.
    xgb.train(train_params, Xy)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_numerical_entries, indptr, data, dtypes)

    # categorical: every feature is expected to hold `n_categories` entries.
    n_categories = 32
    X, y = tm.make_categorical(n_samples, n_features, n_categories, False, sparsity=0.8)
    if use_cupy:
        import cudf  # pylint: disable=import-error
        import cupy as cp  # pylint: disable=import-error

        X = cudf.from_pandas(X)
        y = cp.array(y)
    # - qdm
    Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_categories * n_features, indptr, data, X.dtypes)
    # - dm
    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    xgb.train(train_params, Xy)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_categories * n_features, indptr, data, X.dtypes)

    # mixed: categorical and numerical features side by side.
    X, y = tm.make_categorical(
        n_samples, n_features, n_categories, False, sparsity=0.8, cat_ratio=0.5
    )
    n_cat_features = len([0 for dtype in X.dtypes if is_categorical_dtype(dtype)])
    n_num_features = n_features - n_cat_features
    n_entries = n_categories * n_cat_features + (max_bin + 1) * n_num_features
    # - qdm
    Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_entries, indptr, data, X.dtypes)
    # - dm
    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    xgb.train(train_params, Xy)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_entries, indptr, data, X.dtypes)
def check_get_quantile_cut(tree_method: str) -> None:
    """Run the quantile cut getter checks for ``tree_method``.

    The checks always run without cupy inputs; for ``gpu_hist`` they are repeated
    with cupy inputs.
    """
    check_get_quantile_cut_device(tree_method, False)
    if tree_method != "gpu_hist":
        return
    check_get_quantile_cut_device(tree_method, True)
# Values for the ``max_cat_to_onehot`` training parameter used by the categorical
# checks in this module.
# Largest int32 value; passed by the checks to use one-hot splits exclusively.
USE_ONEHOT = np.iinfo(np.int32).max
# Passed by the checks to switch to partition-based splits.
USE_PART = 1
def check_categorical_ohe(  # pylint: disable=too-many-arguments
    rows: int, cols: int, rounds: int, cats: int, device: str, tree_method: str
) -> None:
    """Compare training on one-hot encoded data with builtin categorical support.

    Three things are checked:

    - One-hot encoding done up front and builtin one-hot splits produce close
      training RMSE.
    - Without regularization, partition-based splits are never worse than one-hot
      splits.
    - With ``reg_lambda=1.0``, the training RMSE of partition-based splits is
      non-increasing.
    """
    onehot, label = tm.make_categorical(rows, cols, cats, True)
    cat, _ = tm.make_categorical(rows, cols, cats, False)

    parameters: Dict[str, Any] = {
        "tree_method": tree_method,
        # Use one-hot exclusively
        "max_cat_to_onehot": USE_ONEHOT,
        "device": device,
    }

    def fit(dmat: xgb.DMatrix, n_rounds: int) -> Dict[str, Dict[str, List[float]]]:
        """Train with the current ``parameters`` and return the eval history."""
        history: Dict[str, Dict[str, List[float]]] = {}
        xgb.train(
            parameters,
            dmat,
            num_boost_round=n_rounds,
            evals=[(dmat, "Train")],
            evals_result=history,
        )
        return history

    m = xgb.DMatrix(onehot, label, enable_categorical=False)
    by_etl_results = fit(m, rounds)

    m = xgb.DMatrix(cat, label, enable_categorical=True)
    by_builtin_results = fit(m, rounds)

    # There are guidelines on how to specify tolerance based on considering output
    # as random variables. But here the tree construction is extremely sensitive
    # to floating point errors. A 1e-5 error in a histogram bin can lead to an
    # entirely different tree. So even though the test is quite lenient, hypothesis
    # can still pick up falsifying examples from time to time.
    rmse_oh = by_builtin_results["Train"]["rmse"]
    np.testing.assert_allclose(
        np.array(by_etl_results["Train"]["rmse"]),
        np.array(rmse_oh),
        rtol=1e-3,
    )
    assert tm.non_increasing(rmse_oh)

    # switch to partition-based splits
    parameters["max_cat_to_onehot"] = USE_PART
    parameters["reg_lambda"] = 0
    m = xgb.DMatrix(cat, label, enable_categorical=True)
    by_grouping = fit(m, rounds)
    rmse_group = by_grouping["Train"]["rmse"]
    # always better or equal to onehot when there's no regularization.
    assert all(oh >= grp for oh, grp in zip(rmse_oh, rmse_group))

    # Same matrix again, now with regularization and a fixed number of rounds.
    parameters["reg_lambda"] = 1.0
    by_grouping = fit(m, 32)
    assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping
def check_categorical_missing(
    rows: int, cols: int, cats: int, device: str, tree_method: str
) -> None:
    """Train on categorical data generated with ``sparsity=0.5``.

    For both one-hot and partition-based splits, the training RMSE must be
    non-increasing and the RMSE recomputed from the predictions must match the last
    value reported during training.
    """
    parameters: Dict[str, Any] = {"tree_method": tree_method, "device": device}
    cat, label = tm.make_categorical(
        rows, n_features=cols, n_categories=cats, onehot=False, sparsity=0.5
    )
    Xy = xgb.DMatrix(cat, label, enable_categorical=True)

    def run(max_cat_to_onehot: int) -> None:
        # The threshold selects between one-hot and partition-based splits.
        parameters["max_cat_to_onehot"] = max_cat_to_onehot

        history: Dict[str, Dict] = {}
        booster = xgb.train(
            parameters,
            Xy,
            num_boost_round=16,
            evals=[(Xy, "Train")],
            evals_result=history,
        )
        assert tm.non_increasing(history["Train"]["rmse"])

        # The reported metric must agree with one computed from the predictions.
        y_predt = booster.predict(Xy)
        rmse = tm.root_mean_square(label, y_predt)
        last_reported = history["Train"]["rmse"][-1]
        np.testing.assert_allclose(rmse, last_reported, rtol=2e-5)

    # OHE split first, then partition-based split.
    for threshold in (USE_ONEHOT, USE_PART):
        run(threshold)
def train_result(
    param: Dict[str, Any], dmat: xgb.DMatrix, num_rounds: int
) -> Dict[str, Any]:
    """Train a booster and return its evaluation history.

    The training data is also used as the only evaluation set, named ``"train"``.
    After training, the booster is checked against ``dmat`` (number of features,
    feature names and feature types) and against ``num_rounds``.

    Returns
    -------
    The dictionary filled in through ``evals_result``.
    """
    history: Dict[str, Any] = {}
    watchlist = [(dmat, "train")]
    booster = xgb.train(
        param,
        dmat,
        num_boost_round=num_rounds,
        evals=watchlist,
        verbose_eval=False,
        evals_result=history,
    )

    # Sanity checks on the trained model.
    assert booster.num_features() == dmat.num_col()
    assert booster.num_boosted_rounds() == num_rounds
    assert booster.feature_names == dmat.feature_names
    assert booster.feature_types == dmat.feature_types

    return history

View File

@@ -137,15 +137,9 @@ class WorkerEntry:
return self._get_remote(wait_conn, nnset)
def _get_remote(
self, wait_conn: Dict[int, "WorkerEntry"], nnset: Set[int]
self, wait_conn: Dict[int, "WorkerEntry"], badset: Set[int]
) -> List[int]:
while True:
ngood = self.sock.recvint()
goodset = set()
for _ in range(ngood):
goodset.add(self.sock.recvint())
assert goodset.issubset(nnset)
badset = nnset - goodset
conset = []
for r in badset:
if r in wait_conn:
@@ -343,7 +337,7 @@ class RabitTracker:
shutdown[s.rank] = s
logging.debug("Received %s signal from %d", s.cmd, s.rank)
continue
assert s.cmd in ("start", "recover")
assert s.cmd == "start"
# lazily initialize the workers
if tree_map is None:
assert s.cmd == "start"

View File

@@ -28,17 +28,6 @@ from .core import (
_CVFolds = Sequence["CVPack"]
def _assert_new_callback(callbacks: Optional[Sequence[TrainingCallback]]) -> None:
    """Ensure every callback is a ``TrainingCallback`` instance.

    ``None`` and empty sequences are accepted.

    Raises
    ------
    ValueError
        If any element of ``callbacks`` is not a ``TrainingCallback``.
    """
    if not callbacks:
        return

    for cb in callbacks:
        if isinstance(cb, TrainingCallback):
            continue
        link = "https://xgboost.readthedocs.io/en/latest/python/callbacks.html"
        raise ValueError(
            f"Old style callback was removed in version 1.6. See: {link}."
        )
def _configure_custom_metric(
feval: Optional[Metric], custom_metric: Optional[Metric]
) -> Optional[Metric]:
@@ -170,7 +159,6 @@ def train(
bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model)
start_iteration = 0
_assert_new_callback(callbacks)
if verbose_eval:
verbose_eval = 1 if verbose_eval is True else verbose_eval
callbacks.append(EvaluationMonitor(period=verbose_eval))
@@ -190,7 +178,7 @@ def train(
for i in range(start_iteration, num_boost_round):
if cb_container.before_iteration(bst, i, dtrain, evals):
break
bst.update(dtrain, i, obj)
bst.update(dtrain, iteration=i, fobj=obj)
if cb_container.after_iteration(bst, i, dtrain, evals):
break
@@ -247,7 +235,7 @@ class _PackedBooster:
result = [f.eval(iteration, feval, output_margin) for f in self.cvfolds]
return result
def set_attr(self, **kwargs: Optional[str]) -> Any:
def set_attr(self, **kwargs: Optional[Any]) -> Any:
"""Iterate through folds for setting attributes"""
for f in self.cvfolds:
f.bst.set_attr(**kwargs)
@@ -274,11 +262,20 @@ class _PackedBooster:
"""Get best_iteration"""
return int(cast(int, self.cvfolds[0].bst.attr("best_iteration")))
@best_iteration.setter
def best_iteration(self, iteration: int) -> None:
"""Set best_iteration by forwarding it to ``set_attr``."""
self.set_attr(best_iteration=iteration)
@property
def best_score(self) -> float:
    """Return the ``best_score`` attribute of the first fold's booster as a float."""
    first_fold = self.cvfolds[0]
    raw_score = first_fold.bst.attr("best_score")
    return float(cast(float, raw_score))
@best_score.setter
def best_score(self, score: float) -> None:
"""Set best_score by forwarding it to ``set_attr``."""
self.set_attr(best_score=score)
def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray:
"""
@@ -551,7 +548,6 @@ def cv(
# setup callbacks
callbacks = [] if callbacks is None else copy.copy(list(callbacks))
_assert_new_callback(callbacks)
if verbose_eval:
verbose_eval = 1 if verbose_eval is True else verbose_eval