merge latest changes

This commit is contained in:
Hui Liu
2024-03-12 09:13:09 -07:00
174 changed files with 5276 additions and 2304 deletions

View File

@@ -32,7 +32,10 @@ def build_libxgboost(
build_dir: pathlib.Path,
build_config: BuildConfiguration,
) -> pathlib.Path:
"""Build libxgboost in a temporary directory and obtain the path to built libxgboost"""
"""Build libxgboost in a temporary directory and obtain the path to built
libxgboost.
"""
logger = logging.getLogger("xgboost.packager.build_libxgboost")
if not cpp_src_dir.is_dir():
@@ -51,8 +54,8 @@ def build_libxgboost(
cmake_cmd.extend(build_config.get_cmake_args())
# Flag for cross-compiling for Apple Silicon
# We use environment variable because it's the only way to pass down custom flags
# through the cibuildwheel package, which calls `pip wheel` command.
# We use environment variable because it's the only way to pass down custom
# flags through the cibuildwheel package, which calls `pip wheel` command.
if "CIBW_TARGET_OSX_ARM64" in os.environ:
cmake_cmd.append("-DCMAKE_OSX_ARCHITECTURES=arm64")

View File

@@ -804,10 +804,11 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
Otherwise, one can pass a list-like input with the same length as number
of columns in `data`, with the following possible values:
- "c", which represents categorical columns.
- "q", which represents numeric columns.
- "int", which represents integer columns.
- "i", which represents boolean columns.
- "c", which represents categorical columns.
- "q", which represents numeric columns.
- "int", which represents integer columns.
- "i", which represents boolean columns.
Note that, while categorical types are treated differently from
the rest for model fitting purposes, the other types do not influence
@@ -861,9 +862,9 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
self.nthread = nthread if nthread is not None else -1
self.silent = silent
# force into void_p, mac need to pass things in as void_p
if data is None:
self.handle: Optional[ctypes.c_void_p] = None
if isinstance(data, ctypes.c_void_p):
# Used for constructing DMatrix slice.
self.handle = data
return
from .data import _is_iter, dispatch_data_backend
@@ -925,9 +926,10 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
self.handle = handle
def __del__(self) -> None:
if hasattr(self, "handle") and self.handle:
if hasattr(self, "handle"):
assert self.handle is not None
_check_call(_LIB.XGDMatrixFree(self.handle))
self.handle = None
del self.handle
@_deprecate_positional_args
def set_info(
@@ -1281,19 +1283,19 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
"""
from .data import _maybe_np_slice
res = DMatrix(None)
res.handle = ctypes.c_void_p()
handle = ctypes.c_void_p()
rindex = _maybe_np_slice(rindex, dtype=np.int32)
_check_call(
_LIB.XGDMatrixSliceDMatrixEx(
self.handle,
c_array(ctypes.c_int, rindex),
c_bst_ulong(len(rindex)),
ctypes.byref(res.handle),
ctypes.byref(handle),
ctypes.c_int(1 if allow_groups else 0),
)
)
return res
return DMatrix(handle)
@property
def feature_names(self) -> Optional[FeatureNames]:

View File

@@ -1053,10 +1053,10 @@ def _is_dlpack(data: DataType) -> bool:
def _transform_dlpack(data: DataType) -> bool:
from cupy import fromDlpack # pylint: disable=E0401
from cupy import from_dlpack # pylint: disable=E0401
assert "used_dltensor" not in str(data)
data = fromDlpack(data)
data = from_dlpack(data)
return data

View File

@@ -5,12 +5,14 @@ import json
import os
import warnings
from concurrent.futures import ThreadPoolExecutor
from inspect import signature
from typing import (
Any,
Callable,
Dict,
List,
Optional,
Protocol,
Sequence,
Tuple,
Type,
@@ -67,14 +69,20 @@ def _can_use_qdm(tree_method: Optional[str]) -> bool:
return tree_method in ("hist", "gpu_hist", None, "auto")
SklObjective = Optional[
Union[str, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]]
]
class _SklObjWProto(Protocol): # pylint: disable=too-few-public-methods
def __call__(
self,
y_true: ArrayLike,
y_pred: ArrayLike,
sample_weight: Optional[ArrayLike],
) -> Tuple[ArrayLike, ArrayLike]: ...
def _objective_decorator(
func: Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]
) -> Objective:
_SklObjProto = Callable[[ArrayLike, ArrayLike], Tuple[np.ndarray, np.ndarray]]
SklObjective = Optional[Union[str, _SklObjWProto, _SklObjProto]]
def _objective_decorator(func: Union[_SklObjWProto, _SklObjProto]) -> Objective:
"""Decorate an objective function
Converts an objective function using the typical sklearn metrics
@@ -89,6 +97,8 @@ def _objective_decorator(
The target values
y_pred: array_like of shape [n_samples]
The predicted values
sample_weight :
Optional sample weight, None or a ndarray.
Returns
-------
@@ -103,10 +113,25 @@ def _objective_decorator(
``dmatrix.get_label()``
"""
parameters = signature(func).parameters
supports_sw = "sample_weight" in parameters
def inner(preds: np.ndarray, dmatrix: DMatrix) -> Tuple[np.ndarray, np.ndarray]:
"""internal function"""
"""Internal function."""
sample_weight = dmatrix.get_weight()
labels = dmatrix.get_label()
return func(labels, preds)
if sample_weight.size > 0 and not supports_sw:
raise ValueError(
"Custom objective doesn't have the `sample_weight` parameter while"
" sample_weight is used."
)
if sample_weight.size > 0:
fnw = cast(_SklObjWProto, func)
return fnw(labels, preds, sample_weight=sample_weight)
fn = cast(_SklObjProto, func)
return fn(labels, preds)
return inner
@@ -172,75 +197,121 @@ def ltr_metric_decorator(func: Callable, n_jobs: Optional[int]) -> Metric:
return inner
__estimator_doc = """
n_estimators : Optional[int]
__estimator_doc = f"""
n_estimators : {Optional[int]}
Number of gradient boosted trees. Equivalent to number of boosting
rounds.
"""
__model_doc = f"""
max_depth : Optional[int]
max_depth : {Optional[int]}
Maximum tree depth for base learners.
max_leaves :
max_leaves : {Optional[int]}
Maximum number of leaves; 0 indicates no limit.
max_bin :
max_bin : {Optional[int]}
If using histogram-based algorithm, maximum number of bins per feature
grow_policy :
Tree growing policy. 0: favor splitting at nodes closest to the node, i.e. grow
depth-wise. 1: favor splitting at nodes with highest loss change.
learning_rate : Optional[float]
grow_policy : {Optional[str]}
Tree growing policy.
- depthwise: Favors splitting at nodes closest to the node,
- lossguide: Favors splitting at nodes with highest loss change.
learning_rate : {Optional[float]}
Boosting learning rate (xgb's "eta")
verbosity : Optional[int]
verbosity : {Optional[int]}
The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
objective : {SklObjective}
Specify the learning task and the corresponding learning objective or a custom
objective function to be used. For custom objective, see
:doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
information.
objective function to be used.
For custom objective, see :doc:`/tutorials/custom_metric_obj` and
:ref:`custom-obj-metric` for more information, along with the end note for
function signatures.
booster: {Optional[str]}
Specify which booster to use: ``gbtree``, ``gblinear`` or ``dart``.
tree_method : {Optional[str]}
booster: Optional[str]
Specify which booster to use: `gbtree`, `gblinear` or `dart`.
tree_method: Optional[str]
Specify which tree method to use. Default to auto. If this parameter is set to
default, XGBoost will choose the most conservative option available. It's
recommended to study this option from the parameters document :doc:`tree method
</treemethod>`
n_jobs : Optional[int]
n_jobs : {Optional[int]}
Number of parallel threads used to run xgboost. When used with other
Scikit-Learn algorithms like grid search, you may choose which algorithm to
parallelize and balance the threads. Creating thread contention will
significantly slow down both algorithms.
gamma : Optional[float]
(min_split_loss) Minimum loss reduction required to make a further partition on a
leaf node of the tree.
min_child_weight : Optional[float]
gamma : {Optional[float]}
(min_split_loss) Minimum loss reduction required to make a further partition on
a leaf node of the tree.
min_child_weight : {Optional[float]}
Minimum sum of instance weight(hessian) needed in a child.
max_delta_step : Optional[float]
max_delta_step : {Optional[float]}
Maximum delta step we allow each tree's weight estimation to be.
subsample : Optional[float]
subsample : {Optional[float]}
Subsample ratio of the training instance.
sampling_method :
sampling_method : {Optional[str]}
Sampling method. Used only by the GPU version of ``hist`` tree method.
- ``uniform``: select random training instances uniformly.
- ``gradient_based`` select random training instances with higher probability
- ``uniform``: Select random training instances uniformly.
- ``gradient_based``: Select random training instances with higher probability
when the gradient and hessian are larger. (cf. CatBoost)
colsample_bytree : Optional[float]
colsample_bytree : {Optional[float]}
Subsample ratio of columns when constructing each tree.
colsample_bylevel : Optional[float]
colsample_bylevel : {Optional[float]}
Subsample ratio of columns for each level.
colsample_bynode : Optional[float]
colsample_bynode : {Optional[float]}
Subsample ratio of columns for each split.
reg_alpha : Optional[float]
reg_alpha : {Optional[float]}
L1 regularization term on weights (xgb's alpha).
reg_lambda : Optional[float]
reg_lambda : {Optional[float]}
L2 regularization term on weights (xgb's lambda).
scale_pos_weight : Optional[float]
scale_pos_weight : {Optional[float]}
Balancing of positive and negative weights.
base_score : Optional[float]
base_score : {Optional[float]}
The initial prediction score of all instances, global bias.
random_state : Optional[Union[numpy.random.RandomState, numpy.random.Generator, int]]
random_state : {Optional[Union[np.random.RandomState, np.random.Generator, int]]}
Random number seed.
.. note::
@@ -248,34 +319,44 @@ __model_doc = f"""
Using gblinear booster with shotgun updater is nondeterministic as
it uses Hogwild algorithm.
missing : float, default np.nan
Value in the data which needs to be present as a missing value.
num_parallel_tree: Optional[int]
missing : float
Value in the data which needs to be present as a missing value. Default to
:py:data:`numpy.nan`.
num_parallel_tree: {Optional[int]}
Used for boosting random forest.
monotone_constraints : Optional[Union[Dict[str, int], str]]
monotone_constraints : {Optional[Union[Dict[str, int], str]]}
Constraint of variable monotonicity. See :doc:`tutorial </tutorials/monotonic>`
for more information.
interaction_constraints : Optional[Union[str, List[Tuple[str]]]]
interaction_constraints : {Optional[Union[str, List[Tuple[str]]]]}
Constraints for interaction representing permitted interactions. The
constraints must be specified in the form of a nested list, e.g. ``[[0, 1], [2,
3, 4]]``, where each inner list is a group of indices of features that are
allowed to interact with each other. See :doc:`tutorial
</tutorials/feature_interaction_constraint>` for more information
importance_type: Optional[str]
importance_type: {Optional[str]}
The feature importance type for the feature_importances\\_ property:
* For tree model, it's either "gain", "weight", "cover", "total_gain" or
"total_cover".
* For linear model, only "weight" is defined and it's the normalized coefficients
without bias.
* For linear model, only "weight" is defined and it's the normalized
coefficients without bias.
device : Optional[str]
device : {Optional[str]}
.. versionadded:: 2.0.0
Device ordinal, available options are `cpu`, `cuda`, and `gpu`.
validate_parameters : Optional[bool]
validate_parameters : {Optional[bool]}
Give warnings for unknown parameter.
@@ -283,14 +364,14 @@ __model_doc = f"""
See the same parameter of :py:class:`DMatrix` for details.
feature_types : Optional[FeatureTypes]
feature_types : {Optional[FeatureTypes]}
.. versionadded:: 1.7.0
Used for specifying feature types without constructing a dataframe. See
:py:class:`DMatrix` for details.
max_cat_to_onehot : Optional[int]
max_cat_to_onehot : {Optional[int]}
.. versionadded:: 1.6.0
@@ -303,7 +384,7 @@ __model_doc = f"""
categorical feature support. See :doc:`Categorical Data
</tutorials/categorical>` and :ref:`cat-param` for details.
max_cat_threshold : Optional[int]
max_cat_threshold : {Optional[int]}
.. versionadded:: 1.7.0
@@ -314,7 +395,7 @@ __model_doc = f"""
needs to be set to have categorical feature support. See :doc:`Categorical Data
</tutorials/categorical>` and :ref:`cat-param` for details.
multi_strategy : Optional[str]
multi_strategy : {Optional[str]}
.. versionadded:: 2.0.0
@@ -327,7 +408,7 @@ __model_doc = f"""
- ``one_output_per_tree``: One model for each target.
- ``multi_output_tree``: Use multi-target trees.
eval_metric : Optional[Union[str, List[str], Callable]]
eval_metric : {Optional[Union[str, List[str], Callable]]}
.. versionadded:: 1.6.0
@@ -360,7 +441,7 @@ __model_doc = f"""
)
reg.fit(X, y, eval_set=[(X, y)])
early_stopping_rounds : Optional[int]
early_stopping_rounds : {Optional[int]}
.. versionadded:: 1.6.0
@@ -383,7 +464,8 @@ __model_doc = f"""
early stopping. If there's more than one metric in **eval_metric**, the last
metric will be used for early stopping.
callbacks : Optional[List[TrainingCallback]]
callbacks : {Optional[List[TrainingCallback]]}
List of callback functions that are applied at end of each iteration.
It is possible to use predefined callbacks by using
:ref:`Callback API <callback_api>`.
@@ -402,7 +484,8 @@ __model_doc = f"""
reg = xgboost.XGBRegressor(**params, callbacks=callbacks)
reg.fit(X, y)
kwargs : dict, optional
kwargs : {Optional[Any]}
Keyword arguments for XGBoost Booster object. Full documentation of parameters
can be found :doc:`here </parameter>`.
Attempting to set a parameter via the constructor args and \\*\\*kwargs
@@ -419,13 +502,16 @@ __custom_obj_note = """
.. note:: Custom objective function
A custom objective function can be provided for the ``objective``
parameter. In this case, it should have the signature
``objective(y_true, y_pred) -> grad, hess``:
parameter. In this case, it should have the signature ``objective(y_true,
y_pred) -> [grad, hess]`` or ``objective(y_true, y_pred, *, sample_weight)
-> [grad, hess]``:
y_true: array_like of shape [n_samples]
The target values
y_pred: array_like of shape [n_samples]
The predicted values
sample_weight :
Optional sample weights.
grad: array_like of shape [n_samples]
The value of the gradient for each sample point.

View File

@@ -95,6 +95,7 @@ from .utils import (
deserialize_xgb_model,
get_class_name,
get_logger,
get_logger_level,
serialize_booster,
use_cuda,
)
@@ -181,6 +182,8 @@ pred = Pred("prediction", "rawPrediction", "probability", "predContrib")
_INIT_BOOSTER_SAVE_PATH = "init_booster.json"
_LOG_TAG = "XGBoost-PySpark"
class _SparkXGBParams(
HasFeaturesCol,
@@ -1034,6 +1037,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
num_workers = self.getOrDefault(self.num_workers)
log_level = get_logger_level(_LOG_TAG)
def _train_booster(
pandas_df_iter: Iterator[pd.DataFrame],
) -> Iterator[pd.DataFrame]:
@@ -1047,7 +1052,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
dev_ordinal = None
use_qdm = _can_use_qdm(booster_params.get("tree_method", None))
msg = "Training on CPUs"
if run_on_gpu:
dev_ordinal = (
context.partitionId() if is_local else _get_gpu_id(context)
@@ -1058,10 +1063,9 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
# Note: Checking `is_cudf_available` in spark worker side because
# spark worker might has different python environment with driver side.
use_qdm = use_qdm and is_cudf_available()
get_logger("XGBoost-PySpark").info(
"Leveraging %s to train with QDM: %s",
booster_params["device"],
"on" if use_qdm else "off",
msg = (
f"Leveraging {booster_params['device']} to train with "
f"QDM: {'on' if use_qdm else 'off'}"
)
if use_qdm and (booster_params.get("max_bin", None) is not None):
@@ -1070,6 +1074,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
_rabit_args = {}
if context.partitionId() == 0:
_rabit_args = _get_rabit_args(context, num_workers)
get_logger(_LOG_TAG, log_level).info(msg)
worker_message = {
"rabit_msg": _rabit_args,
@@ -1127,7 +1132,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
ret = rdd_with_resource.collect()[0]
return ret[0], ret[1]
get_logger("XGBoost-PySpark").info(
get_logger(_LOG_TAG).info(
"Running xgboost-%s on %s workers with"
"\n\tbooster params: %s"
"\n\ttrain_call_kwargs_params: %s"
@@ -1139,7 +1144,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
dmatrix_kwargs,
)
(config, booster) = _run_job()
get_logger("XGBoost-PySpark").info("Finished xgboost training!")
get_logger(_LOG_TAG).info("Finished xgboost training!")
result_xgb_model = self._convert_to_sklearn_model(
bytearray(booster, "utf-8"), config
@@ -1342,7 +1347,7 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
# User don't set gpu configurations, just use cpu
if gpu_per_task is None:
if use_gpu_by_params:
get_logger("XGBoost-PySpark").warning(
get_logger(_LOG_TAG).warning(
"Do the prediction on the CPUs since "
"no gpu configurations are set"
)
@@ -1377,6 +1382,8 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
is_local = _is_local(_get_spark_session().sparkContext)
run_on_gpu = self._run_on_gpu()
log_level = get_logger_level(_LOG_TAG)
@pandas_udf(schema) # type: ignore
def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
assert xgb_sklearn_model is not None
@@ -1413,7 +1420,8 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
else:
msg = "CUDF or Cupy is unavailable, fallback the inference on the CPUs"
get_logger("XGBoost-PySpark").info(msg)
if context.partitionId() == 0:
get_logger(_LOG_TAG, log_level).info(msg)
def to_gpu_if_possible(data: ArrayLike) -> ArrayLike:
"""Move the data to gpu if possible"""

View File

@@ -8,7 +8,7 @@ import os
import sys
import uuid
from threading import Thread
from typing import Any, Callable, Dict, Optional, Set, Type
from typing import Any, Callable, Dict, Optional, Set, Type, Union
import pyspark
from pyspark import BarrierTaskContext, SparkConf, SparkContext, SparkFiles, TaskContext
@@ -98,10 +98,15 @@ def _get_spark_session() -> SparkSession:
return SparkSession.builder.getOrCreate()
def get_logger(name: str, level: str = "INFO") -> logging.Logger:
def get_logger(name: str, level: Optional[Union[str, int]] = None) -> logging.Logger:
"""Gets a logger by name, or creates and configures it for the first time."""
logger = logging.getLogger(name)
logger.setLevel(level)
if level is not None:
logger.setLevel(level)
else:
# Default to info if not set.
if logger.level == logging.NOTSET:
logger.setLevel(logging.INFO)
# If the logger is configured, skip the configure
if not logger.handlers and not logging.getLogger().handlers:
handler = logging.StreamHandler(sys.stderr)
@@ -113,6 +118,12 @@ def get_logger(name: str, level: str = "INFO") -> logging.Logger:
return logger
def get_logger_level(name: str) -> Optional[int]:
"""Get the logger level for the given log name"""
logger = logging.getLogger(name)
return None if logger.level == logging.NOTSET else logger.level
def _get_max_num_concurrent_tasks(spark_context: SparkContext) -> int:
"""Gets the current max number of concurrent tasks."""
# pylint: disable=protected-access

View File

@@ -815,10 +815,15 @@ def softprob_obj(
return objective
def ls_obj(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
def ls_obj(
y_true: np.ndarray, y_pred: np.ndarray, sample_weight: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, np.ndarray]:
"""Least squared error."""
grad = y_pred - y_true
hess = np.ones(len(y_true))
if sample_weight is not None:
grad *= sample_weight
hess *= sample_weight
return grad, hess

View File

@@ -100,3 +100,21 @@ def run_ranking_categorical(device: str) -> None:
scores = cross_val_score(ltr, X, y)
for s in scores:
assert s > 0.7
def run_normalization(device: str) -> None:
"""Test normalization."""
X, y, qid, _ = tm.make_ltr(2048, 4, 64, 3)
ltr = xgb.XGBRanker(objective="rank:pairwise", n_estimators=4, device=device)
ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
e0 = ltr.evals_result()
ltr = xgb.XGBRanker(
objective="rank:pairwise",
n_estimators=4,
device=device,
lambdarank_normalization=False,
)
ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
e1 = ltr.evals_result()
assert e1["validation_0"]["ndcg@32"][-1] > e0["validation_0"]["ndcg@32"][-1]