merge latest changes
This commit is contained in:
@@ -32,7 +32,10 @@ def build_libxgboost(
|
||||
build_dir: pathlib.Path,
|
||||
build_config: BuildConfiguration,
|
||||
) -> pathlib.Path:
|
||||
"""Build libxgboost in a temporary directory and obtain the path to built libxgboost"""
|
||||
"""Build libxgboost in a temporary directory and obtain the path to built
|
||||
libxgboost.
|
||||
|
||||
"""
|
||||
logger = logging.getLogger("xgboost.packager.build_libxgboost")
|
||||
|
||||
if not cpp_src_dir.is_dir():
|
||||
@@ -51,8 +54,8 @@ def build_libxgboost(
|
||||
cmake_cmd.extend(build_config.get_cmake_args())
|
||||
|
||||
# Flag for cross-compiling for Apple Silicon
|
||||
# We use environment variable because it's the only way to pass down custom flags
|
||||
# through the cibuildwheel package, which calls `pip wheel` command.
|
||||
# We use environment variable because it's the only way to pass down custom
|
||||
# flags through the cibuildwheel package, which calls `pip wheel` command.
|
||||
if "CIBW_TARGET_OSX_ARM64" in os.environ:
|
||||
cmake_cmd.append("-DCMAKE_OSX_ARCHITECTURES=arm64")
|
||||
|
||||
|
||||
@@ -804,10 +804,11 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
||||
|
||||
Otherwise, one can pass a list-like input with the same length as number
|
||||
of columns in `data`, with the following possible values:
|
||||
- "c", which represents categorical columns.
|
||||
- "q", which represents numeric columns.
|
||||
- "int", which represents integer columns.
|
||||
- "i", which represents boolean columns.
|
||||
|
||||
- "c", which represents categorical columns.
|
||||
- "q", which represents numeric columns.
|
||||
- "int", which represents integer columns.
|
||||
- "i", which represents boolean columns.
|
||||
|
||||
Note that, while categorical types are treated differently from
|
||||
the rest for model fitting purposes, the other types do not influence
|
||||
@@ -861,9 +862,9 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
||||
self.nthread = nthread if nthread is not None else -1
|
||||
self.silent = silent
|
||||
|
||||
# force into void_p, mac need to pass things in as void_p
|
||||
if data is None:
|
||||
self.handle: Optional[ctypes.c_void_p] = None
|
||||
if isinstance(data, ctypes.c_void_p):
|
||||
# Used for constructing DMatrix slice.
|
||||
self.handle = data
|
||||
return
|
||||
|
||||
from .data import _is_iter, dispatch_data_backend
|
||||
@@ -925,9 +926,10 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
||||
self.handle = handle
|
||||
|
||||
def __del__(self) -> None:
|
||||
if hasattr(self, "handle") and self.handle:
|
||||
if hasattr(self, "handle"):
|
||||
assert self.handle is not None
|
||||
_check_call(_LIB.XGDMatrixFree(self.handle))
|
||||
self.handle = None
|
||||
del self.handle
|
||||
|
||||
@_deprecate_positional_args
|
||||
def set_info(
|
||||
@@ -1281,19 +1283,19 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
||||
"""
|
||||
from .data import _maybe_np_slice
|
||||
|
||||
res = DMatrix(None)
|
||||
res.handle = ctypes.c_void_p()
|
||||
handle = ctypes.c_void_p()
|
||||
|
||||
rindex = _maybe_np_slice(rindex, dtype=np.int32)
|
||||
_check_call(
|
||||
_LIB.XGDMatrixSliceDMatrixEx(
|
||||
self.handle,
|
||||
c_array(ctypes.c_int, rindex),
|
||||
c_bst_ulong(len(rindex)),
|
||||
ctypes.byref(res.handle),
|
||||
ctypes.byref(handle),
|
||||
ctypes.c_int(1 if allow_groups else 0),
|
||||
)
|
||||
)
|
||||
return res
|
||||
return DMatrix(handle)
|
||||
|
||||
@property
|
||||
def feature_names(self) -> Optional[FeatureNames]:
|
||||
|
||||
@@ -1053,10 +1053,10 @@ def _is_dlpack(data: DataType) -> bool:
|
||||
|
||||
|
||||
def _transform_dlpack(data: DataType) -> bool:
|
||||
from cupy import fromDlpack # pylint: disable=E0401
|
||||
from cupy import from_dlpack # pylint: disable=E0401
|
||||
|
||||
assert "used_dltensor" not in str(data)
|
||||
data = fromDlpack(data)
|
||||
data = from_dlpack(data)
|
||||
return data
|
||||
|
||||
|
||||
|
||||
@@ -5,12 +5,14 @@ import json
|
||||
import os
|
||||
import warnings
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from inspect import signature
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
List,
|
||||
Optional,
|
||||
Protocol,
|
||||
Sequence,
|
||||
Tuple,
|
||||
Type,
|
||||
@@ -67,14 +69,20 @@ def _can_use_qdm(tree_method: Optional[str]) -> bool:
|
||||
return tree_method in ("hist", "gpu_hist", None, "auto")
|
||||
|
||||
|
||||
SklObjective = Optional[
|
||||
Union[str, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]]
|
||||
]
|
||||
class _SklObjWProto(Protocol): # pylint: disable=too-few-public-methods
|
||||
def __call__(
|
||||
self,
|
||||
y_true: ArrayLike,
|
||||
y_pred: ArrayLike,
|
||||
sample_weight: Optional[ArrayLike],
|
||||
) -> Tuple[ArrayLike, ArrayLike]: ...
|
||||
|
||||
|
||||
def _objective_decorator(
|
||||
func: Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]
|
||||
) -> Objective:
|
||||
_SklObjProto = Callable[[ArrayLike, ArrayLike], Tuple[np.ndarray, np.ndarray]]
|
||||
SklObjective = Optional[Union[str, _SklObjWProto, _SklObjProto]]
|
||||
|
||||
|
||||
def _objective_decorator(func: Union[_SklObjWProto, _SklObjProto]) -> Objective:
|
||||
"""Decorate an objective function
|
||||
|
||||
Converts an objective function using the typical sklearn metrics
|
||||
@@ -89,6 +97,8 @@ def _objective_decorator(
|
||||
The target values
|
||||
y_pred: array_like of shape [n_samples]
|
||||
The predicted values
|
||||
sample_weight :
|
||||
Optional sample weight, None or a ndarray.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -103,10 +113,25 @@ def _objective_decorator(
|
||||
``dmatrix.get_label()``
|
||||
"""
|
||||
|
||||
parameters = signature(func).parameters
|
||||
supports_sw = "sample_weight" in parameters
|
||||
|
||||
def inner(preds: np.ndarray, dmatrix: DMatrix) -> Tuple[np.ndarray, np.ndarray]:
|
||||
"""internal function"""
|
||||
"""Internal function."""
|
||||
sample_weight = dmatrix.get_weight()
|
||||
labels = dmatrix.get_label()
|
||||
return func(labels, preds)
|
||||
|
||||
if sample_weight.size > 0 and not supports_sw:
|
||||
raise ValueError(
|
||||
"Custom objective doesn't have the `sample_weight` parameter while"
|
||||
" sample_weight is used."
|
||||
)
|
||||
if sample_weight.size > 0:
|
||||
fnw = cast(_SklObjWProto, func)
|
||||
return fnw(labels, preds, sample_weight=sample_weight)
|
||||
|
||||
fn = cast(_SklObjProto, func)
|
||||
return fn(labels, preds)
|
||||
|
||||
return inner
|
||||
|
||||
@@ -172,75 +197,121 @@ def ltr_metric_decorator(func: Callable, n_jobs: Optional[int]) -> Metric:
|
||||
return inner
|
||||
|
||||
|
||||
__estimator_doc = """
|
||||
n_estimators : Optional[int]
|
||||
__estimator_doc = f"""
|
||||
n_estimators : {Optional[int]}
|
||||
Number of gradient boosted trees. Equivalent to number of boosting
|
||||
rounds.
|
||||
"""
|
||||
|
||||
__model_doc = f"""
|
||||
max_depth : Optional[int]
|
||||
max_depth : {Optional[int]}
|
||||
|
||||
Maximum tree depth for base learners.
|
||||
max_leaves :
|
||||
|
||||
max_leaves : {Optional[int]}
|
||||
|
||||
Maximum number of leaves; 0 indicates no limit.
|
||||
max_bin :
|
||||
|
||||
max_bin : {Optional[int]}
|
||||
|
||||
If using histogram-based algorithm, maximum number of bins per feature
|
||||
grow_policy :
|
||||
Tree growing policy. 0: favor splitting at nodes closest to the node, i.e. grow
|
||||
depth-wise. 1: favor splitting at nodes with highest loss change.
|
||||
learning_rate : Optional[float]
|
||||
|
||||
grow_policy : {Optional[str]}
|
||||
|
||||
Tree growing policy.
|
||||
|
||||
- depthwise: Favors splitting at nodes closest to the node,
|
||||
- lossguide: Favors splitting at nodes with highest loss change.
|
||||
|
||||
learning_rate : {Optional[float]}
|
||||
|
||||
Boosting learning rate (xgb's "eta")
|
||||
verbosity : Optional[int]
|
||||
|
||||
verbosity : {Optional[int]}
|
||||
|
||||
The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
|
||||
|
||||
objective : {SklObjective}
|
||||
|
||||
Specify the learning task and the corresponding learning objective or a custom
|
||||
objective function to be used. For custom objective, see
|
||||
:doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
|
||||
information.
|
||||
objective function to be used.
|
||||
|
||||
For custom objective, see :doc:`/tutorials/custom_metric_obj` and
|
||||
:ref:`custom-obj-metric` for more information, along with the end note for
|
||||
function signatures.
|
||||
|
||||
booster: {Optional[str]}
|
||||
|
||||
Specify which booster to use: ``gbtree``, ``gblinear`` or ``dart``.
|
||||
|
||||
tree_method : {Optional[str]}
|
||||
|
||||
booster: Optional[str]
|
||||
Specify which booster to use: `gbtree`, `gblinear` or `dart`.
|
||||
tree_method: Optional[str]
|
||||
Specify which tree method to use. Default to auto. If this parameter is set to
|
||||
default, XGBoost will choose the most conservative option available. It's
|
||||
recommended to study this option from the parameters document :doc:`tree method
|
||||
</treemethod>`
|
||||
n_jobs : Optional[int]
|
||||
|
||||
n_jobs : {Optional[int]}
|
||||
|
||||
Number of parallel threads used to run xgboost. When used with other
|
||||
Scikit-Learn algorithms like grid search, you may choose which algorithm to
|
||||
parallelize and balance the threads. Creating thread contention will
|
||||
significantly slow down both algorithms.
|
||||
gamma : Optional[float]
|
||||
(min_split_loss) Minimum loss reduction required to make a further partition on a
|
||||
leaf node of the tree.
|
||||
min_child_weight : Optional[float]
|
||||
|
||||
gamma : {Optional[float]}
|
||||
|
||||
(min_split_loss) Minimum loss reduction required to make a further partition on
|
||||
a leaf node of the tree.
|
||||
|
||||
min_child_weight : {Optional[float]}
|
||||
|
||||
Minimum sum of instance weight(hessian) needed in a child.
|
||||
max_delta_step : Optional[float]
|
||||
|
||||
max_delta_step : {Optional[float]}
|
||||
|
||||
Maximum delta step we allow each tree's weight estimation to be.
|
||||
subsample : Optional[float]
|
||||
|
||||
subsample : {Optional[float]}
|
||||
|
||||
Subsample ratio of the training instance.
|
||||
sampling_method :
|
||||
|
||||
sampling_method : {Optional[str]}
|
||||
|
||||
Sampling method. Used only by the GPU version of ``hist`` tree method.
|
||||
- ``uniform``: select random training instances uniformly.
|
||||
- ``gradient_based`` select random training instances with higher probability
|
||||
|
||||
- ``uniform``: Select random training instances uniformly.
|
||||
- ``gradient_based``: Select random training instances with higher probability
|
||||
when the gradient and hessian are larger. (cf. CatBoost)
|
||||
colsample_bytree : Optional[float]
|
||||
|
||||
colsample_bytree : {Optional[float]}
|
||||
|
||||
Subsample ratio of columns when constructing each tree.
|
||||
colsample_bylevel : Optional[float]
|
||||
|
||||
colsample_bylevel : {Optional[float]}
|
||||
|
||||
Subsample ratio of columns for each level.
|
||||
colsample_bynode : Optional[float]
|
||||
|
||||
colsample_bynode : {Optional[float]}
|
||||
|
||||
Subsample ratio of columns for each split.
|
||||
reg_alpha : Optional[float]
|
||||
|
||||
reg_alpha : {Optional[float]}
|
||||
|
||||
L1 regularization term on weights (xgb's alpha).
|
||||
reg_lambda : Optional[float]
|
||||
|
||||
reg_lambda : {Optional[float]}
|
||||
|
||||
L2 regularization term on weights (xgb's lambda).
|
||||
scale_pos_weight : Optional[float]
|
||||
|
||||
scale_pos_weight : {Optional[float]}
|
||||
Balancing of positive and negative weights.
|
||||
base_score : Optional[float]
|
||||
|
||||
base_score : {Optional[float]}
|
||||
|
||||
The initial prediction score of all instances, global bias.
|
||||
random_state : Optional[Union[numpy.random.RandomState, numpy.random.Generator, int]]
|
||||
|
||||
random_state : {Optional[Union[np.random.RandomState, np.random.Generator, int]]}
|
||||
|
||||
Random number seed.
|
||||
|
||||
.. note::
|
||||
@@ -248,34 +319,44 @@ __model_doc = f"""
|
||||
Using gblinear booster with shotgun updater is nondeterministic as
|
||||
it uses Hogwild algorithm.
|
||||
|
||||
missing : float, default np.nan
|
||||
Value in the data which needs to be present as a missing value.
|
||||
num_parallel_tree: Optional[int]
|
||||
missing : float
|
||||
|
||||
Value in the data which needs to be present as a missing value. Default to
|
||||
:py:data:`numpy.nan`.
|
||||
|
||||
num_parallel_tree: {Optional[int]}
|
||||
|
||||
Used for boosting random forest.
|
||||
monotone_constraints : Optional[Union[Dict[str, int], str]]
|
||||
|
||||
monotone_constraints : {Optional[Union[Dict[str, int], str]]}
|
||||
|
||||
Constraint of variable monotonicity. See :doc:`tutorial </tutorials/monotonic>`
|
||||
for more information.
|
||||
interaction_constraints : Optional[Union[str, List[Tuple[str]]]]
|
||||
|
||||
interaction_constraints : {Optional[Union[str, List[Tuple[str]]]]}
|
||||
|
||||
Constraints for interaction representing permitted interactions. The
|
||||
constraints must be specified in the form of a nested list, e.g. ``[[0, 1], [2,
|
||||
3, 4]]``, where each inner list is a group of indices of features that are
|
||||
allowed to interact with each other. See :doc:`tutorial
|
||||
</tutorials/feature_interaction_constraint>` for more information
|
||||
importance_type: Optional[str]
|
||||
|
||||
importance_type: {Optional[str]}
|
||||
|
||||
The feature importance type for the feature_importances\\_ property:
|
||||
|
||||
* For tree model, it's either "gain", "weight", "cover", "total_gain" or
|
||||
"total_cover".
|
||||
* For linear model, only "weight" is defined and it's the normalized coefficients
|
||||
without bias.
|
||||
* For linear model, only "weight" is defined and it's the normalized
|
||||
coefficients without bias.
|
||||
|
||||
device : Optional[str]
|
||||
device : {Optional[str]}
|
||||
|
||||
.. versionadded:: 2.0.0
|
||||
|
||||
Device ordinal, available options are `cpu`, `cuda`, and `gpu`.
|
||||
|
||||
validate_parameters : Optional[bool]
|
||||
validate_parameters : {Optional[bool]}
|
||||
|
||||
Give warnings for unknown parameter.
|
||||
|
||||
@@ -283,14 +364,14 @@ __model_doc = f"""
|
||||
|
||||
See the same parameter of :py:class:`DMatrix` for details.
|
||||
|
||||
feature_types : Optional[FeatureTypes]
|
||||
feature_types : {Optional[FeatureTypes]}
|
||||
|
||||
.. versionadded:: 1.7.0
|
||||
|
||||
Used for specifying feature types without constructing a dataframe. See
|
||||
:py:class:`DMatrix` for details.
|
||||
|
||||
max_cat_to_onehot : Optional[int]
|
||||
max_cat_to_onehot : {Optional[int]}
|
||||
|
||||
.. versionadded:: 1.6.0
|
||||
|
||||
@@ -303,7 +384,7 @@ __model_doc = f"""
|
||||
categorical feature support. See :doc:`Categorical Data
|
||||
</tutorials/categorical>` and :ref:`cat-param` for details.
|
||||
|
||||
max_cat_threshold : Optional[int]
|
||||
max_cat_threshold : {Optional[int]}
|
||||
|
||||
.. versionadded:: 1.7.0
|
||||
|
||||
@@ -314,7 +395,7 @@ __model_doc = f"""
|
||||
needs to be set to have categorical feature support. See :doc:`Categorical Data
|
||||
</tutorials/categorical>` and :ref:`cat-param` for details.
|
||||
|
||||
multi_strategy : Optional[str]
|
||||
multi_strategy : {Optional[str]}
|
||||
|
||||
.. versionadded:: 2.0.0
|
||||
|
||||
@@ -327,7 +408,7 @@ __model_doc = f"""
|
||||
- ``one_output_per_tree``: One model for each target.
|
||||
- ``multi_output_tree``: Use multi-target trees.
|
||||
|
||||
eval_metric : Optional[Union[str, List[str], Callable]]
|
||||
eval_metric : {Optional[Union[str, List[str], Callable]]}
|
||||
|
||||
.. versionadded:: 1.6.0
|
||||
|
||||
@@ -360,7 +441,7 @@ __model_doc = f"""
|
||||
)
|
||||
reg.fit(X, y, eval_set=[(X, y)])
|
||||
|
||||
early_stopping_rounds : Optional[int]
|
||||
early_stopping_rounds : {Optional[int]}
|
||||
|
||||
.. versionadded:: 1.6.0
|
||||
|
||||
@@ -383,7 +464,8 @@ __model_doc = f"""
|
||||
early stopping. If there's more than one metric in **eval_metric**, the last
|
||||
metric will be used for early stopping.
|
||||
|
||||
callbacks : Optional[List[TrainingCallback]]
|
||||
callbacks : {Optional[List[TrainingCallback]]}
|
||||
|
||||
List of callback functions that are applied at end of each iteration.
|
||||
It is possible to use predefined callbacks by using
|
||||
:ref:`Callback API <callback_api>`.
|
||||
@@ -402,7 +484,8 @@ __model_doc = f"""
|
||||
reg = xgboost.XGBRegressor(**params, callbacks=callbacks)
|
||||
reg.fit(X, y)
|
||||
|
||||
kwargs : dict, optional
|
||||
kwargs : {Optional[Any]}
|
||||
|
||||
Keyword arguments for XGBoost Booster object. Full documentation of parameters
|
||||
can be found :doc:`here </parameter>`.
|
||||
Attempting to set a parameter via the constructor args and \\*\\*kwargs
|
||||
@@ -419,13 +502,16 @@ __custom_obj_note = """
|
||||
.. note:: Custom objective function
|
||||
|
||||
A custom objective function can be provided for the ``objective``
|
||||
parameter. In this case, it should have the signature
|
||||
``objective(y_true, y_pred) -> grad, hess``:
|
||||
parameter. In this case, it should have the signature ``objective(y_true,
|
||||
y_pred) -> [grad, hess]`` or ``objective(y_true, y_pred, *, sample_weight)
|
||||
-> [grad, hess]``:
|
||||
|
||||
y_true: array_like of shape [n_samples]
|
||||
The target values
|
||||
y_pred: array_like of shape [n_samples]
|
||||
The predicted values
|
||||
sample_weight :
|
||||
Optional sample weights.
|
||||
|
||||
grad: array_like of shape [n_samples]
|
||||
The value of the gradient for each sample point.
|
||||
|
||||
@@ -95,6 +95,7 @@ from .utils import (
|
||||
deserialize_xgb_model,
|
||||
get_class_name,
|
||||
get_logger,
|
||||
get_logger_level,
|
||||
serialize_booster,
|
||||
use_cuda,
|
||||
)
|
||||
@@ -181,6 +182,8 @@ pred = Pred("prediction", "rawPrediction", "probability", "predContrib")
|
||||
|
||||
_INIT_BOOSTER_SAVE_PATH = "init_booster.json"
|
||||
|
||||
_LOG_TAG = "XGBoost-PySpark"
|
||||
|
||||
|
||||
class _SparkXGBParams(
|
||||
HasFeaturesCol,
|
||||
@@ -1034,6 +1037,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
||||
|
||||
num_workers = self.getOrDefault(self.num_workers)
|
||||
|
||||
log_level = get_logger_level(_LOG_TAG)
|
||||
|
||||
def _train_booster(
|
||||
pandas_df_iter: Iterator[pd.DataFrame],
|
||||
) -> Iterator[pd.DataFrame]:
|
||||
@@ -1047,7 +1052,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
||||
|
||||
dev_ordinal = None
|
||||
use_qdm = _can_use_qdm(booster_params.get("tree_method", None))
|
||||
|
||||
msg = "Training on CPUs"
|
||||
if run_on_gpu:
|
||||
dev_ordinal = (
|
||||
context.partitionId() if is_local else _get_gpu_id(context)
|
||||
@@ -1058,10 +1063,9 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
||||
# Note: Checking `is_cudf_available` in spark worker side because
|
||||
# spark worker might has different python environment with driver side.
|
||||
use_qdm = use_qdm and is_cudf_available()
|
||||
get_logger("XGBoost-PySpark").info(
|
||||
"Leveraging %s to train with QDM: %s",
|
||||
booster_params["device"],
|
||||
"on" if use_qdm else "off",
|
||||
msg = (
|
||||
f"Leveraging {booster_params['device']} to train with "
|
||||
f"QDM: {'on' if use_qdm else 'off'}"
|
||||
)
|
||||
|
||||
if use_qdm and (booster_params.get("max_bin", None) is not None):
|
||||
@@ -1070,6 +1074,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
||||
_rabit_args = {}
|
||||
if context.partitionId() == 0:
|
||||
_rabit_args = _get_rabit_args(context, num_workers)
|
||||
get_logger(_LOG_TAG, log_level).info(msg)
|
||||
|
||||
worker_message = {
|
||||
"rabit_msg": _rabit_args,
|
||||
@@ -1127,7 +1132,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
||||
ret = rdd_with_resource.collect()[0]
|
||||
return ret[0], ret[1]
|
||||
|
||||
get_logger("XGBoost-PySpark").info(
|
||||
get_logger(_LOG_TAG).info(
|
||||
"Running xgboost-%s on %s workers with"
|
||||
"\n\tbooster params: %s"
|
||||
"\n\ttrain_call_kwargs_params: %s"
|
||||
@@ -1139,7 +1144,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
||||
dmatrix_kwargs,
|
||||
)
|
||||
(config, booster) = _run_job()
|
||||
get_logger("XGBoost-PySpark").info("Finished xgboost training!")
|
||||
get_logger(_LOG_TAG).info("Finished xgboost training!")
|
||||
|
||||
result_xgb_model = self._convert_to_sklearn_model(
|
||||
bytearray(booster, "utf-8"), config
|
||||
@@ -1342,7 +1347,7 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
|
||||
# User don't set gpu configurations, just use cpu
|
||||
if gpu_per_task is None:
|
||||
if use_gpu_by_params:
|
||||
get_logger("XGBoost-PySpark").warning(
|
||||
get_logger(_LOG_TAG).warning(
|
||||
"Do the prediction on the CPUs since "
|
||||
"no gpu configurations are set"
|
||||
)
|
||||
@@ -1377,6 +1382,8 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
|
||||
is_local = _is_local(_get_spark_session().sparkContext)
|
||||
run_on_gpu = self._run_on_gpu()
|
||||
|
||||
log_level = get_logger_level(_LOG_TAG)
|
||||
|
||||
@pandas_udf(schema) # type: ignore
|
||||
def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
|
||||
assert xgb_sklearn_model is not None
|
||||
@@ -1413,7 +1420,8 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
|
||||
else:
|
||||
msg = "CUDF or Cupy is unavailable, fallback the inference on the CPUs"
|
||||
|
||||
get_logger("XGBoost-PySpark").info(msg)
|
||||
if context.partitionId() == 0:
|
||||
get_logger(_LOG_TAG, log_level).info(msg)
|
||||
|
||||
def to_gpu_if_possible(data: ArrayLike) -> ArrayLike:
|
||||
"""Move the data to gpu if possible"""
|
||||
|
||||
@@ -8,7 +8,7 @@ import os
|
||||
import sys
|
||||
import uuid
|
||||
from threading import Thread
|
||||
from typing import Any, Callable, Dict, Optional, Set, Type
|
||||
from typing import Any, Callable, Dict, Optional, Set, Type, Union
|
||||
|
||||
import pyspark
|
||||
from pyspark import BarrierTaskContext, SparkConf, SparkContext, SparkFiles, TaskContext
|
||||
@@ -98,10 +98,15 @@ def _get_spark_session() -> SparkSession:
|
||||
return SparkSession.builder.getOrCreate()
|
||||
|
||||
|
||||
def get_logger(name: str, level: str = "INFO") -> logging.Logger:
|
||||
def get_logger(name: str, level: Optional[Union[str, int]] = None) -> logging.Logger:
|
||||
"""Gets a logger by name, or creates and configures it for the first time."""
|
||||
logger = logging.getLogger(name)
|
||||
logger.setLevel(level)
|
||||
if level is not None:
|
||||
logger.setLevel(level)
|
||||
else:
|
||||
# Default to info if not set.
|
||||
if logger.level == logging.NOTSET:
|
||||
logger.setLevel(logging.INFO)
|
||||
# If the logger is configured, skip the configure
|
||||
if not logger.handlers and not logging.getLogger().handlers:
|
||||
handler = logging.StreamHandler(sys.stderr)
|
||||
@@ -113,6 +118,12 @@ def get_logger(name: str, level: str = "INFO") -> logging.Logger:
|
||||
return logger
|
||||
|
||||
|
||||
def get_logger_level(name: str) -> Optional[int]:
|
||||
"""Get the logger level for the given log name"""
|
||||
logger = logging.getLogger(name)
|
||||
return None if logger.level == logging.NOTSET else logger.level
|
||||
|
||||
|
||||
def _get_max_num_concurrent_tasks(spark_context: SparkContext) -> int:
|
||||
"""Gets the current max number of concurrent tasks."""
|
||||
# pylint: disable=protected-access
|
||||
|
||||
@@ -815,10 +815,15 @@ def softprob_obj(
|
||||
return objective
|
||||
|
||||
|
||||
def ls_obj(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
|
||||
def ls_obj(
|
||||
y_true: np.ndarray, y_pred: np.ndarray, sample_weight: Optional[np.ndarray] = None
|
||||
) -> Tuple[np.ndarray, np.ndarray]:
|
||||
"""Least squared error."""
|
||||
grad = y_pred - y_true
|
||||
hess = np.ones(len(y_true))
|
||||
if sample_weight is not None:
|
||||
grad *= sample_weight
|
||||
hess *= sample_weight
|
||||
return grad, hess
|
||||
|
||||
|
||||
|
||||
@@ -100,3 +100,21 @@ def run_ranking_categorical(device: str) -> None:
|
||||
scores = cross_val_score(ltr, X, y)
|
||||
for s in scores:
|
||||
assert s > 0.7
|
||||
|
||||
|
||||
def run_normalization(device: str) -> None:
|
||||
"""Test normalization."""
|
||||
X, y, qid, _ = tm.make_ltr(2048, 4, 64, 3)
|
||||
ltr = xgb.XGBRanker(objective="rank:pairwise", n_estimators=4, device=device)
|
||||
ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
|
||||
e0 = ltr.evals_result()
|
||||
|
||||
ltr = xgb.XGBRanker(
|
||||
objective="rank:pairwise",
|
||||
n_estimators=4,
|
||||
device=device,
|
||||
lambdarank_normalization=False,
|
||||
)
|
||||
ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
|
||||
e1 = ltr.evals_result()
|
||||
assert e1["validation_0"]["ndcg@32"][-1] > e0["validation_0"]["ndcg@32"][-1]
|
||||
|
||||
Reference in New Issue
Block a user