Merge branch 'master' into sync-2024Jan24

This commit is contained in:
Hui Liu
2024-02-01 14:41:48 -08:00
99 changed files with 2476 additions and 283 deletions

View File

@@ -2,6 +2,7 @@
Custom hook to customize the behavior of Hatchling.
Here, we customize the tag of the generated wheels.
"""
import sysconfig
from typing import Any, Dict

View File

@@ -1,4 +1,5 @@
"""Build configuration"""
import dataclasses
from typing import Any, Dict, List, Optional

View File

@@ -1,6 +1,7 @@
"""
Functions for building libxgboost
"""
import logging
import os
import pathlib

View File

@@ -4,6 +4,7 @@ Builds source distribution and binary wheels, following PEP 517 / PEP 660.
Reuses components of Hatchling (https://github.com/pypa/hatch/tree/master/backend) for the sake
of brevity.
"""
import dataclasses
import logging
import os

View File

@@ -1,6 +1,7 @@
"""
Functions for building sdist
"""
import logging
import pathlib

View File

@@ -1,6 +1,7 @@
"""
Utility functions for implementing PEP 517 backend
"""
import logging
import pathlib
import shutil

View File

@@ -36,6 +36,11 @@ PandasDType = Any # real type is pandas.core.dtypes.base.ExtensionDtype
FloatCompatible = Union[float, np.float32, np.float64]
# typing.SupportsInt is not suitable here since floating point values are convertible to
# integers as well.
Integer = Union[int, np.integer]
IterationRange = Tuple[Integer, Integer]
# callables
FPreProcCallable = Callable

View File

@@ -1,4 +1,5 @@
"""XGBoost collective communication related API."""
import ctypes
import json
import logging

View File

@@ -48,6 +48,8 @@ from ._typing import (
FeatureInfo,
FeatureNames,
FeatureTypes,
Integer,
IterationRange,
ModelIn,
NumpyOrCupy,
TransformedData,
@@ -62,13 +64,11 @@ class XGBoostError(ValueError):
@overload
def from_pystr_to_cstr(data: str) -> bytes:
...
def from_pystr_to_cstr(data: str) -> bytes: ...
@overload
def from_pystr_to_cstr(data: List[str]) -> ctypes.Array:
...
def from_pystr_to_cstr(data: List[str]) -> ctypes.Array: ...
def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, ctypes.Array]:
@@ -798,9 +798,23 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
Set names for features.
feature_types :
Set types for features. When `enable_categorical` is set to `True`, string
"c" represents categorical data type while "q" represents numerical feature
type. For categorical features, the input is assumed to be preprocessed and
Set types for features. If `data` is a DataFrame and
`enable_categorical=True` is passed, the types will be deduced automatically
from the column types.
Otherwise, one can pass a list-like input with the same length as the number
of columns in `data`, with the following possible values:
- "c", which represents categorical columns.
- "q", which represents numeric columns.
- "int", which represents integer columns.
- "i", which represents boolean columns.
Note that, while categorical types are treated differently from
the rest for model fitting purposes, the other types do not influence
the generated model, but they do affect other functionality such as
feature importances.
For categorical features, the input is assumed to be preprocessed and
encoded by the users. The encoding can be done via
:py:class:`sklearn.preprocessing.OrdinalEncoder` or pandas dataframe
`.cat.codes` method. This is useful when users want to specify categorical
@@ -1812,19 +1826,25 @@ class Booster:
state["handle"] = handle
self.__dict__.update(state)
def __getitem__(self, val: Union[int, tuple, slice]) -> "Booster":
def __getitem__(self, val: Union[Integer, tuple, slice]) -> "Booster":
"""Get a slice of the tree-based model.
.. versionadded:: 1.3.0
"""
if isinstance(val, int):
val = slice(val, val + 1)
# convert to slice for all other types
if isinstance(val, (np.integer, int)):
val = slice(int(val), int(val + 1))
if isinstance(val, type(Ellipsis)):
val = slice(0, 0)
if isinstance(val, tuple):
raise ValueError("Only supports slicing through 1 dimension.")
# All supported types are now slice
# FIXME(jiamingy): Use `types.EllipsisType` once Python 3.10 is used.
if not isinstance(val, slice):
msg = _expect((int, slice), type(val))
msg = _expect((int, slice, np.integer, type(Ellipsis)), type(val))
raise TypeError(msg)
if isinstance(val.start, type(Ellipsis)) or val.start is None:
start = 0
else:
@@ -2246,12 +2266,13 @@ class Booster:
pred_interactions: bool = False,
validate_features: bool = True,
training: bool = False,
iteration_range: Tuple[int, int] = (0, 0),
iteration_range: IterationRange = (0, 0),
strict_shape: bool = False,
) -> np.ndarray:
"""Predict with data. The full model will be used unless `iteration_range` is specified,
meaning user have to either slice the model or use the ``best_iteration``
attribute to get prediction from best model returned from early stopping.
"""Predict with data. The full model will be used unless `iteration_range` is
specified, meaning users have to either slice the model or use the
``best_iteration`` attribute to get predictions from the best model returned
from early stopping.
.. note::
@@ -2336,8 +2357,8 @@ class Booster:
args = {
"type": 0,
"training": training,
"iteration_begin": iteration_range[0],
"iteration_end": iteration_range[1],
"iteration_begin": int(iteration_range[0]),
"iteration_end": int(iteration_range[1]),
"strict_shape": strict_shape,
}
@@ -2373,7 +2394,7 @@ class Booster:
def inplace_predict(
self,
data: DataType,
iteration_range: Tuple[int, int] = (0, 0),
iteration_range: IterationRange = (0, 0),
predict_type: str = "value",
missing: float = np.nan,
validate_features: bool = True,
@@ -2439,8 +2460,8 @@ class Booster:
args = make_jcargs(
type=1 if predict_type == "margin" else 0,
training=False,
iteration_begin=iteration_range[0],
iteration_end=iteration_range[1],
iteration_begin=int(iteration_range[0]),
iteration_end=int(iteration_range[1]),
missing=missing,
strict_shape=strict_shape,
cache_id=0,

View File

@@ -61,7 +61,7 @@ from typing import (
import numpy
from xgboost import collective, config
from xgboost._typing import _T, FeatureNames, FeatureTypes
from xgboost._typing import _T, FeatureNames, FeatureTypes, IterationRange
from xgboost.callback import TrainingCallback
from xgboost.compat import DataFrame, LazyLoader, concat, lazy_isinstance
from xgboost.core import (
@@ -1146,9 +1146,9 @@ async def _direct_predict_impl( # pylint: disable=too-many-branches
if _can_output_df(isinstance(data, dd.DataFrame), output_shape):
if base_margin is not None and isinstance(base_margin, da.Array):
# Easier for map_partitions
base_margin_df: Optional[
Union[dd.DataFrame, dd.Series]
] = base_margin.to_dask_dataframe()
base_margin_df: Optional[Union[dd.DataFrame, dd.Series]] = (
base_margin.to_dask_dataframe()
)
else:
base_margin_df = base_margin
predictions = dd.map_partitions(
@@ -1263,7 +1263,7 @@ async def _predict_async(
approx_contribs: bool,
pred_interactions: bool,
validate_features: bool,
iteration_range: Tuple[int, int],
iteration_range: IterationRange,
strict_shape: bool,
) -> _DaskCollection:
_booster = await _get_model_future(client, model)
@@ -1410,7 +1410,7 @@ def predict( # pylint: disable=unused-argument
approx_contribs: bool = False,
pred_interactions: bool = False,
validate_features: bool = True,
iteration_range: Tuple[int, int] = (0, 0),
iteration_range: IterationRange = (0, 0),
strict_shape: bool = False,
) -> Any:
"""Run prediction with a trained booster.
@@ -1458,7 +1458,7 @@ async def _inplace_predict_async( # pylint: disable=too-many-branches
global_config: Dict[str, Any],
model: Union[Booster, Dict, "distributed.Future"],
data: _DataT,
iteration_range: Tuple[int, int],
iteration_range: IterationRange,
predict_type: str,
missing: float,
validate_features: bool,
@@ -1516,7 +1516,7 @@ def inplace_predict( # pylint: disable=unused-argument
client: Optional["distributed.Client"],
model: Union[TrainReturnT, Booster, "distributed.Future"],
data: _DataT,
iteration_range: Tuple[int, int] = (0, 0),
iteration_range: IterationRange = (0, 0),
predict_type: str = "value",
missing: float = numpy.nan,
validate_features: bool = True,
@@ -1624,7 +1624,7 @@ class DaskScikitLearnBase(XGBModel):
output_margin: bool,
validate_features: bool,
base_margin: Optional[_DaskCollection],
iteration_range: Optional[Tuple[int, int]],
iteration_range: Optional[IterationRange],
) -> Any:
iteration_range = self._get_iteration_range(iteration_range)
if self._can_use_inplace_predict():
@@ -1664,7 +1664,7 @@ class DaskScikitLearnBase(XGBModel):
output_margin: bool = False,
validate_features: bool = True,
base_margin: Optional[_DaskCollection] = None,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> Any:
_assert_dask_support()
return self.client.sync(
@@ -1679,7 +1679,7 @@ class DaskScikitLearnBase(XGBModel):
async def _apply_async(
self,
X: _DataT,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> Any:
iteration_range = self._get_iteration_range(iteration_range)
test_dmatrix = await DaskDMatrix(
@@ -1700,7 +1700,7 @@ class DaskScikitLearnBase(XGBModel):
def apply(
self,
X: _DataT,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> Any:
_assert_dask_support()
return self.client.sync(self._apply_async, X, iteration_range=iteration_range)
@@ -1962,7 +1962,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
X: _DataT,
validate_features: bool,
base_margin: Optional[_DaskCollection],
iteration_range: Optional[Tuple[int, int]],
iteration_range: Optional[IterationRange],
) -> _DaskCollection:
if self.objective == "multi:softmax":
raise ValueError(
@@ -1987,7 +1987,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
X: _DaskCollection,
validate_features: bool = True,
base_margin: Optional[_DaskCollection] = None,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> Any:
_assert_dask_support()
return self._client_sync(
@@ -2006,7 +2006,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
output_margin: bool,
validate_features: bool,
base_margin: Optional[_DaskCollection],
iteration_range: Optional[Tuple[int, int]],
iteration_range: Optional[IterationRange],
) -> _DaskCollection:
pred_probs = await super()._predict_async(
data, output_margin, validate_features, base_margin, iteration_range

View File

@@ -1,4 +1,5 @@
"""Utilities for the XGBoost Dask interface."""
import logging
from typing import TYPE_CHECKING, Any, Dict

View File

@@ -22,7 +22,7 @@ from typing import (
import numpy as np
from scipy.special import softmax
from ._typing import ArrayLike, FeatureNames, FeatureTypes, ModelIn
from ._typing import ArrayLike, FeatureNames, FeatureTypes, IterationRange, ModelIn
from .callback import TrainingCallback
# Do not use class names on scikit-learn directly. Re-define the classes on
@@ -1039,8 +1039,8 @@ class XGBModel(XGBModelBase):
return False
def _get_iteration_range(
self, iteration_range: Optional[Tuple[int, int]]
) -> Tuple[int, int]:
self, iteration_range: Optional[IterationRange]
) -> IterationRange:
if iteration_range is None or iteration_range[1] == 0:
# Use best_iteration if defined.
try:
@@ -1057,7 +1057,7 @@ class XGBModel(XGBModelBase):
output_margin: bool = False,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> ArrayLike:
"""Predict with `X`. If the model is trained with early stopping, then
:py:attr:`best_iteration` is used automatically. The estimator uses
@@ -1129,7 +1129,7 @@ class XGBModel(XGBModelBase):
def apply(
self,
X: ArrayLike,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> np.ndarray:
"""Return the predicted leaf every tree for each sample. If the model is trained
with early stopping, then :py:attr:`best_iteration` is used automatically.
@@ -1465,7 +1465,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
output_margin: bool = False,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> ArrayLike:
with config_context(verbosity=self.verbosity):
class_probs = super().predict(
@@ -1500,7 +1500,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
X: ArrayLike,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> np.ndarray:
"""Predict the probability of each `X` example being of a given class. If the
model is trained with early stopping, then :py:attr:`best_iteration` is used
@@ -1942,7 +1942,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
output_margin: bool = False,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> ArrayLike:
X, _ = _get_qid(X, None)
return super().predict(
@@ -1956,7 +1956,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
def apply(
self,
X: ArrayLike,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> ArrayLike:
X, _ = _get_qid(X, None)
return super().apply(X, iteration_range)

View File

@@ -1,4 +1,5 @@
"""XGBoost pyspark integration submodule for core code."""
import base64
# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name

View File

@@ -1,4 +1,5 @@
"""Xgboost pyspark integration submodule for estimator API."""
# pylint: disable=too-many-ancestors
# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
# pylint: disable=unused-argument, too-many-locals

View File

@@ -1,4 +1,5 @@
"""Xgboost pyspark integration submodule for params."""
from typing import Dict
# pylint: disable=too-few-public-methods
@@ -55,7 +56,6 @@ class HasFeaturesCols(Params):
class HasEnableSparseDataOptim(Params):
"""
This is a Params based class that is extended by _SparkXGBParams
and holds the variable to store the boolean config of enabling sparse data optimization.

View File

@@ -1,4 +1,5 @@
"""Xgboost pyspark integration submodule for helper functions."""
# pylint: disable=fixme
import inspect

View File

@@ -2,6 +2,7 @@
change without notice.
"""
# pylint: disable=invalid-name,missing-function-docstring,import-error
import gc
import importlib.util

View File

@@ -1,4 +1,5 @@
"""Tests for training continuation."""
import json
from typing import Any, Dict, TypeVar

View File

@@ -1,4 +1,5 @@
"""Tests for dask shared by different test modules."""
import numpy as np
import pandas as pd
from dask import array as da

View File

@@ -1,4 +1,5 @@
"""Tests related to the `DataIter` interface."""
import numpy as np
import xgboost

View File

@@ -1,4 +1,5 @@
"""Tests for evaluation metrics."""
from typing import Dict, List
import numpy as np

View File

@@ -1,4 +1,5 @@
"""Testing code shared by other tests."""
# pylint: disable=invalid-name
import collections
import importlib.util

View File

@@ -1,4 +1,5 @@
"""Tests for updaters."""
import json
from functools import partial, update_wrapper
from typing import Any, Dict, List