[dask] Cleanup dask module. (#7634)
* Add a new utility for mapping a function onto workers.
* Unify the type for feature names.
* Clean up the iterator.
* Fix prediction with DaskDMatrix worker specification.
* Fix base margin with DeviceQuantileDMatrix.
* Support VS 2022 in setup.py.
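The commit touches the dask training and prediction path (DaskDMatrix, predict) as well as core typing. For orientation, here is a minimal sketch of the workflow these changes affect; the synthetic data, cluster size, and parameter values are illustrative only and are not part of the diff.

# Minimal sketch of the dask workflow touched by this commit (values are made up).
import dask.array as da
import xgboost as xgb
from dask.distributed import Client, LocalCluster

if __name__ == "__main__":
    with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
        X = da.random.random((1000, 10), chunks=(100, 10))
        y = da.random.random(1000, chunks=100)

        # DaskDMatrix keeps references to the partitions living on each worker.
        dtrain = xgb.dask.DaskDMatrix(client, X, y)
        output = xgb.dask.train(
            client, {"tree_method": "hist"}, dtrain, num_boost_round=10
        )
        # Prediction can run on the DaskDMatrix or directly on the dask collection.
        predt = xgb.dask.predict(client, output, dtrain)
        print(predt.compute()[:5])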
Parent: 926af9951e
Commit: fe4ce920b2
@@ -171,8 +171,13 @@ class BuildExt(build_ext.build_ext):  # pylint: disable=too-many-ancestors
         if system() == 'Windows':
             # Pick up from LGB, just test every possible tool chain.
-            for vs in ('-GVisual Studio 16 2019', '-GVisual Studio 15 2017',
-                       '-GVisual Studio 14 2015', '-GMinGW Makefiles'):
+            for vs in (
+                "-GVisual Studio 17 2022",
+                '-GVisual Studio 16 2019',
+                '-GVisual Studio 15 2017',
+                '-GVisual Studio 14 2015',
+                '-GMinGW Makefiles',
+            ):
                 try:
                     self.build(src_dir, build_dir, vs)
                     self.logger.info(
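For context, the change above simply prepends the VS 2022 generator to the list of tool chains that the build probes. Below is a standalone sketch of the same probing pattern, not the package's build helper itself; the function name is hypothetical and it assumes cmake is on PATH.

# Sketch: try each CMake generator in order and keep the first one that configures.
import subprocess

def first_working_generator(source_dir: str, build_dir: str) -> str:
    generators = (
        "-GVisual Studio 17 2022",
        "-GVisual Studio 16 2019",
        "-GVisual Studio 15 2017",
        "-GVisual Studio 14 2015",
        "-GMinGW Makefiles",
    )
    for gen in generators:
        try:
            subprocess.check_call(["cmake", "-S", source_dir, "-B", build_dir, gen])
            return gen
        except subprocess.CalledProcessError:
            continue
    raise RuntimeError("no usable CMake generator found")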
@@ -25,6 +25,9 @@ from .libpath import find_lib_path

 # c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
 c_bst_ulong = ctypes.c_uint64
+# xgboost accepts some other possible types in practice due to historical reason, which is
+# lesser tested.  For now we encourage users to pass a simple list of string.
+FeatNamesT = Optional[List[str]]


 class XGBoostError(ValueError):
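The new FeatNamesT alias is just Optional[List[str]], now shared by DMatrix, DeviceQuantileDMatrix, and the Booster setter further down. A small illustration of what the alias covers, with made-up data and names:

# Feature names are an optional list of strings on both DMatrix and Booster.
import numpy as np
import xgboost as xgb

X = np.random.rand(20, 3)
y = np.random.rand(20)

names = ["age", "income", "score"]           # FeatNamesT is Optional[List[str]]
dtrain = xgb.DMatrix(X, label=y, feature_names=names)
print(dtrain.feature_names)                  # ['age', 'income', 'score']

booster = xgb.train({"objective": "reg:squarederror"}, dtrain, num_boost_round=2)
booster.feature_names = names                # the Booster setter accepts the same type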
@@ -328,7 +331,7 @@ class DataIter:  # pylint: disable=too-many-instance-attributes
         self._enable_categorical = False
         self._allow_host = True
         # Stage data in Python until reset or next is called to avoid data being free.
-        self._temporary_data = None
+        self._temporary_data: Optional[Tuple[Any, Any]] = None

     def _get_callbacks(
         self, allow_host: bool, enable_categorical: bool
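For readers unfamiliar with DataIter: it drives batch-wise construction of quantile DMatrix objects, and _temporary_data is where the current batch is staged so it is not garbage collected mid-iteration. Below is a rough sketch of the iterator protocol, modeled on the internal SingleBatchInternalIter; the import path and exact hooks are assumptions rather than public API.

# Sketch of a single-batch iterator (assumes the internal DataIter interface).
import numpy as np
from xgboost.core import DataIter  # assumption: internal import path

class SingleBatchIter(DataIter):
    def __init__(self, X: np.ndarray, y: np.ndarray) -> None:
        self.X, self.y = X, y
        self.it = 0
        super().__init__()

    def next(self, input_data) -> int:
        if self.it == 1:
            return 0                      # no more batches
        self.it += 1
        input_data(data=self.X, label=self.y)
        return 1                          # a batch was produced

    def reset(self) -> None:
        self.it = 0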
@@ -397,7 +400,7 @@ class DataIter:  # pylint: disable=too-many-instance-attributes
         def data_handle(
             data: Any,
             *,
-            feature_names: Optional[List[str]] = None,
+            feature_names: FeatNamesT = None,
             feature_types: Optional[List[str]] = None,
             **kwargs: Any,
         ) -> None:
@@ -516,7 +519,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
         base_margin=None,
         missing: Optional[float] = None,
         silent=False,
-        feature_names: Optional[List[str]] = None,
+        feature_names: FeatNamesT = None,
         feature_types: Optional[List[str]] = None,
         nthread: Optional[int] = None,
         group=None,
@@ -673,7 +676,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
         qid=None,
         label_lower_bound=None,
         label_upper_bound=None,
-        feature_names: Optional[List[str]] = None,
+        feature_names: FeatNamesT = None,
         feature_types: Optional[List[str]] = None,
         feature_weights=None
     ) -> None:
@@ -978,7 +981,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
         return feature_names

     @feature_names.setter
-    def feature_names(self, feature_names: Optional[Union[List[str], str]]) -> None:
+    def feature_names(self, feature_names: FeatNamesT) -> None:
         """Set feature names (column labels).

         Parameters
@@ -1163,7 +1166,7 @@ class DeviceQuantileDMatrix(DMatrix):
         base_margin=None,
         missing=None,
         silent=False,
-        feature_names=None,
+        feature_names: FeatNamesT = None,
         feature_types=None,
         nthread: Optional[int] = None,
         max_bin: int = 256,
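One of the fixes listed in the commit message concerns base_margin with DeviceQuantileDMatrix. As a hedged illustration of that code path (it needs a GPU build of xgboost plus cupy, and all values here are synthetic):

# DeviceQuantileDMatrix takes device data; base_margin seeds the initial prediction.
import cupy as cp
import xgboost as xgb

X = cp.random.rand(100, 4)
y = cp.random.rand(100)
margin = cp.zeros(100)                      # per-row starting prediction

dtrain = xgb.DeviceQuantileDMatrix(X, label=y, base_margin=margin)
booster = xgb.train({"tree_method": "gpu_hist"}, dtrain, num_boost_round=2)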
@@ -1644,7 +1647,7 @@ class Booster:
         return self._get_feature_info("feature_name")

     @feature_names.setter
-    def feature_names(self, features: Optional[List[str]]) -> None:
+    def feature_names(self, features: FeatNamesT) -> None:
         self._set_feature_info(features, "feature_name")

     def set_param(self, params, value=None):
File diff suppressed because it is too large (presumably the dask module itself, given the commit title).
@@ -11,7 +11,7 @@ import numpy as np

 from .core import c_array, _LIB, _check_call, c_str
 from .core import _cuda_array_interface
-from .core import DataIter, _ProxyDMatrix, DMatrix
+from .core import DataIter, _ProxyDMatrix, DMatrix, FeatNamesT
 from .compat import lazy_isinstance, DataFrame

 c_bst_ulong = ctypes.c_uint64  # pylint: disable=invalid-name
@@ -68,7 +68,7 @@ def _from_scipy_csr(
     data,
     missing,
     nthread,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
 ):
     """Initialize data from a CSR matrix."""
@@ -107,7 +107,7 @@ def _is_scipy_csc(data):
 def _from_scipy_csc(
     data,
     missing,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
 ):
     if len(data.indices) != len(data.data):
@@ -163,7 +163,7 @@ def _from_numpy_array(
     data,
     missing,
     nthread,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
 ):
     """Initialize data from a 2-D numpy matrix.
@@ -244,11 +244,11 @@ be set to `True`.""" + err
 def _transform_pandas_df(
     data: DataFrame,
     enable_categorical: bool,
-    feature_names: Optional[List[str]] = None,
+    feature_names: FeatNamesT = None,
     feature_types: Optional[List[str]] = None,
     meta: Optional[str] = None,
     meta_type: Optional[str] = None,
-) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
+) -> Tuple[np.ndarray, FeatNamesT, Optional[List[str]]]:
     import pandas as pd
     from pandas.api.types import is_sparse, is_categorical_dtype

@@ -282,7 +282,8 @@ def _transform_pandas_df(

     # handle category codes.
     transformed = pd.DataFrame()
-    if enable_categorical:
+    # Avoid transformation due to: PerformanceWarning: DataFrame is highly fragmented
+    if enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes):
         for i, dtype in enumerate(data.dtypes):
             if is_categorical_dtype(dtype):
                 # pandas uses -1 as default missing value for categorical data
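The reworked condition above means category columns are only re-encoded when enable_categorical is requested and at least one column actually has a category dtype, which avoids the pandas fragmentation warning for purely numeric frames. A toy example of the categorical path, with a made-up frame:

# Categorical columns are passed through as category codes when enabled.
import pandas as pd
import xgboost as xgb

df = pd.DataFrame(
    {
        "color": pd.Categorical(["red", "green", "red", "blue"]),
        "size": [1.0, 2.0, 3.0, 4.0],
    }
)
y = [0, 1, 0, 1]

dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)
print(dtrain.feature_types)   # e.g. ['c', 'float'] -- 'c' marks a categorical column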
@@ -311,9 +312,9 @@ def _from_pandas_df(
     enable_categorical: bool,
     missing: float,
     nthread: int,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
-) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]:
+) -> Tuple[ctypes.c_void_p, FeatNamesT, Optional[List[str]]]:
     data, feature_names, feature_types = _transform_pandas_df(
         data, enable_categorical, feature_names, feature_types
     )
@@ -353,7 +354,7 @@ def _from_pandas_series(
     missing: float,
     nthread: int,
     enable_categorical: bool,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
 ):
     from pandas.api.types import is_categorical_dtype
@@ -384,7 +385,7 @@ _dt_type_mapper2 = {'bool': 'i', 'int': 'int', 'real': 'float'}

 def _transform_dt_df(
     data,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
     meta=None,
     meta_type=None,
@@ -425,10 +426,10 @@ def _from_dt_df(
     data,
     missing,
     nthread,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
     enable_categorical: bool,
-) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]:
+) -> Tuple[ctypes.c_void_p, FeatNamesT, Optional[List[str]]]:
     if enable_categorical:
         raise ValueError("categorical data in datatable is not supported yet.")
     data, feature_names, feature_types = _transform_dt_df(
@@ -510,7 +511,7 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes:

 def _transform_cudf_df(
     data,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
     enable_categorical: bool,
 ):
@@ -576,7 +577,7 @@ def _from_cudf_df(
     data,
     missing,
     nthread,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
     enable_categorical: bool,
 ) -> Tuple[ctypes.c_void_p, Any, Any]:
@@ -626,7 +627,7 @@ def _from_cupy_array(
     data,
     missing,
     nthread,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
 ):
     """Initialize DMatrix from cupy ndarray."""
@@ -673,7 +674,7 @@ def _from_dlpack(
     data,
     missing,
     nthread,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
 ):
     data = _transform_dlpack(data)
@@ -688,7 +689,7 @@ def _is_uri(data):
 def _from_uri(
     data,
     missing,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
 ):
     _warn_unused_missing(data, missing)
@@ -708,7 +709,7 @@ def _from_list(
     data,
     missing,
     n_threads,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
 ):
     array = np.array(data)
@@ -724,7 +725,7 @@ def _from_tuple(
     data,
     missing,
     n_threads,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
 ):
     return _from_list(data, missing, n_threads, feature_names, feature_types)
@@ -760,7 +761,7 @@ def dispatch_data_backend(
     data,
     missing,
     threads,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
     enable_categorical: bool = False,
 ):
@@ -988,7 +989,7 @@ class SingleBatchInternalIter(DataIter):  # pylint: disable=R0902

 def _proxy_transform(
     data,
-    feature_names: Optional[List[str]],
+    feature_names: FeatNamesT,
     feature_types: Optional[List[str]],
     enable_categorical: bool,
 ):