[dask] Cleanup dask module. (#7634)
* Add a new utility for mapping a function onto workers. * Unify the type for feature names. * Clean up the iterator. * Fix prediction with DaskDMatrix worker specification. * Fix base margin with DeviceQuantileDMatrix. * Support Visual Studio 2022 in setup.py.
This commit is contained in:
parent
926af9951e
commit
fe4ce920b2
@ -171,8 +171,13 @@ class BuildExt(build_ext.build_ext): # pylint: disable=too-many-ancestors
|
|||||||
|
|
||||||
if system() == 'Windows':
|
if system() == 'Windows':
|
||||||
# Pick up from LGB, just test every possible tool chain.
|
# Pick up from LGB, just test every possible tool chain.
|
||||||
for vs in ('-GVisual Studio 16 2019', '-GVisual Studio 15 2017',
|
for vs in (
|
||||||
'-GVisual Studio 14 2015', '-GMinGW Makefiles'):
|
"-GVisual Studio 17 2022",
|
||||||
|
'-GVisual Studio 16 2019',
|
||||||
|
'-GVisual Studio 15 2017',
|
||||||
|
'-GVisual Studio 14 2015',
|
||||||
|
'-GMinGW Makefiles',
|
||||||
|
):
|
||||||
try:
|
try:
|
||||||
self.build(src_dir, build_dir, vs)
|
self.build(src_dir, build_dir, vs)
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
|
|||||||
@ -25,6 +25,9 @@ from .libpath import find_lib_path
|
|||||||
|
|
||||||
# c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
|
# c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
|
||||||
c_bst_ulong = ctypes.c_uint64
|
c_bst_ulong = ctypes.c_uint64
|
||||||
|
# xgboost accepts some other possible types in practice for historical reasons, which are
|
||||||
|
# less tested. For now we encourage users to pass a simple list of strings.
|
||||||
|
FeatNamesT = Optional[List[str]]
|
||||||
|
|
||||||
|
|
||||||
class XGBoostError(ValueError):
|
class XGBoostError(ValueError):
|
||||||
@ -328,7 +331,7 @@ class DataIter: # pylint: disable=too-many-instance-attributes
|
|||||||
self._enable_categorical = False
|
self._enable_categorical = False
|
||||||
self._allow_host = True
|
self._allow_host = True
|
||||||
# Stage data in Python until reset or next is called to avoid data being freed.
|
# Stage data in Python until reset or next is called to avoid data being freed.
|
||||||
self._temporary_data = None
|
self._temporary_data: Optional[Tuple[Any, Any]] = None
|
||||||
|
|
||||||
def _get_callbacks(
|
def _get_callbacks(
|
||||||
self, allow_host: bool, enable_categorical: bool
|
self, allow_host: bool, enable_categorical: bool
|
||||||
@ -397,7 +400,7 @@ class DataIter: # pylint: disable=too-many-instance-attributes
|
|||||||
def data_handle(
|
def data_handle(
|
||||||
data: Any,
|
data: Any,
|
||||||
*,
|
*,
|
||||||
feature_names: Optional[List[str]] = None,
|
feature_names: FeatNamesT = None,
|
||||||
feature_types: Optional[List[str]] = None,
|
feature_types: Optional[List[str]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> None:
|
) -> None:
|
||||||
@ -516,7 +519,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
|||||||
base_margin=None,
|
base_margin=None,
|
||||||
missing: Optional[float] = None,
|
missing: Optional[float] = None,
|
||||||
silent=False,
|
silent=False,
|
||||||
feature_names: Optional[List[str]] = None,
|
feature_names: FeatNamesT = None,
|
||||||
feature_types: Optional[List[str]] = None,
|
feature_types: Optional[List[str]] = None,
|
||||||
nthread: Optional[int] = None,
|
nthread: Optional[int] = None,
|
||||||
group=None,
|
group=None,
|
||||||
@ -673,7 +676,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
|||||||
qid=None,
|
qid=None,
|
||||||
label_lower_bound=None,
|
label_lower_bound=None,
|
||||||
label_upper_bound=None,
|
label_upper_bound=None,
|
||||||
feature_names: Optional[List[str]] = None,
|
feature_names: FeatNamesT = None,
|
||||||
feature_types: Optional[List[str]] = None,
|
feature_types: Optional[List[str]] = None,
|
||||||
feature_weights=None
|
feature_weights=None
|
||||||
) -> None:
|
) -> None:
|
||||||
@ -978,7 +981,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
|||||||
return feature_names
|
return feature_names
|
||||||
|
|
||||||
@feature_names.setter
|
@feature_names.setter
|
||||||
def feature_names(self, feature_names: Optional[Union[List[str], str]]) -> None:
|
def feature_names(self, feature_names: FeatNamesT) -> None:
|
||||||
"""Set feature names (column labels).
|
"""Set feature names (column labels).
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
@ -1163,7 +1166,7 @@ class DeviceQuantileDMatrix(DMatrix):
|
|||||||
base_margin=None,
|
base_margin=None,
|
||||||
missing=None,
|
missing=None,
|
||||||
silent=False,
|
silent=False,
|
||||||
feature_names=None,
|
feature_names: FeatNamesT = None,
|
||||||
feature_types=None,
|
feature_types=None,
|
||||||
nthread: Optional[int] = None,
|
nthread: Optional[int] = None,
|
||||||
max_bin: int = 256,
|
max_bin: int = 256,
|
||||||
@ -1644,7 +1647,7 @@ class Booster:
|
|||||||
return self._get_feature_info("feature_name")
|
return self._get_feature_info("feature_name")
|
||||||
|
|
||||||
@feature_names.setter
|
@feature_names.setter
|
||||||
def feature_names(self, features: Optional[List[str]]) -> None:
|
def feature_names(self, features: FeatNamesT) -> None:
|
||||||
self._set_feature_info(features, "feature_name")
|
self._set_feature_info(features, "feature_name")
|
||||||
|
|
||||||
def set_param(self, params, value=None):
|
def set_param(self, params, value=None):
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -11,7 +11,7 @@ import numpy as np
|
|||||||
|
|
||||||
from .core import c_array, _LIB, _check_call, c_str
|
from .core import c_array, _LIB, _check_call, c_str
|
||||||
from .core import _cuda_array_interface
|
from .core import _cuda_array_interface
|
||||||
from .core import DataIter, _ProxyDMatrix, DMatrix
|
from .core import DataIter, _ProxyDMatrix, DMatrix, FeatNamesT
|
||||||
from .compat import lazy_isinstance, DataFrame
|
from .compat import lazy_isinstance, DataFrame
|
||||||
|
|
||||||
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
|
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
|
||||||
@ -68,7 +68,7 @@ def _from_scipy_csr(
|
|||||||
data,
|
data,
|
||||||
missing,
|
missing,
|
||||||
nthread,
|
nthread,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
):
|
):
|
||||||
"""Initialize data from a CSR matrix."""
|
"""Initialize data from a CSR matrix."""
|
||||||
@ -107,7 +107,7 @@ def _is_scipy_csc(data):
|
|||||||
def _from_scipy_csc(
|
def _from_scipy_csc(
|
||||||
data,
|
data,
|
||||||
missing,
|
missing,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
):
|
):
|
||||||
if len(data.indices) != len(data.data):
|
if len(data.indices) != len(data.data):
|
||||||
@ -163,7 +163,7 @@ def _from_numpy_array(
|
|||||||
data,
|
data,
|
||||||
missing,
|
missing,
|
||||||
nthread,
|
nthread,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
):
|
):
|
||||||
"""Initialize data from a 2-D numpy matrix.
|
"""Initialize data from a 2-D numpy matrix.
|
||||||
@ -244,11 +244,11 @@ be set to `True`.""" + err
|
|||||||
def _transform_pandas_df(
|
def _transform_pandas_df(
|
||||||
data: DataFrame,
|
data: DataFrame,
|
||||||
enable_categorical: bool,
|
enable_categorical: bool,
|
||||||
feature_names: Optional[List[str]] = None,
|
feature_names: FeatNamesT = None,
|
||||||
feature_types: Optional[List[str]] = None,
|
feature_types: Optional[List[str]] = None,
|
||||||
meta: Optional[str] = None,
|
meta: Optional[str] = None,
|
||||||
meta_type: Optional[str] = None,
|
meta_type: Optional[str] = None,
|
||||||
) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
|
) -> Tuple[np.ndarray, FeatNamesT, Optional[List[str]]]:
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas.api.types import is_sparse, is_categorical_dtype
|
from pandas.api.types import is_sparse, is_categorical_dtype
|
||||||
|
|
||||||
@ -282,7 +282,8 @@ def _transform_pandas_df(
|
|||||||
|
|
||||||
# handle category codes.
|
# handle category codes.
|
||||||
transformed = pd.DataFrame()
|
transformed = pd.DataFrame()
|
||||||
if enable_categorical:
|
# Avoid transformation due to: PerformanceWarning: DataFrame is highly fragmented
|
||||||
|
if enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes):
|
||||||
for i, dtype in enumerate(data.dtypes):
|
for i, dtype in enumerate(data.dtypes):
|
||||||
if is_categorical_dtype(dtype):
|
if is_categorical_dtype(dtype):
|
||||||
# pandas uses -1 as default missing value for categorical data
|
# pandas uses -1 as default missing value for categorical data
|
||||||
@ -311,9 +312,9 @@ def _from_pandas_df(
|
|||||||
enable_categorical: bool,
|
enable_categorical: bool,
|
||||||
missing: float,
|
missing: float,
|
||||||
nthread: int,
|
nthread: int,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]:
|
) -> Tuple[ctypes.c_void_p, FeatNamesT, Optional[List[str]]]:
|
||||||
data, feature_names, feature_types = _transform_pandas_df(
|
data, feature_names, feature_types = _transform_pandas_df(
|
||||||
data, enable_categorical, feature_names, feature_types
|
data, enable_categorical, feature_names, feature_types
|
||||||
)
|
)
|
||||||
@ -353,7 +354,7 @@ def _from_pandas_series(
|
|||||||
missing: float,
|
missing: float,
|
||||||
nthread: int,
|
nthread: int,
|
||||||
enable_categorical: bool,
|
enable_categorical: bool,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
):
|
):
|
||||||
from pandas.api.types import is_categorical_dtype
|
from pandas.api.types import is_categorical_dtype
|
||||||
@ -384,7 +385,7 @@ _dt_type_mapper2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
|
|||||||
|
|
||||||
def _transform_dt_df(
|
def _transform_dt_df(
|
||||||
data,
|
data,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
meta=None,
|
meta=None,
|
||||||
meta_type=None,
|
meta_type=None,
|
||||||
@ -425,10 +426,10 @@ def _from_dt_df(
|
|||||||
data,
|
data,
|
||||||
missing,
|
missing,
|
||||||
nthread,
|
nthread,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
enable_categorical: bool,
|
enable_categorical: bool,
|
||||||
) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]:
|
) -> Tuple[ctypes.c_void_p, FeatNamesT, Optional[List[str]]]:
|
||||||
if enable_categorical:
|
if enable_categorical:
|
||||||
raise ValueError("categorical data in datatable is not supported yet.")
|
raise ValueError("categorical data in datatable is not supported yet.")
|
||||||
data, feature_names, feature_types = _transform_dt_df(
|
data, feature_names, feature_types = _transform_dt_df(
|
||||||
@ -510,7 +511,7 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
|
|||||||
|
|
||||||
def _transform_cudf_df(
|
def _transform_cudf_df(
|
||||||
data,
|
data,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
enable_categorical: bool,
|
enable_categorical: bool,
|
||||||
):
|
):
|
||||||
@ -576,7 +577,7 @@ def _from_cudf_df(
|
|||||||
data,
|
data,
|
||||||
missing,
|
missing,
|
||||||
nthread,
|
nthread,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
enable_categorical: bool,
|
enable_categorical: bool,
|
||||||
) -> Tuple[ctypes.c_void_p, Any, Any]:
|
) -> Tuple[ctypes.c_void_p, Any, Any]:
|
||||||
@ -626,7 +627,7 @@ def _from_cupy_array(
|
|||||||
data,
|
data,
|
||||||
missing,
|
missing,
|
||||||
nthread,
|
nthread,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
):
|
):
|
||||||
"""Initialize DMatrix from cupy ndarray."""
|
"""Initialize DMatrix from cupy ndarray."""
|
||||||
@ -673,7 +674,7 @@ def _from_dlpack(
|
|||||||
data,
|
data,
|
||||||
missing,
|
missing,
|
||||||
nthread,
|
nthread,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
):
|
):
|
||||||
data = _transform_dlpack(data)
|
data = _transform_dlpack(data)
|
||||||
@ -688,7 +689,7 @@ def _is_uri(data):
|
|||||||
def _from_uri(
|
def _from_uri(
|
||||||
data,
|
data,
|
||||||
missing,
|
missing,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
):
|
):
|
||||||
_warn_unused_missing(data, missing)
|
_warn_unused_missing(data, missing)
|
||||||
@ -708,7 +709,7 @@ def _from_list(
|
|||||||
data,
|
data,
|
||||||
missing,
|
missing,
|
||||||
n_threads,
|
n_threads,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
):
|
):
|
||||||
array = np.array(data)
|
array = np.array(data)
|
||||||
@ -724,7 +725,7 @@ def _from_tuple(
|
|||||||
data,
|
data,
|
||||||
missing,
|
missing,
|
||||||
n_threads,
|
n_threads,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
):
|
):
|
||||||
return _from_list(data, missing, n_threads, feature_names, feature_types)
|
return _from_list(data, missing, n_threads, feature_names, feature_types)
|
||||||
@ -760,7 +761,7 @@ def dispatch_data_backend(
|
|||||||
data,
|
data,
|
||||||
missing,
|
missing,
|
||||||
threads,
|
threads,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
enable_categorical: bool = False,
|
enable_categorical: bool = False,
|
||||||
):
|
):
|
||||||
@ -988,7 +989,7 @@ class SingleBatchInternalIter(DataIter): # pylint: disable=R0902
|
|||||||
|
|
||||||
def _proxy_transform(
|
def _proxy_transform(
|
||||||
data,
|
data,
|
||||||
feature_names: Optional[List[str]],
|
feature_names: FeatNamesT,
|
||||||
feature_types: Optional[List[str]],
|
feature_types: Optional[List[str]],
|
||||||
enable_categorical: bool,
|
enable_categorical: bool,
|
||||||
):
|
):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user