[dask] Cleanup dask module. (#7634)

* Add a new utility for mapping function onto workers.
* Unify the type for feature names.
* Clean up the iterator.
* Fix prediction with DaskDMatrix worker specification.
* Fix base margin with DeviceQuantileDMatrix.
* Support VS 2022 in setup.py.
This commit is contained in:
Jiaming Yuan 2022-02-08 20:41:46 +08:00 committed by GitHub
parent 926af9951e
commit fe4ce920b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 369 additions and 389 deletions

View File

@@ -171,8 +171,13 @@ class BuildExt(build_ext.build_ext): # pylint: disable=too-many-ancestors
if system() == 'Windows': if system() == 'Windows':
# Pick up from LGB, just test every possible tool chain. # Pick up from LGB, just test every possible tool chain.
for vs in ('-GVisual Studio 16 2019', '-GVisual Studio 15 2017', for vs in (
'-GVisual Studio 14 2015', '-GMinGW Makefiles'): "-GVisual Studio 17 2022",
'-GVisual Studio 16 2019',
'-GVisual Studio 15 2017',
'-GVisual Studio 14 2015',
'-GMinGW Makefiles',
):
try: try:
self.build(src_dir, build_dir, vs) self.build(src_dir, build_dir, vs)
self.logger.info( self.logger.info(

View File

@@ -25,6 +25,9 @@ from .libpath import find_lib_path
# c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h # c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
c_bst_ulong = ctypes.c_uint64 c_bst_ulong = ctypes.c_uint64
# xgboost accepts some other possible types in practice due to historical reason, which is
# lesser tested. For now we encourage users to pass a simple list of string.
FeatNamesT = Optional[List[str]]
class XGBoostError(ValueError): class XGBoostError(ValueError):
@@ -328,7 +331,7 @@ class DataIter: # pylint: disable=too-many-instance-attributes
self._enable_categorical = False self._enable_categorical = False
self._allow_host = True self._allow_host = True
# Stage data in Python until reset or next is called to avoid data being free. # Stage data in Python until reset or next is called to avoid data being free.
self._temporary_data = None self._temporary_data: Optional[Tuple[Any, Any]] = None
def _get_callbacks( def _get_callbacks(
self, allow_host: bool, enable_categorical: bool self, allow_host: bool, enable_categorical: bool
@@ -397,7 +400,7 @@ class DataIter: # pylint: disable=too-many-instance-attributes
def data_handle( def data_handle(
data: Any, data: Any,
*, *,
feature_names: Optional[List[str]] = None, feature_names: FeatNamesT = None,
feature_types: Optional[List[str]] = None, feature_types: Optional[List[str]] = None,
**kwargs: Any, **kwargs: Any,
) -> None: ) -> None:
@@ -516,7 +519,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
base_margin=None, base_margin=None,
missing: Optional[float] = None, missing: Optional[float] = None,
silent=False, silent=False,
feature_names: Optional[List[str]] = None, feature_names: FeatNamesT = None,
feature_types: Optional[List[str]] = None, feature_types: Optional[List[str]] = None,
nthread: Optional[int] = None, nthread: Optional[int] = None,
group=None, group=None,
@@ -673,7 +676,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
qid=None, qid=None,
label_lower_bound=None, label_lower_bound=None,
label_upper_bound=None, label_upper_bound=None,
feature_names: Optional[List[str]] = None, feature_names: FeatNamesT = None,
feature_types: Optional[List[str]] = None, feature_types: Optional[List[str]] = None,
feature_weights=None feature_weights=None
) -> None: ) -> None:
@@ -978,7 +981,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
return feature_names return feature_names
@feature_names.setter @feature_names.setter
def feature_names(self, feature_names: Optional[Union[List[str], str]]) -> None: def feature_names(self, feature_names: FeatNamesT) -> None:
"""Set feature names (column labels). """Set feature names (column labels).
Parameters Parameters
@@ -1163,7 +1166,7 @@ class DeviceQuantileDMatrix(DMatrix):
base_margin=None, base_margin=None,
missing=None, missing=None,
silent=False, silent=False,
feature_names=None, feature_names: FeatNamesT = None,
feature_types=None, feature_types=None,
nthread: Optional[int] = None, nthread: Optional[int] = None,
max_bin: int = 256, max_bin: int = 256,
@@ -1644,7 +1647,7 @@ class Booster:
return self._get_feature_info("feature_name") return self._get_feature_info("feature_name")
@feature_names.setter @feature_names.setter
def feature_names(self, features: Optional[List[str]]) -> None: def feature_names(self, features: FeatNamesT) -> None:
self._set_feature_info(features, "feature_name") self._set_feature_info(features, "feature_name")
def set_param(self, params, value=None): def set_param(self, params, value=None):

File diff suppressed because it is too large Load Diff

View File

@@ -11,7 +11,7 @@ import numpy as np
from .core import c_array, _LIB, _check_call, c_str from .core import c_array, _LIB, _check_call, c_str
from .core import _cuda_array_interface from .core import _cuda_array_interface
from .core import DataIter, _ProxyDMatrix, DMatrix from .core import DataIter, _ProxyDMatrix, DMatrix, FeatNamesT
from .compat import lazy_isinstance, DataFrame from .compat import lazy_isinstance, DataFrame
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
@@ -68,7 +68,7 @@ def _from_scipy_csr(
data, data,
missing, missing,
nthread, nthread,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
): ):
"""Initialize data from a CSR matrix.""" """Initialize data from a CSR matrix."""
@@ -107,7 +107,7 @@ def _is_scipy_csc(data):
def _from_scipy_csc( def _from_scipy_csc(
data, data,
missing, missing,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
): ):
if len(data.indices) != len(data.data): if len(data.indices) != len(data.data):
@@ -163,7 +163,7 @@ def _from_numpy_array(
data, data,
missing, missing,
nthread, nthread,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
): ):
"""Initialize data from a 2-D numpy matrix. """Initialize data from a 2-D numpy matrix.
@@ -244,11 +244,11 @@ be set to `True`.""" + err
def _transform_pandas_df( def _transform_pandas_df(
data: DataFrame, data: DataFrame,
enable_categorical: bool, enable_categorical: bool,
feature_names: Optional[List[str]] = None, feature_names: FeatNamesT = None,
feature_types: Optional[List[str]] = None, feature_types: Optional[List[str]] = None,
meta: Optional[str] = None, meta: Optional[str] = None,
meta_type: Optional[str] = None, meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]: ) -> Tuple[np.ndarray, FeatNamesT, Optional[List[str]]]:
import pandas as pd import pandas as pd
from pandas.api.types import is_sparse, is_categorical_dtype from pandas.api.types import is_sparse, is_categorical_dtype
@@ -282,7 +282,8 @@ def _transform_pandas_df(
# handle category codes. # handle category codes.
transformed = pd.DataFrame() transformed = pd.DataFrame()
if enable_categorical: # Avoid transformation due to: PerformanceWarning: DataFrame is highly fragmented
if enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes):
for i, dtype in enumerate(data.dtypes): for i, dtype in enumerate(data.dtypes):
if is_categorical_dtype(dtype): if is_categorical_dtype(dtype):
# pandas uses -1 as default missing value for categorical data # pandas uses -1 as default missing value for categorical data
@@ -311,9 +312,9 @@ def _from_pandas_df(
enable_categorical: bool, enable_categorical: bool,
missing: float, missing: float,
nthread: int, nthread: int,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]: ) -> Tuple[ctypes.c_void_p, FeatNamesT, Optional[List[str]]]:
data, feature_names, feature_types = _transform_pandas_df( data, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types data, enable_categorical, feature_names, feature_types
) )
@@ -353,7 +354,7 @@ def _from_pandas_series(
missing: float, missing: float,
nthread: int, nthread: int,
enable_categorical: bool, enable_categorical: bool,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
): ):
from pandas.api.types import is_categorical_dtype from pandas.api.types import is_categorical_dtype
@@ -384,7 +385,7 @@ _dt_type_mapper2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
def _transform_dt_df( def _transform_dt_df(
data, data,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
meta=None, meta=None,
meta_type=None, meta_type=None,
@@ -425,10 +426,10 @@ def _from_dt_df(
data, data,
missing, missing,
nthread, nthread,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
enable_categorical: bool, enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]: ) -> Tuple[ctypes.c_void_p, FeatNamesT, Optional[List[str]]]:
if enable_categorical: if enable_categorical:
raise ValueError("categorical data in datatable is not supported yet.") raise ValueError("categorical data in datatable is not supported yet.")
data, feature_names, feature_types = _transform_dt_df( data, feature_names, feature_types = _transform_dt_df(
@@ -510,7 +511,7 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
def _transform_cudf_df( def _transform_cudf_df(
data, data,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
enable_categorical: bool, enable_categorical: bool,
): ):
@@ -576,7 +577,7 @@ def _from_cudf_df(
data, data,
missing, missing,
nthread, nthread,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
enable_categorical: bool, enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, Any, Any]: ) -> Tuple[ctypes.c_void_p, Any, Any]:
@@ -626,7 +627,7 @@ def _from_cupy_array(
data, data,
missing, missing,
nthread, nthread,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
): ):
"""Initialize DMatrix from cupy ndarray.""" """Initialize DMatrix from cupy ndarray."""
@@ -673,7 +674,7 @@ def _from_dlpack(
data, data,
missing, missing,
nthread, nthread,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
): ):
data = _transform_dlpack(data) data = _transform_dlpack(data)
@@ -688,7 +689,7 @@ def _is_uri(data):
def _from_uri( def _from_uri(
data, data,
missing, missing,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
): ):
_warn_unused_missing(data, missing) _warn_unused_missing(data, missing)
@@ -708,7 +709,7 @@ def _from_list(
data, data,
missing, missing,
n_threads, n_threads,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
): ):
array = np.array(data) array = np.array(data)
@@ -724,7 +725,7 @@ def _from_tuple(
data, data,
missing, missing,
n_threads, n_threads,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
): ):
return _from_list(data, missing, n_threads, feature_names, feature_types) return _from_list(data, missing, n_threads, feature_names, feature_types)
@@ -760,7 +761,7 @@ def dispatch_data_backend(
data, data,
missing, missing,
threads, threads,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
enable_categorical: bool = False, enable_categorical: bool = False,
): ):
@@ -988,7 +989,7 @@ class SingleBatchInternalIter(DataIter): # pylint: disable=R0902
def _proxy_transform( def _proxy_transform(
data, data,
feature_names: Optional[List[str]], feature_names: FeatNamesT,
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
enable_categorical: bool, enable_categorical: bool,
): ):