[dask] Cleanup dask module. (#7634)

* Add a new utility for mapping function onto workers.
* Unify the type for feature names.
* Clean up the iterator.
* Fix prediction with DaskDMatrix worker specification.
* Fix base margin with DeviceQuantileDMatrix.
* Support VS 2022 in setup.py.
This commit is contained in:
Jiaming Yuan 2022-02-08 20:41:46 +08:00 committed by GitHub
parent 926af9951e
commit fe4ce920b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 369 additions and 389 deletions

View File

@ -171,8 +171,13 @@ class BuildExt(build_ext.build_ext): # pylint: disable=too-many-ancestors
if system() == 'Windows':
# Pick up from LGB, just test every possible tool chain.
for vs in ('-GVisual Studio 16 2019', '-GVisual Studio 15 2017',
'-GVisual Studio 14 2015', '-GMinGW Makefiles'):
for vs in (
"-GVisual Studio 17 2022",
'-GVisual Studio 16 2019',
'-GVisual Studio 15 2017',
'-GVisual Studio 14 2015',
'-GMinGW Makefiles',
):
try:
self.build(src_dir, build_dir, vs)
self.logger.info(

View File

@ -25,6 +25,9 @@ from .libpath import find_lib_path
# c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
c_bst_ulong = ctypes.c_uint64
# xgboost accepts some other possible types in practice due to historical reasons, which are
# less tested. For now we encourage users to pass a simple list of strings.
FeatNamesT = Optional[List[str]]
class XGBoostError(ValueError):
@ -328,7 +331,7 @@ class DataIter: # pylint: disable=too-many-instance-attributes
self._enable_categorical = False
self._allow_host = True
# Stage data in Python until reset or next is called to avoid data being free.
self._temporary_data = None
self._temporary_data: Optional[Tuple[Any, Any]] = None
def _get_callbacks(
self, allow_host: bool, enable_categorical: bool
@ -397,7 +400,7 @@ class DataIter: # pylint: disable=too-many-instance-attributes
def data_handle(
data: Any,
*,
feature_names: Optional[List[str]] = None,
feature_names: FeatNamesT = None,
feature_types: Optional[List[str]] = None,
**kwargs: Any,
) -> None:
@ -516,7 +519,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
base_margin=None,
missing: Optional[float] = None,
silent=False,
feature_names: Optional[List[str]] = None,
feature_names: FeatNamesT = None,
feature_types: Optional[List[str]] = None,
nthread: Optional[int] = None,
group=None,
@ -673,7 +676,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
qid=None,
label_lower_bound=None,
label_upper_bound=None,
feature_names: Optional[List[str]] = None,
feature_names: FeatNamesT = None,
feature_types: Optional[List[str]] = None,
feature_weights=None
) -> None:
@ -978,7 +981,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
return feature_names
@feature_names.setter
def feature_names(self, feature_names: Optional[Union[List[str], str]]) -> None:
def feature_names(self, feature_names: FeatNamesT) -> None:
"""Set feature names (column labels).
Parameters
@ -1163,7 +1166,7 @@ class DeviceQuantileDMatrix(DMatrix):
base_margin=None,
missing=None,
silent=False,
feature_names=None,
feature_names: FeatNamesT = None,
feature_types=None,
nthread: Optional[int] = None,
max_bin: int = 256,
@ -1644,7 +1647,7 @@ class Booster:
return self._get_feature_info("feature_name")
@feature_names.setter
def feature_names(self, features: Optional[List[str]]) -> None:
def feature_names(self, features: FeatNamesT) -> None:
self._set_feature_info(features, "feature_name")
def set_param(self, params, value=None):

File diff suppressed because it is too large Load Diff

View File

@ -11,7 +11,7 @@ import numpy as np
from .core import c_array, _LIB, _check_call, c_str
from .core import _cuda_array_interface
from .core import DataIter, _ProxyDMatrix, DMatrix
from .core import DataIter, _ProxyDMatrix, DMatrix, FeatNamesT
from .compat import lazy_isinstance, DataFrame
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
@ -68,7 +68,7 @@ def _from_scipy_csr(
data,
missing,
nthread,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
"""Initialize data from a CSR matrix."""
@ -107,7 +107,7 @@ def _is_scipy_csc(data):
def _from_scipy_csc(
data,
missing,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
if len(data.indices) != len(data.data):
@ -163,7 +163,7 @@ def _from_numpy_array(
data,
missing,
nthread,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
"""Initialize data from a 2-D numpy matrix.
@ -244,11 +244,11 @@ be set to `True`.""" + err
def _transform_pandas_df(
data: DataFrame,
enable_categorical: bool,
feature_names: Optional[List[str]] = None,
feature_names: FeatNamesT = None,
feature_types: Optional[List[str]] = None,
meta: Optional[str] = None,
meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
) -> Tuple[np.ndarray, FeatNamesT, Optional[List[str]]]:
import pandas as pd
from pandas.api.types import is_sparse, is_categorical_dtype
@ -282,7 +282,8 @@ def _transform_pandas_df(
# handle category codes.
transformed = pd.DataFrame()
if enable_categorical:
# Avoid transformation due to: PerformanceWarning: DataFrame is highly fragmented
if enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes):
for i, dtype in enumerate(data.dtypes):
if is_categorical_dtype(dtype):
# pandas uses -1 as default missing value for categorical data
@ -311,9 +312,9 @@ def _from_pandas_df(
enable_categorical: bool,
missing: float,
nthread: int,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]:
) -> Tuple[ctypes.c_void_p, FeatNamesT, Optional[List[str]]]:
data, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types
)
@ -353,7 +354,7 @@ def _from_pandas_series(
missing: float,
nthread: int,
enable_categorical: bool,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
from pandas.api.types import is_categorical_dtype
@ -384,7 +385,7 @@ _dt_type_mapper2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
def _transform_dt_df(
data,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
meta=None,
meta_type=None,
@ -425,10 +426,10 @@ def _from_dt_df(
data,
missing,
nthread,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]:
) -> Tuple[ctypes.c_void_p, FeatNamesT, Optional[List[str]]]:
if enable_categorical:
raise ValueError("categorical data in datatable is not supported yet.")
data, feature_names, feature_types = _transform_dt_df(
@ -510,7 +511,7 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
def _transform_cudf_df(
data,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
enable_categorical: bool,
):
@ -576,7 +577,7 @@ def _from_cudf_df(
data,
missing,
nthread,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, Any, Any]:
@ -626,7 +627,7 @@ def _from_cupy_array(
data,
missing,
nthread,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
"""Initialize DMatrix from cupy ndarray."""
@ -673,7 +674,7 @@ def _from_dlpack(
data,
missing,
nthread,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
data = _transform_dlpack(data)
@ -688,7 +689,7 @@ def _is_uri(data):
def _from_uri(
data,
missing,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
_warn_unused_missing(data, missing)
@ -708,7 +709,7 @@ def _from_list(
data,
missing,
n_threads,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
array = np.array(data)
@ -724,7 +725,7 @@ def _from_tuple(
data,
missing,
n_threads,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
return _from_list(data, missing, n_threads, feature_names, feature_types)
@ -760,7 +761,7 @@ def dispatch_data_backend(
data,
missing,
threads,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
enable_categorical: bool = False,
):
@ -988,7 +989,7 @@ class SingleBatchInternalIter(DataIter): # pylint: disable=R0902
def _proxy_transform(
data,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
enable_categorical: bool,
):