[dask] Cleanup dask module. (#7634)

* Add a new utility for mapping function onto workers.
* Unify the type for feature names.
* Clean up the iterator.
* Fix prediction with DaskDMatrix worker specification.
* Fix base margin with DeviceQuantileDMatrix.
* Support VS 2022 in setup.py.
This commit is contained in:
Jiaming Yuan
2022-02-08 20:41:46 +08:00
committed by GitHub
parent 926af9951e
commit fe4ce920b2
4 changed files with 369 additions and 389 deletions

View File

@@ -25,6 +25,9 @@ from .libpath import find_lib_path
# c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
c_bst_ulong = ctypes.c_uint64
# xgboost accepts some other possible types in practice due to historical reasons, which are
# less tested. For now we encourage users to pass a simple list of strings.
FeatNamesT = Optional[List[str]]
class XGBoostError(ValueError):
@@ -328,7 +331,7 @@ class DataIter: # pylint: disable=too-many-instance-attributes
self._enable_categorical = False
self._allow_host = True
# Stage data in Python until reset or next is called to avoid data being freed.
self._temporary_data = None
self._temporary_data: Optional[Tuple[Any, Any]] = None
def _get_callbacks(
self, allow_host: bool, enable_categorical: bool
@@ -397,7 +400,7 @@ class DataIter: # pylint: disable=too-many-instance-attributes
def data_handle(
data: Any,
*,
feature_names: Optional[List[str]] = None,
feature_names: FeatNamesT = None,
feature_types: Optional[List[str]] = None,
**kwargs: Any,
) -> None:
@@ -516,7 +519,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
base_margin=None,
missing: Optional[float] = None,
silent=False,
feature_names: Optional[List[str]] = None,
feature_names: FeatNamesT = None,
feature_types: Optional[List[str]] = None,
nthread: Optional[int] = None,
group=None,
@@ -673,7 +676,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
qid=None,
label_lower_bound=None,
label_upper_bound=None,
feature_names: Optional[List[str]] = None,
feature_names: FeatNamesT = None,
feature_types: Optional[List[str]] = None,
feature_weights=None
) -> None:
@@ -978,7 +981,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
return feature_names
@feature_names.setter
def feature_names(self, feature_names: Optional[Union[List[str], str]]) -> None:
def feature_names(self, feature_names: FeatNamesT) -> None:
"""Set feature names (column labels).
Parameters
@@ -1163,7 +1166,7 @@ class DeviceQuantileDMatrix(DMatrix):
base_margin=None,
missing=None,
silent=False,
feature_names=None,
feature_names: FeatNamesT = None,
feature_types=None,
nthread: Optional[int] = None,
max_bin: int = 256,
@@ -1644,7 +1647,7 @@ class Booster:
return self._get_feature_info("feature_name")
@feature_names.setter
def feature_names(self, features: Optional[List[str]]) -> None:
def feature_names(self, features: FeatNamesT) -> None:
self._set_feature_info(features, "feature_name")
def set_param(self, params, value=None):

File diff suppressed because it is too large Load Diff

View File

@@ -11,7 +11,7 @@ import numpy as np
from .core import c_array, _LIB, _check_call, c_str
from .core import _cuda_array_interface
from .core import DataIter, _ProxyDMatrix, DMatrix
from .core import DataIter, _ProxyDMatrix, DMatrix, FeatNamesT
from .compat import lazy_isinstance, DataFrame
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
@@ -68,7 +68,7 @@ def _from_scipy_csr(
data,
missing,
nthread,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
"""Initialize data from a CSR matrix."""
@@ -107,7 +107,7 @@ def _is_scipy_csc(data):
def _from_scipy_csc(
data,
missing,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
if len(data.indices) != len(data.data):
@@ -163,7 +163,7 @@ def _from_numpy_array(
data,
missing,
nthread,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
"""Initialize data from a 2-D numpy matrix.
@@ -244,11 +244,11 @@ be set to `True`.""" + err
def _transform_pandas_df(
data: DataFrame,
enable_categorical: bool,
feature_names: Optional[List[str]] = None,
feature_names: FeatNamesT = None,
feature_types: Optional[List[str]] = None,
meta: Optional[str] = None,
meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
) -> Tuple[np.ndarray, FeatNamesT, Optional[List[str]]]:
import pandas as pd
from pandas.api.types import is_sparse, is_categorical_dtype
@@ -282,7 +282,8 @@ def _transform_pandas_df(
# handle category codes.
transformed = pd.DataFrame()
if enable_categorical:
# Avoid transformation due to: PerformanceWarning: DataFrame is highly fragmented
if enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes):
for i, dtype in enumerate(data.dtypes):
if is_categorical_dtype(dtype):
# pandas uses -1 as default missing value for categorical data
@@ -311,9 +312,9 @@ def _from_pandas_df(
enable_categorical: bool,
missing: float,
nthread: int,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]:
) -> Tuple[ctypes.c_void_p, FeatNamesT, Optional[List[str]]]:
data, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types
)
@@ -353,7 +354,7 @@ def _from_pandas_series(
missing: float,
nthread: int,
enable_categorical: bool,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
from pandas.api.types import is_categorical_dtype
@@ -384,7 +385,7 @@ _dt_type_mapper2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
def _transform_dt_df(
data,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
meta=None,
meta_type=None,
@@ -425,10 +426,10 @@ def _from_dt_df(
data,
missing,
nthread,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]:
) -> Tuple[ctypes.c_void_p, FeatNamesT, Optional[List[str]]]:
if enable_categorical:
raise ValueError("categorical data in datatable is not supported yet.")
data, feature_names, feature_types = _transform_dt_df(
@@ -510,7 +511,7 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
def _transform_cudf_df(
data,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
enable_categorical: bool,
):
@@ -576,7 +577,7 @@ def _from_cudf_df(
data,
missing,
nthread,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, Any, Any]:
@@ -626,7 +627,7 @@ def _from_cupy_array(
data,
missing,
nthread,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
"""Initialize DMatrix from cupy ndarray."""
@@ -673,7 +674,7 @@ def _from_dlpack(
data,
missing,
nthread,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
data = _transform_dlpack(data)
@@ -688,7 +689,7 @@ def _is_uri(data):
def _from_uri(
data,
missing,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
_warn_unused_missing(data, missing)
@@ -708,7 +709,7 @@ def _from_list(
data,
missing,
n_threads,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
array = np.array(data)
@@ -724,7 +725,7 @@ def _from_tuple(
data,
missing,
n_threads,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
):
return _from_list(data, missing, n_threads, feature_names, feature_types)
@@ -760,7 +761,7 @@ def dispatch_data_backend(
data,
missing,
threads,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
enable_categorical: bool = False,
):
@@ -988,7 +989,7 @@ class SingleBatchInternalIter(DataIter): # pylint: disable=R0902
def _proxy_transform(
data,
feature_names: Optional[List[str]],
feature_names: FeatNamesT,
feature_types: Optional[List[str]],
enable_categorical: bool,
):