Refactor pandas dataframe handling. (#7843)

This commit is contained in:
Jiaming Yuan 2022-04-26 18:53:43 +08:00 committed by GitHub
parent bef1f939ce
commit ad06172c6b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -6,7 +6,7 @@ from distutils import version
import json import json
import warnings import warnings
import os import os
from typing import Any, Tuple, Callable, Optional, List, Union, Iterator from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type
import numpy as np import numpy as np
@ -21,8 +21,6 @@ c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
CAT_T = "c" CAT_T = "c"
# meta info that can be a matrix instead of vector. # meta info that can be a matrix instead of vector.
# For now it's base_margin for multi-class, but it can be extended to label once we have
# multi-output.
_matrix_meta = {"base_margin", "label"} _matrix_meta = {"base_margin", "label"}
@ -253,41 +251,19 @@ def _invalid_dataframe_dtype(data: Any) -> None:
raise ValueError(msg) raise ValueError(msg)
# pylint: disable=too-many-locals def _pandas_feature_info(
def _transform_pandas_df(
data: DataFrame, data: DataFrame,
meta: Optional[str],
feature_names: FeatureNames,
feature_types: FeatureTypes,
enable_categorical: bool, enable_categorical: bool,
feature_names: FeatureNames = None, ) -> Tuple[FeatureNames, FeatureTypes]:
feature_types: Optional[List[str]] = None,
meta: Optional[str] = None,
meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatureNames, Optional[List[str]]]:
import pandas as pd import pandas as pd
from pandas.api.types import ( from pandas.api.types import (
is_sparse, is_sparse,
is_categorical_dtype, is_categorical_dtype,
is_integer_dtype,
is_bool_dtype,
) )
nullable_alias = {"Int16", "Int32", "Int64"}
# dtype: pd.core.arrays.numeric.NumericDtype
def is_nullable_dtype(dtype: Any) -> bool:
is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
# np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
return is_int or is_bool
if not all(
dtype.name in _pandas_dtype_mapper
or is_sparse(dtype)
or is_nullable_dtype(dtype)
or (is_categorical_dtype(dtype) and enable_categorical)
for dtype in data.dtypes
):
_invalid_dataframe_dtype(data)
# handle feature names # handle feature names
if feature_names is None and meta is None: if feature_names is None and meta is None:
if isinstance(data.columns, pd.MultiIndex): if isinstance(data.columns, pd.MultiIndex):
@ -300,43 +276,94 @@ def _transform_pandas_df(
# handle feature types # handle feature types
if feature_types is None and meta is None: if feature_types is None and meta is None:
feature_types = [] feature_types = []
for i, dtype in enumerate(data.dtypes): for dtype in data.dtypes:
if is_sparse(dtype): if is_sparse(dtype):
feature_types.append(_pandas_dtype_mapper[dtype.subtype.name]) feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
elif is_categorical_dtype(dtype) and enable_categorical: elif is_categorical_dtype(dtype) and enable_categorical:
feature_types.append(CAT_T) feature_types.append(CAT_T)
else: else:
feature_types.append(_pandas_dtype_mapper[dtype.name]) feature_types.append(_pandas_dtype_mapper[dtype.name])
return feature_names, feature_types
# handle category codes.
def is_nullable_dtype(dtype: Any) -> bool:
    """Whether dtype is a pandas nullable (extension) type.

    Nullable dtypes use ``pd.NA`` for missing values and need an explicit
    conversion to float before XGBoost can consume them.
    """
    from pandas.api.types import is_integer_dtype, is_bool_dtype

    # dtype: pd.core.arrays.numeric.NumericDtype
    # Names of the pandas nullable integer dtypes (capitalized, e.g. "Int64"
    # for pd.Int64Dtype) as opposed to numpy's lowercase "int64".  pandas
    # provides 8-, 16-, 32- and 64-bit signed and unsigned variants.
    nullable_alias = {
        "Int8",
        "Int16",
        "Int32",
        "Int64",
        "UInt8",
        "UInt16",
        "UInt32",
        "UInt64",
    }
    is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
    is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
    return is_int or is_bool
def _pandas_cat_null(data: DataFrame) -> DataFrame:
    """Convert categorical columns to their codes and nullable columns to float.

    Returns the input DataFrame unchanged when no conversion is needed;
    otherwise returns a converted copy (the input is never mutated).
    """
    from pandas.api.types import is_categorical_dtype

    # Partition the columns that need conversion in a single pass.
    categorical_cols = []
    nullable_cols = []
    for name, dtype in zip(data.columns, data.dtypes):
        if is_categorical_dtype(dtype):
            categorical_cols.append(name)
        if is_nullable_dtype(dtype):
            nullable_cols.append(name)

    if not categorical_cols and not nullable_cols:
        # Nothing to convert; hand back the original frame as-is.
        return data

    # Copy up front instead of assembling a new frame column by column, which
    # triggers "PerformanceWarning: DataFrame is highly fragmented".
    transformed = data.copy()

    if categorical_cols:
        # A DataFrame has no `.cat` accessor, so go through apply to reach the
        # per-column codes.  pandas encodes missing categorical values as -1;
        # map that to NaN so XGBoost treats it as missing.
        codes = transformed[categorical_cols].apply(lambda x: x.cat.codes)
        transformed[categorical_cols] = codes.astype(np.float32).replace(
            -1.0, np.NaN
        )

    if nullable_cols:
        # Casting to float turns integer/boolean <NA> into NaN.
        transformed[nullable_cols] = transformed[nullable_cols].astype(np.float32)

    return transformed
def _transform_pandas_df(
    data: DataFrame,
    enable_categorical: bool,
    feature_names: FeatureNames = None,
    feature_types: FeatureTypes = None,
    meta: Optional[str] = None,
    meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]:
    """Validate and convert a pandas DataFrame into a numpy array.

    Parameters
    ----------
    data : the input DataFrame.
    enable_categorical : whether categorical dtypes are accepted as features.
    feature_names, feature_types : optional overrides; inferred from the
        DataFrame when not supplied and ``meta`` is None.
    meta : name of the meta info field (e.g. "label") when the DataFrame
        carries meta info instead of features.
    meta_type : optional dtype to cast the returned array to.  When absent the
        array keeps whatever dtype ``DataFrame.values`` produces (downstream
        code handles the final conversion).

    Returns
    -------
    (array, feature_names, feature_types)

    Raises
    ------
    ValueError : on unsupported column dtypes, or when a multi-column
        DataFrame is passed for meta info that must be a vector.
    """
    from pandas.api.types import (
        is_sparse,
        is_categorical_dtype,
    )

    # Every column must be a known numpy dtype, sparse, nullable, or (when
    # enabled) categorical; otherwise reject the whole frame.
    if not all(
        dtype.name in _pandas_dtype_mapper
        or is_sparse(dtype)
        or is_nullable_dtype(dtype)
        or (is_categorical_dtype(dtype) and enable_categorical)
        for dtype in data.dtypes
    ):
        _invalid_dataframe_dtype(data)

    # Fail fast before copying/transforming the frame: meta info other than
    # the entries in _matrix_meta must be a single column.
    if meta and len(data.columns) > 1 and meta not in _matrix_meta:
        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

    feature_names, feature_types = _pandas_feature_info(
        data, meta, feature_names, feature_types, enable_categorical
    )

    transformed = _pandas_cat_null(data)

    arr: np.ndarray = transformed.values
    if meta_type:
        # Cast only when an explicit target dtype is requested.
        arr = arr.astype(meta_type)
    return arr, feature_names, feature_types