Refactor pandas dataframe handling. (#7843)

This commit is contained in:
Jiaming Yuan 2022-04-26 18:53:43 +08:00 committed by GitHub
parent bef1f939ce
commit ad06172c6b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -6,7 +6,7 @@ from distutils import version
import json
import warnings
import os
from typing import Any, Tuple, Callable, Optional, List, Union, Iterator
from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type
import numpy as np
@ -21,8 +21,6 @@ c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
CAT_T = "c"
# meta info that can be a matrix instead of vector.
# For now it's base_margin for multi-class, but it can be extended to label once we have
# multi-output.
_matrix_meta = {"base_margin", "label"}
@ -253,41 +251,19 @@ def _invalid_dataframe_dtype(data: Any) -> None:
raise ValueError(msg)
# pylint: disable=too-many-locals
def _transform_pandas_df(
def _pandas_feature_info(
data: DataFrame,
meta: Optional[str],
feature_names: FeatureNames,
feature_types: FeatureTypes,
enable_categorical: bool,
feature_names: FeatureNames = None,
feature_types: Optional[List[str]] = None,
meta: Optional[str] = None,
meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatureNames, Optional[List[str]]]:
) -> Tuple[FeatureNames, FeatureTypes]:
import pandas as pd
from pandas.api.types import (
is_sparse,
is_categorical_dtype,
is_integer_dtype,
is_bool_dtype,
)
nullable_alias = {"Int16", "Int32", "Int64"}
# dtype: pd.core.arrays.numeric.NumericDtype
def is_nullable_dtype(dtype: Any) -> bool:
is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
# np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
return is_int or is_bool
if not all(
dtype.name in _pandas_dtype_mapper
or is_sparse(dtype)
or is_nullable_dtype(dtype)
or (is_categorical_dtype(dtype) and enable_categorical)
for dtype in data.dtypes
):
_invalid_dataframe_dtype(data)
# handle feature names
if feature_names is None and meta is None:
if isinstance(data.columns, pd.MultiIndex):
@ -300,43 +276,94 @@ def _transform_pandas_df(
# handle feature types
if feature_types is None and meta is None:
feature_types = []
for i, dtype in enumerate(data.dtypes):
for dtype in data.dtypes:
if is_sparse(dtype):
feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
elif is_categorical_dtype(dtype) and enable_categorical:
feature_types.append(CAT_T)
else:
feature_types.append(_pandas_dtype_mapper[dtype.name])
return feature_names, feature_types
# handle category codes.
transformed = pd.DataFrame()
# Avoid transformation due to: PerformanceWarning: DataFrame is highly fragmented
if (
enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes)
) or any(is_nullable_dtype(dtype) for dtype in data.dtypes):
for i, dtype in enumerate(data.dtypes):
if is_categorical_dtype(dtype):
# pandas uses -1 as default missing value for categorical data
transformed[data.columns[i]] = (
data[data.columns[i]]
.cat.codes.astype(np.float32)
.replace(-1.0, np.NaN)
)
elif is_nullable_dtype(dtype):
# Converts integer <NA> to float NaN
transformed[data.columns[i]] = data[data.columns[i]].astype(np.float32)
else:
transformed[data.columns[i]] = data[data.columns[i]]
def is_nullable_dtype(dtype: Any) -> bool:
    """Whether ``dtype`` is a pandas nullable extension type.

    Parameters
    ----------
    dtype :
        A dtype object, e.g. ``pd.Int64Dtype()`` or ``np.dtype("int64")``.

    Returns
    -------
    bool
        True for pandas nullable integer dtypes (``Int8`` .. ``UInt64``) and
        the nullable boolean dtype; False otherwise, including for the plain
        NumPy integer/bool dtypes.
    """
    from pandas.api.types import is_integer_dtype, is_bool_dtype

    # dtype: pd.core.arrays.numeric.NumericDtype
    # pandas nullable integer dtypes use capitalised names, while the NumPy
    # dtypes use lowercase ("Int64" vs "int64"), so the case-sensitive
    # membership test below distinguishes the two families.
    nullable_alias = {
        "Int8", "Int16", "Int32", "Int64",
        "UInt8", "UInt16", "UInt32", "UInt64",
    }
    is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
    is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
    return is_int or is_bool
def _pandas_cat_null(data: DataFrame) -> DataFrame:
    """Convert categorical columns to float category codes and cast pandas
    nullable columns to ``float32`` so missing values become ``NaN``.

    Returns the input frame unchanged when no such columns are present.
    """
    from pandas.api.types import is_categorical_dtype

    # handle category codes and nullable.
    categorical_cols = []
    nullable_cols = []
    for name, col_dtype in zip(data.columns, data.dtypes):
        if is_categorical_dtype(col_dtype):
            categorical_cols.append(name)
        if is_nullable_dtype(col_dtype):
            nullable_cols.append(name)

    if not categorical_cols and not nullable_cols:
        # Nothing to convert; skip the defensive copy entirely.
        return data

    # Copy once up front instead of inserting columns one at a time, which
    # triggers "PerformanceWarning: DataFrame is highly fragmented".
    result = data.copy()

    if categorical_cols:
        # A DataFrame has no `.cat` accessor, hence the per-column apply.
        # pandas encodes a missing categorical value as code -1.
        codes = result[categorical_cols].apply(lambda column: column.cat.codes)
        result[categorical_cols] = codes.astype(np.float32).replace(-1.0, np.NaN)

    if nullable_cols:
        # Nullable <NA> entries turn into float NaN under the cast.
        result[nullable_cols] = result[nullable_cols].astype(np.float32)

    return result
def _transform_pandas_df(
    data: DataFrame,
    enable_categorical: bool,
    feature_names: FeatureNames = None,
    feature_types: FeatureTypes = None,
    meta: Optional[str] = None,
    meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]:
    """Validate a pandas DataFrame and convert it into a float numpy array.

    Parameters
    ----------
    data :
        The input DataFrame.
    enable_categorical :
        Whether categorical dtype columns are accepted (encoded as codes).
    feature_names :
        Optional user-supplied feature names; inferred from the columns when
        None and ``meta`` is not set.
    feature_types :
        Optional user-supplied feature types; inferred from the dtypes when
        None and ``meta`` is not set.
    meta :
        Name of the meta field (e.g. "label", "base_margin") when the frame
        carries meta info instead of features.
    meta_type :
        Target dtype for meta info; float32 is used when unset.

    Returns
    -------
    A tuple of ``(array, feature_names, feature_types)``.

    Raises
    ------
    ValueError
        If a column dtype is unsupported, or a vector-only meta field is
        given a multi-column frame.
    """
    from pandas.api.types import is_sparse, is_categorical_dtype

    # Reject the frame if any column dtype cannot be represented.
    if not all(
        dtype.name in _pandas_dtype_mapper
        or is_sparse(dtype)
        or is_nullable_dtype(dtype)
        or (is_categorical_dtype(dtype) and enable_categorical)
        for dtype in data.dtypes
    ):
        _invalid_dataframe_dtype(data)

    feature_names, feature_types = _pandas_feature_info(
        data, meta, feature_names, feature_types, enable_categorical
    )

    # Encode categorical codes and cast nullable dtypes to plain floats.
    transformed = _pandas_cat_null(data)

    # Only meta fields listed in _matrix_meta may span multiple columns.
    if meta and len(data.columns) > 1 and meta not in _matrix_meta:
        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

    dtype: Union[Type[np.floating], str] = meta_type if meta_type else np.float32
    arr: np.ndarray = transformed.values
    arr = arr.astype(dtype)
    return arr, feature_names, feature_types