Refactor pandas dataframe handling. (#7843)

This commit is contained in:
Jiaming Yuan 2022-04-26 18:53:43 +08:00 committed by GitHub
parent bef1f939ce
commit ad06172c6b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -6,7 +6,7 @@ from distutils import version
import json
import warnings
import os
from typing import Any, Tuple, Callable, Optional, List, Union, Iterator
from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type
import numpy as np
@ -21,8 +21,6 @@ c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
CAT_T = "c"
# meta info that can be a matrix instead of vector.
# For now it's base_margin for multi-class, but it can be extended to label once we have
# multi-output.
_matrix_meta = {"base_margin", "label"}
@ -253,41 +251,19 @@ def _invalid_dataframe_dtype(data: Any) -> None:
raise ValueError(msg)
# pylint: disable=too-many-locals
def _transform_pandas_df(
def _pandas_feature_info(
data: DataFrame,
meta: Optional[str],
feature_names: FeatureNames,
feature_types: FeatureTypes,
enable_categorical: bool,
feature_names: FeatureNames = None,
feature_types: Optional[List[str]] = None,
meta: Optional[str] = None,
meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatureNames, Optional[List[str]]]:
) -> Tuple[FeatureNames, FeatureTypes]:
import pandas as pd
from pandas.api.types import (
is_sparse,
is_categorical_dtype,
is_integer_dtype,
is_bool_dtype,
)
nullable_alias = {"Int16", "Int32", "Int64"}
# dtype: pd.core.arrays.numeric.NumericDtype
def is_nullable_dtype(dtype: Any) -> bool:
is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
# np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
return is_int or is_bool
if not all(
dtype.name in _pandas_dtype_mapper
or is_sparse(dtype)
or is_nullable_dtype(dtype)
or (is_categorical_dtype(dtype) and enable_categorical)
for dtype in data.dtypes
):
_invalid_dataframe_dtype(data)
# handle feature names
if feature_names is None and meta is None:
if isinstance(data.columns, pd.MultiIndex):
@ -300,43 +276,94 @@ def _transform_pandas_df(
# handle feature types
if feature_types is None and meta is None:
feature_types = []
for i, dtype in enumerate(data.dtypes):
for dtype in data.dtypes:
if is_sparse(dtype):
feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
elif is_categorical_dtype(dtype) and enable_categorical:
feature_types.append(CAT_T)
else:
feature_types.append(_pandas_dtype_mapper[dtype.name])
return feature_names, feature_types
# handle category codes.
transformed = pd.DataFrame()
# Avoid transformation due to: PerformanceWarning: DataFrame is highly fragmented
if (
enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes)
) or any(is_nullable_dtype(dtype) for dtype in data.dtypes):
for i, dtype in enumerate(data.dtypes):
if is_categorical_dtype(dtype):
# pandas uses -1 as default missing value for categorical data
transformed[data.columns[i]] = (
data[data.columns[i]]
.cat.codes.astype(np.float32)
.replace(-1.0, np.NaN)
)
elif is_nullable_dtype(dtype):
# Converts integer <NA> to float NaN
transformed[data.columns[i]] = data[data.columns[i]].astype(np.float32)
else:
transformed[data.columns[i]] = data[data.columns[i]]
def is_nullable_dtype(dtype: Any) -> bool:
    """Whether ``dtype`` is a pandas nullable extension type.

    Parameters
    ----------
    dtype :
        A dtype object, e.g. ``pd.Int64Dtype()`` or ``np.dtype("int64")``.

    Returns
    -------
    bool
        True for pandas nullable integer dtypes (``Int8`` .. ``UInt64``) and
        the nullable boolean dtype; False otherwise, including for the plain
        NumPy integer/bool dtypes.
    """
    from pandas.api.types import is_integer_dtype, is_bool_dtype

    # dtype: pd.core.arrays.numeric.NumericDtype
    # pandas nullable integer dtypes use capitalised names, while the NumPy
    # dtypes use lowercase ("Int64" vs "int64"), so the case-sensitive
    # membership test below distinguishes the two families.
    nullable_alias = {
        "Int8", "Int16", "Int32", "Int64",
        "UInt8", "UInt16", "UInt32", "UInt64",
    }
    is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
    is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
    return is_int or is_bool
def _pandas_cat_null(data: DataFrame) -> DataFrame:
    """Convert categorical columns to float category codes and cast pandas
    nullable columns to ``float32`` so missing values become ``NaN``.

    Returns the input frame unchanged when no such columns are present.
    """
    from pandas.api.types import is_categorical_dtype

    # handle category codes and nullable.
    categorical_cols = []
    nullable_cols = []
    for name, col_dtype in zip(data.columns, data.dtypes):
        if is_categorical_dtype(col_dtype):
            categorical_cols.append(name)
        if is_nullable_dtype(col_dtype):
            nullable_cols.append(name)

    if not categorical_cols and not nullable_cols:
        # Nothing to convert; skip the defensive copy entirely.
        return data

    # Copy once up front instead of inserting columns one at a time, which
    # triggers "PerformanceWarning: DataFrame is highly fragmented".
    result = data.copy()

    if categorical_cols:
        # A DataFrame has no `.cat` accessor, hence the per-column apply.
        # pandas encodes a missing categorical value as code -1.
        codes = result[categorical_cols].apply(lambda column: column.cat.codes)
        result[categorical_cols] = codes.astype(np.float32).replace(-1.0, np.NaN)

    if nullable_cols:
        # Nullable <NA> entries turn into float NaN under the cast.
        result[nullable_cols] = result[nullable_cols].astype(np.float32)

    return result
def _transform_pandas_df(
    data: DataFrame,
    enable_categorical: bool,
    feature_names: FeatureNames = None,
    feature_types: FeatureTypes = None,
    meta: Optional[str] = None,
    meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]:
    """Validate a pandas DataFrame and convert it into a float numpy array.

    Parameters
    ----------
    data :
        The input DataFrame.
    enable_categorical :
        Whether categorical dtype columns are accepted (encoded as codes).
    feature_names :
        Optional user-supplied feature names; inferred from the columns when
        None and ``meta`` is not set.
    feature_types :
        Optional user-supplied feature types; inferred from the dtypes when
        None and ``meta`` is not set.
    meta :
        Name of the meta field (e.g. "label", "base_margin") when the frame
        carries meta info instead of features.
    meta_type :
        Target dtype for meta info; float32 is used when unset.

    Returns
    -------
    A tuple of ``(array, feature_names, feature_types)``.

    Raises
    ------
    ValueError
        If a column dtype is unsupported, or a vector-only meta field is
        given a multi-column frame.
    """
    from pandas.api.types import is_sparse, is_categorical_dtype

    # Reject the frame if any column dtype cannot be represented.
    if not all(
        dtype.name in _pandas_dtype_mapper
        or is_sparse(dtype)
        or is_nullable_dtype(dtype)
        or (is_categorical_dtype(dtype) and enable_categorical)
        for dtype in data.dtypes
    ):
        _invalid_dataframe_dtype(data)

    feature_names, feature_types = _pandas_feature_info(
        data, meta, feature_names, feature_types, enable_categorical
    )

    # Encode categorical codes and cast nullable dtypes to plain floats.
    transformed = _pandas_cat_null(data)

    # Only meta fields listed in _matrix_meta may span multiple columns.
    if meta and len(data.columns) > 1 and meta not in _matrix_meta:
        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

    dtype: Union[Type[np.floating], str] = meta_type if meta_type else np.float32
    arr: np.ndarray = transformed.values
    arr = arr.astype(dtype)
    return arr, feature_names, feature_types