Refactor pandas dataframe handling. (#7843)

This commit is contained in:
Jiaming Yuan 2022-04-26 18:53:43 +08:00 committed by GitHub
parent bef1f939ce
commit ad06172c6b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -6,7 +6,7 @@ from distutils import version
import json import json
import warnings import warnings
import os import os
from typing import Any, Tuple, Callable, Optional, List, Union, Iterator from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type
import numpy as np import numpy as np
@ -21,8 +21,6 @@ c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
CAT_T = "c" CAT_T = "c"
# meta info that can be a matrix instead of vector. # meta info that can be a matrix instead of vector.
# For now it's base_margin for multi-class, but it can be extended to label once we have
# multi-output.
_matrix_meta = {"base_margin", "label"} _matrix_meta = {"base_margin", "label"}
@ -253,41 +251,19 @@ def _invalid_dataframe_dtype(data: Any) -> None:
raise ValueError(msg) raise ValueError(msg)
# pylint: disable=too-many-locals def _pandas_feature_info(
def _transform_pandas_df(
data: DataFrame, data: DataFrame,
meta: Optional[str],
feature_names: FeatureNames,
feature_types: FeatureTypes,
enable_categorical: bool, enable_categorical: bool,
feature_names: FeatureNames = None, ) -> Tuple[FeatureNames, FeatureTypes]:
feature_types: Optional[List[str]] = None,
meta: Optional[str] = None,
meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatureNames, Optional[List[str]]]:
import pandas as pd import pandas as pd
from pandas.api.types import ( from pandas.api.types import (
is_sparse, is_sparse,
is_categorical_dtype, is_categorical_dtype,
is_integer_dtype,
is_bool_dtype,
) )
nullable_alias = {"Int16", "Int32", "Int64"}
# dtype: pd.core.arrays.numeric.NumericDtype
def is_nullable_dtype(dtype: Any) -> bool:
is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
# np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
return is_int or is_bool
if not all(
dtype.name in _pandas_dtype_mapper
or is_sparse(dtype)
or is_nullable_dtype(dtype)
or (is_categorical_dtype(dtype) and enable_categorical)
for dtype in data.dtypes
):
_invalid_dataframe_dtype(data)
# handle feature names # handle feature names
if feature_names is None and meta is None: if feature_names is None and meta is None:
if isinstance(data.columns, pd.MultiIndex): if isinstance(data.columns, pd.MultiIndex):
@ -300,43 +276,94 @@ def _transform_pandas_df(
# handle feature types # handle feature types
if feature_types is None and meta is None: if feature_types is None and meta is None:
feature_types = [] feature_types = []
for i, dtype in enumerate(data.dtypes): for dtype in data.dtypes:
if is_sparse(dtype): if is_sparse(dtype):
feature_types.append(_pandas_dtype_mapper[dtype.subtype.name]) feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
elif is_categorical_dtype(dtype) and enable_categorical: elif is_categorical_dtype(dtype) and enable_categorical:
feature_types.append(CAT_T) feature_types.append(CAT_T)
else: else:
feature_types.append(_pandas_dtype_mapper[dtype.name]) feature_types.append(_pandas_dtype_mapper[dtype.name])
return feature_names, feature_types
# handle category codes.
def is_nullable_dtype(dtype: Any) -> bool:
    """Whether dtype is a pandas nullable (extension) type.

    Nullable dtypes use ``pd.NA`` for missing values and need an explicit
    conversion to float before XGBoost can consume them.
    """
    from pandas.api.types import is_integer_dtype, is_bool_dtype

    # dtype: pd.core.arrays.numeric.NumericDtype
    # Names of the pandas nullable integer dtypes (capitalized, e.g. "Int64"
    # for pd.Int64Dtype) as opposed to numpy's lowercase "int64".  pandas
    # provides 8-, 16-, 32- and 64-bit signed and unsigned variants.
    nullable_alias = {
        "Int8",
        "Int16",
        "Int32",
        "Int64",
        "UInt8",
        "UInt16",
        "UInt32",
        "UInt64",
    }
    is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
    is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
    return is_int or is_bool
def _pandas_cat_null(data: DataFrame) -> DataFrame:
    """Convert categorical columns to their codes and nullable columns to float.

    Returns the input DataFrame unchanged when no conversion is needed;
    otherwise returns a converted copy (the input is never mutated).
    """
    from pandas.api.types import is_categorical_dtype

    # Partition the columns that need conversion in a single pass.
    categorical_cols = []
    nullable_cols = []
    for name, dtype in zip(data.columns, data.dtypes):
        if is_categorical_dtype(dtype):
            categorical_cols.append(name)
        if is_nullable_dtype(dtype):
            nullable_cols.append(name)

    if not categorical_cols and not nullable_cols:
        # Nothing to convert; hand back the original frame as-is.
        return data

    # Copy up front instead of assembling a new frame column by column, which
    # triggers "PerformanceWarning: DataFrame is highly fragmented".
    transformed = data.copy()

    if categorical_cols:
        # A DataFrame has no `.cat` accessor, so go through apply to reach the
        # per-column codes.  pandas encodes missing categorical values as -1;
        # map that to NaN so XGBoost treats it as missing.
        codes = transformed[categorical_cols].apply(lambda x: x.cat.codes)
        transformed[categorical_cols] = codes.astype(np.float32).replace(
            -1.0, np.NaN
        )

    if nullable_cols:
        # Casting to float turns integer/boolean <NA> into NaN.
        transformed[nullable_cols] = transformed[nullable_cols].astype(np.float32)

    return transformed
def _transform_pandas_df(
    data: DataFrame,
    enable_categorical: bool,
    feature_names: FeatureNames = None,
    feature_types: FeatureTypes = None,
    meta: Optional[str] = None,
    meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]:
    """Validate and convert a pandas DataFrame into a numpy array.

    Parameters
    ----------
    data : the input DataFrame.
    enable_categorical : whether categorical dtypes are accepted as features.
    feature_names, feature_types : optional overrides; inferred from the
        DataFrame when not supplied and ``meta`` is None.
    meta : name of the meta info field (e.g. "label") when the DataFrame
        carries meta info instead of features.
    meta_type : optional dtype to cast the returned array to.  When absent the
        array keeps whatever dtype ``DataFrame.values`` produces (downstream
        code handles the final conversion).

    Returns
    -------
    (array, feature_names, feature_types)

    Raises
    ------
    ValueError : on unsupported column dtypes, or when a multi-column
        DataFrame is passed for meta info that must be a vector.
    """
    from pandas.api.types import (
        is_sparse,
        is_categorical_dtype,
    )

    # Every column must be a known numpy dtype, sparse, nullable, or (when
    # enabled) categorical; otherwise reject the whole frame.
    if not all(
        dtype.name in _pandas_dtype_mapper
        or is_sparse(dtype)
        or is_nullable_dtype(dtype)
        or (is_categorical_dtype(dtype) and enable_categorical)
        for dtype in data.dtypes
    ):
        _invalid_dataframe_dtype(data)

    # Fail fast before copying/transforming the frame: meta info other than
    # the entries in _matrix_meta must be a single column.
    if meta and len(data.columns) > 1 and meta not in _matrix_meta:
        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

    feature_names, feature_types = _pandas_feature_info(
        data, meta, feature_names, feature_types, enable_categorical
    )

    transformed = _pandas_cat_null(data)

    arr: np.ndarray = transformed.values
    if meta_type:
        # Cast only when an explicit target dtype is requested.
        arr = arr.astype(meta_type)
    return arr, feature_names, feature_types