Refactor pandas dataframe handling. (#7843)
This commit is contained in:
parent
bef1f939ce
commit
ad06172c6b
@ -6,7 +6,7 @@ from distutils import version
|
||||
import json
|
||||
import warnings
|
||||
import os
|
||||
from typing import Any, Tuple, Callable, Optional, List, Union, Iterator
|
||||
from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type
|
||||
|
||||
import numpy as np
|
||||
|
||||
@ -21,8 +21,6 @@ c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
|
||||
CAT_T = "c"
|
||||
|
||||
# meta info that can be a matrix instead of vector.
|
||||
# For now it's base_margin for multi-class, but it can be extended to label once we have
|
||||
# multi-output.
|
||||
_matrix_meta = {"base_margin", "label"}
|
||||
|
||||
|
||||
@ -253,41 +251,19 @@ def _invalid_dataframe_dtype(data: Any) -> None:
|
||||
raise ValueError(msg)
|
||||
|
||||
|
||||
# pylint: disable=too-many-locals
|
||||
def _transform_pandas_df(
|
||||
def _pandas_feature_info(
|
||||
data: DataFrame,
|
||||
meta: Optional[str],
|
||||
feature_names: FeatureNames,
|
||||
feature_types: FeatureTypes,
|
||||
enable_categorical: bool,
|
||||
feature_names: FeatureNames = None,
|
||||
feature_types: Optional[List[str]] = None,
|
||||
meta: Optional[str] = None,
|
||||
meta_type: Optional[str] = None,
|
||||
) -> Tuple[np.ndarray, FeatureNames, Optional[List[str]]]:
|
||||
) -> Tuple[FeatureNames, FeatureTypes]:
|
||||
import pandas as pd
|
||||
from pandas.api.types import (
|
||||
is_sparse,
|
||||
is_categorical_dtype,
|
||||
is_integer_dtype,
|
||||
is_bool_dtype,
|
||||
)
|
||||
|
||||
nullable_alias = {"Int16", "Int32", "Int64"}
|
||||
|
||||
# dtype: pd.core.arrays.numeric.NumericDtype
|
||||
def is_nullable_dtype(dtype: Any) -> bool:
|
||||
is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
|
||||
# np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
|
||||
is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
|
||||
return is_int or is_bool
|
||||
|
||||
if not all(
|
||||
dtype.name in _pandas_dtype_mapper
|
||||
or is_sparse(dtype)
|
||||
or is_nullable_dtype(dtype)
|
||||
or (is_categorical_dtype(dtype) and enable_categorical)
|
||||
for dtype in data.dtypes
|
||||
):
|
||||
_invalid_dataframe_dtype(data)
|
||||
|
||||
# handle feature names
|
||||
if feature_names is None and meta is None:
|
||||
if isinstance(data.columns, pd.MultiIndex):
|
||||
@ -300,43 +276,94 @@ def _transform_pandas_df(
|
||||
# handle feature types
|
||||
if feature_types is None and meta is None:
|
||||
feature_types = []
|
||||
for i, dtype in enumerate(data.dtypes):
|
||||
for dtype in data.dtypes:
|
||||
if is_sparse(dtype):
|
||||
feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
|
||||
elif is_categorical_dtype(dtype) and enable_categorical:
|
||||
feature_types.append(CAT_T)
|
||||
else:
|
||||
feature_types.append(_pandas_dtype_mapper[dtype.name])
|
||||
return feature_names, feature_types
|
||||
|
||||
# handle category codes.
|
||||
transformed = pd.DataFrame()
|
||||
# Avoid transformation due to: PerformanceWarning: DataFrame is highly fragmented
|
||||
if (
|
||||
enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes)
|
||||
) or any(is_nullable_dtype(dtype) for dtype in data.dtypes):
|
||||
for i, dtype in enumerate(data.dtypes):
|
||||
if is_categorical_dtype(dtype):
|
||||
# pandas uses -1 as default missing value for categorical data
|
||||
transformed[data.columns[i]] = (
|
||||
data[data.columns[i]]
|
||||
.cat.codes.astype(np.float32)
|
||||
.replace(-1.0, np.NaN)
|
||||
)
|
||||
elif is_nullable_dtype(dtype):
|
||||
# Converts integer <NA> to float NaN
|
||||
transformed[data.columns[i]] = data[data.columns[i]].astype(np.float32)
|
||||
else:
|
||||
transformed[data.columns[i]] = data[data.columns[i]]
|
||||
|
||||
def is_nullable_dtype(dtype: Any) -> bool:
    """Whether dtype is a pandas nullable type.

    Only the nullable integer dtypes named ``Int16``, ``Int32`` and ``Int64`` and
    the nullable ``boolean`` dtype are recognised; every other dtype yields a
    false result.
    """
    from pandas.api.types import is_bool_dtype, is_integer_dtype

    # dtype: pd.core.arrays.numeric.NumericDtype
    # Nullable integer dtypes are told apart from the numpy ones by name.
    if is_integer_dtype(dtype) and dtype.name in {"Int16", "Int32", "Int64"}:
        return True

    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
    return is_bool_dtype(dtype) and dtype.name == "boolean"
|
||||
|
||||
|
||||
def _pandas_cat_null(data: DataFrame) -> DataFrame:
    """Encode categorical and pandas-nullable columns of ``data`` as float32.

    Categorical columns are replaced by their category codes, with the code -1
    (which pandas uses for missing categorical values) converted to NaN.
    Columns for which ``is_nullable_dtype`` is true are cast to float32.
    All remaining columns are left untouched.

    The input frame is not modified: a copy is made when at least one column
    needs converting, otherwise ``data`` itself is returned.
    """
    import pandas as pd

    # handle category codes and nullable.
    column_dtypes = list(zip(data.columns, data.dtypes))
    # `is_categorical_dtype` is deprecated since pandas 2.1; for dtype objects the
    # isinstance check is the documented replacement.
    cat_columns = [
        col for col, dtype in column_dtypes if isinstance(dtype, pd.CategoricalDtype)
    ]
    nul_columns = [col for col, dtype in column_dtypes if is_nullable_dtype(dtype)]

    if not cat_columns and not nul_columns:
        return data

    # Copy once and assign whole groups of columns, instead of building a new
    # frame column by column, to avoid:
    # PerformanceWarning: DataFrame is highly fragmented
    transformed = data.copy()

    if cat_columns:
        # DF doesn't have the cat attribute, so we use apply here
        transformed[cat_columns] = (
            transformed[cat_columns]
            .apply(lambda x: x.cat.codes)
            .astype(np.float32)
            # `np.NaN` was removed in NumPy 2.0; `np.nan` works on every version.
            .replace(-1.0, np.nan)
        )
    if nul_columns:
        transformed[nul_columns] = transformed[nul_columns].astype(np.float32)

    return transformed
|
||||
|
||||
|
||||
def _transform_pandas_df(
    data: DataFrame,
    enable_categorical: bool,
    feature_names: FeatureNames = None,
    feature_types: FeatureTypes = None,
    meta: Optional[str] = None,
    meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]:
    """Validate a pandas DataFrame and convert it to a numpy array.

    Every column dtype must be listed in ``_pandas_dtype_mapper``, be sparse, be a
    pandas nullable dtype, or be categorical with ``enable_categorical`` set;
    otherwise ``_invalid_dataframe_dtype`` is called to report the problem.

    Returns the array together with the feature names and feature types
    produced by ``_pandas_feature_info``.

    Raises
    ------
    ValueError
        If ``meta`` is given, is not one of ``_matrix_meta``, and ``data`` has
        more than one column.
    """
    import pandas as pd

    # `is_sparse` and `is_categorical_dtype` are deprecated since pandas 2.1; for
    # dtype objects the isinstance checks below are the documented replacements.
    if not all(
        dtype.name in _pandas_dtype_mapper
        or isinstance(dtype, pd.SparseDtype)
        or is_nullable_dtype(dtype)
        or (isinstance(dtype, pd.CategoricalDtype) and enable_categorical)
        for dtype in data.dtypes
    ):
        _invalid_dataframe_dtype(data)

    feature_names, feature_types = _pandas_feature_info(
        data, meta, feature_names, feature_types, enable_categorical
    )

    transformed = _pandas_cat_null(data)

    if meta and len(data.columns) > 1 and meta not in _matrix_meta:
        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

    dtype: Union[Type[np.floating], str] = meta_type if meta_type else np.float32
    arr: np.ndarray = transformed.values
    # NOTE(review): the cast only runs when `meta_type` is set, so the float32
    # fallback held in `dtype` is never applied here -- confirm this is intended.
    if meta_type:
        arr = arr.astype(dtype)
    return arr, feature_names, feature_types
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user