Update JSON parser demo with categorical feature. (#8401)
- Parse categorical features in the Python example. - Add tests. - Update documentation.
This commit is contained in:
@@ -3,6 +3,7 @@ change without notice.
|
||||
|
||||
"""
|
||||
# pylint: disable=invalid-name,missing-function-docstring,import-error
|
||||
import copy
|
||||
import gc
|
||||
import importlib.util
|
||||
import multiprocessing
|
||||
@@ -477,6 +478,7 @@ def get_mq2008(
|
||||
)
|
||||
|
||||
|
||||
# pylint: disable=too-many-arguments,too-many-locals
|
||||
@memory.cache
|
||||
def make_categorical(
|
||||
n_samples: int,
|
||||
@@ -484,8 +486,27 @@ def make_categorical(
|
||||
n_categories: int,
|
||||
onehot: bool,
|
||||
sparsity: float = 0.0,
|
||||
cat_ratio: float = 1.0,
|
||||
) -> Tuple[ArrayLike, np.ndarray]:
|
||||
"""Generate categorical features for test.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_categories:
|
||||
Number of categories for categorical features.
|
||||
onehot:
|
||||
Should we apply one-hot encoding to the data?
|
||||
sparsity:
|
||||
The ratio of the amount of missing values over the number of all entries.
|
||||
cat_ratio:
|
||||
The ratio of features that are categorical.
|
||||
|
||||
Returns
|
||||
-------
|
||||
X, y
|
||||
"""
|
||||
import pandas as pd
|
||||
from pandas.api.types import is_categorical_dtype
|
||||
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
@@ -501,10 +522,11 @@ def make_categorical(
|
||||
label += df.iloc[:, i]
|
||||
label += 1
|
||||
|
||||
df = df.astype("category")
|
||||
categories = np.arange(0, n_categories)
|
||||
for col in df.columns:
|
||||
df[col] = df[col].cat.set_categories(categories)
|
||||
if rng.binomial(1, cat_ratio, size=1)[0] == 1:
|
||||
df[col] = df[col].astype("category")
|
||||
df[col] = df[col].cat.set_categories(categories)
|
||||
|
||||
if sparsity > 0.0:
|
||||
for i in range(n_features):
|
||||
@@ -512,9 +534,14 @@ def make_categorical(
|
||||
low=0, high=n_samples - 1, size=int(n_samples * sparsity)
|
||||
)
|
||||
df.iloc[index, i] = np.NaN
|
||||
assert n_categories == np.unique(df.dtypes[i].categories).size
|
||||
if is_categorical_dtype(df.dtypes[i]):
|
||||
assert n_categories == np.unique(df.dtypes[i].categories).size
|
||||
|
||||
if onehot:
|
||||
df = pd.get_dummies(df)
|
||||
columns = list(df.columns)
|
||||
rng.shuffle(columns)
|
||||
df = df[columns]
|
||||
return pd.get_dummies(df), label
|
||||
return df, label
|
||||
|
||||
|
||||
Reference in New Issue
Block a user