Update JSON parser demo with categorical feature. (#8401)

- Parse categorical features in the Python example.
- Add tests.
- Update document.
This commit is contained in:
Jiaming Yuan
2022-10-28 20:57:43 +08:00
committed by GitHub
parent cfd2a9f872
commit a408c34558
7 changed files with 318 additions and 133 deletions

View File

@@ -3,6 +3,7 @@ change without notice.
"""
# pylint: disable=invalid-name,missing-function-docstring,import-error
import copy
import gc
import importlib.util
import multiprocessing
@@ -477,6 +478,7 @@ def get_mq2008(
)
# pylint: disable=too-many-arguments,too-many-locals
@memory.cache
def make_categorical(
n_samples: int,
@@ -484,8 +486,27 @@ def make_categorical(
n_categories: int,
onehot: bool,
sparsity: float = 0.0,
cat_ratio: float = 1.0,
) -> Tuple[ArrayLike, np.ndarray]:
"""Generate categorical features for test.
Parameters
----------
n_categories:
Number of categories for categorical features.
onehot:
Should we apply one-hot encoding to the data?
sparsity:
The ratio of the amount of missing values over the number of all entries.
cat_ratio:
The ratio of features that are categorical.
Returns
-------
X, y
"""
import pandas as pd
from pandas.api.types import is_categorical_dtype
rng = np.random.RandomState(1994)
@@ -501,10 +522,11 @@ def make_categorical(
label += df.iloc[:, i]
label += 1
df = df.astype("category")
categories = np.arange(0, n_categories)
for col in df.columns:
df[col] = df[col].cat.set_categories(categories)
if rng.binomial(1, cat_ratio, size=1)[0] == 1:
df[col] = df[col].astype("category")
df[col] = df[col].cat.set_categories(categories)
if sparsity > 0.0:
for i in range(n_features):
@@ -512,9 +534,14 @@ def make_categorical(
low=0, high=n_samples - 1, size=int(n_samples * sparsity)
)
df.iloc[index, i] = np.NaN
assert n_categories == np.unique(df.dtypes[i].categories).size
if is_categorical_dtype(df.dtypes[i]):
assert n_categories == np.unique(df.dtypes[i].categories).size
if onehot:
df = pd.get_dummies(df)
columns = list(df.columns)
rng.shuffle(columns)
df = df[columns]
return pd.get_dummies(df), label
return df, label