Synthesize the AMES housing dataset for tests. (#9963)

This commit is contained in:
Jiaming Yuan 2024-01-09 00:54:23 +08:00 committed by GitHub
parent 9a30bdd313
commit fa5e2f6c45
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -4,10 +4,11 @@ import os
import zipfile import zipfile
from dataclasses import dataclass from dataclasses import dataclass
from typing import ( from typing import (
TYPE_CHECKING,
Any, Any,
Callable, Callable,
Dict,
Generator, Generator,
List,
NamedTuple, NamedTuple,
Optional, Optional,
Tuple, Tuple,
@ -25,6 +26,11 @@ from scipy import sparse
import xgboost import xgboost
from xgboost.data import pandas_pyarrow_mapper from xgboost.data import pandas_pyarrow_mapper
if TYPE_CHECKING:
from ..compat import DataFrame as DataFrameT
else:
DataFrameT = Any
joblib = pytest.importorskip("joblib") joblib = pytest.importorskip("joblib")
memory = joblib.Memory("./cachedir", verbose=0) memory = joblib.Memory("./cachedir", verbose=0)
@ -256,46 +262,186 @@ def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
return X, y return X, y
# pylint: disable=too-many-statements
@memory.cache @memory.cache
def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]: def get_ames_housing() -> Tuple[DataFrameT, np.ndarray]:
""" """Get a synthetic version of the Ames housing dataset.
The real one can be obtained via:
.. code-block::
from sklearn import datasets
datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
Number of samples: 1460 Number of samples: 1460
Number of features: 20 Number of features: 20
Number of categorical features: 10 Number of categorical features: 10
Number of numerical features: 10 Number of numerical features: 10
""" """
datasets = pytest.importorskip("sklearn.datasets") pytest.importorskip("pandas")
X, y = datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True) import pandas as pd
categorical_columns_subset: List[str] = [ rng = np.random.default_rng(1994)
"BldgType", # 5 cats, no nan n_samples = 1460
"GarageFinish", # 3 cats, nan df = pd.DataFrame()
"LotConfig", # 5 cats, no nan
"Functional", # 7 cats, no nan
"MasVnrType", # 4 cats, nan
"HouseStyle", # 8 cats, no nan
"FireplaceQu", # 5 cats, nan
"ExterCond", # 5 cats, no nan
"ExterQual", # 4 cats, no nan
"PoolQC", # 3 cats, nan
]
numerical_columns_subset: List[str] = [ def synth_cat(
"3SsnPorch", name_proba: Dict[Union[str, float], float], density: float
"Fireplaces", ) -> pd.Series:
"BsmtHalfBath", n_nulls = int(n_samples * (1 - density))
"HalfBath", has_nan = np.abs(1.0 - density) > 1e-6 and n_nulls > 0
"GarageCars", if has_nan:
"TotRmsAbvGrd", sparsity = 1.0 - density
"BsmtFinSF1", name_proba[np.nan] = sparsity
"BsmtFinSF2",
"GrLivArea",
"ScreenPorch",
]
X = X[categorical_columns_subset + numerical_columns_subset] keys = list(name_proba.keys())
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category") p = list(name_proba.values())
return X, y p[-1] += 1.0 - np.sum(p) # Fix floating point error
x = rng.choice(keys, size=n_samples, p=p)
series = pd.Series(
x,
dtype=pd.CategoricalDtype(
# not NA
filter(lambda x: isinstance(x, str), keys)
),
)
return series
df["BldgType"] = synth_cat(
{
"1Fam": 0.835616,
"2fmCon": 0.078082,
"Duplex": 0.035616,
"Twnhs": 0.029452,
"TwnhsE": 0.021233,
},
1.0,
)
df["GarageFinish"] = synth_cat(
{"Unf": 0.414384, "RFn": 0.289041, "Fin": 0.241096}, 0.94452
)
df["LotConfig"] = synth_cat(
{
"Corner": 0.180137,
"CulDSac": 0.064384,
"FR2": 0.032192,
"FR3": 0.002740,
},
1.0,
)
df["Functional"] = synth_cat(
{
"Typ": 0.931506,
"Min2": 0.023287,
"Min1": 0.021232,
"Mod": 0.010273,
"Maj1": 0.009589,
"Maj2": 0.003424,
"Sev": 0.000684,
},
1.0,
)
df["MasVnrType"] = synth_cat(
{
"None": 0.591780,
"BrkFace": 0.304794,
"Stone": 0.087671,
"BrkCmn": 0.010273,
},
0.99452,
)
df["HouseStyle"] = synth_cat(
{
"1Story": 0.497260,
"2Story": 0.304794,
"1.5Fin": 0.105479,
"SLvl": 0.044520,
"SFoyer": 0.025342,
"1.5Unf": 0.009589,
"2.5Unf": 0.007534,
"2.5Fin": 0.005479,
},
1.0,
)
df["FireplaceQu"] = synth_cat(
{
"Gd": 0.260273,
"TA": 0.214383,
"Fa": 0.022602,
"Ex": 0.016438,
"Po": 0.013698,
},
0.527397,
)
df["ExterCond"] = synth_cat(
{
"TA": 0.878082,
"Gd": 0.1,
"Fa": 0.019178,
"Ex": 0.002054,
"Po": 0.000684,
},
1.0,
)
df["ExterQual"] = synth_cat(
{
"TA": 0.620547,
"Gd": 0.334246,
"Ex": 0.035616,
"Fa": 0.009589,
},
1.0,
)
df["PoolQC"] = synth_cat(
{
"Gd": 0.002054,
"Ex": 0.001369,
"Fa": 0.001369,
},
0.004794,
)
# We focus on the categorical values here; for numerical features, a simple normal
# distribution is used, which doesn't match the original data.
def synth_num(loc: float, std: float, density: float) -> pd.Series:
x = rng.normal(loc=loc, scale=std, size=n_samples)
n_nulls = int(n_samples * (1 - density))
if np.abs(1.0 - density) > 1e-6 and n_nulls > 0:
null_idx = rng.choice(n_samples, size=n_nulls, replace=False)
x[null_idx] = np.nan
return pd.Series(x, dtype=np.float64)
df["3SsnPorch"] = synth_num(3.4095890410958902, 29.31733055678188, 1.0)
df["Fireplaces"] = synth_num(0.613013698630137, 0.6446663863122295, 1.0)
df["BsmtHalfBath"] = synth_num(0.057534246575342465, 0.23875264627921178, 1.0)
df["HalfBath"] = synth_num(0.38287671232876713, 0.5028853810928914, 1.0)
df["GarageCars"] = synth_num(1.7671232876712328, 0.7473150101111095, 1.0)
df["TotRmsAbvGrd"] = synth_num(6.517808219178082, 1.6253932905840505, 1.0)
df["BsmtFinSF1"] = synth_num(443.6397260273973, 456.0980908409277, 1.0)
df["BsmtFinSF2"] = synth_num(46.54931506849315, 161.31927280654173, 1.0)
df["GrLivArea"] = synth_num(1515.463698630137, 525.4803834232025, 1.0)
df["ScreenPorch"] = synth_num(15.060958904109588, 55.757415281874174, 1.0)
columns = list(df.columns)
rng.shuffle(columns)
df = df[columns]
# linear interaction for testing purposes.
y = np.zeros(shape=(n_samples,))
for c in df.columns:
if isinstance(df[c].dtype, pd.CategoricalDtype):
y += df[c].cat.codes.astype(np.float64)
else:
y += df[c].values
# Shift and scale to match the original y.
y *= 79442.50288288662 / y.std()
y += 180921.19589041095 - y.mean()
return df, y
@memory.cache @memory.cache