Synthesize the AMES housing dataset for tests. (#9963)
This commit is contained in:
parent
9a30bdd313
commit
fa5e2f6c45
@ -4,10 +4,11 @@ import os
|
|||||||
import zipfile
|
import zipfile
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import (
|
from typing import (
|
||||||
|
TYPE_CHECKING,
|
||||||
Any,
|
Any,
|
||||||
Callable,
|
Callable,
|
||||||
|
Dict,
|
||||||
Generator,
|
Generator,
|
||||||
List,
|
|
||||||
NamedTuple,
|
NamedTuple,
|
||||||
Optional,
|
Optional,
|
||||||
Tuple,
|
Tuple,
|
||||||
@ -25,6 +26,11 @@ from scipy import sparse
|
|||||||
import xgboost
|
import xgboost
|
||||||
from xgboost.data import pandas_pyarrow_mapper
|
from xgboost.data import pandas_pyarrow_mapper
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from ..compat import DataFrame as DataFrameT
|
||||||
|
else:
|
||||||
|
DataFrameT = Any
|
||||||
|
|
||||||
joblib = pytest.importorskip("joblib")
|
joblib = pytest.importorskip("joblib")
|
||||||
memory = joblib.Memory("./cachedir", verbose=0)
|
memory = joblib.Memory("./cachedir", verbose=0)
|
||||||
|
|
||||||
@ -256,46 +262,186 @@ def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
|
|||||||
return X, y
|
return X, y
|
||||||
|
|
||||||
|
|
||||||
|
# pylint: disable=too-many-statements
|
||||||
@memory.cache
|
@memory.cache
|
||||||
def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
|
def get_ames_housing() -> Tuple[DataFrameT, np.ndarray]:
|
||||||
"""
|
"""Get a synthetic version of the amse housing dataset.
|
||||||
|
|
||||||
|
The real one can be obtained via:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
from sklearn import datasets
|
||||||
|
|
||||||
|
datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
|
||||||
|
|
||||||
Number of samples: 1460
|
Number of samples: 1460
|
||||||
Number of features: 20
|
Number of features: 20
|
||||||
Number of categorical features: 10
|
Number of categorical features: 10
|
||||||
Number of numerical features: 10
|
Number of numerical features: 10
|
||||||
"""
|
"""
|
||||||
datasets = pytest.importorskip("sklearn.datasets")
|
pytest.importorskip("pandas")
|
||||||
X, y = datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
|
import pandas as pd
|
||||||
|
|
||||||
categorical_columns_subset: List[str] = [
|
rng = np.random.default_rng(1994)
|
||||||
"BldgType", # 5 cats, no nan
|
n_samples = 1460
|
||||||
"GarageFinish", # 3 cats, nan
|
df = pd.DataFrame()
|
||||||
"LotConfig", # 5 cats, no nan
|
|
||||||
"Functional", # 7 cats, no nan
|
|
||||||
"MasVnrType", # 4 cats, nan
|
|
||||||
"HouseStyle", # 8 cats, no nan
|
|
||||||
"FireplaceQu", # 5 cats, nan
|
|
||||||
"ExterCond", # 5 cats, no nan
|
|
||||||
"ExterQual", # 4 cats, no nan
|
|
||||||
"PoolQC", # 3 cats, nan
|
|
||||||
]
|
|
||||||
|
|
||||||
numerical_columns_subset: List[str] = [
|
def synth_cat(
|
||||||
"3SsnPorch",
|
name_proba: Dict[Union[str, float], float], density: float
|
||||||
"Fireplaces",
|
) -> pd.Series:
|
||||||
"BsmtHalfBath",
|
n_nulls = int(n_samples * (1 - density))
|
||||||
"HalfBath",
|
has_nan = np.abs(1.0 - density) > 1e-6 and n_nulls > 0
|
||||||
"GarageCars",
|
if has_nan:
|
||||||
"TotRmsAbvGrd",
|
sparsity = 1.0 - density
|
||||||
"BsmtFinSF1",
|
name_proba[np.nan] = sparsity
|
||||||
"BsmtFinSF2",
|
|
||||||
"GrLivArea",
|
|
||||||
"ScreenPorch",
|
|
||||||
]
|
|
||||||
|
|
||||||
X = X[categorical_columns_subset + numerical_columns_subset]
|
keys = list(name_proba.keys())
|
||||||
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
|
p = list(name_proba.values())
|
||||||
return X, y
|
p[-1] += 1.0 - np.sum(p) # Fix floating point error
|
||||||
|
x = rng.choice(keys, size=n_samples, p=p)
|
||||||
|
|
||||||
|
series = pd.Series(
|
||||||
|
x,
|
||||||
|
dtype=pd.CategoricalDtype(
|
||||||
|
# not NA
|
||||||
|
filter(lambda x: isinstance(x, str), keys)
|
||||||
|
),
|
||||||
|
)
|
||||||
|
return series
|
||||||
|
|
||||||
|
df["BldgType"] = synth_cat(
|
||||||
|
{
|
||||||
|
"1Fam": 0.835616,
|
||||||
|
"2fmCon": 0.078082,
|
||||||
|
"Duplex": 0.035616,
|
||||||
|
"Twnhs": 0.029452,
|
||||||
|
"TwnhsE": 0.021233,
|
||||||
|
},
|
||||||
|
1.0,
|
||||||
|
)
|
||||||
|
df["GarageFinish"] = synth_cat(
|
||||||
|
{"Unf": 0.414384, "RFn": 0.289041, "Fin": 0.241096}, 0.94452
|
||||||
|
)
|
||||||
|
df["LotConfig"] = synth_cat(
|
||||||
|
{
|
||||||
|
"Corner": 0.180137,
|
||||||
|
"CulDSac": 0.064384,
|
||||||
|
"FR2": 0.032192,
|
||||||
|
"FR3": 0.002740,
|
||||||
|
},
|
||||||
|
1.0,
|
||||||
|
)
|
||||||
|
df["Functional"] = synth_cat(
|
||||||
|
{
|
||||||
|
"Typ": 0.931506,
|
||||||
|
"Min2": 0.023287,
|
||||||
|
"Min1": 0.021232,
|
||||||
|
"Mod": 0.010273,
|
||||||
|
"Maj1": 0.009589,
|
||||||
|
"Maj2": 0.003424,
|
||||||
|
"Sev": 0.000684,
|
||||||
|
},
|
||||||
|
1.0,
|
||||||
|
)
|
||||||
|
df["MasVnrType"] = synth_cat(
|
||||||
|
{
|
||||||
|
"None": 0.591780,
|
||||||
|
"BrkFace": 0.304794,
|
||||||
|
"Stone": 0.087671,
|
||||||
|
"BrkCmn": 0.010273,
|
||||||
|
},
|
||||||
|
0.99452,
|
||||||
|
)
|
||||||
|
df["HouseStyle"] = synth_cat(
|
||||||
|
{
|
||||||
|
"1Story": 0.497260,
|
||||||
|
"2Story": 0.304794,
|
||||||
|
"1.5Fin": 0.105479,
|
||||||
|
"SLvl": 0.044520,
|
||||||
|
"SFoyer": 0.025342,
|
||||||
|
"1.5Unf": 0.009589,
|
||||||
|
"2.5Unf": 0.007534,
|
||||||
|
"2.5Fin": 0.005479,
|
||||||
|
},
|
||||||
|
1.0,
|
||||||
|
)
|
||||||
|
df["FireplaceQu"] = synth_cat(
|
||||||
|
{
|
||||||
|
"Gd": 0.260273,
|
||||||
|
"TA": 0.214383,
|
||||||
|
"Fa": 0.022602,
|
||||||
|
"Ex": 0.016438,
|
||||||
|
"Po": 0.013698,
|
||||||
|
},
|
||||||
|
0.527397,
|
||||||
|
)
|
||||||
|
df["ExterCond"] = synth_cat(
|
||||||
|
{
|
||||||
|
"TA": 0.878082,
|
||||||
|
"Gd": 0.1,
|
||||||
|
"Fa": 0.019178,
|
||||||
|
"Ex": 0.002054,
|
||||||
|
"Po": 0.000684,
|
||||||
|
},
|
||||||
|
1.0,
|
||||||
|
)
|
||||||
|
df["ExterQual"] = synth_cat(
|
||||||
|
{
|
||||||
|
"TA": 0.620547,
|
||||||
|
"Gd": 0.334246,
|
||||||
|
"Ex": 0.035616,
|
||||||
|
"Fa": 0.009589,
|
||||||
|
},
|
||||||
|
1.0,
|
||||||
|
)
|
||||||
|
df["PoolQC"] = synth_cat(
|
||||||
|
{
|
||||||
|
"Gd": 0.002054,
|
||||||
|
"Ex": 0.001369,
|
||||||
|
"Fa": 0.001369,
|
||||||
|
},
|
||||||
|
0.004794,
|
||||||
|
)
|
||||||
|
|
||||||
|
# We focus on the categorical values here; for numerical features, a simple normal
|
||||||
|
# distribution is used, which doesn't match the original data.
|
||||||
|
def synth_num(loc: float, std: float, density: float) -> pd.Series:
|
||||||
|
x = rng.normal(loc=loc, scale=std, size=n_samples)
|
||||||
|
n_nulls = int(n_samples * (1 - density))
|
||||||
|
if np.abs(1.0 - density) > 1e-6 and n_nulls > 0:
|
||||||
|
null_idx = rng.choice(n_samples, size=n_nulls, replace=False)
|
||||||
|
x[null_idx] = np.nan
|
||||||
|
return pd.Series(x, dtype=np.float64)
|
||||||
|
|
||||||
|
df["3SsnPorch"] = synth_num(3.4095890410958902, 29.31733055678188, 1.0)
|
||||||
|
df["Fireplaces"] = synth_num(0.613013698630137, 0.6446663863122295, 1.0)
|
||||||
|
df["BsmtHalfBath"] = synth_num(0.057534246575342465, 0.23875264627921178, 1.0)
|
||||||
|
df["HalfBath"] = synth_num(0.38287671232876713, 0.5028853810928914, 1.0)
|
||||||
|
df["GarageCars"] = synth_num(1.7671232876712328, 0.7473150101111095, 1.0)
|
||||||
|
df["TotRmsAbvGrd"] = synth_num(6.517808219178082, 1.6253932905840505, 1.0)
|
||||||
|
df["BsmtFinSF1"] = synth_num(443.6397260273973, 456.0980908409277, 1.0)
|
||||||
|
df["BsmtFinSF2"] = synth_num(46.54931506849315, 161.31927280654173, 1.0)
|
||||||
|
df["GrLivArea"] = synth_num(1515.463698630137, 525.4803834232025, 1.0)
|
||||||
|
df["ScreenPorch"] = synth_num(15.060958904109588, 55.757415281874174, 1.0)
|
||||||
|
|
||||||
|
columns = list(df.columns)
|
||||||
|
rng.shuffle(columns)
|
||||||
|
df = df[columns]
|
||||||
|
|
||||||
|
# linear interaction for testing purposes.
|
||||||
|
y = np.zeros(shape=(n_samples,))
|
||||||
|
for c in df.columns:
|
||||||
|
if isinstance(df[c].dtype, pd.CategoricalDtype):
|
||||||
|
y += df[c].cat.codes.astype(np.float64)
|
||||||
|
else:
|
||||||
|
y += df[c].values
|
||||||
|
|
||||||
|
# Shift and scale to match the original y.
|
||||||
|
y *= 79442.50288288662 / y.std()
|
||||||
|
y += 180921.19589041095 - y.mean()
|
||||||
|
|
||||||
|
return df, y
|
||||||
|
|
||||||
|
|
||||||
@memory.cache
|
@memory.cache
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user