Synthesize the AMES housing dataset for tests. (#9963)

2024-01-09 00:54:23 +08:00 · 2024-01-09 00:54:23 +08:00 · fa5e2f6c45
commit fa5e2f6c45
parent 9a30bdd313
1 changed files with 178 additions and 32 deletions
--- a/python-package/xgboost/testing/data.py
+++ b/python-package/xgboost/testing/data.py
@ -4,10 +4,11 @@ import os
 import zipfile
 from dataclasses import dataclass
 from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Generator,
    List,
    NamedTuple,
    Optional,
    Tuple,
@ -25,6 +26,11 @@ from scipy import sparse
 import xgboost
 from xgboost.data import pandas_pyarrow_mapper
 if TYPE_CHECKING:
    from ..compat import DataFrame as DataFrameT
 else:
    DataFrameT = Any
 joblib = pytest.importorskip("joblib")
 memory = joblib.Memory("./cachedir", verbose=0)
@ -256,46 +262,186 @@ def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
    return X, y
 # pylint: disable=too-many-statements
@memory.cache
-def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
+def get_ames_housing() -> Tuple[DataFrameT, np.ndarray]:
-    """
+    """Get a synthetic version of the amse housing dataset.
    The real one can be obtained via:
    .. code-block::
        from sklearn import datasets
        datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
    Number of samples: 1460
    Number of features: 20
    Number of categorical features: 10
    Number of numerical features: 10
    """
-    datasets = pytest.importorskip("sklearn.datasets")
+    pytest.importorskip("pandas")
-    X, y = datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
+    import pandas as pd
-    categorical_columns_subset: List[str] = [
+    rng = np.random.default_rng(1994)
-        "BldgType",  # 5 cats, no nan
+    n_samples = 1460
-        "GarageFinish",  # 3 cats, nan
+    df = pd.DataFrame()
        "LotConfig",  # 5 cats, no nan
        "Functional",  # 7 cats, no nan
        "MasVnrType",  # 4 cats, nan
        "HouseStyle",  # 8 cats, no nan
        "FireplaceQu",  # 5 cats, nan
        "ExterCond",  # 5 cats, no nan
        "ExterQual",  # 4 cats, no nan
        "PoolQC",  # 3 cats, nan
    ]
-    numerical_columns_subset: List[str] = [
+    def synth_cat(
-        "3SsnPorch",
+        name_proba: Dict[Union[str, float], float], density: float
-        "Fireplaces",
+    ) -> pd.Series:
-        "BsmtHalfBath",
+        n_nulls = int(n_samples * (1 - density))
-        "HalfBath",
+        has_nan = np.abs(1.0 - density) > 1e-6 and n_nulls > 0
-        "GarageCars",
+        if has_nan:
-        "TotRmsAbvGrd",
+            sparsity = 1.0 - density
-        "BsmtFinSF1",
+            name_proba[np.nan] = sparsity
        "BsmtFinSF2",
        "GrLivArea",
        "ScreenPorch",
    ]
-    X = X[categorical_columns_subset + numerical_columns_subset]
+        keys = list(name_proba.keys())
-    X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
+        p = list(name_proba.values())
-    return X, y
+        p[-1] += 1.0 - np.sum(p)  # Fix floating point error
        x = rng.choice(keys, size=n_samples, p=p)
        series = pd.Series(
            x,
            dtype=pd.CategoricalDtype(
                # not NA
                filter(lambda x: isinstance(x, str), keys)
            ),
        )
        return series
    df["BldgType"] = synth_cat(
        {
            "1Fam": 0.835616,
            "2fmCon": 0.078082,
            "Duplex": 0.035616,
            "Twnhs": 0.029452,
            "TwnhsE": 0.021233,
        },
        1.0,
    )
    df["GarageFinish"] = synth_cat(
        {"Unf": 0.414384, "RFn": 0.289041, "Fin": 0.241096}, 0.94452
    )
    df["LotConfig"] = synth_cat(
        {
            "Corner": 0.180137,
            "CulDSac": 0.064384,
            "FR2": 0.032192,
            "FR3": 0.002740,
        },
        1.0,
    )
    df["Functional"] = synth_cat(
        {
            "Typ": 0.931506,
            "Min2": 0.023287,
            "Min1": 0.021232,
            "Mod": 0.010273,
            "Maj1": 0.009589,
            "Maj2": 0.003424,
            "Sev": 0.000684,
        },
        1.0,
    )
    df["MasVnrType"] = synth_cat(
        {
            "None": 0.591780,
            "BrkFace": 0.304794,
            "Stone": 0.087671,
            "BrkCmn": 0.010273,
        },
        0.99452,
    )
    df["HouseStyle"] = synth_cat(
        {
            "1Story": 0.497260,
            "2Story": 0.304794,
            "1.5Fin": 0.105479,
            "SLvl": 0.044520,
            "SFoyer": 0.025342,
            "1.5Unf": 0.009589,
            "2.5Unf": 0.007534,
            "2.5Fin": 0.005479,
        },
        1.0,
    )
    df["FireplaceQu"] = synth_cat(
        {
            "Gd": 0.260273,
            "TA": 0.214383,
            "Fa": 0.022602,
            "Ex": 0.016438,
            "Po": 0.013698,
        },
        0.527397,
    )
    df["ExterCond"] = synth_cat(
        {
            "TA": 0.878082,
            "Gd": 0.1,
            "Fa": 0.019178,
            "Ex": 0.002054,
            "Po": 0.000684,
        },
        1.0,
    )
    df["ExterQual"] = synth_cat(
        {
            "TA": 0.620547,
            "Gd": 0.334246,
            "Ex": 0.035616,
            "Fa": 0.009589,
        },
        1.0,
    )
    df["PoolQC"] = synth_cat(
        {
            "Gd": 0.002054,
            "Ex": 0.001369,
            "Fa": 0.001369,
        },
        0.004794,
    )
    # We focus on the cateogircal values here, for numerical features, simple normal
    # distribution is used, which doesn't match the original data.
    def synth_num(loc: float, std: float, density: float) -> pd.Series:
        x = rng.normal(loc=loc, scale=std, size=n_samples)
        n_nulls = int(n_samples * (1 - density))
        if np.abs(1.0 - density) > 1e-6 and n_nulls > 0:
            null_idx = rng.choice(n_samples, size=n_nulls, replace=False)
            x[null_idx] = np.nan
        return pd.Series(x, dtype=np.float64)
    df["3SsnPorch"] = synth_num(3.4095890410958902, 29.31733055678188, 1.0)
    df["Fireplaces"] = synth_num(0.613013698630137, 0.6446663863122295, 1.0)
    df["BsmtHalfBath"] = synth_num(0.057534246575342465, 0.23875264627921178, 1.0)
    df["HalfBath"] = synth_num(0.38287671232876713, 0.5028853810928914, 1.0)
    df["GarageCars"] = synth_num(1.7671232876712328, 0.7473150101111095, 1.0)
    df["TotRmsAbvGrd"] = synth_num(6.517808219178082, 1.6253932905840505, 1.0)
    df["BsmtFinSF1"] = synth_num(443.6397260273973, 456.0980908409277, 1.0)
    df["BsmtFinSF2"] = synth_num(46.54931506849315, 161.31927280654173, 1.0)
    df["GrLivArea"] = synth_num(1515.463698630137, 525.4803834232025, 1.0)
    df["ScreenPorch"] = synth_num(15.060958904109588, 55.757415281874174, 1.0)
    columns = list(df.columns)
    rng.shuffle(columns)
    df = df[columns]
    # linear interaction for testing purposes.
    y = np.zeros(shape=(n_samples,))
    for c in df.columns:
        if isinstance(df[c].dtype, pd.CategoricalDtype):
            y += df[c].cat.codes.astype(np.float64)
        else:
            y += df[c].values
    # Shift and scale to match the original y.
    y *= 79442.50288288662 / y.std()
    y += 180921.19589041095 - y.mean()
    return df, y
@memory.cache