Synthesize the AMES housing dataset for tests. (#9963)

2024-01-09 00:54:23 +08:00 · 2024-01-09 00:54:23 +08:00 · fa5e2f6c45
commit fa5e2f6c45
parent 9a30bdd313
1 changed files with 178 additions and 32 deletions
--- a/python-package/xgboost/testing/data.py
+++ b/python-package/xgboost/testing/data.py
@ -4,10 +4,11 @@ import os
 import zipfile
 from dataclasses import dataclass
 from typing import (
+    TYPE_CHECKING,
    Any,
    Callable,
+    Dict,
    Generator,
-    List,
    NamedTuple,
    Optional,
    Tuple,
@ -25,6 +26,11 @@ from scipy import sparse
 import xgboost
 from xgboost.data import pandas_pyarrow_mapper

+if TYPE_CHECKING:
+    from ..compat import DataFrame as DataFrameT
+else:
+    DataFrameT = Any
+
 joblib = pytest.importorskip("joblib")
 memory = joblib.Memory("./cachedir", verbose=0)

@ -256,46 +262,186 @@ def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
    return X, y


+# pylint: disable=too-many-statements
@memory.cache
-def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
-    """
+def get_ames_housing() -> Tuple[DataFrameT, np.ndarray]:
+    """Get a synthetic version of the Ames housing dataset.
+
+    The real one can be obtained via:
+
+    .. code-block::
+
+        from sklearn import datasets
+
+        datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
+
    Number of samples: 1460
    Number of features: 20
    Number of categorical features: 10
    Number of numerical features: 10
    """
-    datasets = pytest.importorskip("sklearn.datasets")
-    X, y = datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
+    pytest.importorskip("pandas")
+    import pandas as pd

-    categorical_columns_subset: List[str] = [
-        "BldgType",  # 5 cats, no nan
-        "GarageFinish",  # 3 cats, nan
-        "LotConfig",  # 5 cats, no nan
-        "Functional",  # 7 cats, no nan
-        "MasVnrType",  # 4 cats, nan
-        "HouseStyle",  # 8 cats, no nan
-        "FireplaceQu",  # 5 cats, nan
-        "ExterCond",  # 5 cats, no nan
-        "ExterQual",  # 4 cats, no nan
-        "PoolQC",  # 3 cats, nan
-    ]
+    rng = np.random.default_rng(1994)
+    n_samples = 1460
+    df = pd.DataFrame()

-    numerical_columns_subset: List[str] = [
-        "3SsnPorch",
-        "Fireplaces",
-        "BsmtHalfBath",
-        "HalfBath",
-        "GarageCars",
-        "TotRmsAbvGrd",
-        "BsmtFinSF1",
-        "BsmtFinSF2",
-        "GrLivArea",
-        "ScreenPorch",
-    ]
+    def synth_cat(
+        name_proba: Dict[Union[str, float], float], density: float
+    ) -> pd.Series:
+        """Sample a categorical column with ``n_samples`` rows.
+
+        ``name_proba`` maps each category name to its probability and ``density``
+        is the fraction of non-missing rows; the remaining mass is sampled as NaN.
+        """
+        # Work on a copy so that the caller's mapping is never modified.
+        proba = dict(name_proba)
+        n_nulls = int(n_samples * (1 - density))
+        if np.abs(1.0 - density) > 1e-6 and n_nulls > 0:
+            proba[np.nan] = 1.0 - density
+
+        keys = list(proba.keys())
+        p = list(proba.values())
+        # Any residual mass is assigned to the last key (NaN when it is present).
+        p[-1] += 1.0 - np.sum(p)  # Fix floating point error
+        x = rng.choice(keys, size=n_samples, p=p)
+
+        # Only the string keys are declared as categories (not NA); a sampled value
+        # outside of them ends up as a missing entry in the series.
+        categories = [k for k in keys if isinstance(k, str)]
+        return pd.Series(x, dtype=pd.CategoricalDtype(categories))
+
+    df["BldgType"] = synth_cat(
+        {
+            "1Fam": 0.835616,
+            "2fmCon": 0.078082,
+            "Duplex": 0.035616,
+            "Twnhs": 0.029452,
+            "TwnhsE": 0.021233,
+        },
+        1.0,
+    )
+    df["GarageFinish"] = synth_cat(
+        {"Unf": 0.414384, "RFn": 0.289041, "Fin": 0.241096}, 0.94452
+    )
+    # 5 cats, no nan. "Inside" must be listed explicitly: synth_cat assigns any
+    # probability mass that is missing from the table to the last key.
+    df["LotConfig"] = synth_cat(
+        {
+            "Inside": 0.720548,
+            "Corner": 0.180137,
+            "CulDSac": 0.064384,
+            "FR2": 0.032192,
+            "FR3": 0.002740,
+        },
+        1.0,
+    )
+    df["Functional"] = synth_cat(
+        {
+            "Typ": 0.931506,
+            "Min2": 0.023287,
+            "Min1": 0.021232,
+            "Mod": 0.010273,
+            "Maj1": 0.009589,
+            "Maj2": 0.003424,
+            "Sev": 0.000684,
+        },
+        1.0,
+    )
+    df["MasVnrType"] = synth_cat(
+        {
+            "None": 0.591780,
+            "BrkFace": 0.304794,
+            "Stone": 0.087671,
+            "BrkCmn": 0.010273,
+        },
+        0.99452,
+    )
+    df["HouseStyle"] = synth_cat(
+        {
+            "1Story": 0.497260,
+            "2Story": 0.304794,
+            "1.5Fin": 0.105479,
+            "SLvl": 0.044520,
+            "SFoyer": 0.025342,
+            "1.5Unf": 0.009589,
+            "2.5Unf": 0.007534,
+            "2.5Fin": 0.005479,
+        },
+        1.0,
+    )
+    df["FireplaceQu"] = synth_cat(
+        {
+            "Gd": 0.260273,
+            "TA": 0.214383,
+            "Fa": 0.022602,
+            "Ex": 0.016438,
+            "Po": 0.013698,
+        },
+        0.527397,
+    )
+    df["ExterCond"] = synth_cat(
+        {
+            "TA": 0.878082,
+            "Gd": 0.1,
+            "Fa": 0.019178,
+            "Ex": 0.002054,
+            "Po": 0.000684,
+        },
+        1.0,
+    )
+    df["ExterQual"] = synth_cat(
+        {
+            "TA": 0.620547,
+            "Gd": 0.334246,
+            "Ex": 0.035616,
+            "Fa": 0.009589,
+        },
+        1.0,
+    )
+    df["PoolQC"] = synth_cat(
+        {
+            "Gd": 0.002054,
+            "Ex": 0.001369,
+            "Fa": 0.001369,
+        },
+        0.004794,
+    )
+
+    # We focus on the categorical values here. For numerical features, a simple
+    # normal distribution is used, which doesn't match the original data.
+    def synth_num(loc: float, std: float, density: float) -> pd.Series:
+        """Draw a normally distributed float64 column with ``n_samples`` rows.
+
+        A ``1 - density`` share of the rows (rounded down) is replaced by NaN.
+        """
+        values = rng.normal(loc=loc, scale=std, size=n_samples)
+        n_missing = int(n_samples * (1 - density))
+        has_missing = np.abs(1.0 - density) > 1e-6 and n_missing > 0
+        if has_missing:
+            # Pick distinct row positions to blank out.
+            missing_rows = rng.choice(n_samples, size=n_missing, replace=False)
+            values[missing_rows] = np.nan
+        return pd.Series(values, dtype=np.float64)
+
+    df["3SsnPorch"] = synth_num(3.4095890410958902, 29.31733055678188, 1.0)
+    df["Fireplaces"] = synth_num(0.613013698630137, 0.6446663863122295, 1.0)
+    df["BsmtHalfBath"] = synth_num(0.057534246575342465, 0.23875264627921178, 1.0)
+    df["HalfBath"] = synth_num(0.38287671232876713, 0.5028853810928914, 1.0)
+    df["GarageCars"] = synth_num(1.7671232876712328, 0.7473150101111095, 1.0)
+    df["TotRmsAbvGrd"] = synth_num(6.517808219178082, 1.6253932905840505, 1.0)
+    df["BsmtFinSF1"] = synth_num(443.6397260273973, 456.0980908409277, 1.0)
+    df["BsmtFinSF2"] = synth_num(46.54931506849315, 161.31927280654173, 1.0)
+    df["GrLivArea"] = synth_num(1515.463698630137, 525.4803834232025, 1.0)
+    df["ScreenPorch"] = synth_num(15.060958904109588, 55.757415281874174, 1.0)
+
+    columns = list(df.columns)
+    rng.shuffle(columns)
+    df = df[columns]
+
+    # linear interaction for testing purposes.
+    y = np.zeros(shape=(n_samples,))
+    for c in df.columns:
+        if isinstance(df[c].dtype, pd.CategoricalDtype):
+            y += df[c].cat.codes.astype(np.float64)
+        else:
+            y += df[c].values
+
+    # Shift and scale to match the original y.
+    y *= 79442.50288288662 / y.std()
+    y += 180921.19589041095 - y.mean()
+
+    return df, y


@memory.cache