Replace all uses of deprecated function sklearn.datasets.load_boston (#7373)

* Replace all uses of deprecated function sklearn.datasets.load_boston
* More renaming
* Fix bad name
* Update assertion
* Fix n boosted rounds.
* Avoid over regularization.
* Rebase.
* Avoid over regularization.
* Whac-a-mole

Co-authored-by: fis <jm.yuan@outlook.com>
2022-01-30 04:27:57 -08:00
parent b4340abf56
commit c621775f34
13 changed files with 56 additions and 66 deletions
--- a/tests/python/test_callback.py
+++ b/tests/python/test_callback.py
@@ -384,7 +384,7 @@ class TestCallbacks:
                    os.path.join(tmpdir, 'model_' + str(i) + '.pkl'))

    def test_callback_list(self):
-        X, y = tm.get_boston()
+        X, y = tm.get_california_housing()
        m = xgb.DMatrix(X, y)
        callbacks = [xgb.callback.EarlyStopping(rounds=10)]
        for i in range(4):
--- a/tests/python/test_demos.py
+++ b/tests/python/test_demos.py
@@ -45,8 +45,8 @@ def test_sklearn_demo():
    script = os.path.join(PYTHON_DEMO_DIR, 'sklearn_examples.py')
    cmd = ['python', script]
    subprocess.check_call(cmd)
-    assert os.path.exists('best_boston.pkl')
-    os.remove('best_boston.pkl')
+    assert os.path.exists('best_calif.pkl')
+    os.remove('best_calif.pkl')


@pytest.mark.skipif(**tm.no_sklearn())
--- a/tests/python/test_linear.py
+++ b/tests/python/test_linear.py
@@ -39,8 +39,8 @@ class TestLinear:
    # We test a weaker condition that the loss has not increased between the first and last
    # iteration
    @given(parameter_strategy, strategies.integers(10, 50),
-           tm.dataset_strategy, coord_strategy, strategies.floats(1e-5, 2.0),
-           strategies.floats(1e-5, 2.0))
+           tm.dataset_strategy, coord_strategy, strategies.floats(1e-5, 1.0),
+           strategies.floats(1e-5, 1.0))
    @settings(deadline=None)
    def test_coordinate_regularised(self, param, num_rounds, dataset, coord_param, alpha, lambd):
        param['updater'] = 'coord_descent'
@@ -69,8 +69,8 @@ class TestLinear:
        assert tm.non_increasing(sampled_result)

    @given(parameter_strategy, strategies.integers(10, 50),
-           tm.dataset_strategy, strategies.floats(1e-5, 2.0),
-           strategies.floats(1e-5, 2.0))
+           tm.dataset_strategy, strategies.floats(1e-5, 1.0),
+           strategies.floats(1e-5, 1.0))
    @settings(deadline=None)
    def test_shotgun_regularised(self, param, num_rounds, dataset, alpha, lambd):
        param['updater'] = 'shotgun'
--- a/tests/python/test_predict.py
+++ b/tests/python/test_predict.py
@@ -88,8 +88,8 @@ def test_predict_leaf():


 def test_predict_shape():
-    from sklearn.datasets import load_boston
-    X, y = load_boston(return_X_y=True)
+    from sklearn.datasets import fetch_california_housing
+    X, y = fetch_california_housing(return_X_y=True)
    reg = xgb.XGBRegressor(n_estimators=1)
    reg.fit(X, y)
    predt = reg.get_booster().predict(xgb.DMatrix(X), strict_shape=True)
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -425,8 +425,8 @@ def test_boost_from_prediction(tree_method: str, client: "Client") -> None:


 def test_inplace_predict(client: "Client") -> None:
-    from sklearn.datasets import load_boston
-    X_, y_ = load_boston(return_X_y=True)
+    from sklearn.datasets import fetch_california_housing
+    X_, y_ = fetch_california_housing(return_X_y=True)
    X, y = dd.from_array(X_, chunksize=32), dd.from_array(y_, chunksize=32)
    reg = xgb.dask.DaskXGBRegressor(n_estimators=4).fit(X, y)
    booster = reg.get_booster()
@@ -1405,8 +1405,8 @@ class TestWithDask:
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_custom_objective(self, client: "Client") -> None:
-        from sklearn.datasets import load_boston
-        X, y = load_boston(return_X_y=True)
+        from sklearn.datasets import fetch_california_housing
+        X, y = fetch_california_housing(return_X_y=True)
        X, y = da.from_array(X), da.from_array(y)
        rounds = 20

@@ -1552,8 +1552,8 @@ class TestWithDask:
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin, 1e-5, 1e-5)

    def test_shap(self, client: "Client") -> None:
-        from sklearn.datasets import load_boston, load_digits
-        X, y = load_boston(return_X_y=True)
+        from sklearn.datasets import fetch_california_housing, load_digits
+        X, y = fetch_california_housing(return_X_y=True)
        params: Dict[str, Any] = {'objective': 'reg:squarederror'}
        self.run_shap(X, y, params, client)

@@ -1597,8 +1597,8 @@ class TestWithDask:
                           1e-5, 1e-5)

    def test_shap_interactions(self, client: "Client") -> None:
-        from sklearn.datasets import load_boston
-        X, y = load_boston(return_X_y=True)
+        from sklearn.datasets import fetch_california_housing
+        X, y = fetch_california_housing(return_X_y=True)
        params = {'objective': 'reg:squarederror'}
        self.run_shap_interactions(X, y, params, client)

--- a/tests/python/test_with_shap.py
+++ b/tests/python/test_with_shap.py
@@ -14,7 +14,8 @@ pytestmark = pytest.mark.skipif(shap is None, reason="Requires shap package")
 # Check integration is not broken from xgboost side
 # Changes in binary format may cause problems
 def test_with_shap():
-    X, y = shap.datasets.boston()
+    from sklearn.datasets import fetch_california_housing
+    X, y = fetch_california_housing(return_X_y=True)
    dtrain = xgb.DMatrix(X, label=y)
    model = xgb.train({"learning_rate": 0.01}, dtrain, 10)
    explainer = shap.TreeExplainer(model)
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -328,16 +328,16 @@ def test_select_feature():


 def test_num_parallel_tree():
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
    reg = xgb.XGBRegressor(n_estimators=4, num_parallel_tree=4,
                           tree_method='hist')
-    boston = load_boston()
-    bst = reg.fit(X=boston['data'], y=boston['target'])
+    X, y = fetch_california_housing(return_X_y=True)
+    bst = reg.fit(X=X, y=y)
    dump = bst.get_booster().get_dump(dump_format='json')
    assert len(dump) == 16

    reg = xgb.XGBRFRegressor(n_estimators=4)
-    bst = reg.fit(X=boston['data'], y=boston['target'])
+    bst = reg.fit(X=X, y=y)
    dump = bst.get_booster().get_dump(dump_format='json')
    assert len(dump) == 4

@@ -346,14 +346,12 @@ def test_num_parallel_tree():
        'num_parallel_tree']) == 4


-def test_boston_housing_regression():
+def test_calif_housing_regression():
    from sklearn.metrics import mean_squared_error
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
    from sklearn.model_selection import KFold

-    boston = load_boston()
-    y = boston['target']
-    X = boston['data']
+    X, y = fetch_california_housing(return_X_y=True)
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
@@ -377,12 +375,12 @@ def test_boston_housing_regression():
            xgb_model.feature_names_in_


-def run_boston_housing_rf_regression(tree_method):
+def run_calif_housing_rf_regression(tree_method):
    from sklearn.metrics import mean_squared_error
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
    from sklearn.model_selection import KFold

-    X, y = load_boston(return_X_y=True)
+    X, y = fetch_california_housing(return_X_y=True)
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRFRegressor(random_state=42, tree_method=tree_method).fit(
@@ -397,29 +395,27 @@ def run_boston_housing_rf_regression(tree_method):
        rfreg.fit(X, y, early_stopping_rounds=10)


-def test_boston_housing_rf_regression():
-    run_boston_housing_rf_regression("hist")
+def test_calif_housing_rf_regression():
+    run_calif_housing_rf_regression("hist")


 def test_parameter_tuning():
    from sklearn.model_selection import GridSearchCV
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing

-    boston = load_boston()
-    y = boston['target']
-    X = boston['data']
+    X, y = fetch_california_housing(return_X_y=True)
    xgb_model = xgb.XGBRegressor(learning_rate=0.1)
    clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                                   'n_estimators': [50, 100, 200]},
                       cv=3, verbose=1)
    clf.fit(X, y)
    assert clf.best_score_ < 0.7
-    assert clf.best_params_ == {'n_estimators': 100, 'max_depth': 4}
+    assert clf.best_params_ == {'n_estimators': 200, 'max_depth': 4}


 def test_regression_with_custom_objective():
    from sklearn.metrics import mean_squared_error
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
    from sklearn.model_selection import KFold

    def objective_ls(y_true, y_pred):
@@ -427,9 +423,7 @@ def test_regression_with_custom_objective():
        hess = np.ones(len(y_true))
        return grad, hess

-    boston = load_boston()
-    y = boston['target']
-    X = boston['data']
+    X, y = fetch_california_housing(return_X_y=True)
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor(objective=objective_ls).fit(
@@ -841,13 +835,13 @@ def test_save_load_model():


 def test_RFECV():
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
    from sklearn.datasets import load_breast_cancer
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import RFECV

    # Regression
-    X, y = load_boston(return_X_y=True)
+    X, y = fetch_california_housing(return_X_y=True)
    bst = xgb.XGBRegressor(booster='gblinear', learning_rate=0.1,
                           n_estimators=10,
                           objective='reg:squarederror',
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -229,8 +229,8 @@ class TestDataset:


@memory.cache
-def get_boston():
-    data = datasets.load_boston()
+def get_california_housing():
+    data = datasets.fetch_california_housing()
    return data.data, data.target


@@ -315,7 +315,9 @@ def make_categorical(

 _unweighted_datasets_strategy = strategies.sampled_from(
    [
-        TestDataset("boston", get_boston, "reg:squarederror", "rmse"),
+        TestDataset(
+            "calif_housing", get_california_housing, "reg:squarederror", "rmse"
+        ),
        TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
        TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
        TestDataset(