From c621775f34887862cd75f7fa7d0584ff9d1b256f Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Sun, 30 Jan 2022 04:27:57 -0800 Subject: [PATCH] Replace all uses of deprecated function sklearn.datasets.load_boston (#7373) * Replace all uses of deprecated function sklearn.datasets.load_boston * More renaming * Fix bad name * Update assertion * Fix n boosted rounds. * Avoid over regularization. * Rebase. * Avoid over regularization. * Whac-a-mole Co-authored-by: fis --- demo/guide-python/sklearn_examples.py | 14 +++----- demo/guide-python/sklearn_parallel.py | 7 ++-- demo/guide-python/update_process.py | 4 +-- tests/python-gpu/test_gpu_linear.py | 6 ++-- tests/python-gpu/test_gpu_with_sklearn.py | 2 +- tests/python/test_callback.py | 2 +- tests/python/test_demos.py | 4 +-- tests/python/test_linear.py | 8 ++--- tests/python/test_predict.py | 4 +-- tests/python/test_with_dask.py | 16 ++++----- tests/python/test_with_shap.py | 3 +- tests/python/test_with_sklearn.py | 44 ++++++++++------------- tests/python/testing.py | 8 +++-- 13 files changed, 56 insertions(+), 66 deletions(-) diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py index b5de652a6..e8bcc676d 100644 --- a/demo/guide-python/sklearn_examples.py +++ b/demo/guide-python/sklearn_examples.py @@ -12,7 +12,7 @@ import xgboost as xgb import numpy as np from sklearn.model_selection import KFold, train_test_split, GridSearchCV from sklearn.metrics import confusion_matrix, mean_squared_error -from sklearn.datasets import load_iris, load_digits, load_boston +from sklearn.datasets import load_iris, load_digits, fetch_california_housing rng = np.random.RandomState(31337) @@ -38,10 +38,8 @@ for train_index, test_index in kf.split(X): actuals = y[test_index] print(confusion_matrix(actuals, predictions)) -print("Boston Housing: regression") -boston = load_boston() -y = boston['target'] -X = boston['data'] +print("California Housing: regression") +X, y = fetch_california_housing(return_X_y=True) kf = KFold(n_splits=2, shuffle=True, random_state=rng) for train_index, test_index in kf.split(X): xgb_model = xgb.XGBRegressor(n_jobs=1).fit(X[train_index], y[train_index]) @@ -50,8 +48,6 @@ for train_index, test_index in kf.split(X): print(mean_squared_error(actuals, predictions)) print("Parameter optimization") -y = boston['target'] -X = boston['data'] xgb_model = xgb.XGBRegressor(n_jobs=1) clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6], @@ -63,8 +59,8 @@ print(clf.best_params_) # The sklearn API models are picklable print("Pickling sklearn API models") # must open in binary format to pickle -pickle.dump(clf, open("best_boston.pkl", "wb")) -clf2 = pickle.load(open("best_boston.pkl", "rb")) +pickle.dump(clf, open("best_calif.pkl", "wb")) +clf2 = pickle.load(open("best_calif.pkl", "rb")) print(np.allclose(clf.predict(X), clf2.predict(X))) # Early-stopping diff --git a/demo/guide-python/sklearn_parallel.py b/demo/guide-python/sklearn_parallel.py index dd472da67..c65fd7c22 100644 --- a/demo/guide-python/sklearn_parallel.py +++ b/demo/guide-python/sklearn_parallel.py @@ -3,16 +3,13 @@ Demo for using xgboost with sklearn =================================== """ from sklearn.model_selection import GridSearchCV -from sklearn.datasets import load_boston +from sklearn.datasets import fetch_california_housing import xgboost as xgb import multiprocessing if __name__ == "__main__": print("Parallel Parameter optimization") - boston = load_boston() - - y = boston['target'] - X = boston['data'] + X, y = 
fetch_california_housing(return_X_y=True) xgb_model = xgb.XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2) clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200]}, verbose=1, diff --git a/demo/guide-python/update_process.py b/demo/guide-python/update_process.py index 8ed03fd78..907399fcf 100644 --- a/demo/guide-python/update_process.py +++ b/demo/guide-python/update_process.py @@ -8,14 +8,14 @@ experiment. """ import xgboost as xgb -from sklearn.datasets import load_boston +from sklearn.datasets import fetch_california_housing import numpy as np def main(): n_rounds = 32 - X, y = load_boston(return_X_y=True) + X, y = fetch_california_housing(return_X_y=True) # Train a model first X_train = X[: X.shape[0] // 2] diff --git a/tests/python-gpu/test_gpu_linear.py b/tests/python-gpu/test_gpu_linear.py index e8ec23b72..9791169f8 100644 --- a/tests/python-gpu/test_gpu_linear.py +++ b/tests/python-gpu/test_gpu_linear.py @@ -43,8 +43,8 @@ class TestGPULinear: # We test a weaker condition that the loss has not increased between the first and last # iteration @given(parameter_strategy, strategies.integers(10, 50), - tm.dataset_strategy, strategies.floats(1e-5, 2.0), - strategies.floats(1e-5, 2.0)) + tm.dataset_strategy, strategies.floats(1e-5, 1.0), + strategies.floats(1e-5, 1.0)) @settings(deadline=None) def test_gpu_coordinate_regularised(self, param, num_rounds, dataset, alpha, lambd): assume(len(dataset.y) > 0) @@ -63,7 +63,7 @@ class TestGPULinear: import cupy params = {'booster': 'gblinear', 'updater': 'gpu_coord_descent', 'n_estimators': 100} - X, y = tm.get_boston() + X, y = tm.get_california_housing() cpu_model = xgb.XGBRegressor(**params) cpu_model.fit(X, y) cpu_predt = cpu_model.predict(X) diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py index ae359f2f1..87d5a651d 100644 --- a/tests/python-gpu/test_gpu_with_sklearn.py +++ b/tests/python-gpu/test_gpu_with_sklearn.py @@ -61,7 +61,7 @@ def test_boost_from_prediction_gpu_hist(): def test_num_parallel_tree(): - twskl.run_boston_housing_rf_regression("gpu_hist") + twskl.run_calif_housing_rf_regression("gpu_hist") @pytest.mark.skipif(**tm.no_pandas()) diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py index 08ba9ee79..dcd898ac0 100644 --- a/tests/python/test_callback.py +++ b/tests/python/test_callback.py @@ -384,7 +384,7 @@ class TestCallbacks: os.path.join(tmpdir, 'model_' + str(i) + '.pkl')) def test_callback_list(self): - X, y = tm.get_boston() + X, y = tm.get_california_housing() m = xgb.DMatrix(X, y) callbacks = [xgb.callback.EarlyStopping(rounds=10)] for i in range(4): diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py index b87317c47..4c1f1fb38 100644 --- a/tests/python/test_demos.py +++ b/tests/python/test_demos.py @@ -45,8 +45,8 @@ def test_sklearn_demo(): script = os.path.join(PYTHON_DEMO_DIR, 'sklearn_examples.py') cmd = ['python', script] subprocess.check_call(cmd) - assert os.path.exists('best_boston.pkl') - os.remove('best_boston.pkl') + assert os.path.exists('best_calif.pkl') + os.remove('best_calif.pkl') @pytest.mark.skipif(**tm.no_sklearn()) diff --git a/tests/python/test_linear.py b/tests/python/test_linear.py index 2ed5eea2f..635048ddb 100644 --- a/tests/python/test_linear.py +++ b/tests/python/test_linear.py @@ -39,8 +39,8 @@ class TestLinear: # We test a weaker condition that the loss has not increased between the first and last # iteration @given(parameter_strategy, strategies.integers(10, 50), - 
tm.dataset_strategy, coord_strategy, strategies.floats(1e-5, 2.0), - strategies.floats(1e-5, 2.0)) + tm.dataset_strategy, coord_strategy, strategies.floats(1e-5, 1.0), + strategies.floats(1e-5, 1.0)) @settings(deadline=None) def test_coordinate_regularised(self, param, num_rounds, dataset, coord_param, alpha, lambd): param['updater'] = 'coord_descent' @@ -69,8 +69,8 @@ class TestLinear: assert tm.non_increasing(sampled_result) @given(parameter_strategy, strategies.integers(10, 50), - tm.dataset_strategy, strategies.floats(1e-5, 2.0), - strategies.floats(1e-5, 2.0)) + tm.dataset_strategy, strategies.floats(1e-5, 1.0), + strategies.floats(1e-5, 1.0)) @settings(deadline=None) def test_shotgun_regularised(self, param, num_rounds, dataset, alpha, lambd): param['updater'] = 'shotgun' diff --git a/tests/python/test_predict.py b/tests/python/test_predict.py index 800a4838a..b34d508cd 100644 --- a/tests/python/test_predict.py +++ b/tests/python/test_predict.py @@ -88,8 +88,8 @@ def test_predict_leaf(): def test_predict_shape(): - from sklearn.datasets import load_boston - X, y = load_boston(return_X_y=True) + from sklearn.datasets import fetch_california_housing + X, y = fetch_california_housing(return_X_y=True) reg = xgb.XGBRegressor(n_estimators=1) reg.fit(X, y) predt = reg.get_booster().predict(xgb.DMatrix(X), strict_shape=True) diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index 243c75e21..92869515c 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -425,8 +425,8 @@ def test_boost_from_prediction(tree_method: str, client: "Client") -> None: def test_inplace_predict(client: "Client") -> None: - from sklearn.datasets import load_boston - X_, y_ = load_boston(return_X_y=True) + from sklearn.datasets import fetch_california_housing + X_, y_ = fetch_california_housing(return_X_y=True) X, y = dd.from_array(X_, chunksize=32), dd.from_array(y_, chunksize=32) reg = xgb.dask.DaskXGBRegressor(n_estimators=4).fit(X, y) booster = reg.get_booster() @@ -1405,8 +1405,8 @@ class TestWithDask: @pytest.mark.skipif(**tm.no_dask()) @pytest.mark.skipif(**tm.no_sklearn()) def test_custom_objective(self, client: "Client") -> None: - from sklearn.datasets import load_boston - X, y = load_boston(return_X_y=True) + from sklearn.datasets import fetch_california_housing + X, y = fetch_california_housing(return_X_y=True) X, y = da.from_array(X), da.from_array(y) rounds = 20 @@ -1552,8 +1552,8 @@ class TestWithDask: assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin, 1e-5, 1e-5) def test_shap(self, client: "Client") -> None: - from sklearn.datasets import load_boston, load_digits - X, y = load_boston(return_X_y=True) + from sklearn.datasets import fetch_california_housing, load_digits + X, y = fetch_california_housing(return_X_y=True) params: Dict[str, Any] = {'objective': 'reg:squarederror'} self.run_shap(X, y, params, client) @@ -1597,8 +1597,8 @@ class TestWithDask: 1e-5, 1e-5) def test_shap_interactions(self, client: "Client") -> None: - from sklearn.datasets import load_boston - X, y = load_boston(return_X_y=True) + from sklearn.datasets import fetch_california_housing + X, y = fetch_california_housing(return_X_y=True) params = {'objective': 'reg:squarederror'} self.run_shap_interactions(X, y, params, client) diff --git a/tests/python/test_with_shap.py b/tests/python/test_with_shap.py index 253ce25e9..1e03e0700 100644 --- a/tests/python/test_with_shap.py +++ b/tests/python/test_with_shap.py @@ -14,7 +14,8 @@ pytestmark = 
pytest.mark.skipif(shap is None, reason="Requires shap package") # Check integration is not broken from xgboost side # Changes in binary format may cause problems def test_with_shap(): - X, y = shap.datasets.boston() + from sklearn.datasets import fetch_california_housing + X, y = fetch_california_housing(return_X_y=True) dtrain = xgb.DMatrix(X, label=y) model = xgb.train({"learning_rate": 0.01}, dtrain, 10) explainer = shap.TreeExplainer(model) diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 98ae09ed7..46c40da4f 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -328,16 +328,16 @@ def test_select_feature(): def test_num_parallel_tree(): - from sklearn.datasets import load_boston + from sklearn.datasets import fetch_california_housing reg = xgb.XGBRegressor(n_estimators=4, num_parallel_tree=4, tree_method='hist') - boston = load_boston() - bst = reg.fit(X=boston['data'], y=boston['target']) + X, y = fetch_california_housing(return_X_y=True) + bst = reg.fit(X=X, y=y) dump = bst.get_booster().get_dump(dump_format='json') assert len(dump) == 16 reg = xgb.XGBRFRegressor(n_estimators=4) - bst = reg.fit(X=boston['data'], y=boston['target']) + bst = reg.fit(X=X, y=y) dump = bst.get_booster().get_dump(dump_format='json') assert len(dump) == 4 @@ -346,14 +346,12 @@ def test_num_parallel_tree(): 'num_parallel_tree']) == 4 -def test_boston_housing_regression(): +def test_calif_housing_regression(): from sklearn.metrics import mean_squared_error - from sklearn.datasets import load_boston + from sklearn.datasets import fetch_california_housing from sklearn.model_selection import KFold - boston = load_boston() - y = boston['target'] - X = boston['data'] + X, y = fetch_california_housing(return_X_y=True) kf = KFold(n_splits=2, shuffle=True, random_state=rng) for train_index, test_index in kf.split(X, y): xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index]) @@ -377,12 +375,12 @@ def test_boston_housing_regression(): xgb_model.feature_names_in_ -def run_boston_housing_rf_regression(tree_method): +def run_calif_housing_rf_regression(tree_method): from sklearn.metrics import mean_squared_error - from sklearn.datasets import load_boston + from sklearn.datasets import fetch_california_housing from sklearn.model_selection import KFold - X, y = load_boston(return_X_y=True) + X, y = fetch_california_housing(return_X_y=True) kf = KFold(n_splits=2, shuffle=True, random_state=rng) for train_index, test_index in kf.split(X, y): xgb_model = xgb.XGBRFRegressor(random_state=42, tree_method=tree_method).fit( @@ -397,29 +395,27 @@ def run_boston_housing_rf_regression(tree_method): rfreg.fit(X, y, early_stopping_rounds=10) -def test_boston_housing_rf_regression(): - run_boston_housing_rf_regression("hist") +def test_calif_housing_rf_regression(): + run_calif_housing_rf_regression("hist") def test_parameter_tuning(): from sklearn.model_selection import GridSearchCV - from sklearn.datasets import load_boston + from sklearn.datasets import fetch_california_housing - boston = load_boston() - y = boston['target'] - X = boston['data'] + X, y = fetch_california_housing(return_X_y=True) xgb_model = xgb.XGBRegressor(learning_rate=0.1) clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200]}, cv=3, verbose=1) clf.fit(X, y) assert clf.best_score_ < 0.7 - assert clf.best_params_ == {'n_estimators': 100, 'max_depth': 4} + assert clf.best_params_ == {'n_estimators': 200, 'max_depth': 4} def 
test_regression_with_custom_objective(): from sklearn.metrics import mean_squared_error - from sklearn.datasets import load_boston + from sklearn.datasets import fetch_california_housing from sklearn.model_selection import KFold def objective_ls(y_true, y_pred): @@ -427,9 +423,7 @@ def test_regression_with_custom_objective(): hess = np.ones(len(y_true)) return grad, hess - boston = load_boston() - y = boston['target'] - X = boston['data'] + X, y = fetch_california_housing(return_X_y=True) kf = KFold(n_splits=2, shuffle=True, random_state=rng) for train_index, test_index in kf.split(X, y): xgb_model = xgb.XGBRegressor(objective=objective_ls).fit( @@ -841,13 +835,13 @@ def test_save_load_model(): def test_RFECV(): - from sklearn.datasets import load_boston + from sklearn.datasets import fetch_california_housing from sklearn.datasets import load_breast_cancer from sklearn.datasets import load_iris from sklearn.feature_selection import RFECV # Regression - X, y = load_boston(return_X_y=True) + X, y = fetch_california_housing(return_X_y=True) bst = xgb.XGBRegressor(booster='gblinear', learning_rate=0.1, n_estimators=10, objective='reg:squarederror', diff --git a/tests/python/testing.py b/tests/python/testing.py index 5f8aef124..d2b45bdec 100644 --- a/tests/python/testing.py +++ b/tests/python/testing.py @@ -229,8 +229,8 @@ class TestDataset: @memory.cache -def get_boston(): - data = datasets.load_boston() +def get_california_housing(): + data = datasets.fetch_california_housing() return data.data, data.target @@ -315,7 +315,9 @@ def make_categorical( _unweighted_datasets_strategy = strategies.sampled_from( [ - TestDataset("boston", get_boston, "reg:squarederror", "rmse"), + TestDataset( + "calif_housing", get_california_housing, "reg:squarederror", "rmse" + ), TestDataset("digits", get_digits, "multi:softmax", "mlogloss"), TestDataset("cancer", get_cancer, "binary:logistic", "logloss"), TestDataset(
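
For reference, a minimal sketch of the migration pattern this patch applies throughout the demos and tests. The estimator settings below are illustrative assumptions, not values taken from the patch:

    # load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
    # fetch_california_housing is the replacement regression dataset used here.
    # return_X_y=True yields (X, y) arrays directly, replacing the old
    # boston['data'] / boston['target'] access pattern.
    from sklearn.datasets import fetch_california_housing

    import xgboost as xgb

    X, y = fetch_california_housing(return_X_y=True)

    # Hyperparameters are illustrative only (assumed, not from the patch).
    reg = xgb.XGBRegressor(n_estimators=10, max_depth=4, n_jobs=1)
    reg.fit(X, y)
    print(reg.predict(X[:5]))

Unlike load_boston, fetch_california_housing downloads the data on first call and caches it under ~/scikit_learn_data, which is why tests/python/testing.py keeps the loader wrapped in memory.cache.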