Replace all uses of deprecated function sklearn.datasets.load_boston (#7373)

* Replace all uses of deprecated function sklearn.datasets.load_boston
* More renaming
* Fix bad name
* Update assertion
* Fix n boosted rounds.
* Avoid over regularization.
* Rebase.
* Avoid over regularization.
* Whac-a-mole

Co-authored-by: fis <jm.yuan@outlook.com>
2022-01-30 04:27:57 -08:00
parent b4340abf56
commit c621775f34
13 changed files with 56 additions and 66 deletions
--- a/demo/guide-python/sklearn_examples.py
+++ b/demo/guide-python/sklearn_examples.py
@@ -12,7 +12,7 @@ import xgboost as xgb
 import numpy as np
 from sklearn.model_selection import KFold, train_test_split, GridSearchCV
 from sklearn.metrics import confusion_matrix, mean_squared_error
-from sklearn.datasets import load_iris, load_digits, load_boston
+from sklearn.datasets import load_iris, load_digits, fetch_california_housing

 rng = np.random.RandomState(31337)

@@ -38,10 +38,8 @@ for train_index, test_index in kf.split(X):
    actuals = y[test_index]
    print(confusion_matrix(actuals, predictions))

-print("Boston Housing: regression")
-boston = load_boston()
-y = boston['target']
-X = boston['data']
+print("California Housing: regression")
+X, y = fetch_california_housing(return_X_y=True)
 kf = KFold(n_splits=2, shuffle=True, random_state=rng)
 for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBRegressor(n_jobs=1).fit(X[train_index], y[train_index])
@@ -50,8 +48,6 @@ for train_index, test_index in kf.split(X):
    print(mean_squared_error(actuals, predictions))

 print("Parameter optimization")
-y = boston['target']
-X = boston['data']
 xgb_model = xgb.XGBRegressor(n_jobs=1)
 clf = GridSearchCV(xgb_model,
                   {'max_depth': [2, 4, 6],
@@ -63,8 +59,8 @@ print(clf.best_params_)
 # The sklearn API models are picklable
 print("Pickling sklearn API models")
 # must open in binary format to pickle
-pickle.dump(clf, open("best_boston.pkl", "wb"))
-clf2 = pickle.load(open("best_boston.pkl", "rb"))
+pickle.dump(clf, open("best_calif.pkl", "wb"))
+clf2 = pickle.load(open("best_calif.pkl", "rb"))
 print(np.allclose(clf.predict(X), clf2.predict(X)))

 # Early-stopping
--- a/demo/guide-python/sklearn_parallel.py
+++ b/demo/guide-python/sklearn_parallel.py
@@ -3,16 +3,13 @@ Demo for using xgboost with sklearn
 ===================================
 """
 from sklearn.model_selection import GridSearchCV
-from sklearn.datasets import load_boston
+from sklearn.datasets import fetch_california_housing
 import xgboost as xgb
 import multiprocessing

 if __name__ == "__main__":
    print("Parallel Parameter optimization")
-    boston = load_boston()
-
-    y = boston['target']
-    X = boston['data']
+    X, y = fetch_california_housing(return_X_y=True)
    xgb_model = xgb.XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2)
    clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                                   'n_estimators': [50, 100, 200]}, verbose=1,
--- a/demo/guide-python/update_process.py
+++ b/demo/guide-python/update_process.py
@@ -8,14 +8,14 @@ experiment.
 """

 import xgboost as xgb
-from sklearn.datasets import load_boston
+from sklearn.datasets import fetch_california_housing
 import numpy as np


 def main():
    n_rounds = 32

-    X, y = load_boston(return_X_y=True)
+    X, y = fetch_california_housing(return_X_y=True)

    # Train a model first
    X_train = X[: X.shape[0] // 2]