From 932af821c557c910e42ca22b66fa895f3fbcbdb2 Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Fri, 8 May 2015 09:25:55 -0500
Subject: [PATCH 1/4] CLN: Remove unused import. Fix comment.

---
 wrapper/xgboost.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index ed0b1c2df..25ff8b1b2 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -15,7 +15,6 @@ import re
 import ctypes
 import platform
 import collections
-from io import BytesIO

 import numpy as np
 import scipy.sparse
@@ -492,7 +491,7 @@ class Booster(object):
     def save_raw(self):
         """
         Save the model to an in-memory buffer representation
-        
+
         Returns
         -------
         an in-memory buffer representation of the model
@@ -876,12 +875,12 @@ class XGBModel(XGBModelBase):
         self._Booster = None

     def __getstate__(self):
-        # can't pickle ctypes pointers so put _Booster in a BytesIO obj
-        this = self.__dict__.copy() # don't modify in place
+        # can't pickle ctypes pointers so put _Booster in a bytearray object
+        this = self.__dict__.copy()  # don't modify in place
         bst = this["_Booster"]
         if bst is not None:
             raw = this["_Booster"].save_raw()
-            this["_Booster"] = raw 
+            this["_Booster"] = raw
         return this

     def __setstate__(self, state):
@@ -894,7 +893,7 @@ class XGBModel(XGBModelBase):
         """ get the underlying xgboost Booster of this model
             will raise an exception when fit was not called
-        
+
         Returns
         -------
         booster : a xgboost booster of underlying model
@@ -902,7 +901,7 @@ class XGBModel(XGBModelBase):
         if self._Booster is None:
             raise XGBError('need to call fit beforehand')
         return self._Booster
-        
+
     def get_xgb_params(self):
         xgb_params = self.get_params()

From 99c2df99137bdf4e2bda28584001b33b86f672a4 Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Fri, 8 May 2015 14:34:37 -0500
Subject: [PATCH 2/4] EX: Show example of pickling and parallel use.

---
 demo/guide-python/sklearn_examples.py | 122 ++++++++++++++++----------
 1 file changed, 74 insertions(+), 48 deletions(-)

diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py
index dd0620a7c..b378d28cc 100755
--- a/demo/guide-python/sklearn_examples.py
+++ b/demo/guide-python/sklearn_examples.py
@@ -4,60 +4,86 @@ Created on 1 Apr 2015

 @author: Jamie Hall
 '''
+if __name__ == "__main__":
+    # NOTE: This *has* to be here and in the `__name__ == "__main__"` clause
+    # to run XGBoost in parallel, if XGBoost was built with OpenMP support.
+    # Otherwise, you can use fork, which is the default backend for joblib,
+    # and omit this.
+    from multiprocessing import set_start_method
+    set_start_method("forkserver")

-import xgboost as xgb
+    import pickle
+    import os
+    import xgboost as xgb

-import numpy as np
-from sklearn.cross_validation import KFold
-from sklearn.grid_search import GridSearchCV
-from sklearn.metrics import confusion_matrix, mean_squared_error
-from sklearn.datasets import load_iris, load_digits, load_boston
+    import numpy as np
+    from sklearn.cross_validation import KFold
+    from sklearn.grid_search import GridSearchCV
+    from sklearn.metrics import confusion_matrix, mean_squared_error
+    from sklearn.datasets import load_iris, load_digits, load_boston

-rng = np.random.RandomState(31337)
+    rng = np.random.RandomState(31337)

+    print("Zeros and Ones from the Digits dataset: binary classification")
+    digits = load_digits(2)
+    y = digits['target']
+    X = digits['data']
+    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
+    for train_index, test_index in kf:
+        xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
+        predictions = xgb_model.predict(X[test_index])
+        actuals = y[test_index]
+        print(confusion_matrix(actuals, predictions))

-print("Zeros and Ones from the Digits dataset: binary classification")
-digits = load_digits(2)
-y = digits['target']
-X = digits['data']
-kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
-for train_index, test_index in kf:
-    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
-    predictions = xgb_model.predict(X[test_index])
-    actuals = y[test_index]
-    print(confusion_matrix(actuals, predictions))
+    print("Iris: multiclass classification")
+    iris = load_iris()
+    y = iris['target']
+    X = iris['data']
+    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
+    for train_index, test_index in kf:
+        xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
+        predictions = xgb_model.predict(X[test_index])
+        actuals = y[test_index]
+        print(confusion_matrix(actuals, predictions))

-print("Iris: multiclass classification")
-iris = load_iris()
-y = iris['target']
-X = iris['data']
-kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
-for train_index, test_index in kf:
-    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
-    predictions = xgb_model.predict(X[test_index])
-    actuals = y[test_index]
-    print(confusion_matrix(actuals, predictions))
+    print("Boston Housing: regression")
+    boston = load_boston()
+    y = boston['target']
+    X = boston['data']
+    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
+    for train_index, test_index in kf:
+        xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
+        predictions = xgb_model.predict(X[test_index])
+        actuals = y[test_index]
+        print(mean_squared_error(actuals, predictions))

-print("Boston Housing: regression")
-boston = load_boston()
-y = boston['target']
-X = boston['data']
-kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
-for train_index, test_index in kf:
-    xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
-    predictions = xgb_model.predict(X[test_index])
-    actuals = y[test_index]
-    print(mean_squared_error(actuals, predictions))
-
-print("Parameter optimization")
-y = boston['target']
-X = boston['data']
-xgb_model = xgb.XGBRegressor()
-clf = GridSearchCV(xgb_model,
-                   {'max_depth': [2,4,6],
-                    'n_estimators': [50,100,200]}, verbose=1)
-clf.fit(X,y)
-print(clf.best_score_)
-print(clf.best_params_)
+    print("Parameter optimization")
+    y = boston['target']
+    X = boston['data']
+    xgb_model = xgb.XGBRegressor()
+    clf = GridSearchCV(xgb_model,
+                       {'max_depth': [2,4,6],
+                        'n_estimators': [50,100,200]}, verbose=1)
+    clf.fit(X,y)
+    print(clf.best_score_)
+    print(clf.best_params_)

+    # The sklearn API models are picklable
+    print("Pickling sklearn API models")
+    # must open in binary format to pickle
+    pickle.dump(clf, open("best_boston.pkl", "wb"))
+    clf2 = pickle.load(open("best_boston.pkl", "rb"))
+    print(np.allclose(clf.predict(X), clf2.predict(X)))

+    print("Parallel Parameter optimization")
+    os.environ["OMP_NUM_THREADS"] = "1"
+    y = boston['target']
+    X = boston['data']
+    xgb_model = xgb.XGBRegressor()
+    clf = GridSearchCV(xgb_model,
+                       {'max_depth': [2,4,6],
+                        'n_estimators': [50,100,200]}, verbose=1,
+                       n_jobs=2)
+    clf.fit(X, y)
+    print(clf.best_score_)
+    print(clf.best_params_)

From fa8c6e2f0b5f04b8b5b9c7fb12ed16ca46a9dc77 Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Fri, 8 May 2015 14:34:58 -0500
Subject: [PATCH 3/4] DOC: Add warning about fork + openmp

---
 wrapper/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/wrapper/README.md b/wrapper/README.md
index 9c0399693..b863353b1 100644
--- a/wrapper/README.md
+++ b/wrapper/README.md
@@ -7,6 +7,8 @@ Python
 * To make the python module, type ```./build.sh``` in the root directory of project
 * Install with `python setup.py install` from this directory.
 * Refer also to the walk through example in [demo folder](../demo/guide-python)
+* **NOTE**: if you want to run XGBoost in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without OpenMP support by running `make no_omp=1`. Otherwise, use the forkserver (available in Python 3.4 and later) or spawn backend. See the sklearn_examples.py demo.
+

 R
 =====

From 15ea00540a7c31d208d6a11c096b7a172ddadad2 Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Mon, 11 May 2015 09:30:51 -0500
Subject: [PATCH 4/4] EX: Make separate example for fork issue.

---
 demo/guide-python/sklearn_examples.py | 130 +++++++++++---------------
 demo/guide-python/sklearn_parallel.py |  35 +++++++
 2 files changed, 89 insertions(+), 76 deletions(-)
 create mode 100644 demo/guide-python/sklearn_parallel.py

diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py
index b378d28cc..ce8c8d01e 100755
--- a/demo/guide-python/sklearn_examples.py
+++ b/demo/guide-python/sklearn_examples.py
@@ -4,86 +4,64 @@ Created on 1 Apr 2015

 @author: Jamie Hall
 '''
-if __name__ == "__main__":
-    # NOTE: This *has* to be here and in the `__name__ == "__main__"` clause
-    # to run XGBoost in parallel, if XGBoost was built with OpenMP support.
-    # Otherwise, you can use fork, which is the default backend for joblib,
-    # and omit this.
-    from multiprocessing import set_start_method
-    set_start_method("forkserver")
+import pickle
+import xgboost as xgb

-    import pickle
-    import os
-    import xgboost as xgb
+import numpy as np
+from sklearn.cross_validation import KFold
+from sklearn.metrics import confusion_matrix, mean_squared_error
+from sklearn.grid_search import GridSearchCV
+from sklearn.datasets import load_iris, load_digits, load_boston

-    import numpy as np
-    from sklearn.cross_validation import KFold
-    from sklearn.grid_search import GridSearchCV
-    from sklearn.metrics import confusion_matrix, mean_squared_error
-    from sklearn.datasets import load_iris, load_digits, load_boston
+rng = np.random.RandomState(31337)

-    rng = np.random.RandomState(31337)
+print("Zeros and Ones from the Digits dataset: binary classification")
+digits = load_digits(2)
+y = digits['target']
+X = digits['data']
+kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
+for train_index, test_index in kf:
+    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
+    predictions = xgb_model.predict(X[test_index])
+    actuals = y[test_index]
+    print(confusion_matrix(actuals, predictions))

-    print("Zeros and Ones from the Digits dataset: binary classification")
-    digits = load_digits(2)
-    y = digits['target']
-    X = digits['data']
-    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
-    for train_index, test_index in kf:
-        xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
-        predictions = xgb_model.predict(X[test_index])
-        actuals = y[test_index]
-        print(confusion_matrix(actuals, predictions))
+print("Iris: multiclass classification")
+iris = load_iris()
+y = iris['target']
+X = iris['data']
+kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
+for train_index, test_index in kf:
+    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
+    predictions = xgb_model.predict(X[test_index])
+    actuals = y[test_index]
+    print(confusion_matrix(actuals, predictions))

-    print("Iris: multiclass classification")
-    iris = load_iris()
-    y = iris['target']
-    X = iris['data']
-    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
-    for train_index, test_index in kf:
-        xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
-        predictions = xgb_model.predict(X[test_index])
-        actuals = y[test_index]
-        print(confusion_matrix(actuals, predictions))
+print("Boston Housing: regression")
+boston = load_boston()
+y = boston['target']
+X = boston['data']
+kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
+for train_index, test_index in kf:
+    xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
+    predictions = xgb_model.predict(X[test_index])
+    actuals = y[test_index]
+    print(mean_squared_error(actuals, predictions))

-    print("Boston Housing: regression")
-    boston = load_boston()
-    y = boston['target']
-    X = boston['data']
-    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
-    for train_index, test_index in kf:
-        xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
-        predictions = xgb_model.predict(X[test_index])
-        actuals = y[test_index]
-        print(mean_squared_error(actuals, predictions))
+print("Parameter optimization")
+y = boston['target']
+X = boston['data']
+xgb_model = xgb.XGBRegressor()
+clf = GridSearchCV(xgb_model,
+                   {'max_depth': [2,4,6],
+                    'n_estimators': [50,100,200]}, verbose=1)
+clf.fit(X,y)
+print(clf.best_score_)
+print(clf.best_params_)

-    print("Parameter optimization")
-    y = boston['target']
-    X = boston['data']
-    xgb_model = xgb.XGBRegressor()
-    clf = GridSearchCV(xgb_model,
-                       {'max_depth': [2,4,6],
-                        'n_estimators': [50,100,200]}, verbose=1)
-    clf.fit(X,y)
-    print(clf.best_score_)
-    print(clf.best_params_)
-
-    # The sklearn API models are picklable
-    print("Pickling sklearn API models")
-    # must open in binary format to pickle
-    pickle.dump(clf, open("best_boston.pkl", "wb"))
-    clf2 = pickle.load(open("best_boston.pkl", "rb"))
-    print(np.allclose(clf.predict(X), clf2.predict(X)))
-
-    print("Parallel Parameter optimization")
-    os.environ["OMP_NUM_THREADS"] = "1"
-    y = boston['target']
-    X = boston['data']
-    xgb_model = xgb.XGBRegressor()
-    clf = GridSearchCV(xgb_model,
-                       {'max_depth': [2,4,6],
-                        'n_estimators': [50,100,200]}, verbose=1,
-                       n_jobs=2)
-    clf.fit(X, y)
-    print(clf.best_score_)
-    print(clf.best_params_)
+# The sklearn API models are picklable
+print("Pickling sklearn API models")
+# must open in binary format to pickle
+pickle.dump(clf, open("best_boston.pkl", "wb"))
+clf2 = pickle.load(open("best_boston.pkl", "rb"))
+print(np.allclose(clf.predict(X), clf2.predict(X)))
diff --git a/demo/guide-python/sklearn_parallel.py b/demo/guide-python/sklearn_parallel.py
new file mode 100644
index 000000000..803f3fac8
--- /dev/null
+++ b/demo/guide-python/sklearn_parallel.py
@@ -0,0 +1,35 @@
+import os
+
+if __name__ == "__main__":
+    # NOTE: On POSIX systems, this *has* to be here, inside the
+    # `__name__ == "__main__"` clause, to run XGBoost in parallel processes
+    # if XGBoost was built with OpenMP support. If you build XGBoost without
+    # OpenMP support, you can use fork, which is the default backend for
+    # joblib, and omit this.
+    try:
+        from multiprocessing import set_start_method
+    except ImportError:
+        raise ImportError("Unable to import multiprocessing.set_start_method."
+                          " This example requires Python 3.4 or later.")
+    set_start_method("forkserver")
+
+    import numpy as np
+    from sklearn.grid_search import GridSearchCV
+    from sklearn.datasets import load_boston
+    import xgboost as xgb
+
+    rng = np.random.RandomState(31337)
+
+    print("Parallel Parameter optimization")
+    boston = load_boston()
+
+    os.environ["OMP_NUM_THREADS"] = "2"  # or however many threads you want
+    y = boston['target']
+    X = boston['data']
+    xgb_model = xgb.XGBRegressor()
+    clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
+                                   'n_estimators': [50, 100, 200]}, verbose=1,
+                       n_jobs=2)
+    clf.fit(X, y)
+    print(clf.best_score_)
+    print(clf.best_params_)
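
The `__getstate__`/`__setstate__` change in the first patch is a general pattern for pickling objects that hold unpicklable C-level handles: swap the handle for its serialized bytes on the way out, and rebuild the handle from those bytes on the way in. Below is a minimal, self-contained sketch of that pattern. `FakeBooster` and `Model` are hypothetical stand-ins for illustration only, not the xgboost API; the real wrapper does the equivalent with the ctypes Booster handle and `Booster.save_raw()`.

import pickle
import threading


class FakeBooster:
    # Hypothetical stand-in for xgboost.Booster. The lock plays the role of
    # the ctypes handle: like a ctypes pointer, a lock cannot be pickled.
    def __init__(self, raw=b"serialized trees"):
        self.handle = threading.Lock()
        self._raw = raw

    def save_raw(self):
        # mirrors Booster.save_raw(): return an in-memory representation
        return self._raw


class Model:
    # Mirrors the XGBModel.__getstate__/__setstate__ logic from the first patch.
    def __init__(self):
        self._Booster = FakeBooster()

    def __getstate__(self):
        # can't pickle the handle, so ship the raw bytes instead
        this = self.__dict__.copy()  # don't modify in place
        bst = this["_Booster"]
        if bst is not None:
            this["_Booster"] = bst.save_raw()
        return this

    def __setstate__(self, state):
        # rebuild a booster from the raw buffer on unpickle
        raw = state["_Booster"]
        if raw is not None:
            state["_Booster"] = FakeBooster(raw)
        self.__dict__.update(state)


model = pickle.loads(pickle.dumps(Model()))
print(model._Booster.save_raw())  # b'serialized trees'

The same round trip is what makes the GridSearchCV pickling demo above work: the fitted estimator travels as plain bytes, and a usable booster is reconstructed on load.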