""" Getting started with XGBoost ============================ This is a simple example of using the native XGBoost interface, there are other interfaces in the Python package like scikit-learn interface and Dask interface. See :doc:`/python/python_intro` and :doc:`/tutorials/index` for other references. """ import os import pickle import numpy as np from sklearn.datasets import load_svmlight_file import xgboost as xgb # Make sure the demo knows where to load the data. CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) XGBOOST_ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR)) DEMO_DIR = os.path.join(XGBOOST_ROOT_DIR, "demo") # X is a scipy csr matrix, XGBoost supports many other input types, X, y = load_svmlight_file(os.path.join(DEMO_DIR, "data", "agaricus.txt.train")) dtrain = xgb.DMatrix(X, y) # validation set X_test, y_test = load_svmlight_file(os.path.join(DEMO_DIR, "data", "agaricus.txt.test")) dtest = xgb.DMatrix(X_test, y_test) # specify parameters via map, definition are same as c++ version param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} # specify validations set to watch performance watchlist = [(dtest, "eval"), (dtrain, "train")] # number of boosting rounds num_round = 2 bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist) # run prediction preds = bst.predict(dtest) labels = dtest.get_label() print( "error=%f" % ( sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) ) ) bst.save_model("model-0.json") # dump model bst.dump_model("dump.raw.txt") # dump model with feature map bst.dump_model("dump.nice.txt", os.path.join(DEMO_DIR, "data/featmap.txt")) # save dmatrix into binary buffer dtest.save_binary("dtest.dmatrix") # save model bst.save_model("model-1.json") # load model and data in bst2 = xgb.Booster(model_file="model-1.json") dtest2 = xgb.DMatrix("dtest.dmatrix") preds2 = bst2.predict(dtest2) # assert they are the same assert np.sum(np.abs(preds2 - preds)) == 0 # alternatively, you can pickle the booster pks = pickle.dumps(bst2) # load model and data in bst3 = pickle.loads(pks) preds3 = bst3.predict(dtest2) # assert they are the same assert np.sum(np.abs(preds3 - preds)) == 0