From 0f5f9c03850073ce756f01cd67b0b86aa0934ac7 Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Wed, 20 May 2015 14:17:03 -0500
Subject: [PATCH 01/83] ENH: Allow early stopping in sklearn API.

---
 wrapper/xgboost.py | 118 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 112 insertions(+), 6 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index 96f6c2573..35c24a1f2 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -772,7 +772,6 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
     -------
     booster : a trained booster model
     """
-
     evals = list(evals)
     bst = Booster(params, [dtrain] + [d[0] for d in evals])
@@ -1074,6 +1073,8 @@ class XGBModel(XGBModelBase):
         params = super(XGBModel, self).get_params(deep=deep)
         if params['missing'] is np.nan:
             params['missing'] = None  # sklearn doesn't handle nan. see #4725
+        if not params.get('eval_metric', True):
+            del params['eval_metric']  # don't give as None param to Booster
         return params
 
     def get_xgb_params(self):
@@ -1086,10 +1087,62 @@ class XGBModel(XGBModelBase):
         xgb_params.pop('nthread', None)
         return xgb_params
 
-    def fit(self, data, y):
+    def fit(self, X, y, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, feval=None):
         # pylint: disable=missing-docstring,invalid-name
-        train_dmatrix = DMatrix(data, label=y, missing=self.missing)
-        self._Booster = train(self.get_xgb_params(), train_dmatrix, self.n_estimators)
+        """
+        Fit the gradient boosting model
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        eval_set : list, optional
+            A list of (X, y) tuple pairs to use as a validation set for
+            early-stopping
+        eval_metric : str, optional
+            Built-in evaluation metric to use.
+        early_stopping_rounds : int
+            Activates early stopping. Validation error needs to decrease at
+            least every <early_stopping_rounds> round(s) to continue training.
+            Requires at least one item in eval_set. If there's more than one,
+            will use the last. Returns the model from the last iteration
+            (not the best one). If early stopping occurs, the model will
+            have two additional fields: bst.best_score and bst.best_iteration.
+        feval : function, optional
+            Custom evaluation metric to use. The call signature is
+            feval(y_predicted, y_true) where y_true will be a DMatrix object
+            such that you may need to call the get_label method. This objective
+            is always assumed to be minimized, so use -feval when appropriate.
+        """
+        trainDmatrix = DMatrix(X, label=y, missing=self.missing)
+
+        eval_results = {}
+        if eval_set is not None:
+            evals = list(DMatrix(x[0], label=x[1]) for x in eval_set)
+            evals = list(zip(evals,
+                             ["validation_{}" for i in range(len(evals))]))
+        else:
+            evals = ()
+
+        params = self.get_xgb_params()
+
+        if eval_metric is not None:
+            params.update({'eval_metric': eval_metric})
+
+        self._Booster = train(params, trainDmatrix,
+                              self.n_estimators, evals=evals,
+                              early_stopping_rounds=early_stopping_rounds,
+                              evals_result=eval_results, feval=None)
+        if eval_results:
+            eval_results = {k: np.array(v, dtype=float)
+                            for k, v in eval_results.items()}
+            eval_results = {k: np.array(v) for k, v in eval_results.items()}
+            self.eval_results_ = eval_results
+            self.best_score_ = self._Booster.best_score
+            self.best_iteration_ = self._Booster.best_iteration
         return self
 
     def predict(self, data):
@@ -1117,8 +1170,39 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
                                             colsample_bytree, base_score,
                                             seed, missing)
 
-    def fit(self, X, y, sample_weight=None):
+    def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, feval=None):
         # pylint: disable = attribute-defined-outside-init,arguments-differ
+        """
+        Fit gradient boosting classifier
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        sample_weight : array_like
+            Weight for each instance
+        eval_set : list, optional
+            A list of (X, y) pairs to use as a validation set for
+            early-stopping
+        eval_metric : str
+            Built-in evaluation metric to use.
+        early_stopping_rounds : int, optional
+            Activates early stopping. Validation error needs to decrease at
+            least every <early_stopping_rounds> round(s) to continue training.
+            Requires at least one item in eval_set. If there's more than one,
+            will use the last. Returns the model from the last iteration
+            (not the best one). If early stopping occurs, the model will
+            have two additional fields: bst.best_score and bst.best_iteration.
+        feval : function, optional
+            Custom evaluation metric to use. The call signature is
+            feval(y_predicted, y_true) where y_true will be a DMatrix object
+            such that you may need to call the get_label method. This objective
+            is always assumed to be minimized, so use -feval when appropriate.
+        """
+        eval_results = {}
         self.classes_ = list(np.unique(y))
         self.n_classes_ = len(self.classes_)
         if self.n_classes_ > 2:
@@ -1129,6 +1213,18 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         else:
             xgb_options = self.get_xgb_params()
 
+        if eval_metric is not None:
+            xgb_options.update({"eval_metric": eval_metric})
+
+        if eval_set is not None:
+            # TODO: use sample_weight if given?
+            evals = list(DMatrix(x[0], label=x[1]) for x in eval_set)
+            nevals = len(evals)
+            eval_names = ["validation_{}".format(i) for i in range(nevals)]
+            evals = list(zip(evals, eval_names))
+        else:
+            evals = ()
+
         self._le = LabelEncoder().fit(y)
         training_labels = self._le.transform(y)
 
@@ -1139,7 +1235,17 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
 
         train_dmatrix = DMatrix(X, label=training_labels,
                                 missing=self.missing)
 
-        self._Booster = train(xgb_options, train_dmatrix, self.n_estimators)
+        self._Booster = train(xgb_options, train_dmatrix, self.n_estimators,
+                              evals=evals,
+                              early_stopping_rounds=early_stopping_rounds,
+                              evals_result=eval_results, feval=feval)
+
+        if eval_results:
+            eval_results = {k: np.array(v, dtype=float)
+                            for k, v in eval_results.items()}
+            self.eval_results_ = eval_results
+            self.best_score_ = self._Booster.best_score
+            self.best_iteration_ = self._Booster.best_iteration
 
         return self
 

From 3952b525b82d2d2a2019429e3a97fe0f1f331f0c Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Wed, 20 May 2015 14:17:30 -0500
Subject: [PATCH 02/83] ENH: Allow possibly negative evaluation metrics.

---
 wrapper/xgboost.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index 35c24a1f2..bc52da633 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -795,7 +795,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
         sys.stderr.write(msg + '\n')
 
         if evals_result is not None:
-            res = re.findall(":([0-9.]+).", msg)
+            res = re.findall(":-?([0-9.]+).", msg)
             for key, val in zip(evals_name, res):
                 evals_result[key].append(val)
     return bst
@@ -842,7 +842,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
             sys.stderr.write(msg + '\n')
 
             if evals_result is not None:
-                res = re.findall(":([0-9.]+).", msg)
+                res = re.findall(":-?([0-9.]+).", msg)
                 for key, val in zip(evals_name, res):
                     evals_result[key].append(val)
 

From cf89ae64e2c198c9e5acde076e8be40b1aab2e92 Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Wed, 20 May 2015 14:27:22 -0500
Subject: [PATCH 03/83] ENH: Allow for silent evaluation

---
 wrapper/xgboost.py | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index bc52da633..a4acd5a7f 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -738,7 +738,7 @@
 def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
-          early_stopping_rounds=None, evals_result=None):
+          early_stopping_rounds=None, evals_result=None, verbose_eval=True):
     # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
     """Train a booster with given parameters.
@@ -793,7 +793,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
         else:
             msg = bst_eval_set.decode()
 
-        sys.stderr.write(msg + '\n')
+        if verbose_eval:
+            sys.stderr.write(msg + '\n')
         if evals_result is not None:
             res = re.findall(":-?([0-9.]+).", msg)
             for key, val in zip(evals_name, res):
@@ -839,7 +840,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
             else:
                 msg = bst_eval_set.decode()
 
-            sys.stderr.write(msg + '\n')
+            if verbose_eval:
+                sys.stderr.write(msg + '\n')
 
             if evals_result is not None:
                 res = re.findall(":-?([0-9.]+).", msg)
@@ -1088,7 +1090,7 @@ class XGBModel(XGBModelBase):
         return xgb_params
 
     def fit(self, X, y, eval_set=None, eval_metric=None,
-            early_stopping_rounds=None, feval=None):
+            early_stopping_rounds=None, feval=None, verbose=True):
         # pylint: disable=missing-docstring,invalid-name
         """
         Fit the gradient boosting model
@@ -1116,6 +1118,9 @@ class XGBModel(XGBModelBase):
             feval(y_predicted, y_true) where y_true will be a DMatrix object
             such that you may need to call the get_label method. This objective
             is always assumed to be minimized, so use -feval when appropriate.
+        verbose : bool
+            If `verbose` and an evaluation set is used, writes the evaluation
+            metric measured on the validation set to stderr.
         """
         trainDmatrix = DMatrix(X, label=y, missing=self.missing)
 
@@ -1135,7 +1140,8 @@ class XGBModel(XGBModelBase):
         self._Booster = train(params, trainDmatrix,
                               self.n_estimators, evals=evals,
                               early_stopping_rounds=early_stopping_rounds,
-                              evals_result=eval_results, feval=None)
+                              evals_result=eval_results, feval=None,
+                              verbose_eval=verbose)
         if eval_results:
             eval_results = {k: np.array(v, dtype=float)
                             for k, v in eval_results.items()}
@@ -1171,7 +1177,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         base_score, seed, missing)
 
     def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
-            early_stopping_rounds=None, feval=None):
+            early_stopping_rounds=None, feval=None, verbose=True):
         # pylint: disable = attribute-defined-outside-init,arguments-differ
         """
         Fit gradient boosting classifier
@@ -1201,6 +1207,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
             feval(y_predicted, y_true) where y_true will be a DMatrix object
             such that you may need to call the get_label method. This objective
             is always assumed to be minimized, so use -feval when appropriate.
+        verbose : bool
+            If `verbose` and an evaluation set is used, writes the evaluation
+            metric measured on the validation set to stderr.
         """
         eval_results = {}
         self.classes_ = list(np.unique(y))
@@ -1238,7 +1247,8 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         self._Booster = train(xgb_options, train_dmatrix, self.n_estimators,
                               evals=evals,
                               early_stopping_rounds=early_stopping_rounds,
-                              evals_result=eval_results, feval=feval)
+                              evals_result=eval_results, feval=feval,
+                              verbose_eval=verbose)
 
         if eval_results:
             eval_results = {k: np.array(v, dtype=float)

From 46e9520a28b4aca9281c938a919620a8754cb4d9 Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Wed, 20 May 2015 14:38:45 -0500
Subject: [PATCH 04/83] DOC: Document verbose_eval

---
 wrapper/xgboost.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index a4acd5a7f..a4ad84bf5 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -767,6 +767,9 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
         bst.best_score and bst.best_iteration.
    evals_result: dict
        This dictionary stores the evaluation results of all the items in watchlist
+    verbose_eval : bool
+        If `verbose_eval` is True, the evaluation metric on the validation
+        set, if given, is printed at each boosting stage.
 
     Returns
     -------

From 113285e1dc3fdc0c709e72a2cb985b3025360897 Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Wed, 20 May 2015 14:39:48 -0500
Subject: [PATCH 05/83] DOC: Point to parameter.md for eval_metric

---
 wrapper/xgboost.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index a4ad84bf5..adb21a00b 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -1108,7 +1108,7 @@ class XGBModel(XGBModelBase):
             A list of (X, y) tuple pairs to use as a validation set for
             early-stopping
         eval_metric : str, optional
-            Built-in evaluation metric to use.
+            Built-in evaluation metric to use. See doc/parameter.md.
         early_stopping_rounds : int
@@ -1197,7 +1197,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
             A list of (X, y) pairs to use as a validation set for
             early-stopping
         eval_metric : str
-            Built-in evaluation metric to use.
+            Built-in evaluation metric to use. See doc/parameter.md.
         early_stopping_rounds : int, optional

From b0f7ddaa2ee3411b33f95b42be46a3325b0ac23b Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Tue, 30 Jun 2015 11:42:14 -0500
Subject: [PATCH 06/83] REF: Combine eval_metric and feval to one parameter

---
 wrapper/xgboost.py | 48 ++++++++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index adb21a00b..95e0bf6ff 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -1093,7 +1093,7 @@ class XGBModel(XGBModelBase):
         return xgb_params
 
     def fit(self, X, y, eval_set=None, eval_metric=None,
-            early_stopping_rounds=None, feval=None, verbose=True):
+            early_stopping_rounds=None, verbose=True):
         # pylint: disable=missing-docstring,invalid-name
         """
         Fit the gradient boosting model
@@ -1107,8 +1107,14 @@ class XGBModel(XGBModelBase):
         eval_set : list, optional
             A list of (X, y) tuple pairs to use as a validation set for
             early-stopping
-        eval_metric : str, optional
-            Built-in evaluation metric to use. See doc/parameter.md.
+        eval_metric : str, callable, optional
+            If a str, should be a built-in evaluation metric to use. See
+            doc/parameter.md. If callable, a custom evaluation metric. The call
+            signature is func(y_predicted, y_true) where y_true will be a
+            DMatrix object such that you may need to call the get_label
+            method. It must return a str, value pair where the str is a name
+            for the evaluation and value is the value of the evaluation
+            function. This objective is always minimized.
         early_stopping_rounds : int
             Activates early stopping. Validation error needs to decrease at
             least every <early_stopping_rounds> round(s) to continue training.
             Requires at least one item in eval_set. If there's more than one,
             will use the last. Returns the model from the last iteration
             (not the best one). If early stopping occurs, the model will
             have two additional fields: bst.best_score and bst.best_iteration.
-        feval : function, optional
-            Custom evaluation metric to use. The call signature is
-            feval(y_predicted, y_true) where y_true will be a DMatrix object
-            such that you may need to call the get_label method. This objective
-            is always assumed to be minimized, so use -feval when appropriate.
         verbose : bool
             If `verbose` and an evaluation set is used, writes the evaluation
             metric measured on the validation set to stderr.
         """
         trainDmatrix = DMatrix(X, label=y, missing=self.missing)
 
         params = self.get_xgb_params()
 
+        feval = eval_metric if callable(eval_metric) else None
         if eval_metric is not None:
-            params.update({'eval_metric': eval_metric})
+            if callable(eval_metric):
+                eval_metric = None
+            else:
+                params.update({'eval_metric': eval_metric})
 
         self._Booster = train(params, trainDmatrix,
                               self.n_estimators, evals=evals,
                               early_stopping_rounds=early_stopping_rounds,
-                              evals_result=eval_results, feval=None,
+                              evals_result=eval_results, feval=feval,
                               verbose_eval=verbose)
@@ -1185,7 +1185,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         base_score, seed, missing)
 
     def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
-            early_stopping_rounds=None, feval=None, verbose=True):
+            early_stopping_rounds=None, verbose=True):
         # pylint: disable = attribute-defined-outside-init,arguments-differ
         """
         Fit gradient boosting classifier
@@ -1201,8 +1201,14 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         eval_set : list, optional
             A list of (X, y) pairs to use as a validation set for
             early-stopping
-        eval_metric : str
-            Built-in evaluation metric to use. See doc/parameter.md.
+        eval_metric : str, callable, optional
+            If a str, should be a built-in evaluation metric to use. See
+            doc/parameter.md. If callable, a custom evaluation metric. The call
+            signature is func(y_predicted, y_true) where y_true will be a
+            DMatrix object such that you may need to call the get_label
+            method. It must return a str, value pair where the str is a name
+            for the evaluation and value is the value of the evaluation
+            function. This objective is always minimized.
         early_stopping_rounds : int, optional
             Activates early stopping. Validation error needs to decrease at
             least every <early_stopping_rounds> round(s) to continue training.
             Requires at least one item in eval_set. If there's more than one,
             will use the last. Returns the model from the last iteration
             (not the best one). If early stopping occurs, the model will
             have two additional fields: bst.best_score and bst.best_iteration.
-        feval : function, optional
-            Custom evaluation metric to use. The call signature is
-            feval(y_predicted, y_true) where y_true will be a DMatrix object
-            such that you may need to call the get_label method. This objective
-            is always assumed to be minimized, so use -feval when appropriate.
         verbose : bool
             If `verbose` and an evaluation set is used, writes the evaluation
             metric measured on the validation set to stderr.
         """
         eval_results = {}
         self.classes_ = list(np.unique(y))
         self.n_classes_ = len(self.classes_)
         if self.n_classes_ > 2:
@@ -1229,8 +1231,12 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         else:
             xgb_options = self.get_xgb_params()
 
+        feval = eval_metric if callable(eval_metric) else None
         if eval_metric is not None:
-            xgb_options.update({"eval_metric": eval_metric})
+            if callable(eval_metric):
+                eval_metric = None
+            else:
+                xgb_options.update({"eval_metric": eval_metric})
 
         if eval_set is not None:
             # TODO: use sample_weight if given?
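Taken together, PATCH 01 through PATCH 06 give the sklearn wrapper early stopping, evaluation-set tracking, and a single eval_metric parameter that accepts either a built-in metric name or a callable. A minimal usage sketch against the API as it stands after PATCH 06 (the dataset, split and custom metric below are illustrative; the sketch relies only on the behaviour shown above, where a callable eval_metric is routed to the low-level feval argument and fit exposes eval_results_, best_score_ and best_iteration_ on the estimator):

    import numpy as np
    import xgboost as xgb
    from sklearn.cross_validation import train_test_split
    from sklearn.datasets import load_digits

    digits = load_digits(2)  # two-class subset, so the default binary objective applies
    X_train, X_test, y_train, y_test = train_test_split(
        digits['data'], digits['target'], random_state=0)

    def error_rate(y_pred, dtrain):
        # custom metric: returns a (name, value) pair and is minimized
        y_true = dtrain.get_label()
        return 'error', float(np.sum((y_pred > 0.5) != y_true)) / len(y_true)

    clf = xgb.XGBClassifier(n_estimators=100)
    clf.fit(X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_metric=error_rate,   # a callable, so it is dispatched as feval
            early_stopping_rounds=10,
            verbose=False)

    print(clf.best_iteration_, clf.best_score_)
    print(clf.eval_results_['validation_0'])  # per-round validation metric

Because an eval_set and early_stopping_rounds are given, train() records best_score and best_iteration on the underlying Booster and fit copies them onto the estimator; without an evaluation set, none of these attributes are set.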
From 4a37b852a03b1320d1c41f948a4ec212a981ad1d Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Tue, 30 Jun 2015 11:42:28 -0500 Subject: [PATCH 07/83] DOC: Add early stopping example --- demo/guide-python/sklearn_examples.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py index ce8c8d01e..56fed1dd2 100755 --- a/demo/guide-python/sklearn_examples.py +++ b/demo/guide-python/sklearn_examples.py @@ -8,7 +8,7 @@ import pickle import xgboost as xgb import numpy as np -from sklearn.cross_validation import KFold +from sklearn.cross_validation import KFold, train_test_split from sklearn.metrics import confusion_matrix, mean_squared_error from sklearn.grid_search import GridSearchCV from sklearn.datasets import load_iris, load_digits, load_boston @@ -65,3 +65,23 @@ print("Pickling sklearn API models") pickle.dump(clf, open("best_boston.pkl", "wb")) clf2 = pickle.load(open("best_boston.pkl", "rb")) print(np.allclose(clf.predict(X), clf2.predict(X))) + +# Early-stopping + +X = digits['data'] +y = digits['target'] +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) +clf = xgb.XGBClassifier() +clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc", + eval_set=[(X_test, y_test)]) + +# Custom evaluation function +from sklearn.metrics import log_loss + + +def log_loss_eval(y_pred, y_true): + return "log-loss", log_loss(y_true.get_label(), y_pred) + + +clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric=log_loss_eval, + eval_set=[(X_test, y_test)]) From b76db01c6605a19e852172e3f08d9a4613bf6361 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Wed, 8 Jul 2015 14:29:52 -0500 Subject: [PATCH 08/83] STY: Fix lint errors --- wrapper/xgboost.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 95e0bf6ff..27041376b 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -6,7 +6,7 @@ Version: 0.40 Authors: Tianqi Chen, Bing Xu Early stopping by Zygmunt Zając """ -# pylint: disable=too-many-arguments, too-many-locals, too-many-lines, invalid-name +# pylint: disable=too-many-arguments, too-many-locals, too-many-lines, invalid-name, fixme from __future__ import absolute_import import os @@ -784,7 +784,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, else: evals_name = [d[1] for d in evals] evals_result.clear() - evals_result.update({key:[] for key in evals_name}) + evals_result.update({key: [] for key in evals_name}) if not early_stopping_rounds: for i in range(num_boost_round): @@ -1094,7 +1094,7 @@ class XGBModel(XGBModelBase): def fit(self, X, y, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True): - # pylint: disable=missing-docstring,invalid-name + # pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init """ Fit the gradient boosting model @@ -1131,8 +1131,8 @@ class XGBModel(XGBModelBase): eval_results = {} if eval_set is not None: evals = list(DMatrix(x[0], label=x[1]) for x in eval_set) - evals = list(zip(evals, - ["validation_{}" for i in range(len(evals))])) + evals = list(zip(evals, ["validation_{}".format(i) for i in + range(len(evals))])) else: evals = () From d8fc16538ea83e624c19b4ad0e2839c6f9f3d581 Mon Sep 17 00:00:00 2001 From: orenov Date: Wed, 22 Jul 2015 12:03:01 +0300 Subject: [PATCH 09/83] issue #368, data.table problems --- R-package/R/xgb.model.dt.tree.R | 37 
++++++++++++++++----------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 7eea3dfcd..d083566a5 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -133,34 +133,33 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) } - yes <- allTrees[!is.na(Yes),Yes] - - set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), + yes <- allTrees[!is.na(Yes), Yes] + + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), j = "Yes.Feature", - value = allTrees[ID == yes,Feature]) - - set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), + value = allTrees[ID %in% yes, Feature]) + + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), j = "Yes.Cover", - value = allTrees[ID == yes,Cover]) - - set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), - j = "Yes.Quality", - value = allTrees[ID == yes,Quality]) + value = allTrees[ID %in% yes, Cover]) - no <- allTrees[!is.na(No),No] + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), + j = "Yes.Quality", + value = allTrees[ID %in% yes, Quality]) + no <- allTrees[!is.na(No), No] - set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), j = "No.Feature", - value = allTrees[ID == no,Feature]) + value = allTrees[ID %in% no, Feature]) - set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), j = "No.Cover", - value = allTrees[ID == no,Cover]) + value = allTrees[ID %in% no, Cover]) - set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), j = "No.Quality", - value = allTrees[ID == no,Quality]) - + value = allTrees[ID %in% no, Quality]) + allTrees } From d120167725fd7d56ab84bf7feb479389b8866eab Mon Sep 17 00:00:00 2001 From: Will Stanton Date: Wed, 22 Jul 2015 09:19:22 -0600 Subject: [PATCH 10/83] Fixed a few typos in README --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 4a5e7bf6e..21d15ce56 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,11 @@ DMLC/XGBoost An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework, including [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLM) and [Gradient Boosted Decision Trees](https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting) (GBDT). XGBoost can also be [distributed](#features) and scale to Terascale data -Checkout our [Comitters and Contributors](CONTRIBUTORS.md) who keep make xgboost better. +Checkout our [Committers and Contributors](CONTRIBUTORS.md) who help make xgboost better. -Documentations: [Documentation of dmlc/xgboost](doc/README.md) +Documentation: [Documentation of dmlc/xgboost](doc/README.md) -Issues Tracker: [https://github.com/dmlc/xgboost/issues](https://github.com/dmlc/xgboost/issues?q=is%3Aissue+label%3Aquestion) +Issue Tracker: [https://github.com/dmlc/xgboost/issues](https://github.com/dmlc/xgboost/issues?q=is%3Aissue+label%3Aquestion) Please join [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/) to ask questions and share your experience on xgboost. 
- Use issue tracker for bug reports, feature requests etc. @@ -30,13 +30,13 @@ What's New - Checkout the winning solution at [Highlight links](doc/README.md#highlight-links) * XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04) * XGBoost helps three champion teams to win [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing) - - Checkout the winning solution at [Highlight links](doc/README.md#highlight-links) + - Check out the winning solution at [Highlight links](doc/README.md#highlight-links) * [External Memory Version](doc/external_memory.md) Contributing to XGBoost ========= -XGBoost has been developed and used by a group of active community. Everyone is more than welcomed to is a great way to make the project better and more accessible to more users. -* Checkout [Feature Wish List](https://github.com/dmlc/xgboost/labels/Wish-List) to see what can be improved, or open an issue if you want something. +XGBoost has been developed and used by a group of active community members. Everyone is more than welcome to contribute. It is a way to make the project better and more accessible to more users. +* Check out [Feature Wish List](https://github.com/dmlc/xgboost/labels/Wish-List) to see what can be improved, or open an issue if you want something. * Contribute to the [documents and examples](https://github.com/dmlc/xgboost/blob/master/doc/) to share your experience with other users. * Please add your name to [CONTRIBUTORS.md](CONTRIBUTORS.md) after your patch has been merged. @@ -66,5 +66,5 @@ Version XGBoost in Graphlab Create ========================== -* XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the Graphlab Create in http://graphlab.com/products/create/quick-start-guide.html -* Nice blogpost by Jay Gu using GLC boosted tree to solve kaggle bike sharing challenge: http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand +* XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to do data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the Graphlab Create in http://graphlab.com/products/create/quick-start-guide.html +* Nice blogpost by Jay Gu about using GLC boosted tree to solve kaggle bike sharing challenge: http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand From ba63b2886f32894c15caab87012f0862fbdd9242 Mon Sep 17 00:00:00 2001 From: Will Stanton Date: Wed, 22 Jul 2015 10:37:49 -0600 Subject: [PATCH 11/83] Check out vs. checkout Made it consistent across the README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 21d15ce56..97e348b43 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ DMLC/XGBoost An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. 
It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework, including [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLM) and [Gradient Boosted Decision Trees](https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting) (GBDT). XGBoost can also be [distributed](#features) and scale to Terascale data -Checkout our [Committers and Contributors](CONTRIBUTORS.md) who help make xgboost better. +Check out our [Committers and Contributors](CONTRIBUTORS.md) who help make xgboost better. Documentation: [Documentation of dmlc/xgboost](doc/README.md) @@ -27,7 +27,7 @@ XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) What's New ========== * XGBoost helps Chenglong Chen to win [Kaggle CrowdFlower Competition](https://www.kaggle.com/c/crowdflower-search-relevance) - - Checkout the winning solution at [Highlight links](doc/README.md#highlight-links) + - Check out the winning solution at [Highlight links](doc/README.md#highlight-links) * XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04) * XGBoost helps three champion teams to win [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing) - Check out the winning solution at [Highlight links](doc/README.md#highlight-links) From 9eca9bccf47d780add609fb8530b20d0f23a379c Mon Sep 17 00:00:00 2001 From: Ajinkya Kale Date: Wed, 22 Jul 2015 23:18:34 -0700 Subject: [PATCH 12/83] moving gitter chat up --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 97e348b43..15775d67b 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ DMLC/XGBoost ================================== -[![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost) +[![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost) [![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework, including [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLM) and [Gradient Boosted Decision Trees](https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting) (GBDT). XGBoost can also be [distributed](#features) and scale to Terascale data @@ -16,8 +16,6 @@ Please join [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost - Use issue tracker for bug reports, feature requests etc. - Use the user group to post your experience, ask questions about general usages. 
-Gitter for developers [![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
-
 Distributed Version: [Distributed XGBoost](multi-node)
 
 Highlights of Usecases: [Highlight Links](doc/README.md#highlight-links)

From 0ea5b14bd8aa1d6d89826f7e9ce56f5656ef7ec4 Mon Sep 17 00:00:00 2001
From: Ajinkya Kale
Date: Thu, 23 Jul 2015 01:12:33 -0700
Subject: [PATCH 13/83] Update README.md

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 97e348b43..ff302175a 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,10 @@ Version
   - Change log in [CHANGES.md](CHANGES.md)
   - This version is compatible with 0.3x versions
 
+License
+=======
+© Contributors, 2015. Licensed under an [Apache-2](https://github.com/dmlc/xgboost/blob/master/LICENSE) license.
+
 XGBoost in Graphlab Create
 ==========================
 * XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to do data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the Graphlab Create in http://graphlab.com/products/create/quick-start-guide.html

From 141f9ebf4b2f1c2ebbbf10ed2008f53af906b2e1 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Fri, 24 Jul 2015 08:51:05 -0700
Subject: [PATCH 14/83] Update CHANGES.md

---
 CHANGES.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGES.md b/CHANGES.md
index 90fd77ebb..0be001744 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -34,3 +34,10 @@ xgboost-0.4
   - xgboost python model is now pickable
 * sklearn wrapper is supported in python module
 * Experimental External memory version
+
+Ongoing version
+=====
+* Python module now throws an exception instead of crashing the terminal when a parameter error happens.
+* Java API is ready for use
+* Added more test cases and continuous integration to make each build more robust
+* Improvements in sklearn compatible module

From 198c5bb55e11a48dd0be351eed8dc9785e973022 Mon Sep 17 00:00:00 2001
From: unknown
Date: Fri, 24 Jul 2015 11:58:02 -0700
Subject: [PATCH 15/83] fix namespace and desc

---
 R-package/.Rbuildignore           | 1 +
 R-package/DESCRIPTION             | 2 +-
 R-package/R/utils.R               | 4 ++--
 R-package/R/xgb.cv.R              | 2 +-
 R-package/R/xgb.plot.importance.R | 4 ++--
 5 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/R-package/.Rbuildignore b/R-package/.Rbuildignore
index 6b3c4084e..b37d627ba 100644
--- a/R-package/.Rbuildignore
+++ b/R-package/.Rbuildignore
@@ -3,3 +3,4 @@
 \.dll$
 ^.*\.Rproj$
 ^\.Rproj\.user$
+README.md
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index c6975af5e..6f784fbb3 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -5,7 +5,7 @@ Version: 0.4-0
 Date: 2015-05-11
 Author: Tianqi Chen , Tong He , Michael Benesty
 Maintainer: Tong He
-Description: Xgboost is short for eXtreme Gradient Boosting, which is an
+Description: eXtreme Gradient Boosting, which is an
     efficient and scalable implementation of gradient boosting framework.
     This package is an R wrapper of xgboost. The package includes efficient
     linear model solver and tree learning algorithms.
The package can automatically diff --git a/R-package/R/utils.R b/R-package/R/utils.R index f7f6b9192..e58601df8 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -288,7 +288,7 @@ xgb.cv.aggcv <- function(res, showsd = TRUE) { } ret <- paste(ret, sprintf("%f", mean(stats)), sep="") if (showsd) { - ret <- paste(ret, sprintf("+%f", sd(stats)), sep="") + ret <- paste(ret, sprintf("+%f", stats::sd(stats)), sep="") } } return (ret) @@ -313,7 +313,7 @@ xgb.createFolds <- function(y, k = 10) if(cuts < 2) cuts <- 2 if(cuts > 5) cuts <- 5 y <- cut(y, - unique(quantile(y, probs = seq(0, 1, length = cuts))), + unique(stats::quantile(y, probs = seq(0, 1, length = cuts))), include.lowest = TRUE) } diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 793d904cd..a5364db52 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -240,7 +240,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = else colnames <- colnamesMean type <- rep(x = "numeric", times = length(colnames)) - dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table + dt <- utils::read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table split <- str_split(string = history, pattern = "\t") for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)} diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index eb0f8e346..b86d14323 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -33,7 +33,7 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1 if (!"data.table" %in% class(importance_matrix)) { stop("importance_matrix: Should be a data.table.") } - if (!require(ggplot2, quietly = TRUE)) { + if (!requireNamespace(ggplot2, quietly = TRUE)) { stop("ggplot2 package is required for plotting the importance", call. 
= FALSE) } if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) { @@ -46,7 +46,7 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1 clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters)) importance_matrix[,"Cluster":=clusters$cluster %>% as.character] - plot <- ggplot(importance_matrix, aes(x=reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+ geom_bar(aes(fill=Cluster), stat="identity", position="identity") + coord_flip() + xlab("Features") + ylab("Gain") + ggtitle("Feature importance") + theme(plot.title = element_text(lineheight=.9, face="bold"), panel.grid.major.y = element_blank() ) + plot <- ggplot(importance_matrix, aes(x=stats::reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+ geom_bar(aes(fill=Cluster), stat="identity", position="identity") + coord_flip() + xlab("Features") + ylab("Gain") + ggtitle("Feature importance") + theme(plot.title = element_text(lineheight=.9, face="bold"), panel.grid.major.y = element_blank() ) return(plot) } From a1c7104d7f7a797794ed6ecc73f76c9582dd562b Mon Sep 17 00:00:00 2001 From: hetong007 Date: Fri, 24 Jul 2015 19:11:08 +0000 Subject: [PATCH 16/83] fix crash --- R-package/R/xgb.plot.importance.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index b86d14323..f126dfe46 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -33,7 +33,7 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1 if (!"data.table" %in% class(importance_matrix)) { stop("importance_matrix: Should be a data.table.") } - if (!requireNamespace(ggplot2, quietly = TRUE)) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { stop("ggplot2 package is required for plotting the importance", call. 
= FALSE) } if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) { @@ -46,7 +46,7 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1 clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters)) importance_matrix[,"Cluster":=clusters$cluster %>% as.character] - plot <- ggplot(importance_matrix, aes(x=stats::reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+ geom_bar(aes(fill=Cluster), stat="identity", position="identity") + coord_flip() + xlab("Features") + ylab("Gain") + ggtitle("Feature importance") + theme(plot.title = element_text(lineheight=.9, face="bold"), panel.grid.major.y = element_blank() ) + plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+ ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() ) return(plot) } From e353a2e51cd0269a31bbc1dac4001fa5193d312a Mon Sep 17 00:00:00 2001 From: Ajinkya Kale Date: Fri, 24 Jul 2015 17:00:02 -0700 Subject: [PATCH 17/83] restructuring the README with an index --- README.md | 87 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 4fabb7362..7a4cfa4c8 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,32 @@ -DMLC/XGBoost -================================== +XGBoost +======= [![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost) [![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. +An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. + It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework, including [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLM) and [Gradient Boosted Decision Trees](https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting) (GBDT). XGBoost can also be [distributed](#features) and scale to Terascale data -Check out our [Committers and Contributors](CONTRIBUTORS.md) who help make xgboost better. - -Documentation: [Documentation of dmlc/xgboost](doc/README.md) - -Issue Tracker: [https://github.com/dmlc/xgboost/issues](https://github.com/dmlc/xgboost/issues?q=is%3Aissue+label%3Aquestion) - -Please join [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/) to ask questions and share your experience on xgboost. - - Use issue tracker for bug reports, feature requests etc. - - Use the user group to post your experience, ask questions about general usages. 
-
-Distributed Version: [Distributed XGBoost](multi-node)
-
-Highlights of Usecases: [Highlight Links](doc/README.md#highlight-links)
-
 XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) projects
 
+Contents
+--------
+* [What's New](#whats-new)
+* [Version](#version)
+* [Documentation](doc/README.md)
+* [Build Instruction](doc/build.md)
+* [Features](#features)
+* [Distributed XGBoost](multi-node)
+* [Usecases](doc/README.md#highlight-links)
+* [Bug Reporting](#bug-reporting)
+* [Contributing to XGBoost](#contributing-to-xgboost)
+* [Committers and Contributors](CONTRIBUTORS.md)
+* [License](#license)
+* [XGBoost in Graphlab Create](#xgboost-in-graphlab-create)
+
 What's New
-==========
+----------
+
 * XGBoost helps Chenglong Chen to win [Kaggle CrowdFlower Competition](https://www.kaggle.com/c/crowdflower-search-relevance)
   - Check out the winning solution at [Highlight links](doc/README.md#highlight-links)
 * XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
 * XGBoost helps three champion teams to win [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
@@ -31,42 +34,46 @@ What's New
   - Check out the winning solution at [Highlight links](doc/README.md#highlight-links)
 * [External Memory Version](doc/external_memory.md)
 
-Contributing to XGBoost
-=========
-XGBoost has been developed and used by a group of active community members. Everyone is more than welcome to contribute. It is a way to make the project better and more accessible to more users.
-* Check out [Feature Wish List](https://github.com/dmlc/xgboost/labels/Wish-List) to see what can be improved, or open an issue if you want something.
-* Contribute to the [documents and examples](https://github.com/dmlc/xgboost/blob/master/doc/) to share your experience with other users.
-* Please add your name to [CONTRIBUTORS.md](CONTRIBUTORS.md) after your patch has been merged.
+Version
+-------
+
+* Current version xgboost-0.4, a lot of improvement has been made since 0.3
+  - Change log in [CHANGES.md](CHANGES.md)
+  - This version is compatible with 0.3x versions
 
 Features
-========
-* Easily accessible in python, R, Julia, CLI
-* Fast speed and memory efficient
-  - Can be more than 10 times faster than GBM in sklearn and R
+--------
+
+* Easily accessible through python, R, Julia, CLI
+* Fast and memory efficient
+  - Can be more than 10 times faster than GBM in sklearn and R. [benchm-ml numbers](https://github.com/szilard/benchm-ml)
   - Handles sparse matrices, support external memory
 * Accurate prediction, and used extensively by data scientists and kagglers
   - See [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links)
 * Distributed and Portable
   - The distributed version runs on Hadoop (YARN), MPI, SGE etc.
   - Scales to billions of examples and beyond
+
+Bug Reporting
+-------------
 
-Build
-=======
-* Run ```bash build.sh``` (you can also type make)
-  - Normally it gives what you want
-  - See [Build Instruction](doc/build.md) for more information
+* For reporting bugs please use the [xgboost/issues](https://github.com/dmlc/xgboost/issues) page.
+* For generic questions or to share your experience using xgboost please use the [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/)
 
-Version
-=======
-* Current version xgboost-0.4, a lot improvment has been made since 0.3
-  - Change log in [CHANGES.md](CHANGES.md)
-  - This version is compatible with 0.3x versions
+
+Contributing to XGBoost
+-----------------------
+
+XGBoost has been developed and used by a group of active community members. Everyone is more than welcome to contribute. It is a way to make the project better and more accessible to more users.
+* Check out [Feature Wish List](https://github.com/dmlc/xgboost/labels/Wish-List) to see what can be improved, or open an issue if you want something.
+* Contribute to the [documents and examples](https://github.com/dmlc/xgboost/blob/master/doc/) to share your experience with other users.
+* Please add your name to [CONTRIBUTORS.md](CONTRIBUTORS.md) after your patch has been merged.
 
 License
-=======
+-------
 © Contributors, 2015. Licensed under an [Apache-2](https://github.com/dmlc/xgboost/blob/master/LICENSE) license.
 
 XGBoost in Graphlab Create
-==========================
+--------------------------
 * XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to do data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the Graphlab Create in http://graphlab.com/products/create/quick-start-guide.html
 * Nice blogpost by Jay Gu about using GLC boosted tree to solve kaggle bike sharing challenge: http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand

From cbdcbfc49c63c8c0201b429839e8b64c6a81ef52 Mon Sep 17 00:00:00 2001
From: Ajinkya Kale
Date: Sat, 25 Jul 2015 12:46:28 -0700
Subject: [PATCH 18/83] some more changes to remove redundant information

---
 README.md | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 7a4cfa4c8..18c5b77c1 100644
--- a/README.md
+++ b/README.md
@@ -28,17 +28,17 @@ What's New
 ----------
 
 * XGBoost helps Chenglong Chen to win [Kaggle CrowdFlower Competition](https://www.kaggle.com/c/crowdflower-search-relevance)
-  - Check out the winning solution at [Highlight links](doc/README.md#highlight-links)
+  Check out the [winning solution](doc/README.md#highlight-links)
 * XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
 * XGBoost helps three champion teams to win [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
-  - Check out the winning solution at [Highlight links](doc/README.md#highlight-links)
+  Check out the [winning solution](doc/README.md#highlight-links)
 * [External Memory Version](doc/external_memory.md)
 
 Version
 -------
 
-* Current version xgboost-0.4, a lot of improvement has been made since 0.3
-  - Change log in [CHANGES.md](CHANGES.md)
+* Current version xgboost-0.4
+  - [Change log](CHANGES.md)
   - This version is compatible with 0.3x versions
 
 Features
 --------
@@ -48,8 +48,7 @@ Features
 * Fast and memory efficient
   - Can be more than 10 times faster than GBM in sklearn and R. [benchm-ml numbers](https://github.com/szilard/benchm-ml)
   - Handles sparse matrices, support external memory
-* Accurate prediction, and used extensively by data scientists and kagglers
-  - See [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links)
+* Accurate prediction, and used extensively by data scientists and kagglers - [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links)
 * Distributed and Portable
   - The distributed version runs on Hadoop (YARN), MPI, SGE etc.
   - Scales to billions of examples and beyond
@@ -75,5 +74,5 @@ License
 
 XGBoost in Graphlab Create
 --------------------------
-* XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to do data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the Graphlab Create in http://graphlab.com/products/create/quick-start-guide.html
+* XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to do data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the [Graphlab Create](http://graphlab.com/products/create/quick-start-guide.html)
 * Nice blogpost by Jay Gu about using GLC boosted tree to solve kaggle bike sharing challenge: http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand

From af042f6a248afa01b882c6f41ad068d28036e84c Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sat, 25 Jul 2015 21:14:50 -0700
Subject: [PATCH 19/83] make things cxx98 compatible

---
 src/utils/thread_buffer.h | 51 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h
index 2119f53ab..c4dc1185d 100644
--- a/src/utils/thread_buffer.h
+++ b/src/utils/thread_buffer.h
@@ -11,9 +11,14 @@
 #include
 #include
 #include "./utils.h"
+// threading util could not run on solaris
+#ifndef XGBOOST_STRICT_CXX98_
 #include "./thread.h"
+#endif
+
 namespace xgboost {
 namespace utils {
+#if !defined(XGBOOST_STRICT_CXX98_)
 /*!
  * \brief buffered loading iterator that uses multithread
  * this template method will assume the following paramters
@@ -201,6 +206,52 @@ class ThreadBuffer {
     loading_need.Post();
   }
 };
+#else
+// a dummy single threaded ThreadBuffer
+// use this to resolve R's solaris compatibility for now
+template<typename Elem, typename ElemFactory>
+class ThreadBuffer {
+ public:
+  ThreadBuffer() : init_end_(false) {}
+  ~ThreadBuffer() {
+    if (init_end_) {
+      factory_.FreeSpace(data_);
+      factory_.Destroy();
+    }
+  }
+  inline void SetParam(const char *name, const char *val) {
+  }
+  inline bool Init(void) {
+    if (!factory_.Init()) return false;
+    data_ = factory_.Create();
+    return (init_end_ = true);
+  }
+  inline void BeforeFirst(void) {
+    factory_.BeforeFirst();
+  }
+  inline bool Next(Elem &elem) {  // NOLINT(*)
+    if (factory_.LoadNext(data_)) {
+      elem = data_; return true;
+    } else {
+      return false;
+    }
+  }
+  inline ElemFactory &get_factory() {
+    return factory_;
+  }
+  inline const ElemFactory &get_factory() const {
+    return factory_;
+  }
+
+ private:
+  // initialized
+  bool init_end_;
+  // current data
+  Elem data_;
+  // factory object used to load configures
+  ElemFactory factory_;
+};
+#endif  // !defined(XGBOOST_STRICT_CXX98_)
 }  // namespace utils
 }  // namespace xgboost
 #endif  // XGBOOST_UTILS_THREAD_BUFFER_H_

From f6c82d52ec498f8f73261d58b7811748f6480799 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sat, 25 Jul 2015 21:17:28 -0700
Subject: [PATCH 20/83] make solaris happy

---
 wrapper/xgboost_wrapper.cpp | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp
index fb33d0392..b27132d50 100644
--- a/wrapper/xgboost_wrapper.cpp
+++ b/wrapper/xgboost_wrapper.cpp
@@ -98,7 +98,14 @@ class Booster: public learner::BoostLearner {
  private:
   bool init_model;
 };
+}  // namespace wrapper
+}  // namespace xgboost
 
+using namespace xgboost::wrapper;
+
+#ifndef XGBOOST_STRICT_CXX98_
+namespace xgboost {
+namespace wrapper {
 // helper to support threadlocal
 struct ThreadLocalStore {
   std::vector<std::string*> data;
@@ -126,8 +133,6 @@ static ThreadLocalStore thread_local_store; } // namespace wrapper } // namespace xgboost -using namespace xgboost::wrapper; - /*! \brief macro to guard beginning and end section of all functions */ #define API_BEGIN() try { /*! @@ -173,6 +178,17 @@ const char *XGBSetGetLastError_(const char *str_set) { } return last_error->c_str(); } +#else +// crippled implementation for solaris case +// exception handling is not needed for R, so it is OK. +#define API_BEGIN() +#define API_END_FINALIZE(Finalize) return 0 +#define API_END() return 0 + +const char *XGBSetGetLastError_(const char *str_set) { + return NULL; +} +#endif /*! \brief return str message of the last error */ const char *XGBGetLastError() { From 0dbac3d11ec59f9c8322866d645e3717f68566d0 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 25 Jul 2015 21:23:40 -0700 Subject: [PATCH 21/83] fix travis --- scripts/travis_after_failure.sh | 2 +- src/utils/thread_buffer.h | 4 ++-- wrapper/xgboost_wrapper.cpp | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/travis_after_failure.sh b/scripts/travis_after_failure.sh index 15b74d87f..921e14953 100755 --- a/scripts/travis_after_failure.sh +++ b/scripts/travis_after_failure.sh @@ -1,5 +1,5 @@ #!/bin/bash if [ ${TASK} == "R-package" ]; then - cat R-package/xgboost.Rcheck/*.log + cat xgboost/xgboost.Rcheck/*.log fi diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h index c4dc1185d..bc4fb9f5e 100644 --- a/src/utils/thread_buffer.h +++ b/src/utils/thread_buffer.h @@ -219,7 +219,7 @@ class ThreadBuffer { factory_.Destroy(); } } - inline void SetParam(const char *name, const char *val) { + inline void SetParam(const char *name, const char *val) { } inline bool Init(void) { if (!factory_.Init()) return false; @@ -242,7 +242,7 @@ class ThreadBuffer { inline const ElemFactory &get_factory() const { return factory_; } - + private: // initialized bool init_end_; diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index b27132d50..6956b567d 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -181,14 +181,14 @@ const char *XGBSetGetLastError_(const char *str_set) { #else // crippled implementation for solaris case // exception handling is not needed for R, so it is OK. -#define API_BEGIN() +#define API_BEGIN() #define API_END_FINALIZE(Finalize) return 0 #define API_END() return 0 const char *XGBSetGetLastError_(const char *str_set) { return NULL; } -#endif +#endif // XGBOOST_STRICT_CXX98_ /*! 
\brief return str message of the last error */
 const char *XGBGetLastError() {

From b1dec917c7fe17f9d95f26ce1b47735c225dafd5 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Sat, 25 Jul 2015 21:29:46 -0700
Subject: [PATCH 22/83] Update page_fmatrix-inl.hpp

---
 src/io/page_fmatrix-inl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index 2aaec5b19..2fa5c83bd 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -319,7 +319,7 @@ class FMatrixPage : public IFMatrix {
       bytes_write += spage;
       double tnow = rabit::utils::GetTime();
       double tdiff = tnow - tstart;
-      utils::Printf("Writting to %s in %g MB/s, %lu MB written current speed:%g MB/s\n",
+      utils::Printf("Writing to %s in %g MB/s, %lu MB written\n",
                     col_data_name_.c_str(),
                     (bytes_write >> 20UL) / tdiff,
                     (bytes_write >> 20UL));

From 9a936721d84873c5d97fda04f9c96760e82500e5 Mon Sep 17 00:00:00 2001
From: Ajinkya Kale
Date: Sun, 26 Jul 2015 20:12:51 -0700
Subject: [PATCH 23/83] dropping raw graphlab url

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 18c5b77c1..0a09c5168 100644
--- a/README.md
+++ b/README.md
@@ -75,4 +75,4 @@ License
 XGBoost in Graphlab Create
 --------------------------
 * XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to do data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the [Graphlab Create](http://graphlab.com/products/create/quick-start-guide.html)
-* Nice blogpost by Jay Gu about using GLC boosted tree to solve kaggle bike sharing challenge: http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand
+* Nice [blogpost](http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand) by Jay Gu about using GLC boosted tree to solve kaggle bike sharing challenge:

From f2eb55683cc20f8e7885add55cca50916bb7ad5f Mon Sep 17 00:00:00 2001
From: Ajinkya Kale
Date: Sun, 26 Jul 2015 20:30:59 -0700
Subject: [PATCH 24/83] some more links and restructuring

---
 README.md | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 0a09c5168..51ed94633 100644
--- a/README.md
+++ b/README.md
@@ -44,14 +44,13 @@ Version
 
 Features
 --------
 
-* Easily accessible through python, R, Julia, CLI
-* Fast and memory efficient
-  - Can be more than 10 times faster than GBM in sklearn and R. [benchm-ml numbers](https://github.com/szilard/benchm-ml)
-  - Handles sparse matrices, support external memory
+* Easily accessible through CLI, [python](guide-python/basic_walkthrough.py),
+  [R](../R-package/demo/basic_walkthrough.R),
+  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl)
+* It's fast! Benchmark numbers comparing xgboost, H2O, Spark, R - [benchm-ml numbers](https://github.com/szilard/benchm-ml)
+* Memory efficient - Handles sparse matrices, supports external memory
 * Accurate prediction, and used extensively by data scientists and kagglers - [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links)
-* Distributed and Portable
-  - The distributed version runs on Hadoop (YARN), MPI, SGE etc.
-  - Scales to billions of examples and beyond
+* Distributed version runs on Hadoop (YARN), MPI, SGE etc., scales to billions of examples.
From fc27e2f32d79632261d9a905e3a34f4df42061e7 Mon Sep 17 00:00:00 2001
From: Ajinkya Kale
Date: Sun, 26 Jul 2015 20:31:51 -0700
Subject: [PATCH 25/83] adding DMLC back to the title

---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 51ed94633..df53f5e46 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-XGBoost
+DMLC/XGBoost
=======
[![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost)
[![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

From 74055cc15e3fde12847d483b7b36b2746f759d8f Mon Sep 17 00:00:00 2001
From: Ajinkya Kale
Date: Sun, 26 Jul 2015 21:22:35 -0700
Subject: [PATCH 26/83] fixing broken basic_walkthrough links

---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index df53f5e46..0f6ffc7fa 100644
--- a/README.md
+++ b/README.md
@@ -44,8 +44,8 @@ Version
Features
--------
-* Easily accessible through CLI, [python](guide-python/basic_walkthrough.py),
-  [R](../R-package/demo/basic_walkthrough.R),
+* Easily accessible through CLI, [python](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/basic_walkthrough.py),
+  [R](https://github.com/dmlc/xgboost/blob/master/R-package/demo/basic_walkthrough.R),
  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl)
* It's fast! Benchmark numbers comparing xgboost, H2O, Spark, R - [benchm-ml numbers](https://github.com/szilard/benchm-ml)
* Memory efficient - Handles sparse matrices, supports external memory

From 0c8c23194928d3afe3e5a2c1119fe01bb43ea0de Mon Sep 17 00:00:00 2001
From: Ajinkya Kale
Date: Wed, 29 Jul 2015 14:28:34 -0700
Subject: [PATCH 27/83] Fixing duplicate params in demo

Issue in "demo(package="xgboost", custom_objective)"

> bst <- xgb.train(param, dtrain, num_round, watchlist,
+                  objective=logregobj, eval_metric=evalerror)
Error in xgb.train(param, dtrain, num_round, watchlist, objective = logregobj,  :
  Duplicated term in parameters. Please check your list of params.
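The R diff just below resolves this by building each objective/metric pair into a single `param` list and never repeating it as a keyword argument. The same rule applies to the Python `train()` entry point that appears later in this series: supply a customized objective or metric once, through the `obj`/`feval` arguments, not again inside `params`. A hedged Python sketch of the correct pattern; the random data and the `logregobj`/`evalerror` names mirror the R demo and are assumptions here:

```python
# Hedged sketch: pass the custom objective/metric once, via obj/feval,
# rather than also listing them in params (the error quoted above).
import numpy as np
import xgboost as xgb

X = np.random.rand(100, 5)
y = np.random.randint(2, size=100)
dtrain = xgb.DMatrix(X, label=y)

def logregobj(preds, dtrain):
    # Gradient and hessian of the logistic loss on raw margins.
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    return preds - labels, preds * (1.0 - preds)

def evalerror(preds, dtrain):
    # Classification error; a raw margin > 0 corresponds to p > 0.5.
    labels = dtrain.get_label()
    return 'error', float(np.sum((preds > 0.0) != labels)) / len(labels)

param = {'max_depth': 2, 'eta': 1, 'silent': 1}  # no objective/metric here
bst = xgb.train(param, dtrain, num_boost_round=2,
                evals=[(dtrain, 'train')], obj=logregobj, feval=evalerror)
```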
--- R-package/demo/custom_objective.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/demo/custom_objective.R b/R-package/demo/custom_objective.R index 201f23d98..7234ead86 100644 --- a/R-package/demo/custom_objective.R +++ b/R-package/demo/custom_objective.R @@ -33,7 +33,7 @@ evalerror <- function(preds, dtrain) { return(list(metric = "error", value = err)) } -param <- list(max.depth=2,eta=1,nthread = 2, silent=1, +param <- list(max.depth=2, eta=1, nthread = 2, silent=1, objective=logregobj, eval_metric=evalerror) print ('start training with user customized objective') # training with customized objective, we can also do step by step training @@ -57,9 +57,9 @@ logregobjattr <- function(preds, dtrain) { hess <- preds * (1 - preds) return(list(grad = grad, hess = hess)) } - +param <- list(max.depth=2, eta=1, nthread = 2, silent=1, + objective=logregobjattr, eval_metric=evalerror) print ('start training with user customized objective, with additional attributes in DMatrix') # training with customized objective, we can also do step by step training # simply look at xgboost.py's implementation of train -bst <- xgb.train(param, dtrain, num_round, watchlist, - objective=logregobj, eval_metric=evalerror) +bst <- xgb.train(param, dtrain, num_round, watchlist) From cca955fc9465f627899023a492b74d1b62c22e92 Mon Sep 17 00:00:00 2001 From: Ajinkya Kale Date: Wed, 29 Jul 2015 16:20:55 -0700 Subject: [PATCH 28/83] add setuptools info --- wrapper/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/wrapper/README.md b/wrapper/README.md index ab013faf6..c5368bd7d 100644 --- a/wrapper/README.md +++ b/wrapper/README.md @@ -5,6 +5,7 @@ This folder provides wrapper of xgboost to other languages Python ===== * To make the python module, type ```./build.sh``` in the root directory of project +* Make sure you have [setuptools](https://pypi.python.org/pypi/setuptools) * Install with `python setup.py install` from this directory. * Refer also to the walk through example in [demo folder](../demo/guide-python) * **NOTE**: if you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the sklearn_parallel.py demo. 
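The next patch tidies the attributes the scikit-learn wrapper exposes after fitting: `eval_results_`, `best_score_`, and `best_iteration_` lose their trailing underscores, and the latter two are set only when `early_stopping_rounds` is given. A hedged usage sketch of that interface follows; the digits data and split are assumptions of this sketch, not part of the patch:

```python
# Hedged usage sketch for the sklearn-API early stopping interface
# cleaned up in the next patch.
import xgboost as xgb
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits(n_class=2)  # small binary problem (assumed data)
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=0)

clf = xgb.XGBClassifier(n_estimators=200)
clf.fit(X_train, y_train,
        early_stopping_rounds=10, eval_metric='auc',
        eval_set=[(X_test, y_test)])

# best_score/best_iteration are set only when early stopping is active;
# eval_results holds the per-round metric history for each eval set,
# keyed "validation_0", "validation_1", ... in eval_set order.
print(clf.best_score, clf.best_iteration)
print(clf.eval_results['validation_0'][:5])
```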
From 5f9f42292c82afea411a3939e58544ef4cc723d2 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 17:49:55 -0700 Subject: [PATCH 29/83] fix sklearn best score --- wrapper/xgboost.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 77f5bedb8..32f9a52b4 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -866,6 +866,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, bst.best_iteration = best_score_i return bst + class CVPack(object): """"Auxiliary datastruct to hold one fold of CV.""" def __init__(self, dtrain, dtest, param): @@ -1154,9 +1155,11 @@ class XGBModel(XGBModelBase): eval_results = {k: np.array(v, dtype=float) for k, v in eval_results.items()} eval_results = {k: np.array(v) for k, v in eval_results.items()} - self.eval_results_ = eval_results - self.best_score_ = self._Booster.best_score - self.best_iteration_ = self._Booster.best_iteration + self.eval_results = eval_results + + if early_stopping_rounds is not None: + self.best_score = self._Booster.best_score + self.best_iteration = self._Booster.best_iteration return self def predict(self, data): @@ -1266,9 +1269,11 @@ class XGBClassifier(XGBModel, XGBClassifierBase): if eval_results: eval_results = {k: np.array(v, dtype=float) for k, v in eval_results.items()} - self.eval_results_ = eval_results - self.best_score_ = self._Booster.best_score - self.best_iteration_ = self._Booster.best_iteration + self.eval_results = eval_results + + if early_stopping_rounds is not None: + self.best_score = self._Booster.best_score + self.best_iteration = self._Booster.best_iteration return self From efde0eb1719f84c6ac74dadbff134379b8889d4c Mon Sep 17 00:00:00 2001 From: Tong He Date: Wed, 29 Jul 2015 18:16:59 -0700 Subject: [PATCH 30/83] enable travis on os x --- .travis.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.travis.yml b/.travis.yml index ac4f58154..8d754c333 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,10 @@ sudo: true +# Enabling test on Linux and OS X +os: + - linux + - osx + # Use Build Matrix to do lint and build seperately env: matrix: From 75c8bdf962f38a8eeb521b1f966779a1883c4721 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 18:24:19 -0700 Subject: [PATCH 31/83] add osx matrix --- .travis.yml | 6 ++++++ scripts/travis_script.sh | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index ac4f58154..494073850 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,10 @@ env: - TASK=build CXX=g++ - TASK=build-with-dmlc CXX=g++ +os: + - linux + - osx + # dependent apt packages addons: apt: @@ -25,6 +29,7 @@ addons: - python-nose before_install: + - scripts/travis_osx_install.sh - git clone https://github.com/dmlc/dmlc-core - export TRAVIS=dmlc-core/scripts/travis/ - export PYTHONPATH=${PYTHONPATH}:${PWD}/wrapper @@ -33,6 +38,7 @@ before_install: install: - pip install cpplint pylint --user `whoami` + script: scripts/travis_script.sh diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index 5702d35cd..d382a1a2e 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -2,7 +2,9 @@ # main script of travis if [ ${TASK} == "lint" ]; then - make lint || exit -1 + if [ ${TRAVIS_OS_NAME} != "osx" ]; then + make lint || exit -1 + fi fi if [ ${TASK} == "build" ]; then From f44511e94df2b0bc6620c87b000facac8897c4f0 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 18:29:06 -0700 Subject: [PATCH 32/83] 
fix mac build --- scripts/travis_osx_install.sh | 11 +++++++++++ scripts/travis_script.sh | 4 ++++ 2 files changed, 15 insertions(+) create mode 100755 scripts/travis_osx_install.sh diff --git a/scripts/travis_osx_install.sh b/scripts/travis_osx_install.sh new file mode 100755 index 000000000..d82dfe63d --- /dev/null +++ b/scripts/travis_osx_install.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +if [ ${TRAVIS_OS_NAME} != "osx" ]; then + exit 0 +fi + +brew update +brew install unzip +brew install python-numpy +brew install python-scipy +brew install python-nose diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index d382a1a2e..3717c2ed2 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -7,6 +7,10 @@ if [ ${TASK} == "lint" ]; then fi fi +if [ ${TRAVIS_OS_NAME} != "osx" ]; then + export no_omp=1 +fi + if [ ${TASK} == "build" ]; then make all CXX=${CXX} || exit -1 fi From 2ab6907fe287df32bfd5602f15aef832e2d02986 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 18:45:42 -0700 Subject: [PATCH 33/83] add os lrt --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index a24bea327..aa9bf632f 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,12 @@ ifeq ($(OS), Windows_NT) export CC = gcc -m64 endif +UNAME= $(shell uname) + +ifeq ($(UNAME), Linux) + LDFLAGS += -lrt +endif + ifeq ($(no_omp),1) CFLAGS += -DDISABLE_OPENMP else From 24a188588a6d3dae7a8d7be803cc57336f14bebb Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 20:10:29 -0700 Subject: [PATCH 34/83] ok --- scripts/travis_osx_install.sh | 9 +++++---- scripts/travis_script.sh | 6 +++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/scripts/travis_osx_install.sh b/scripts/travis_osx_install.sh index d82dfe63d..4b4c714b8 100755 --- a/scripts/travis_osx_install.sh +++ b/scripts/travis_osx_install.sh @@ -5,7 +5,8 @@ if [ ${TRAVIS_OS_NAME} != "osx" ]; then fi brew update -brew install unzip -brew install python-numpy -brew install python-scipy -brew install python-nose + +if [ ${TASK} == "python-package" ]; then + brew install python git + easy_install pip scipy numpy +fi diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index 3717c2ed2..85bfab47f 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -18,7 +18,11 @@ fi if [ ${TASK} == "build-with-dmlc" ]; then cd dmlc-core cp make/config.mk . - echo "USE_S3=1" >> config.mk + if [ ${TRAVIS_OS_NAME} != "osx" ]; then + echo "USE_S3=1" >> config.mk + else + echo "USE_S3=0" >> config.mk + fi make all CXX=${CXX}|| exit -1 cd .. 
make dmlc=dmlc-core CXX=${CXX} || exit -1 From 6062f4dd587ebcc413e5d838702c74cdd39792a1 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 20:18:54 -0700 Subject: [PATCH 35/83] update --- scripts/travis_osx_install.sh | 3 ++- scripts/travis_script.sh | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/travis_osx_install.sh b/scripts/travis_osx_install.sh index 4b4c714b8..95e5838a8 100755 --- a/scripts/travis_osx_install.sh +++ b/scripts/travis_osx_install.sh @@ -8,5 +8,6 @@ brew update if [ ${TASK} == "python-package" ]; then brew install python git - easy_install pip scipy numpy + easy_install pip + pip install numpy scipy fi diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index 85bfab47f..0d4cf8049 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -7,7 +7,7 @@ if [ ${TASK} == "lint" ]; then fi fi -if [ ${TRAVIS_OS_NAME} != "osx" ]; then +if [ ${TRAVIS_OS_NAME} == "osx" ]; then export no_omp=1 fi @@ -37,7 +37,10 @@ if [ ${TASK} == "python-package" ]; then nosetests tests/python || exit -1 fi +# only test java under linux for now if [ ${TASK} == "java-package" ]; then - make java CXX=${CXX} || exit -1 - scripts/travis_java_script.sh || exit -1 + if [ ${TRAVIS_OS_NAME} != "osx" ]; then + make java CXX=${CXX} || exit -1 + scripts/travis_java_script.sh || exit -1 + fi fi From d9599f816fab5cd777a8f5ab8bfafc98a5736316 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:01:53 -0700 Subject: [PATCH 36/83] add appvegor --- appvegor.yml | 22 ++++++++++++++++++++++ scripts/travis_osx_install.sh | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 appvegor.yml diff --git a/appvegor.yml b/appvegor.yml new file mode 100644 index 000000000..7adff2a2d --- /dev/null +++ b/appvegor.yml @@ -0,0 +1,22 @@ +platform: + - x64 + - x86 + +configuration: + - Release + +clone_folder: c:\dmlc\xgboost + + +install: + - cmd: git clone https://github.com/ogrisel/python-appveyor-demo + - ps: if (-not(Test-Path($env:PYTHON))) { & python-appvegor-demo\appveyor\install.ps1 } + - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" + - "python --version" + - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" + - "pip instal nose numpy scipy" + +build: + parallel: true + project: windows\xgboost.sln + diff --git a/scripts/travis_osx_install.sh b/scripts/travis_osx_install.sh index 95e5838a8..8121afd6b 100755 --- a/scripts/travis_osx_install.sh +++ b/scripts/travis_osx_install.sh @@ -9,5 +9,5 @@ brew update if [ ${TASK} == "python-package" ]; then brew install python git easy_install pip - pip install numpy scipy + pip install numpy scipy nose fi From 15286523cf98b6d4fe28d9c4a5bbb8f3b6ae41aa Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:06:29 -0700 Subject: [PATCH 37/83] ok --- appvegor.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/appvegor.yml b/appvegor.yml index 7adff2a2d..bb9bfe8c9 100644 --- a/appvegor.yml +++ b/appvegor.yml @@ -5,9 +5,6 @@ platform: configuration: - Release -clone_folder: c:\dmlc\xgboost - - install: - cmd: git clone https://github.com/ogrisel/python-appveyor-demo - ps: if (-not(Test-Path($env:PYTHON))) { & python-appvegor-demo\appveyor\install.ps1 } @@ -17,6 +14,6 @@ install: - "pip instal nose numpy scipy" build: - parallel: true - project: windows\xgboost.sln + - "msbuild windows\xgboost.sln" + From 8f6e5e197b6be3bdcc0d32520616a1e3b142206e Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:07:18 -0700 Subject: [PATCH 
38/83] ok --- appvegor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/appvegor.yml b/appvegor.yml index bb9bfe8c9..2d7f233fe 100644 --- a/appvegor.yml +++ b/appvegor.yml @@ -12,8 +12,8 @@ install: - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" - "pip instal nose numpy scipy" - -build: - "msbuild windows\xgboost.sln" +build: false + From fa41fe3f13b03d041f6f01ade3d51ce5a5eaf8e7 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:09:42 -0700 Subject: [PATCH 39/83] rename --- appvegor.yml => appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename appvegor.yml => appveyor.yml (89%) diff --git a/appvegor.yml b/appveyor.yml similarity index 89% rename from appvegor.yml rename to appveyor.yml index 2d7f233fe..ead70d5ef 100644 --- a/appvegor.yml +++ b/appveyor.yml @@ -12,8 +12,8 @@ install: - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" - "pip instal nose numpy scipy" - - "msbuild windows\xgboost.sln" -build: false +build: + project: windows\xgboost.sln From c870c08b7ebd46f189e6d94cc98b51817ba2703f Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:11:44 -0700 Subject: [PATCH 40/83] disable openmp in dmlc --- scripts/travis_script.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index 0d4cf8049..402cb6992 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -9,6 +9,7 @@ fi if [ ${TRAVIS_OS_NAME} == "osx" ]; then export no_omp=1 + export NO_OPENMP=1 fi if [ ${TASK} == "build" ]; then From 2bf0eeb82dbe60512151feef6f33e24a6f7fc344 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:15:25 -0700 Subject: [PATCH 41/83] update appvegor --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index ead70d5ef..96018b1d0 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -7,7 +7,7 @@ configuration: install: - cmd: git clone https://github.com/ogrisel/python-appveyor-demo - - ps: if (-not(Test-Path($env:PYTHON))) { & python-appvegor-demo\appveyor\install.ps1 } + - ps: python-appvegor-demo\appveyor\install.ps1 - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" From 899bfbfbaed6c05225c9fae530426daead64e3d1 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:19:49 -0700 Subject: [PATCH 42/83] rest --- appveyor.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 96018b1d0..fbad6f51a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -7,7 +7,8 @@ configuration: install: - cmd: git clone https://github.com/ogrisel/python-appveyor-demo - - ps: python-appvegor-demo\appveyor\install.ps1 + - cmd: copy-item python-appvegor-demo\appvegor\install.ps1 install.ps1 + - ps: .\\install.ps1 - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" @@ -15,5 +16,3 @@ install: build: project: windows\xgboost.sln - - From 0d5741bc7469ebba3e480716619a6c1d2279d528 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:21:15 -0700 Subject: [PATCH 43/83] rest --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index fbad6f51a..a7f7e2de0 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -7,7 +7,7 @@ configuration: install: - cmd: git clone 
https://github.com/ogrisel/python-appveyor-demo - - cmd: copy-item python-appvegor-demo\appvegor\install.ps1 install.ps1 + - cmd: copy python-appvegor-demo\appvegor\install.ps1 install.ps1 - ps: .\\install.ps1 - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" From 033a0c139e80fbbbd6863169c0a83cf315b8f3ef Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:21:58 -0700 Subject: [PATCH 44/83] ok --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index a7f7e2de0..b6feac2a5 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -8,7 +8,7 @@ configuration: install: - cmd: git clone https://github.com/ogrisel/python-appveyor-demo - cmd: copy python-appvegor-demo\appvegor\install.ps1 install.ps1 - - ps: .\\install.ps1 + - ps: install.ps1 - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" From bb13c2cd1587d5fcf9b1f0e99b488f291b6394bd Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:25:52 -0700 Subject: [PATCH 45/83] ok --- appveyor.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index b6feac2a5..5c7dd813f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -7,8 +7,9 @@ configuration: install: - cmd: git clone https://github.com/ogrisel/python-appveyor-demo - - cmd: copy python-appvegor-demo\appvegor\install.ps1 install.ps1 - - ps: install.ps1 + - cmd: mkdir appveyor + - cmd: copy python-appveyor-demo\appvegor\install.ps1 appveyor\install.ps1 + - ps: appvegor\install.ps1 - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" From 1a91b15a6ea9cbbaaf749f3f3b3dfbbff3e48a07 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:27:40 -0700 Subject: [PATCH 46/83] ok --- appveyor.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 5c7dd813f..230991777 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -7,9 +7,7 @@ configuration: install: - cmd: git clone https://github.com/ogrisel/python-appveyor-demo - - cmd: mkdir appveyor - - cmd: copy python-appveyor-demo\appvegor\install.ps1 appveyor\install.ps1 - - ps: appvegor\install.ps1 + - ps: python-appveyor-demo\appveyor\install.ps1 - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" From c2c5ad2d47fccaf76e0ea8d6dd0d903ad7c9763d Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:35:15 -0700 Subject: [PATCH 47/83] finl --- appveyor.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 230991777..0211f15c4 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,3 +1,7 @@ +environment: + global: + CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\python-appveyor-demo\\appveyor\\run_with_env.cmd" + platform: - x64 - x86 @@ -7,11 +11,17 @@ configuration: install: - cmd: git clone https://github.com/ogrisel/python-appveyor-demo - - ps: python-appveyor-demo\appveyor\install.ps1 + - ECHO "Filesystem root:" + - ps: "ls \"C:/\"" + + - ECHO "Installed SDKs:" + - ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\"" + + - ps: if (-not(Test-Path($env:PYTHON))) { & python-appveyor-demo\appveyor\install.ps1 } - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" - - "pip instal nose numpy scipy" + 
- ""%CMD_IN_ENV% pip install numpy scipy nose" build: project: windows\xgboost.sln From 7e166066185773e66a9afc3145e4116937af4da1 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:36:28 -0700 Subject: [PATCH 48/83] ok --- appveyor.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 0211f15c4..388acccf3 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -21,7 +21,10 @@ install: - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" - - ""%CMD_IN_ENV% pip install numpy scipy nose" + - "%CMD_IN_ENV% pip install numpy scipy nose" build: project: windows\xgboost.sln + +test_script: + - "%CMD_IN_ENV% nosetests test\\python" \ No newline at end of file From 6f4148faab6f9034ab6bb5f622dd7513facfb789 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:37:16 -0700 Subject: [PATCH 49/83] ok --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 388acccf3..0d0a2b72f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -17,7 +17,7 @@ install: - ECHO "Installed SDKs:" - ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\"" - - ps: if (-not(Test-Path($env:PYTHON))) { & python-appveyor-demo\appveyor\install.ps1 } + - ps: python-appveyor-demo\appveyor\install.ps1 - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" From e30c724bd4b5f156bbe80903415937fd46e9a48d Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:39:34 -0700 Subject: [PATCH 50/83] ok --- appveyor.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 0d0a2b72f..401a150c3 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,9 +2,17 @@ environment: global: CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\python-appveyor-demo\\appveyor\\run_with_env.cmd" + matrix: + - PYTHON: "C:\\Python27-x64" + PYTHON_VERSION: "2.7.x" # currently 2.7.9 + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python33-x64" + PYTHON_VERSION: "3.3.x" # currently 3.3.5 + PYTHON_ARCH: "64" + platform: - x64 - - x86 configuration: - Release From 259dea0777f3d7e9cacdca76660fcc9cccf02f02 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 21:46:41 -0700 Subject: [PATCH 51/83] incomplete appveyor --- appveyor.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 401a150c3..67c310a00 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -29,10 +29,7 @@ install: - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" - - "%CMD_IN_ENV% pip install numpy scipy nose" build: project: windows\xgboost.sln -test_script: - - "%CMD_IN_ENV% nosetests test\\python" \ No newline at end of file From 5dab410537e098f212863d4e8824dd7ec261dcde Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 22:00:38 -0700 Subject: [PATCH 52/83] ok --- appveyor.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 67c310a00..c9948ba5f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -31,5 +31,4 @@ install: - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" build: - project: windows\xgboost.sln - + - cmd: msbuild windows\xgboost.sln /openmp- From 67d332e0f55c9618e05fa5eb6cc2eb1977751200 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 22:01:42 -0700 Subject: [PATCH 53/83] ok --- 
appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index c9948ba5f..6a64fb15f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -29,6 +29,6 @@ install: - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" - -build: - cmd: msbuild windows\xgboost.sln /openmp- + +build: off From 6f01fa50ce6f5e0ff177ea5b59e4e4ae7f6eac2a Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 22:14:38 -0700 Subject: [PATCH 54/83] try disable omp --- appveyor.yml | 1 + src/utils/omp.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 6a64fb15f..3f5011824 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,6 +1,7 @@ environment: global: CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\python-appveyor-demo\\appveyor\\run_with_env.cmd" + DISABLE_OPENMP: 1 matrix: - PYTHON: "C:\\Python27-x64" diff --git a/src/utils/omp.h b/src/utils/omp.h index ddd3467d9..e3c61110c 100644 --- a/src/utils/omp.h +++ b/src/utils/omp.h @@ -7,10 +7,10 @@ #ifndef XGBOOST_UTILS_OMP_H_ #define XGBOOST_UTILS_OMP_H_ -#if defined(_OPENMP) +#if defined(_OPENMP) && !defined(DISABLE_OPENMP) #include #else -#ifndef DISABLE_OPENMP +#if !defined(DISABLE_OPENMP) // use pragma message instead of warning #pragma message("Warning: OpenMP is not available,"\ "xgboost will be compiled into single-thread code."\ From 0a9c8acd6db158a10616de64799af1343d80c0bc Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 22:17:25 -0700 Subject: [PATCH 55/83] final --- appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 3f5011824..30c009c2a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -30,6 +30,6 @@ install: - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" - - cmd: msbuild windows\xgboost.sln /openmp- -build: off +build: + project: windows\xgboost.sln From 73ec467dd3911319edb36714589e8ac27a2417e2 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 22:22:43 -0700 Subject: [PATCH 56/83] final --- appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 30c009c2a..ab1040434 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -30,6 +30,6 @@ install: - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" + - cmd: msbuild windows\xgboost.sln /DDISABLE_OPENMP=1 /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" -build: - project: windows\xgboost.sln +build: off \ No newline at end of file From ebefb78fd418b4fc542c4652207d692728388b1f Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 22:26:21 -0700 Subject: [PATCH 57/83] use debug --- appveyor.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index ab1040434..dfc291922 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -16,7 +16,7 @@ platform: - x64 configuration: - - Release + - Debug install: - cmd: git clone https://github.com/ogrisel/python-appveyor-demo @@ -30,6 +30,6 @@ install: - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" - - cmd: msbuild windows\xgboost.sln /DDISABLE_OPENMP=1 /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" -build: off \ No 
newline at end of file +build: + project: windows\xgboost.sln \ No newline at end of file From 4a6f4eaac95474da59a3b6107b327bec456c18d5 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 29 Jul 2015 22:31:35 -0700 Subject: [PATCH 58/83] giveup for now, appveyor do not support openmp for msvc yet allow openmp to switch on --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index dfc291922..1fb594cb2 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -16,7 +16,7 @@ platform: - x64 configuration: - - Debug + - Release install: - cmd: git clone https://github.com/ogrisel/python-appveyor-demo From 11f27beccd2a64d8800d00e44371e5fc2a324c51 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 29 Jul 2015 22:41:06 -0700 Subject: [PATCH 59/83] checkin debug --- appveyor.yml | 2 +- subtree/rabit/windows/rabit/rabit.vcxproj | 2 +- windows/xgboost.sln | 8 ++++---- windows/xgboost/xgboost.vcxproj | 1 + windows/xgboost_wrapper/xgboost_wrapper.vcxproj | 1 + 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 1fb594cb2..dfc291922 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -16,7 +16,7 @@ platform: - x64 configuration: - - Release + - Debug install: - cmd: git clone https://github.com/ogrisel/python-appveyor-demo diff --git a/subtree/rabit/windows/rabit/rabit.vcxproj b/subtree/rabit/windows/rabit/rabit.vcxproj index 5948e4c17..c9594b182 100644 --- a/subtree/rabit/windows/rabit/rabit.vcxproj +++ b/subtree/rabit/windows/rabit/rabit.vcxproj @@ -29,7 +29,7 @@ MultiByte - Application + StaticLibrary true MultiByte diff --git a/windows/xgboost.sln b/windows/xgboost.sln index 7bd8db5b2..b1371a266 100644 --- a/windows/xgboost.sln +++ b/windows/xgboost.sln @@ -22,15 +22,16 @@ Global GlobalSection(ProjectConfigurationPlatforms) = postSolution {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.ActiveCfg = Debug|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.Build.0 = Debug|Win32 - {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.ActiveCfg = Release|x64 - {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.Build.0 = Release|x64 + {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.ActiveCfg = Debug|x64 + {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.Build.0 = Debug|x64 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.ActiveCfg = Release|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.Build.0 = Release|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.ActiveCfg = Release|x64 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.Build.0 = Release|x64 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.ActiveCfg = Debug|Win32 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.Build.0 = Debug|Win32 - {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|x64.ActiveCfg = Debug|Win32 + {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|x64.ActiveCfg = Debug|x64 + {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|x64.Build.0 = Debug|x64 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.ActiveCfg = Release|Win32 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.Build.0 = Release|Win32 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|x64.ActiveCfg = Release|x64 @@ -46,7 +47,6 @@ Global {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|Win32.ActiveCfg = Debug|Win32 {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|Win32.Build.0 = Debug|Win32 {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|x64.ActiveCfg = Debug|x64 - {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|x64.Build.0 = Debug|x64 
{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|Win32.ActiveCfg = Release|Win32 {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|Win32.Build.0 = Release|Win32 {20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|x64.ActiveCfg = Release|x64 diff --git a/windows/xgboost/xgboost.vcxproj b/windows/xgboost/xgboost.vcxproj index c14d84645..00846f36a 100644 --- a/windows/xgboost/xgboost.vcxproj +++ b/windows/xgboost/xgboost.vcxproj @@ -85,6 +85,7 @@ true + $(OutDir)\rabit.lib;%(AdditionalDependencies) diff --git a/windows/xgboost_wrapper/xgboost_wrapper.vcxproj b/windows/xgboost_wrapper/xgboost_wrapper.vcxproj index 62f7d0fd3..cff3cde65 100644 --- a/windows/xgboost_wrapper/xgboost_wrapper.vcxproj +++ b/windows/xgboost_wrapper/xgboost_wrapper.vcxproj @@ -86,6 +86,7 @@ true + $(OutDir)\rabit.lib;%(AdditionalDependencies) From f9c02aa40f616e2035f2180282b098c09626adbc Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 29 Jul 2015 22:45:28 -0700 Subject: [PATCH 60/83] final attempt --- src/utils/omp.h | 2 +- windows/xgboost.sln | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/utils/omp.h b/src/utils/omp.h index e3c61110c..c7a04dc32 100644 --- a/src/utils/omp.h +++ b/src/utils/omp.h @@ -10,7 +10,7 @@ #if defined(_OPENMP) && !defined(DISABLE_OPENMP) #include #else -#if !defined(DISABLE_OPENMP) +#if !defined(DISABLE_OPENMP) && !defined(_MSC_VER) // use pragma message instead of warning #pragma message("Warning: OpenMP is not available,"\ "xgboost will be compiled into single-thread code."\ diff --git a/windows/xgboost.sln b/windows/xgboost.sln index b1371a266..a3c861f6c 100644 --- a/windows/xgboost.sln +++ b/windows/xgboost.sln @@ -23,7 +23,6 @@ Global {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.ActiveCfg = Debug|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.Build.0 = Debug|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.ActiveCfg = Debug|x64 - {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.Build.0 = Debug|x64 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.ActiveCfg = Release|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.Build.0 = Release|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.ActiveCfg = Release|x64 From 264c636adf4fce3a93f6f6c4a1389ddf5079de6d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 29 Jul 2015 22:50:23 -0700 Subject: [PATCH 61/83] add dep --- windows/xgboost.sln | 3 +++ 1 file changed, 3 insertions(+) diff --git a/windows/xgboost.sln b/windows/xgboost.sln index a3c861f6c..d94c14932 100644 --- a/windows/xgboost.sln +++ b/windows/xgboost.sln @@ -7,6 +7,9 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost", "xgboost\xgboost. 
EndProjectSection EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost_wrapper", "xgboost_wrapper\xgboost_wrapper.vcxproj", "{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}" + ProjectSection(ProjectDependencies) = postProject + {D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F} = {D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F} + EndProjectSection EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rabit", "..\subtree\rabit\windows\rabit\rabit.vcxproj", "{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}" EndProject From 53107995bfed1f7ab3de8e6bb4fc2ea29fcb8889 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 29 Jul 2015 22:54:21 -0700 Subject: [PATCH 62/83] give up for now --- appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index dfc291922..c966886fb 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -31,5 +31,5 @@ install: - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" -build: - project: windows\xgboost.sln \ No newline at end of file +build: off + #project: windows\xgboost.sln \ No newline at end of file From 7560518eec848ed380a9da0a31f76fd33b5c659e Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 29 Jul 2015 23:23:40 -0700 Subject: [PATCH 63/83] sleep --- appveyor.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index c966886fb..d7d0c58b1 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,7 +2,8 @@ environment: global: CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\python-appveyor-demo\\appveyor\\run_with_env.cmd" DISABLE_OPENMP: 1 - + VisualStudioVersion: 12.0 + matrix: - PYTHON: "C:\\Python27-x64" PYTHON_VERSION: "2.7.x" # currently 2.7.9 @@ -16,7 +17,7 @@ platform: - x64 configuration: - - Debug + - Release install: - cmd: git clone https://github.com/ogrisel/python-appveyor-demo @@ -31,5 +32,5 @@ install: - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" -build: off - #project: windows\xgboost.sln \ No newline at end of file +build: + project: windows\xgboost.sln \ No newline at end of file From f6fed76e7ee22a6e1dbfe253c039ca33dabaea48 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 29 Jul 2015 23:24:54 -0700 Subject: [PATCH 64/83] not working --- appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index d7d0c58b1..c1367d52e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -32,5 +32,5 @@ install: - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" -build: - project: windows\xgboost.sln \ No newline at end of file +build: off + #project: windows\xgboost.sln \ No newline at end of file From c2fec29bfa91d0fad27fa8d0338eac283aac4996 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 30 Jul 2015 22:04:45 -0700 Subject: [PATCH 65/83] python package refactor into python-package --- Makefile | 2 +- demo/.gitignore | 3 +- demo/README.md | 18 +- demo/guide-python/sklearn_examples.py | 10 - doc/python.md | 2 +- python-package/.gitignore | 3 + python-package/README.md | 7 + python-package/setup.py | 21 + .../xgboost/core.py | 630 +----------------- windows/README.md | 2 +- wrapper/README.md | 27 +- wrapper/__init__.py | 0 wrapper/setup.py | 39 -- 13 files changed, 87 insertions(+), 677 deletions(-) create mode 100644 python-package/.gitignore create mode 100644 python-package/README.md create mode 100644 python-package/setup.py rename wrapper/xgboost.py => python-package/xgboost/core.py (50%) delete mode 100644 
wrapper/__init__.py delete mode 100644 wrapper/setup.py diff --git a/Makefile b/Makefile index aa9bf632f..c9e35e80c 100644 --- a/Makefile +++ b/Makefile @@ -169,7 +169,7 @@ Rcheck: # lint requires dmlc to be in current folder lint: - dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package + dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package python-package clean: $(RM) -rf $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~ diff --git a/demo/.gitignore b/demo/.gitignore index e52797d15..ee79c704b 100644 --- a/demo/.gitignore +++ b/demo/.gitignore @@ -1 +1,2 @@ -*.libsvm \ No newline at end of file +*.libsvm +*.pkl diff --git a/demo/README.md b/demo/README.md index 49e9e52b8..fcfaa8434 100644 --- a/demo/README.md +++ b/demo/README.md @@ -1,14 +1,14 @@ XGBoost Examples ==== -This folder contains all the code examples using xgboost. +This folder contains all the code examples using xgboost. * Contribution of examples, benchmarks is more than welcome! * If you like to share how you use xgboost to solve your problem, send a pull request:) - + Features Walkthrough ==== -This is a list of short codes introducing different functionalities of xgboost and its wrapper. -* Basic walkthrough of wrappers +This is a list of short codes introducing different functionalities of xgboost packages. +* Basic walkthrough of packages [python](guide-python/basic_walkthrough.py) [R](../R-package/demo/basic_walkthrough.R) [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl) @@ -20,18 +20,18 @@ This is a list of short codes introducing different functionalities of xgboost a [python](guide-python/boost_from_prediction.py) [R](../R-package/demo/boost_from_prediction.R) [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl) -* Predicting using first n trees +* Predicting using first n trees [python](guide-python/predict_first_ntree.py) [R](../R-package/demo/boost_from_prediction.R) - [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl) + [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl) * Generalized Linear Model [python](guide-python/generalized_linear_model.py) [R](../R-package/demo/generalized_linear_model.R) - [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/generalized_linear_model.jl) + [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/generalized_linear_model.jl) * Cross validation [python](guide-python/cross_validation.py) [R](../R-package/demo/cross_validation.R) - [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl) + [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl) * Predicting leaf indices [python](guide-python/predict_leaf_indices.py) [R](../R-package/demo/predict_leaf_indices.R) @@ -48,5 +48,5 @@ However, the parameter settings can be applied to all versions Benchmarks ==== * [Starter script for Kaggle Higgs Boson](kaggle-higgs) -* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution) +* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution) diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py index 56fed1dd2..7ce95b491 100755 --- a/demo/guide-python/sklearn_examples.py +++ b/demo/guide-python/sklearn_examples.py @@ -75,13 +75,3 @@ 
clf = xgb.XGBClassifier() clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc", eval_set=[(X_test, y_test)]) -# Custom evaluation function -from sklearn.metrics import log_loss - - -def log_loss_eval(y_pred, y_true): - return "log-loss", log_loss(y_true.get_label(), y_pred) - - -clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric=log_loss_eval, - eval_set=[(X_test, y_test)]) diff --git a/doc/python.md b/doc/python.md index dfe886fe9..93b5c43d4 100644 --- a/doc/python.md +++ b/doc/python.md @@ -14,7 +14,7 @@ A [walk through python example](https://github.com/tqchen/xgboost/blob/master/de = #### Install -To install XGBoost, you need to run `make` in the root directory of the project and then in the `wrappers` directory run +To install XGBoost, you need to run `make` in the root directory of the project and then in the `python-package` directory run ```shell python setup.py install diff --git a/python-package/.gitignore b/python-package/.gitignore new file mode 100644 index 000000000..d765c67c7 --- /dev/null +++ b/python-package/.gitignore @@ -0,0 +1,3 @@ +build +dist +*.egg* \ No newline at end of file diff --git a/python-package/README.md b/python-package/README.md new file mode 100644 index 000000000..a4ac71d4d --- /dev/null +++ b/python-package/README.md @@ -0,0 +1,7 @@ +XGBoost Python Package +====================== +* To make the python module, type ```./build.sh``` in the root directory of project +* Make sure you have [setuptools](https://pypi.python.org/pypi/setuptools) +* Install with `python setup.py install` from this directory. +* Refer also to the walk through example in [demo folder](../demo/guide-python) +* **NOTE**: if you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the sklearn_parallel.py demo. 
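The new `setup.py` that follows ships the compiled wrapper library located by `find_lib_path()` as package data. One detail worth flagging in the `core.py` hunk further below: in the `XGBoostLibraryNotFound` message, `%` binds more tightly than `+`, so the format is applied only to the second string literal (which contains no placeholder) and would raise a `TypeError` at error-reporting time; the literals also contain the typos "Libarary", "candicate", and "oath". A corrected sketch of that error path, with the message spelling fixed; the helper name is hypothetical and this is an illustration, not the shipped code:

```python
# Hedged sketch: format the whole message so %s is actually substituted;
# in the quoted hunk, % applies only to the second literal.
class XGBoostLibraryNotFound(Exception):
    """Raised when no compiled xgboost library can be located."""

def report_missing_library(dll_path):
    # dll_path: list of candidate paths, as built by find_lib_path().
    raise XGBoostLibraryNotFound(
        'Cannot find XGBoost library in the candidate path %s. '
        'Did you run build.sh in the root path?' % str(dll_path))
```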
diff --git a/python-package/setup.py b/python-package/setup.py new file mode 100644 index 000000000..42e39f3ba --- /dev/null +++ b/python-package/setup.py @@ -0,0 +1,21 @@ +# pylint: disable=invalid-name +"""Setup xgboost package.""" +from __future__ import absolute_import +import sys +from setuptools import setup +sys.path.insert(0, '.') +import xgboost + +LIB_PATH = xgboost.core.find_lib_path() + +setup(name='xgboost', + version=xgboost.__version__, + description=xgboost.__doc__, + install_requires=[ + 'numpy', + 'scipy', + ], + zip_safe=False, + packages=['xgboost'], + data_files=[('xgboost', [LIB_PATH[0]])], + url='https://github.com/dmlc/xgboost') diff --git a/wrapper/xgboost.py b/python-package/xgboost/core.py similarity index 50% rename from wrapper/xgboost.py rename to python-package/xgboost/core.py index 32f9a52b4..85017cb82 100644 --- a/wrapper/xgboost.py +++ b/python-package/xgboost/core.py @@ -1,17 +1,10 @@ # coding: utf-8 -""" -xgboost: eXtreme Gradient Boosting library - -Version: 0.40 -Authors: Tianqi Chen, Bing Xu -Early stopping by Zygmunt Zając -""" -# pylint: disable=too-many-arguments, too-many-locals, too-many-lines, invalid-name, fixme +# pylint: disable=too-many-arguments +"""Core XGBoost Library.""" from __future__ import absolute_import import os import sys -import re import ctypes import platform import collections @@ -19,13 +12,6 @@ import collections import numpy as np import scipy.sparse -try: - from sklearn.base import BaseEstimator - from sklearn.base import RegressorMixin, ClassifierMixin - from sklearn.preprocessing import LabelEncoder - SKLEARN_INSTALLED = True -except ImportError: - SKLEARN_INSTALLED = False class XGBoostLibraryNotFound(Exception): """Error throwed by when xgboost is not found""" @@ -35,7 +21,6 @@ class XGBoostError(Exception): """Error throwed by xgboost trainer.""" pass -__all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train'] if sys.version_info[0] == 3: # pylint: disable=invalid-name @@ -44,30 +29,43 @@ else: # pylint: disable=invalid-name STRING_TYPES = basestring, -def load_xglib(): - """Load the xgboost library.""" + +def find_lib_path(): + """Load find the path to xgboost dynamic library files. + + Returns + ------- + lib_path: list(string) + List of all found library path to xgboost + """ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - dll_path = [curr_path] + dll_path = [curr_path, os.path.join(curr_path, '../../wrapper/')] if os.name == 'nt': if platform.architecture()[0] == '64bit': - dll_path.append(os.path.join(curr_path, '../windows/x64/Release/')) + dll_path.append(os.path.join(curr_path, '../../windows/x64/Release/')) else: - dll_path.append(os.path.join(curr_path, '../windows/Release/')) + dll_path.append(os.path.join(curr_path, '../../windows/Release/')) if os.name == 'nt': dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path] else: dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path] lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] - if len(dll_path) == 0: + if len(lib_path) == 0: raise XGBoostLibraryNotFound( - 'cannot find find the files in the candicate path ' + str(dll_path)) + 'Cannot find XGBoost Libarary in the candicate path %s,' + + 'Did you run build.sh in root oath?' 
% str(dll_path)) + return lib_path + +def _load_lib(): + """Load xgboost Library.""" + lib_path = find_lib_path() lib = ctypes.cdll.LoadLibrary(lib_path[0]) lib.XGBGetLastError.restype = ctypes.c_char_p return lib # load the XGBoost library globally -_LIB = load_xglib() +_LIB = _load_lib() def _check_call(ret): """Check the return value of C API call @@ -117,7 +115,11 @@ def c_array(ctype, values): class DMatrix(object): - """Data Matrix used in XGBoost.""" + """Data Matrix used in XGBoost. + + DMatrix is a internal data structure that used by XGBoost + which is optimized for both memory efficiency and training speed. + """ def __init__(self, data, label=None, missing=0.0, weight=None, silent=False): """ Data matrix used in XGBoost. @@ -400,11 +402,14 @@ class DMatrix(object): class Booster(object): - """"A Booster of of XGBoost.""" + """"A Booster of of XGBoost. + + Booster is the model of xgboost, that contains low level routines for + training, prediction and evaluation. + """ def __init__(self, params=None, cache=(), model_file=None): # pylint: disable=invalid-name - """ - Learner class. + """Initialize the Booster. Parameters ---------- @@ -735,570 +740,3 @@ class Booster(object): else: fmap[fid] += 1 return fmap - - -def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, - early_stopping_rounds=None, evals_result=None, verbose_eval=True): - # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init - """Train a booster with given parameters. - - Parameters - ---------- - params : dict - Booster params. - dtrain : DMatrix - Data to be trained. - num_boost_round: int - Number of boosting iterations. - watchlist (evals): list of pairs (DMatrix, string) - List of items to be evaluated during training, this allows user to watch - performance on the validation set. - obj : function - Customized objective function. - feval : function - Customized evaluation function. - early_stopping_rounds: int - Activates early stopping. Validation error needs to decrease at least - every round(s) to continue training. - Requires at least one item in evals. - If there's more than one, will use the last. - Returns the model from the last iteration (not the best one). - If early stopping occurs, the model will have two additional fields: - bst.best_score and bst.best_iteration. - evals_result: dict - This dictionary stores the evaluation results of all the items in watchlist - verbose_eval : bool - If `verbose_eval` then the evaluation metric on the validation set, if - given, is printed at each boosting stage. 
- - Returns - ------- - booster : a trained booster model - """ - evals = list(evals) - bst = Booster(params, [dtrain] + [d[0] for d in evals]) - - if evals_result is not None: - if not isinstance(evals_result, dict): - raise TypeError('evals_result has to be a dictionary') - else: - evals_name = [d[1] for d in evals] - evals_result.clear() - evals_result.update({key: [] for key in evals_name}) - - if not early_stopping_rounds: - for i in range(num_boost_round): - bst.update(dtrain, i, obj) - if len(evals) != 0: - bst_eval_set = bst.eval_set(evals, i, feval) - if isinstance(bst_eval_set, STRING_TYPES): - msg = bst_eval_set - else: - msg = bst_eval_set.decode() - - if verbose_eval: - sys.stderr.write(msg + '\n') - if evals_result is not None: - res = re.findall(":-?([0-9.]+).", msg) - for key, val in zip(evals_name, res): - evals_result[key].append(val) - return bst - - else: - # early stopping - if len(evals) < 1: - raise ValueError('For early stopping you need at least one set in evals.') - - sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\ - evals[-1][1], early_stopping_rounds)) - - # is params a list of tuples? are we using multiple eval metrics? - if isinstance(params, list): - if len(params) != len(dict(params).items()): - raise ValueError('Check your params.'\ - 'Early stopping works with single eval metric only.') - params = dict(params) - - # either minimize loss or maximize AUC/MAP/NDCG - maximize_score = False - if 'eval_metric' in params: - maximize_metrics = ('auc', 'map', 'ndcg') - if any(params['eval_metric'].startswith(x) for x in maximize_metrics): - maximize_score = True - - if maximize_score: - best_score = 0.0 - else: - best_score = float('inf') - - best_msg = '' - best_score_i = 0 - - for i in range(num_boost_round): - bst.update(dtrain, i, obj) - bst_eval_set = bst.eval_set(evals, i, feval) - - if isinstance(bst_eval_set, STRING_TYPES): - msg = bst_eval_set - else: - msg = bst_eval_set.decode() - - if verbose_eval: - sys.stderr.write(msg + '\n') - - if evals_result is not None: - res = re.findall(":-([0-9.]+).", msg) - for key, val in zip(evals_name, res): - evals_result[key].append(val) - - score = float(msg.rsplit(':', 1)[1]) - if (maximize_score and score > best_score) or \ - (not maximize_score and score < best_score): - best_score = score - best_score_i = i - best_msg = msg - elif i - best_score_i >= early_stopping_rounds: - sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg)) - bst.best_score = best_score - bst.best_iteration = best_score_i - break - bst.best_score = best_score - bst.best_iteration = best_score_i - return bst - - -class CVPack(object): - """"Auxiliary datastruct to hold one fold of CV.""" - def __init__(self, dtrain, dtest, param): - """"Initialize the CVPack""" - self.dtrain = dtrain - self.dtest = dtest - self.watchlist = [(dtrain, 'train'), (dtest, 'test')] - self.bst = Booster(param, [dtrain, dtest]) - - def update(self, iteration, fobj): - """"Update the boosters for one iteration""" - self.bst.update(self.dtrain, iteration, fobj) - - def eval(self, iteration, feval): - """"Evaluate the CVPack for one iteration.""" - return self.bst.eval_set(self.watchlist, iteration, feval) - - -def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None): - """ - Make an n-fold list of CVPack from random indices. 
- """ - evals = list(evals) - np.random.seed(seed) - randidx = np.random.permutation(dall.num_row()) - kstep = len(randidx) / nfold - idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)] - ret = [] - for k in range(nfold): - dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i])) - dtest = dall.slice(idset[k]) - # run preprocessing on the data set if needed - if fpreproc is not None: - dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy()) - else: - tparam = param - plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals] - ret.append(CVPack(dtrain, dtest, plst)) - return ret - - -def aggcv(rlist, show_stdv=True): - # pylint: disable=invalid-name - """ - Aggregate cross-validation results. - """ - cvmap = {} - ret = rlist[0].split()[0] - for line in rlist: - arr = line.split() - assert ret == arr[0] - for it in arr[1:]: - if not isinstance(it, STRING_TYPES): - it = it.decode() - k, v = it.split(':') - if k not in cvmap: - cvmap[k] = [] - cvmap[k].append(float(v)) - for k, v in sorted(cvmap.items(), key=lambda x: x[0]): - v = np.array(v) - if not isinstance(ret, STRING_TYPES): - ret = ret.decode() - if show_stdv: - ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v)) - else: - ret += '\tcv-%s:%f' % (k, np.mean(v)) - return ret - - -def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), - obj=None, feval=None, fpreproc=None, show_stdv=True, seed=0): - # pylint: disable = invalid-name - """Cross-validation with given paramaters. - - Parameters - ---------- - params : dict - Booster params. - dtrain : DMatrix - Data to be trained. - num_boost_round : int - Number of boosting iterations. - nfold : int - Number of folds in CV. - metrics : list of strings - Evaluation metrics to be watched in CV. - obj : function - Custom objective function. - feval : function - Custom evaluation function. - fpreproc : function - Preprocessing function that takes (dtrain, dtest, param) and returns - transformed versions of those. - show_stdv : bool - Whether to display the standard deviation. - seed : int - Seed used to generate the folds (passed to numpy.random.seed). - - Returns - ------- - evaluation history : list(string) - """ - results = [] - cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc) - for i in range(num_boost_round): - for fold in cvfolds: - fold.update(i, obj) - res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv) - sys.stderr.write(res + '\n') - results.append(res) - return results - - -# used for compatiblity without sklearn -XGBModelBase = object -XGBClassifierBase = object -XGBRegressorBase = object -if SKLEARN_INSTALLED: - XGBModelBase = BaseEstimator - XGBRegressorBase = RegressorMixin - XGBClassifierBase = ClassifierMixin - -class XGBModel(XGBModelBase): - # pylint: disable=too-many-arguments, too-many-instance-attributes, invalid-name - """Implementation of the Scikit-Learn API for XGBoost. - - Parameters - ---------- - max_depth : int - Maximum tree depth for base learners. - learning_rate : float - Boosting learning rate (xgb's "eta") - n_estimators : int - Number of boosted trees to fit. - silent : boolean - Whether to print messages while running boosting. - objective : string - Specify the learning task and the corresponding learning objective. - - nthread : int - Number of parallel threads used to run xgboost. - gamma : float - Minimum loss reduction required to make a further partition on a leaf node of the tree. 
- min_child_weight : int - Minimum sum of instance weight(hessian) needed in a child. - max_delta_step : int - Maximum delta step we allow each tree's weight estimation to be. - subsample : float - Subsample ratio of the training instance. - colsample_bytree : float - Subsample ratio of columns when constructing each tree. - - base_score: - The initial prediction score of all instances, global bias. - seed : int - Random number seed. - missing : float, optional - Value in the data which needs to be present as a missing value. If - None, defaults to np.nan. - """ - def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, - silent=True, objective="reg:linear", - nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, - subsample=1, colsample_bytree=1, - base_score=0.5, seed=0, missing=None): - if not SKLEARN_INSTALLED: - raise XGBoostError('sklearn needs to be installed in order to use this module') - self.max_depth = max_depth - self.learning_rate = learning_rate - self.n_estimators = n_estimators - self.silent = silent - self.objective = objective - - self.nthread = nthread - self.gamma = gamma - self.min_child_weight = min_child_weight - self.max_delta_step = max_delta_step - self.subsample = subsample - self.colsample_bytree = colsample_bytree - - self.base_score = base_score - self.seed = seed - self.missing = missing if missing is not None else np.nan - self._Booster = None - - def __setstate__(self, state): - # backward compatiblity code - # load booster from raw if it is raw - # the booster now support pickle - bst = state["_Booster"] - if bst is not None and not isinstance(bst, Booster): - state["_Booster"] = Booster(model_file=bst) - self.__dict__.update(state) - - def booster(self): - """Get the underlying xgboost Booster of this model. - - This will raise an exception when fit was not called - - Returns - ------- - booster : a xgboost booster of underlying model - """ - if self._Booster is None: - raise XGBoostError('need to call fit beforehand') - return self._Booster - - def get_params(self, deep=False): - """Get parameter.s""" - params = super(XGBModel, self).get_params(deep=deep) - if params['missing'] is np.nan: - params['missing'] = None # sklearn doesn't handle nan. see #4725 - if not params.get('eval_metric', True): - del params['eval_metric'] # don't give as None param to Booster - return params - - def get_xgb_params(self): - """Get xgboost type parameters.""" - xgb_params = self.get_params() - - xgb_params['silent'] = 1 if self.silent else 0 - - if self.nthread <= 0: - xgb_params.pop('nthread', None) - return xgb_params - - def fit(self, X, y, eval_set=None, eval_metric=None, - early_stopping_rounds=None, verbose=True): - # pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init - """ - Fit the gradient boosting model - - Parameters - ---------- - X : array_like - Feature matrix - y : array_like - Labels - eval_set : list, optional - A list of (X, y) tuple pairs to use as a validation set for - early-stopping - eval_metric : str, callable, optional - If a str, should be a built-in evaluation metric to use. See - doc/parameter.md. If callable, a custom evaluation metric. The call - signature is func(y_predicted, y_true) where y_true will be a - DMatrix object such that you may need to call the get_label - method. It must return a str, value pair where the str is a name - for the evaluation and value is the value of the evaluation - function. This objective is always minimized. - early_stopping_rounds : int - Activates early stopping. 
Validation error needs to decrease at - least every round(s) to continue training. - Requires at least one item in evals. If there's more than one, - will use the last. Returns the model from the last iteration - (not the best one). If early stopping occurs, the model will - have two additional fields: bst.best_score and bst.best_iteration. - verbose : bool - If `verbose` and an evaluation set is used, writes the evaluation - metric measured on the validation set to stderr. - """ - trainDmatrix = DMatrix(X, label=y, missing=self.missing) - - eval_results = {} - if eval_set is not None: - evals = list(DMatrix(x[0], label=x[1]) for x in eval_set) - evals = list(zip(evals, ["validation_{}".format(i) for i in - range(len(evals))])) - else: - evals = () - - params = self.get_xgb_params() - - feval = eval_metric if callable(eval_metric) else None - if eval_metric is not None: - if callable(eval_metric): - eval_metric = None - else: - params.update({'eval_metric': eval_metric}) - - self._Booster = train(params, trainDmatrix, - self.n_estimators, evals=evals, - early_stopping_rounds=early_stopping_rounds, - evals_result=eval_results, feval=feval, - verbose_eval=verbose) - if eval_results: - eval_results = {k: np.array(v, dtype=float) - for k, v in eval_results.items()} - eval_results = {k: np.array(v) for k, v in eval_results.items()} - self.eval_results = eval_results - - if early_stopping_rounds is not None: - self.best_score = self._Booster.best_score - self.best_iteration = self._Booster.best_iteration - return self - - def predict(self, data): - # pylint: disable=missing-docstring,invalid-name - test_dmatrix = DMatrix(data, missing=self.missing) - return self.booster().predict(test_dmatrix) - - -class XGBClassifier(XGBModel, XGBClassifierBase): - # pylint: disable=missing-docstring,too-many-arguments,invalid-name - __doc__ = """ - Implementation of the scikit-learn API for XGBoost classification - """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) - - def __init__(self, max_depth=3, learning_rate=0.1, - n_estimators=100, silent=True, - objective="binary:logistic", - nthread=-1, gamma=0, min_child_weight=1, - max_delta_step=0, subsample=1, colsample_bytree=1, - base_score=0.5, seed=0, missing=None): - super(XGBClassifier, self).__init__(max_depth, learning_rate, - n_estimators, silent, objective, - nthread, gamma, min_child_weight, - max_delta_step, subsample, - colsample_bytree, - base_score, seed, missing) - - def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None, - early_stopping_rounds=None, verbose=True): - # pylint: disable = attribute-defined-outside-init,arguments-differ - """ - Fit gradient boosting classifier - - Parameters - ---------- - X : array_like - Feature matrix - y : array_like - Labels - sample_weight : array_like - Weight for each instance - eval_set : list, optional - A list of (X, y) pairs to use as a validation set for - early-stopping - eval_metric : str, callable, optional - If a str, should be a built-in evaluation metric to use. See - doc/parameter.md. If callable, a custom evaluation metric. The call - signature is func(y_predicted, y_true) where y_true will be a - DMatrix object such that you may need to call the get_label - method. It must return a str, value pair where the str is a name - for the evaluation and value is the value of the evaluation - function. This objective is always minimized. - early_stopping_rounds : int, optional - Activates early stopping. Validation error needs to decrease at - least every round(s) to continue training. 
- Requires at least one item in evals. If there's more than one, - will use the last. Returns the model from the last iteration - (not the best one). If early stopping occurs, the model will - have two additional fields: bst.best_score and bst.best_iteration. - verbose : bool - If `verbose` and an evaluation set is used, writes the evaluation - metric measured on the validation set to stderr. - """ - eval_results = {} - self.classes_ = list(np.unique(y)) - self.n_classes_ = len(self.classes_) - if self.n_classes_ > 2: - # Switch to using a multiclass objective in the underlying XGB instance - self.objective = "multi:softprob" - xgb_options = self.get_xgb_params() - xgb_options['num_class'] = self.n_classes_ - else: - xgb_options = self.get_xgb_params() - - feval = eval_metric if callable(eval_metric) else None - if eval_metric is not None: - if callable(eval_metric): - eval_metric = None - else: - xgb_options.update({"eval_metric": eval_metric}) - - if eval_set is not None: - # TODO: use sample_weight if given? - evals = list(DMatrix(x[0], label=x[1]) for x in eval_set) - nevals = len(evals) - eval_names = ["validation_{}".format(i) for i in range(nevals)] - evals = list(zip(evals, eval_names)) - else: - evals = () - - self._le = LabelEncoder().fit(y) - training_labels = self._le.transform(y) - - if sample_weight is not None: - train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight, - missing=self.missing) - else: - train_dmatrix = DMatrix(X, label=training_labels, - missing=self.missing) - - self._Booster = train(xgb_options, train_dmatrix, self.n_estimators, - evals=evals, - early_stopping_rounds=early_stopping_rounds, - evals_result=eval_results, feval=feval, - verbose_eval=verbose) - - if eval_results: - eval_results = {k: np.array(v, dtype=float) - for k, v in eval_results.items()} - self.eval_results = eval_results - - if early_stopping_rounds is not None: - self.best_score = self._Booster.best_score - self.best_iteration = self._Booster.best_iteration - - return self - - def predict(self, data): - test_dmatrix = DMatrix(data, missing=self.missing) - class_probs = self.booster().predict(test_dmatrix) - if len(class_probs.shape) > 1: - column_indexes = np.argmax(class_probs, axis=1) - else: - column_indexes = np.repeat(0, data.shape[0]) - column_indexes[class_probs > 0.5] = 1 - return self._le.inverse_transform(column_indexes) - - def predict_proba(self, data): - test_dmatrix = DMatrix(data, missing=self.missing) - class_probs = self.booster().predict(test_dmatrix) - if self.objective == "multi:softprob": - return class_probs - else: - classone_probs = class_probs - classzero_probs = 1.0 - classone_probs - return np.vstack((classzero_probs, classone_probs)).transpose() - -class XGBRegressor(XGBModel, XGBRegressorBase): - # pylint: disable=missing-docstring - __doc__ = """ - Implementation of the scikit-learn API for XGBoost regression - """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) diff --git a/windows/README.md b/windows/README.md index cb1cc9dd9..564c97d25 100644 --- a/windows/README.md +++ b/windows/README.md @@ -11,7 +11,7 @@ This should give you xgboost.exe for CLI version and xgboost_wrapper.dll for pyt Use Python Module ===== -* After you build the dll, you can install the Python package from the [../wrapper](../wrapper) folder +* After you build the dll, you can install the Python package from the [../python-package](../python-package) folder ``` python setup.py install diff --git a/wrapper/README.md b/wrapper/README.md index c5368bd7d..77316e15c 100644 --- 
a/wrapper/README.md
+++ b/wrapper/README.md
@@ -1,20 +1,9 @@
-Wrapper of XGBoost
-=====
-This folder provides wrapper of xgboost to other languages
+XGBoost Wrappers
+================
+This folder provides wrappers to create xgboost packages for other languages.
 
-Python
-=====
-* To make the python module, type ```./build.sh``` in the root directory of project
-* Make sure you have [setuptools](https://pypi.python.org/pypi/setuptools)
-* Install with `python setup.py install` from this directory.
-* Refer also to the walk through example in [demo folder](../demo/guide-python)
-* **NOTE**: if you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the sklearn_parallel.py demo.
-
-
-R
-=====
-* See [R-package](../R-package)
-
-Julia
-=====
-* See [XGBoost.jl](https://github.com/antinucleon/XGBoost.jl)
+***Supported Language Packages***
+* [Python package](../python-package)
+* [R-package](../R-package)
+* [Java Package](../java)
+* [Julia Package](https://github.com/antinucleon/XGBoost.jl)
diff --git a/wrapper/__init__.py b/wrapper/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/wrapper/setup.py b/wrapper/setup.py
deleted file mode 100644
index 5365d61b0..000000000
--- a/wrapper/setup.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# pylint: disable=invalid-name
-"""Setup xgboost package."""
-import os
-import platform
-from setuptools import setup
-
-
-class XGBoostLibraryNotFound(Exception):
-    """Exception to raise when xgboost library cannot be found."""
-    pass
-
-
-curr_dir = os.path.dirname(os.path.abspath(__file__))
-dll_path = [curr_dir]
-
-if os.name == 'nt':
-    if platform.architecture()[0] == '64bit':
-        dll_path.append(os.path.join(curr_dir, '../windows/x64/Release/'))
-    else:
-        dll_path.append(os.path.join(curr_dir, '../windows/Release/'))
-
-
-if os.name == 'nt':
-    dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path]
-else:
-    dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path]
-
-lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
-
-if len(lib_path) == 0:
-    raise XGBoostLibraryNotFound("XGBoost library not found. 
Did you run "
-                                 "../make?")
-setup(name="xgboost",
-      version="0.40",
-      description="Python wrappers for XGBoost: eXtreme Gradient Boosting",
-      zip_safe=False,
-      py_modules=['xgboost'],
-      data_files=[('.', [lib_path[0]])],
-      url="https://github.com/dmlc/xgboost")

From 60217a2c02dc80376f1bc6084b10d003b2aeef1f Mon Sep 17 00:00:00 2001
From: tqchen
Date: Thu, 30 Jul 2015 22:08:48 -0700
Subject: [PATCH 66/83] checkin all python

---
 .gitignore                         |   7 +-
 python-package/xgboost/__init__.py |  12 +
 python-package/xgboost/sklearn.py  | 341 +++++++++++++++++++++++++++++
 python-package/xgboost/training.py | 252 +++++++++++++++++++++
 4 files changed, 608 insertions(+), 4 deletions(-)
 create mode 100644 python-package/xgboost/__init__.py
 create mode 100644 python-package/xgboost/sklearn.py
 create mode 100644 python-package/xgboost/training.py

diff --git a/.gitignore b/.gitignore
index 73ae6748e..048803abd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,10 +48,9 @@ Debug
 *.cpage.col
 *.cpage
 *.Rproj
-xgboost
-xgboost.mpi
-xgboost.mock
-train*
+./xgboost
+./xgboost.mpi
+./xgboost.mock
 rabit
 #.Rbuildignore
 R-package.Rproj
diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py
new file mode 100644
index 000000000..6f967b837
--- /dev/null
+++ b/python-package/xgboost/__init__.py
@@ -0,0 +1,12 @@
+# coding: utf-8
+"""XGBoost: eXtreme Gradient Boosting library.
+
+Contributors: https://github.com/dmlc/xgboost/blob/master/CONTRIBUTORS.md
+"""
+
+from __future__ import absolute_import
+from .core import DMatrix, Booster
+from .training import train, cv
+from .sklearn import XGBModel, XGBClassifier, XGBRegressor
+
+__version__ = '0.4'
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
new file mode 100644
index 000000000..4a5771724
--- /dev/null
+++ b/python-package/xgboost/sklearn.py
@@ -0,0 +1,341 @@
+# coding: utf-8
+# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
+"""Scikit-Learn Wrapper interface for XGBoost."""
+from __future__ import absolute_import
+
+import numpy as np
+from .core import Booster, DMatrix, XGBoostError
+from .training import train
+
+try:
+    from sklearn.base import BaseEstimator
+    from sklearn.base import RegressorMixin, ClassifierMixin
+    from sklearn.preprocessing import LabelEncoder
+    SKLEARN_INSTALLED = True
+except ImportError:
+    SKLEARN_INSTALLED = False
+
+# used for compatibility without sklearn
+XGBModelBase = object
+XGBClassifierBase = object
+XGBRegressorBase = object
+
+if SKLEARN_INSTALLED:
+    XGBModelBase = BaseEstimator
+    XGBRegressorBase = RegressorMixin
+    XGBClassifierBase = ClassifierMixin
+
+class XGBModel(XGBModelBase):
+    # pylint: disable=too-many-arguments, too-many-instance-attributes, invalid-name
+    """Implementation of the Scikit-Learn API for XGBoost.
+
+    Parameters
+    ----------
+    max_depth : int
+        Maximum tree depth for base learners.
+    learning_rate : float
+        Boosting learning rate (xgb's "eta")
+    n_estimators : int
+        Number of boosted trees to fit.
+    silent : boolean
+        Whether to print messages while running boosting.
+    objective : string
+        Specify the learning task and the corresponding learning objective.
+
+    nthread : int
+        Number of parallel threads used to run xgboost.
+    gamma : float
+        Minimum loss reduction required to make a further partition on a leaf node of the tree.
+    min_child_weight : int
+        Minimum sum of instance weight(hessian) needed in a child.
+    max_delta_step : int
+        Maximum delta step we allow each tree's weight estimation to be. 
+    subsample : float
+        Subsample ratio of the training instance.
+    colsample_bytree : float
+        Subsample ratio of columns when constructing each tree.
+
+    base_score:
+        The initial prediction score of all instances, global bias.
+    seed : int
+        Random number seed.
+    missing : float, optional
+        Value in the data which needs to be treated as a missing value. If
+        None, defaults to np.nan.
+    """
+    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
+                 silent=True, objective="reg:linear",
+                 nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0,
+                 subsample=1, colsample_bytree=1,
+                 base_score=0.5, seed=0, missing=None):
+        if not SKLEARN_INSTALLED:
+            raise XGBoostError('sklearn needs to be installed in order to use this module')
+        self.max_depth = max_depth
+        self.learning_rate = learning_rate
+        self.n_estimators = n_estimators
+        self.silent = silent
+        self.objective = objective
+
+        self.nthread = nthread
+        self.gamma = gamma
+        self.min_child_weight = min_child_weight
+        self.max_delta_step = max_delta_step
+        self.subsample = subsample
+        self.colsample_bytree = colsample_bytree
+
+        self.base_score = base_score
+        self.seed = seed
+        self.missing = missing if missing is not None else np.nan
+        self._Booster = None
+
+    def __setstate__(self, state):
+        # backward compatibility code
+        # load booster from raw if it is raw
+        # the booster now supports pickle
+        bst = state["_Booster"]
+        if bst is not None and not isinstance(bst, Booster):
+            state["_Booster"] = Booster(model_file=bst)
+        self.__dict__.update(state)
+
+    def booster(self):
+        """Get the underlying xgboost Booster of this model.
+
+        This will raise an exception if fit has not been called.
+
+        Returns
+        -------
+        booster : a xgboost booster of underlying model
+        """
+        if self._Booster is None:
+            raise XGBoostError('need to call fit beforehand')
+        return self._Booster
+
+    def get_params(self, deep=False):
+        """Get parameters."""
+        params = super(XGBModel, self).get_params(deep=deep)
+        if params['missing'] is np.nan:
+            params['missing'] = None # sklearn doesn't handle nan. see #4725
+        if not params.get('eval_metric', True):
+            del params['eval_metric'] # don't give as None param to Booster
+        return params
+
+    def get_xgb_params(self):
+        """Get xgboost type parameters."""
+        xgb_params = self.get_params()
+
+        xgb_params['silent'] = 1 if self.silent else 0
+
+        if self.nthread <= 0:
+            xgb_params.pop('nthread', None)
+        return xgb_params
+
+    def fit(self, X, y, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, verbose=True):
+        # pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init
+        """
+        Fit the gradient boosting model
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        eval_set : list, optional
+            A list of (X, y) tuple pairs to use as a validation set for
+            early-stopping
+        eval_metric : str, callable, optional
+            If a str, should be a built-in evaluation metric to use. See
+            doc/parameter.md. If callable, a custom evaluation metric. The call
+            signature is func(y_predicted, y_true) where y_true will be a
+            DMatrix object such that you may need to call the get_label
+            method. It must return a str, value pair where the str is a name
+            for the evaluation and value is the value of the evaluation
+            function. This metric is always minimized.
+        early_stopping_rounds : int
+            Activates early stopping. Validation error needs to decrease at
+            least every `early_stopping_rounds` round(s) to continue training.
+            Requires at least one item in evals. If there's more than one,
+            will use the last. 
Returns the model from the last iteration
+            (not the best one). If early stopping occurs, the model will
+            have two additional fields: bst.best_score and bst.best_iteration.
+        verbose : bool
+            If `verbose` and an evaluation set is used, writes the evaluation
+            metric measured on the validation set to stderr.
+        """
+        train_dmatrix = DMatrix(X, label=y, missing=self.missing)
+
+        eval_results = {}
+        if eval_set is not None:
+            evals = list(DMatrix(x[0], label=x[1]) for x in eval_set)
+            evals = list(zip(evals, ["validation_{}".format(i) for i in
+                                     range(len(evals))]))
+        else:
+            evals = ()
+
+        params = self.get_xgb_params()
+
+        feval = eval_metric if callable(eval_metric) else None
+        if eval_metric is not None:
+            if callable(eval_metric):
+                eval_metric = None
+            else:
+                params.update({'eval_metric': eval_metric})
+
+        self._Booster = train(params, train_dmatrix,
+                              self.n_estimators, evals=evals,
+                              early_stopping_rounds=early_stopping_rounds,
+                              evals_result=eval_results, feval=feval,
+                              verbose_eval=verbose)
+        if eval_results:
+            eval_results = {k: np.array(v, dtype=float)
+                            for k, v in eval_results.items()}
+            self.eval_results = eval_results
+
+        if early_stopping_rounds is not None:
+            self.best_score = self._Booster.best_score
+            self.best_iteration = self._Booster.best_iteration
+        return self
+
+    def predict(self, data):
+        # pylint: disable=missing-docstring,invalid-name
+        test_dmatrix = DMatrix(data, missing=self.missing)
+        return self.booster().predict(test_dmatrix)
+
+
+class XGBClassifier(XGBModel, XGBClassifierBase):
+    # pylint: disable=missing-docstring,too-many-arguments,invalid-name
+    __doc__ = """
+    Implementation of the scikit-learn API for XGBoost classification
+    """ + "\n".join(XGBModel.__doc__.split('\n')[2:])
+
+    def __init__(self, max_depth=3, learning_rate=0.1,
+                 n_estimators=100, silent=True,
+                 objective="binary:logistic",
+                 nthread=-1, gamma=0, min_child_weight=1,
+                 max_delta_step=0, subsample=1, colsample_bytree=1,
+                 base_score=0.5, seed=0, missing=None):
+        super(XGBClassifier, self).__init__(max_depth, learning_rate,
+                                            n_estimators, silent, objective,
+                                            nthread, gamma, min_child_weight,
+                                            max_delta_step, subsample,
+                                            colsample_bytree,
+                                            base_score, seed, missing)
+
+    def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, verbose=True):
+        # pylint: disable = attribute-defined-outside-init,arguments-differ
+        """
+        Fit gradient boosting classifier
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        sample_weight : array_like
+            Weight for each instance
+        eval_set : list, optional
+            A list of (X, y) pairs to use as a validation set for
+            early-stopping
+        eval_metric : str, callable, optional
+            If a str, should be a built-in evaluation metric to use. See
+            doc/parameter.md. If callable, a custom evaluation metric. The call
+            signature is func(y_predicted, y_true) where y_true will be a
+            DMatrix object such that you may need to call the get_label
+            method. It must return a str, value pair where the str is a name
+            for the evaluation and value is the value of the evaluation
+            function. This metric is always minimized.
+        early_stopping_rounds : int, optional
+            Activates early stopping. Validation error needs to decrease at
+            least every `early_stopping_rounds` round(s) to continue training.
+            Requires at least one item in evals. If there's more than one,
+            will use the last. Returns the model from the last iteration
+            (not the best one). 
If early stopping occurs, the model will + have two additional fields: bst.best_score and bst.best_iteration. + verbose : bool + If `verbose` and an evaluation set is used, writes the evaluation + metric measured on the validation set to stderr. + """ + eval_results = {} + self.classes_ = list(np.unique(y)) + self.n_classes_ = len(self.classes_) + if self.n_classes_ > 2: + # Switch to using a multiclass objective in the underlying XGB instance + self.objective = "multi:softprob" + xgb_options = self.get_xgb_params() + xgb_options['num_class'] = self.n_classes_ + else: + xgb_options = self.get_xgb_params() + + feval = eval_metric if callable(eval_metric) else None + if eval_metric is not None: + if callable(eval_metric): + eval_metric = None + else: + xgb_options.update({"eval_metric": eval_metric}) + + if eval_set is not None: + # TODO: use sample_weight if given? + evals = list(DMatrix(x[0], label=x[1]) for x in eval_set) + nevals = len(evals) + eval_names = ["validation_{}".format(i) for i in range(nevals)] + evals = list(zip(evals, eval_names)) + else: + evals = () + + self._le = LabelEncoder().fit(y) + training_labels = self._le.transform(y) + + if sample_weight is not None: + train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight, + missing=self.missing) + else: + train_dmatrix = DMatrix(X, label=training_labels, + missing=self.missing) + + self._Booster = train(xgb_options, train_dmatrix, self.n_estimators, + evals=evals, + early_stopping_rounds=early_stopping_rounds, + evals_result=eval_results, feval=feval, + verbose_eval=verbose) + + if eval_results: + eval_results = {k: np.array(v, dtype=float) + for k, v in eval_results.items()} + self.eval_results = eval_results + + if early_stopping_rounds is not None: + self.best_score = self._Booster.best_score + self.best_iteration = self._Booster.best_iteration + + return self + + def predict(self, data): + test_dmatrix = DMatrix(data, missing=self.missing) + class_probs = self.booster().predict(test_dmatrix) + if len(class_probs.shape) > 1: + column_indexes = np.argmax(class_probs, axis=1) + else: + column_indexes = np.repeat(0, data.shape[0]) + column_indexes[class_probs > 0.5] = 1 + return self._le.inverse_transform(column_indexes) + + def predict_proba(self, data): + test_dmatrix = DMatrix(data, missing=self.missing) + class_probs = self.booster().predict(test_dmatrix) + if self.objective == "multi:softprob": + return class_probs + else: + classone_probs = class_probs + classzero_probs = 1.0 - classone_probs + return np.vstack((classzero_probs, classone_probs)).transpose() + +class XGBRegressor(XGBModel, XGBRegressorBase): + # pylint: disable=missing-docstring + __doc__ = """ + Implementation of the scikit-learn API for XGBoost regression + """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) + diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py new file mode 100644 index 000000000..1f2d722ac --- /dev/null +++ b/python-package/xgboost/training.py @@ -0,0 +1,252 @@ +# coding: utf-8 +# pylint: disable=too-many-locals, too-many-arguments, invalid-name +"""Training Library containing training routines.""" +from __future__ import absolute_import + +import sys +import re +import numpy as np +from .core import Booster, STRING_TYPES + +def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, + early_stopping_rounds=None, evals_result=None, verbose_eval=True): + # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init + """Train a booster with 
given parameters.
+
+    Parameters
+    ----------
+    params : dict
+        Booster params.
+    dtrain : DMatrix
+        Data to be trained.
+    num_boost_round: int
+        Number of boosting iterations.
+    evals: list of pairs (DMatrix, string)
+        List of items to be evaluated during training, this allows the user to watch
+        performance on the validation set.
+    obj : function
+        Customized objective function.
+    feval : function
+        Customized evaluation function.
+    early_stopping_rounds: int
+        Activates early stopping. Validation error needs to decrease at least
+        every `early_stopping_rounds` round(s) to continue training.
+        Requires at least one item in evals.
+        If there's more than one, will use the last.
+        Returns the model from the last iteration (not the best one).
+        If early stopping occurs, the model will have two additional fields:
+        bst.best_score and bst.best_iteration.
+    evals_result: dict
+        This dictionary stores the evaluation results of all the items in evals.
+    verbose_eval : bool
+        If `verbose_eval` then the evaluation metric on the validation set, if
+        given, is printed at each boosting stage.
+
+    Returns
+    -------
+    booster : a trained booster model
+    """
+    evals = list(evals)
+    bst = Booster(params, [dtrain] + [d[0] for d in evals])
+
+    if evals_result is not None:
+        if not isinstance(evals_result, dict):
+            raise TypeError('evals_result has to be a dictionary')
+        else:
+            evals_name = [d[1] for d in evals]
+            evals_result.clear()
+            evals_result.update({key: [] for key in evals_name})
+
+    if not early_stopping_rounds:
+        for i in range(num_boost_round):
+            bst.update(dtrain, i, obj)
+            if len(evals) != 0:
+                bst_eval_set = bst.eval_set(evals, i, feval)
+                if isinstance(bst_eval_set, STRING_TYPES):
+                    msg = bst_eval_set
+                else:
+                    msg = bst_eval_set.decode()
+
+                if verbose_eval:
+                    sys.stderr.write(msg + '\n')
+                if evals_result is not None:
+                    res = re.findall(":-?([0-9.]+).", msg)
+                    for key, val in zip(evals_name, res):
+                        evals_result[key].append(val)
+        return bst
+
+    else:
+        # early stopping
+        if len(evals) < 1:
+            raise ValueError('For early stopping you need at least one set in evals.')
+
+        sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\
+                evals[-1][1], early_stopping_rounds))
+
+        # is params a list of tuples? are we using multiple eval metrics?
+        if isinstance(params, list):
+            if len(params) != len(dict(params).items()):
+                raise ValueError('Check your params. '\
+                                 'Early stopping works with single eval metric only.')
+            params = dict(params)
+
+        # either minimize loss or maximize AUC/MAP/NDCG
+        maximize_score = False
+        if 'eval_metric' in params:
+            maximize_metrics = ('auc', 'map', 'ndcg')
+            if any(params['eval_metric'].startswith(x) for x in maximize_metrics):
+                maximize_score = True
+
+        if maximize_score:
+            best_score = 0.0
+        else:
+            best_score = float('inf')
+
+        best_msg = ''
+        best_score_i = 0
+
+        for i in range(num_boost_round):
+            bst.update(dtrain, i, obj)
+            bst_eval_set = bst.eval_set(evals, i, feval)
+
+            if isinstance(bst_eval_set, STRING_TYPES):
+                msg = bst_eval_set
+            else:
+                msg = bst_eval_set.decode()
+
+            if verbose_eval:
+                sys.stderr.write(msg + '\n')
+
+            if evals_result is not None:
+                res = re.findall(":-?([0-9.]+).", msg)
+                for key, val in zip(evals_name, res):
+                    evals_result[key].append(val)
+
+            score = float(msg.rsplit(':', 1)[1])
+            if (maximize_score and score > best_score) or \
+                (not maximize_score and score < best_score):
+                best_score = score
+                best_score_i = i
+                best_msg = msg
+            elif i - best_score_i >= early_stopping_rounds:
+                sys.stderr.write("Stopping. 
Best iteration:\n{}\n\n".format(best_msg))
+                bst.best_score = best_score
+                bst.best_iteration = best_score_i
+                break
+        bst.best_score = best_score
+        bst.best_iteration = best_score_i
+        return bst
+
+
+class CVPack(object):
+    """Auxiliary data structure to hold one fold of CV."""
+    def __init__(self, dtrain, dtest, param):
+        """Initialize the CVPack"""
+        self.dtrain = dtrain
+        self.dtest = dtest
+        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
+        self.bst = Booster(param, [dtrain, dtest])
+
+    def update(self, iteration, fobj):
+        """Update the boosters for one iteration"""
+        self.bst.update(self.dtrain, iteration, fobj)
+
+    def eval(self, iteration, feval):
+        """Evaluate the CVPack for one iteration."""
+        return self.bst.eval_set(self.watchlist, iteration, feval)
+
+
+def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None):
+    """
+    Make an n-fold list of CVPack from random indices.
+    """
+    evals = list(evals)
+    np.random.seed(seed)
+    randidx = np.random.permutation(dall.num_row())
+    # integer division: the slice indices below must stay ints, also under Python 3
+    kstep = len(randidx) // nfold
+    idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
+    ret = []
+    for k in range(nfold):
+        dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i]))
+        dtest = dall.slice(idset[k])
+        # run preprocessing on the data set if needed
+        if fpreproc is not None:
+            dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
+        else:
+            tparam = param
+        plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals]
+        ret.append(CVPack(dtrain, dtest, plst))
+    return ret
+
+
+def aggcv(rlist, show_stdv=True):
+    # pylint: disable=invalid-name
+    """
+    Aggregate cross-validation results.
+    """
+    cvmap = {}
+    ret = rlist[0].split()[0]
+    for line in rlist:
+        arr = line.split()
+        assert ret == arr[0]
+        for it in arr[1:]:
+            if not isinstance(it, STRING_TYPES):
+                it = it.decode()
+            k, v = it.split(':')
+            if k not in cvmap:
+                cvmap[k] = []
+            cvmap[k].append(float(v))
+    for k, v in sorted(cvmap.items(), key=lambda x: x[0]):
+        v = np.array(v)
+        if not isinstance(ret, STRING_TYPES):
+            ret = ret.decode()
+        if show_stdv:
+            ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
+        else:
+            ret += '\tcv-%s:%f' % (k, np.mean(v))
+    return ret
+
+
+def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
+       obj=None, feval=None, fpreproc=None, show_stdv=True, seed=0):
+    # pylint: disable = invalid-name
+    """Cross-validation with given parameters.
+
+    Parameters
+    ----------
+    params : dict
+        Booster params.
+    dtrain : DMatrix
+        Data to be trained.
+    num_boost_round : int
+        Number of boosting iterations.
+    nfold : int
+        Number of folds in CV.
+    metrics : list of strings
+        Evaluation metrics to be watched in CV.
+    obj : function
+        Custom objective function.
+    feval : function
+        Custom evaluation function.
+    fpreproc : function
+        Preprocessing function that takes (dtrain, dtest, param) and returns
+        transformed versions of those.
+    show_stdv : bool
+        Whether to display the standard deviation.
+    seed : int
+        Seed used to generate the folds (passed to numpy.random.seed). 
+
+    Returns
+    -------
+    evaluation history : list(string)
+    """
+    results = []
+    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
+    for i in range(num_boost_round):
+        for fold in cvfolds:
+            fold.update(i, obj)
+        res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv)
+        sys.stderr.write(res + '\n')
+        results.append(res)
+    return results
+
From 362fe4e4fa2981191e6797be5dcfd6a22be956cd Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Thu, 30 Jul 2015 22:11:27 -0700
Subject: [PATCH 67/83] Update .travis.yml

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 6f2cba90f..10f217038 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -37,7 +37,7 @@ before_install:
   - scripts/travis_osx_install.sh
   - git clone https://github.com/dmlc/dmlc-core
   - export TRAVIS=dmlc-core/scripts/travis/
-  - export PYTHONPATH=${PYTHONPATH}:${PWD}/wrapper
+  - export PYTHONPATH=${PYTHONPATH}:${PWD}/python-package
   - source ${TRAVIS}/travis_setup_env.sh

 install:
From 2a01c5c86527070f8f235823166ef5faf2ce0cc3 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Thu, 30 Jul 2015 22:26:10 -0700
Subject: [PATCH 68/83] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 5ed3d266c..71f48a166 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -19,13 +19,19 @@ List of Contributors
 * [Full List of Contributors](https://github.com/dmlc/xgboost/graphs/contributors)
   - To contributors: please add your name to the list when you submit a patch to the project:)
 * [Kailong Chen](https://github.com/kalenhaha)
+  - Kailong is an early contributor of xgboost; he is the creator of the ranking objectives in xgboost.
 * [Skipper Seabold](https://github.com/jseabold)
+  - Skipper is the major contributor to the scikit-learn module of xgboost.
 * [Zygmunt Zając](https://github.com/zygmuntz)
+  - Zygmunt is the master behind the early stopping feature frequently used by Kagglers.
 * [Ajinkya Kale](https://github.com/ajkl)
 * [Boliang Chen](https://github.com/cblsjtu)
 * [Vadim Khotilovich](https://github.com/khotilov)
 * [Yangqing Men](https://github.com/yanqingmen)
+  - Yangqing is the creator of the xgboost Java package.
 * [Engpeng Yao](https://github.com/yepyao)
 * [Giulio](https://github.com/giuliohome)
+  - Giulio is the creator of the Windows project of xgboost.
 * [Jamie Hall](https://github.com/nerdcha)
+  - Jamie is the initial creator of the xgboost sklearn module.
 * [Yen-Ying Lee](https://github.com/white1033)

From 3a091fa30265e51f4e1b6e72960b78bbe7d5e597 Mon Sep 17 00:00:00 2001
From: hetong007
Date: Fri, 31 Jul 2015 21:33:54 +0000
Subject: [PATCH 69/83] modify desc

---
 R-package/DESCRIPTION | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 6f784fbb3..4560971e2 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,16 +1,16 @@
 Package: xgboost
 Type: Package
-Title: eXtreme Gradient Boosting
-Version: 0.4-0
+Title: Extreme Gradient Boosting
+Version: 0.4-1
 Date: 2015-05-11
 Author: Tianqi Chen , Tong He , Michael Benesty
 Maintainer: Tong He
-Description: eXtreme Gradient Boosting, which is an
-  efficient and scalable implementation of gradient boosting framework.
-  This package is an R wrapper of xgboost. The package includes efficient
+Description: Extreme Gradient Boosting, which is an
+  efficient implementation of the gradient boosting framework.
+  This package is its R interface. 
The package includes efficient
   linear model solver and tree learning algorithms. The package can automatically
-  do parallel computation with OpenMP, and it can be more than 10 times faster
-  than existing gradient boosting packages such as gbm. It supports various
+  do parallel computation on a single machine, which could be more than 10 times faster
+  than existing gradient boosting packages. It supports various
   objective functions, including regression, classification and ranking. The
   package is made to be extensible, so that users are also allowed to define
   their own objectives easily.
From 8083c30e7b7dadf4e19387d18573abe3b70ffe12 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sat, 1 Aug 2015 09:18:34 -0700
Subject: [PATCH 70/83] quick fix of Solaris problem in CRAN check

---
 CHANGES.md                  | 18 ++++++++++--------
 R-package/README.md         |  4 ++--
 R-package/src/xgboost_R.cpp |  2 +-
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 0be001744..d9c8786c0 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,18 +1,18 @@
 Change Log
-=====
+==========

 xgboost-0.1
-=====
+===========
 * Initial release

 xgboost-0.2x
-=====
+============
 * Python module
 * Weighted samples instances
 * Initial version of pairwise rank

 xgboost-0.3
-=====
+===========
 * Faster tree construction module
   - Allows subsample columns during tree construction via ```bst:col_samplebytree=ratio```
 * Support for boosting from initial predictions
 * Experimental version of LambdaRank
 * Linear booster is now parallelized, using parallel coordinated descent.
 * Add [Code Guide](src/README.md) for customizing objective function and evaluation
 * Add R module

 xgboost-0.4
-=====
+===========
 * Distributed version of xgboost that runs on YARN, scales to billions of examples
 * Direct save/load data and model from/to S3 and HDFS
 * Feature importance visualization in R module, by Michael Benesty
 * sklearn wrapper is supported in python module
 * Experimental External memory version

-on going version
-=====
+Ongoing at master
+==================
+* Fix List
+  - Fixed a possible problem with Poisson regression for R.
 * Python module now throws an exception instead of crashing the terminal when a parameter error happens.
 * Java API is ready for use
 * Added more test cases and continuous integration to make each build more robust
-* Improvements in sklearn compatible module
+* Improvements in sklearn compatible module
\ No newline at end of file
diff --git a/R-package/README.md b/R-package/README.md
index 81dabb31c..96113c391 100644
--- a/R-package/README.md
+++ b/R-package/README.md
@@ -24,10 +24,10 @@ If you face an issue installing the package using ```devtools::install_github```
 ```
 devtools::install_github('dmlc/xgboost',subdir='R-package')
 Downloading github repo dmlc/xgboost@master
-Error in function (type, msg, asError = TRUE)  : 
+Error in function (type, msg, asError = TRUE)  :
   Peer certificate cannot be authenticated with given CA certificates
 ```
-To get around this you can build the package locally as mentioned [here](https://github.com/dmlc/xgboost/issues/347) - 
+To get around this you can build the package locally as mentioned [here](https://github.com/dmlc/xgboost/issues/347) -
 ```
 1. Clone the current repository and set your workspace to xgboost/R-package/
 2. Run R CMD INSTALL --build . in terminal to get the tarball. 
diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index a8084b206..37a30c797 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -32,7 +32,7 @@ extern "C" { bool CheckNAN(double v) { return ISNAN(v); } -bool LogGamma(double v) { +double LogGamma(double v) { return lgammafn(v); } } // namespace utils From c43fee541da8a15c3871c1963a2927ab4dca3e07 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 1 Aug 2015 11:27:13 -0700 Subject: [PATCH 71/83] enable basic sphinx doc --- README.md | 13 +- doc/.gitignore | 7 + doc/Makefile | 192 ++++++++++++++++++++++ doc/conf.py | 158 ++++++++++++++++++ doc/external_memory.md | 10 +- doc/{README.md => index.md} | 26 ++- doc/input_format.md | 9 +- doc/parameter.md | 2 +- doc/python/python_api.rst | 36 ++++ doc/{python.md => python/python_intro.md} | 3 +- doc/sphinx_util.py | 50 ++++++ python-package/xgboost/__init__.py | 4 + python-package/xgboost/core.py | 63 ++++--- python-package/xgboost/sklearn.py | 12 +- 14 files changed, 529 insertions(+), 56 deletions(-) create mode 100644 doc/.gitignore create mode 100644 doc/Makefile create mode 100644 doc/conf.py rename doc/{README.md => index.md} (87%) create mode 100644 doc/python/python_api.rst rename doc/{python.md => python/python_intro.md} (98%) create mode 100644 doc/sphinx_util.py diff --git a/README.md b/README.md index 0f6ffc7fa..be93e99fd 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ DMLC/XGBoost [![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost) [![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. +An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework, including [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLM) and [Gradient Boosted Decision Trees](https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting) (GBDT). XGBoost can also be [distributed](#features) and scale to Terascale data @@ -13,7 +13,7 @@ Contents -------- * [What's New](#whats-new) * [Version](#version) -* [Documentation](doc/README.md) +* [Documentation](doc/index.md) * [Build Instruction](doc/build.md) * [Features](#features) * [Distributed XGBoost](multi-node) @@ -43,15 +43,14 @@ Version Features -------- - -* Easily accessible through CLI, [python](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/basic_walkthrough.py), - [R](https://github.com/dmlc/xgboost/blob/master/R-package/demo/basic_walkthrough.R), +* Easily accessible through CLI, [python](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/basic_walkthrough.py), + [R](https://github.com/dmlc/xgboost/blob/master/R-package/demo/basic_walkthrough.R), [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl) * Its fast! 
Benchmark numbers comparing xgboost, H20, Spark, R - [benchm-ml numbers](https://github.com/szilard/benchm-ml) * Memory efficient - Handles sparse matrices, supports external memory * Accurate prediction, and used extensively by data scientists and kagglers - [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links) * Distributed version runs on Hadoop (YARN), MPI, SGE etc., scales to billions of examples. - + Bug Reporting ------------- @@ -74,4 +73,4 @@ License XGBoost in Graphlab Create -------------------------- * XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to do data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the [Graphlab Create](http://graphlab.com/products/create/quick-start-guide.html) -* Nice [blogpost](http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand) by Jay Gu about using GLC boosted tree to solve kaggle bike sharing challenge: +* Nice [blogpost](http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand) by Jay Gu about using GLC boosted tree to solve kaggle bike sharing challenge: diff --git a/doc/.gitignore b/doc/.gitignore new file mode 100644 index 000000000..382c3419f --- /dev/null +++ b/doc/.gitignore @@ -0,0 +1,7 @@ +html +latex +*.sh +_* +doxygen +parser.py +*.pyc diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 000000000..40bba2a28 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,192 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  applehelp  to make an Apple Help Book"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+	@echo "  coverage   to run coverage check of the documentation (if enabled)"
+
+clean:
+	rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/rabit.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/rabit.qhc"
+
+applehelp:
+	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
+	@echo
+	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
+	@echo "N.B. You won't be able to view it unless you put it in" \
+	      "~/Library/Documentation/Help or install it in your application" \
+	      "bundle."
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished." 
+ @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/rabit" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/rabit" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 000000000..b08f495f5 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- +# +# documentation build configuration file, created by +# sphinx-quickstart on Thu Jul 23 19:40:08 2015. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. 
+# +# All configuration values have a default; values that are commented out +# serve to show the default. +import sys +import os, subprocess +import shlex +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +libpath = os.path.join(curr_path, '../python-package/') +sys.path.insert(0, libpath) +sys.path.insert(0, curr_path) + +from sphinx_util import MarkdownParser + +# -- General configuration ------------------------------------------------ + +# General information about the project. +project = u'xgboost' +author = u'%s developers' % project +copyright = u'2015, %s' % author +github_doc_root = 'https://github.com/dmlc/xgboost/tree/master/doc/' + +# add markdown parser +MarkdownParser.github_doc_root = github_doc_root +source_parsers = { + '.md': MarkdownParser, +} +os.environ['XGBOOST_BUILD_DOC'] = '1' +# Version information. +import xgboost +version = xgboost.__version__ +release = xgboost.__version__ + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.mathjax', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = ['.rst', '.md'] + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. 
+html_theme = 'alabaster'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = project + 'doc'
+
+# -- Options for LaTeX output ---------------------------------------------
+latex_elements = {
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, '%s.tex' % project, project,
+     author, 'manual'),
+]
+
+# hook for doxygen
+def run_doxygen(folder):
+    """Run the doxygen make command in the designated folder."""
+    try:
+        retcode = subprocess.call("cd %s; make doxygen" % folder, shell=True)
+        if retcode < 0:
+            sys.stderr.write("doxygen terminated by signal %s" % (-retcode))
+    except OSError as e:
+        sys.stderr.write("doxygen execution failed: %s" % e)
+
+def generate_doxygen_xml(app):
+    """Run the doxygen make commands if we're on the ReadTheDocs server."""
+    read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
+    if read_the_docs_build:
+        run_doxygen('..')
+
+def setup(app):
+    # Add hook for building doxygen xml when needed
+    # no c++ API for now
+    # app.connect("builder-inited", generate_doxygen_xml)
+    pass
diff --git a/doc/external_memory.md b/doc/external_memory.md
index f8eec83fc..e50c02e57 100644
--- a/doc/external_memory.md
+++ b/doc/external_memory.md
@@ -1,5 +1,5 @@
 Using XGBoost External Memory Version(beta)
-====
+===========================================
 There is no big difference between using external memory version and in-memory version.
 The only difference is the filename format.
@@ -19,13 +19,13 @@ You can find that there is additional ```#dtrain.cache``` following the libsvm f
 For CLI version, simply use ```"../data/agaricus.txt.train#dtrain.cache"``` in filename.

 Performance Note
-====
+----------------
 * the parameter ```nthread``` should be set to number of ***real*** cores
   - Most modern CPU offer hyperthreading, which means you can have a 4 core cpu with 8 threads
   - Set nthread to be 4 for maximum performance in such case

 Distributed Version
-====
+-------------------
 The external memory mode naturally works on distributed version, you can simply set path like
 ```
 data = "hdfs:///path-to-data/#dtrain.cache"
 ```
 xgboost will cache the data to the local position. When you run on YARN, the current folder is temporary
 so that you can directly use ```dtrain.cache``` to cache to current folder. 
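To make the filename convention concrete, here is a minimal sketch for the Python module; the demo libsvm path is an assumption, and only the `#dtrain.cache` suffix differs from the in-memory version:

```python
import xgboost as xgb

# The '#dtrain.cache' suffix names the on-disk cache file; the libsvm
# data is then streamed rather than fully loaded into memory.
dtrain = xgb.DMatrix('../data/agaricus.txt.train#dtrain.cache')
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
bst = xgb.train(param, dtrain, num_boost_round=10)
```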
-Usage Note:
-====
+Usage Note
+----------
 * This is an experimental version
   - If you would like to try and test it, report results to https://github.com/dmlc/xgboost/issues/244
 * Currently only importing from libsvm format is supported
diff --git a/doc/README.md b/doc/index.md
similarity index 87%
rename from doc/README.md
rename to doc/index.md
index e8df7d57d..5d8d5b26f 100644
--- a/doc/README.md
+++ b/doc/index.md
@@ -1,6 +1,9 @@
-List of Documentations
-====
-* [Using XGBoost in Python](python.md)
+XGBoost Documentation
+=====================
+
+
+
+* [Using XGBoost in Python](python/python_intro.md)
 * [Using XGBoost in R](../R-package/vignettes/xgboostPresentation.Rmd)
 * [Learning to use xgboost by example](../demo)
 * [External Memory Version](external_memory.md)
@@ -11,24 +14,29 @@
   - [Notes on Parameter Tuning](param_tuning.md)
 * Learning about the model: [Introduction to Boosted Trees](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)

-How to get started
-====
+
+How to Get Started
+------------------
 * Try to read the [binary classification example](../demo/binary_classification) as a getting-started example
 * Find the language-specific guide above for the language you would like to use
 * [Learning to use xgboost by example](../demo) contains lots of useful examples

-Highlight Links
-====
+Example Highlight Links
+-----------------------
 This section is about blogposts, presentations and videos discussing how to use xgboost to solve your interesting problem. If you think something belongs here, send a pull request.
 * [Kaggle CrowdFlower winner's solution by Chenglong Chen](https://github.com/ChenglongChen/Kaggle_CrowdFlower)
 * [Kaggle Malware Prediction winner's solution](https://github.com/xiaozhouwang/kaggle_Microsoft_Malware)
 * [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution)
 * [Feature Importance Analysis with XGBoost in Tax audit](http://fr.slideshare.net/MichaelBENESTY/feature-importance-analysis-with-xgboost-in-tax-audit)
 * Video tutorial: [Better Optimization with Repeated Cross Validation and the XGBoost model](https://www.youtube.com/watch?v=Og7CGAfSr_Y)
 * [Winning solution of Kaggle Higgs competition: what a single model can do](http://no2147483647.wordpress.com/2014/09/17/winning-solution-of-kaggle-higgs-competition-what-a-single-model-can-do/)
+
+API Reference
+-------------
+* [Python API Reference](python/python_api.rst)

 Contribution
-====
+------------
 Contributions of documents and use-cases are welcome!
 * This package uses Google C++ style
 * Check the codestyle tool
diff --git a/doc/input_format.md b/doc/input_format.md
index 557b87512..3986d07fb 100644
--- a/doc/input_format.md
+++ b/doc/input_format.md
@@ -1,12 +1,13 @@
-Input Format
-====
+Text Input Format of DMatrix
+============================
+
 ## Basic Input Format
 As we have mentioned, XGBoost takes LibSVM format. For training or predicting, XGBoost takes an instance file with the format as below:

 train.txt
 ```
 1 101:1.2 102:0.03
-0 1:2.1 10001:300 10002:400 
+0 1:2.1 10001:300 10002:400
 0 0:1.3 1:0.3
 1 0:0.01 1:0.3
 0 0:0.2 1:0.3
@@ -37,7 +38,7 @@ train.txt.weight
 0.5
 ```
 It means that XGBoost will put more emphasis on the first and fourth instances, that is to say the positive instances, while training. 
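The same weighting can also be set up in memory from the Python module; a minimal sketch mirroring the example files above (the toy feature matrix is an assumption; the `weight` argument matches the one used by the sklearn wrapper earlier in this series):

```python
import numpy as np
import xgboost as xgb

# Five instances with the weights from train.txt.weight above: the
# first and fourth (positive) instances receive the larger weight.
X = np.random.rand(5, 3)
y = np.array([1, 0, 0, 1, 0])
weights = np.array([1.0, 0.5, 0.5, 1.0, 0.5])
dtrain = xgb.DMatrix(X, label=y, weight=weights)
```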
-The configuration is similar to configuring the group information. If the instance file name is "xxx", XGBoost will check whether there is a file named "xxx.weight" in the same directory and if there is, will use the weights while training models. Weights will be included into an "xxx.buffer" file that is created by XGBoost automatically. If you want to update the weights, you need to delete the "xxx.buffer" file prior to launching XGBoost. +The configuration is similar to configuring the group information. If the instance file name is "xxx", XGBoost will check whether there is a file named "xxx.weight" in the same directory and if there is, will use the weights while training models. Weights will be included into an "xxx.buffer" file that is created by XGBoost automatically. If you want to update the weights, you need to delete the "xxx.buffer" file prior to launching XGBoost. ## Initial Margin file XGBoost supports providing each instance an initial margin prediction. For example, if we have a initial prediction using logistic regression for "train.txt" file, we can create the following file: diff --git a/doc/parameter.md b/doc/parameter.md index 13eefa0fe..53cdd806f 100644 --- a/doc/parameter.md +++ b/doc/parameter.md @@ -1,5 +1,5 @@ XGBoost Parameters -==== +================== Before running XGboost, we must set three types of parameters, general parameters, booster parameters and task parameters: - General parameters relates to which booster we are using to do boosting, commonly tree or linear model - Booster parameters depends on which booster you have chosen diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst new file mode 100644 index 000000000..e665efe84 --- /dev/null +++ b/doc/python/python_api.rst @@ -0,0 +1,36 @@ +Python API Reference +==================== +This page gives the Python API reference of xgboost. + +Core Data Structure +------------------- +.. automodule:: xgboost.core + +.. autoclass:: xgboost.DMatrix + :members: + :show-inheritance: + +.. autoclass:: xgboost.Booster + :members: + :show-inheritance: + + +Learning API +------------ +.. automodule:: xgboost.training + +.. autofunction:: xgboost.train + +.. autofunction:: xgboost.cv + + +Scikit-Learn API +---------------- +.. automodule:: xgboost.sklearn +.. autoclass:: xgboost.XGBRegressor + :members: + :show-inheritance: +.. autoclass:: xgboost.XGBClassifier + :members: + :show-inheritance: + diff --git a/doc/python.md b/doc/python/python_intro.md similarity index 98% rename from doc/python.md rename to doc/python/python_intro.md index 93b5c43d4..2acb73b3c 100644 --- a/doc/python.md +++ b/doc/python/python_intro.md @@ -1,5 +1,5 @@ XGBoost Python Module -==== +===================== This page will introduce XGBoost Python module, including: * [Building and Import](#building-and-import) @@ -8,6 +8,7 @@ This page will introduce XGBoost Python module, including: * [Train Model](#training-model) * [Early Stopping](#early-stopping) * [Prediction](#prediction) +* [API Reference](python_api.md) A [walk through python example](https://github.com/tqchen/xgboost/blob/master/demo/guide-python) for UCI Mushroom dataset is provided. 
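Since the `xxx.weight` convention from the text input format section above is easy to get wrong, here is a minimal Python sketch of producing such a file by hand; the file name `train.txt.weight` and the weight values mirror the doc's example, and writing the file this way is only one possible approach:

```python
# Minimal sketch: write one weight per training instance, so that the CLI
# version of XGBoost picks the file up next to "train.txt" (the names here
# follow the doc's example, they are not a fixed API).
weights = [1.0, 0.5, 0.5, 1.0, 0.5]
with open('train.txt.weight', 'w') as fout:
    for w in weights:
        fout.write('%s\n' % w)
```

Remember the caveat above: weights get baked into the auto-generated `train.txt.buffer`, so delete that buffer file before re-training with updated weights.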
diff --git a/doc/sphinx_util.py b/doc/sphinx_util.py
new file mode 100644
index 000000000..33c98d381
--- /dev/null
+++ b/doc/sphinx_util.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+"""Helper hacking utility function for customization."""
+import sys
+import os
+import subprocess
+
+# TODO: make less hacky way than this one
+if os.environ.get('READTHEDOCS', None) == 'True':
+    subprocess.call('cd ..; rm -rf recommonmark;' +
+                    'git clone https://github.com/tqchen/recommonmark;' +
+                    'cp recommonmark/recommonmark/parser.py doc/parser.py', shell=True)
+
+sys.path.insert(0, os.path.abspath('..'))
+import parser
+
+class MarkdownParser(parser.CommonMarkParser):
+    github_doc_root = None
+    doc_suffix = set(['md', 'rst'])
+
+    @staticmethod
+    def remap_url(url):
+        if MarkdownParser.github_doc_root is None or url is None:
+            return url
+        if url.startswith('#'):
+            return url
+        arr = url.split('#', 1)
+        ssuffix = arr[0].rsplit('.', 1)
+
+        if len(ssuffix) == 2 and (ssuffix[-1] in MarkdownParser.doc_suffix
+                and arr[0].find('://') == -1):
+            arr[0] = ssuffix[0] + '.html'
+            return '#'.join(arr)
+        else:
+            if arr[0].find('://') == -1:
+                return MarkdownParser.github_doc_root + url
+            else:
+                return url
+
+    def reference(self, block):
+        block.destination = MarkdownParser.remap_url(block.destination)
+        return super(MarkdownParser, self).reference(block)
+
+# inplace modify the function in recommonmark module to allow link remap
+old_ref = parser.reference
+
+def reference(block):
+    block.destination = MarkdownParser.remap_url(block.destination)
+    return old_ref(block)
+
+parser.reference = reference
diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py
index 6f967b837..b284c27e0 100644
--- a/python-package/xgboost/__init__.py
+++ b/python-package/xgboost/__init__.py
@@ -10,3 +10,7 @@ from .training import train, cv
 from .sklearn import XGBModel, XGBClassifier, XGBRegressor
 
 __version__ = '0.4'
+
+__all__ = ['DMatrix', 'Booster',
+           'train', 'cv',
+           'XGBModel', 'XGBClassifier', 'XGBRegressor']
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 85017cb82..0849d276c 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -50,20 +50,24 @@ def find_lib_path():
     else:
         dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path]
     lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
-    if len(lib_path) == 0:
+    if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False):
         raise XGBoostLibraryNotFound(
-            'Cannot find XGBoost Libarary in the candicate path %s,' +
-            'Did you run build.sh in root oath?' % str(dll_path))
+            'Cannot find XGBoost library in the candidate path, ' +
+            'did you run build.sh in the root path?\n'
+            'List of candidates:\n' + ('\n'.join(dll_path)))
     return lib_path
 
+
 def _load_lib():
     """Load xgboost Library."""
     lib_path = find_lib_path()
+    if len(lib_path) == 0:
+        return None
     lib = ctypes.cdll.LoadLibrary(lib_path[0])
     lib.XGBGetLastError.restype = ctypes.c_char_p
     return lib
 
 
 # load the XGBoost library globally
 _LIB = _load_lib()
@@ -119,6 +123,7 @@ class DMatrix(object):
 
     DMatrix is an internal data structure that is used by XGBoost,
     which is optimized for both memory efficiency and training speed.
+    You can construct DMatrix from numpy.arrays
     """
     def __init__(self, data, label=None, missing=0.0, weight=None, silent=False):
         """
@@ -127,15 +132,16 @@ class DMatrix(object):
         Parameters
         ----------
         data : string/numpy array/scipy.sparse
-            Data source, string type is the path of svmlight format txt file,
-            xgb buffer or path to cache_file
-        label : list or numpy 1-D array (optional)
+            Data source of DMatrix.
+            When data is string type, it represents the path of a libsvm format txt file,
+            or a binary file that xgboost can read from.
+        label : list or numpy 1-D array, optional
             Label of the training data.
-        missing : float
+        missing : float, optional
             Value in the data which needs to be present as a missing value.
-        weight : list or numpy 1-D array (optional)
+        weight : list or numpy 1-D array, optional
             Weight for each instance.
-        silent: boolean
+        silent : boolean, optional
             Whether to print messages during construction
         """
         # force into void_p, mac need to pass things in as void_p
@@ -469,13 +475,22 @@ class Booster(object):
         """Copy the booster object.
 
         Returns
-        --------
-        a copied booster model
+        -------
+        booster: `Booster`
+            a copied booster model
         """
         return self.__copy__()
 
     def set_param(self, params, value=None):
-        """Set parameters into the DMatrix."""
+        """Set parameters into the Booster.
+
+        Parameters
+        ----------
+        params: dict/list/str
+            list of key,value pairs, dict of key to value, or simply a str key
+        value: optional
+            value of the specified parameter, when params is a str key
+        """
         if isinstance(params, collections.Mapping):
             params = params.items()
         elif isinstance(params, STRING_TYPES) and value is not None:
@@ -485,7 +500,7 @@ class Booster(object):
 
     def update(self, dtrain, iteration, fobj=None):
         """
-        Update (one iteration).
+        Update for one iteration, with objective function calculated internally.
 
         Parameters
         ----------
@@ -507,7 +522,7 @@ class Booster(object):
 
     def boost(self, dtrain, grad, hess):
         """
-        Update.
+        Boost the booster for one iteration, with customized gradient statistics.
 
         Parameters
         ----------
@@ -542,7 +557,8 @@ class Booster(object):
 
         Returns
         -------
-        evaluation result
+        result: str
+            Evaluation result string.
         """
         if feval is None:
             for d in evals:
@@ -567,18 +583,21 @@ class Booster(object):
     def eval(self, data, name='eval', iteration=0):
         """Evaluate the model on mat.
 
         Parameters
-        ---------
+        ----------
         data : DMatrix
             The dmatrix storing the input.
 
-        name : str (default = 'eval')
-            The name of the dataset
+        name : str, optional
+            The name of the dataset.
+
+        iteration : int, optional
+            The current iteration number.
 
-        iteration : int (default = 0)
-            The current iteration number
+        Returns
+        -------
+        result: str
+            Evaluation result string.
         """
         return self.eval_set([(data, name)], iteration)
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 4a5771724..6f176972a 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -206,9 +206,9 @@ class XGBModel(XGBModelBase):
 
 class XGBClassifier(XGBModel, XGBClassifierBase):
     # pylint: disable=missing-docstring,too-many-arguments,invalid-name
-    __doc__ = """
-    Implementation of the scikit-learn API for XGBoost classification
-    """ + "\n".join(XGBModel.__doc__.split('\n')[2:])
+    __doc__ = """Implementation of the scikit-learn API for XGBoost classification.
+
+
+    """ + '\n'.join(XGBModel.__doc__.split('\n')[2:])
 
     def __init__(self, max_depth=3, learning_rate=0.1,
                  n_estimators=100, silent=True,
@@ -335,7 +335,5 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
 
 class XGBRegressor(XGBModel, XGBRegressorBase):
     # pylint: disable=missing-docstring
-    __doc__ = """
-    Implementation of the scikit-learn API for XGBoost regression
-    """ + "\n".join(XGBModel.__doc__.split('\n')[2:])
-
+    __doc__ = """Implementation of the scikit-learn API for XGBoost regression.
+    """ + '\n'.join(XGBModel.__doc__.split('\n')[2:])

From e8de5da3a56661b19e237bf0e60302b69731c35d Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sat, 1 Aug 2015 13:47:41 -0700
Subject: [PATCH 72/83] Document refactor change badge

---
 CONTRIBUTORS.md                               | 15 +++-
 R-package/DESCRIPTION                         |  4 +-
 R-package/README.md                           |  8 +-
 R-package/vignettes/xgboostPresentation.Rmd   | 46 +++++------
 README.md                                     |  9 ++-
 demo/README.md                                | 10 +--
 demo/guide-python/README.md                   |  4 +-
 .../kaggle-otto/understandingXGBoostModel.Rmd | 14 ++--
 doc/README                                    |  5 ++
 doc/build.md                                  | 26 +++----
 doc/conf.py                                   | 13 +++-
 doc/dev-guide/contribute.md                   | 13 ++++
 doc/faq.md                                    | 61 +++++++++++++++
 doc/index.md                                  | 78 +++++++++++++------
 doc/param_tuning.md                           | 12 +--
 doc/parameter.md                              | 25 +++---
 doc/python-requirements.txt                   |  2 +
 doc/python/python_api.rst                     |  5 +-
 doc/python/python_intro.md                    | 73 +++++++++--------
 doc/sphinx_util.py                            | 47 ++---------
 20 files changed, 286 insertions(+), 184 deletions(-)
 create mode 100644 doc/README
 create mode 100644 doc/dev-guide/contribute.md
 create mode 100644 doc/faq.md
 create mode 100644 doc/python-requirements.txt

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 71f48a166..36ccc9d5d 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -1,9 +1,9 @@
 Contributors of DMLC/XGBoost
-=======
+============================
 XGBoost has been developed and used by a group of active community members.
 Everyone is more than welcome to contribute; it is a great way to make the project better and more accessible to more users.
 
 Committers
-=======
+----------
 Committers are people who have made substantial contributions to the project and have been granted write access to the project.
 * [Tianqi Chen](https://github.com/tqchen), University of Washington
   - Tianqi is a PhD student working on large-scale machine learning; he is the creator of the project.
@@ -14,8 +14,17 @@ Committers are people who have made substantial contributions to the project
 * [Michael Benesty](https://github.com/pommedeterresautee)
   - Michael is a lawyer and data scientist in France; he is the creator of the xgboost interactive analysis module in R.
+
+Become a Committer
+------------------
+XGBoost is an open-source project and we are actively looking for new committers who are willing to help maintain and lead the project.
+Committers come from contributors who:
+* Made substantial contributions to the project.
+* Are willing to spend time on maintaining and leading the project.
+
+New committers will be proposed by the current committers, with support from more than two current committers.
+ List of Contributors -======= +-------------------- * [Full List of Contributors](https://github.com/dmlc/xgboost/graphs/contributors) - To contributors: please add your name to the list when you submit a patch to the project:) * [Kailong Chen](https://github.com/kalenhaha) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 4560971e2..19410d65a 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,8 +1,8 @@ Package: xgboost Type: Package Title: Extreme Gradient Boosting -Version: 0.4-1 -Date: 2015-05-11 +Version: 0.4-2 +Date: 2015-08-01 Author: Tianqi Chen , Tong He , Michael Benesty Maintainer: Tong He Description: Extreme Gradient Boosting, which is an diff --git a/R-package/README.md b/R-package/README.md index 96113c391..294691416 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -4,7 +4,13 @@ R package for xgboost Installation ------------ -For up-to-date version (which is recommended), please install from github. Windows user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first. +We are [on CRAN](https://cran.r-project.org/web/packages/xgboost/index.html) now. For stable/pre-compiled(for Windows and OS X) version, please install from CRAN: + +```r +install.packages('xgboost') +``` + +For up-to-date version, please install from github. Windows user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first. ```r devtools::install_github('dmlc/xgboost',subdir='R-package') diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index 39ab819f7..89d27fb45 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -1,6 +1,6 @@ --- title: "Xgboost presentation" -output: +output: rmarkdown::html_vignette: css: vignette.css number_sections: yes @@ -16,7 +16,7 @@ vignette: > Introduction ============ -**Xgboost** is short for e**X**treme **G**radient **Boost**ing package. +**Xgboost** is short for e**X**treme **G**radient **Boost**ing package. The purpose of this Vignette is to show you how to use **Xgboost** to build a model and make predictions. @@ -25,9 +25,9 @@ It is an efficient and scalable implementation of gradient boosting framework by - *linear* model ; - *tree learning* algorithm. -It supports various objective functions, including *regression*, *classification* and *ranking*. The package is made to be extendible, so that users are also allowed to define their own objective functions easily. +It supports various objective functions, including *regression*, *classification* and *ranking*. The package is made to be extendible, so that users are also allowed to define their own objective functions easily. -It has been [used](https://github.com/dmlc/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions. +It has been [used](https://github.com/dmlc/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions. It has several features: @@ -64,7 +64,7 @@ Formerly available versions can be obtained from the CRAN [archive](http://cran. Learning ======== -For the purpose of this tutorial we will load **Xgboost** package. +For the purpose of this tutorial we will load **XGBoost** package. 
```{r libLoading, results='hold', message=F, warning=F} require(xgboost) @@ -73,7 +73,7 @@ require(xgboost) Dataset presentation -------------------- -In this example, we are aiming to predict whether a mushroom can be eaten or not (like in many tutorials, example data are the the same as you will use on in your every day life :-). +In this example, we are aiming to predict whether a mushroom can be eaten or not (like in many tutorials, example data are the the same as you will use on in your every day life :-). Mushroom data is cited from UCI Machine Learning Repository. @Bache+Lichman:2013. @@ -85,7 +85,7 @@ We will load the `agaricus` datasets embedded with the package and will link the The datasets are already split in: * `train`: will be used to build the model ; -* `test`: will be used to assess the quality of our model. +* `test`: will be used to assess the quality of our model. Why *split* the dataset in two parts? @@ -115,7 +115,7 @@ dim(train$data) dim(test$data) ``` -This dataset is very small to not make the **R** package too heavy, however **Xgboost** is built to manage huge dataset very efficiently. +This dataset is very small to not make the **R** package too heavy, however **XGBoost** is built to manage huge dataset very efficiently. As seen below, the `data` are stored in a `dgCMatrix` which is a *sparse* matrix and `label` vector is a `numeric` vector (`{0,1}`): @@ -124,7 +124,7 @@ class(train$data)[1] class(train$label) ``` -Basic Training using Xgboost +Basic Training using XGBoost ---------------------------- This step is the most critical part of the process for the quality of our model. @@ -160,7 +160,7 @@ bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth #### xgb.DMatrix -**Xgboost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be usefull for the most advanced features we will discover later. +**XGBoost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be usefull for the most advanced features we will discover later. ```{r trainingDmatrix, message=F, warning=F} dtrain <- xgb.DMatrix(data = train$data, label = train$label) @@ -169,7 +169,7 @@ bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround #### Verbose option -**Xgboost** has severa features to help you to view how the learning progress internally. The purpose is to help you to set the best parameters, which is the key of your model quality. +**XGBoost** has severa features to help you to view how the learning progress internally. The purpose is to help you to set the best parameters, which is the key of your model quality. One of the simplest way to see the training progress is to set the `verbose` option (see below for more advanced technics). @@ -188,7 +188,7 @@ bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, o bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 2) ``` -Basic prediction using Xgboost +Basic prediction using XGBoost ============================== Perform the prediction @@ -211,7 +211,7 @@ These numbers doesn't look like *binary classification* `{0,1}`. We need to perf Transform the regression in a binary classification --------------------------------------------------- -The only thing that **Xgboost** does is a *regression*. **Xgboost** is using `label` vector to build its *regression* model. 
+The only thing that **XGBoost** does is a *regression*. **XGBoost** uses the `label` vector to build its *regression* model.
 
 How can we use a *regression* model to perform a binary classification?
 
@@ -240,7 +240,7 @@ Steps explanation:
 2. `probabilityVectorPreviouslyComputed != test$label` computes the vector of errors between true data and computed probabilities;
 3. `mean(vectorOfErrors)` computes the *average error* itself.
 
-The most important thing to remember is that **to do a classification, you just do a regression to the** `label` **and then apply a threshold**. 
+The most important thing to remember is that **to do a classification, you just do a regression to the** `label` **and then apply a threshold**.
 
 *Multiclass* classification works in a similar way.
 
@@ -269,7 +269,7 @@ Both `xgboost` (simple) and `xgb.train` (advanced) functions train models.
 
 One of the special features of `xgb.train` is the capacity to follow the progress of the learning after each round. Because of the way boosting works, there is a time when having too many rounds leads to overfitting. You can see this feature as a cousin of the cross-validation method. The following techniques will help you to avoid overfitting and to optimize the learning time by stopping it as soon as possible.
 
-One way to measure progress in learning of a model is to provide to **Xgboost** a second dataset already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.
+One way to measure progress in the learning of a model is to provide **XGBoost** with a second dataset that is already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.
 
 > in some way it is similar to what we have done above with the average error. The main difference is that below it was after building the model, and now it is during the construction that we measure errors.
 
@@ -281,7 +281,7 @@ watchlist <- list(train=dtrain, test=dtest)
 bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
 ```
 
-**Xgboost** has computed at each round the same average error metric than seen above (we set `nround` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.
+**XGBoost** has computed at each round the same average error metric as seen above (we set `nround` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.
 
 Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.
 
@@ -298,13 +298,13 @@ bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchli
 Linear boosting
 ---------------
 
-Until know, all the learnings we have performed were based on boosting trees. **Xgboost** implements a second algorithm, based on linear boosting. The only difference with previous command is `booster = "gblinear"` parameter (and removing `eta` parameter).
+Until now, all the learning we have performed was based on boosting trees. **XGBoost** implements a second algorithm, based on linear boosting.
The only difference from the previous command is the `booster = "gblinear"` parameter (and the removal of the `eta` parameter).
 
 ```{r linearBoosting, message=F, warning=F}
 bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
 ```
 
-In this specific case, *linear boosting* gets sligtly better performance metrics than decision trees based algorithm. 
+In this specific case, *linear boosting* gets slightly better performance metrics than the decision-tree-based algorithm.
 
 In simple cases, it will happen because there is nothing better than a linear algorithm to catch a linear link. However, decision trees are much better at catching a non-linear link between predictors and outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to have an idea of what to use.
 
@@ -340,7 +340,7 @@ print(paste("test-error=", err))
 View feature importance/influence from the learnt model
 -------------------------------------------------------
 
-Feature importance is similar to R gbm package's relative influence (rel.inf). 
+Feature importance is similar to the R gbm package's relative influence (rel.inf).
 
 ```
 importance_matrix <- xgb.importance(model = bst)
@@ -370,7 +370,7 @@ Save and load models
 
 Maybe your dataset is big, and it takes time to train a model on it? Maybe you are not a big fan of losing time redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required.
 
-Hopefully for you, **Xgboost** implements such functions.
+Luckily for you, **XGBoost** implements such functions.
 
 ```{r saveModel, message=F, warning=F}
 # save model to binary local file
@@ -397,7 +397,7 @@ file.remove("./xgboost.model")
 
 > result is `0`? We are good!
 
-In some very specific cases, like when you want to pilot **Xgboost** from `caret` package, you will want to save the model as a *R* binary vector. See below how to do it.
+In some very specific cases, like when you want to pilot **XGBoost** from the `caret` package, you will want to save the model as an *R* binary vector. See below how to do it.
 
 ```{r saveLoadRBinVectorModel, message=F, warning=F}
 # save model to R's raw vector
@@ -412,9 +412,9 @@ pred3 <- predict(bst3, test$data)
 
 # pred3 should be identical to pred
 print(paste("sum(abs(pred3-pred))=", sum(abs(pred3-pred))))
-``` 
+```
 
-> Again `0`? It seems that `Xgboost` works pretty well!
+> Again `0`? It seems that `XGBoost` works pretty well!
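For readers following along in Python rather than R, a rough analogue of the save/load round trip above is sketched below; it assumes a trained `Booster` called `bst` (as in the Python introduction later in this patch series), and the file name is illustrative:

```python
import xgboost as xgb

# minimal sketch of saving a model and loading it back in the Python package;
# `bst` is assumed to be an already-trained Booster
bst.save_model('xgboost.model')                 # save to a binary local file
bst2 = xgb.Booster(model_file='xgboost.model')  # load it back
```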
References ========== diff --git a/README.md b/README.md index be93e99fd..ac29ef7eb 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ -DMLC/XGBoost -======= - -[![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost) [![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) + eXtreme Gradient Boosting +=========== +[![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost) +[![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org) +[![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. diff --git a/demo/README.md b/demo/README.md index fcfaa8434..d6f061484 100644 --- a/demo/README.md +++ b/demo/README.md @@ -1,12 +1,12 @@ -XGBoost Examples -==== +XGBoost Code Examples +===================== This folder contains all the code examples using xgboost. * Contribution of examples, benchmarks is more than welcome! * If you like to share how you use xgboost to solve your problem, send a pull request:) Features Walkthrough -==== +-------------------- This is a list of short codes introducing different functionalities of xgboost packages. * Basic walkthrough of packages [python](guide-python/basic_walkthrough.py) @@ -37,7 +37,7 @@ This is a list of short codes introducing different functionalities of xgboost p [R](../R-package/demo/predict_leaf_indices.R) Basic Examples by Tasks -==== +----------------------- Most of examples in this section are based on CLI or python version. 
However, the parameter settings can be applied to all versions * [Binary classification](binary_classification) @@ -46,7 +46,7 @@ However, the parameter settings can be applied to all versions * [Learning to Rank](rank) Benchmarks -==== +---------- * [Starter script for Kaggle Higgs Boson](kaggle-higgs) * [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution) diff --git a/demo/guide-python/README.md b/demo/guide-python/README.md index 32d0290ab..ff1f98ad0 100644 --- a/demo/guide-python/README.md +++ b/demo/guide-python/README.md @@ -1,6 +1,6 @@ XGBoost Python Feature Walkthrough -==== -* [Basic walkthrough of wrappers](basic_walkthrough.py) +================================== +* [Basic walkthrough of wrappers](basic_walkthrough.py) * [Cutomize loss function, and evaluation metric](custom_objective.py) * [Boosting from existing prediction](boost_from_prediction.py) * [Predicting using first n trees](predict_first_ntree.py) diff --git a/demo/kaggle-otto/understandingXGBoostModel.Rmd b/demo/kaggle-otto/understandingXGBoostModel.Rmd index 6bd64401d..e04277d4e 100644 --- a/demo/kaggle-otto/understandingXGBoostModel.Rmd +++ b/demo/kaggle-otto/understandingXGBoostModel.Rmd @@ -1,7 +1,7 @@ --- title: "Understanding XGBoost Model on Otto Dataset" author: "Michaël Benesty" -output: +output: rmarkdown::html_vignette: css: ../../R-package/vignettes/vignette.css number_sections: yes @@ -54,7 +54,7 @@ test[1:6,1:5, with =F] Each *column* represents a feature measured by an `integer`. Each *row* is an **Otto** product. -Obviously the first column (`ID`) doesn't contain any useful information. +Obviously the first column (`ID`) doesn't contain any useful information. To let the algorithm focus on real stuff, we will delete it. @@ -124,7 +124,7 @@ param <- list("objective" = "multi:softprob", cv.nround <- 5 cv.nfold <- 3 -bst.cv = xgb.cv(param=param, data = trainMatrix, label = y, +bst.cv = xgb.cv(param=param, data = trainMatrix, label = y, nfold = cv.nfold, nrounds = cv.nround) ``` > As we can see the error rate is low on the test dataset (for a 5mn trained model). @@ -144,7 +144,7 @@ Feature importance So far, we have built a model made of **`r nround`** trees. -To build a tree, the dataset is divided recursively several times. At the end of the process, you get groups of observations (here, these observations are properties regarding **Otto** products). +To build a tree, the dataset is divided recursively several times. At the end of the process, you get groups of observations (here, these observations are properties regarding **Otto** products). Each division operation is called a *split*. @@ -158,7 +158,7 @@ In the same way, in Boosting we try to optimize the missclassification at each r The improvement brought by each *split* can be measured, it is the *gain*. -Each *split* is done on one feature only at one value. +Each *split* is done on one feature only at one value. Let's see what the model looks like. @@ -168,7 +168,7 @@ model[1:10] ``` > For convenience, we are displaying the first 10 lines of the model only. -Clearly, it is not easy to understand what it means. +Clearly, it is not easy to understand what it means. Basically each line represents a *branch*, there is the *tree* ID, the feature ID, the point where it *splits*, and information regarding the next *branches* (left, right, when the row for this feature is N/A). 
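The Python package offers a similar view of the fitted trees; as a minimal sketch, assuming a trained `Booster` named `bst` and that the plain-text dump facility behaves as in the Python walkthroughs:

```python
# minimal sketch of inspecting the learned trees from Python;
# the dump contains one line per branch: tree ID, split feature and
# value, and the left/right/missing children
bst.dump_model('dump.raw.txt')
with open('dump.raw.txt') as fin:
    print(fin.read())
```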
@@ -217,7 +217,7 @@ xgb.plot.tree(feature_names = names, model = bst, n_first_tree = 2) We are just displaying the first two trees here. -On simple models the first two trees may be enough. Here, it might not be the case. We can see from the size of the trees that the intersaction between features is complicated. +On simple models the first two trees may be enough. Here, it might not be the case. We can see from the size of the trees that the intersaction between features is complicated. Besides, **XGBoost** generate `k` trees at each round for a `k`-classification problem. Therefore the two trees illustrated here are trying to classify data into different classes. Going deeper diff --git a/doc/README b/doc/README new file mode 100644 index 000000000..a14ad800b --- /dev/null +++ b/doc/README @@ -0,0 +1,5 @@ +The document of xgboost is generated with recommonmark and sphinx. + +You can build it locally by typing "make html" in this folder. +- You will need to rerun the recommonmark script for readthedocs in sphinx_util. +- This was a hack to get the customized parser into readthedocs, hopefully to be removed in future. diff --git a/doc/build.md b/doc/build.md index 7b8ee96aa..b97237bcb 100644 --- a/doc/build.md +++ b/doc/build.md @@ -1,5 +1,5 @@ Build XGBoost -==== +============= * Run ```bash build.sh``` (you can also type make) * If you have C++11 compiler, it is recommended to type ```make cxx11=1``` - C++11 is not used by default @@ -12,19 +12,19 @@ Build XGBoost * OS X with multi-threading support: see [next section](#openmp-for-os-x) Build XGBoost in OS X with OpenMP -==== +--------------------------------- Here is the complete solution to use OpenMp-enabled compilers to install XGBoost. 1. Obtain gcc with openmp support by `brew install gcc --without-multilib` **or** clang with openmp by `brew install clang-omp`. The clang one is recommended because the first method requires us compiling gcc inside the machine (more than an hour in mine)! (BTW, `brew` is the de facto standard of `apt-get` on OS X. So installing [HPC](http://hpc.sourceforge.net/) separately is not recommended, but it should work.) -2. **if you are planing to use clang-omp** - in step 3 and/or 4, change line 9 in `xgboost/src/utils/omp.h` to +2. **if you are planing to use clang-omp** - in step 3 and/or 4, change line 9 in `xgboost/src/utils/omp.h` to ```C++ - #include /* instead of #include */` + #include /* instead of #include */` ``` - to make it work, otherwise you might get this error - + to make it work, otherwise you might get this error + `src/tree/../utils/omp.h:9:10: error: 'omp.h' file not found...` @@ -43,11 +43,11 @@ Here is the complete solution to use OpenMp-enabled compilers to install XGBoost export CXX = clang-omp++ ``` - Remember to change `header` (mentioned in step 2) if using clang-omp. - + Remember to change `header` (mentioned in step 2) if using clang-omp. + Then `cd xgboost` then `bash build.sh` to compile XGBoost. And go to `wrapper` sub-folder to install python version. -4. Set the `Makevars` file in highest piority for R. +4. Set the `Makevars` file in highest piority for R. The point is, there are three `Makevars` : `~/.R/Makevars`, `xgboost/R-package/src/Makevars`, and `/usr/local/Cellar/r/3.2.0/R.framework/Resources/etc/Makeconf` (the last one obtained by running `file.path(R.home("etc"), "Makeconf")` in R), and `SHLIB_OPENMP_CXXFLAGS` is not set by default!! After trying, it seems that the first one has highest piority (surprise!). 
@@ -75,21 +75,21 @@ Here is the complete solution to use OpenMp-enabled compilers to install XGBoost
 
    Again, remember to change `header` if using clang-omp.
 
-   Then inside R, run 
+   Then inside R, run
 
    ```R
    install.packages('xgboost/R-package/', repos=NULL, type='source')
    ```
-   
+
   Or
-   
+
   ```R
   devtools::install_local('xgboost/', subdir = 'R-package') # you may use devtools
   ```
 
 Build with HDFS and S3 Support
-=====
+------------------------------
 * To build xgboost for use with HDFS/S3 support and distributed learning, it is recommended to build with dmlc, with the following steps
   - ```git clone https://github.com/dmlc/dmlc-core```
   - Follow the instructions in dmlc-core/make/config.mk to compile libdmlc.a
diff --git a/doc/conf.py b/doc/conf.py
index b08f495f5..05e1e91ba 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -22,7 +22,13 @@ libpath = os.path.join(curr_path, '../python-package/')
 sys.path.insert(0, libpath)
 sys.path.insert(0, curr_path)
 
-from sphinx_util import MarkdownParser
+from sphinx_util import MarkdownParser, AutoStructify
+
+# -- mock out modules
+import mock
+MOCK_MODULES = ['numpy', 'scipy', 'scipy.sparse', 'sklearn', 'matplotlib']
+for mod_name in MOCK_MODULES:
+    sys.modules[mod_name] = mock.Mock()
 
 # -- General configuration ------------------------------------------------
 
@@ -155,4 +161,7 @@ def setup(app):
     # Add hook for building doxygen xml when needed
     # no c++ API for now
     # app.connect("builder-inited", generate_doxygen_xml)
-    pass
+    app.add_config_value('recommonmark_config', {
+            'url_resolver': lambda url: github_doc_root + url,
+            }, True)
+    app.add_transform(AutoStructify)
diff --git a/doc/dev-guide/contribute.md b/doc/dev-guide/contribute.md
new file mode 100644
index 000000000..5d8f7c26c
--- /dev/null
+++ b/doc/dev-guide/contribute.md
@@ -0,0 +1,13 @@
+Developer Guide
+===============
+This page contains the guide for developers of xgboost. XGBoost has been developed and used by a group of active community members.
+Everyone is more than welcome to contribute; it is a great way to make the project better.
+The project is maintained by a committee of [committers](../../CONTRIBUTORS.md#committers) who will review and merge pull requests from contributors.
+
+Contributing Code
+=================
+* The C++ code follows the Google C++ style
+* We follow the numpy style to document our python module
+* Tools to precheck codestyle
+  - clone https://github.com/dmlc/dmlc-core into the root directory
+  - type ```make lint``` and fix possible errors.
diff --git a/doc/faq.md b/doc/faq.md
new file mode 100644
index 000000000..5c985182a
--- /dev/null
+++ b/doc/faq.md
@@ -0,0 +1,61 @@
+Frequently Asked Questions
+==========================
+This document contains frequently asked questions about xgboost.
+
+How to tune parameters
+----------------------
+See the [Parameter Tuning Guide](param_tuning.md)
+
+
+I have a big dataset
+--------------------
+XGBoost is designed to be memory efficient. Usually it can handle problems as long as the data fits into your memory
+(this usually means millions of instances).
+If you are running out of memory, check out the [external memory version](external_memory.md) or
+[distributed version](https://github.com/dmlc/wormhole/tree/master/learn/xgboost) of xgboost.
+
+
+Running xgboost on Platform X (Hadoop/Yarn, Mesos)
+--------------------------------------------------
+The distributed version of XGBoost is designed to be portable to various environments.
+Distributed XGBoost can be ported to any platform that supports [rabit](https://github.com/dmlc/rabit).
+You can directly run xgboost on Yarn.
In theory Mesos and other resource allocation engines can be easily supported as well.
+
+
+Why not implement distributed xgboost on top of X (Spark, Hadoop)
+-----------------------------------------------------------------
+The first fact we need to know is that going distributed does not necessarily solve all the problems.
+Instead, it creates more problems, such as more communication overhead and the need for fault tolerance.
+The ultimate question will still come back to how to push the limit of each computation node
+and use fewer resources to complete the task (thus with less communication and a lower chance of failure).
+
+To achieve these goals, we decided to reuse the optimizations in the single-node xgboost and build the distributed version on top of it.
+The demand for communication in machine learning is rather simple, in the sense that we can depend on a limited set of APIs (in our case, rabit).
+Such a design allows us to reuse most of the code, while being portable to major platforms such as Hadoop/Yarn, MPI, and SGE.
+Most importantly, it pushes the limit of the computation resources we can use.
+
+
+How can I port the model to my own system
+-----------------------------------------
+The model and data format of XGBoost are exchangeable,
+which means a model trained in one language can be loaded in another.
+This means you can train the model using R, while running prediction using
+Java or C++, which are more common in production systems.
+You can also train the model using the distributed version,
+and load it from Python to do some interactive analysis.
+
+
+Do you support LambdaMART
+-------------------------
+Yes, xgboost implements LambdaMART. Check out the objective section in [parameters](parameter.md)
+
+
+How to deal with Missing Value
+------------------------------
+xgboost supports missing values by default
+
+
+Slightly different result between runs
+--------------------------------------
+This could happen, due to non-determinism in floating point summation order and multi-threading.
+Though the general accurac will usually remain the same.
\ No newline at end of file
diff --git a/doc/index.md b/doc/index.md
index 5d8d5b26f..7c41d15e2 100644
--- a/doc/index.md
+++ b/doc/index.md
@@ -1,28 +1,45 @@
 XGBoost Documentation
 =====================
+This is the documentation of the xgboost library.
+XGBoost is short for eXtreme gradient boosting. This is a library that is designed and optimized for boosted (tree) algorithms.
+The goal of this library is to push the extreme of the computation limits of machines to provide a ***scalable***, ***portable*** and ***accurate*** library
+for large scale tree boosting.
 
-
-* [Using XGBoost in Python](python/python_intro.md)
-* [Using XGBoost in R](../R-package/vignettes/xgboostPresentation.Rmd)
-* [Learning to use xgboost by example](../demo)
-* [External Memory Version](external_memory.md)
-* [Text input format](input_format.md)
-* [Build Instruction](build.md)
-* [Notes on the Code](../src)
-* List of all parameters and their usage: [Parameters](parameter.md)
-  - [Notes on Parameter Tunning](param_tuning.md)
-* Learning about the model: [Introduction to Boosted Trees](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)
-
+This document is hosted at http://xgboost.readthedocs.org/. You can also browse most of the documents on GitHub directly.
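As a concrete companion to the big-dataset FAQ entry above, the external-memory mode is enabled purely through the file-name convention from the external memory doc earlier in this series; a minimal sketch (the path and the `dtrain.cache` name are the doc's examples, not fixed values):

```python
import xgboost as xgb

# appending '#<cache file>' to the path switches on the external-memory
# version; xgboost caches the parsed data into 'dtrain.cache'
dtrain = xgb.DMatrix('../data/agaricus.txt.train#dtrain.cache')
```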
How to Get Started
------------------
-* Try to read the [binary classification example](../demo/binary_classification) for getting started example
-* Find the guide specific language guide above for the language you like to use
-* [Learning to use xgboost by example](../demo) contains lots of useful examples
+The best way to get started with xgboost is by examples. There are three types of examples you can find in xgboost.
+* [Tutorials](#tutorials) are self-contained tutorials on complete data science tasks.
+* [XGBoost Code Examples](../demo/) are collections of code and benchmarks of xgboost.
+  - There is a walkthrough section in these to walk you through specific API features.
+* [Highlight Solutions](#highlight-solutions) are presentations using xgboost to solve real-world problems.
+  - These examples are usually more advanced. You can usually find state-of-the-art solutions to many problems and challenges here.
 
-Example Highlight Links
------------------------
+After you get familiar with the interface, check out the following additional resources
+* [Frequently Asked Questions](faq.md)
+* [Learning what is behind: Introduction to Boosted Trees](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)
+* [User Guide](#user-guide) contains a comprehensive list of documents of xgboost.
+* [Developer Guide](dev-guide/contribute.md)
+
+Tutorials
+---------
+Tutorials are self-contained materials that teach you how to achieve a complete data science task with xgboost; these
+are great resources to learn xgboost by real examples. If you think you have something that belongs here, send a pull request.
+* [Binary classification using XGBoost Command Line](../demo/binary_classification/) (CLI)
+  - This tutorial introduces the basic usage of the CLI version of xgboost
+* [Introduction of XGBoost in Python](python/python_intro.md) (python)
+  - This tutorial introduces the python package of xgboost
+* [Introduction to XGBoost in R](../R-package/vignettes/xgboostPresentation.Rmd) (R package)
+  - This is a general presentation about xgboost in R.
+* [Discover your data with XGBoost in R](../R-package/vignettes/discoverYourData.Rmd) (R package)
+  - This tutorial explains feature analysis in xgboost.
+* [Understanding XGBoost Model on Otto Dataset](../demo/kaggle-otto/understandingXGBoostModel.Rmd) (R package)
+  - This tutorial teaches you how to use xgboost to compete in the Kaggle Otto challenge.
+
+Highlight Solutions
+-------------------
 This section is about blogposts, presentation and videos discussing how to use xgboost to solve your interesting problem. If you think something belongs to here, send a pull request.
* [Kaggle CrowdFlower winner's solution by Chenglong Chen](https://github.com/ChenglongChen/Kaggle_CrowdFlower) * [Kaggle Malware Prediction winner's solution](https://github.com/xiaozhouwang/kaggle_Microsoft_Malware) @@ -31,14 +48,25 @@ This section is about blogposts, presentation and videos discussing how to use x * Video tutorial: [Better Optimization with Repeated Cross Validation and the XGBoost model](https://www.youtube.com/watch?v=Og7CGAfSr_Y) * [Winning solution of Kaggle Higgs competition: what a single model can do](http://no2147483647.wordpress.com/2014/09/17/winning-solution-of-kaggle-higgs-competition-what-a-single-model-can-do/) +User Guide +---------- +* [Frequently Asked Questions](faq.md) +* [Introduction to Boosted Trees](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf) +* [Using XGBoost in Python](python/python_intro.md) +* [Using XGBoost in R](../R-package/vignettes/xgboostPresentation.Rmd) +* [Learning to use XGBoost by Example](../demo) +* [External Memory Version](external_memory.md) +* [Text input format](input_format.md) +* [Build Instruction](build.md) +* [Parameters](parameter.md) +* [Notes on Parameter Tunning](param_tuning.md) + + +Developer Guide +--------------- +* [Developer Guide](dev-guide/contribute.md) + API Reference ------------- - * [Python API Reference](python/python_api.rst) +* [Python API Reference](python/python_api.rst) -Contribution ------------- -Contribution of documents and use-cases are welcomed! -* This package use Google C++ style -* Check tool of codestyle - - clone https://github.com/dmlc/dmlc-core into root directory - - type ```make lint``` and fix possible errors. diff --git a/doc/param_tuning.md b/doc/param_tuning.md index 78263a6a8..c5848f602 100644 --- a/doc/param_tuning.md +++ b/doc/param_tuning.md @@ -1,5 +1,5 @@ Notes on Parameter Tuning -==== +========================= Parameter tuning is a dark art in machine learning, the optimal parameters of a model can depend on many scenarios. So it is impossible to create a comprehensive guide for doing so. @@ -8,7 +8,7 @@ This document tries to provide some guideline for parameters in xgboost. Understanding Bias-Variance Tradeoff -==== +------------------------------------ If you take a machine learning or statistics course, this is likely to be one of the most important concepts. When we allow the model to get more complicated (e.g. more depth), the model @@ -22,7 +22,7 @@ will make the model more conservative or not. This can be used to help you turn the knob between complicated model and simple model. Control Overfitting -==== +------------------- When you observe high training accuracy, but low tests accuracy, it is likely that you encounter overfitting problem. There are in general two ways that you can control overfitting in xgboost @@ -31,9 +31,9 @@ There are in general two ways that you can control overfitting in xgboost * The second way is to add randomness to make training robust to noise - This include ```subsample```, ```colsample_bytree``` - You can also reduce stepsize ```eta```, but needs to remember to increase ```num_round``` when you do so. - -Handle Imbalanced Dataset -=== + +Handle Imbalanced Dataset +------------------------- For common cases such as ads clickthrough log, the dataset is extremely imbalanced. This can affect the training of xgboost model, and there are two ways to improve it. 
* If you care only about the ranking order (AUC) of your prediction diff --git a/doc/parameter.md b/doc/parameter.md index 53cdd806f..4e0f365bf 100644 --- a/doc/parameter.md +++ b/doc/parameter.md @@ -3,13 +3,15 @@ XGBoost Parameters Before running XGboost, we must set three types of parameters, general parameters, booster parameters and task parameters: - General parameters relates to which booster we are using to do boosting, commonly tree or linear model - Booster parameters depends on which booster you have chosen -- Task parameters that decides on the learning scenario, for example, regression tasks may use different parameters with ranking tasks. -- In addition to these parameters, there can be console parameters that relates to behavior of console version of xgboost(e.g. when to save model) +- Learning Task parameters that decides on the learning scenario, for example, regression tasks may use different parameters with ranking tasks. +- Command line parameters that relates to behavior of CLI version of xgboost. -### Parameters in R Package +Parameters in R Package +----------------------- In R-package, you can use .(dot) to replace under score in the parameters, for example, you can use max.depth as max_depth. The underscore parameters are also valid in R. -### General Parameters +General Parameters +------------------ * booster [default=gbtree] - which booster to use, can be gbtree or gblinear. gbtree uses tree based model while gblinear uses linear function. * silent [default=0] @@ -21,10 +23,8 @@ In R-package, you can use .(dot) to replace under score in the parameters, for e * num_feature [set automatically by xgboost, no need to be set by user] - feature dimension used in boosting, set to maximum dimension of the feature -### Booster Parameters -From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parameters. Parameter with or without bst: prefix will be equivalent(i.e. both bst:eta and eta will be valid parameter setting) . - -#### Parameter for Tree Booster +Parameters for Tree Booster +--------------------------- * eta [default=0.3] - step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinks the feature weights to make the boosting process more conservative. - range: [0,1] @@ -47,7 +47,8 @@ From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parame - subsample ratio of columns when constructing each tree. - range: (0,1] -#### Parameter for Linear Booster +Parameters for Linear Booster +----------------------------- * lambda [default=0] - L2 regularization term on weights * alpha [default=0] @@ -55,7 +56,8 @@ From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parame * lambda_bias - L2 regularization term on bias, default 0(no L1 reg on bias because it is not important) -### Task Parameters +Learning Task Parameters +------------------------ * objective [ default=reg:linear ] - specify the learning task and the corresponding learning objective, and the objective options are below: - "reg:linear" --linear regression @@ -87,7 +89,8 @@ training repeatively * seed [ default=0 ] - random number seed. 
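Putting the parameter groups documented above together, a typical parameter dictionary on the Python side might look like this minimal sketch; the values are the documented defaults, shown for illustration rather than as tuned recommendations:

```python
# minimal sketch combining general, booster, and learning task parameters;
# values are illustrative defaults from this page, not recommendations
param = {
    'booster': 'gbtree',        # general parameter: tree-based model
    'eta': 0.3,                 # tree booster: step size shrinkage, range [0,1]
    'subsample': 1.0,           # tree booster: row subsampling, range (0,1]
    'objective': 'reg:linear',  # learning task: linear regression
    'seed': 0,                  # random number seed
}
```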
-### Console Parameters +Command Line Parameters +----------------------- The following parameters are only used in the console version of xgboost * use_buffer [ default=1 ] - whether create binary buffer for text input, this normally will speedup loading when do diff --git a/doc/python-requirements.txt b/doc/python-requirements.txt new file mode 100644 index 000000000..1a041d154 --- /dev/null +++ b/doc/python-requirements.txt @@ -0,0 +1,2 @@ +commonmark + diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst index e665efe84..85249cbc4 100644 --- a/doc/python/python_api.rst +++ b/doc/python/python_api.rst @@ -1,6 +1,8 @@ Python API Reference ==================== -This page gives the Python API reference of xgboost. +This page gives the Python API reference of xgboost, please also refer to Python Package Introduction for more information about python package. + +The document in this page is automatically generated by sphinx. The content do not render at github, you can view it at http://xgboost.readthedocs.org/en/latest/python/python_api.html Core Data Structure ------------------- @@ -33,4 +35,3 @@ Scikit-Learn API .. autoclass:: xgboost.XGBClassifier :members: :show-inheritance: - diff --git a/doc/python/python_intro.md b/doc/python/python_intro.md index 2acb73b3c..2b670a053 100644 --- a/doc/python/python_intro.md +++ b/doc/python/python_intro.md @@ -1,32 +1,27 @@ -XGBoost Python Module -===================== +Python Package Introduction +=========================== +This document gives a basic walkthrough of xgboost python package. -This page will introduce XGBoost Python module, including: -* [Building and Import](#building-and-import) -* [Data Interface](#data-interface) -* [Setting Parameters](#setting-parameters) -* [Train Model](#training-model) -* [Early Stopping](#early-stopping) -* [Prediction](#prediction) -* [API Reference](python_api.md) +***List of other Helpful Links*** +* [Python walkthrough code collections](https://github.com/tqchen/xgboost/blob/master/demo/guide-python) +* [Python API Reference](python_api.rst) -A [walk through python example](https://github.com/tqchen/xgboost/blob/master/demo/guide-python) for UCI Mushroom dataset is provided. - -= -#### Install - -To install XGBoost, you need to run `make` in the root directory of the project and then in the `python-package` directory run +Install XGBoost +--------------- +To install XGBoost, do the following steps. +* You need to run `make` in the root directory of the project +* In the `python-package` directory run ```shell python setup.py install ``` -Then import the module in Python as usual + ```python import xgboost as xgb ``` -= -#### Data Interface +Data Interface +-------------- XGBoost python module is able to loading from libsvm txt format file, Numpy 2D array and xgboost binary buffer file. The data will be store in ```DMatrix``` object. * To load libsvm text format file and XGBoost binary file into ```DMatrix```, the usage is like @@ -42,8 +37,8 @@ dtrain = xgb.DMatrix( data, label=label) ``` * Build ```DMatrix``` from ```scipy.sparse``` ```python -csr = scipy.sparse.csr_matrix( (dat, (row,col)) ) -dtrain = xgb.DMatrix( csr ) +csr = scipy.sparse.csr_matrix((dat, (row, col))) +dtrain = xgb.DMatrix(csr) ``` * Saving ```DMatrix``` into XGBoost binary file will make loading faster in next time. 
The usage is like:
 ```python
 dtrain.save_binary("train.buffer")
 ```
 * To handle missing values in ```DMatrix```, you can initialize the ```DMatrix``` like:
 ```python
-dtrain = xgb.DMatrix( data, label=label, missing = -999.0)
+dtrain = xgb.DMatrix(data, label=label, missing = -999.0)
 ```
 * Weight can be set when needed, like
 ```python
-w = np.random.rand(5,1)
-dtrain = xgb.DMatrix( data, label=label, missing = -999.0, weight=w)
+w = np.random.rand(5, 1)
+dtrain = xgb.DMatrix(data, label=label, missing = -999.0, weight=w)
 ```
 
-= 
-#### Setting Parameters
-XGBoost use list of pair to save [parameters](parameter.md). Eg
+Setting Parameters
+------------------
+XGBoost uses a list of pairs to save [parameters](../parameter.md), e.g.
 * Booster parameters
 ```python
 param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
@@ -77,8 +71,9 @@ plst += [('eval_metric', 'ams@0')]
 evallist = [(dtest,'eval'), (dtrain,'train')]
 ```
 
-=
-#### Training Model
+Training
+--------
+
 With parameter list and data, you are able to train a model.
 * Training
 ```python
@@ -104,10 +99,11 @@ After you save your model, you can load model file at anytime by using
 bst = xgb.Booster({'nthread':4}) #init model
 bst.load_model("model.bin") # load model
 ```
-=
-#### Early stopping
-If you have a validation set, you can use early stopping to find the optimal number of boosting rounds. Early stopping requires at least one set in `evals`. If there's more than one, it will use the last.
+Early Stopping
+--------------
+If you have a validation set, you can use early stopping to find the optimal number of boosting rounds.
+Early stopping requires at least one set in `evals`. If there's more than one, it will use the last.
 
 `train(..., evals=evals, early_stopping_rounds=10)`
 
@@ -117,13 +113,14 @@ If early stopping occurs, the model will have two additional fields: `bst.best_s
 
 This works with both metrics to minimize (RMSE, log loss, etc.) and to maximize (MAP, NDCG, AUC).
 
-=
-#### Prediction
+Prediction
+----------
 After you have trained/loaded a model and prepared the data, you can start to do prediction.
 ```python
-data = np.random.rand(7,10) # 7 entities, each contains 10 features
-dtest = xgb.DMatrix( data, missing = -999.0 )
-ypred = bst.predict( xgmat )
+# 7 entities, each contains 10 features
+data = np.random.rand(7, 10)
+dtest = xgb.DMatrix(data)
+ypred = bst.predict(dtest)
 ```
 If early stopping is enabled during training, you can predict with the best iteration.
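And a one-line sketch of that last point, hedged in that `ntree_limit` is assumed to be the relevant `predict()` argument in this version of the package, with `bst.best_iteration` being the field set by early stopping as described above:

```python
# predict with only the trees up to the best round found by early stopping;
# assumes training above actually triggered early stopping
ypred = bst.predict(dtest, ntree_limit=bst.best_iteration)
```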
diff --git a/doc/sphinx_util.py b/doc/sphinx_util.py
index 33c98d381..0b5178630 100644
--- a/doc/sphinx_util.py
+++ b/doc/sphinx_util.py
@@ -1,50 +1,17 @@
 # -*- coding: utf-8 -*-
-"""Helper hacking utility function for customization."""
+"""Helper utility function for customization."""
 import sys
 import os
+import docutils
 import subprocess
 
-# TODO: make less hacky way than this one
 if os.environ.get('READTHEDOCS', None) == 'True':
-    subprocess.call('cd ..; rm -rf recommonmark;' +
+    subprocess.call('cd ..; rm -rf recommonmark recom;' +
                     'git clone https://github.com/tqchen/recommonmark;' +
-                    'cp recommonmark/recommonmark/parser.py doc/parser.py', shell=True)
+                    'mv recommonmark/recommonmark recom', shell=True)
 
 sys.path.insert(0, os.path.abspath('..'))
-import parser
+from recom import parser, transform
 
-class MarkdownParser(parser.CommonMarkParser):
-    github_doc_root = None
-    doc_suffix = set(['md', 'rst'])
-
-    @staticmethod
-    def remap_url(url):
-        if MarkdownParser.github_doc_root is None or url is None:
-            return url
-        if url.startswith('#'):
-            return url
-        arr = url.split('#', 1)
-        ssuffix = arr[0].rsplit('.', 1)
-
-        if len(ssuffix) == 2 and (ssuffix[-1] in MarkdownParser.doc_suffix
-                and arr[0].find('://') == -1):
-            arr[0] = ssuffix[0] + '.html'
-            return '#'.join(arr)
-        else:
-            if arr[0].find('://') == -1:
-                return MarkdownParser.github_doc_root + url
-            else:
-                return url
-
-    def reference(self, block):
-        block.destination = MarkdownParser.remap_url(block.destination)
-        return super(MarkdownParser, self).reference(block)
-
-# inplace modify the function in recommonmark module to allow link remap
-old_ref = parser.reference
-
-def reference(block):
-    block.destination = MarkdownParser.remap_url(block.destination)
-    return old_ref(block)
-
-parser.reference = reference
+MarkdownParser = parser.CommonMarkParser
+AutoStructify = transform.AutoStructify

From 014fa02c6a78d76783435dca29c6c9aa49bd3b23 Mon Sep 17 00:00:00 2001
From: Tong He
Date: Sun, 2 Aug 2015 19:03:44 -0700
Subject: [PATCH 73/83] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index ac29ef7eb..56275a92d 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 ===========
 [![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost)
 [![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org)
+[![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost)
 [![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 
 An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
From f7bb8fc10fc4c381ad2f02d8f86e695d6e8e6e7a Mon Sep 17 00:00:00 2001 From: Tong He Date: Sun, 2 Aug 2015 19:04:32 -0700 Subject: [PATCH 74/83] Update README.md --- R-package/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R-package/README.md b/R-package/README.md index 294691416..c92bc9b96 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -1,6 +1,9 @@ R package for xgboost ===================== +[![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost) +[![CRAN Downloads](http://cranlogs.r-pkg.org/badges/xgboost)](http://cran.rstudio.com/web/packages/xgboost/index.html) + Installation ------------ From bf94add99267b2fbac3ab471c19a19c23a65059f Mon Sep 17 00:00:00 2001 From: Tong He Date: Sun, 2 Aug 2015 19:09:33 -0700 Subject: [PATCH 75/83] Update faq.md --- doc/faq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/faq.md b/doc/faq.md index 5c985182a..63f949fad 100644 --- a/doc/faq.md +++ b/doc/faq.md @@ -58,4 +58,4 @@ xgboost support missing value by default Slightly different result between runs -------------------------------------- This could happen, due to non-determinism in floating point summation order and multi-threading. -Though the general accurac will usually remain the same. \ No newline at end of file +Though the general accuracy will usually remain the same. From 64dd1973b9ef8c608852b41a8383891ab6aacf6e Mon Sep 17 00:00:00 2001 From: muli Date: Mon, 3 Aug 2015 12:59:28 -0400 Subject: [PATCH 76/83] align logo with title --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 56275a92d..fb133ed49 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ - eXtreme Gradient Boosting + - eXtreme Gradient Boosting =========== [![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost) [![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org) From 81b1befd106475e824f195347c4795ee9df931f6 Mon Sep 17 00:00:00 2001 From: Ajinkya Kale Date: Mon, 3 Aug 2015 15:46:22 -0700 Subject: [PATCH 77/83] Adding dmlc stamp --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fb133ed49..35a977e25 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ - - eXtreme Gradient Boosting + +eXtreme Gradient Boosting =========== [![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost) [![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org) @@ -9,7 +10,7 @@ An optimized general purpose gradient boosting library. The library is paralleli It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework, including [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLM) and [Gradient Boosted Decision Trees](https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting) (GBDT). 
XGBoost can also be [distributed](#features) and scale to Terascale data -XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) projects + XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) projects Contents -------- From 7fe8b9583301542c6b0d4a29dd270f69040453ad Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 3 Aug 2015 19:36:29 -0700 Subject: [PATCH 78/83] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 35a977e25..ef9eb9a53 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ - -eXtreme Gradient Boosting + eXtreme Gradient Boosting =========== [![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost) [![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org) @@ -10,7 +9,7 @@ An optimized general purpose gradient boosting library. The library is paralleli It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework, including [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLM) and [Gradient Boosted Decision Trees](https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting) (GBDT). XGBoost can also be [distributed](#features) and scale to Terascale data - XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) projects +XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) projects Contents -------- From 889887c2f18117b8541e586d85048f4c17e6107f Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 3 Aug 2015 19:37:33 -0700 Subject: [PATCH 79/83] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ef9eb9a53..299524758 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ - eXtreme Gradient Boosting + eXtreme Gradient Boosting =========== [![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost) [![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org) From 3d38ebbef57743083d7cf95f6de30884dae1f45e Mon Sep 17 00:00:00 2001 From: EricChanBD Date: Wed, 5 Aug 2015 06:19:54 +0800 Subject: [PATCH 80/83] fix SetCombine and SetPrune bug --- src/utils/quantile.h | 89 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 13 deletions(-) diff --git a/src/utils/quantile.h b/src/utils/quantile.h index ffd9142da..677c4e12f 100644 --- a/src/utils/quantile.h +++ b/src/utils/quantile.h @@ -173,14 +173,6 @@ struct WQSummary { } } } - /*! \brief used for debug purpose, print the summary */ - inline void Print(void) const { - for (size_t i = 0; i < size; ++i) { - std::cout << "x=" << data[i].value << "\t" - << "[" << data[i].rmin << "," << data[i].rmax << "]" - << " wmin=" << data[i].wmin << std::endl; - } - } /*! 
 * \brief set current summary to be pruned summary of src
 * assume data field is already allocated to be at least maxsize
@@ -226,6 +218,8 @@ struct WQSummary {
   */
  inline void SetCombine(const WQSummary &sa,
                         const WQSummary &sb) {
+    utils::Check(sa.Check("BeforeCombine A"), "Check Left error");
+    utils::Check(sb.Check("BeforeCombine B"), "Check right error");
    if (sa.size == 0) {
      this->CopyFrom(sb); return;
    }
@@ -276,9 +270,74 @@ struct WQSummary {
      } while (b != b_end);
    }
    this->size = dst - data;
+    const RType tol = 10;
+    RType err_mingap, err_maxgap, err_wgap;
+    this->FixError(&err_mingap, &err_maxgap, &err_wgap);
+    if (err_mingap > tol || err_maxgap > tol || err_wgap > tol) {
+      utils::Printf("INFO: mingap=%g, maxgap=%g, wgap=%g\n",
+                    err_mingap, err_maxgap, err_wgap);
+    }
+
+    if (!this->Check("AfterCombine")) {
+      utils::Printf("-----Left-----\n");
+      sa.Print();
+      utils::Printf("-----Right-----\n");
+      sb.Print();
+      utils::Error("Error after combine\n");
+    }
    utils::Assert(size <= sa.size + sb.size, "bug in combine");
  }
+  // helper function to print the current content of sketch
+  inline void Print() const {
+    for (size_t i = 0; i < this->size; ++i) {
+      utils::Printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g\n",
+                    i, data[i].rmin, data[i].rmax,
+                    data[i].wmin, data[i].value);
+    }
+  }
+  // try to fix rounding error
+  // and re-establish invariance
+  inline void FixError(RType *err_mingap,
+                       RType *err_maxgap,
+                       RType *err_wgap) const {
+    *err_mingap = 0;
+    *err_maxgap = 0;
+    *err_wgap = 0;
+    RType prev_rmin = 0, prev_rmax = 0;
+    for (size_t i = 0; i < this->size; ++i) {
+      if (data[i].rmin < prev_rmin) {
+        *err_mingap = std::max(*err_mingap, prev_rmin - data[i].rmin);
+        data[i].rmin = prev_rmin;
+      } else {
+        prev_rmin = data[i].rmin;
+      }
+      if (data[i].rmax < prev_rmax) {
+        *err_maxgap = std::max(*err_maxgap, prev_rmax - data[i].rmax);
+        data[i].rmax = prev_rmax;
+      }
+      RType rmin_next = data[i].rmin_next();
+      if (data[i].rmax < rmin_next) {
+        *err_wgap = std::max(*err_wgap, rmin_next - data[i].rmax);
+        data[i].rmax = rmin_next;
+      }
+      prev_rmax = data[i].rmax;
+    }
+  }
+  // check consistency of the summary
+  inline bool Check(const char *msg) const {
+    const float tol = 10.0f;
+    for (size_t i = 0; i < this->size; ++i) {
+      if (data[i].rmin + data[i].wmin > data[i].rmax + tol ||
+          data[i].rmin < -1e-6f || data[i].rmax < -1e-6f) {
+        utils::Printf("----%s: Check not Pass------\n", msg);
+        this->Print();
+        return false;
+      }
+    }
+    return true;
+  }
};

/*!
\brief try to do efficient prunning */ template struct WXQSummary : public WQSummary { @@ -293,6 +352,7 @@ struct WXQSummary : public WQSummary { } // set prune inline void SetPrune(const WQSummary &src, size_t maxsize) { + utils::Check(src.Check("BeforePrune"), "Check src error"); if (src.size <= maxsize) { this->CopyFrom(src); return; } @@ -334,11 +394,7 @@ struct WXQSummary : public WQSummary { utils::Printf("LOG: srcsize=%lu, maxsize=%lu, range=%g, chunk=%g\n", src.size, maxsize, static_cast(range), static_cast(chunk)); - for (size_t i = 0; i < src.size; ++i) { - utils::Printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g, isbig=%d\n", i, - src.data[i].rmin, src.data[i].rmax, src.data[i].wmin, - src.data[i].value, CheckLarge(src.data[i], chunk)); - } + src.Print(); utils::Assert(nbig < n - 1, "quantile: too many large chunk"); } this->data[0] = src.data[0]; @@ -357,6 +413,12 @@ struct WXQSummary : public WQSummary { if (dx2 >= maxdx2) break; while (i < end && dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i; + if (i == end) { + utils::Printf("INFO: i==end reached, dx2=%g, i=%lu, end=%lu, mrange=%g, k=%lu, n=%lu, maxsize=%lu\n", + dx2, i, end, mrange, k, n, maxsize); + src.Print(); + break; + } if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) { if (i != lastidx) { this->data[this->size++] = src.data[i]; lastidx = i; @@ -377,6 +439,7 @@ struct WXQSummary : public WQSummary { begin += src.data[bid].rmin_next() - src.data[bid].rmax_prev(); } } + utils::Check(this->Check("AfterPrune"), "Check result error"); } }; /*! From 0f6ad749f5b1211e41db5d89ef053e2725931147 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 4 Aug 2015 19:40:30 -0700 Subject: [PATCH 81/83] remove debug messages fix lint --- src/utils/quantile.h | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/src/utils/quantile.h b/src/utils/quantile.h index 677c4e12f..adcd0222d 100644 --- a/src/utils/quantile.h +++ b/src/utils/quantile.h @@ -218,8 +218,6 @@ struct WQSummary { */ inline void SetCombine(const WQSummary &sa, const WQSummary &sb) { - utils::Check(sa.Check("BeforeCombine A"), "Check Left error"); - utils::Check(sb.Check("BeforeCombine B"), "Check right error"); if (sa.size == 0) { this->CopyFrom(sb); return; } @@ -270,7 +268,7 @@ struct WQSummary { } while (b != b_end); } this->size = dst - data; - const RType tol = 10; + const RType tol = 10; RType err_mingap, err_maxgap, err_wgap; this->FixError(&err_mingap, &err_maxgap, &err_wgap); if (err_mingap > tol || err_maxgap > tol || err_wgap > tol) { @@ -278,13 +276,6 @@ struct WQSummary { err_mingap, err_maxgap, err_wgap); } - if (!this->Check("AfterCombine")) { - utils::Printf("-----Left-----\n"); - sa.Print(); - utils::Printf("-----Right-----\n"); - sb.Print(); - utils::Error("Error after combine\n"); - } utils::Assert(size <= sa.size + sb.size, "bug in combine"); } // helper function to print the current content of sketch @@ -335,7 +326,7 @@ struct WQSummary { } } return true; - } + } }; /*! 
\brief try to do efficient prunning */ @@ -352,7 +343,6 @@ struct WXQSummary : public WQSummary { } // set prune inline void SetPrune(const WQSummary &src, size_t maxsize) { - utils::Check(src.Check("BeforePrune"), "Check src error"); if (src.size <= maxsize) { this->CopyFrom(src); return; } @@ -413,12 +403,7 @@ struct WXQSummary : public WQSummary { if (dx2 >= maxdx2) break; while (i < end && dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i; - if (i == end) { - utils::Printf("INFO: i==end reached, dx2=%g, i=%lu, end=%lu, mrange=%g, k=%lu, n=%lu, maxsize=%lu\n", - dx2, i, end, mrange, k, n, maxsize); - src.Print(); - break; - } + if (i == end) break; if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) { if (i != lastidx) { this->data[this->size++] = src.data[i]; lastidx = i; @@ -439,7 +424,6 @@ struct WXQSummary : public WQSummary { begin += src.data[bid].rmin_next() - src.data[bid].rmax_prev(); } } - utils::Check(this->Check("AfterPrune"), "Check result error"); } }; /*! From b30aa96a8852f59d69e85b49b3b77859aa51159b Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 4 Aug 2015 20:14:58 -0700 Subject: [PATCH 82/83] Update xgboost_R.cpp --- R-package/src/xgboost_R.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index 37a30c797..0f40ad848 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -39,7 +39,7 @@ double LogGamma(double v) { namespace random { void Seed(unsigned seed) { - warning("parameter seed is ignored, please set random seed using set.seed"); + //warning("parameter seed is ignored, please set random seed using set.seed"); } double Uniform(void) { return unif_rand(); From 752cf4c95d682a9a42508a3f7f82235a0bcc6290 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 4 Aug 2015 22:56:16 -0700 Subject: [PATCH 83/83] Update xgboost_R.cpp --- R-package/src/xgboost_R.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index 0f40ad848..1d426c496 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -39,7 +39,7 @@ double LogGamma(double v) { namespace random { void Seed(unsigned seed) { - //warning("parameter seed is ignored, please set random seed using set.seed"); + // warning("parameter seed is ignored, please set random seed using set.seed"); } double Uniform(void) { return unif_rand();
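The last two patches silence the R wrapper's seed warning rather than honor the argument; R users are expected to call `set.seed` themselves. In the Python wrapper, by contrast, the seed travels in the parameter dict. Combined with the FAQ note earlier in this series about thread count and floating-point summation order, a hedged reproducibility sketch (toy data; whether this removes all run-to-run variance is platform-dependent):
```python
import numpy as np
import xgboost as xgb

np.random.seed(0)                  # fix the toy data itself
X = np.random.rand(100, 10)
y = np.random.randint(2, size=100)
dtrain = xgb.DMatrix(X, label=y)

param = {
    'objective': 'binary:logistic',
    'seed': 0,     # consumed by the booster's own PRNG (e.g. subsampling)
    'nthread': 1,  # one thread pins the floating-point summation order
}
bst = xgb.train(param, dtrain, num_boost_round=10)
```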