diff --git a/Makefile b/Makefile index aa9bf632f..c9e35e80c 100644 --- a/Makefile +++ b/Makefile @@ -169,7 +169,7 @@ Rcheck: # lint requires dmlc to be in current folder lint: - dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package + dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package python-package clean: $(RM) -rf $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~ diff --git a/demo/.gitignore b/demo/.gitignore index e52797d15..ee79c704b 100644 --- a/demo/.gitignore +++ b/demo/.gitignore @@ -1 +1,2 @@ -*.libsvm \ No newline at end of file +*.libsvm +*.pkl diff --git a/demo/README.md b/demo/README.md index 49e9e52b8..fcfaa8434 100644 --- a/demo/README.md +++ b/demo/README.md @@ -1,14 +1,14 @@ XGBoost Examples ==== -This folder contains all the code examples using xgboost. +This folder contains all the code examples using xgboost. * Contributions of examples and benchmarks are more than welcome! * If you would like to share how you use xgboost to solve your problem, send a pull request:) - + Features Walkthrough ==== -This is a list of short codes introducing different functionalities of xgboost and its wrapper. -* Basic walkthrough of wrappers +This is a list of short examples introducing the different functionalities of the xgboost packages. +* Basic walkthrough of packages [python](guide-python/basic_walkthrough.py) [R](../R-package/demo/basic_walkthrough.R) [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl) @@ -20,18 +20,18 @@ This is a list of short codes introducing different functionalities of xgboost a [python](guide-python/boost_from_prediction.py) [R](../R-package/demo/boost_from_prediction.R) [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl) -* Predicting using first n trees +* Predicting using first n trees [python](guide-python/predict_first_ntree.py) [R](../R-package/demo/boost_from_prediction.R) - [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl) + [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl) * Generalized Linear Model [python](guide-python/generalized_linear_model.py) [R](../R-package/demo/generalized_linear_model.R) - [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/generalized_linear_model.jl) + [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/generalized_linear_model.jl) * Cross validation [python](guide-python/cross_validation.py) [R](../R-package/demo/cross_validation.R) - [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl) + [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl) * Predicting leaf indices [python](guide-python/predict_leaf_indices.py) [R](../R-package/demo/predict_leaf_indices.R) @@ -48,5 +48,5 @@ However, the parameter settings can be applied to all versions Benchmarks ==== * [Starter script for Kaggle Higgs Boson](kaggle-higgs) -* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution) +* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution) diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py index 56fed1dd2..7ce95b491 100755 --- a/demo/guide-python/sklearn_examples.py +++ b/demo/guide-python/sklearn_examples.py @@ -75,13 +75,3 @@ clf = xgb.XGBClassifier() clf.fit(X_train, y_train,
early_stopping_rounds=10, eval_metric="auc", eval_set=[(X_test, y_test)]) -# Custom evaluation function -from sklearn.metrics import log_loss - - -def log_loss_eval(y_pred, y_true): - return "log-loss", log_loss(y_true.get_label(), y_pred) - - -clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric=log_loss_eval, - eval_set=[(X_test, y_test)]) diff --git a/doc/python.md b/doc/python.md index dfe886fe9..93b5c43d4 100644 --- a/doc/python.md +++ b/doc/python.md @@ -14,7 +14,7 @@ A [walk through python example](https://github.com/tqchen/xgboost/blob/master/de = #### Install -To install XGBoost, you need to run `make` in the root directory of the project and then in the `wrappers` directory run +To install XGBoost, you need to run `make` in the root directory of the project and then in the `python-package` directory run ```shell python setup.py install diff --git a/python-package/.gitignore b/python-package/.gitignore new file mode 100644 index 000000000..d765c67c7 --- /dev/null +++ b/python-package/.gitignore @@ -0,0 +1,3 @@ +build +dist +*.egg* \ No newline at end of file diff --git a/python-package/README.md b/python-package/README.md new file mode 100644 index 000000000..a4ac71d4d --- /dev/null +++ b/python-package/README.md @@ -0,0 +1,7 @@ +XGBoost Python Package +====================== +* To make the python module, type ```./build.sh``` in the root directory of the project +* Make sure you have [setuptools](https://pypi.python.org/pypi/setuptools) installed +* Install with `python setup.py install` from this directory. +* Refer also to the walk-through example in the [demo folder](../demo/guide-python) +* **NOTE**: if you want to run XGBoost in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (available in Python 3.4+) or spawn backend; see the sklearn_parallel.py demo and the sketch below.
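The OpenMP note in the README above is the part new users most often trip over, so here is a minimal sketch of the forkserver approach it recommends. This sketch is not part of the diff: the data path `../data/agaricus.txt.train`, the parameter values, and the model file names are illustrative assumptions, and it presumes the `xgboost` package re-exports `DMatrix` and `train` as the demos do.

```python
# Sketch: train two boosters in separate processes using the forkserver
# start method (Python 3.4+), which avoids the fork/OpenMP deadlock the
# README warns about. Paths and parameters are placeholders.
import multiprocessing as mp

import xgboost as xgb


def train_one(seed):
    # each worker loads its own data and trains independently
    dtrain = xgb.DMatrix('../data/agaricus.txt.train')
    params = {'objective': 'binary:logistic', 'seed': seed}
    bst = xgb.train(params, dtrain, num_boost_round=10)
    bst.save_model('model_%d.bin' % seed)


if __name__ == '__main__':
    mp.set_start_method('forkserver')  # or 'spawn'
    workers = [mp.Process(target=train_one, args=(s,)) for s in (0, 1)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
```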
diff --git a/python-package/setup.py b/python-package/setup.py new file mode 100644 index 000000000..42e39f3ba --- /dev/null +++ b/python-package/setup.py @@ -0,0 +1,21 @@ +# pylint: disable=invalid-name +"""Setup xgboost package.""" +from __future__ import absolute_import +import sys +from setuptools import setup +sys.path.insert(0, '.') +import xgboost + +LIB_PATH = xgboost.core.find_lib_path() + +setup(name='xgboost', + version=xgboost.__version__, + description=xgboost.__doc__, + install_requires=[ + 'numpy', + 'scipy', + ], + zip_safe=False, + packages=['xgboost'], + data_files=[('xgboost', [LIB_PATH[0]])], + url='https://github.com/dmlc/xgboost') diff --git a/wrapper/xgboost.py b/python-package/xgboost/core.py similarity index 50% rename from wrapper/xgboost.py rename to python-package/xgboost/core.py index 32f9a52b4..85017cb82 100644 --- a/wrapper/xgboost.py +++ b/python-package/xgboost/core.py @@ -1,17 +1,10 @@ # coding: utf-8 -""" -xgboost: eXtreme Gradient Boosting library - -Version: 0.40 -Authors: Tianqi Chen, Bing Xu -Early stopping by Zygmunt Zając -""" -# pylint: disable=too-many-arguments, too-many-locals, too-many-lines, invalid-name, fixme +# pylint: disable=too-many-arguments +"""Core XGBoost Library.""" from __future__ import absolute_import import os import sys -import re import ctypes import platform import collections @@ -19,13 +12,6 @@ import collections import numpy as np import scipy.sparse -try: - from sklearn.base import BaseEstimator - from sklearn.base import RegressorMixin, ClassifierMixin - from sklearn.preprocessing import LabelEncoder - SKLEARN_INSTALLED = True -except ImportError: - SKLEARN_INSTALLED = False class XGBoostLibraryNotFound(Exception): """Error thrown when the xgboost library is not found""" pass @@ -35,7 +21,6 @@ class XGBoostError(Exception): """Error thrown by the xgboost trainer.""" pass -__all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train'] if sys.version_info[0] == 3: # pylint: disable=invalid-name STRING_TYPES = str, else: # pylint: disable=invalid-name STRING_TYPES = basestring, -def load_xglib(): - """Load the xgboost library.""" + +def find_lib_path(): + """Find the path to the xgboost dynamic library files. + + Returns + ------- + lib_path: list(string) + List of all found library paths to xgboost. + """ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - dll_path = [curr_path] + dll_path = [curr_path, os.path.join(curr_path, '../../wrapper/')] if os.name == 'nt': if platform.architecture()[0] == '64bit': - dll_path.append(os.path.join(curr_path, '../windows/x64/Release/')) + dll_path.append(os.path.join(curr_path, '../../windows/x64/Release/')) else: - dll_path.append(os.path.join(curr_path, '../windows/Release/')) + dll_path.append(os.path.join(curr_path, '../../windows/Release/')) if os.name == 'nt': dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path] else: dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path] lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] - if len(dll_path) == 0: + if len(lib_path) == 0: raise XGBoostLibraryNotFound( - 'cannot find find the files in the candicate path ' + str(dll_path)) + 'Cannot find the XGBoost library in the candidate paths. ' + + 'Did you run build.sh in the root path? Candidates: %s'
% str(dll_path)) + return lib_path + +def _load_lib(): + """Load the xgboost library.""" + lib_path = find_lib_path() lib = ctypes.cdll.LoadLibrary(lib_path[0]) lib.XGBGetLastError.restype = ctypes.c_char_p return lib # load the XGBoost library globally -_LIB = load_xglib() +_LIB = _load_lib() def _check_call(ret): """Check the return value of C API call @@ -117,7 +115,11 @@ def c_array(ctype, values): class DMatrix(object): - """Data Matrix used in XGBoost.""" + """Data Matrix used in XGBoost. + + DMatrix is an internal data structure used by XGBoost, + optimized for both memory efficiency and training speed. + """ def __init__(self, data, label=None, missing=0.0, weight=None, silent=False): """ Data matrix used in XGBoost. @@ -400,11 +402,14 @@ class DMatrix(object): class Booster(object): - """"A Booster of of XGBoost.""" + """A Booster of XGBoost. + + Booster is the model of xgboost, which contains the low-level routines for + training, prediction and evaluation. + """ def __init__(self, params=None, cache=(), model_file=None): # pylint: disable=invalid-name - """ - Learner class. + """Initialize the Booster. Parameters ---------- @@ -735,570 +740,3 @@ class Booster(object): else: fmap[fid] += 1 return fmap - - -def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, - early_stopping_rounds=None, evals_result=None, verbose_eval=True): - # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init - """Train a booster with given parameters. - - Parameters - ---------- - params : dict - Booster params. - dtrain : DMatrix - Data to be trained. - num_boost_round: int - Number of boosting iterations. - watchlist (evals): list of pairs (DMatrix, string) - List of items to be evaluated during training, this allows user to watch - performance on the validation set. - obj : function - Customized objective function. - feval : function - Customized evaluation function. - early_stopping_rounds: int - Activates early stopping. Validation error needs to decrease at least - every round(s) to continue training. - Requires at least one item in evals. - If there's more than one, will use the last. - Returns the model from the last iteration (not the best one). - If early stopping occurs, the model will have two additional fields: - bst.best_score and bst.best_iteration. - evals_result: dict - This dictionary stores the evaluation results of all the items in watchlist - verbose_eval : bool - If `verbose_eval` then the evaluation metric on the validation set, if - given, is printed at each boosting stage.
- - Returns - ------- - booster : a trained booster model - """ - evals = list(evals) - bst = Booster(params, [dtrain] + [d[0] for d in evals]) - - if evals_result is not None: - if not isinstance(evals_result, dict): - raise TypeError('evals_result has to be a dictionary') - else: - evals_name = [d[1] for d in evals] - evals_result.clear() - evals_result.update({key: [] for key in evals_name}) - - if not early_stopping_rounds: - for i in range(num_boost_round): - bst.update(dtrain, i, obj) - if len(evals) != 0: - bst_eval_set = bst.eval_set(evals, i, feval) - if isinstance(bst_eval_set, STRING_TYPES): - msg = bst_eval_set - else: - msg = bst_eval_set.decode() - - if verbose_eval: - sys.stderr.write(msg + '\n') - if evals_result is not None: - res = re.findall(":-?([0-9.]+).", msg) - for key, val in zip(evals_name, res): - evals_result[key].append(val) - return bst - - else: - # early stopping - if len(evals) < 1: - raise ValueError('For early stopping you need at least one set in evals.') - - sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\ - evals[-1][1], early_stopping_rounds)) - - # is params a list of tuples? are we using multiple eval metrics? - if isinstance(params, list): - if len(params) != len(dict(params).items()): - raise ValueError('Check your params.'\ - 'Early stopping works with single eval metric only.') - params = dict(params) - - # either minimize loss or maximize AUC/MAP/NDCG - maximize_score = False - if 'eval_metric' in params: - maximize_metrics = ('auc', 'map', 'ndcg') - if any(params['eval_metric'].startswith(x) for x in maximize_metrics): - maximize_score = True - - if maximize_score: - best_score = 0.0 - else: - best_score = float('inf') - - best_msg = '' - best_score_i = 0 - - for i in range(num_boost_round): - bst.update(dtrain, i, obj) - bst_eval_set = bst.eval_set(evals, i, feval) - - if isinstance(bst_eval_set, STRING_TYPES): - msg = bst_eval_set - else: - msg = bst_eval_set.decode() - - if verbose_eval: - sys.stderr.write(msg + '\n') - - if evals_result is not None: - res = re.findall(":-([0-9.]+).", msg) - for key, val in zip(evals_name, res): - evals_result[key].append(val) - - score = float(msg.rsplit(':', 1)[1]) - if (maximize_score and score > best_score) or \ - (not maximize_score and score < best_score): - best_score = score - best_score_i = i - best_msg = msg - elif i - best_score_i >= early_stopping_rounds: - sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg)) - bst.best_score = best_score - bst.best_iteration = best_score_i - break - bst.best_score = best_score - bst.best_iteration = best_score_i - return bst - - -class CVPack(object): - """"Auxiliary datastruct to hold one fold of CV.""" - def __init__(self, dtrain, dtest, param): - """"Initialize the CVPack""" - self.dtrain = dtrain - self.dtest = dtest - self.watchlist = [(dtrain, 'train'), (dtest, 'test')] - self.bst = Booster(param, [dtrain, dtest]) - - def update(self, iteration, fobj): - """"Update the boosters for one iteration""" - self.bst.update(self.dtrain, iteration, fobj) - - def eval(self, iteration, feval): - """"Evaluate the CVPack for one iteration.""" - return self.bst.eval_set(self.watchlist, iteration, feval) - - -def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None): - """ - Make an n-fold list of CVPack from random indices. 
- """ - evals = list(evals) - np.random.seed(seed) - randidx = np.random.permutation(dall.num_row()) - kstep = len(randidx) / nfold - idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)] - ret = [] - for k in range(nfold): - dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i])) - dtest = dall.slice(idset[k]) - # run preprocessing on the data set if needed - if fpreproc is not None: - dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy()) - else: - tparam = param - plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals] - ret.append(CVPack(dtrain, dtest, plst)) - return ret - - -def aggcv(rlist, show_stdv=True): - # pylint: disable=invalid-name - """ - Aggregate cross-validation results. - """ - cvmap = {} - ret = rlist[0].split()[0] - for line in rlist: - arr = line.split() - assert ret == arr[0] - for it in arr[1:]: - if not isinstance(it, STRING_TYPES): - it = it.decode() - k, v = it.split(':') - if k not in cvmap: - cvmap[k] = [] - cvmap[k].append(float(v)) - for k, v in sorted(cvmap.items(), key=lambda x: x[0]): - v = np.array(v) - if not isinstance(ret, STRING_TYPES): - ret = ret.decode() - if show_stdv: - ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v)) - else: - ret += '\tcv-%s:%f' % (k, np.mean(v)) - return ret - - -def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), - obj=None, feval=None, fpreproc=None, show_stdv=True, seed=0): - # pylint: disable = invalid-name - """Cross-validation with given paramaters. - - Parameters - ---------- - params : dict - Booster params. - dtrain : DMatrix - Data to be trained. - num_boost_round : int - Number of boosting iterations. - nfold : int - Number of folds in CV. - metrics : list of strings - Evaluation metrics to be watched in CV. - obj : function - Custom objective function. - feval : function - Custom evaluation function. - fpreproc : function - Preprocessing function that takes (dtrain, dtest, param) and returns - transformed versions of those. - show_stdv : bool - Whether to display the standard deviation. - seed : int - Seed used to generate the folds (passed to numpy.random.seed). - - Returns - ------- - evaluation history : list(string) - """ - results = [] - cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc) - for i in range(num_boost_round): - for fold in cvfolds: - fold.update(i, obj) - res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv) - sys.stderr.write(res + '\n') - results.append(res) - return results - - -# used for compatiblity without sklearn -XGBModelBase = object -XGBClassifierBase = object -XGBRegressorBase = object -if SKLEARN_INSTALLED: - XGBModelBase = BaseEstimator - XGBRegressorBase = RegressorMixin - XGBClassifierBase = ClassifierMixin - -class XGBModel(XGBModelBase): - # pylint: disable=too-many-arguments, too-many-instance-attributes, invalid-name - """Implementation of the Scikit-Learn API for XGBoost. - - Parameters - ---------- - max_depth : int - Maximum tree depth for base learners. - learning_rate : float - Boosting learning rate (xgb's "eta") - n_estimators : int - Number of boosted trees to fit. - silent : boolean - Whether to print messages while running boosting. - objective : string - Specify the learning task and the corresponding learning objective. - - nthread : int - Number of parallel threads used to run xgboost. - gamma : float - Minimum loss reduction required to make a further partition on a leaf node of the tree. 
- min_child_weight : int - Minimum sum of instance weight(hessian) needed in a child. - max_delta_step : int - Maximum delta step we allow each tree's weight estimation to be. - subsample : float - Subsample ratio of the training instance. - colsample_bytree : float - Subsample ratio of columns when constructing each tree. - - base_score: - The initial prediction score of all instances, global bias. - seed : int - Random number seed. - missing : float, optional - Value in the data which needs to be present as a missing value. If - None, defaults to np.nan. - """ - def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, - silent=True, objective="reg:linear", - nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, - subsample=1, colsample_bytree=1, - base_score=0.5, seed=0, missing=None): - if not SKLEARN_INSTALLED: - raise XGBoostError('sklearn needs to be installed in order to use this module') - self.max_depth = max_depth - self.learning_rate = learning_rate - self.n_estimators = n_estimators - self.silent = silent - self.objective = objective - - self.nthread = nthread - self.gamma = gamma - self.min_child_weight = min_child_weight - self.max_delta_step = max_delta_step - self.subsample = subsample - self.colsample_bytree = colsample_bytree - - self.base_score = base_score - self.seed = seed - self.missing = missing if missing is not None else np.nan - self._Booster = None - - def __setstate__(self, state): - # backward compatiblity code - # load booster from raw if it is raw - # the booster now support pickle - bst = state["_Booster"] - if bst is not None and not isinstance(bst, Booster): - state["_Booster"] = Booster(model_file=bst) - self.__dict__.update(state) - - def booster(self): - """Get the underlying xgboost Booster of this model. - - This will raise an exception when fit was not called - - Returns - ------- - booster : a xgboost booster of underlying model - """ - if self._Booster is None: - raise XGBoostError('need to call fit beforehand') - return self._Booster - - def get_params(self, deep=False): - """Get parameter.s""" - params = super(XGBModel, self).get_params(deep=deep) - if params['missing'] is np.nan: - params['missing'] = None # sklearn doesn't handle nan. see #4725 - if not params.get('eval_metric', True): - del params['eval_metric'] # don't give as None param to Booster - return params - - def get_xgb_params(self): - """Get xgboost type parameters.""" - xgb_params = self.get_params() - - xgb_params['silent'] = 1 if self.silent else 0 - - if self.nthread <= 0: - xgb_params.pop('nthread', None) - return xgb_params - - def fit(self, X, y, eval_set=None, eval_metric=None, - early_stopping_rounds=None, verbose=True): - # pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init - """ - Fit the gradient boosting model - - Parameters - ---------- - X : array_like - Feature matrix - y : array_like - Labels - eval_set : list, optional - A list of (X, y) tuple pairs to use as a validation set for - early-stopping - eval_metric : str, callable, optional - If a str, should be a built-in evaluation metric to use. See - doc/parameter.md. If callable, a custom evaluation metric. The call - signature is func(y_predicted, y_true) where y_true will be a - DMatrix object such that you may need to call the get_label - method. It must return a str, value pair where the str is a name - for the evaluation and value is the value of the evaluation - function. This objective is always minimized. - early_stopping_rounds : int - Activates early stopping. 
Validation error needs to decrease at - least every round(s) to continue training. - Requires at least one item in evals. If there's more than one, - will use the last. Returns the model from the last iteration - (not the best one). If early stopping occurs, the model will - have two additional fields: bst.best_score and bst.best_iteration. - verbose : bool - If `verbose` and an evaluation set is used, writes the evaluation - metric measured on the validation set to stderr. - """ - trainDmatrix = DMatrix(X, label=y, missing=self.missing) - - eval_results = {} - if eval_set is not None: - evals = list(DMatrix(x[0], label=x[1]) for x in eval_set) - evals = list(zip(evals, ["validation_{}".format(i) for i in - range(len(evals))])) - else: - evals = () - - params = self.get_xgb_params() - - feval = eval_metric if callable(eval_metric) else None - if eval_metric is not None: - if callable(eval_metric): - eval_metric = None - else: - params.update({'eval_metric': eval_metric}) - - self._Booster = train(params, trainDmatrix, - self.n_estimators, evals=evals, - early_stopping_rounds=early_stopping_rounds, - evals_result=eval_results, feval=feval, - verbose_eval=verbose) - if eval_results: - eval_results = {k: np.array(v, dtype=float) - for k, v in eval_results.items()} - eval_results = {k: np.array(v) for k, v in eval_results.items()} - self.eval_results = eval_results - - if early_stopping_rounds is not None: - self.best_score = self._Booster.best_score - self.best_iteration = self._Booster.best_iteration - return self - - def predict(self, data): - # pylint: disable=missing-docstring,invalid-name - test_dmatrix = DMatrix(data, missing=self.missing) - return self.booster().predict(test_dmatrix) - - -class XGBClassifier(XGBModel, XGBClassifierBase): - # pylint: disable=missing-docstring,too-many-arguments,invalid-name - __doc__ = """ - Implementation of the scikit-learn API for XGBoost classification - """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) - - def __init__(self, max_depth=3, learning_rate=0.1, - n_estimators=100, silent=True, - objective="binary:logistic", - nthread=-1, gamma=0, min_child_weight=1, - max_delta_step=0, subsample=1, colsample_bytree=1, - base_score=0.5, seed=0, missing=None): - super(XGBClassifier, self).__init__(max_depth, learning_rate, - n_estimators, silent, objective, - nthread, gamma, min_child_weight, - max_delta_step, subsample, - colsample_bytree, - base_score, seed, missing) - - def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None, - early_stopping_rounds=None, verbose=True): - # pylint: disable = attribute-defined-outside-init,arguments-differ - """ - Fit gradient boosting classifier - - Parameters - ---------- - X : array_like - Feature matrix - y : array_like - Labels - sample_weight : array_like - Weight for each instance - eval_set : list, optional - A list of (X, y) pairs to use as a validation set for - early-stopping - eval_metric : str, callable, optional - If a str, should be a built-in evaluation metric to use. See - doc/parameter.md. If callable, a custom evaluation metric. The call - signature is func(y_predicted, y_true) where y_true will be a - DMatrix object such that you may need to call the get_label - method. It must return a str, value pair where the str is a name - for the evaluation and value is the value of the evaluation - function. This objective is always minimized. - early_stopping_rounds : int, optional - Activates early stopping. Validation error needs to decrease at - least every round(s) to continue training. 
- Requires at least one item in evals. If there's more than one, - will use the last. Returns the model from the last iteration - (not the best one). If early stopping occurs, the model will - have two additional fields: bst.best_score and bst.best_iteration. - verbose : bool - If `verbose` and an evaluation set is used, writes the evaluation - metric measured on the validation set to stderr. - """ - eval_results = {} - self.classes_ = list(np.unique(y)) - self.n_classes_ = len(self.classes_) - if self.n_classes_ > 2: - # Switch to using a multiclass objective in the underlying XGB instance - self.objective = "multi:softprob" - xgb_options = self.get_xgb_params() - xgb_options['num_class'] = self.n_classes_ - else: - xgb_options = self.get_xgb_params() - - feval = eval_metric if callable(eval_metric) else None - if eval_metric is not None: - if callable(eval_metric): - eval_metric = None - else: - xgb_options.update({"eval_metric": eval_metric}) - - if eval_set is not None: - # TODO: use sample_weight if given? - evals = list(DMatrix(x[0], label=x[1]) for x in eval_set) - nevals = len(evals) - eval_names = ["validation_{}".format(i) for i in range(nevals)] - evals = list(zip(evals, eval_names)) - else: - evals = () - - self._le = LabelEncoder().fit(y) - training_labels = self._le.transform(y) - - if sample_weight is not None: - train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight, - missing=self.missing) - else: - train_dmatrix = DMatrix(X, label=training_labels, - missing=self.missing) - - self._Booster = train(xgb_options, train_dmatrix, self.n_estimators, - evals=evals, - early_stopping_rounds=early_stopping_rounds, - evals_result=eval_results, feval=feval, - verbose_eval=verbose) - - if eval_results: - eval_results = {k: np.array(v, dtype=float) - for k, v in eval_results.items()} - self.eval_results = eval_results - - if early_stopping_rounds is not None: - self.best_score = self._Booster.best_score - self.best_iteration = self._Booster.best_iteration - - return self - - def predict(self, data): - test_dmatrix = DMatrix(data, missing=self.missing) - class_probs = self.booster().predict(test_dmatrix) - if len(class_probs.shape) > 1: - column_indexes = np.argmax(class_probs, axis=1) - else: - column_indexes = np.repeat(0, data.shape[0]) - column_indexes[class_probs > 0.5] = 1 - return self._le.inverse_transform(column_indexes) - - def predict_proba(self, data): - test_dmatrix = DMatrix(data, missing=self.missing) - class_probs = self.booster().predict(test_dmatrix) - if self.objective == "multi:softprob": - return class_probs - else: - classone_probs = class_probs - classzero_probs = 1.0 - classone_probs - return np.vstack((classzero_probs, classone_probs)).transpose() - -class XGBRegressor(XGBModel, XGBRegressorBase): - # pylint: disable=missing-docstring - __doc__ = """ - Implementation of the scikit-learn API for XGBoost regression - """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) diff --git a/windows/README.md b/windows/README.md index cb1cc9dd9..564c97d25 100644 --- a/windows/README.md +++ b/windows/README.md @@ -11,7 +11,7 @@ This should give you xgboost.exe for CLI version and xgboost_wrapper.dll for pyt Use Python Module ===== -* After you build the dll, you can install the Python package from the [../wrapper](../wrapper) folder +* After you build the dll, you can install the Python package from the [../python-package](../python-package) folder ``` python setup.py install diff --git a/wrapper/README.md b/wrapper/README.md index c5368bd7d..77316e15c 100644 --- 
a/wrapper/README.md +++ b/wrapper/README.md @@ -1,20 +1,9 @@ -Wrapper of XGBoost -===== -This folder provides wrapper of xgboost to other languages +XGBoost Wrappers +================ +This folder provides wrappers that create xgboost packages for other languages. -Python -===== -* To make the python module, type ```./build.sh``` in the root directory of project -* Make sure you have [setuptools](https://pypi.python.org/pypi/setuptools) -* Install with `python setup.py install` from this directory. -* Refer also to the walk through example in [demo folder](../demo/guide-python) -* **NOTE**: if you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the sklearn_parallel.py demo. - - -R -===== -* See [R-package](../R-package) - -Julia -===== -* See [XGBoost.jl](https://github.com/antinucleon/XGBoost.jl) +***Supported Language Packages*** +* [Python package](../python-package) +* [R-package](../R-package) +* [Java Package](../java) +* [Julia Package](https://github.com/antinucleon/XGBoost.jl) diff --git a/wrapper/__init__.py b/wrapper/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/wrapper/setup.py b/wrapper/setup.py deleted file mode 100644 index 5365d61b0..000000000 --- a/wrapper/setup.py +++ /dev/null @@ -1,39 +0,0 @@ -# pylint: disable=invalid-name -"""Setup xgboost package.""" -import os -import platform -from setuptools import setup - - -class XGBoostLibraryNotFound(Exception): - """Exception to raise when xgboost library cannot be found.""" - pass - - -curr_dir = os.path.dirname(os.path.abspath(__file__)) -dll_path = [curr_dir] - -if os.name == 'nt': - if platform.architecture()[0] == '64bit': - dll_path.append(os.path.join(curr_dir, '../windows/x64/Release/')) - else: - dll_path.append(os.path.join(curr_dir, '../windows/Release/')) - - -if os.name == 'nt': - dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path] -else: - dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path] - -lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] - -if len(lib_path) == 0: - raise XGBoostLibraryNotFound("XGBoost library not found. Did you run " - "../make?") -setup(name="xgboost", - version="0.40", - description="Python wrappers for XGBoost: eXtreme Gradient Boosting", - zip_safe=False, - py_modules=['xgboost'], - data_files=[('.', [lib_path[0]])], - url="https://github.com/dmlc/xgboost")
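For reference, the install flow this diff leaves behind is: run `./build.sh` (or `make`) in the repository root to produce `libxgboostwrapper.so`, then run `python setup.py install` inside `python-package/`. Below is a small post-install sanity check; it is a sketch rather than part of the diff, but it only uses names defined above (`xgboost.__version__` and `xgboost.core.find_lib_path`).

```python
# Post-install sanity check (a sketch, not part of this diff).
# find_lib_path() is defined in python-package/xgboost/core.py above and
# raises XGBoostLibraryNotFound when no compiled library can be located.
import xgboost

print(xgboost.__version__)           # the version string setup.py reuses
print(xgboost.core.find_lib_path())  # e.g. ['.../libxgboostwrapper.so']
```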