python package refactor into python-package

parent f6fed76e7e
commit c2fec29bfa
Makefile (2 changes)

@@ -169,7 +169,7 @@ Rcheck:
 
 # lint requires dmlc to be in current folder
 lint:
-	dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package
+	dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package python-package
 
 clean:
 	$(RM) -rf $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
demo/.gitignore (1 change)

@@ -1 +1,2 @@
 *.libsvm
+*.pkl
@@ -7,8 +7,8 @@ This folder contains all the code examples using xgboost.
 
 Features Walkthrough
 ====
-This is a list of short codes introducing different functionalities of xgboost and its wrapper.
-* Basic walkthrough of wrappers
+This is a list of short codes introducing different functionalities of xgboost packages.
+* Basic walkthrough of packages
   [python](guide-python/basic_walkthrough.py)
   [R](../R-package/demo/basic_walkthrough.R)
   [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl)
@@ -75,13 +75,3 @@ clf = xgb.XGBClassifier()
 clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
         eval_set=[(X_test, y_test)])
 
-# Custom evaluation function
-from sklearn.metrics import log_loss
-
-
-def log_loss_eval(y_pred, y_true):
-    return "log-loss", log_loss(y_true.get_label(), y_pred)
-
-
-clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric=log_loss_eval,
-        eval_set=[(X_test, y_test)])
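Note: the ten lines removed above are the walkthrough's custom evaluation metric example, not library code; the pattern still matches the sklearn-wrapper call signature func(y_predicted, y_true) documented later in this commit. A minimal sketch, reassembled from the deleted lines and assuming clf, X_train, y_train, X_test, y_test from the surrounding walkthrough:

```python
# y_true arrives as a DMatrix, hence get_label() before calling sklearn's log_loss.
from sklearn.metrics import log_loss

def log_loss_eval(y_pred, y_true):
    return "log-loss", log_loss(y_true.get_label(), y_pred)

clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric=log_loss_eval,
        eval_set=[(X_test, y_test)])
```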
@@ -14,7 +14,7 @@ A [walk through python example](https://github.com/tqchen/xgboost/blob/master/de
 =
 #### Install
 
-To install XGBoost, you need to run `make` in the root directory of the project and then in the `wrappers` directory run
+To install XGBoost, you need to run `make` in the root directory of the project and then in the `python-package` directory run
 
 ```shell
 python setup.py install
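Only the directory name changes here; the two-step sequence itself is unchanged. A hedged convenience sketch of the same steps driven from Python, assuming you start from the repository root:

```python
# Equivalent to: make && cd python-package && python setup.py install
import subprocess

subprocess.check_call(['make'])                                       # build the shared library
subprocess.check_call(['python', 'setup.py', 'install'], cwd='python-package')
```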
python-package/.gitignore (new file, 3 lines)

@@ -0,0 +1,3 @@
+build
+dist
+*.egg*
python-package/README.md (new file, 7 lines)

@@ -0,0 +1,7 @@
+XGBoost Python Package
+======================
+* To make the python module, type ```./build.sh``` in the root directory of project
+* Make sure you have [setuptools](https://pypi.python.org/pypi/setuptools)
+* Install with `python setup.py install` from this directory.
+* Refer also to the walk through example in [demo folder](../demo/guide-python)
+* **NOTE**: if you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the sklearn_parallel.py demo.
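The NOTE above is about OpenMP thread state not surviving fork(). A hedged illustration of the recommended alternative using the standard multiprocessing API (Python 3.4+):

```python
# With an OpenMP-enabled build, avoid the default 'fork' start method;
# 'forkserver' or 'spawn' gives each worker a fresh process image.
import multiprocessing as mp

if __name__ == '__main__':
    mp.set_start_method('forkserver')  # or 'spawn'
    # ... joblib / multiprocessing training code goes here ...
```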
python-package/setup.py (new file, 21 lines)

@@ -0,0 +1,21 @@
+# pylint: disable=invalid-name
+"""Setup xgboost package."""
+from __future__ import absolute_import
+import sys
+from setuptools import setup
+sys.path.insert(0, '.')
+import xgboost
+
+LIB_PATH = xgboost.core.find_lib_path()
+
+setup(name='xgboost',
+      version=xgboost.__version__,
+      description=xgboost.__doc__,
+      install_requires=[
+          'numpy',
+          'scipy',
+      ],
+      zip_safe=False,
+      packages=['xgboost'],
+      data_files=[('xgboost', [LIB_PATH[0]])],
+      url='https://github.com/dmlc/xgboost')
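The new setup.py takes the version and description from the package itself and ships the first hit from find_lib_path() as a data file. A quick post-install check, assuming the build above succeeded and the package is importable:

```python
# Both attributes below are exactly what setup() consumes in the file above.
import xgboost

print(xgboost.__version__)           # used as the setup() version
print(xgboost.core.find_lib_path())  # library candidates; setup() bundles the first
```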
@@ -1,17 +1,10 @@
 # coding: utf-8
-"""
-xgboost: eXtreme Gradient Boosting library
-
-Version: 0.40
-Authors: Tianqi Chen, Bing Xu
-Early stopping by Zygmunt Zając
-"""
-# pylint: disable=too-many-arguments, too-many-locals, too-many-lines, invalid-name, fixme
+# pylint: disable=too-many-arguments
+"""Core XGBoost Library."""
 from __future__ import absolute_import
 
 import os
 import sys
-import re
 import ctypes
 import platform
 import collections
@@ -19,13 +12,6 @@ import collections
 import numpy as np
 import scipy.sparse
 
-try:
-    from sklearn.base import BaseEstimator
-    from sklearn.base import RegressorMixin, ClassifierMixin
-    from sklearn.preprocessing import LabelEncoder
-    SKLEARN_INSTALLED = True
-except ImportError:
-    SKLEARN_INSTALLED = False
 
 class XGBoostLibraryNotFound(Exception):
     """Error throwed by when xgboost is not found"""
@@ -35,7 +21,6 @@ class XGBoostError(Exception):
     """Error throwed by xgboost trainer."""
     pass
 
-__all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train']
 
 if sys.version_info[0] == 3:
     # pylint: disable=invalid-name
@@ -44,30 +29,43 @@ else:
     # pylint: disable=invalid-name
     STRING_TYPES = basestring,
 
-def load_xglib():
-    """Load the xgboost library."""
+def find_lib_path():
+    """Load find the path to xgboost dynamic library files.
+
+    Returns
+    -------
+    lib_path: list(string)
+       List of all found library path to xgboost
+    """
     curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    dll_path = [curr_path]
+    dll_path = [curr_path, os.path.join(curr_path, '../../wrapper/')]
     if os.name == 'nt':
         if platform.architecture()[0] == '64bit':
-            dll_path.append(os.path.join(curr_path, '../windows/x64/Release/'))
+            dll_path.append(os.path.join(curr_path, '../../windows/x64/Release/'))
         else:
-            dll_path.append(os.path.join(curr_path, '../windows/Release/'))
+            dll_path.append(os.path.join(curr_path, '../../windows/Release/'))
     if os.name == 'nt':
         dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path]
     else:
         dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path]
     lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
-    if len(dll_path) == 0:
+    if len(lib_path) == 0:
         raise XGBoostLibraryNotFound(
-            'cannot find find the files in the candicate path ' + str(dll_path))
+            'Cannot find XGBoost Libarary in the candicate path %s,' +
+            'Did you run build.sh in root oath?' % str(dll_path))
+    return lib_path
+
+
+def _load_lib():
+    """Load xgboost Library."""
+    lib_path = find_lib_path()
     lib = ctypes.cdll.LoadLibrary(lib_path[0])
     lib.XGBGetLastError.restype = ctypes.c_char_p
 
     return lib
 
 # load the XGBoost library globally
-_LIB = load_xglib()
+_LIB = _load_lib()
 
 def _check_call(ret):
     """Check the return value of C API call
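This hunk splits the old load_xglib() in two: find_lib_path() only locates candidate binaries, so setup.py can reuse it to bundle the library, while _load_lib() wraps the first hit with ctypes at import time. A sketch of the equivalent calls, assuming the module is importable as xgboost.core (the path setup.py uses):

```python
import ctypes
from xgboost.core import find_lib_path

lib = ctypes.cdll.LoadLibrary(find_lib_path()[0])  # same call _load_lib() makes
lib.XGBGetLastError.restype = ctypes.c_char_p      # C-side errors come back as strings
```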
@@ -117,7 +115,11 @@ def c_array(ctype, values):
 
 
 class DMatrix(object):
-    """Data Matrix used in XGBoost."""
+    """Data Matrix used in XGBoost.
+
+    DMatrix is a internal data structure that used by XGBoost
+    which is optimized for both memory efficiency and training speed.
+    """
     def __init__(self, data, label=None, missing=0.0, weight=None, silent=False):
         """
         Data matrix used in XGBoost.
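The expanded docstring describes DMatrix as the internal, memory- and speed-optimized data container. A minimal construction sketch using the __init__ signature shown above, with synthetic data; it assumes the installed package re-exports DMatrix at the top level, as the old wrapper's __all__ did:

```python
import numpy as np
import xgboost as xgb  # assumes the refactored package is installed

data = np.random.rand(100, 10)          # 100 rows, 10 features
label = np.random.randint(2, size=100)  # binary labels
dtrain = xgb.DMatrix(data, label=label, missing=0.0)
```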
@@ -400,11 +402,14 @@ class DMatrix(object):
 
 
 class Booster(object):
-    """"A Booster of of XGBoost."""
+    """"A Booster of of XGBoost.
+
+    Booster is the model of xgboost, that contains low level routines for
+    training, prediction and evaluation.
+    """
     def __init__(self, params=None, cache=(), model_file=None):
         # pylint: disable=invalid-name
-        """
-        Learner class.
+        """Initialize the Booster.
 
         Parameters
         ----------
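Booster's docstring now states what the object is: the trained model, holding the low-level training, prediction, and evaluation routines. Per the __init__ signature above it takes params, a cache of DMatrix handles, and optionally a model file; a hedged sketch with illustrative parameter values:

```python
# dtrain is the DMatrix from the previous sketch; param names are illustrative.
bst = xgb.Booster(params={'max_depth': 2, 'objective': 'binary:logistic'},
                  cache=[dtrain])
```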
@@ -735,570 +740,3 @@ class Booster(object):
                 else:
                     fmap[fid] += 1
         return fmap
-
-
-def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
-          early_stopping_rounds=None, evals_result=None, verbose_eval=True):
-    # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
-    """Train a booster with given parameters.
-
-    Parameters
-    ----------
-    params : dict
-        Booster params.
-    dtrain : DMatrix
-        Data to be trained.
-    num_boost_round: int
-        Number of boosting iterations.
-    watchlist (evals): list of pairs (DMatrix, string)
-        List of items to be evaluated during training, this allows user to watch
-        performance on the validation set.
-    obj : function
-        Customized objective function.
-    feval : function
-        Customized evaluation function.
-    early_stopping_rounds: int
-        Activates early stopping. Validation error needs to decrease at least
-        every <early_stopping_rounds> round(s) to continue training.
-        Requires at least one item in evals.
-        If there's more than one, will use the last.
-        Returns the model from the last iteration (not the best one).
-        If early stopping occurs, the model will have two additional fields:
-        bst.best_score and bst.best_iteration.
-    evals_result: dict
-        This dictionary stores the evaluation results of all the items in watchlist
-    verbose_eval : bool
-        If `verbose_eval` then the evaluation metric on the validation set, if
-        given, is printed at each boosting stage.
-
-    Returns
-    -------
-    booster : a trained booster model
-    """
-    evals = list(evals)
-    bst = Booster(params, [dtrain] + [d[0] for d in evals])
-
-    if evals_result is not None:
-        if not isinstance(evals_result, dict):
-            raise TypeError('evals_result has to be a dictionary')
-        else:
-            evals_name = [d[1] for d in evals]
-            evals_result.clear()
-            evals_result.update({key: [] for key in evals_name})
-
-    if not early_stopping_rounds:
-        for i in range(num_boost_round):
-            bst.update(dtrain, i, obj)
-            if len(evals) != 0:
-                bst_eval_set = bst.eval_set(evals, i, feval)
-                if isinstance(bst_eval_set, STRING_TYPES):
-                    msg = bst_eval_set
-                else:
-                    msg = bst_eval_set.decode()
-
-                if verbose_eval:
-                    sys.stderr.write(msg + '\n')
-                if evals_result is not None:
-                    res = re.findall(":-?([0-9.]+).", msg)
-                    for key, val in zip(evals_name, res):
-                        evals_result[key].append(val)
-        return bst
-
-    else:
-        # early stopping
-        if len(evals) < 1:
-            raise ValueError('For early stopping you need at least one set in evals.')
-
-        sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\
-            evals[-1][1], early_stopping_rounds))
-
-        # is params a list of tuples? are we using multiple eval metrics?
-        if isinstance(params, list):
-            if len(params) != len(dict(params).items()):
-                raise ValueError('Check your params.'\
-                                 'Early stopping works with single eval metric only.')
-            params = dict(params)
-
-        # either minimize loss or maximize AUC/MAP/NDCG
-        maximize_score = False
-        if 'eval_metric' in params:
-            maximize_metrics = ('auc', 'map', 'ndcg')
-            if any(params['eval_metric'].startswith(x) for x in maximize_metrics):
-                maximize_score = True
-
-        if maximize_score:
-            best_score = 0.0
-        else:
-            best_score = float('inf')
-
-        best_msg = ''
-        best_score_i = 0
-
-        for i in range(num_boost_round):
-            bst.update(dtrain, i, obj)
-            bst_eval_set = bst.eval_set(evals, i, feval)
-
-            if isinstance(bst_eval_set, STRING_TYPES):
-                msg = bst_eval_set
-            else:
-                msg = bst_eval_set.decode()
-
-            if verbose_eval:
-                sys.stderr.write(msg + '\n')
-
-            if evals_result is not None:
-                res = re.findall(":-([0-9.]+).", msg)
-                for key, val in zip(evals_name, res):
-                    evals_result[key].append(val)
-
-            score = float(msg.rsplit(':', 1)[1])
-            if (maximize_score and score > best_score) or \
-                    (not maximize_score and score < best_score):
-                best_score = score
-                best_score_i = i
-                best_msg = msg
-            elif i - best_score_i >= early_stopping_rounds:
-                sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg))
-                bst.best_score = best_score
-                bst.best_iteration = best_score_i
-                break
-        bst.best_score = best_score
-        bst.best_iteration = best_score_i
-        return bst
-
-
-class CVPack(object):
-    """"Auxiliary datastruct to hold one fold of CV."""
-    def __init__(self, dtrain, dtest, param):
-        """"Initialize the CVPack"""
-        self.dtrain = dtrain
-        self.dtest = dtest
-        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
-        self.bst = Booster(param, [dtrain, dtest])
-
-    def update(self, iteration, fobj):
-        """"Update the boosters for one iteration"""
-        self.bst.update(self.dtrain, iteration, fobj)
-
-    def eval(self, iteration, feval):
-        """"Evaluate the CVPack for one iteration."""
-        return self.bst.eval_set(self.watchlist, iteration, feval)
-
-
-def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None):
-    """
-    Make an n-fold list of CVPack from random indices.
-    """
-    evals = list(evals)
-    np.random.seed(seed)
-    randidx = np.random.permutation(dall.num_row())
-    kstep = len(randidx) / nfold
-    idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
-    ret = []
-    for k in range(nfold):
-        dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i]))
-        dtest = dall.slice(idset[k])
-        # run preprocessing on the data set if needed
-        if fpreproc is not None:
-            dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
-        else:
-            tparam = param
-        plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals]
-        ret.append(CVPack(dtrain, dtest, plst))
-    return ret
-
-
-def aggcv(rlist, show_stdv=True):
-    # pylint: disable=invalid-name
-    """
-    Aggregate cross-validation results.
-    """
-    cvmap = {}
-    ret = rlist[0].split()[0]
-    for line in rlist:
-        arr = line.split()
-        assert ret == arr[0]
-        for it in arr[1:]:
-            if not isinstance(it, STRING_TYPES):
-                it = it.decode()
-            k, v = it.split(':')
-            if k not in cvmap:
-                cvmap[k] = []
-            cvmap[k].append(float(v))
-    for k, v in sorted(cvmap.items(), key=lambda x: x[0]):
-        v = np.array(v)
-        if not isinstance(ret, STRING_TYPES):
-            ret = ret.decode()
-        if show_stdv:
-            ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
-        else:
-            ret += '\tcv-%s:%f' % (k, np.mean(v))
-    return ret
-
-
-def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
-       obj=None, feval=None, fpreproc=None, show_stdv=True, seed=0):
-    # pylint: disable = invalid-name
-    """Cross-validation with given paramaters.
-
-    Parameters
-    ----------
-    params : dict
-        Booster params.
-    dtrain : DMatrix
-        Data to be trained.
-    num_boost_round : int
-        Number of boosting iterations.
-    nfold : int
-        Number of folds in CV.
-    metrics : list of strings
-        Evaluation metrics to be watched in CV.
-    obj : function
-        Custom objective function.
-    feval : function
-        Custom evaluation function.
-    fpreproc : function
-        Preprocessing function that takes (dtrain, dtest, param) and returns
-        transformed versions of those.
-    show_stdv : bool
-        Whether to display the standard deviation.
-    seed : int
-        Seed used to generate the folds (passed to numpy.random.seed).
-
-    Returns
-    -------
-    evaluation history : list(string)
-    """
-    results = []
-    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
-    for i in range(num_boost_round):
-        for fold in cvfolds:
-            fold.update(i, obj)
-        res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv)
-        sys.stderr.write(res + '\n')
-        results.append(res)
-    return results
-
-
-# used for compatiblity without sklearn
-XGBModelBase = object
-XGBClassifierBase = object
-XGBRegressorBase = object
-if SKLEARN_INSTALLED:
-    XGBModelBase = BaseEstimator
-    XGBRegressorBase = RegressorMixin
-    XGBClassifierBase = ClassifierMixin
-
-class XGBModel(XGBModelBase):
-    # pylint: disable=too-many-arguments, too-many-instance-attributes, invalid-name
-    """Implementation of the Scikit-Learn API for XGBoost.
-
-    Parameters
-    ----------
-    max_depth : int
-        Maximum tree depth for base learners.
-    learning_rate : float
-        Boosting learning rate (xgb's "eta")
-    n_estimators : int
-        Number of boosted trees to fit.
-    silent : boolean
-        Whether to print messages while running boosting.
-    objective : string
-        Specify the learning task and the corresponding learning objective.
-
-    nthread : int
-        Number of parallel threads used to run xgboost.
-    gamma : float
-        Minimum loss reduction required to make a further partition on a leaf node of the tree.
-    min_child_weight : int
-        Minimum sum of instance weight(hessian) needed in a child.
-    max_delta_step : int
-        Maximum delta step we allow each tree's weight estimation to be.
-    subsample : float
-        Subsample ratio of the training instance.
-    colsample_bytree : float
-        Subsample ratio of columns when constructing each tree.
-
-    base_score:
-        The initial prediction score of all instances, global bias.
-    seed : int
-        Random number seed.
-    missing : float, optional
-        Value in the data which needs to be present as a missing value. If
-        None, defaults to np.nan.
-    """
-    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
-                 silent=True, objective="reg:linear",
-                 nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0,
-                 subsample=1, colsample_bytree=1,
-                 base_score=0.5, seed=0, missing=None):
-        if not SKLEARN_INSTALLED:
-            raise XGBoostError('sklearn needs to be installed in order to use this module')
-        self.max_depth = max_depth
-        self.learning_rate = learning_rate
-        self.n_estimators = n_estimators
-        self.silent = silent
-        self.objective = objective
-
-        self.nthread = nthread
-        self.gamma = gamma
-        self.min_child_weight = min_child_weight
-        self.max_delta_step = max_delta_step
-        self.subsample = subsample
-        self.colsample_bytree = colsample_bytree
-
-        self.base_score = base_score
-        self.seed = seed
-        self.missing = missing if missing is not None else np.nan
-        self._Booster = None
-
-    def __setstate__(self, state):
-        # backward compatiblity code
-        # load booster from raw if it is raw
-        # the booster now support pickle
-        bst = state["_Booster"]
-        if bst is not None and not isinstance(bst, Booster):
-            state["_Booster"] = Booster(model_file=bst)
-        self.__dict__.update(state)
-
-    def booster(self):
-        """Get the underlying xgboost Booster of this model.
-
-        This will raise an exception when fit was not called
-
-        Returns
-        -------
-        booster : a xgboost booster of underlying model
-        """
-        if self._Booster is None:
-            raise XGBoostError('need to call fit beforehand')
-        return self._Booster
-
-    def get_params(self, deep=False):
-        """Get parameter.s"""
-        params = super(XGBModel, self).get_params(deep=deep)
-        if params['missing'] is np.nan:
-            params['missing'] = None  # sklearn doesn't handle nan. see #4725
-        if not params.get('eval_metric', True):
-            del params['eval_metric']  # don't give as None param to Booster
-        return params
-
-    def get_xgb_params(self):
-        """Get xgboost type parameters."""
-        xgb_params = self.get_params()
-
-        xgb_params['silent'] = 1 if self.silent else 0
-
-        if self.nthread <= 0:
-            xgb_params.pop('nthread', None)
-        return xgb_params
-
-    def fit(self, X, y, eval_set=None, eval_metric=None,
-            early_stopping_rounds=None, verbose=True):
-        # pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init
-        """
-        Fit the gradient boosting model
-
-        Parameters
-        ----------
-        X : array_like
-            Feature matrix
-        y : array_like
-            Labels
-        eval_set : list, optional
-            A list of (X, y) tuple pairs to use as a validation set for
-            early-stopping
-        eval_metric : str, callable, optional
-            If a str, should be a built-in evaluation metric to use. See
-            doc/parameter.md. If callable, a custom evaluation metric. The call
-            signature is func(y_predicted, y_true) where y_true will be a
-            DMatrix object such that you may need to call the get_label
-            method. It must return a str, value pair where the str is a name
-            for the evaluation and value is the value of the evaluation
-            function. This objective is always minimized.
-        early_stopping_rounds : int
-            Activates early stopping. Validation error needs to decrease at
-            least every <early_stopping_rounds> round(s) to continue training.
-            Requires at least one item in evals. If there's more than one,
-            will use the last. Returns the model from the last iteration
-            (not the best one). If early stopping occurs, the model will
-            have two additional fields: bst.best_score and bst.best_iteration.
-        verbose : bool
-            If `verbose` and an evaluation set is used, writes the evaluation
-            metric measured on the validation set to stderr.
-        """
-        trainDmatrix = DMatrix(X, label=y, missing=self.missing)
-
-        eval_results = {}
-        if eval_set is not None:
-            evals = list(DMatrix(x[0], label=x[1]) for x in eval_set)
-            evals = list(zip(evals, ["validation_{}".format(i) for i in
-                                     range(len(evals))]))
-        else:
-            evals = ()
-
-        params = self.get_xgb_params()
-
-        feval = eval_metric if callable(eval_metric) else None
-        if eval_metric is not None:
-            if callable(eval_metric):
-                eval_metric = None
-            else:
-                params.update({'eval_metric': eval_metric})
-
-        self._Booster = train(params, trainDmatrix,
-                              self.n_estimators, evals=evals,
-                              early_stopping_rounds=early_stopping_rounds,
-                              evals_result=eval_results, feval=feval,
-                              verbose_eval=verbose)
-        if eval_results:
-            eval_results = {k: np.array(v, dtype=float)
-                            for k, v in eval_results.items()}
-            eval_results = {k: np.array(v) for k, v in eval_results.items()}
-            self.eval_results = eval_results
-
-        if early_stopping_rounds is not None:
-            self.best_score = self._Booster.best_score
-            self.best_iteration = self._Booster.best_iteration
-        return self
-
-    def predict(self, data):
-        # pylint: disable=missing-docstring,invalid-name
-        test_dmatrix = DMatrix(data, missing=self.missing)
-        return self.booster().predict(test_dmatrix)
-
-
-class XGBClassifier(XGBModel, XGBClassifierBase):
-    # pylint: disable=missing-docstring,too-many-arguments,invalid-name
-    __doc__ = """
-    Implementation of the scikit-learn API for XGBoost classification
-    """ + "\n".join(XGBModel.__doc__.split('\n')[2:])
-
-    def __init__(self, max_depth=3, learning_rate=0.1,
-                 n_estimators=100, silent=True,
-                 objective="binary:logistic",
-                 nthread=-1, gamma=0, min_child_weight=1,
-                 max_delta_step=0, subsample=1, colsample_bytree=1,
-                 base_score=0.5, seed=0, missing=None):
-        super(XGBClassifier, self).__init__(max_depth, learning_rate,
-                                            n_estimators, silent, objective,
-                                            nthread, gamma, min_child_weight,
-                                            max_delta_step, subsample,
-                                            colsample_bytree,
-                                            base_score, seed, missing)
-
-    def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
-            early_stopping_rounds=None, verbose=True):
-        # pylint: disable = attribute-defined-outside-init,arguments-differ
-        """
-        Fit gradient boosting classifier
-
-        Parameters
-        ----------
-        X : array_like
-            Feature matrix
-        y : array_like
-            Labels
-        sample_weight : array_like
-            Weight for each instance
-        eval_set : list, optional
-            A list of (X, y) pairs to use as a validation set for
-            early-stopping
-        eval_metric : str, callable, optional
-            If a str, should be a built-in evaluation metric to use. See
-            doc/parameter.md. If callable, a custom evaluation metric. The call
-            signature is func(y_predicted, y_true) where y_true will be a
-            DMatrix object such that you may need to call the get_label
-            method. It must return a str, value pair where the str is a name
-            for the evaluation and value is the value of the evaluation
-            function. This objective is always minimized.
-        early_stopping_rounds : int, optional
-            Activates early stopping. Validation error needs to decrease at
-            least every <early_stopping_rounds> round(s) to continue training.
-            Requires at least one item in evals. If there's more than one,
-            will use the last. Returns the model from the last iteration
-            (not the best one). If early stopping occurs, the model will
-            have two additional fields: bst.best_score and bst.best_iteration.
-        verbose : bool
-            If `verbose` and an evaluation set is used, writes the evaluation
-            metric measured on the validation set to stderr.
-        """
-        eval_results = {}
-        self.classes_ = list(np.unique(y))
-        self.n_classes_ = len(self.classes_)
-        if self.n_classes_ > 2:
-            # Switch to using a multiclass objective in the underlying XGB instance
-            self.objective = "multi:softprob"
-            xgb_options = self.get_xgb_params()
-            xgb_options['num_class'] = self.n_classes_
-        else:
-            xgb_options = self.get_xgb_params()
-
-        feval = eval_metric if callable(eval_metric) else None
-        if eval_metric is not None:
-            if callable(eval_metric):
-                eval_metric = None
-            else:
-                xgb_options.update({"eval_metric": eval_metric})
-
-        if eval_set is not None:
-            # TODO: use sample_weight if given?
-            evals = list(DMatrix(x[0], label=x[1]) for x in eval_set)
-            nevals = len(evals)
-            eval_names = ["validation_{}".format(i) for i in range(nevals)]
-            evals = list(zip(evals, eval_names))
-        else:
-            evals = ()
-
-        self._le = LabelEncoder().fit(y)
-        training_labels = self._le.transform(y)
-
-        if sample_weight is not None:
-            train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
-                                    missing=self.missing)
-        else:
-            train_dmatrix = DMatrix(X, label=training_labels,
-                                    missing=self.missing)
-
-        self._Booster = train(xgb_options, train_dmatrix, self.n_estimators,
-                              evals=evals,
-                              early_stopping_rounds=early_stopping_rounds,
-                              evals_result=eval_results, feval=feval,
-                              verbose_eval=verbose)
-
-        if eval_results:
-            eval_results = {k: np.array(v, dtype=float)
-                            for k, v in eval_results.items()}
-            self.eval_results = eval_results
-
-        if early_stopping_rounds is not None:
-            self.best_score = self._Booster.best_score
-            self.best_iteration = self._Booster.best_iteration
-
-        return self
-
-    def predict(self, data):
-        test_dmatrix = DMatrix(data, missing=self.missing)
-        class_probs = self.booster().predict(test_dmatrix)
-        if len(class_probs.shape) > 1:
-            column_indexes = np.argmax(class_probs, axis=1)
-        else:
-            column_indexes = np.repeat(0, data.shape[0])
-            column_indexes[class_probs > 0.5] = 1
-        return self._le.inverse_transform(column_indexes)
-
-    def predict_proba(self, data):
-        test_dmatrix = DMatrix(data, missing=self.missing)
-        class_probs = self.booster().predict(test_dmatrix)
-        if self.objective == "multi:softprob":
-            return class_probs
-        else:
-            classone_probs = class_probs
-            classzero_probs = 1.0 - classone_probs
-            return np.vstack((classzero_probs, classone_probs)).transpose()
-
-class XGBRegressor(XGBModel, XGBRegressorBase):
-    # pylint: disable=missing-docstring
-    __doc__ = """
-    Implementation of the scikit-learn API for XGBoost regression
-    """ + "\n".join(XGBModel.__doc__.split('\n')[2:])
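Everything deleted in this hunk (train, cv and its helpers, and the sklearn-compatible XGBModel/XGBClassifier/XGBRegressor classes) disappears from this module; per the commit title it is presumably relocated into the new python-package, though the destination files are not shown in this extract. Usage follows the deleted docstrings; a hedged sketch reusing dtrain from the earlier sketches, with illustrative parameter and metric names:

```python
# Signatures taken from the deleted docstrings above.
params = {'max_depth': 2, 'eta': 0.1, 'objective': 'binary:logistic'}
bst = xgb.train(params, dtrain, num_boost_round=10)

# n-fold cross-validation; returns the per-round evaluation strings.
history = xgb.cv(params, dtrain, num_boost_round=10, nfold=3, seed=0)
```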
@@ -11,7 +11,7 @@ This should give you xgboost.exe for CLI version and xgboost_wrapper.dll for pyt
 
 Use Python Module
 =====
-* After you build the dll, you can install the Python package from the [../wrapper](../wrapper) folder
+* After you build the dll, you can install the Python package from the [../python-package](../python-package) folder
 
 ```
 python setup.py install
@@ -1,20 +1,9 @@
-Wrapper of XGBoost
-=====
-This folder provides wrapper of xgboost to other languages
+XGBoost Wrappers
+================
+This folder provides wrapper to create xgboost packages to other languages.
 
-Python
-=====
-* To make the python module, type ```./build.sh``` in the root directory of project
-* Make sure you have [setuptools](https://pypi.python.org/pypi/setuptools)
-* Install with `python setup.py install` from this directory.
-* Refer also to the walk through example in [demo folder](../demo/guide-python)
-* **NOTE**: if you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the sklearn_parallel.py demo.
-
-
-R
-=====
-* See [R-package](../R-package)
-
-Julia
-=====
-* See [XGBoost.jl](https://github.com/antinucleon/XGBoost.jl)
+***Supported Language Packages***
+* [Python package](../python-package)
+* [R-package](../R-package)
+* [Java Package](../java)
+* [Julia Package](https://github.com/antinucleon/XGBoost.jl)
@@ -1,39 +0,0 @@
-# pylint: disable=invalid-name
-"""Setup xgboost package."""
-import os
-import platform
-from setuptools import setup
-
-
-class XGBoostLibraryNotFound(Exception):
-    """Exception to raise when xgboost library cannot be found."""
-    pass
-
-
-curr_dir = os.path.dirname(os.path.abspath(__file__))
-dll_path = [curr_dir]
-
-if os.name == 'nt':
-    if platform.architecture()[0] == '64bit':
-        dll_path.append(os.path.join(curr_dir, '../windows/x64/Release/'))
-    else:
-        dll_path.append(os.path.join(curr_dir, '../windows/Release/'))
-
-
-if os.name == 'nt':
-    dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path]
-else:
-    dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path]
-
-lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
-
-if len(lib_path) == 0:
-    raise XGBoostLibraryNotFound("XGBoost library not found. Did you run "
-                                 "../make?")
-setup(name="xgboost",
-      version="0.40",
-      description="Python wrappers for XGBoost: eXtreme Gradient Boosting",
-      zip_safe=False,
-      py_modules=['xgboost'],
-      data_files=[('.', [lib_path[0]])],
-      url="https://github.com/dmlc/xgboost")