make python lint

This commit is contained in:
tqchen 2015-07-03 20:36:41 -07:00
parent 57ec922214
commit 59b91cf205
2 changed files with 263 additions and 169 deletions

View File

@ -1,9 +1,12 @@
# pylint: disable=invalid-name
"""Setup xgboost package."""
import os
import platform
from setuptools import setup
class XGBoostLibraryNotFound(Exception):
"""Exception to raise when xgboost library cannot be found."""
pass
@ -15,7 +18,7 @@ if os.name == 'nt':
dll_path.append(os.path.join(curr_dir, '../windows/x64/Release/'))
else:
dll_path.append(os.path.join(curr_dir, '../windows/Release/'))
if os.name == 'nt':
dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path]

View File

@ -6,7 +6,7 @@ Version: 0.40
Authors: Tianqi Chen, Bing Xu
Early stopping by Zygmunt Zając
"""
# pylint: disable=too-many-arguments, too-many-locals, too-many-lines
from __future__ import absolute_import
import os
@ -28,20 +28,25 @@ except ImportError:
SKLEARN_INSTALLED = False
class XGBoostLibraryNotFound(Exception):
"""Error throwed by when xgboost is not found"""
pass
class XGBoostError(Exception):
"""Error throwed by xgboost trainer."""
pass
__all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train']
if sys.version_info[0] == 3:
string_types = str,
# pylint: disable=invalid-name
STRING_TYPES = str,
else:
string_types = basestring,
# pylint: disable=invalid-name
STRING_TYPES = basestring,
def load_xglib():
"""Load the xgboost library."""
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
dll_path = [curr_path]
if os.name == 'nt':
@ -55,7 +60,8 @@ def load_xglib():
dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path]
lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
if len(dll_path) == 0:
raise XGBoostLibraryNotFound('cannot find find the files in the candicate path ' + str(dll_path))
raise XGBoostLibraryNotFound(
'cannot find find the files in the candicate path ' + str(dll_path))
lib = ctypes.cdll.LoadLibrary(lib_path[0])
# DMatrix functions
@ -79,12 +85,11 @@ def load_xglib():
return lib
# load the XGBoost library globally
xglib = load_xglib()
_LIB = load_xglib()
def ctypes2numpy(cptr, length, dtype):
"""
Convert a ctypes pointer array to a numpy array.
"""Convert a ctypes pointer array to a numpy array.
"""
if not isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
raise RuntimeError('expected float pointer')
@ -95,6 +100,7 @@ def ctypes2numpy(cptr, length, dtype):
def ctypes2buffer(cptr, length):
"""Convert ctypes pointer to buffer type."""
if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)):
raise RuntimeError('expected char pointer')
res = bytearray(length)
@ -105,14 +111,17 @@ def ctypes2buffer(cptr, length):
def c_str(string):
"""Convert a python string to cstring."""
return ctypes.c_char_p(string.encode('utf-8'))
def c_array(ctype, values):
"""Convert a python string to c array."""
return (ctype * len(values))(*values)
class DMatrix(object):
"""Data Matrix used in XGBoost."""
def __init__(self, data, label=None, missing=0.0, weight=None, silent=False):
"""
Data matrix used in XGBoost.
@ -135,8 +144,8 @@ class DMatrix(object):
if data is None:
self.handle = None
return
if isinstance(data, string_types):
self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), int(silent)))
if isinstance(data, STRING_TYPES):
self.handle = ctypes.c_void_p(_LIB.XGDMatrixCreateFromFile(c_str(data), int(silent)))
elif isinstance(data, scipy.sparse.csr_matrix):
self._init_from_csr(data)
elif isinstance(data, scipy.sparse.csc_matrix):
@ -160,7 +169,7 @@ class DMatrix(object):
"""
if len(csr.indices) != len(csr.data):
raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSR(
self.handle = ctypes.c_void_p(_LIB.XGDMatrixCreateFromCSR(
c_array(ctypes.c_ulong, csr.indptr),
c_array(ctypes.c_uint, csr.indices),
c_array(ctypes.c_float, csr.data),
@ -172,7 +181,7 @@ class DMatrix(object):
"""
if len(csc.indices) != len(csc.data):
raise ValueError('length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data)))
self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSC(
self.handle = ctypes.c_void_p(_LIB.XGDMatrixCreateFromCSC(
c_array(ctypes.c_ulong, csc.indptr),
c_array(ctypes.c_uint, csc.indices),
c_array(ctypes.c_float, csc.data),
@ -183,34 +192,77 @@ class DMatrix(object):
Initialize data from a 2-D numpy matrix.
"""
data = np.array(mat.reshape(mat.size), dtype=np.float32)
self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromMat(
self.handle = ctypes.c_void_p(_LIB.XGDMatrixCreateFromMat(
data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
mat.shape[0], mat.shape[1], ctypes.c_float(missing)))
def __del__(self):
xglib.XGDMatrixFree(self.handle)
_LIB.XGDMatrixFree(self.handle)
def get_float_info(self, field):
"""Get float property from the DMatrix.
Parameters
----------
field: str
The field name of the information
Returns
-------
info : array
a numpy array of float information of the data
"""
length = ctypes.c_ulong()
ret = xglib.XGDMatrixGetFloatInfo(self.handle, c_str(field), ctypes.byref(length))
ret = _LIB.XGDMatrixGetFloatInfo(self.handle, c_str(field), ctypes.byref(length))
return ctypes2numpy(ret, length.value, np.float32)
def get_uint_info(self, field):
"""Get unsigned integer property from the DMatrix.
Parameters
----------
field: str
The field name of the information
Returns
-------
info : array
a numpy array of float information of the data
"""
length = ctypes.c_ulong()
ret = xglib.XGDMatrixGetUIntInfo(self.handle, c_str(field), ctypes.byref(length))
ret = _LIB.XGDMatrixGetUIntInfo(self.handle, c_str(field), ctypes.byref(length))
return ctypes2numpy(ret, length.value, np.uint32)
def set_float_info(self, field, data):
xglib.XGDMatrixSetFloatInfo(self.handle, c_str(field),
c_array(ctypes.c_float, data), len(data))
"""Set float type property into the DMatrix.
Parameters
----------
field: str
The field name of the information
data: numpy array
The array ofdata to be set
"""
_LIB.XGDMatrixSetFloatInfo(self.handle, c_str(field),
c_array(ctypes.c_float, data), len(data))
def set_uint_info(self, field, data):
xglib.XGDMatrixSetUIntInfo(self.handle, c_str(field),
c_array(ctypes.c_uint, data), len(data))
"""Set uint type property into the DMatrix.
Parameters
----------
field: str
The field name of the information
data: numpy array
The array ofdata to be set
"""
_LIB.XGDMatrixSetUIntInfo(self.handle, c_str(field),
c_array(ctypes.c_uint, data), len(data))
def save_binary(self, fname, silent=True):
"""
Save DMatrix to an XGBoost buffer.
"""Save DMatrix to an XGBoost buffer.
Parameters
----------
@ -219,74 +271,74 @@ class DMatrix(object):
silent : bool (optional; default: True)
If set, the output is suppressed.
"""
xglib.XGDMatrixSaveBinary(self.handle, c_str(fname), int(silent))
_LIB.XGDMatrixSaveBinary(self.handle, c_str(fname), int(silent))
def set_label(self, label):
"""set label of dmatrix
Args:
label: list
label for DMatrix
Returns:
None
"""Set label of dmatrix
Parameters
----------
label: array like
The label information to be set into DMatrix
"""
self.set_float_info('label', label)
def set_weight(self, weight):
"""
Set weight of each instance.
""" Set weight of each instance.
Parameters
----------
weight : float
Weight for positive instance.
weight : array like
Weight for each data point
"""
self.set_float_info('weight', weight)
def set_base_margin(self, margin):
"""
set base margin of booster to start from
this can be used to specify a prediction value of
""" Set base margin of booster to start from.
This can be used to specify a prediction value of
existing model to be base_margin
However, remember margin is needed, instead of transformed prediction
e.g. for logistic regression: need to put in value before logistic transformation
see also example/demo.py
Parameters
----------
margin: array like
Prediction margin of each datapoint
"""
self.set_float_info('base_margin', margin)
def set_group(self, group):
"""
Set group size of DMatrix (used for ranking).
"""Set group size of DMatrix (used for ranking).
Parameters
----------
group : int
Group size.
group : array like
Group size of each group
"""
xglib.XGDMatrixSetGroup(self.handle, c_array(ctypes.c_uint, group), len(group))
_LIB.XGDMatrixSetGroup(self.handle, c_array(ctypes.c_uint, group), len(group))
def get_label(self):
"""
Get the label of the DMatrix.
"""Get the label of the DMatrix.
Returns
-------
label : list
label : array
"""
return self.get_float_info('label')
def get_weight(self):
"""
Get the weight of the DMatrix.
"""Get the weight of the DMatrix.
Returns
-------
weight : float
weight : array
"""
return self.get_float_info('weight')
def get_base_margin(self):
"""
Get the base margin of the DMatrix.
"""Get the base margin of the DMatrix.
Returns
-------
@ -295,18 +347,16 @@ class DMatrix(object):
return self.get_float_info('base_margin')
def num_row(self):
"""
Get the number of rows in the DMatrix.
"""Get the number of rows in the DMatrix.
Returns
-------
number of rows : int
"""
return xglib.XGDMatrixNumRow(self.handle)
return _LIB.XGDMatrixNumRow(self.handle)
def slice(self, rindex):
"""
Slice the DMatrix and return a new DMatrix that only contains `rindex`.
"""Slice the DMatrix and return a new DMatrix that only contains `rindex`.
Parameters
----------
@ -319,13 +369,15 @@ class DMatrix(object):
A new DMatrix containing only selected indices.
"""
res = DMatrix(None)
res.handle = ctypes.c_void_p(xglib.XGDMatrixSliceDMatrix(
res.handle = ctypes.c_void_p(_LIB.XGDMatrixSliceDMatrix(
self.handle, c_array(ctypes.c_int, rindex), len(rindex)))
return res
class Booster(object):
""""A Booster of of XGBoost."""
def __init__(self, params=None, cache=(), model_file=None):
# pylint: disable=invalid-name
"""
Learner class.
@ -342,14 +394,14 @@ class Booster(object):
if not isinstance(d, DMatrix):
raise TypeError('invalid cache item: {}'.format(type(d).__name__))
dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache)))
self.handle = ctypes.c_void_p(_LIB.XGBoosterCreate(dmats, len(cache)))
self.set_param({'seed': 0})
self.set_param(params or {})
if model_file is not None:
self.load_model(model_file)
def __del__(self):
xglib.XGBoosterFree(self.handle)
_LIB.XGBoosterFree(self.handle)
def __getstate__(self):
# can't pickle ctypes pointers
@ -367,10 +419,10 @@ class Booster(object):
if handle is not None:
buf = handle
dmats = c_array(ctypes.c_void_p, [])
handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, 0))
handle = ctypes.c_void_p(_LIB.XGBoosterCreate(dmats, 0))
length = ctypes.c_ulong(len(buf))
ptr = (ctypes.c_char * len(buf)).from_buffer(buf)
xglib.XGBoosterLoadModelFromBuffer(handle, ptr, length)
_LIB.XGBoosterLoadModelFromBuffer(handle, ptr, length)
state['handle'] = handle
self.__dict__.update(state)
self.set_param({'seed': 0})
@ -379,11 +431,10 @@ class Booster(object):
return self.__deepcopy__()
def __deepcopy__(self):
return Booster(model_file = self.save_raw())
return Booster(model_file=self.save_raw())
def copy(self):
"""
Copy the booster object
"""Copy the booster object.
Returns
--------
@ -391,15 +442,16 @@ class Booster(object):
"""
return self.__copy__()
def set_param(self, params, pv=None):
def set_param(self, params, value=None):
"""Set parameters into the DMatrix."""
if isinstance(params, collections.Mapping):
params = params.items()
elif isinstance(params, string_types) and pv is not None:
params = [(params, pv)]
for k, v in params:
xglib.XGBoosterSetParam(self.handle, c_str(k), c_str(str(v)))
elif isinstance(params, STRING_TYPES) and value is not None:
params = [(params, value)]
for key, val in params:
_LIB.XGBoosterSetParam(self.handle, c_str(key), c_str(str(val)))
def update(self, dtrain, it, fobj=None):
def update(self, dtrain, iteration, fobj=None):
"""
Update (one iteration).
@ -407,7 +459,7 @@ class Booster(object):
----------
dtrain : DMatrix
Training data.
it : int
iteration : int
Current iteration number.
fobj : function
Customized objective function.
@ -415,7 +467,7 @@ class Booster(object):
if not isinstance(dtrain, DMatrix):
raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
if fobj is None:
xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle)
_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle)
else:
pred = self.predict(dtrain)
grad, hess = fobj(pred, dtrain)
@ -438,20 +490,20 @@ class Booster(object):
raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
if not isinstance(dtrain, DMatrix):
raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle,
c_array(ctypes.c_float, grad),
c_array(ctypes.c_float, hess),
len(grad))
_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle,
c_array(ctypes.c_float, grad),
c_array(ctypes.c_float, hess),
len(grad))
def eval_set(self, evals, it=0, feval=None):
"""
Evaluate by a metric.
def eval_set(self, evals, iteration=0, feval=None):
# pylint: disable=invalid-name
"""Evaluate a set of data.
Parameters
----------
evals : list of tuples (DMatrix, string)
List of items to be evaluated.
it : int
iteration : int
Current iteration.
feval : function
Custom evaluation function.
@ -464,20 +516,35 @@ class Booster(object):
for d in evals:
if not isinstance(d[0], DMatrix):
raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__))
if not isinstance(d[1], string_types):
if not isinstance(d[1], STRING_TYPES):
raise TypeError('expected string, got {}'.format(type(d[1]).__name__))
dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals])
evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals))
return _LIB.XGBoosterEvalOneIter(self.handle, iteration, dmats, evnames, len(evals))
else:
res = '[%d]' % it
for dm, evname in evals:
name, val = feval(self.predict(dm), dm)
res = '[%d]' % iteration
for dmat, evname in evals:
name, val = feval(self.predict(dmat), dmat)
res += '\t%s-%s:%f' % (evname, name, val)
return res
def eval(self, mat, name='eval', it=0):
return self.eval_set([(mat, name)], it)
def eval(self, data, name='eval', iteration=0):
"""Evaluate the model on mat.
Parameters
---------
data : DMatrix
The dmatrix storing the input.
name : str (default = 'eval')
The name of the dataset
iteration : int (default = 0)
The current iteration number
"""
return self.eval_set([(data, name)], iteration)
def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False):
"""
@ -492,10 +559,13 @@ class Booster(object):
----------
data : DMatrix
The dmatrix storing the input.
output_margin : bool
Whether to output the raw untransformed margin value.
ntree_limit : int
Limit number of trees in the prediction; defaults to 0 (use all trees).
pred_leaf : bool
When this option is on, the output will be a matrix of (nsample, ntrees)
with each record indicating the predicted leaf index of each sample in each tree.
@ -512,8 +582,8 @@ class Booster(object):
if pred_leaf:
option_mask |= 0x02
length = ctypes.c_ulong()
preds = xglib.XGBoosterPredict(self.handle, data.handle,
option_mask, ntree_limit, ctypes.byref(length))
preds = _LIB.XGBoosterPredict(self.handle, data.handle,
option_mask, ntree_limit, ctypes.byref(length))
preds = ctypes2numpy(preds, length.value, np.float32)
if pred_leaf:
preds = preds.astype(np.int32)
@ -531,8 +601,8 @@ class Booster(object):
fname : string
Output file name
"""
if isinstance(fname, string_types): # assume file name
xglib.XGBoosterSaveModel(self.handle, c_str(fname))
if isinstance(fname, STRING_TYPES): # assume file name
_LIB.XGBoosterSaveModel(self.handle, c_str(fname))
else:
raise TypeError("fname must be a string")
@ -545,8 +615,8 @@ class Booster(object):
a in memory buffer represetation of the model
"""
length = ctypes.c_ulong()
cptr = xglib.XGBoosterGetModelRaw(self.handle,
ctypes.byref(length))
cptr = _LIB.XGBoosterGetModelRaw(self.handle,
ctypes.byref(length))
return ctypes2buffer(cptr, length.value)
def load_model(self, fname):
@ -559,59 +629,63 @@ class Booster(object):
Input file name or memory buffer(see also save_raw)
"""
if isinstance(fname, str): # assume file name
xglib.XGBoosterLoadModel(self.handle, c_str(fname))
_LIB.XGBoosterLoadModel(self.handle, c_str(fname))
else:
buf = fname
length = ctypes.c_ulong(len(buf))
ptr = (ctypes.c_char * len(buf)).from_buffer(buf)
xglib.XGBoosterLoadModelFromBuffer(self.handle, ptr, length)
_LIB.XGBoosterLoadModelFromBuffer(self.handle, ptr, length)
def dump_model(self, fo, fmap='', with_stats=False):
def dump_model(self, fout, fmap='', with_stats=False):
"""
Dump model into a text file.
Parameters
----------
fo : string
foout : string
Output file name.
fmap : string, optional
Name of the file containing feature map names.
with_stats : bool (optional)
Controls whether the split statistics are output.
"""
if isinstance(fo, string_types):
fo = open(fo, 'w')
if isinstance(fout, STRING_TYPES):
fout = open(fout, 'w')
need_close = True
else:
need_close = False
ret = self.get_dump(fmap, with_stats)
for i in range(len(ret)):
fo.write('booster[{}]:\n'.format(i))
fo.write(ret[i])
fout.write('booster[{}]:\n'.format(i))
fout.write(ret[i])
if need_close:
fo.close()
fout.close()
def get_dump(self, fmap='', with_stats=False):
"""
Returns the dump the model as a list of strings.
"""
length = ctypes.c_ulong()
sarr = xglib.XGBoosterDumpModel(self.handle, c_str(fmap),
int(with_stats), ctypes.byref(length))
sarr = _LIB.XGBoosterDumpModel(self.handle, c_str(fmap),
int(with_stats), ctypes.byref(length))
res = []
for i in range(length.value):
res.append(str(sarr[i].decode('ascii')))
return res
def get_fscore(self, fmap=''):
"""
Get feature importance of each feature.
"""Get feature importance of each feature.
Parameters
----------
fmap: str (optional)
The name of feature map file
"""
trees = self.get_dump(fmap)
fmap = {}
for tree in trees:
for l in tree.split('\n'):
arr = l.split('[')
for line in tree.split('\n'):
arr = line.split('[')
if len(arr) == 1:
continue
fid = arr[1].split(']')[0]
@ -624,9 +698,9 @@ class Booster(object):
def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
early_stopping_rounds=None,evals_result=None):
"""
Train a booster with given parameters.
early_stopping_rounds=None, evals_result=None):
# pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
"""Train a booster with given parameters.
Parameters
----------
@ -663,7 +737,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
bst = Booster(params, [dtrain] + [d[0] for d in evals])
if evals_result is not None:
if type(evals_result) is not dict:
if isinstance(evals_result, dict):
raise TypeError('evals_result has to be a dictionary')
else:
evals_name = [d[1] for d in evals]
@ -675,37 +749,38 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
bst.update(dtrain, i, obj)
if len(evals) != 0:
bst_eval_set = bst.eval_set(evals, i, feval)
if isinstance(bst_eval_set, string_types):
if isinstance(bst_eval_set, STRING_TYPES):
msg = bst_eval_set
else:
msg = bst_eval_set.decode()
sys.stderr.write(msg + '\n')
if evals_result is not None:
res = re.findall(":([0-9.]+).",msg)
for key,val in zip(evals_name,res):
res = re.findall(":([0-9.]+).", msg)
for key, val in zip(evals_name, res):
evals_result[key].append(val)
return bst
else:
# early stopping
if len(evals) < 1:
raise ValueError('For early stopping you need at least one set in evals.')
sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(evals[-1][1], early_stopping_rounds))
sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\
evals[-1][1], early_stopping_rounds))
# is params a list of tuples? are we using multiple eval metrics?
if type(params) == list:
if isinstance(params, list):
if len(params) != len(dict(params).items()):
raise ValueError('Check your params. Early stopping works with single eval metric only.')
raise ValueError('Check your params.'\
'Early stopping works with single eval metric only.')
params = dict(params)
# either minimize loss or maximize AUC/MAP/NDCG
maximize_score = False
if 'eval_metric' in params:
maximize_metrics = ('auc', 'map', 'ndcg')
if list(filter(lambda x: params['eval_metric'].startswith(x), maximize_metrics)):
if any(params['eval_metric'].startswith(x) for x in maximize_metrics):
maximize_score = True
if maximize_score:
@ -720,7 +795,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
bst.update(dtrain, i, obj)
bst_eval_set = bst.eval_set(evals, i, feval)
if isinstance(bst_eval_set, string_types):
if isinstance(bst_eval_set, STRING_TYPES):
msg = bst_eval_set
else:
msg = bst_eval_set.decode()
@ -728,8 +803,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
sys.stderr.write(msg + '\n')
if evals_result is not None:
res = re.findall(":([0-9.]+).",msg)
for key,val in zip(evals_name,res):
res = re.findall(":([0-9.]+).", msg)
for key, val in zip(evals_name, res):
evals_result[key].append(val)
score = float(msg.rsplit(':', 1)[1])
@ -748,17 +823,21 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
return bst
class CVPack(object):
""""Auxiliary datastruct to hold one fold of CV."""
def __init__(self, dtrain, dtest, param):
""""Initialize the CVPack"""
self.dtrain = dtrain
self.dtest = dtest
self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
self.bst = Booster(param, [dtrain, dtest])
def update(self, r, fobj):
self.bst.update(self.dtrain, r, fobj)
def update(self, iteration, fobj):
""""Update the boosters for one iteration"""
self.bst.update(self.dtrain, iteration, fobj)
def eval(self, r, feval):
return self.bst.eval_set(self.watchlist, r, feval)
def eval(self, iteration, feval):
""""Evaluate the CVPack for one iteration."""
return self.bst.eval_set(self.watchlist, iteration, feval)
def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None):
@ -785,6 +864,7 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None):
def aggcv(rlist, show_stdv=True):
# pylint: disable=invalid-name
"""
Aggregate cross-validation results.
"""
@ -794,7 +874,7 @@ def aggcv(rlist, show_stdv=True):
arr = line.split()
assert ret == arr[0]
for it in arr[1:]:
if not isinstance(it, string_types):
if not isinstance(it, STRING_TYPES):
it = it.decode()
k, v = it.split(':')
if k not in cvmap:
@ -802,7 +882,7 @@ def aggcv(rlist, show_stdv=True):
cvmap[k].append(float(v))
for k, v in sorted(cvmap.items(), key=lambda x: x[0]):
v = np.array(v)
if not isinstance(ret, string_types):
if not isinstance(ret, STRING_TYPES):
ret = ret.decode()
if show_stdv:
ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
@ -813,8 +893,8 @@ def aggcv(rlist, show_stdv=True):
def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
obj=None, feval=None, fpreproc=None, show_stdv=True, seed=0):
"""
Cross-validation with given paramaters.
# pylint: disable = invalid-name
"""Cross-validation with given paramaters.
Parameters
----------
@ -847,8 +927,8 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
results = []
cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
for i in range(num_boost_round):
for f in cvfolds:
f.update(i, obj)
for fold in cvfolds:
fold.update(i, obj)
res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv)
sys.stderr.write(res + '\n')
results.append(res)
@ -857,16 +937,16 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
# used for compatiblity without sklearn
XGBModelBase = object
XGBClassifier = object
XGBRegressor = object
XGBClassifierBase = object
XGBRegressorBase = object
if SKLEARN_INSTALLED:
XGBModelBase = BaseEstimator
XGBRegressor = RegressorMixin
XGBClassifier = ClassifierMixin
XGBRegressorBase = RegressorMixin
XGBClassifierBase = ClassifierMixin
class XGBModel(XGBModelBase):
"""
Implementation of the Scikit-Learn API for XGBoost.
# pylint: disable=too-many-arguments, too-many-instance-attributes, invalid-name
"""Implementation of the Scikit-Learn API for XGBoost.
Parameters
----------
@ -902,8 +982,10 @@ class XGBModel(XGBModelBase):
Value in the data which needs to be present as a missing value. If
None, defaults to np.nan.
"""
def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="reg:linear",
nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
silent=True, objective="reg:linear",
nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0,
subsample=1, colsample_bytree=1,
base_score=0.5, seed=0, missing=None):
if not SKLEARN_INSTALLED:
raise XGBoostError('sklearn needs to be installed in order to use this module')
@ -923,7 +1005,6 @@ class XGBModel(XGBModelBase):
self.base_score = base_score
self.seed = seed
self.missing = missing if missing is not None else np.nan
self._Booster = None
def __setstate__(self, state):
@ -936,9 +1017,9 @@ class XGBModel(XGBModelBase):
self.__dict__.update(state)
def booster(self):
"""
get the underlying xgboost Booster of this model
will raise an exception when fit was not called
"""Get the underlying xgboost Booster of this model.
This will raise an exception when fit was not called
Returns
-------
@ -949,12 +1030,14 @@ class XGBModel(XGBModelBase):
return self._Booster
def get_params(self, deep=False):
"""Get parameter.s"""
params = super(XGBModel, self).get_params(deep=deep)
if params['missing'] is np.nan:
params['missing'] = None # sklearn doesn't handle nan. see #4725
return params
def get_xgb_params(self):
"""Get xgboost type parameters."""
xgb_params = self.get_params()
xgb_params['silent'] = 1 if self.silent else 0
@ -963,30 +1046,39 @@ class XGBModel(XGBModelBase):
xgb_params.pop('nthread', None)
return xgb_params
def fit(self, X, y):
trainDmatrix = DMatrix(X, label=y, missing=self.missing)
self._Booster = train(self.get_xgb_params(), trainDmatrix, self.n_estimators)
def fit(self, data, y):
# pylint: disable=missing-docstring,invalid-name
train_dmatrix = DMatrix(data, label=y, missing=self.missing)
self._Booster = train(self.get_xgb_params(), train_dmatrix, self.n_estimators)
return self
def predict(self, X):
testDmatrix = DMatrix(X, missing=self.missing)
return self.booster().predict(testDmatrix)
def predict(self, data):
# pylint: disable=missing-docstring,invalid-name
test_dmatrix = DMatrix(data, missing=self.missing)
return self.booster().predict(test_dmatrix)
class XGBClassifier(XGBModel, XGBClassifier):
class XGBClassifier(XGBModel, XGBClassifierBase):
# pylint: disable=missing-docstring,too-many-arguments,invalid-name
__doc__ = """
Implementation of the scikit-learn API for XGBoost classification
""" + "\n".join(XGBModel.__doc__.split('\n')[2:])
def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="binary:logistic",
nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
def __init__(self, max_depth=3, learning_rate=0.1,
n_estimators=100, silent=True,
objective="binary:logistic",
nthread=-1, gamma=0, min_child_weight=1,
max_delta_step=0, subsample=1, colsample_bytree=1,
base_score=0.5, seed=0, missing=None):
super(XGBClassifier, self).__init__(max_depth, learning_rate, n_estimators, silent, objective,
nthread, gamma, min_child_weight, max_delta_step, subsample,
super(XGBClassifier, self).__init__(max_depth, learning_rate,
n_estimators, silent, objective,
nthread, gamma, min_child_weight,
max_delta_step, subsample,
colsample_bytree,
base_score, seed, missing)
def fit(self, X, y, sample_weight=None):
# pylint: disable = attribute-defined-outside-init,arguments-differ
self.classes_ = list(np.unique(y))
self.n_classes_ = len(self.classes_)
if self.n_classes_ > 2:
@ -1001,29 +1093,29 @@ class XGBClassifier(XGBModel, XGBClassifier):
training_labels = self._le.transform(y)
if sample_weight is not None:
trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
missing=self.missing)
train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
missing=self.missing)
else:
trainDmatrix = DMatrix(X, label=training_labels,
missing=self.missing)
train_dmatrix = DMatrix(X, label=training_labels,
missing=self.missing)
self._Booster = train(xgb_options, trainDmatrix, self.n_estimators)
self._Booster = train(xgb_options, train_dmatrix, self.n_estimators)
return self
def predict(self, X):
testDmatrix = DMatrix(X, missing=self.missing)
class_probs = self.booster().predict(testDmatrix)
def predict(self, data):
test_dmatrix = DMatrix(data, missing=self.missing)
class_probs = self.booster().predict(test_dmatrix)
if len(class_probs.shape) > 1:
column_indexes = np.argmax(class_probs, axis=1)
else:
column_indexes = np.repeat(0, X.shape[0])
column_indexes = np.repeat(0, data.shape[0])
column_indexes[class_probs > 0.5] = 1
return self._le.inverse_transform(column_indexes)
def predict_proba(self, X):
testDmatrix = DMatrix(X, missing=self.missing)
class_probs = self.booster().predict(testDmatrix)
def predict_proba(self, data):
test_dmatrix = DMatrix(data, missing=self.missing)
class_probs = self.booster().predict(test_dmatrix)
if self.objective == "multi:softprob":
return class_probs
else:
@ -1031,9 +1123,8 @@ class XGBClassifier(XGBModel, XGBClassifier):
classzero_probs = 1.0 - classone_probs
return np.vstack((classzero_probs, classone_probs)).transpose()
class XGBRegressor(XGBModel, XGBRegressor):
class XGBRegressor(XGBModel, XGBRegressorBase):
# pylint: disable=missing-docstring
__doc__ = """
Implementation of the scikit-learn API for XGBoost regression
""" + "\n".join(XGBModel.__doc__.split('\n')[2:])
pass