From 59b91cf205f294ead9242b1895cae4ebb4b466c7 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 3 Jul 2015 20:36:41 -0700 Subject: [PATCH] make python lint --- wrapper/setup.py | 5 +- wrapper/xgboost.py | 427 +++++++++++++++++++++++++++------------------ 2 files changed, 263 insertions(+), 169 deletions(-) diff --git a/wrapper/setup.py b/wrapper/setup.py index 52bf1cf82..5365d61b0 100644 --- a/wrapper/setup.py +++ b/wrapper/setup.py @@ -1,9 +1,12 @@ +# pylint: disable=invalid-name +"""Setup xgboost package.""" import os import platform from setuptools import setup class XGBoostLibraryNotFound(Exception): + """Exception to raise when xgboost library cannot be found.""" pass @@ -15,7 +18,7 @@ if os.name == 'nt': dll_path.append(os.path.join(curr_dir, '../windows/x64/Release/')) else: dll_path.append(os.path.join(curr_dir, '../windows/Release/')) - + if os.name == 'nt': dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path] diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 0280d87b3..c21545b4c 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -6,7 +6,7 @@ Version: 0.40 Authors: Tianqi Chen, Bing Xu Early stopping by Zygmunt ZajÄ…c """ - +# pylint: disable=too-many-arguments, too-many-locals, too-many-lines from __future__ import absolute_import import os @@ -28,20 +28,25 @@ except ImportError: SKLEARN_INSTALLED = False class XGBoostLibraryNotFound(Exception): + """Error throwed by when xgboost is not found""" pass class XGBoostError(Exception): + """Error throwed by xgboost trainer.""" pass __all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train'] if sys.version_info[0] == 3: - string_types = str, + # pylint: disable=invalid-name + STRING_TYPES = str, else: - string_types = basestring, + # pylint: disable=invalid-name + STRING_TYPES = basestring, def load_xglib(): + """Load the xgboost library.""" curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) dll_path = [curr_path] if os.name == 'nt': @@ 
-55,7 +60,8 @@ def load_xglib(): dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path] lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] if len(dll_path) == 0: - raise XGBoostLibraryNotFound('cannot find find the files in the candicate path ' + str(dll_path)) + raise XGBoostLibraryNotFound( + 'cannot find find the files in the candicate path ' + str(dll_path)) lib = ctypes.cdll.LoadLibrary(lib_path[0]) # DMatrix functions @@ -79,12 +85,11 @@ def load_xglib(): return lib # load the XGBoost library globally -xglib = load_xglib() +_LIB = load_xglib() def ctypes2numpy(cptr, length, dtype): - """ - Convert a ctypes pointer array to a numpy array. + """Convert a ctypes pointer array to a numpy array. """ if not isinstance(cptr, ctypes.POINTER(ctypes.c_float)): raise RuntimeError('expected float pointer') @@ -95,6 +100,7 @@ def ctypes2numpy(cptr, length, dtype): def ctypes2buffer(cptr, length): + """Convert ctypes pointer to buffer type.""" if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)): raise RuntimeError('expected char pointer') res = bytearray(length) @@ -105,14 +111,17 @@ def ctypes2buffer(cptr, length): def c_str(string): + """Convert a python string to cstring.""" return ctypes.c_char_p(string.encode('utf-8')) def c_array(ctype, values): + """Convert a python string to c array.""" return (ctype * len(values))(*values) class DMatrix(object): + """Data Matrix used in XGBoost.""" def __init__(self, data, label=None, missing=0.0, weight=None, silent=False): """ Data matrix used in XGBoost. 
@@ -135,8 +144,8 @@ class DMatrix(object): if data is None: self.handle = None return - if isinstance(data, string_types): - self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), int(silent))) + if isinstance(data, STRING_TYPES): + self.handle = ctypes.c_void_p(_LIB.XGDMatrixCreateFromFile(c_str(data), int(silent))) elif isinstance(data, scipy.sparse.csr_matrix): self._init_from_csr(data) elif isinstance(data, scipy.sparse.csc_matrix): @@ -160,7 +169,7 @@ class DMatrix(object): """ if len(csr.indices) != len(csr.data): raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data))) - self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSR( + self.handle = ctypes.c_void_p(_LIB.XGDMatrixCreateFromCSR( c_array(ctypes.c_ulong, csr.indptr), c_array(ctypes.c_uint, csr.indices), c_array(ctypes.c_float, csr.data), @@ -172,7 +181,7 @@ class DMatrix(object): """ if len(csc.indices) != len(csc.data): raise ValueError('length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data))) - self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSC( + self.handle = ctypes.c_void_p(_LIB.XGDMatrixCreateFromCSC( c_array(ctypes.c_ulong, csc.indptr), c_array(ctypes.c_uint, csc.indices), c_array(ctypes.c_float, csc.data), @@ -183,34 +192,77 @@ class DMatrix(object): Initialize data from a 2-D numpy matrix. """ data = np.array(mat.reshape(mat.size), dtype=np.float32) - self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromMat( + self.handle = ctypes.c_void_p(_LIB.XGDMatrixCreateFromMat( data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), mat.shape[0], mat.shape[1], ctypes.c_float(missing))) def __del__(self): - xglib.XGDMatrixFree(self.handle) + _LIB.XGDMatrixFree(self.handle) def get_float_info(self, field): + """Get float property from the DMatrix. 
+ + Parameters + ---------- + field: str + The field name of the information + + Returns + ------- + info : array + a numpy array of float information of the data + """ length = ctypes.c_ulong() - ret = xglib.XGDMatrixGetFloatInfo(self.handle, c_str(field), ctypes.byref(length)) + ret = _LIB.XGDMatrixGetFloatInfo(self.handle, c_str(field), ctypes.byref(length)) return ctypes2numpy(ret, length.value, np.float32) def get_uint_info(self, field): + """Get unsigned integer property from the DMatrix. + + Parameters + ---------- + field: str + The field name of the information + + Returns + ------- + info : array + a numpy array of float information of the data + """ length = ctypes.c_ulong() - ret = xglib.XGDMatrixGetUIntInfo(self.handle, c_str(field), ctypes.byref(length)) + ret = _LIB.XGDMatrixGetUIntInfo(self.handle, c_str(field), ctypes.byref(length)) return ctypes2numpy(ret, length.value, np.uint32) def set_float_info(self, field, data): - xglib.XGDMatrixSetFloatInfo(self.handle, c_str(field), - c_array(ctypes.c_float, data), len(data)) + """Set float type property into the DMatrix. + + Parameters + ---------- + field: str + The field name of the information + + data: numpy array + The array ofdata to be set + """ + _LIB.XGDMatrixSetFloatInfo(self.handle, c_str(field), + c_array(ctypes.c_float, data), len(data)) def set_uint_info(self, field, data): - xglib.XGDMatrixSetUIntInfo(self.handle, c_str(field), - c_array(ctypes.c_uint, data), len(data)) + """Set uint type property into the DMatrix. + + Parameters + ---------- + field: str + The field name of the information + + data: numpy array + The array ofdata to be set + """ + _LIB.XGDMatrixSetUIntInfo(self.handle, c_str(field), + c_array(ctypes.c_uint, data), len(data)) def save_binary(self, fname, silent=True): - """ - Save DMatrix to an XGBoost buffer. + """Save DMatrix to an XGBoost buffer. 
Parameters ---------- @@ -219,74 +271,74 @@ class DMatrix(object): silent : bool (optional; default: True) If set, the output is suppressed. """ - xglib.XGDMatrixSaveBinary(self.handle, c_str(fname), int(silent)) + _LIB.XGDMatrixSaveBinary(self.handle, c_str(fname), int(silent)) def set_label(self, label): - """set label of dmatrix - Args: - label: list - label for DMatrix - Returns: - None + """Set label of dmatrix + + Parameters + ---------- + label: array like + The label information to be set into DMatrix """ self.set_float_info('label', label) def set_weight(self, weight): - """ - Set weight of each instance. + """ Set weight of each instance. Parameters ---------- - weight : float - Weight for positive instance. + weight : array like + Weight for each data point """ self.set_float_info('weight', weight) def set_base_margin(self, margin): - """ - set base margin of booster to start from - this can be used to specify a prediction value of + """ Set base margin of booster to start from. + + This can be used to specify a prediction value of existing model to be base_margin However, remember margin is needed, instead of transformed prediction e.g. for logistic regression: need to put in value before logistic transformation see also example/demo.py + + Parameters + ---------- + margin: array like + Prediction margin of each datapoint """ self.set_float_info('base_margin', margin) def set_group(self, group): - """ - Set group size of DMatrix (used for ranking). + """Set group size of DMatrix (used for ranking). Parameters ---------- - group : int - Group size. + group : array like + Group size of each group """ - xglib.XGDMatrixSetGroup(self.handle, c_array(ctypes.c_uint, group), len(group)) + _LIB.XGDMatrixSetGroup(self.handle, c_array(ctypes.c_uint, group), len(group)) def get_label(self): - """ - Get the label of the DMatrix. + """Get the label of the DMatrix. 
Returns ------- - label : list + label : array """ return self.get_float_info('label') def get_weight(self): - """ - Get the weight of the DMatrix. + """Get the weight of the DMatrix. Returns ------- - weight : float + weight : array """ return self.get_float_info('weight') def get_base_margin(self): - """ - Get the base margin of the DMatrix. + """Get the base margin of the DMatrix. Returns ------- @@ -295,18 +347,16 @@ class DMatrix(object): return self.get_float_info('base_margin') def num_row(self): - """ - Get the number of rows in the DMatrix. + """Get the number of rows in the DMatrix. Returns ------- number of rows : int """ - return xglib.XGDMatrixNumRow(self.handle) + return _LIB.XGDMatrixNumRow(self.handle) def slice(self, rindex): - """ - Slice the DMatrix and return a new DMatrix that only contains `rindex`. + """Slice the DMatrix and return a new DMatrix that only contains `rindex`. Parameters ---------- @@ -319,13 +369,15 @@ class DMatrix(object): A new DMatrix containing only selected indices. """ res = DMatrix(None) - res.handle = ctypes.c_void_p(xglib.XGDMatrixSliceDMatrix( + res.handle = ctypes.c_void_p(_LIB.XGDMatrixSliceDMatrix( self.handle, c_array(ctypes.c_int, rindex), len(rindex))) return res class Booster(object): + """"A Booster of of XGBoost.""" def __init__(self, params=None, cache=(), model_file=None): + # pylint: disable=invalid-name """ Learner class. 
@@ -342,14 +394,14 @@ class Booster(object): if not isinstance(d, DMatrix): raise TypeError('invalid cache item: {}'.format(type(d).__name__)) dmats = c_array(ctypes.c_void_p, [d.handle for d in cache]) - self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache))) + self.handle = ctypes.c_void_p(_LIB.XGBoosterCreate(dmats, len(cache))) self.set_param({'seed': 0}) self.set_param(params or {}) if model_file is not None: self.load_model(model_file) def __del__(self): - xglib.XGBoosterFree(self.handle) + _LIB.XGBoosterFree(self.handle) def __getstate__(self): # can't pickle ctypes pointers @@ -367,10 +419,10 @@ class Booster(object): if handle is not None: buf = handle dmats = c_array(ctypes.c_void_p, []) - handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, 0)) + handle = ctypes.c_void_p(_LIB.XGBoosterCreate(dmats, 0)) length = ctypes.c_ulong(len(buf)) ptr = (ctypes.c_char * len(buf)).from_buffer(buf) - xglib.XGBoosterLoadModelFromBuffer(handle, ptr, length) + _LIB.XGBoosterLoadModelFromBuffer(handle, ptr, length) state['handle'] = handle self.__dict__.update(state) self.set_param({'seed': 0}) @@ -379,11 +431,10 @@ class Booster(object): return self.__deepcopy__() def __deepcopy__(self): - return Booster(model_file = self.save_raw()) + return Booster(model_file=self.save_raw()) def copy(self): - """ - Copy the booster object + """Copy the booster object. 
Returns -------- @@ -391,15 +442,16 @@ class Booster(object): """ return self.__copy__() - def set_param(self, params, pv=None): + def set_param(self, params, value=None): + """Set parameters into the DMatrix.""" if isinstance(params, collections.Mapping): params = params.items() - elif isinstance(params, string_types) and pv is not None: - params = [(params, pv)] - for k, v in params: - xglib.XGBoosterSetParam(self.handle, c_str(k), c_str(str(v))) + elif isinstance(params, STRING_TYPES) and value is not None: + params = [(params, value)] + for key, val in params: + _LIB.XGBoosterSetParam(self.handle, c_str(key), c_str(str(val))) - def update(self, dtrain, it, fobj=None): + def update(self, dtrain, iteration, fobj=None): """ Update (one iteration). @@ -407,7 +459,7 @@ class Booster(object): ---------- dtrain : DMatrix Training data. - it : int + iteration : int Current iteration number. fobj : function Customized objective function. @@ -415,7 +467,7 @@ class Booster(object): if not isinstance(dtrain, DMatrix): raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__)) if fobj is None: - xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle) + _LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle) else: pred = self.predict(dtrain) grad, hess = fobj(pred, dtrain) @@ -438,20 +490,20 @@ class Booster(object): raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess))) if not isinstance(dtrain, DMatrix): raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__)) - xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle, - c_array(ctypes.c_float, grad), - c_array(ctypes.c_float, hess), - len(grad)) + _LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle, + c_array(ctypes.c_float, grad), + c_array(ctypes.c_float, hess), + len(grad)) - def eval_set(self, evals, it=0, feval=None): - """ - Evaluate by a metric. 
+ def eval_set(self, evals, iteration=0, feval=None): + # pylint: disable=invalid-name + """Evaluate a set of data. Parameters ---------- evals : list of tuples (DMatrix, string) List of items to be evaluated. - it : int + iteration : int Current iteration. feval : function Custom evaluation function. @@ -464,20 +516,35 @@ class Booster(object): for d in evals: if not isinstance(d[0], DMatrix): raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__)) - if not isinstance(d[1], string_types): + if not isinstance(d[1], STRING_TYPES): raise TypeError('expected string, got {}'.format(type(d[1]).__name__)) dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals]) evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals]) - return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals)) + return _LIB.XGBoosterEvalOneIter(self.handle, iteration, dmats, evnames, len(evals)) else: - res = '[%d]' % it - for dm, evname in evals: - name, val = feval(self.predict(dm), dm) + res = '[%d]' % iteration + for dmat, evname in evals: + name, val = feval(self.predict(dmat), dmat) res += '\t%s-%s:%f' % (evname, name, val) return res - def eval(self, mat, name='eval', it=0): - return self.eval_set([(mat, name)], it) + def eval(self, data, name='eval', iteration=0): + """Evaluate the model on mat. + + + Parameters + --------- + data : DMatrix + The dmatrix storing the input. + + name : str (default = 'eval') + The name of the dataset + + + iteration : int (default = 0) + The current iteration number + """ + return self.eval_set([(data, name)], iteration) def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False): """ @@ -492,10 +559,13 @@ class Booster(object): ---------- data : DMatrix The dmatrix storing the input. + output_margin : bool Whether to output the raw untransformed margin value. + ntree_limit : int Limit number of trees in the prediction; defaults to 0 (use all trees). 
+ pred_leaf : bool When this option is on, the output will be a matrix of (nsample, ntrees) with each record indicating the predicted leaf index of each sample in each tree. @@ -512,8 +582,8 @@ class Booster(object): if pred_leaf: option_mask |= 0x02 length = ctypes.c_ulong() - preds = xglib.XGBoosterPredict(self.handle, data.handle, - option_mask, ntree_limit, ctypes.byref(length)) + preds = _LIB.XGBoosterPredict(self.handle, data.handle, + option_mask, ntree_limit, ctypes.byref(length)) preds = ctypes2numpy(preds, length.value, np.float32) if pred_leaf: preds = preds.astype(np.int32) @@ -531,8 +601,8 @@ class Booster(object): fname : string Output file name """ - if isinstance(fname, string_types): # assume file name - xglib.XGBoosterSaveModel(self.handle, c_str(fname)) + if isinstance(fname, STRING_TYPES): # assume file name + _LIB.XGBoosterSaveModel(self.handle, c_str(fname)) else: raise TypeError("fname must be a string") @@ -545,8 +615,8 @@ class Booster(object): a in memory buffer represetation of the model """ length = ctypes.c_ulong() - cptr = xglib.XGBoosterGetModelRaw(self.handle, - ctypes.byref(length)) + cptr = _LIB.XGBoosterGetModelRaw(self.handle, + ctypes.byref(length)) return ctypes2buffer(cptr, length.value) def load_model(self, fname): @@ -559,59 +629,63 @@ class Booster(object): Input file name or memory buffer(see also save_raw) """ if isinstance(fname, str): # assume file name - xglib.XGBoosterLoadModel(self.handle, c_str(fname)) + _LIB.XGBoosterLoadModel(self.handle, c_str(fname)) else: buf = fname length = ctypes.c_ulong(len(buf)) ptr = (ctypes.c_char * len(buf)).from_buffer(buf) - xglib.XGBoosterLoadModelFromBuffer(self.handle, ptr, length) + _LIB.XGBoosterLoadModelFromBuffer(self.handle, ptr, length) - def dump_model(self, fo, fmap='', with_stats=False): + def dump_model(self, fout, fmap='', with_stats=False): """ Dump model into a text file. Parameters ---------- - fo : string + foout : string Output file name. 
fmap : string, optional Name of the file containing feature map names. with_stats : bool (optional) Controls whether the split statistics are output. """ - if isinstance(fo, string_types): - fo = open(fo, 'w') + if isinstance(fout, STRING_TYPES): + fout = open(fout, 'w') need_close = True else: need_close = False ret = self.get_dump(fmap, with_stats) for i in range(len(ret)): - fo.write('booster[{}]:\n'.format(i)) - fo.write(ret[i]) + fout.write('booster[{}]:\n'.format(i)) + fout.write(ret[i]) if need_close: - fo.close() + fout.close() def get_dump(self, fmap='', with_stats=False): """ Returns the dump the model as a list of strings. """ length = ctypes.c_ulong() - sarr = xglib.XGBoosterDumpModel(self.handle, c_str(fmap), - int(with_stats), ctypes.byref(length)) + sarr = _LIB.XGBoosterDumpModel(self.handle, c_str(fmap), + int(with_stats), ctypes.byref(length)) res = [] for i in range(length.value): res.append(str(sarr[i].decode('ascii'))) return res def get_fscore(self, fmap=''): - """ - Get feature importance of each feature. + """Get feature importance of each feature. + + Parameters + ---------- + fmap: str (optional) + The name of feature map file """ trees = self.get_dump(fmap) fmap = {} for tree in trees: - for l in tree.split('\n'): - arr = l.split('[') + for line in tree.split('\n'): + arr = line.split('[') if len(arr) == 1: continue fid = arr[1].split(']')[0] @@ -624,9 +698,9 @@ class Booster(object): def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, - early_stopping_rounds=None,evals_result=None): - """ - Train a booster with given parameters. + early_stopping_rounds=None, evals_result=None): + # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init + """Train a booster with given parameters. 
Parameters ---------- @@ -663,7 +737,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, bst = Booster(params, [dtrain] + [d[0] for d in evals]) if evals_result is not None: - if type(evals_result) is not dict: + if not isinstance(evals_result, dict): raise TypeError('evals_result has to be a dictionary') else: evals_name = [d[1] for d in evals] @@ -675,37 +749,38 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, bst.update(dtrain, i, obj) if len(evals) != 0: bst_eval_set = bst.eval_set(evals, i, feval) - if isinstance(bst_eval_set, string_types): + if isinstance(bst_eval_set, STRING_TYPES): msg = bst_eval_set else: msg = bst_eval_set.decode() sys.stderr.write(msg + '\n') if evals_result is not None: - res = re.findall(":([0-9.]+).",msg) - for key,val in zip(evals_name,res): + res = re.findall(":([0-9.]+).", msg) + for key, val in zip(evals_name, res): evals_result[key].append(val) return bst else: # early stopping - if len(evals) < 1: raise ValueError('For early stopping you need at least one set in evals.') - sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(evals[-1][1], early_stopping_rounds)) + sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\ + evals[-1][1], early_stopping_rounds)) # is params a list of tuples? are we using multiple eval metrics? - if type(params) == list: + if isinstance(params, list): if len(params) != len(dict(params).items()): - raise ValueError('Check your params. 
Early stopping works with single eval metric only.') + raise ValueError('Check your params.'\ + 'Early stopping works with single eval metric only.') params = dict(params) # either minimize loss or maximize AUC/MAP/NDCG maximize_score = False if 'eval_metric' in params: maximize_metrics = ('auc', 'map', 'ndcg') - if list(filter(lambda x: params['eval_metric'].startswith(x), maximize_metrics)): + if any(params['eval_metric'].startswith(x) for x in maximize_metrics): maximize_score = True if maximize_score: @@ -720,7 +795,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, bst.update(dtrain, i, obj) bst_eval_set = bst.eval_set(evals, i, feval) - if isinstance(bst_eval_set, string_types): + if isinstance(bst_eval_set, STRING_TYPES): msg = bst_eval_set else: msg = bst_eval_set.decode() @@ -728,8 +803,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, sys.stderr.write(msg + '\n') if evals_result is not None: - res = re.findall(":([0-9.]+).",msg) - for key,val in zip(evals_name,res): + res = re.findall(":([0-9.]+).", msg) + for key, val in zip(evals_name, res): evals_result[key].append(val) score = float(msg.rsplit(':', 1)[1]) @@ -748,17 +823,21 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, return bst class CVPack(object): + """"Auxiliary datastruct to hold one fold of CV.""" def __init__(self, dtrain, dtest, param): + """"Initialize the CVPack""" self.dtrain = dtrain self.dtest = dtest self.watchlist = [(dtrain, 'train'), (dtest, 'test')] self.bst = Booster(param, [dtrain, dtest]) - def update(self, r, fobj): - self.bst.update(self.dtrain, r, fobj) + def update(self, iteration, fobj): + """"Update the boosters for one iteration""" + self.bst.update(self.dtrain, iteration, fobj) - def eval(self, r, feval): - return self.bst.eval_set(self.watchlist, r, feval) + def eval(self, iteration, feval): + """"Evaluate the CVPack for one iteration.""" + return 
self.bst.eval_set(self.watchlist, iteration, feval) def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None): @@ -785,6 +864,7 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None): def aggcv(rlist, show_stdv=True): + # pylint: disable=invalid-name """ Aggregate cross-validation results. """ @@ -794,7 +874,7 @@ def aggcv(rlist, show_stdv=True): arr = line.split() assert ret == arr[0] for it in arr[1:]: - if not isinstance(it, string_types): + if not isinstance(it, STRING_TYPES): it = it.decode() k, v = it.split(':') if k not in cvmap: @@ -802,7 +882,7 @@ def aggcv(rlist, show_stdv=True): cvmap[k].append(float(v)) for k, v in sorted(cvmap.items(), key=lambda x: x[0]): v = np.array(v) - if not isinstance(ret, string_types): + if not isinstance(ret, STRING_TYPES): ret = ret.decode() if show_stdv: ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v)) @@ -813,8 +893,8 @@ def aggcv(rlist, show_stdv=True): def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), obj=None, feval=None, fpreproc=None, show_stdv=True, seed=0): - """ - Cross-validation with given paramaters. + # pylint: disable = invalid-name + """Cross-validation with given paramaters. 
Parameters ---------- @@ -847,8 +927,8 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), results = [] cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc) for i in range(num_boost_round): - for f in cvfolds: - f.update(i, obj) + for fold in cvfolds: + fold.update(i, obj) res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv) sys.stderr.write(res + '\n') results.append(res) @@ -857,16 +937,16 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), # used for compatiblity without sklearn XGBModelBase = object -XGBClassifier = object -XGBRegressor = object +XGBClassifierBase = object +XGBRegressorBase = object if SKLEARN_INSTALLED: XGBModelBase = BaseEstimator - XGBRegressor = RegressorMixin - XGBClassifier = ClassifierMixin + XGBRegressorBase = RegressorMixin + XGBClassifierBase = ClassifierMixin class XGBModel(XGBModelBase): - """ - Implementation of the Scikit-Learn API for XGBoost. + # pylint: disable=too-many-arguments, too-many-instance-attributes, invalid-name + """Implementation of the Scikit-Learn API for XGBoost. Parameters ---------- @@ -902,8 +982,10 @@ class XGBModel(XGBModelBase): Value in the data which needs to be present as a missing value. If None, defaults to np.nan. 
""" - def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="reg:linear", - nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, + def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, + silent=True, objective="reg:linear", + nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, + subsample=1, colsample_bytree=1, base_score=0.5, seed=0, missing=None): if not SKLEARN_INSTALLED: raise XGBoostError('sklearn needs to be installed in order to use this module') @@ -923,7 +1005,6 @@ class XGBModel(XGBModelBase): self.base_score = base_score self.seed = seed self.missing = missing if missing is not None else np.nan - self._Booster = None def __setstate__(self, state): @@ -936,9 +1017,9 @@ class XGBModel(XGBModelBase): self.__dict__.update(state) def booster(self): - """ - get the underlying xgboost Booster of this model - will raise an exception when fit was not called + """Get the underlying xgboost Booster of this model. + + This will raise an exception when fit was not called Returns ------- @@ -949,12 +1030,14 @@ class XGBModel(XGBModelBase): return self._Booster def get_params(self, deep=False): + """Get parameter.s""" params = super(XGBModel, self).get_params(deep=deep) if params['missing'] is np.nan: params['missing'] = None # sklearn doesn't handle nan. 
see #4725 return params def get_xgb_params(self): + """Get xgboost type parameters.""" xgb_params = self.get_params() xgb_params['silent'] = 1 if self.silent else 0 @@ -963,30 +1046,39 @@ class XGBModel(XGBModelBase): xgb_params.pop('nthread', None) return xgb_params - def fit(self, X, y): - trainDmatrix = DMatrix(X, label=y, missing=self.missing) - self._Booster = train(self.get_xgb_params(), trainDmatrix, self.n_estimators) + def fit(self, data, y): + # pylint: disable=missing-docstring,invalid-name + train_dmatrix = DMatrix(data, label=y, missing=self.missing) + self._Booster = train(self.get_xgb_params(), train_dmatrix, self.n_estimators) return self - def predict(self, X): - testDmatrix = DMatrix(X, missing=self.missing) - return self.booster().predict(testDmatrix) + def predict(self, data): + # pylint: disable=missing-docstring,invalid-name + test_dmatrix = DMatrix(data, missing=self.missing) + return self.booster().predict(test_dmatrix) -class XGBClassifier(XGBModel, XGBClassifier): +class XGBClassifier(XGBModel, XGBClassifierBase): + # pylint: disable=missing-docstring,too-many-arguments,invalid-name __doc__ = """ Implementation of the scikit-learn API for XGBoost classification """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) - def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="binary:logistic", - nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, + def __init__(self, max_depth=3, learning_rate=0.1, + n_estimators=100, silent=True, + objective="binary:logistic", + nthread=-1, gamma=0, min_child_weight=1, + max_delta_step=0, subsample=1, colsample_bytree=1, base_score=0.5, seed=0, missing=None): - super(XGBClassifier, self).__init__(max_depth, learning_rate, n_estimators, silent, objective, - nthread, gamma, min_child_weight, max_delta_step, subsample, + super(XGBClassifier, self).__init__(max_depth, learning_rate, + n_estimators, silent, objective, + nthread, gamma, 
min_child_weight, + max_delta_step, subsample, colsample_bytree, base_score, seed, missing) def fit(self, X, y, sample_weight=None): + # pylint: disable = attribute-defined-outside-init,arguments-differ self.classes_ = list(np.unique(y)) self.n_classes_ = len(self.classes_) if self.n_classes_ > 2: @@ -1001,29 +1093,29 @@ class XGBClassifier(XGBModel, XGBClassifier): training_labels = self._le.transform(y) if sample_weight is not None: - trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight, - missing=self.missing) + train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight, + missing=self.missing) else: - trainDmatrix = DMatrix(X, label=training_labels, - missing=self.missing) + train_dmatrix = DMatrix(X, label=training_labels, + missing=self.missing) - self._Booster = train(xgb_options, trainDmatrix, self.n_estimators) + self._Booster = train(xgb_options, train_dmatrix, self.n_estimators) return self - def predict(self, X): - testDmatrix = DMatrix(X, missing=self.missing) - class_probs = self.booster().predict(testDmatrix) + def predict(self, data): + test_dmatrix = DMatrix(data, missing=self.missing) + class_probs = self.booster().predict(test_dmatrix) if len(class_probs.shape) > 1: column_indexes = np.argmax(class_probs, axis=1) else: - column_indexes = np.repeat(0, X.shape[0]) + column_indexes = np.repeat(0, data.shape[0]) column_indexes[class_probs > 0.5] = 1 return self._le.inverse_transform(column_indexes) - def predict_proba(self, X): - testDmatrix = DMatrix(X, missing=self.missing) - class_probs = self.booster().predict(testDmatrix) + def predict_proba(self, data): + test_dmatrix = DMatrix(data, missing=self.missing) + class_probs = self.booster().predict(test_dmatrix) if self.objective == "multi:softprob": return class_probs else: @@ -1031,9 +1123,8 @@ class XGBClassifier(XGBModel, XGBClassifier): classzero_probs = 1.0 - classone_probs return np.vstack((classzero_probs, classone_probs)).transpose() -class 
XGBRegressor(XGBModel, XGBRegressor): +class XGBRegressor(XGBModel, XGBRegressorBase): + # pylint: disable=missing-docstring __doc__ = """ Implementation of the scikit-learn API for XGBoost regression """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) - - pass