""" xgboost: eXtreme Gradient Boosting library Authors: Tianqi Chen, Bing Xu """ from __future__ import absolute_import import os import sys import ctypes import collections import numpy as np import scipy.sparse try: from sklearn.base import BaseEstimator from sklearn.preprocessing import LabelEncoder SKLEARN_INSTALLED = True except ImportError: SKLEARN_INSTALLED = False __all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train'] if sys.version_info[0] == 3: string_types = str, else: string_types = basestring, def load_xglib(): dll_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) if os.name == 'nt': dll_path = os.path.join(dll_path, '../windows/x64/Release/xgboost_wrapper.dll') else: dll_path = os.path.join(dll_path, 'libxgboostwrapper.so') # load the xgboost wrapper library lib = ctypes.cdll.LoadLibrary(dll_path) # DMatrix functions lib.XGDMatrixCreateFromFile.restype = ctypes.c_void_p lib.XGDMatrixCreateFromCSR.restype = ctypes.c_void_p lib.XGDMatrixCreateFromCSC.restype = ctypes.c_void_p lib.XGDMatrixCreateFromMat.restype = ctypes.c_void_p lib.XGDMatrixSliceDMatrix.restype = ctypes.c_void_p lib.XGDMatrixGetFloatInfo.restype = ctypes.POINTER(ctypes.c_float) lib.XGDMatrixGetUIntInfo.restype = ctypes.POINTER(ctypes.c_uint) lib.XGDMatrixNumRow.restype = ctypes.c_ulong # Booster functions lib.XGBoosterCreate.restype = ctypes.c_void_p lib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float) lib.XGBoosterEvalOneIter.restype = ctypes.c_char_p lib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p) return lib # load the XGBoost library globally xglib = load_xglib() def ctypes2numpy(cptr, length, dtype): """ Convert a ctypes pointer array to a numpy array. """ if not isinstance(cptr, ctypes.POINTER(ctypes.c_float)): raise RuntimeError('expected float pointer') res = np.zeros(length, dtype=dtype) if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]): raise RuntimeError('memmove failed') return res def c_str(string): return ctypes.c_char_p(string.encode('utf-8')) def c_array(ctype, values): return (ctype * len(values))(*values) class DMatrix(object): def __init__(self, data, label=None, missing=0.0, weight=None): """ Data matrix used in XGBoost. Parameters ---------- data : string/numpy array/scipy.sparse Data source, string type is the path of svmlight format txt file or xgb buffer. label : list or numpy 1-D array (optional) Label of the training data. missing : float Value in the data which needs to be present as a missing value. weight : list or numpy 1-D array (optional) Weight for each instance. """ # force into void_p, mac need to pass things in as void_p if data is None: self.handle = None return if isinstance(data, string_types): self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), 0)) elif isinstance(data, scipy.sparse.csr_matrix): self._init_from_csr(data) elif isinstance(data, scipy.sparse.csc_matrix): self._init_from_csc(data) elif isinstance(data, np.ndarray) and len(data.shape) == 2: self._init_from_npy2d(data, missing) else: try: csr = scipy.sparse.csr_matrix(data) self._init_from_csr(csr) except: raise TypeError('can not intialize DMatrix from {}'.format(type(data).__name__)) if label is not None: self.set_label(label) if weight is not None: self.set_weight(weight) def _init_from_csr(self, csr): """ Initialize data from a CSR matrix. 
""" if len(csr.indices) != len(csr.data): raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data))) self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSR( c_array(ctypes.c_ulong, csr.indptr), c_array(ctypes.c_uint, csr.indices), c_array(ctypes.c_float, csr.data), len(csr.indptr), len(csr.data))) def _init_from_csc(self, csc): """ Initialize data from a CSC matrix. """ if len(csc.indices) != len(csc.data): raise ValueError('length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data))) self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSC( c_array(ctypes.c_ulong, csc.indptr), c_array(ctypes.c_uint, csc.indices), c_array(ctypes.c_float, csc.data), len(csc.indptr), len(csc.data))) def _init_from_npy2d(self, mat, missing): """ Initialize data from a 2-D numpy matrix. """ data = np.array(mat.reshape(mat.size), dtype=np.float32) self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromMat( data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), mat.shape[0], mat.shape[1], ctypes.c_float(missing))) def __del__(self): xglib.XGDMatrixFree(self.handle) def get_float_info(self, field): length = ctypes.c_ulong() ret = xglib.XGDMatrixGetFloatInfo(self.handle, c_str(field), ctypes.byref(length)) return ctypes2numpy(ret, length.value, np.float32) def get_uint_info(self, field): length = ctypes.c_ulong() ret = xglib.XGDMatrixGetUIntInfo(self.handle, c_str(field), ctypes.byref(length)) return ctypes2numpy(ret, length.value, np.uint32) def set_float_info(self, field, data): xglib.XGDMatrixSetFloatInfo(self.handle, c_str(field), c_array(ctypes.c_float, data), len(data)) def set_uint_info(self, field, data): xglib.XGDMatrixSetUIntInfo(self.handle, c_str(field), c_array(ctypes.c_uint, data), len(data)) def save_binary(self, fname, silent=True): """ Save DMatrix to an XGBoost buffer. Parameters ---------- fname : string Name of the output buffer file. silent : bool (optional; default: True) If set, the output is suppressed. """ xglib.XGDMatrixSaveBinary(self.handle, c_str(fname), int(silent)) def set_label(self, label): """set label of dmatrix Args: label: list label for DMatrix Returns: None """ self.set_float_info('label', label) def set_weight(self, weight): """ Set weight of each instance. Parameters ---------- weight : float Weight for positive instance. """ self.set_float_info('weight', weight) def set_base_margin(self, margin): """ set base margin of booster to start from this can be used to specify a prediction value of existing model to be base_margin However, remember margin is needed, instead of transformed prediction e.g. for logistic regression: need to put in value before logistic transformation see also example/demo.py """ self.set_float_info('base_margin', margin) def set_group(self, group): """ Set group size of DMatrix (used for ranking). Parameters ---------- group : int Group size. """ xglib.XGDMatrixSetGroup(self.handle, c_array(ctypes.c_uint, group), len(group)) def get_label(self): """ Get the label of the DMatrix. Returns ------- label : list """ return self.get_float_info('label') def get_weight(self): """ Get the weight of the DMatrix. Returns ------- weight : float """ return self.get_float_info('weight') def get_base_margin(self): """ Get the base margin of the DMatrix. Returns ------- base_margin : float """ return self.get_float_info('base_margin') def num_row(self): """ Get the number of rows in the DMatrix. 

    def num_row(self):
        """
        Get the number of rows in the DMatrix.

        Returns
        -------
        number of rows : int
        """
        return xglib.XGDMatrixNumRow(self.handle)

    def slice(self, rindex):
        """
        Slice the DMatrix and return a new DMatrix that only contains `rindex`.

        Parameters
        ----------
        rindex : list
            List of indices to be selected.

        Returns
        -------
        res : DMatrix
            A new DMatrix containing only the selected indices.
        """
        res = DMatrix(None)
        res.handle = ctypes.c_void_p(xglib.XGDMatrixSliceDMatrix(
            self.handle, c_array(ctypes.c_int, rindex), len(rindex)))
        return res


class Booster(object):
    def __init__(self, params=None, cache=(), model_file=None):
        """
        Learner class.

        Parameters
        ----------
        params : dict
            Parameters for boosters.
        cache : list
            List of cache items.
        model_file : string
            Path to the model file.
        """
        for d in cache:
            if not isinstance(d, DMatrix):
                raise TypeError('invalid cache item: {}'.format(type(d).__name__))
        dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
        self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache)))
        self.set_param({'seed': 0})
        self.set_param(params or {})
        if model_file is not None:
            self.load_model(model_file)

    def __del__(self):
        xglib.XGBoosterFree(self.handle)

    def set_param(self, params, pv=None):
        if isinstance(params, collections.Mapping):
            params = params.items()
        elif isinstance(params, string_types) and pv is not None:
            params = [(params, pv)]
        for k, v in params:
            xglib.XGBoosterSetParam(self.handle, c_str(k), c_str(str(v)))

    def update(self, dtrain, it, fobj=None):
        """
        Update (one iteration).

        Parameters
        ----------
        dtrain : DMatrix
            Training data.
        it : int
            Current iteration number.
        fobj : function
            Customized objective function.
        """
        if not isinstance(dtrain, DMatrix):
            raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
        if fobj is None:
            xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle)
        else:
            pred = self.predict(dtrain)
            grad, hess = fobj(pred, dtrain)
            self.boost(dtrain, grad, hess)

    def boost(self, dtrain, grad, hess):
        """
        Boost the booster for one iteration with customized gradient statistics.

        Parameters
        ----------
        dtrain : DMatrix
            The training DMatrix.
        grad : list
            The first order of gradient.
        hess : list
            The second order of gradient.
        """
        if len(grad) != len(hess):
            raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
        if not isinstance(dtrain, DMatrix):
            raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
        xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle,
                                    c_array(ctypes.c_float, grad),
                                    c_array(ctypes.c_float, hess),
                                    len(grad))

    def eval_set(self, evals, it=0, feval=None):
        """
        Evaluate a set of data by a metric.

        Parameters
        ----------
        evals : list of tuples (DMatrix, string)
            List of items to be evaluated.
        it : int
            Current iteration.
        feval : function
            Custom evaluation function.

        Returns
        -------
        result : string
            Evaluation result string.
        """
        if feval is None:
            for d in evals:
                if not isinstance(d[0], DMatrix):
                    raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__))
                if not isinstance(d[1], string_types):
                    raise TypeError('expected string, got {}'.format(type(d[1]).__name__))
            dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals])
            evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
            return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals))
        else:
            res = '[%d]' % it
            for dm, evname in evals:
                name, val = feval(self.predict(dm), dm)
                res += '\t%s-%s:%f' % (evname, name, val)
            return res

    def eval(self, mat, name='eval', it=0):
        return self.eval_set([(mat, name)], it)
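    # Example (comment-only sketch; the objective below is an illustrative
    # squared-error gradient, not shipped with the library): update() with a
    # custom fobj routes through boost() with per-instance grad/hess pairs.
    #
    #   def squared_error(preds, dtrain):
    #       labels = dtrain.get_label()
    #       return preds - labels, np.ones(len(preds))
    #
    #   bst.update(dtrain, it=0, fobj=squared_error)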

    def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False):
        """
        Predict with data.

        Parameters
        ----------
        data : DMatrix
            The dmatrix storing the input.
        output_margin : bool
            Whether to output the raw untransformed margin value.
        ntree_limit : int
            Limit the number of trees used in the prediction; defaults to 0
            (use all trees).
        pred_leaf : bool
            When this option is on, the output will be a matrix of
            (nsample, ntrees), with each record indicating the predicted
            leaf index of each sample in each tree. Note that the leaf index
            of a tree is unique per tree, so you may find leaf 1 in both
            tree 1 and tree 0.

        Returns
        -------
        prediction : numpy array
        """
        option_mask = 0x00
        if output_margin:
            option_mask |= 0x01
        if pred_leaf:
            option_mask |= 0x02
        length = ctypes.c_ulong()
        preds = xglib.XGBoosterPredict(self.handle, data.handle,
                                       option_mask, ntree_limit,
                                       ctypes.byref(length))
        preds = ctypes2numpy(preds, length.value, np.float32)
        if pred_leaf:
            preds = preds.astype(np.int32)
        nrow = data.num_row()
        if preds.size != nrow and preds.size % nrow == 0:
            # integer division: reshape to (nrow, ngroup) when the output
            # carries multiple values per row
            preds = preds.reshape(nrow, preds.size // nrow)
        return preds

    def save_model(self, fname):
        """
        Save the model to a file.

        Parameters
        ----------
        fname : string
            Output file name.
        """
        xglib.XGBoosterSaveModel(self.handle, c_str(fname))

    def load_model(self, fname):
        """
        Load the model from a file.

        Parameters
        ----------
        fname : string
            Input file name.
        """
        xglib.XGBoosterLoadModel(self.handle, c_str(fname))

    def dump_model(self, fo, fmap='', with_stats=False):
        """
        Dump the model into a text file.

        Parameters
        ----------
        fo : string
            Output file name.
        fmap : string, optional
            Name of the file containing feature map names.
        with_stats : bool (optional)
            Controls whether the split statistics are output.
        """
        if isinstance(fo, string_types):
            fo = open(fo, 'w')
            need_close = True
        else:
            need_close = False
        ret = self.get_dump(fmap, with_stats)
        for i in range(len(ret)):
            fo.write('booster[{}]:\n'.format(i))
            fo.write(ret[i])
        if need_close:
            fo.close()

    def get_dump(self, fmap='', with_stats=False):
        """Return the model dump as a list of strings."""
        length = ctypes.c_ulong()
        sarr = xglib.XGBoosterDumpModel(self.handle, c_str(fmap),
                                        int(with_stats), ctypes.byref(length))
        res = []
        for i in range(length.value):
            res.append(str(sarr[i]))
        return res

    def get_fscore(self, fmap=''):
        """Get the feature importance (split count) of each feature."""
        trees = self.get_dump(fmap)
        fmap_cnt = {}
        for tree in trees:
            for l in tree.split('\n'):
                arr = l.split('[')
                if len(arr) == 1:
                    continue
                # feature id sits between '[' and the '<' of the split condition
                fid = arr[1].split(']')[0]
                fid = fid.split('<')[0]
                if fid not in fmap_cnt:
                    fmap_cnt[fid] = 1
                else:
                    fmap_cnt[fid] += 1
        return fmap_cnt
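
# Example (comment-only sketch; `bst` and `dtest` are assumed to exist):
# the prediction option mask combines flags, so margin output and leaf
# prediction can be requested independently.
#
#   margins = bst.predict(dtest, output_margin=True)   # raw scores
#   leaves = bst.predict(dtest, pred_leaf=True)        # (nrow, ntrees) leaf ids
#   counts = bst.get_fscore()                          # e.g. {'f3': 12, ...}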

def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None):
    """
    Train a booster with given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : DMatrix
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    evals : list of pairs (DMatrix, string)
        List of items to be evaluated during training; this allows the user
        to watch performance on the validation set.
    obj : function
        Customized objective function.
    feval : function
        Customized evaluation function.

    Returns
    -------
    booster : a trained booster model
    """
    evals = list(evals)
    bst = Booster(params, [dtrain] + [d[0] for d in evals])
    for i in range(num_boost_round):
        bst.update(dtrain, i, obj)
        if len(evals) != 0:
            bst_eval_set = bst.eval_set(evals, i, feval)
            if isinstance(bst_eval_set, string_types):
                sys.stderr.write(bst_eval_set + '\n')
            else:
                sys.stderr.write(bst_eval_set.decode() + '\n')
    return bst


class CVPack(object):
    def __init__(self, dtrain, dtest, param):
        self.dtrain = dtrain
        self.dtest = dtest
        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
        self.bst = Booster(param, [dtrain, dtest])

    def update(self, r, fobj):
        self.bst.update(self.dtrain, r, fobj)

    def eval(self, r, feval):
        return self.bst.eval_set(self.watchlist, r, feval)


def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None):
    """Make an n-fold list of CVPack from random indices."""
    evals = list(evals)
    np.random.seed(seed)
    randidx = np.random.permutation(dall.num_row())
    kstep = len(randidx) // nfold
    idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)]
             for i in range(nfold)]
    ret = []
    for k in range(nfold):
        dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i]))
        dtest = dall.slice(idset[k])
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
        else:
            tparam = param
        plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals]
        ret.append(CVPack(dtrain, dtest, plst))
    return ret


def aggcv(rlist, show_stdv=True):
    """Aggregate cross-validation results."""
    cvmap = {}
    ret = rlist[0].split()[0]
    for line in rlist:
        arr = line.split()
        assert ret == arr[0]
        for it in arr[1:]:
            if not isinstance(it, string_types):
                it = it.decode()
            k, v = it.split(':')
            if k not in cvmap:
                cvmap[k] = []
            cvmap[k].append(float(v))
    for k, v in sorted(cvmap.items(), key=lambda x: x[0]):
        v = np.array(v)
        if not isinstance(ret, string_types):
            ret = ret.decode()
        if show_stdv:
            ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
        else:
            ret += '\tcv-%s:%f' % (k, np.mean(v))
    return ret


def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), obj=None,
       feval=None, fpreproc=None, show_stdv=True, seed=0):
    """
    Cross-validation with given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : DMatrix
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    nfold : int
        Number of folds in CV.
    metrics : list of strings
        Evaluation metrics to be watched in CV.
    obj : function
        Custom objective function.
    feval : function
        Custom evaluation function.
    fpreproc : function
        Preprocessing function that takes (dtrain, dtest, param) and returns
        transformed versions of those.
    show_stdv : bool
        Whether to display the standard deviation.
    seed : int
        Seed used to generate the folds (passed to numpy.random.seed).

    Returns
    -------
    evaluation history : list(string)
    """
    results = []
    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
    for i in range(num_boost_round):
        for f in cvfolds:
            f.update(i, obj)
        res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv)
        sys.stderr.write(res + '\n')
        results.append(res)
    return results


XGBModelBase = object
if SKLEARN_INSTALLED:
    XGBModelBase = BaseEstimator
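
# Example (comment-only sketch; the parameter values are illustrative):
# running 3-fold cross-validation and reading back the per-round result
# strings produced by aggcv().
#
#   history = cv({'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'},
#                dtrain, num_boost_round=5, nfold=3, metrics=['error'])
#   # each entry looks like '[0]\tcv-error:0.021694+0.001520'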

class XGBModel(XGBModelBase):
    """
    Implementation of the scikit-learn API for XGBoost.

    Parameters
    ----------
    max_depth : int
        Maximum tree depth for base learners.
    learning_rate : float
        Boosting learning rate (xgb's "eta").
    n_estimators : int
        Number of boosted trees to fit.
    silent : boolean
        Whether to print messages while running boosting.
    """
    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
                 silent=True, objective="reg:linear"):
        if not SKLEARN_INSTALLED:
            raise ImportError('sklearn needs to be installed in order to use this module')
        self.max_depth = max_depth
        self.eta = learning_rate
        self.silent = 1 if silent else 0
        self.n_rounds = n_estimators
        self.objective = objective
        self._Booster = Booster()

    def get_params(self, deep=True):
        return {'max_depth': self.max_depth,
                'learning_rate': self.eta,
                'n_estimators': self.n_rounds,
                'silent': True if self.silent == 1 else False,
                'objective': self.objective}

    def get_xgb_params(self):
        return {'eta': self.eta,
                'max_depth': self.max_depth,
                'silent': self.silent,
                'objective': self.objective}

    def fit(self, X, y):
        trainDmatrix = DMatrix(X, label=y)
        self._Booster = train(self.get_xgb_params(), trainDmatrix, self.n_rounds)
        return self


class XGBClassifier(XGBModel):
    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True):
        super(XGBClassifier, self).__init__(max_depth, learning_rate, n_estimators,
                                            silent, objective="binary:logistic")

    def fit(self, X, y, sample_weight=None):
        y_values = list(np.unique(y))
        if len(y_values) == 2:
            # Map the two classes in the y vector into {0, 1}, and record the
            # mapping so that the predict() method can return results in the
            # original range
            if not ((-1 in y_values and 1 in y_values) or
                    (0 in y_values and 1 in y_values) or
                    (True in y_values and False in y_values)):
                raise ValueError("For a binary classifier, y must be in (0,1), or (-1,1), or (True,False).")
            if -1 in y_values:
                self._yspace = "svm_like"
                training_labels = y.copy()
                training_labels[training_labels == -1] = 0
            elif False in y_values:
                self._yspace = "boolean"
                training_labels = np.array(y, dtype=int)
            else:
                self._yspace = "zero_one"
                training_labels = y
            xgb_options = self.get_xgb_params()
        else:
            # Switch to using a multiclass objective in the underlying XGB instance
            self._yspace = "multiclass"
            self.objective = "multi:softprob"
            self._le = LabelEncoder().fit(y)
            training_labels = self._le.transform(y)
            xgb_options = self.get_xgb_params()
            xgb_options['num_class'] = len(y_values)
        if sample_weight is not None:
            trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight)
        else:
            trainDmatrix = DMatrix(X, label=training_labels)
        self._Booster = train(xgb_options, trainDmatrix, self.n_rounds)
        return self

    def predict(self, X):
        testDmatrix = DMatrix(X)
        class_probs = self._Booster.predict(testDmatrix)
        if self._yspace == "multiclass":
            column_indexes = np.argmax(class_probs, axis=1)
            fitted_values = self._le.inverse_transform(column_indexes)
        else:
            if self._yspace == "svm_like":
                base_value = -1
                one_value = 1
            elif self._yspace == "boolean":
                base_value = False
                one_value = True
            else:
                base_value = 0
                one_value = 1
            fitted_values = np.repeat(base_value, X.shape[0])
            fitted_values[class_probs > 0.5] = one_value
        return fitted_values

    def predict_proba(self, X):
        testDmatrix = DMatrix(X)
        class_probs = self._Booster.predict(testDmatrix)
        if self._yspace == "multiclass":
            return class_probs
        else:
            classone_probs = class_probs
            classzero_probs = 1.0 - classone_probs
            return np.vstack((classzero_probs, classone_probs)).transpose()
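
# Example (comment-only sketch; the synthetic data below is illustrative):
# the scikit-learn wrapper follows the usual fit / predict / predict_proba
# protocol, with labels returned in their original space.
#
#   X = np.random.rand(100, 10)
#   y = np.random.randint(2, size=100)
#   clf = XGBClassifier(max_depth=2, n_estimators=20).fit(X, y)
#   probs = clf.predict_proba(X)    # columns: P(class 0), P(class 1)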