xgboost/wrapper/xgboost.py

"""
xgboost: eXtreme Gradient Boosting library
Author: Tianqi Chen, Bing Xu

"""
import ctypes
import os
# scipy sparse was meant to be optional, but note it is imported unconditionally below
import numpy as np
import sys
import numpy.ctypeslib
import scipy.sparse as scp

# set this line correctly
# The directory of this file is made absolute before joining: a bare relative
# __file__ has an empty dirname, which would otherwise turn the library path
# into a file directly under the filesystem root.
_WRAPPER_DIR = os.path.dirname(os.path.abspath(__file__))
if os.name == 'nt':
    XGBOOST_PATH = os.path.join(
        _WRAPPER_DIR, '..', 'windows', 'x64', 'Release', 'xgboost_wrapper.dll')
else:
    XGBOOST_PATH = os.path.join(_WRAPPER_DIR, 'libxgboostwrapper.so')

# load in xgboost library
xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH)
# ctypes assumes every foreign function returns a C int unless restype is set,
# so every function returning a handle, pointer or unsigned long is declared here.
# DMatrix functions
xglib.XGDMatrixCreateFromFile.restype = ctypes.c_void_p
xglib.XGDMatrixCreateFromCSR.restype = ctypes.c_void_p
xglib.XGDMatrixCreateFromCSC.restype = ctypes.c_void_p
xglib.XGDMatrixCreateFromMat.restype = ctypes.c_void_p
xglib.XGDMatrixSliceDMatrix.restype = ctypes.c_void_p
xglib.XGDMatrixGetFloatInfo.restype = ctypes.POINTER(ctypes.c_float)
xglib.XGDMatrixGetUIntInfo.restype = ctypes.POINTER(ctypes.c_uint)
xglib.XGDMatrixNumRow.restype = ctypes.c_ulong
# booster functions
xglib.XGBoosterCreate.restype = ctypes.c_void_p
xglib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float)
xglib.XGBoosterEvalOneIter.restype = ctypes.c_char_p
xglib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p)

def ctypes2numpy(cptr, length, dtype):
    """convert a ctypes pointer array to numpy array

        Args:
            cptr: ctypes pointer, POINTER(c_float) or POINTER(c_uint)
                  start of the native buffer to copy from
            length: int
                    number of items to copy
            dtype: numpy dtype or dtype string
                   dtype of the result, its item size must equal the size of
                   the C type that cptr points to
        Returns:
            numpy 1d array holding its own copy of the data
        Raises:
            TypeError: cptr is not a float or unsigned int pointer
            ValueError: item size of dtype differs from the pointed-to C type
    """
    supported = (ctypes.POINTER(ctypes.c_float), ctypes.POINTER(ctypes.c_uint))
    if not isinstance(cptr, supported):
        raise TypeError('expected a ctypes float or unsigned int pointer, got ' + str(type(cptr)))
    res = numpy.zeros(length, dtype=dtype)
    # a mismatching item size would make memmove read past the native buffer
    if res.itemsize != ctypes.sizeof(cptr._type_):
        raise ValueError('item size of dtype does not match the pointed-to C type')
    # the copy must not live inside an assert: asserts are stripped under
    # python -O, which would silently return an all-zero array
    if res.nbytes:
        ctypes.memmove(res.ctypes.data, cptr, res.nbytes)
    return res

class DMatrix:
    """data matrix used in xgboost

    Wraps a native DMatrix handle created through xglib; the handle is
    released in __del__.
    """
    # constructor
    def __init__(self, data, label=None, missing=0.0, weight = None):
        """ constructor of DMatrix

            Args:
                data: string/numpy array/scipy.sparse
                      data source, string type is the path of svmlight format txt file or xgb buffer.
                      None creates a wrapper without a native handle (used by slice).
                      Any other object is converted with scipy.sparse.csr_matrix.
                label: list or numpy 1d array, optional
                       label of training data
                missing: float
                         value in data which need to be present as missing value,
                         only passed on when data is a 2d numpy array
                weight: list or numpy 1d array, optional
                        weight for each instances
            Raises:
                Exception: data could not be turned into a DMatrix through the csr fallback
        """
        # set first so that __del__ stays safe when construction fails below
        self.handle = None
        if data is None:
            return
        # force into void_p, mac need to pass things in as void_p
        if isinstance(data, str):
            # NOTE(review): second argument is presumably the 'silent' flag - confirm against the C API
            self.handle = ctypes.c_void_p(
                xglib.XGDMatrixCreateFromFile(ctypes.c_char_p(data.encode('utf-8')), 0))
        elif isinstance(data, scp.csr_matrix):
            self.__init_from_csr(data)
        elif isinstance(data, scp.csc_matrix):
            self.__init_from_csc(data)
        elif isinstance(data, numpy.ndarray) and len(data.shape) == 2:
            self.__init_from_npy2d(data, missing)
        else:
            try:
                csr = scp.csr_matrix(data)
                self.__init_from_csr(csr)
            except Exception:
                # 'except Exception' instead of a bare except, so that
                # KeyboardInterrupt/SystemExit are no longer swallowed
                raise Exception("can not intialize DMatrix from"+str(type(data)))
        # 'is not None' rather than '!= None': comparing a numpy array with
        # != is elementwise and has no single truth value
        if label is not None:
            self.set_label(label)
        if weight is not None:
            self.set_weight(weight)

    def __init_from_csr(self, csr):
        """convert data from csr matrix"""
        assert len(csr.indices) == len(csr.data)
        self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSR(
            (ctypes.c_ulong  * len(csr.indptr))(*csr.indptr),
            (ctypes.c_uint  * len(csr.indices))(*csr.indices),
            (ctypes.c_float * len(csr.data))(*csr.data),
            len(csr.indptr), len(csr.data)))

    def __init_from_csc(self, csc):
        """convert data from csc matrix"""
        assert len(csc.indices) == len(csc.data)
        self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSC(
            (ctypes.c_ulong  * len(csc.indptr))(*csc.indptr),
            (ctypes.c_uint * len(csc.indices))(*csc.indices),
            (ctypes.c_float * len(csc.data))(*csc.data),
            len(csc.indptr), len(csc.data)))

    def __init_from_npy2d(self,mat,missing):
        """convert data from numpy matrix"""
        # flatten into a float32 copy that is handed to the native call
        data = numpy.array(mat.reshape(mat.size), dtype='float32')
        self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromMat(
            data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            mat.shape[0], mat.shape[1], ctypes.c_float(missing)))

    def __del__(self):
        """destructor, frees the native handle if there is one"""
        # getattr: the attribute may be missing on a partially built object
        handle = getattr(self, 'handle', None)
        if handle is not None:
            xglib.XGDMatrixFree(handle)
            self.handle = None

    def get_float_info(self, field):
        """get a float field of the dmatrix
            Args:
                field: string
                       name of the field
            Returns:
                numpy float32 array, copy of the field content
        """
        length = ctypes.c_ulong()
        ret = xglib.XGDMatrixGetFloatInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
                                          ctypes.byref(length))
        return ctypes2numpy(ret, length.value, 'float32')

    def get_uint_info(self, field):
        """get an unsigned int field of the dmatrix
            Args:
                field: string
                       name of the field
            Returns:
                numpy uint32 array, copy of the field content
        """
        length = ctypes.c_ulong()
        ret = xglib.XGDMatrixGetUIntInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
                                         ctypes.byref(length))
        # NOTE(review): ret is a POINTER(c_uint); this only works when
        # ctypes2numpy accepts unsigned int pointers
        return ctypes2numpy(ret, length.value, 'uint32')

    def set_float_info(self, field, data):
        """set a float field of the dmatrix
            Args:
                field: string
                       name of the field
                data: list or numpy 1d array
                      values to store, converted to C floats
            Returns:
                None
        """
        xglib.XGDMatrixSetFloatInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
                                    (ctypes.c_float*len(data))(*data), len(data))

    def set_uint_info(self, field, data):
        """set an unsigned int field of the dmatrix
            Args:
                field: string
                       name of the field
                data: list or numpy 1d array
                      values to store, converted to C unsigned ints
            Returns:
                None
        """
        xglib.XGDMatrixSetUIntInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
                                   (ctypes.c_uint*len(data))(*data), len(data))

    def save_binary(self, fname, silent=True):
        """save DMatrix to XGBoost buffer
            Args:
                fname: string
                       name of buffer file
                silent: bool, option
                       whether print info
            Returns:
                None
        """
        xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent))

    def set_label(self, label):
        """set label of dmatrix
            Args:
                label: list or numpy 1d array
                       label for DMatrix
            Returns:
                None
        """
        self.set_float_info('label', label)

    def set_weight(self, weight):
        """set weight of each instances
            Args:
                weight: list or numpy 1d array
                        weight for each instance
            Returns:
                None
        """
        self.set_float_info('weight', weight)

    def set_base_margin(self, margin):
        """
        set base margin of booster to start from
        this can be used to specify a prediction value of
        existing model to be base_margin
        However, remember margin is needed, instead of transformed prediction
        e.g. for logistic regression: need to put in value before logistic transformation
        see also example/demo.py
        """
        self.set_float_info('base_margin', margin)

    def set_group(self, group):
        """set group size of dmatrix, used for rank
            Args:
                group: list or numpy 1d array
                       group sizes, converted to C unsigned ints
            Returns:
                None
        """
        xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group))

    def get_label(self):
        """get label from dmatrix
            Args:
                None
            Returns:
                numpy float32 array, label of data
        """
        return self.get_float_info('label')

    def get_weight(self):
        """get weight from dmatrix
            Args:
                None
            Returns:
                numpy float32 array, weight
        """
        return self.get_float_info('weight')

    def get_base_margin(self):
        """get base_margin from dmatrix
            Args:
                None
            Returns:
                numpy float32 array, base margin
        """
        return self.get_float_info('base_margin')

    def num_row(self):
        """get number of rows
            Args:
                None
            Returns:
                int, num rows
        """
        return xglib.XGDMatrixNumRow(self.handle)

    def slice(self, rindex):
        """slice the DMatrix to return a new DMatrix that only contains rindex
            Args:
                rindex: list
                        list of index to be chosen
            Returns:
                res: DMatrix
                     new DMatrix with chosen index
        """
        # start from an empty wrapper and attach the sliced native handle to it
        res = DMatrix(None)
        res.handle = ctypes.c_void_p(xglib.XGDMatrixSliceDMatrix(
            self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex)))
        return res

class Booster:
    """learner class

    Wraps a native booster handle created through xglib; the handle is
    released in __del__.
    """
    def __init__(self, params={}, cache=[], model_file = None):
        """ constructor
            Args:
                params: dict
                        params for boosters
                cache: list
                        list of cache item
                model_file: string
                        path of model file
            Returns:
                None
        """
        # the mutable defaults are only read here, never modified
        # set first so that __del__ stays safe when construction fails below
        self.handle = None
        for d in cache:
            assert isinstance(d, DMatrix)
        dmats = (ctypes.c_void_p  * len(cache))(*[ d.handle for d in cache])
        self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache)))
        # default seed first, so that a 'seed' entry in params overrides it
        self.set_param({'seed':0})
        self.set_param(params)
        if model_file is not None:
            self.load_model(model_file)

    def __del__(self):
        """destructor, frees the native handle if there is one"""
        # getattr: the attribute may be missing on a partially built object
        handle = getattr(self, 'handle', None)
        if handle is not None:
            xglib.XGBoosterFree(handle)
            self.handle = None

    def set_param(self, params, pv=None):
        """set parameters of the booster
            Args:
                params: dict / string / iterable of (name, value) pairs
                        a string is a single parameter name and needs pv
                pv: optional
                    value of the parameter when params is a string
            Returns:
                None
        """
        if isinstance(params, dict):
            items = params.items()
        elif isinstance(params,str) and pv is not None:
            items = [(params, pv)]
        else:
            items = params
        # values are passed to the library as utf-8 encoded strings
        for k, v in items:
            xglib.XGBoosterSetParam(
                self.handle, ctypes.c_char_p(k.encode('utf-8')),
                ctypes.c_char_p(str(v).encode('utf-8')))

    def update(self, dtrain, it, fobj=None):
        """
        update
            Args:
                dtrain: DMatrix
                        the training DMatrix
                it: int
                    current iteration number
                fobj: function
                    customized objective function, called as fobj(pred, dtrain)
                    and returning (grad, hess)
            Returns:
                None
        """
        assert isinstance(dtrain, DMatrix)
        if fobj is None:
            xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle)
        else:
            pred = self.predict( dtrain )
            grad, hess = fobj( pred, dtrain )
            self.boost( dtrain, grad, hess )

    def boost(self, dtrain, grad, hess):
        """ update
            Args:
                dtrain: DMatrix
                        the training DMatrix
                grad: list
                        the first order of gradient
                hess: list
                        the second order of gradient
        """
        assert len(grad) == len(hess)
        assert isinstance(dtrain, DMatrix)
        xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle,
                                    (ctypes.c_float*len(grad))(*grad),
                                    (ctypes.c_float*len(hess))(*hess),
                                    len(grad))

    def eval_set(self, evals, it = 0, feval = None):
        """evaluates by metric
            Args:
                evals: list of tuple (DMatrix, string)
                       lists of items to be evaluated
                it: int
                    current iteration
                feval: function
                       custom evaluation function, called as feval(pred, dmat)
                       and returning (name, value)
            Returns:
                evals result: the value returned by the library when feval is None
                (a c_char_p result, which ctypes hands back as bytes on Python 3),
                otherwise a str built as '[it]\\tevname-name:value...'
        """
        if feval is None:
            for d in evals:
                assert isinstance(d[0], DMatrix)
                assert isinstance(d[1], str)
            dmats = (ctypes.c_void_p * len(evals) )(*[ d[0].handle for d in evals])
            evnames = (ctypes.c_char_p * len(evals))(
                * [ctypes.c_char_p(d[1].encode('utf-8')) for d in evals])
            return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals))
        else:
            res = '[%d]' % it
            for dm, evname in evals:
                name, val = feval(self.predict(dm), dm)
                res += '\t%s-%s:%f' % (evname, name, val)
            return res

    def eval(self, mat, name = 'eval', it = 0):
        """evaluate a single DMatrix, see eval_set"""
        return self.eval_set( [(mat,name)], it)

    def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False):
        """
        predict with data
            Args:
                data: DMatrix
                    the dmatrix storing the input
                output_margin: bool
                    whether output raw margin value that is untransformed
                ntree_limit: int
                    limit number of trees in prediction, default to 0, 0 means using all the trees
                pred_leaf: bool
                    when this option is on, the output will be a matrix of (nsample, ntrees)
                    with each record indicate the predicted leaf index of each sample in each tree
                    Note that the leaf index of tree is unique per tree, so you may find leaf 1 in both tree 1 and tree 0
            Returns:
                numpy array of prediction
        """
        # bit 0: output margin, bit 1: predict leaf index
        option_mask = 0
        if output_margin:
            option_mask += 1
        if pred_leaf:
            option_mask += 2
        length = ctypes.c_ulong()
        preds = xglib.XGBoosterPredict(self.handle, data.handle,
                                       option_mask, ntree_limit, ctypes.byref(length))
        preds = ctypes2numpy(preds, length.value, 'float32')
        if pred_leaf:
            preds = preds.astype('int32')
        nrow = data.num_row()
        if preds.size != nrow and preds.size % nrow == 0:
            # several values per row: integer division is required, a float
            # column count (true division on Python 3) is rejected by reshape
            preds = preds.reshape(nrow, preds.size // nrow)
        return preds

    def save_model(self, fname):
        """ save model to file
            Args:
                fname: string
                       file name of saving model
            Returns:
                None
        """
        xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname.encode('utf-8')))

    def load_model(self, fname):
        """load model from file
            Args:
                fname: string
                       file name of saving model
            Returns:
                None
        """
        xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname.encode('utf-8')) )

    def dump_model(self, fo, fmap='', with_stats = False):
        """dump model into text file
            Args:
                fo: string or file-like object with a write method
                    file name to be dumped; an object that is not a string
                    is written to directly and left open
                fmap: string, optional
                      file name of feature map names
                with_stats: bool, optional
                      whether output statistics of the split
            Returns:
                None
        """
        if isinstance(fo,str):
            fo = open(fo,'w')
            need_close = True
        else:
            need_close = False
        try:
            ret = self.get_dump(fmap, with_stats)
            for i, tree in enumerate(ret):
                fo.write('booster[%d]:\n' %i)
                fo.write(tree)
        finally:
            # close the file we opened even when dumping or writing fails
            if need_close:
                fo.close()

    def get_dump(self, fmap='', with_stats=False):
        """get dump of model as list of strings
            Args:
                fmap: string, optional
                      file name of feature map names
                with_stats: bool, optional
                      whether output statistics of the split
            Returns:
                list of str, one entry per item returned by the library
        """
        length = ctypes.c_ulong()
        sarr = xglib.XGBoosterDumpModel(self.handle,
                                        ctypes.c_char_p(fmap.encode('utf-8')),
                                        int(with_stats), ctypes.byref(length))
        res = []
        for i in range(length.value):
            item = sarr[i]
            # c_char_p items are bytes on Python 3; str() on bytes would give
            # the "b'...'" repr with escaped newlines instead of the text
            if not isinstance(item, str):
                item = item.decode('utf-8')
            res.append(item)
        return res

    def get_fscore(self, fmap=''):
        """ get feature importance of each feature
            Args:
                fmap: string, optional
                      file name of feature map names
            Returns:
                dict mapping feature id (the text between '[' and '<' on a
                dump line) to the number of dump lines it occurs on
        """
        trees = self.get_dump(fmap)
        fscore = {}
        for tree in trees:
            for l in tree.split('\n'):
                arr = l.split('[')
                # lines without '[' carry no split condition
                if len(arr) == 1:
                    continue
                fid = arr[1].split(']')[0]
                fid = fid.split('<')[0]
                fscore[fid] = fscore.get(fid, 0) + 1
        return fscore

def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None):
    """ train a booster with given parameters
        Args:
            params: dict
                    params of booster
            dtrain: DMatrix
                    data to be trained
            num_boost_round: int
                             num of round to be boosted
            evals: list of pairs (DMatrix, string)
                   items evaluated after every round; the result line is
                   written to stderr, which allows watching performance
                   on a validation set
            obj:  function
                   customized objective function
            feval: function
                   customized evaluation function
        Returns: Booster model trained
    """
    # the booster caches the training matrix followed by every watched matrix
    booster = Booster(params, [dtrain] + [pair[0] for pair in evals])
    for round_no in range(num_boost_round):
        booster.update(dtrain, round_no, obj)
        if len(evals) == 0:
            continue
        message = booster.eval_set(evals, round_no, feval)
        # eval_set may hand back a non-str value, which needs decoding first
        if not isinstance(message, str):
            message = message.decode()
        sys.stderr.write(message+'\n')
    return booster

class CVPack:
    """one cross validation fold: a train/test pair with its own Booster"""
    def __init__(self, dtrain, dtest, param):
        """create the booster of the fold, caching both matrices"""
        self.dtrain, self.dtest = dtrain, dtest
        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
        self.bst = Booster(param, [dtrain, dtest])

    def update(self, r, fobj):
        """run boosting round r on the training part of the fold"""
        self.bst.update(self.dtrain, r, fobj)

    def eval(self, r, feval):
        """evaluate the train and test part of the fold at round r"""
        return self.bst.eval_set(self.watchlist, r, feval)

def mknfold(dall, nfold, param, seed, evals=[], fpreproc = None):
    """
    mk nfold list of cvpack from randidx
        Args:
            dall: DMatrix
                  the full data set, split by row
            nfold: int
                   number of folds; must be at least 2, otherwise there is
                   nothing to concatenate into a training set
            param: dict or list of (name, value) pairs
                   params of booster
            seed: int
                  passed to numpy.random.seed before the rows are shuffled
            evals: list of strings
                   each entry is appended as an ('eval_metric', entry) parameter
            fpreproc: function, optional
                      called as fpreproc(dtrain, dtest, param.copy()) and
                      returning the (dtrain, dtest, param) to use for the fold
        Returns:
            list of CVPack, one per fold
    """
    np.random.seed(seed)
    randidx = np.random.permutation(dall.num_row())
    # integer division: a float step (true division on Python 3) cannot be
    # used as a slice bound.
    # NOTE: the len(randidx) % nfold leftover rows end up in no fold.
    kstep = len(randidx) // nfold
    idset = [randidx[i * kstep:(i + 1) * kstep] for i in range(nfold)]
    ret = []
    for k in range(nfold):
        dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i]))
        dtest = dall.slice(idset[k])
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
        else:
            tparam = param
        # params may also come as a list of pairs, the form set_param accepts
        if isinstance(tparam, dict):
            plst = list(tparam.items())
        else:
            plst = list(tparam)
        plst += [('eval_metric', itm) for itm in evals]
        ret.append(CVPack(dtrain, dtest, plst))
    return ret

def aggcv(rlist, show_stdv=True):
    """
    aggregate cross validation results
        Args:
            rlist: non-empty list of str or bytes
                   one evaluation line per fold, a leading iteration tag
                   followed by whitespace separated 'name:value' items
            show_stdv: bool
                       whether to append '+std' after each mean
        Returns:
            str, the iteration tag followed by one '\\tcv-name:mean[+std]'
            item per metric, metrics sorted by name
    """
    # decode once up front so that the result is always a str, also when
    # the lines are bytes and carry no metric at all
    lines = [line if isinstance(line, str) else line.decode() for line in rlist]
    ret = lines[0].split()[0]
    cvmap = {}
    for line in lines:
        arr = line.split()
        # every fold must report the same iteration tag
        assert ret == arr[0]
        for it in arr[1:]:
            k, v = it.split(':')
            cvmap.setdefault(k, []).append(float(v))
    for k in sorted(cvmap):
        v = np.array(cvmap[k])
        if show_stdv:
            ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
        else:
            ret += '\tcv-%s:%f' % (k, np.mean(v))
    return ret

def cv(params, dtrain, num_boost_round = 10, nfold=3, metrics=[], \
        obj = None, feval = None, fpreproc = None, show_stdv = True, seed = 0):
    """ cross validation with given parameters
        Args:
            params: dict
                    params of booster
            dtrain: DMatrix
                    data to be trained
            num_boost_round: int
                             num of round to be boosted
            nfold: int
                   number of folds to do cv
            metrics: list of strings
                     evaluation metrics to be watched in cv
            obj: function
                 custom objective function
            feval: function
                   custom evaluation function
            fpreproc: function
                      preprocessing function that takes dtrain, dtest,
                      param and return transformed version of dtrain, dtest, param
            show_stdv: bool
                       whether display standard deviation
            seed: int
                  seed used to generate the folds, this is passed to numpy.random.seed

        Returns: list(string) of evaluation history, one aggregated line
                 per boosting round; each line is also written to stderr
    """
    folds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
    history = []
    for round_no in range(num_boost_round):
        # advance every fold by one round before evaluating any of them
        for fold in folds:
            fold.update(round_no, obj)
        fold_lines = [fold.eval(round_no, feval) for fold in folds]
        summary = aggcv(fold_lines, show_stdv)
        sys.stderr.write(summary+'\n')
        history.append(summary)
    return history