adjust weight

2014-09-02 15:22:08 -06:00 · 2014-09-02 15:22:08 -06:00 · 5177fa02e4
commit 5177fa02e4
parent c75275a861
1 changed files with 284 additions and 31 deletions
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@ -3,10 +3,11 @@
 import ctypes
 import os
 # optinally have scipy sparse, though not necessary
-import numpy
+import numpy as np
 import sys
 import numpy.ctypeslib
 import scipy.sparse as scp
+import random

 # set this line correctly
 if os.name == 'nt':
@ -32,18 +33,30 @@ xglib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p)


 def ctypes2numpy(cptr, length, dtype):
-    # convert a ctypes pointer array to numpy
+    """convert a ctypes pointer array to numpy array """
    assert isinstance(cptr, ctypes.POINTER(ctypes.c_float))
    res = numpy.zeros(length, dtype=dtype)
    assert ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0])
    return res

-# data matrix used in xgboost
 class DMatrix:
+    """data matrix used in xgboost"""
    # constructor
    def __init__(self, data, label=None, missing=0.0, weight = None):
+        """ constructor of DMatrix
+
+            Args:
+                data: string/numpy array/scipy.sparse
+                      data source, string type is the path of svmlight format txt file or xgb buffer
+                label: list or numpy 1d array, optional
+                       label of training data
+                missing: float
+                         value in data which need to be present as missing value
+                weight: list or numpy 1d array, optional
+                        weight for each instances
+        """
        # force into void_p, mac need to pass things in as void_p
-        if data == None:
+        if data is None:
            self.handle = None
            return
        if isinstance(data, str):
@ -63,22 +76,25 @@ class DMatrix:
            self.set_label(label)
        if weight !=None:
            self.set_weight(weight)
-    # convert data from csr matrix
+
    def __init_from_csr(self, csr):
+        """convert data from csr matrix"""
        assert len(csr.indices) == len(csr.data)
        self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSR(
            (ctypes.c_ulong  * len(csr.indptr))(*csr.indptr),
            (ctypes.c_uint  * len(csr.indices))(*csr.indices),
            (ctypes.c_float * len(csr.data))(*csr.data),
            len(csr.indptr), len(csr.data)))
-    # convert data from numpy matrix
+
    def __init_from_npy2d(self,mat,missing):
+        """convert data from numpy matrix"""
        data = numpy.array(mat.reshape(mat.size), dtype='float32')
        self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromMat(
            data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            mat.shape[0], mat.shape[1], ctypes.c_float(missing)))
-    # destructor
+
    def __del__(self):
+        """destructor"""
        xglib.XGDMatrixFree(self.handle)
    def get_float_info(self, field):
        length = ctypes.c_ulong()
@ -96,16 +112,39 @@ class DMatrix:
    def set_uint_info(self, field, data):
        xglib.XGDMatrixSetUIntInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
                                   (ctypes.c_uint*len(data))(*data), len(data))
-    # load data from file
+
    def save_binary(self, fname, silent=True):
+        """save DMatrix to XGBoost buffer
+            Args:
+                fname: string
+                       name of buffer file
+                slient: bool, option
+                       whether print info
+           Returns:
+                None
+        """
        xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent))
-    # set label of dmatrix
+
    def set_label(self, label):
+        """set label of dmatrix
+            Args:
+                label: list
+                       label for DMatrix
+            Returns:
+                None
+        """
        self.set_float_info('label', label)
-    # set weight of each instances
+
    def set_weight(self, weight):
+        """set weight of each instances
+            Args:
+                weight: float
+                        weight for positive instance
+            Returns:
+                None
+        """
        self.set_float_info('weight', weight)
-    # set initialized margin prediction
+
    def set_base_margin(self, margin):
        """
        set base margin of booster to start from
@ -116,31 +155,149 @@ class DMatrix:
        see also example/demo.py
        """
        self.set_float_info('base_margin', margin)
-    # set group size of dmatrix, used for rank
+
    def set_group(self, group):
+        """set group size of dmatrix, used for rank
+            Args:
+                group:
+
+            Returns:
+                None
+        """
        xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group))
-    # get label from dmatrix
+
    def get_label(self):
+        """get label from dmatrix
+            Args:
+                None
+            Returns:
+                list, label of data
+        """
        return self.get_float_info('label')
-    # get weight from dmatrix
+
    def get_weight(self):
+        """get weight from dmatrix
+            Args:
+                None
+            Returns:
+                float, weight
+        """
        return self.get_float_info('weight')
-    # get base_margin from dmatrix
    def get_base_margin(self):
+        """get base_margin from dmatrix
+            Args:
+                None
+            Returns:
+                float, base margin
+        """
        return self.get_float_info('base_margin')
    def num_row(self):
+        """get number of rows
+            Args:
+                None
+            Returns:
+                int, num rows
+        """
        return xglib.XGDMatrixNumRow(self.handle)
-    # slice the DMatrix to return a new DMatrix that only contains rindex
    def slice(self, rindex):
+        """slice the DMatrix to return a new DMatrix that only contains rindex
+            Args:
+                rindex: list
+                        list of index to be chosen
+            Returns:
+                res: DMatrix
+                     new DMatrix with chosen index
+        """
        res = DMatrix(None)
        res.handle = ctypes.c_void_p(xglib.XGDMatrixSliceDMatrix(
            self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex)))
        return res

+class CVPack:
+    def __init__(self, dtrain, dtest, param):
+        self.dtrain = dtrain
+        self.dtest = dtest
+        self.watchlist = watchlist = [ (dtrain,'train'), (dtest, 'test') ]
+        self.bst = Booster(param, [dtrain,dtest])
+    def update(self,r):
+        self.bst.update(self.dtrain, r)
+    def eval(self,r):
+        return self.bst.eval_set(self.watchlist, r)
+
+def mknfold(dall, nfold, param, seed, weightscale=None, evals=[], set_pos_weight=None):
+    """
+    mk nfold list of cvpack from randidx
+    """
+    randidx = range(dall.num_row())
+    random.seed(seed)
+    random.shuffle(randidx)
+
+    idxset = []
+    kstep = len(randidx) / nfold
+    for i in range(nfold):
+        idxset.append(randidx[ (i*kstep) : min(len(randidx),(i+1)*kstep) ])
+
+    ret = []
+    for k in range(nfold):
+        trainlst = []
+        for j in range(nfold):
+            if j == k:
+                testlst = idxset[j]
+            else:
+                trainlst += idxset[j]
+        dtrain = dall.slice(trainlst)
+        dtest = dall.slice(testlst)
+        # rescale weight of dtrain and dtest
+        if weightscale != None:
+            dtrain.set_weight( dtrain.get_weight() * weightscale * dall.num_row() / dtrain.num_row() )
+            dtest.set_weight( dtest.get_weight() * weightscale * dall.num_row() / dtest.num_row() )
+        if set_pos_weight != None:
+            label = dtrain.get_label()
+            weight = dtrain.get_weight()
+            sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0  )
+            sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0  )
+            param['scale_pos_weight'] = sum_wneg/sum_wpos
+        plst = param.items() + [('eval_metric', itm) for itm in evals]
+        ret.append(CVPack(dtrain, dtest, plst))
+    return ret
+
+def aggcv(rlist):
+    """
+    aggregate cross validation results
+    """
+    cvmap = {}
+    arr = rlist[0].split()
+    ret = arr[0]
+    for it in arr[1:]:
+        k, v  = it.split(':')
+        cvmap[k] = [float(v)]
+    for line in rlist[1:]:
+        arr = line.split()
+        assert ret == arr[0]
+        for it in arr[1:]:
+            k, v  = it.split(':')
+            cvmap[k].append(float(v))
+
+    for k, v in sorted(cvmap.items(), key = lambda x:x[0]):
+        v = np.array(v)
+        ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v))
+    return ret
+
+
 class Booster:
    """learner class """
    def __init__(self, params={}, cache=[], model_file = None):
-        """ constructor, param: """
+        """ constructor
+            Args:
+                params: dict
+                        params for boosters
+                cache: list
+                        list of cache item
+                model_file: string
+                        path of model file
+            Returns:
+                None
+        """
        for d in cache:
            assert isinstance(d, DMatrix)
        dmats = (ctypes.c_void_p  * len(cache))(*[ d.handle for d in cache])
@ -166,16 +323,30 @@ class Booster:
                xglib.XGBoosterSetParam(
                    self.handle, ctypes.c_char_p(k.encode('utf-8')),
                    ctypes.c_char_p(str(v).encode('utf-8')))
+
    def update(self, dtrain, it):
        """
        update
-          dtrain: the training DMatrix
-          it: current iteration number
+            Args:
+                dtrain: DMatrix
+                        the training DMatrix
+                it: int
+                    current iteration number
+            Returns:
+                None
        """
        assert isinstance(dtrain, DMatrix)
        xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle)
    def boost(self, dtrain, grad, hess):
-        """ update """
+        """ update
+            Args:
+                dtrain: DMatrix
+                        the training DMatrix
+                grad: list
+                        the first order of gradient
+                hess: list
+                        the second order of gradient
+        """
        assert len(grad) == len(hess)
        assert isinstance(dtrain, DMatrix)
        xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle,
@ -183,6 +354,14 @@ class Booster:
                                    (ctypes.c_float*len(hess))(*hess),
                                    len(grad))
    def eval_set(self, evals, it = 0):
+        """evaluates by metric
+            Args:
+                evals: list of tuple (DMatrix, string)
+                       lists of items to be evaluated
+                it: int
+            Returns:
+                evals result
+        """
        for d in evals:
            assert isinstance(d[0], DMatrix)
            assert isinstance(d[1], str)
@ -192,25 +371,49 @@ class Booster:
        return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals))
    def eval(self, mat, name = 'eval', it = 0):
        return self.eval_set( [(mat,name)], it)
-    def predict(self, data, output_margin=False, ntree_limit=0):
+    def predict(self, data, output_margin=False):
        """
        predict with data
-            data: the dmatrix storing the input
-            output_margin: whether output raw margin value that is untransformed
-            ntree_limit: limit number of trees in prediction, default to 0, 0 means using all the trees
+            Args:
+                data: DMatrix
+                      the dmatrix storing the input
+                output_margin: bool
+                               whether output raw margin value that is untransformed
+            Returns:
+                numpy array of prediction
        """
        length = ctypes.c_ulong()
        preds = xglib.XGBoosterPredict(self.handle, data.handle,
-                                       int(output_margin), ntree_limit, ctypes.byref(length))
+                                       int(output_margin), ctypes.byref(length))
        return ctypes2numpy(preds, length.value, 'float32')
    def save_model(self, fname):
-        """ save model to file """
+        """ save model to file
+            Args:
+                fname: string
+                       file name of saving model
+            Returns:
+                None
+        """
        xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname.encode('utf-8')))
    def load_model(self, fname):
-        """load model from file"""
+        """load model from file
+            Args:
+                fname: string
+                       file name of saving model
+            Returns:
+                None
+        """
        xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname.encode('utf-8')) )
    def dump_model(self, fo, fmap=''):
-        """dump model into text file"""
+        """dump model into text file
+            Args:
+                fo: string
+                    file name to be dumped
+                fmap: string, optional
+                      file name of feature map names
+            Returns:
+                None
+        """
        if isinstance(fo,str):
            fo = open(fo,'w')
            need_close = True
@ -249,7 +452,17 @@ class Booster:
        return fmap

 def evaluate(bst, evals, it, feval = None):
-    """evaluation on eval set"""
+    """evaluation on eval set
+        Args:
+            bst: XGBoost object
+                 object of XGBoost model
+            evals: list of tuple (DMatrix, string)
+                obj need to be evaluated
+            it: int
+            feval: optional
+        Returns:
+            eval result
+    """
    if feval != None:
        res = '[%d]' % it
        for dm, evname in evals:
@ -260,10 +473,24 @@ def evaluate(bst, evals, it, feval = None):

    return res

+
+
 def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None):
-    """ train a booster with given paramaters """
+    """ train a booster with given paramaters
+        Args:
+            params: dict
+                    params of booster
+            dtrain: DMatrix
+                    data to be trained
+            num_boost_round: int
+                             num of round to be boosted
+            evals: list
+                   list of items to be evaluated
+            obj:
+            feval:
+    """
    bst = Booster(params, [dtrain]+[ d[0] for d in evals ] )
-    if obj == None:
+    if obj is None:
        for i in range(num_boost_round):
            bst.update( dtrain, i )
            if len(evals) != 0:
@ -277,3 +504,29 @@ def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None
            if len(evals) != 0:
                sys.stderr.write(evaluate(bst, evals, i, feval)+'\n')
    return bst
+
+def cv(params, dtrain, num_boost_round = 10, nfold=3, evals = [], \
+        weightscale=None, obj=None, feval=None, set_pos_weight=None):
+    """ cross validation  with given paramaters
+        Args:
+            params: dict
+                    params of booster
+            dtrain: DMatrix
+                    data to be trained
+            num_boost_round: int
+                             num of round to be boosted
+            nfold: int
+                   folds to do cv
+            evals: list
+                   list of items to be evaluated
+            obj:
+            feval:
+            set_pos_weight: bool, optional
+                            Adjust pos weight by number
+    """
+    cvfolds = mknfold(dtrain, nfold, params, 0, weightscale, evals)
+    for i in range(num_boost_round):
+        for f in cvfolds:
+            f.update(i)
+        res = aggcv([f.eval(i) for f in cvfolds])
+        sys.stderr.write(res+'\n')