From 5177fa02e4ee08ac71a7fd84ace812819d94554f Mon Sep 17 00:00:00 2001 From: antinucleon Date: Tue, 2 Sep 2014 15:22:08 -0600 Subject: [PATCH 01/22] adjust weight --- wrapper/xgboost.py | 315 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 284 insertions(+), 31 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index a6999a39f..34c4bfde7 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -3,10 +3,11 @@ import ctypes import os # optinally have scipy sparse, though not necessary -import numpy +import numpy as np import sys import numpy.ctypeslib import scipy.sparse as scp +import random # set this line correctly if os.name == 'nt': @@ -32,18 +33,30 @@ xglib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p) def ctypes2numpy(cptr, length, dtype): - # convert a ctypes pointer array to numpy + """convert a ctypes pointer array to numpy array """ assert isinstance(cptr, ctypes.POINTER(ctypes.c_float)) res = numpy.zeros(length, dtype=dtype) assert ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]) return res -# data matrix used in xgboost class DMatrix: + """data matrix used in xgboost""" # constructor def __init__(self, data, label=None, missing=0.0, weight = None): + """ constructor of DMatrix + + Args: + data: string/numpy array/scipy.sparse + data source, string type is the path of svmlight format txt file or xgb buffer + label: list or numpy 1d array, optional + label of training data + missing: float + value in data which need to be present as missing value + weight: list or numpy 1d array, optional + weight for each instances + """ # force into void_p, mac need to pass things in as void_p - if data == None: + if data is None: self.handle = None return if isinstance(data, str): @@ -63,22 +76,25 @@ class DMatrix: self.set_label(label) if weight !=None: self.set_weight(weight) - # convert data from csr matrix + def __init_from_csr(self, csr): + """convert data from csr matrix""" assert len(csr.indices) == len(csr.data) self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSR( (ctypes.c_ulong * len(csr.indptr))(*csr.indptr), (ctypes.c_uint * len(csr.indices))(*csr.indices), (ctypes.c_float * len(csr.data))(*csr.data), len(csr.indptr), len(csr.data))) - # convert data from numpy matrix + def __init_from_npy2d(self,mat,missing): + """convert data from numpy matrix""" data = numpy.array(mat.reshape(mat.size), dtype='float32') self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromMat( data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), mat.shape[0], mat.shape[1], ctypes.c_float(missing))) - # destructor + def __del__(self): + """destructor""" xglib.XGDMatrixFree(self.handle) def get_float_info(self, field): length = ctypes.c_ulong() @@ -96,16 +112,39 @@ class DMatrix: def set_uint_info(self, field, data): xglib.XGDMatrixSetUIntInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')), (ctypes.c_uint*len(data))(*data), len(data)) - # load data from file + def save_binary(self, fname, silent=True): + """save DMatrix to XGBoost buffer + Args: + fname: string + name of buffer file + slient: bool, option + whether print info + Returns: + None + """ xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent)) - # set label of dmatrix + def set_label(self, label): + """set label of dmatrix + Args: + label: list + label for DMatrix + Returns: + None + """ self.set_float_info('label', label) - # set weight of each instances + def set_weight(self, weight): + """set weight of each instances + Args: + weight: float + 
weight for positive instance + Returns: + None + """ self.set_float_info('weight', weight) - # set initialized margin prediction + def set_base_margin(self, margin): """ set base margin of booster to start from @@ -116,31 +155,149 @@ class DMatrix: see also example/demo.py """ self.set_float_info('base_margin', margin) - # set group size of dmatrix, used for rank + def set_group(self, group): + """set group size of dmatrix, used for rank + Args: + group: + + Returns: + None + """ xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group)) - # get label from dmatrix + def get_label(self): + """get label from dmatrix + Args: + None + Returns: + list, label of data + """ return self.get_float_info('label') - # get weight from dmatrix + def get_weight(self): + """get weight from dmatrix + Args: + None + Returns: + float, weight + """ return self.get_float_info('weight') - # get base_margin from dmatrix def get_base_margin(self): + """get base_margin from dmatrix + Args: + None + Returns: + float, base margin + """ return self.get_float_info('base_margin') def num_row(self): + """get number of rows + Args: + None + Returns: + int, num rows + """ return xglib.XGDMatrixNumRow(self.handle) - # slice the DMatrix to return a new DMatrix that only contains rindex def slice(self, rindex): + """slice the DMatrix to return a new DMatrix that only contains rindex + Args: + rindex: list + list of index to be chosen + Returns: + res: DMatrix + new DMatrix with chosen index + """ res = DMatrix(None) res.handle = ctypes.c_void_p(xglib.XGDMatrixSliceDMatrix( self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex))) return res +class CVPack: + def __init__(self, dtrain, dtest, param): + self.dtrain = dtrain + self.dtest = dtest + self.watchlist = watchlist = [ (dtrain,'train'), (dtest, 'test') ] + self.bst = Booster(param, [dtrain,dtest]) + def update(self,r): + self.bst.update(self.dtrain, r) + def eval(self,r): + return self.bst.eval_set(self.watchlist, r) + +def mknfold(dall, nfold, param, seed, weightscale=None, evals=[], set_pos_weight=None): + """ + mk nfold list of cvpack from randidx + """ + randidx = range(dall.num_row()) + random.seed(seed) + random.shuffle(randidx) + + idxset = [] + kstep = len(randidx) / nfold + for i in range(nfold): + idxset.append(randidx[ (i*kstep) : min(len(randidx),(i+1)*kstep) ]) + + ret = [] + for k in range(nfold): + trainlst = [] + for j in range(nfold): + if j == k: + testlst = idxset[j] + else: + trainlst += idxset[j] + dtrain = dall.slice(trainlst) + dtest = dall.slice(testlst) + # rescale weight of dtrain and dtest + if weightscale != None: + dtrain.set_weight( dtrain.get_weight() * weightscale * dall.num_row() / dtrain.num_row() ) + dtest.set_weight( dtest.get_weight() * weightscale * dall.num_row() / dtest.num_row() ) + if set_pos_weight != None: + label = dtrain.get_label() + weight = dtrain.get_weight() + sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 ) + sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0 ) + param['scale_pos_weight'] = sum_wneg/sum_wpos + plst = param.items() + [('eval_metric', itm) for itm in evals] + ret.append(CVPack(dtrain, dtest, plst)) + return ret + +def aggcv(rlist): + """ + aggregate cross validation results + """ + cvmap = {} + arr = rlist[0].split() + ret = arr[0] + for it in arr[1:]: + k, v = it.split(':') + cvmap[k] = [float(v)] + for line in rlist[1:]: + arr = line.split() + assert ret == arr[0] + for it in arr[1:]: + k, v = it.split(':') + 
cvmap[k].append(float(v)) + + for k, v in sorted(cvmap.items(), key = lambda x:x[0]): + v = np.array(v) + ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v)) + return ret + + class Booster: """learner class """ def __init__(self, params={}, cache=[], model_file = None): - """ constructor, param: """ + """ constructor + Args: + params: dict + params for boosters + cache: list + list of cache item + model_file: string + path of model file + Returns: + None + """ for d in cache: assert isinstance(d, DMatrix) dmats = (ctypes.c_void_p * len(cache))(*[ d.handle for d in cache]) @@ -166,16 +323,30 @@ class Booster: xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(k.encode('utf-8')), ctypes.c_char_p(str(v).encode('utf-8'))) + def update(self, dtrain, it): """ update - dtrain: the training DMatrix - it: current iteration number + Args: + dtrain: DMatrix + the training DMatrix + it: int + current iteration number + Returns: + None """ assert isinstance(dtrain, DMatrix) xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle) def boost(self, dtrain, grad, hess): - """ update """ + """ update + Args: + dtrain: DMatrix + the training DMatrix + grad: list + the first order of gradient + hess: list + the second order of gradient + """ assert len(grad) == len(hess) assert isinstance(dtrain, DMatrix) xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle, @@ -183,6 +354,14 @@ class Booster: (ctypes.c_float*len(hess))(*hess), len(grad)) def eval_set(self, evals, it = 0): + """evaluates by metric + Args: + evals: list of tuple (DMatrix, string) + lists of items to be evaluated + it: int + Returns: + evals result + """ for d in evals: assert isinstance(d[0], DMatrix) assert isinstance(d[1], str) @@ -192,25 +371,49 @@ class Booster: return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals)) def eval(self, mat, name = 'eval', it = 0): return self.eval_set( [(mat,name)], it) - def predict(self, data, output_margin=False, ntree_limit=0): + def predict(self, data, output_margin=False): """ predict with data - data: the dmatrix storing the input - output_margin: whether output raw margin value that is untransformed - ntree_limit: limit number of trees in prediction, default to 0, 0 means using all the trees + Args: + data: DMatrix + the dmatrix storing the input + output_margin: bool + whether output raw margin value that is untransformed + Returns: + numpy array of prediction """ length = ctypes.c_ulong() preds = xglib.XGBoosterPredict(self.handle, data.handle, - int(output_margin), ntree_limit, ctypes.byref(length)) + int(output_margin), ctypes.byref(length)) return ctypes2numpy(preds, length.value, 'float32') def save_model(self, fname): - """ save model to file """ + """ save model to file + Args: + fname: string + file name of saving model + Returns: + None + """ xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname.encode('utf-8'))) def load_model(self, fname): - """load model from file""" + """load model from file + Args: + fname: string + file name of saving model + Returns: + None + """ xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname.encode('utf-8')) ) def dump_model(self, fo, fmap=''): - """dump model into text file""" + """dump model into text file + Args: + fo: string + file name to be dumped + fmap: string, optional + file name of feature map names + Returns: + None + """ if isinstance(fo,str): fo = open(fo,'w') need_close = True @@ -249,7 +452,17 @@ class Booster: return fmap def evaluate(bst, evals, it, feval = None): - """evaluation on eval set""" + 
"""evaluation on eval set + Args: + bst: XGBoost object + object of XGBoost model + evals: list of tuple (DMatrix, string) + obj need to be evaluated + it: int + feval: optional + Returns: + eval result + """ if feval != None: res = '[%d]' % it for dm, evname in evals: @@ -260,10 +473,24 @@ def evaluate(bst, evals, it, feval = None): return res + + def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None): - """ train a booster with given paramaters """ + """ train a booster with given paramaters + Args: + params: dict + params of booster + dtrain: DMatrix + data to be trained + num_boost_round: int + num of round to be boosted + evals: list + list of items to be evaluated + obj: + feval: + """ bst = Booster(params, [dtrain]+[ d[0] for d in evals ] ) - if obj == None: + if obj is None: for i in range(num_boost_round): bst.update( dtrain, i ) if len(evals) != 0: @@ -277,3 +504,29 @@ def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None if len(evals) != 0: sys.stderr.write(evaluate(bst, evals, i, feval)+'\n') return bst + +def cv(params, dtrain, num_boost_round = 10, nfold=3, evals = [], \ + weightscale=None, obj=None, feval=None, set_pos_weight=None): + """ cross validation with given paramaters + Args: + params: dict + params of booster + dtrain: DMatrix + data to be trained + num_boost_round: int + num of round to be boosted + nfold: int + folds to do cv + evals: list + list of items to be evaluated + obj: + feval: + set_pos_weight: bool, optional + Adjust pos weight by number + """ + cvfolds = mknfold(dtrain, nfold, params, 0, weightscale, evals) + for i in range(num_boost_round): + for f in cvfolds: + f.update(i) + res = aggcv([f.eval(i) for f in cvfolds]) + sys.stderr.write(res+'\n') From e4817bb4c3f0b8d395e5343382e1cba5fe2ec577 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 2 Sep 2014 15:05:49 -0700 Subject: [PATCH 02/22] fix ntreelimit --- wrapper/xgboost.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 34c4bfde7..a0a88af47 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -371,7 +371,7 @@ class Booster: return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals)) def eval(self, mat, name = 'eval', it = 0): return self.eval_set( [(mat,name)], it) - def predict(self, data, output_margin=False): + def predict(self, data, output_margin=False, ntree_limit=0): """ predict with data Args: @@ -379,12 +379,14 @@ class Booster: the dmatrix storing the input output_margin: bool whether output raw margin value that is untransformed + + ntree_limit: limit number of trees in prediction, default to 0, 0 means using all the trees Returns: numpy array of prediction """ length = ctypes.c_ulong() preds = xglib.XGBoosterPredict(self.handle, data.handle, - int(output_margin), ctypes.byref(length)) + int(output_margin), ntree_limit, ctypes.byref(length)) return ctypes2numpy(preds, length.value, 'float32') def save_model(self, fname): """ save model to file From 65340ffda6c3712e532776697d2187f64d7fd3fa Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 2 Sep 2014 17:51:05 -0700 Subject: [PATCH 03/22] quick lint --- src/utils/utils.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/utils/utils.h b/src/utils/utils.h index 5c3342d8e..c319c5ab7 100644 --- a/src/utils/utils.h +++ b/src/utils/utils.h @@ -86,7 +86,7 @@ void HandlePrint(const char *msg); #endif #endif #ifdef XGBOOST_STRICT_CXX98_ -// these function pointers are to be 
assigned +// these function pointers are to be assigned extern "C" void (*Printf)(const char *fmt, ...); extern "C" int (*SPrintf)(char *buf, size_t size, const char *fmt, ...); extern "C" void (*Assert)(int exp, const char *fmt, ...); @@ -94,7 +94,7 @@ extern "C" void (*Check)(int exp, const char *fmt, ...); extern "C" void (*Error)(const char *fmt, ...); #else /*! \brief printf, print message to the console */ -inline void Printf(const char *fmt, ...) { +inline void Printf(const char *fmt, ...) { std::string msg(kPrintBuffer, '\0'); va_list args; va_start(args, fmt); @@ -103,7 +103,7 @@ inline void Printf(const char *fmt, ...) { HandlePrint(msg.c_str()); } /*! \brief portable version of snprintf */ -inline int SPrintf(char *buf, size_t size, const char *fmt, ...) { +inline int SPrintf(char *buf, size_t size, const char *fmt, ...) { va_list args; va_start(args, fmt); int ret = vsnprintf(buf, size, fmt, args); @@ -154,7 +154,7 @@ inline FILE *FopenCheck(const char *fname, const char *flag) { Check(fp != NULL, "can not open file \"%s\"\n", fname); return fp; } -} // namespace utils +} // namespace utils // easy utils that can be directly acessed in xgboost /*! \brief get the beginning address of a vector */ template From 1dbcebb6fedc0410ff2bc0338c8614bde0538862 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 2 Sep 2014 22:12:28 -0700 Subject: [PATCH 04/22] fix cxx98 --- src/io/simple_dmatrix-inl.hpp | 2 +- src/tree/updater_colmaker-inl.hpp | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp index 0883955fe..9a88a6bfa 100644 --- a/src/io/simple_dmatrix-inl.hpp +++ b/src/io/simple_dmatrix-inl.hpp @@ -216,7 +216,7 @@ class DMatrixSimple : public DataMatrix { return; } char bname[1024]; - snprintf(bname, sizeof(bname), "%s.buffer", fname); + utils::SPrintf(bname, sizeof(bname), "%s.buffer", fname); if (!this->LoadBinary(bname, silent)) { this->LoadText(fname, silent); if (savebuffer) this->SaveBinary(bname, silent); diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp index a8cf6ea7f..2d7c5311e 100644 --- a/src/tree/updater_colmaker-inl.hpp +++ b/src/tree/updater_colmaker-inl.hpp @@ -81,18 +81,18 @@ class ColMaker: public IUpdater { const BoosterInfo &info, RegTree *p_tree) { this->InitData(gpair, *p_fmat, info.root_index, *p_tree); - this->InitNewNode(qexpand, gpair, *p_fmat, info, *p_tree); + this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree); for (int depth = 0; depth < param.max_depth; ++depth) { - this->FindSplit(depth, this->qexpand, gpair, p_fmat, info, p_tree); - this->ResetPosition(this->qexpand, p_fmat, *p_tree); - this->UpdateQueueExpand(*p_tree, &this->qexpand); - this->InitNewNode(qexpand, gpair, *p_fmat, info, *p_tree); + this->FindSplit(depth, qexpand_, gpair, p_fmat, info, p_tree); + this->ResetPosition(qexpand_, p_fmat, *p_tree); + this->UpdateQueueExpand(*p_tree, &qexpand_); + this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree); // if nothing left to be expand, break - if (qexpand.size() == 0) break; + if (qexpand_.size() == 0) break; } // set all the rest expanding nodes to leaf - for (size_t i = 0; i < qexpand.size(); ++i) { - const int nid = qexpand[i]; + for (size_t i = 0; i < qexpand_.size(); ++i) { + const int nid = qexpand_[i]; (*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate); } // remember auxiliary statistics in the tree node @@ -165,9 +165,9 @@ class ColMaker: public IUpdater { snode.reserve(256); } {// 
expand query - qexpand.reserve(256); qexpand.clear(); + qexpand_.reserve(256); qexpand_.clear(); for (int i = 0; i < tree.param.num_roots; ++i) { - qexpand.push_back(i); + qexpand_.push_back(i); } } } @@ -228,6 +228,7 @@ class ColMaker: public IUpdater { const std::vector &gpair, const BoosterInfo &info, std::vector &temp) { + const std::vector &qexpand = qexpand_; // clear all the temp statistics for (size_t j = 0; j < qexpand.size(); ++j) { temp[qexpand[j]].stats.Clear(); @@ -248,7 +249,7 @@ class ColMaker: public IUpdater { e.last_fvalue = fvalue; } else { // try to find a split - if (fabsf(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) { + if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) { c.SetSubstract(snode[nid].stats, e.stats); if (c.sum_hess >= param.min_child_weight) { bst_float loss_chg = static_cast(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); @@ -391,7 +392,7 @@ class ColMaker: public IUpdater { /*! \brief TreeNode Data: statistics for each constructed node */ std::vector snode; /*! \brief queue of nodes to be expanded */ - std::vector qexpand; + std::vector qexpand_; }; }; From 10648a1ca7eee583459f3baf3d5f105959626735 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 2 Sep 2014 22:43:19 -0700 Subject: [PATCH 05/22] remove using std from cpp --- src/gbm/gblinear-inl.hpp | 5 ++++- src/gbm/gbm.cpp | 2 +- src/gbm/gbtree-inl.hpp | 9 ++++++--- src/io/io.cpp | 1 - src/io/simple_dmatrix-inl.hpp | 8 +++++--- src/learner/dmatrix.h | 4 ++++ src/learner/evaluation-inl.hpp | 7 +++++-- src/learner/evaluation.h | 2 ++ src/learner/learner-inl.hpp | 4 +++- src/learner/objective-inl.hpp | 3 +++ src/learner/objective.h | 1 + src/tree/model.h | 3 ++- src/tree/param.h | 1 + src/tree/updater.cpp | 2 +- src/tree/updater_prune-inl.hpp | 1 + src/utils/fmap.h | 9 +++++---- src/utils/io.h | 12 ++++++------ src/utils/random.h | 2 +- src/utils/utils.h | 4 ++-- 19 files changed, 53 insertions(+), 27 deletions(-) diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear-inl.hpp index a9d4c8d62..624f15c28 100644 --- a/src/gbm/gblinear-inl.hpp +++ b/src/gbm/gblinear-inl.hpp @@ -24,6 +24,7 @@ class GBLinear : public IGradBooster { } // set model parameters virtual void SetParam(const char *name, const char *val) { + using namespace std; if (!strncmp(name, "bst:", 4)) { param.SetParam(name + 4, val); } @@ -166,6 +167,7 @@ class GBLinear : public IGradBooster { learning_rate = 1.0f; } inline void SetParam(const char *name, const char *val) { + using namespace std; // sync-names if (!strcmp("eta", name)) learning_rate = static_cast(atof(val)); if (!strcmp("lambda", name)) reg_lambda = static_cast(atof(val)); @@ -207,9 +209,10 @@ class GBLinear : public IGradBooster { Param(void) { num_feature = 0; num_output_group = 1; - memset(reserved, 0, sizeof(reserved)); + std::memset(reserved, 0, sizeof(reserved)); } inline void SetParam(const char *name, const char *val) { + using namespace std; if (!strcmp(name, "bst:num_feature")) num_feature = atoi(val); if (!strcmp(name, "num_output_group")) num_output_group = atoi(val); } diff --git a/src/gbm/gbm.cpp b/src/gbm/gbm.cpp index 4713838e9..e280fdd4a 100644 --- a/src/gbm/gbm.cpp +++ b/src/gbm/gbm.cpp @@ -1,7 +1,6 @@ #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE #include -using namespace std; #include "./gbm.h" #include "./gbtree-inl.hpp" #include "./gblinear-inl.hpp" @@ -9,6 +8,7 @@ using namespace std; namespace xgboost { namespace gbm { IGradBooster* 
CreateGradBooster(const char *name) { + using namespace std; if (!strcmp("gbtree", name)) return new GBTree(); if (!strcmp("gblinear", name)) return new GBLinear(); utils::Error("unknown booster type: %s", name); diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp index 8fea28727..ed52afa7d 100644 --- a/src/gbm/gbtree-inl.hpp +++ b/src/gbm/gbtree-inl.hpp @@ -23,6 +23,7 @@ class GBTree : public IGradBooster { this->Clear(); } virtual void SetParam(const char *name, const char *val) { + using namespace std; if (!strncmp(name, "bst:", 4)) { cfg.push_back(std::make_pair(std::string(name+4), std::string(val))); // set into updaters, if already intialized @@ -171,14 +172,14 @@ class GBTree : public IGradBooster { updaters.clear(); std::string tval = tparam.updater_seq; char *pstr; - pstr = strtok(&tval[0], ","); + pstr = std::strtok(&tval[0], ","); while (pstr != NULL) { updaters.push_back(tree::CreateUpdater(pstr)); for (size_t j = 0; j < cfg.size(); ++j) { // set parameters updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str()); } - pstr = strtok(NULL, ","); + pstr = std::strtok(NULL, ","); } tparam.updater_initialized = 1; } @@ -279,6 +280,7 @@ class GBTree : public IGradBooster { updater_initialized = 0; } inline void SetParam(const char *name, const char *val){ + using namespace std; if (!strcmp(name, "updater") && strcmp(updater_seq.c_str(), val) != 0) { updater_seq = val; @@ -319,7 +321,7 @@ class GBTree : public IGradBooster { num_pbuffer = 0; num_output_group = 1; size_leaf_vector = 0; - memset(reserved, 0, sizeof(reserved)); + std::memset(reserved, 0, sizeof(reserved)); } /*! * \brief set parameters from outside @@ -327,6 +329,7 @@ class GBTree : public IGradBooster { * \param val value of the parameter */ inline void SetParam(const char *name, const char *val) { + using namespace std; if (!strcmp("num_pbuffer", name)) num_pbuffer = atol(val); if (!strcmp("num_output_group", name)) num_output_group = atol(val); if (!strcmp("bst:num_roots", name)) num_roots = atoi(val); diff --git a/src/io/io.cpp b/src/io/io.cpp index dead398f7..d251d7a96 100644 --- a/src/io/io.cpp +++ b/src/io/io.cpp @@ -1,7 +1,6 @@ #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE #include -using namespace std; #include "./io.h" #include "../utils/io.h" #include "../utils/utils.h" diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp index 9a88a6bfa..374d621e9 100644 --- a/src/io/simple_dmatrix-inl.hpp +++ b/src/io/simple_dmatrix-inl.hpp @@ -55,8 +55,8 @@ class DMatrixSimple : public DataMatrix { RowBatch::Inst inst = batch[i]; row_data_.resize(row_data_.size() + inst.length); if (inst.length != 0) { - memcpy(&row_data_[row_ptr_.back()], inst.data, - sizeof(RowBatch::Entry) * inst.length); + std::memcpy(&row_data_[row_ptr_.back()], inst.data, + sizeof(RowBatch::Entry) * inst.length); } row_ptr_.push_back(row_ptr_.back() + inst.length); } @@ -82,6 +82,7 @@ class DMatrixSimple : public DataMatrix { * \param silent whether print information or not */ inline void LoadText(const char* fname, bool silent = false) { + using namespace std; this->Clear(); FILE* file = utils::FopenCheck(fname, "r"); float label; bool init = true; @@ -135,7 +136,7 @@ class DMatrixSimple : public DataMatrix { * \return whether loading is success */ inline bool LoadBinary(const char* fname, bool silent = false) { - FILE *fp = fopen64(fname, "rb"); + std::FILE *fp = fopen64(fname, "rb"); if (fp == NULL) return false; utils::FileStream fs(fp); this->LoadBinary(fs, silent, fname); @@ -208,6 
+209,7 @@ class DMatrixSimple : public DataMatrix { * \param savebuffer whether do save binary buffer if it is text */ inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true) { + using namespace std; size_t len = strlen(fname); if (len > 8 && !strcmp(fname + len - 7, ".buffer")) { if (!this->LoadBinary(fname, silent)) { diff --git a/src/learner/dmatrix.h b/src/learner/dmatrix.h index bef84900a..b58f7b2bb 100644 --- a/src/learner/dmatrix.h +++ b/src/learner/dmatrix.h @@ -90,6 +90,7 @@ struct MetaInfo { } // try to load group information from file, if exists inline bool TryLoadGroup(const char* fname, bool silent = false) { + using namespace std; FILE *fi = fopen64(fname, "r"); if (fi == NULL) return false; group_ptr.push_back(0); @@ -105,6 +106,7 @@ struct MetaInfo { return true; } inline std::vector& GetFloatInfo(const char *field) { + using namespace std; if (!strcmp(field, "label")) return labels; if (!strcmp(field, "weight")) return weights; if (!strcmp(field, "base_margin")) return base_margin; @@ -115,6 +117,7 @@ struct MetaInfo { return ((MetaInfo*)this)->GetFloatInfo(field); } inline std::vector &GetUIntInfo(const char *field) { + using namespace std; if (!strcmp(field, "root_index")) return info.root_index; if (!strcmp(field, "fold_index")) return info.fold_index; utils::Error("unknown field %s", field); @@ -125,6 +128,7 @@ struct MetaInfo { } // try to load weight information from file, if exists inline bool TryLoadFloatInfo(const char *field, const char* fname, bool silent = false) { + using namespace std; std::vector &data = this->GetFloatInfo(field); FILE *fi = fopen64(fname, "r"); if (fi == NULL) return false; diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp index 52877e17b..fb0b8953d 100644 --- a/src/learner/evaluation-inl.hpp +++ b/src/learner/evaluation-inl.hpp @@ -147,10 +147,11 @@ struct EvalAMS : public IEvaluator { explicit EvalAMS(const char *name) { name_ = name; // note: ams@0 will automatically select which ratio to go - utils::Check(sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format"); + utils::Check(std::sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format"); } virtual float Eval(const std::vector &preds, const MetaInfo &info) const { + using namespace std; const bst_omp_uint ndata = static_cast(info.labels.size()); utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams"); @@ -202,6 +203,7 @@ struct EvalAMS : public IEvaluator { struct EvalPrecisionRatio : public IEvaluator{ public: explicit EvalPrecisionRatio(const char *name) : name_(name) { + using namespace std; if (sscanf(name, "apratio@%f", &ratio_) == 1) { use_ap = 1; } else { @@ -342,6 +344,7 @@ struct EvalRankList : public IEvaluator { protected: explicit EvalRankList(const char *name) { + using namespace std; name_ = name; minus_ = false; if (sscanf(name, "%*[^@]@%u[-]?", &topn_) != 1) { @@ -388,7 +391,7 @@ struct EvalNDCG : public EvalRankList{ for (size_t i = 0; i < rec.size() && i < this->topn_; ++i) { const unsigned rel = rec[i].second; if (rel != 0) { - sumdcg += ((1 << rel) - 1) / log(i + 2.0); + sumdcg += ((1 << rel) - 1) / std::log(i + 2.0); } } return static_cast(sumdcg); diff --git a/src/learner/evaluation.h b/src/learner/evaluation.h index ec37e1f4a..f34d832c8 100644 --- a/src/learner/evaluation.h +++ b/src/learner/evaluation.h @@ -36,6 +36,7 @@ struct IEvaluator{ namespace xgboost { namespace learner { inline IEvaluator* CreateEvaluator(const char *name) { + using namespace std; if (!strcmp(name, 
"rmse")) return new EvalRMSE(); if (!strcmp(name, "error")) return new EvalError(); if (!strcmp(name, "merror")) return new EvalMatchError(); @@ -56,6 +57,7 @@ inline IEvaluator* CreateEvaluator(const char *name) { class EvalSet{ public: inline void AddEval(const char *name) { + using namespace std; for (size_t i = 0; i < evals_.size(); ++i) { if (!strcmp(name, evals_[i]->Name())) return; } diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp index 5d7c9d06a..05519de8b 100644 --- a/src/learner/learner-inl.hpp +++ b/src/learner/learner-inl.hpp @@ -79,6 +79,7 @@ class BoostLearner { * \param val value of the parameter */ inline void SetParam(const char *name, const char *val) { + using namespace std; // in this version, bst: prefix is no longer required if (strncmp(name, "bst:", 4) != 0) { std::string n = "bst:"; n += name; @@ -290,7 +291,7 @@ class BoostLearner { base_score = 0.5f; num_feature = 0; num_class = 0; - memset(reserved, 0, sizeof(reserved)); + std::memset(reserved, 0, sizeof(reserved)); } /*! * \brief set parameters from outside @@ -298,6 +299,7 @@ class BoostLearner { * \param val value of the parameter */ inline void SetParam(const char *name, const char *val) { + using namespace std; if (!strcmp("base_score", name)) base_score = static_cast(atof(val)); if (!strcmp("num_class", name)) num_class = atoi(val); if (!strcmp("bst:num_feature", name)) num_feature = atoi(val); diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp index 576549eac..96aacf12d 100644 --- a/src/learner/objective-inl.hpp +++ b/src/learner/objective-inl.hpp @@ -101,6 +101,7 @@ class RegLossObj : public IObjFunction{ } virtual ~RegLossObj(void) {} virtual void SetParam(const char *name, const char *val) { + using namespace std; if (!strcmp("scale_pos_weight", name)) { scale_pos_weight = static_cast(atof(val)); } @@ -156,6 +157,7 @@ class SoftmaxMultiClassObj : public IObjFunction { } virtual ~SoftmaxMultiClassObj(void) {} virtual void SetParam(const char *name, const char *val) { + using namespace std; if (!strcmp( "num_class", name )) nclass = atoi(val); } virtual void GetGradient(const std::vector &preds, @@ -247,6 +249,7 @@ class LambdaRankObj : public IObjFunction { } virtual ~LambdaRankObj(void) {} virtual void SetParam(const char *name, const char *val) { + using namespace std; if (!strcmp( "loss_type", name )) loss.loss_type = atoi(val); if (!strcmp( "fix_list_weight", name)) fix_list_weight = static_cast(atof(val)); if (!strcmp( "num_pairsample", name)) num_pairsample = atoi(val); diff --git a/src/learner/objective.h b/src/learner/objective.h index d741ba61f..6b11b7d18 100644 --- a/src/learner/objective.h +++ b/src/learner/objective.h @@ -67,6 +67,7 @@ namespace xgboost { namespace learner { /*! \brief factory funciton to create objective function by name */ inline IObjFunction* CreateObjFunction(const char *name) { + using namespace std; if (!strcmp("reg:linear", name)) return new RegLossObj(LossType::kLinearSquare); if (!strcmp("reg:logistic", name)) return new RegLossObj(LossType::kLogisticNeglik); if (!strcmp("binary:logistic", name)) return new RegLossObj(LossType::kLogisticClassify); diff --git a/src/tree/model.h b/src/tree/model.h index 6d885faa7..8049a1608 100644 --- a/src/tree/model.h +++ b/src/tree/model.h @@ -53,7 +53,7 @@ class TreeModel { Param(void) { max_depth = 0; size_leaf_vector = 0; - memset(reserved, 0, sizeof(reserved)); + std::memset(reserved, 0, sizeof(reserved)); } /*! 
* \brief set parameters from outside @@ -61,6 +61,7 @@ class TreeModel { * \param val value of the parameter */ inline void SetParam(const char *name, const char *val) { + using namespace std; if (!strcmp("num_roots", name)) num_roots = atoi(val); if (!strcmp("num_feature", name)) num_feature = atoi(val); if (!strcmp("size_leaf_vector", name)) size_leaf_vector = atoi(val); diff --git a/src/tree/param.h b/src/tree/param.h index 79bc162c3..04ea5277f 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -62,6 +62,7 @@ struct TrainParam{ * \param val value of the parameter */ inline void SetParam(const char *name, const char *val) { + using namespace std; // sync-names if (!strcmp(name, "gamma")) min_split_loss = static_cast(atof(val)); if (!strcmp(name, "eta")) learning_rate = static_cast(atof(val)); diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp index 09b63eb49..2cb6552fe 100644 --- a/src/tree/updater.cpp +++ b/src/tree/updater.cpp @@ -1,7 +1,6 @@ #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE #include -using namespace std; #include "./updater.h" #include "./updater_prune-inl.hpp" #include "./updater_refresh-inl.hpp" @@ -10,6 +9,7 @@ using namespace std; namespace xgboost { namespace tree { IUpdater* CreateUpdater(const char *name) { + using namespace std; if (!strcmp(name, "prune")) return new TreePruner(); if (!strcmp(name, "refresh")) return new TreeRefresher(); if (!strcmp(name, "grow_colmaker")) return new ColMaker(); diff --git a/src/tree/updater_prune-inl.hpp b/src/tree/updater_prune-inl.hpp index 98fdf5ee4..726999f55 100644 --- a/src/tree/updater_prune-inl.hpp +++ b/src/tree/updater_prune-inl.hpp @@ -17,6 +17,7 @@ class TreePruner: public IUpdater { virtual ~TreePruner(void) {} // set training parameter virtual void SetParam(const char *name, const char *val) { + using namespace std; param.SetParam(name, val); if (!strcmp(name, "silent")) silent = atoi(val); } diff --git a/src/utils/fmap.h b/src/utils/fmap.h index f9437cc6c..607f37013 100644 --- a/src/utils/fmap.h +++ b/src/utils/fmap.h @@ -24,15 +24,15 @@ class FeatMap { // function definitions /*! \brief load feature map from text format */ inline void LoadText(const char *fname) { - FILE *fi = utils::FopenCheck(fname, "r"); + std::FILE *fi = utils::FopenCheck(fname, "r"); this->LoadText(fi); - fclose(fi); + std::fclose(fi); } /*! \brief load feature map from text format */ - inline void LoadText(FILE *fi) { + inline void LoadText(std::FILE *fi) { int fid; char fname[1256], ftype[1256]; - while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3) { + while (std::fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3) { this->PushBack(fid, fname, ftype); } } @@ -62,6 +62,7 @@ class FeatMap { private: inline static Type GetType(const char *tname) { + using namespace std; if (!strcmp("i", tname)) return kIndicator; if (!strcmp("q", tname)) return kQuantitive; if (!strcmp("int", tname)) return kInteger; diff --git a/src/utils/io.h b/src/utils/io.h index a15e2f0ce..026e3fec7 100644 --- a/src/utils/io.h +++ b/src/utils/io.h @@ -91,21 +91,21 @@ class IStream { /*! 
\brief implementation of file i/o stream */ class FileStream : public IStream { private: - FILE *fp; + std::FILE *fp; public: - explicit FileStream(FILE *fp) : fp(fp) { + explicit FileStream(std::FILE *fp) : fp(fp) { } virtual size_t Read(void *ptr, size_t size) { - return fread(ptr, size, 1, fp); + return std::fread(ptr, size, 1, fp); } virtual void Write(const void *ptr, size_t size) { - fwrite(ptr, size, 1, fp); + std::fwrite(ptr, size, 1, fp); } inline void Seek(size_t pos) { - fseek(fp, 0, SEEK_SET); + std::fseek(fp, 0, SEEK_SET); } inline void Close(void) { - fclose(fp); + std::fclose(fp); } }; diff --git a/src/utils/random.h b/src/utils/random.h index 57e1f243d..1e3e617f9 100644 --- a/src/utils/random.h +++ b/src/utils/random.h @@ -53,7 +53,7 @@ inline double NextDouble(void) { } /*! \brief return a random number in n */ inline uint32_t NextUInt32(uint32_t n) { - return (uint32_t)floor(NextDouble() * n); + return (uint32_t)std::floor(NextDouble() * n); } /*! \brief return x~N(mu,sigma^2) */ inline double SampleNormal(double mu, double sigma) { diff --git a/src/utils/utils.h b/src/utils/utils.h index c319c5ab7..afe17f64c 100644 --- a/src/utils/utils.h +++ b/src/utils/utils.h @@ -149,8 +149,8 @@ inline void Error(const char *fmt, ...) { #endif /*! \brief replace fopen, report error when the file open fails */ -inline FILE *FopenCheck(const char *fname, const char *flag) { - FILE *fp = fopen64(fname, flag); +inline std::FILE *FopenCheck(const char *fname, const char *flag) { + std::FILE *fp = fopen64(fname, flag); Check(fp != NULL, "can not open file \"%s\"\n", fname); return fp; } From ac8958b2844c3fbbf1570ae1b18633360ca84082 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 2 Sep 2014 23:07:50 -0700 Subject: [PATCH 06/22] move custom obj build in into booster --- wrapper/xgboost.py | 235 ++++++++++++++++++++------------------------- 1 file changed, 106 insertions(+), 129 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index a0a88af47..2ae12c341 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -213,77 +213,6 @@ class DMatrix: self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex))) return res -class CVPack: - def __init__(self, dtrain, dtest, param): - self.dtrain = dtrain - self.dtest = dtest - self.watchlist = watchlist = [ (dtrain,'train'), (dtest, 'test') ] - self.bst = Booster(param, [dtrain,dtest]) - def update(self,r): - self.bst.update(self.dtrain, r) - def eval(self,r): - return self.bst.eval_set(self.watchlist, r) - -def mknfold(dall, nfold, param, seed, weightscale=None, evals=[], set_pos_weight=None): - """ - mk nfold list of cvpack from randidx - """ - randidx = range(dall.num_row()) - random.seed(seed) - random.shuffle(randidx) - - idxset = [] - kstep = len(randidx) / nfold - for i in range(nfold): - idxset.append(randidx[ (i*kstep) : min(len(randidx),(i+1)*kstep) ]) - - ret = [] - for k in range(nfold): - trainlst = [] - for j in range(nfold): - if j == k: - testlst = idxset[j] - else: - trainlst += idxset[j] - dtrain = dall.slice(trainlst) - dtest = dall.slice(testlst) - # rescale weight of dtrain and dtest - if weightscale != None: - dtrain.set_weight( dtrain.get_weight() * weightscale * dall.num_row() / dtrain.num_row() ) - dtest.set_weight( dtest.get_weight() * weightscale * dall.num_row() / dtest.num_row() ) - if set_pos_weight != None: - label = dtrain.get_label() - weight = dtrain.get_weight() - sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 ) - sum_wneg = sum( weight[i] for i in range(len(label)) if 
label[i] == 0.0 ) - param['scale_pos_weight'] = sum_wneg/sum_wpos - plst = param.items() + [('eval_metric', itm) for itm in evals] - ret.append(CVPack(dtrain, dtest, plst)) - return ret - -def aggcv(rlist): - """ - aggregate cross validation results - """ - cvmap = {} - arr = rlist[0].split() - ret = arr[0] - for it in arr[1:]: - k, v = it.split(':') - cvmap[k] = [float(v)] - for line in rlist[1:]: - arr = line.split() - assert ret == arr[0] - for it in arr[1:]: - k, v = it.split(':') - cvmap[k].append(float(v)) - - for k, v in sorted(cvmap.items(), key = lambda x:x[0]): - v = np.array(v) - ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v)) - return ret - - class Booster: """learner class """ def __init__(self, params={}, cache=[], model_file = None): @@ -324,7 +253,7 @@ class Booster: self.handle, ctypes.c_char_p(k.encode('utf-8')), ctypes.c_char_p(str(v).encode('utf-8'))) - def update(self, dtrain, it): + def update(self, dtrain, it, fobj=None): """ update Args: @@ -332,11 +261,19 @@ class Booster: the training DMatrix it: int current iteration number + fobj: function + cutomzied objective function Returns: None """ assert isinstance(dtrain, DMatrix) - xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle) + if fobj is None: + xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle) + else: + pred = self.predict( dtrain ) + grad, hess = fobj( pred, dtrain ) + self.boost( dtrain, grad, hess ) + def boost(self, dtrain, grad, hess): """ update Args: @@ -353,22 +290,31 @@ class Booster: (ctypes.c_float*len(grad))(*grad), (ctypes.c_float*len(hess))(*hess), len(grad)) - def eval_set(self, evals, it = 0): + def eval_set(self, evals, it = 0, feval = None): """evaluates by metric Args: evals: list of tuple (DMatrix, string) lists of items to be evaluated it: int + feval: function + custom evaluation function Returns: evals result """ - for d in evals: - assert isinstance(d[0], DMatrix) - assert isinstance(d[1], str) - dmats = (ctypes.c_void_p * len(evals) )(*[ d[0].handle for d in evals]) - evnames = (ctypes.c_char_p * len(evals))( - * [ctypes.c_char_p(d[1].encode('utf-8')) for d in evals]) - return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals)) + if feval is None: + for d in evals: + assert isinstance(d[0], DMatrix) + assert isinstance(d[1], str) + dmats = (ctypes.c_void_p * len(evals) )(*[ d[0].handle for d in evals]) + evnames = (ctypes.c_char_p * len(evals))( + * [ctypes.c_char_p(d[1].encode('utf-8')) for d in evals]) + return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals)) + else: + res = '[%d]' % it + for dm, evname in evals: + name, val = feval(self.predict(dm), dm) + res += '\t%s-%s:%f' % (evname, name, val) + return res def eval(self, mat, name = 'eval', it = 0): return self.eval_set( [(mat,name)], it) def predict(self, data, output_margin=False, ntree_limit=0): @@ -453,31 +399,7 @@ class Booster: fmap[fid]+= 1 return fmap -def evaluate(bst, evals, it, feval = None): - """evaluation on eval set - Args: - bst: XGBoost object - object of XGBoost model - evals: list of tuple (DMatrix, string) - obj need to be evaluated - it: int - feval: optional - Returns: - eval result - """ - if feval != None: - res = '[%d]' % it - for dm, evname in evals: - name, val = feval(bst.predict(dm), dm) - res += '\t%s-%s:%f' % (evname, name, val) - else: - res = bst.eval_set(evals, it) - - return res - - - -def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None): +def train(params, dtrain, num_boost_round = 10, evals = [], 
fobj=None, feval=None): """ train a booster with given paramaters Args: params: dict @@ -488,27 +410,84 @@ def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None num of round to be boosted evals: list list of items to be evaluated - obj: - feval: + fobj: function + cutomized objective function + feval: function + cutomized evaluation function """ bst = Booster(params, [dtrain]+[ d[0] for d in evals ] ) - if obj is None: - for i in range(num_boost_round): - bst.update( dtrain, i ) - if len(evals) != 0: - sys.stderr.write(evaluate(bst, evals, i, feval).decode()+'\n') - else: - # try customized objective function - for i in range(num_boost_round): - pred = bst.predict( dtrain ) - grad, hess = obj( pred, dtrain ) - bst.boost( dtrain, grad, hess ) - if len(evals) != 0: - sys.stderr.write(evaluate(bst, evals, i, feval)+'\n') + for i in range(num_boost_round): + bst.update( dtrain, i, fobj ) + if len(evals) != 0: + sys.stderr.write(bst.eval_set(evals, i, feval).decode()+'\n') return bst -def cv(params, dtrain, num_boost_round = 10, nfold=3, evals = [], \ - weightscale=None, obj=None, feval=None, set_pos_weight=None): +class CVPack: + def __init__(self, dtrain, dtest, param): + self.dtrain = dtrain + self.dtest = dtest + self.watchlist = watchlist = [ (dtrain,'train'), (dtest, 'test') ] + self.bst = Booster(param, [dtrain,dtest]) + def update(self, r, fobj): + self.bst.update(self.dtrain, r, fobj) + def eval(self, r, fval): + return self.bst.eval_set(self.watchlist, r, feval) + +def mknfold(dall, nfold, param, seed, weightscale=None, evals=[]): + """ + mk nfold list of cvpack from randidx + """ + randidx = range(dall.num_row()) + random.seed(seed) + random.shuffle(randidx) + + idxset = [] + kstep = len(randidx) / nfold + for i in range(nfold): + idxset.append(randidx[ (i*kstep) : min(len(randidx),(i+1)*kstep) ]) + + ret = [] + for k in range(nfold): + trainlst = [] + for j in range(nfold): + if j == k: + testlst = idxset[j] + else: + trainlst += idxset[j] + dtrain = dall.slice(trainlst) + dtest = dall.slice(testlst) + # rescale weight of dtrain and dtest + if weightscale != None: + dtrain.set_weight( dtrain.get_weight() * weightscale * dall.num_row() / dtrain.num_row() ) + dtest.set_weight( dtest.get_weight() * weightscale * dall.num_row() / dtest.num_row() ) + plst = param.items() + [('eval_metric', itm) for itm in evals] + ret.append(CVPack(dtrain, dtest, plst)) + return ret + +def aggcv(rlist): + """ + aggregate cross validation results + """ + cvmap = {} + arr = rlist[0].split() + ret = arr[0] + for it in arr[1:]: + k, v = it.split(':') + cvmap[k] = [float(v)] + for line in rlist[1:]: + arr = line.split() + assert ret == arr[0] + for it in arr[1:]: + k, v = it.split(':') + cvmap[k].append(float(v)) + + for k, v in sorted(cvmap.items(), key = lambda x:x[0]): + v = np.array(v) + ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v)) + return ret + +def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metrics = [], \ + weightscale=None, fobj=None, feval=None): """ cross validation with given paramaters Args: params: dict @@ -521,14 +500,12 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, evals = [], \ folds to do cv evals: list list of items to be evaluated - obj: + fobj: feval: - set_pos_weight: bool, optional - Adjust pos weight by number """ - cvfolds = mknfold(dtrain, nfold, params, 0, weightscale, evals) + cvfolds = mknfold(dtrain, nfold, params, 0, weightscale, evals_metrics) for i in range(num_boost_round): for f in cvfolds: - f.update(i) - res = 
aggcv([f.eval(i) for f in cvfolds]) + f.update(i, fobj) + res = aggcv([f.eval(i, fval) for f in cvfolds]) sys.stderr.write(res+'\n') From 06b5533209fc14c2c6b3a1d4491be6939272a9f4 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 2 Sep 2014 23:15:41 -0700 Subject: [PATCH 07/22] chg fobj back to obj, to keep parameter name unchanged --- wrapper/xgboost.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 2ae12c341..6b9bc83c6 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -399,7 +399,7 @@ class Booster: fmap[fid]+= 1 return fmap -def train(params, dtrain, num_boost_round = 10, evals = [], fobj=None, feval=None): +def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None): """ train a booster with given paramaters Args: params: dict @@ -410,14 +410,14 @@ def train(params, dtrain, num_boost_round = 10, evals = [], fobj=None, feval=Non num of round to be boosted evals: list list of items to be evaluated - fobj: function + obj: function cutomized objective function feval: function cutomized evaluation function """ bst = Booster(params, [dtrain]+[ d[0] for d in evals ] ) for i in range(num_boost_round): - bst.update( dtrain, i, fobj ) + bst.update( dtrain, i, obj ) if len(evals) != 0: sys.stderr.write(bst.eval_set(evals, i, feval).decode()+'\n') return bst @@ -487,7 +487,7 @@ def aggcv(rlist): return ret def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metrics = [], \ - weightscale=None, fobj=None, feval=None): + weightscale=None, obj=None, feval=None): """ cross validation with given paramaters Args: params: dict @@ -500,12 +500,12 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metrics = [], \ folds to do cv evals: list list of items to be evaluated - fobj: + obj: feval: """ cvfolds = mknfold(dtrain, nfold, params, 0, weightscale, evals_metrics) for i in range(num_boost_round): for f in cvfolds: - f.update(i, fobj) + f.update(i, obj) res = aggcv([f.eval(i, fval) for f in cvfolds]) sys.stderr.write(res+'\n') From 582ef2f9d58f865b91e0df1a412beb27157e84ba Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 2 Sep 2014 23:29:48 -0700 Subject: [PATCH 08/22] Update DESCRIPTION --- R-package/DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 40705e317..baf5912f4 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -12,7 +12,7 @@ Description: This package is a R wrapper of xgboost, which is short for eXtreme parallel computation with OpenMP, and it can be more than 10 times faster than existing gradient boosting packages such as gbm. It supports various objective functions, including regression, classification and ranking. The - package is made to be extensible, so that user are also allowed to define + package is made to be extensible, so that users are also allowed to define their own objectives easily. 
 License: Apache License (== 2.0) | file LICENSE
 URL: https://github.com/tqchen/xgboost

From 642b5bda0a6928588619678f0fa323bfe6011d92 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Tue, 2 Sep 2014 23:30:53 -0700
Subject: [PATCH 09/22] Update DESCRIPTION

---
 R-package/DESCRIPTION | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index baf5912f4..33258bf5c 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,14 +1,14 @@
 Package: xgboost
 Type: Package
 Title: eXtreme Gradient Boosting
-Version: 0.3-0
+Version: 0.3-1
 Date: 2014-08-23
 Author: Tianqi Chen, Tong He
 Maintainer: Tong He
 Description: This package is a R wrapper of xgboost, which is short for eXtreme
     Gradient Boosting. It is an efficient and scalable implementation of
     gradient boosting framework. The package includes efficient linear model
-    solver and tree learning algorithm. The package can automatically do
+    solver and tree learning algorithms. The package can automatically do
     parallel computation with OpenMP, and it can be more than 10 times faster
     than existing gradient boosting packages such as gbm. It supports various
     objective functions, including regression, classification and ranking. The

From 85dbaf638bfbb75c023203893cd851920f948cd9 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Tue, 2 Sep 2014 23:33:04 -0700
Subject: [PATCH 10/22] Update xgboost.Rnw

---
 R-package/vignettes/xgboost.Rnw | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/R-package/vignettes/xgboost.Rnw b/R-package/vignettes/xgboost.Rnw
index 19254abaf..9ecceca17 100644
--- a/R-package/vignettes/xgboost.Rnw
+++ b/R-package/vignettes/xgboost.Rnw
@@ -52,8 +52,7 @@ This is an introductory document of using the \verb@xgboost@ package in R.
 and scalable implementation of gradient boosting framework by \citep{friedman2001greedy}.
 The package includes efficient linear model solver and tree learning algorithm.
 It supports various objective functions, including regression, classification
-and ranking. The package is made to be extendible, so that user are also allowed
-to define there own objectives easily. It has several features:
+and ranking. The package is made to be extensible, so that users are also allowed to define their own objectives easily. It has several features:
 \begin{enumerate}
 \item{Speed: }{\verb@xgboost@ can automatically do parallel computation on
 Windows and Linux, with openmp. It is generally over 10 times faster than
@@ -137,13 +136,10 @@ diris = xgb.DMatrix('iris.xgb.DMatrix')
 
 \section{Advanced Examples}
 
-The function \verb@xgboost@ is a simple function with less parameters, in order
-to be R-friendly. The core training function is wrapped in \verb@xgb.train@. It
-is more flexible than \verb@xgboost@, but it requires users to read the document
-a bit more carefully.
+The function \verb@xgboost@ is a simple function with fewer parameters, in order
+to be R-friendly. The core training function is wrapped in \verb@xgb.train@. It is more flexible than \verb@xgboost@, but it requires users to read the document a bit more carefully.
 
-\verb@xgb.train@ only accept a \verb@xgb.DMatrix@ object as its input, while it
-supports advanced features as custom objective and evaluation functions.
+\verb@xgb.train@ only accepts a \verb@xgb.DMatrix@ object as its input, while it supports advanced features such as custom objective and evaluation functions.
 
 <>=
 logregobj <- function(preds, dtrain) {
@@ -213,3 +209,4 @@ competition.
\bibliography{xgboost} \end{document} + From 02dd8d121295c18564b62d191c5f9262011f1cc7 Mon Sep 17 00:00:00 2001 From: antinucleon Date: Wed, 3 Sep 2014 00:37:55 -0600 Subject: [PATCH 11/22] chg --- wrapper/xgboost.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 6b9bc83c6..2eea30483 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -296,7 +296,7 @@ class Booster: evals: list of tuple (DMatrix, string) lists of items to be evaluated it: int - feval: function + feval: function custom evaluation function Returns: evals result @@ -325,7 +325,7 @@ class Booster: the dmatrix storing the input output_margin: bool whether output raw margin value that is untransformed - + ntree_limit: limit number of trees in prediction, default to 0, 0 means using all the trees Returns: numpy array of prediction @@ -430,10 +430,10 @@ class CVPack: self.bst = Booster(param, [dtrain,dtest]) def update(self, r, fobj): self.bst.update(self.dtrain, r, fobj) - def eval(self, r, fval): + def eval(self, r, feval): return self.bst.eval_set(self.watchlist, r, feval) -def mknfold(dall, nfold, param, seed, weightscale=None, evals=[]): +def mknfold(dall, nfold, param, seed, evals=[]): """ mk nfold list of cvpack from randidx """ @@ -457,9 +457,6 @@ def mknfold(dall, nfold, param, seed, weightscale=None, evals=[]): dtrain = dall.slice(trainlst) dtest = dall.slice(testlst) # rescale weight of dtrain and dtest - if weightscale != None: - dtrain.set_weight( dtrain.get_weight() * weightscale * dall.num_row() / dtrain.num_row() ) - dtest.set_weight( dtest.get_weight() * weightscale * dall.num_row() / dtest.num_row() ) plst = param.items() + [('eval_metric', itm) for itm in evals] ret.append(CVPack(dtrain, dtest, plst)) return ret @@ -487,7 +484,7 @@ def aggcv(rlist): return ret def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metrics = [], \ - weightscale=None, obj=None, feval=None): + obj=None, feval=None): """ cross validation with given paramaters Args: params: dict @@ -503,9 +500,9 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metrics = [], \ obj: feval: """ - cvfolds = mknfold(dtrain, nfold, params, 0, weightscale, evals_metrics) + cvfolds = mknfold(dtrain, nfold, params, 0, eval_metrics) for i in range(num_boost_round): for f in cvfolds: f.update(i, obj) - res = aggcv([f.eval(i, fval) for f in cvfolds]) + res = aggcv([f.eval(i, feval) for f in cvfolds]) sys.stderr.write(res+'\n') From 998ca3bdc951c00453fd955a055421a95a170cf9 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 3 Sep 2014 11:46:33 -0700 Subject: [PATCH 12/22] make some changes to cv --- wrapper/xgboost.py | 50 +++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 2eea30483..6dadaf613 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -433,31 +433,22 @@ class CVPack: def eval(self, r, feval): return self.bst.eval_set(self.watchlist, r, feval) -def mknfold(dall, nfold, param, seed, evals=[]): +def mknfold(dall, nfold, param, seed, evals=[], fpreproc = None): """ mk nfold list of cvpack from randidx """ - randidx = range(dall.num_row()) - random.seed(seed) - random.shuffle(randidx) - - idxset = [] + np.random.seed(seed) + randidx = np.random.permutation(dall.num_rows()) kstep = len(randidx) / nfold - for i in range(nfold): - idxset.append(randidx[ (i*kstep) : min(len(randidx),(i+1)*kstep) ]) - + idset = [randidx[ (i*kstep) : 
min(len(randidx),(i+1)*kstep) ] for i in range(nfold)] ret = [] for k in range(nfold): - trainlst = [] - for j in range(nfold): - if j == k: - testlst = idxset[j] - else: - trainlst += idxset[j] - dtrain = dall.slice(trainlst) - dtest = dall.slice(testlst) - # rescale weight of dtrain and dtest - plst = param.items() + [('eval_metric', itm) for itm in evals] + dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i])) + dtest = all.slice(idxset[k]) + # run preprocessing on the data set if needed + if fpreproc is not None: + dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy()) + plst = tparam.items() + [('eval_metric', itm) for itm in evals] ret.append(CVPack(dtrain, dtest, plst)) return ret @@ -466,25 +457,22 @@ def aggcv(rlist): aggregate cross validation results """ cvmap = {} - arr = rlist[0].split() - ret = arr[0] - for it in arr[1:]: - k, v = it.split(':') - cvmap[k] = [float(v)] - for line in rlist[1:]: + ret = rlist[0].split()[0] + for line in rlist: arr = line.split() assert ret == arr[0] for it in arr[1:]: k, v = it.split(':') + if k not in cvmap: + cvmap[k] = [] cvmap[k].append(float(v)) - for k, v in sorted(cvmap.items(), key = lambda x:x[0]): v = np.array(v) ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v)) return ret -def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metrics = [], \ - obj=None, feval=None): +def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metric = [], \ + obj = None, feval = None, fpreproc = None): """ cross validation with given paramaters Args: params: dict @@ -495,12 +483,14 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metrics = [], \ num of round to be boosted nfold: int folds to do cv - evals: list + evals: list or list of items to be evaluated obj: feval: + fpreproc: preprocessing function that takes dtrain, dtest, + param and return transformed version of dtrain, dtest, param """ - cvfolds = mknfold(dtrain, nfold, params, 0, eval_metrics) + cvfolds = mknfold(dtrain, nfold, params, 0, eval_metrics, fpreproc) for i in range(num_boost_round): for f in cvfolds: f.update(i, obj) From 0c36231ea3dd2b292d5bdec483427bf9210b5fcb Mon Sep 17 00:00:00 2001 From: antinucleon Date: Wed, 3 Sep 2014 12:57:05 -0600 Subject: [PATCH 13/22] chg --- wrapper/xgboost.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 6dadaf613..e09c6da8c 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -437,18 +437,18 @@ def mknfold(dall, nfold, param, seed, evals=[], fpreproc = None): """ mk nfold list of cvpack from randidx """ - np.random.seed(seed) - randidx = np.random.permutation(dall.num_rows()) + np.random.seed(seed) + randidx = np.random.permutation(dall.num_row()) kstep = len(randidx) / nfold idset = [randidx[ (i*kstep) : min(len(randidx),(i+1)*kstep) ] for i in range(nfold)] ret = [] for k in range(nfold): dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i])) - dtest = all.slice(idxset[k]) + dtest = dall.slice(idset[k]) # run preprocessing on the data set if needed if fpreproc is not None: dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy()) - plst = tparam.items() + [('eval_metric', itm) for itm in evals] + plst = param.items() + [('eval_metric', itm) for itm in evals] ret.append(CVPack(dtrain, dtest, plst)) return ret @@ -483,14 +483,14 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metric = [], \ num of round to be boosted nfold: int folds to do cv - evals: list or + evals: 
From 0c36231ea3dd2b292d5bdec483427bf9210b5fcb Mon Sep 17 00:00:00 2001 From: antinucleon Date: Wed, 3 Sep 2014 12:57:05 -0600 Subject: [PATCH 13/22] chg --- wrapper/xgboost.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 6dadaf613..e09c6da8c 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -437,18 +437,18 @@ def mknfold(dall, nfold, param, seed, evals=[], fpreproc = None): """ make an nfold list of CVPack from random indices """ - np.random.seed(seed) - randidx = np.random.permutation(dall.num_rows()) + np.random.seed(seed) + randidx = np.random.permutation(dall.num_row()) kstep = len(randidx) / nfold idset = [randidx[ (i*kstep) : min(len(randidx),(i+1)*kstep) ] for i in range(nfold)] ret = [] for k in range(nfold): dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i])) - dtest = all.slice(idxset[k]) + dtest = dall.slice(idset[k]) # run preprocessing on the data set if needed if fpreproc is not None: dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy()) - plst = tparam.items() + [('eval_metric', itm) for itm in evals] + plst = param.items() + [('eval_metric', itm) for itm in evals] ret.append(CVPack(dtrain, dtest, plst)) return ret @@ -483,14 +483,14 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metric = [], \ number of rounds to boost nfold: int number of folds in cv - evals: list or + evals: list or list of items to be evaluated obj: custom objective function feval: custom evaluation function - fpreproc: preprocessing function that takes dtrain, dtest, + fpreproc: preprocessing function that takes dtrain, dtest, param and returns transformed versions of dtrain, dtest, param """ - cvfolds = mknfold(dtrain, nfold, params, 0, eval_metrics, fpreproc) + cvfolds = mknfold(dtrain, nfold, params, 0, eval_metric, fpreproc) for i in range(num_boost_round): for f in cvfolds: f.update(i, obj) 
From 3192bf82d861dd55cef8a2c0e8cd71e104cf90e3 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 3 Sep 2014 12:15:57 -0700 Subject: [PATCH 14/22] Update xgboost.py --- wrapper/xgboost.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index e09c6da8c..5b4eee6b8 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -448,7 +448,7 @@ def mknfold(dall, nfold, param, seed, evals=[], fpreproc = None): # run preprocessing on the data set if needed if fpreproc is not None: dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy()) - plst = param.items() + [('eval_metric', itm) for itm in evals] + plst = tparam.items() + [('eval_metric', itm) for itm in evals] ret.append(CVPack(dtrain, dtest, plst)) return ret
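Patches 12-14 settle the fold bookkeeping, and `aggcv` now folds the per-fold evaluation strings into a single mean+std summary per metric. A worked sketch of its input and output, using made-up numbers and assuming the wrapper directory is on `sys.path` as in the demos that follow:

```python
import sys
sys.path.append('../../wrapper')  # assumed location of wrapper/xgboost.py
import xgboost as xgb

# one '[round]\tname:value' string per fold, as returned by CVPack.eval;
# the error values here are invented for illustration
rlist = ['[0]\ttest-error:0.04', '[0]\ttest-error:0.06', '[0]\ttest-error:0.05']

# aggcv keeps the shared '[0]' prefix, groups the values by metric name, and
# appends 'name:mean+std' for each metric, printing:
#   [0]	test-error:0.050000+0.008165
print(xgb.aggcv(rlist))
```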
From fa11840f4b3691f56b9fe72b947393b6a06a64ad Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 3 Sep 2014 13:13:54 -0700 Subject: [PATCH 15/22] move python example --- demo/data/README.md | 2 + .../data}/agaricus.txt.test | 0 .../data}/agaricus.txt.train | 0 .../python-example => demo/data}/featmap.txt | 0 demo/guide-python/REAMDE.md | 6 + demo/guide-python/basic_walkthrough.py | 70 ++++++++++ demo/guide-python/boost_from_prediction.py | 26 ++++ demo/guide-python/custom_objective.py | 44 +++++++ demo/guide-python/predict_first_ntree.py | 22 ++++ demo/guide-python/runall.sh | 6 + wrapper/README.md | 3 +- wrapper/python-example/README.md | 3 - wrapper/python-example/demo.py | 121 ------------------ 13 files changed, 177 insertions(+), 126 deletions(-) create mode 100644 demo/data/README.md rename {wrapper/python-example => demo/data}/agaricus.txt.test (100%) rename {wrapper/python-example => demo/data}/agaricus.txt.train (100%) rename {wrapper/python-example => demo/data}/featmap.txt (100%) create mode 100644 demo/guide-python/REAMDE.md create mode 100755 demo/guide-python/basic_walkthrough.py create mode 100755 demo/guide-python/boost_from_prediction.py create mode 100755 demo/guide-python/custom_objective.py create mode 100755 demo/guide-python/predict_first_ntree.py create mode 100755 demo/guide-python/runall.sh delete mode 100644 wrapper/python-example/README.md delete mode 100755 wrapper/python-example/demo.py diff --git a/demo/data/README.md b/demo/data/README.md new file mode 100644 index 000000000..d2d63ec11 --- /dev/null +++ b/demo/data/README.md @@ -0,0 +1,2 @@ +This folder contains the processed example datasets used by the demos. +Copyright of the datasets belongs to the original copyright holders. diff --git a/wrapper/python-example/agaricus.txt.test b/demo/data/agaricus.txt.test similarity index 100% rename from wrapper/python-example/agaricus.txt.test rename to demo/data/agaricus.txt.test diff --git a/wrapper/python-example/agaricus.txt.train b/demo/data/agaricus.txt.train similarity index 100% rename from wrapper/python-example/agaricus.txt.train rename to demo/data/agaricus.txt.train diff --git a/wrapper/python-example/featmap.txt b/demo/data/featmap.txt similarity index 100% rename from wrapper/python-example/featmap.txt rename to demo/data/featmap.txt diff --git a/demo/guide-python/REAMDE.md b/demo/guide-python/REAMDE.md new file mode 100644 index 000000000..7eaec6155 --- /dev/null +++ b/demo/guide-python/REAMDE.md @@ -0,0 +1,6 @@ +XGBoost Python Feature Walkthrough +==== +* [Basic walkthrough of wrappers](guide-python/basic.py) +* [Customize loss function and evaluation metric](guide-python/custom_objective.py) +* [Boosting from existing prediction](guide-python/boost_from_prediction.py) +* [Predicting using first n trees](guide-python/predict_first_ntree.py) diff --git a/demo/guide-python/basic_walkthrough.py b/demo/guide-python/basic_walkthrough.py new file mode 100755 index 000000000..f542954ce --- /dev/null +++ b/demo/guide-python/basic_walkthrough.py @@ -0,0 +1,70 @@ +#!/usr/bin/python +import sys +import numpy as np +import scipy.sparse +# append the path to xgboost; you may need to change the following line +# alternatively, you can add the path to the PYTHONPATH environment variable +sys.path.append('../../wrapper') +import xgboost as xgb + +### simple example +# load from a text file or a binary buffer generated by xgboost +dtrain = xgb.DMatrix('../data/agaricus.txt.train') +dtest = xgb.DMatrix('../data/agaricus.txt.test') + +# specify parameters via map; definitions are the same as in the C++ version +param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } + +# specify a validation set to watch performance +watchlist = [(dtest,'eval'), (dtrain,'train')] +num_round = 2 +bst = xgb.train(param, dtrain, num_round, watchlist) + +# this is prediction +preds = bst.predict(dtest) +labels = dtest.get_label() +print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds)))) +bst.save_model('0001.model') +# dump model +bst.dump_model('dump.raw.txt') +# dump model with feature map +bst.dump_model('dump.nice.txt','../data/featmap.txt') + +# save dmatrix into binary buffer +dtest.save_binary('dtest.buffer') +bst.save_model('xgb.model') +# load model and data in +bst2 = xgb.Booster(model_file='xgb.model') +dtest2 = xgb.DMatrix('dtest.buffer') +preds2 = bst2.predict(dtest2) +# assert they are the same +assert np.sum(np.abs(preds2-preds)) == 0 + +### +# build dmatrix from scipy.sparse +print ('start running example of building DMatrix from scipy.sparse') +labels = [] +row = []; col = []; dat = [] +i = 0 +for l in open('../data/agaricus.txt.train'): + arr = l.split() + labels.append( int(arr[0])) + for it in arr[1:]: + k,v = it.split(':') + row.append(i); col.append(int(k)); dat.append(float(v)) + i += 1 +csr = scipy.sparse.csr_matrix( (dat, (row,col)) ) +dtrain = xgb.DMatrix( csr ) +dtrain.set_label(labels) +watchlist = [(dtest,'eval'), (dtrain,'train')] +bst = xgb.train( param, dtrain, num_round, watchlist ) + +print ('start running example of building DMatrix from numpy array') +# NOTE: npymat is a numpy array; we will convert it into scipy.sparse.csr_matrix in the internal
implementation, then convert to DMatrix +npymat = csr.todense() +dtrain = xgb.DMatrix( npymat) +dtrain.set_label(labels) +watchlist = [(dtest,'eval'), (dtrain,'train')] +bst = xgb.train( param, dtrain, num_round, watchlist ) + + diff --git a/demo/guide-python/boost_from_prediction.py b/demo/guide-python/boost_from_prediction.py new file mode 100755 index 000000000..0aa2e56ab --- /dev/null +++ b/demo/guide-python/boost_from_prediction.py @@ -0,0 +1,26 @@ +#!/usr/bin/python +import sys +import numpy as np +sys.path.append('../../wrapper') +import xgboost as xgb + +dtrain = xgb.DMatrix('../data/agaricus.txt.train') +dtest = xgb.DMatrix('../data/agaricus.txt.test') +watchlist = [(dtest,'eval'), (dtrain,'train')] +### +# advanced: start from an initial base prediction +# +print ('start running example of starting from an initial prediction') +# specify parameters via map; definitions are the same as in the C++ version +param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } +# train xgboost for 1 round +bst = xgb.train( param, dtrain, 1, watchlist ) +# Note: we need the margin value instead of the transformed prediction in set_base_margin +# predicting with output_margin=True will always give you margin values before the logistic transformation +ptrain = bst.predict(dtrain, output_margin=True) +ptest = bst.predict(dtest, output_margin=True) +dtrain.set_base_margin(ptrain) +dtest.set_base_margin(ptest) + +print ('this is the result of running from the initial prediction') +bst = xgb.train( param, dtrain, 1, watchlist ) diff --git a/demo/guide-python/custom_objective.py b/demo/guide-python/custom_objective.py new file mode 100755 index 000000000..5a7f110f4 --- /dev/null +++ b/demo/guide-python/custom_objective.py @@ -0,0 +1,44 @@ +#!/usr/bin/python +import sys +import numpy as np +sys.path.append('../../wrapper') +import xgboost as xgb +### +# advanced: customized loss function +# +print ('start running example of using a customized objective function') + +dtrain = xgb.DMatrix('../data/agaricus.txt.train') +dtest = xgb.DMatrix('../data/agaricus.txt.test') + +# note: for a customized objective function, we leave objective as default +# note: what we are getting is the margin value in prediction +# you must know what you are doing +param = {'max_depth':2, 'eta':1, 'silent':1 } +watchlist = [(dtest,'eval'), (dtrain,'train')] +num_round = 2 + +# user-defined objective function: given predictions, return the gradient and second-order gradient +# this is log-likelihood loss +def logregobj(preds, dtrain): + labels = dtrain.get_label() + preds = 1.0 / (1.0 + np.exp(-preds)) + grad = preds - labels + hess = preds * (1.0-preds) + return grad, hess + +# user-defined evaluation function; returns a pair (metric_name, result) +# NOTE: when you use a customized loss function, the default prediction value is the margin +# this may make the built-in evaluation metrics not function properly +# for example, when we are doing logistic loss, the prediction is the score before the logistic transformation +# but the built-in evaluation error assumes the input is after the logistic transformation +# Keep this in mind when you use the customization; you may need to write a customized evaluation function +def evalerror(preds, dtrain): + labels = dtrain.get_label() + # return a pair (metric_name, result) + # since preds are margins (before logistic transformation, cutoff at 0) + return 'error', float(sum(labels != (preds > 0.0))) / len(labels) + +# training with a customized objective; we can also do step-by-step training +# simply look at xgboost.py's implementation of train +bst = 
xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror) diff --git a/demo/guide-python/predict_first_ntree.py b/demo/guide-python/predict_first_ntree.py new file mode 100755 index 000000000..03f327e7f --- /dev/null +++ b/demo/guide-python/predict_first_ntree.py @@ -0,0 +1,22 @@ +#!/usr/bin/python +import sys +import numpy as np +sys.path.append('../../wrapper') +import xgboost as xgb + +### load data and do training +dtrain = xgb.DMatrix('../data/agaricus.txt.train') +dtest = xgb.DMatrix('../data/agaricus.txt.test') +param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } +watchlist = [(dtest,'eval'), (dtrain,'train')] +num_round = 3 +bst = xgb.train(param, dtrain, num_round, watchlist) + +print ('start testing prediction from first n trees') +### predict using the first tree only +label = dtest.get_label() +ypred1 = bst.predict(dtest, ntree_limit=1) +# by default, we predict using all the trees +ypred2 = bst.predict(dtest) +print ('error of ypred1=%f' % (np.sum((ypred1>0.5)!=label) /float(len(label)))) +print ('error of ypred2=%f' % (np.sum((ypred2>0.5)!=label) /float(len(label)))) diff --git a/demo/guide-python/runall.sh b/demo/guide-python/runall.sh new file mode 100755 index 000000000..6b37c68ca --- /dev/null +++ b/demo/guide-python/runall.sh @@ -0,0 +1,6 @@ +#!/bin/bash +python basic_walkthrough.py +python custom_objective.py +python boost_from_prediction.py +python boost_from_prediction.py +rm *~ *.model *.buffer \ No newline at end of file diff --git a/wrapper/README.md b/wrapper/README.md index 3f43fa629..e736b9b6a 100644 --- a/wrapper/README.md +++ b/wrapper/README.md @@ -2,11 +2,10 @@ Wrapper of XGBoost ===== This folder provides wrappers of xgboost for other languages - Python ===== * To make the python module, type ```make``` in the root directory of the project -* Refer to the walk through example in [python-example/demo.py](python-example/demo.py) +* Refer also to the walk-through examples in the [demo folder](../demo/guide-python) R ===== diff --git a/wrapper/python-example/README.md b/wrapper/python-example/README.md deleted file mode 100644 index be5350dd2..000000000 --- a/wrapper/python-example/README.md +++ /dev/null @@ -1,3 +0,0 @@ -example to use python xgboost, the data is generated from demo/binary_classification, in libsvm format - -for usage: see demo.py and comments in demo.py diff --git a/wrapper/python-example/demo.py b/wrapper/python-example/demo.py deleted file mode 100755 index 687b491a4..000000000 --- a/wrapper/python-example/demo.py +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/python -import sys -import numpy as np -import scipy.sparse -# append the path to xgboost, you may need to change the following line -# alternatively, you can add the path to PYTHONPATH environment variable -sys.path.append('../') -import xgboost as xgb - -### simple example -# load file from text file, also binary buffer generated by xgboost -dtrain = xgb.DMatrix('agaricus.txt.train') -dtest = xgb.DMatrix('agaricus.txt.test') - -# specify parameters via map, definition are same as c++ version -param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } - -# specify validations set to watch performance -evallist = [(dtest,'eval'), (dtrain,'train')] -num_round = 2 -bst = xgb.train(param, dtrain, num_round, evallist) - -# this is prediction -preds = bst.predict(dtest) -labels = dtest.get_label() -print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds)))) -bst.save_model('0001.model') -# dump model 
-bst.dump_model('dump.raw.txt') -# dump model with feature map -bst.dump_model('dump.nice.txt','featmap.txt') - -# save dmatrix into binary buffer -dtest.save_binary('dtest.buffer') -bst.save_model('xgb.model') -# load model and data in -bst2 = xgb.Booster(model_file='xgb.model') -dtest2 = xgb.DMatrix('dtest.buffer') -preds2 = bst2.predict(dtest2) -# assert they are the same -assert np.sum(np.abs(preds2-preds)) == 0 - -### -# build dmatrix from scipy.sparse -print ('start running example of build DMatrix from scipy.sparse') -labels = [] -row = []; col = []; dat = [] -i = 0 -for l in open('agaricus.txt.train'): - arr = l.split() - labels.append( int(arr[0])) - for it in arr[1:]: - k,v = it.split(':') - row.append(i); col.append(int(k)); dat.append(float(v)) - i += 1 -csr = scipy.sparse.csr_matrix( (dat, (row,col)) ) -dtrain = xgb.DMatrix( csr ) -dtrain.set_label(labels) -evallist = [(dtest,'eval'), (dtrain,'train')] -bst = xgb.train( param, dtrain, num_round, evallist ) - -print ('start running example of build DMatrix from numpy array') -# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation,then convert to DMatrix -npymat = csr.todense() -dtrain = xgb.DMatrix( npymat) -dtrain.set_label(labels) -evallist = [(dtest,'eval'), (dtrain,'train')] -bst = xgb.train( param, dtrain, num_round, evallist ) - -### -# advanced: cutomsized loss function -# -print ('start running example to used cutomized objective function') - -# note: for customized objective function, we leave objective as default -# note: what we are getting is margin value in prediction -# you must know what you are doing -param = {'max_depth':2, 'eta':1, 'silent':1 } - -# user define objective function, given prediction, return gradient and second order gradient -# this is loglikelihood loss -def logregobj(preds, dtrain): - labels = dtrain.get_label() - preds = 1.0 / (1.0 + np.exp(-preds)) - grad = preds - labels - hess = preds * (1.0-preds) - return grad, hess - -# user defined evaluation function, return a pair metric_name, result -# NOTE: when you do customized loss function, the default prediction value is margin -# this may make buildin evalution metric not function properly -# for example, we are doing logistic loss, the prediction is score before logistic transformation -# the buildin evaluation error assumes input is after logistic transformation -# Take this in mind when you use the customization, and maybe you need write customized evaluation function -def evalerror(preds, dtrain): - labels = dtrain.get_label() - # return a pair metric_name, result - # since preds are margin(before logistic transformation, cutoff at 0) - return 'error', float(sum(labels != (preds > 0.0))) / len(labels) - -# training with customized objective, we can also do step by step training -# simply look at xgboost.py's implementation of train -bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror) - -### -# advanced: start from a initial base prediction -# -print ('start running example to start from a initial prediction') -# specify parameters via map, definition are same as c++ version -param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } -# train xgboost for 1 round -bst = xgb.train( param, dtrain, 1, evallist ) -# Note: we need the margin value instead of transformed prediction in set_base_margin -# do predict with output_margin=True, will always give you margin values before logistic transformation -ptrain = bst.predict(dtrain, output_margin=True) 
-ptest = bst.predict(dtest, output_margin=True) -dtrain.set_base_margin(ptrain) -dtest.set_base_margin(ptest) - -print ('this is result of running from initial prediction') -bst = xgb.train( param, dtrain, 1, evallist ) 
From c1e0ff0326c46a13b52a2d9e40f5439d9f8b7328 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 3 Sep 2014 13:15:17 -0700 Subject: [PATCH 16/22] push python examples in --- demo/READMDE.md | 25 +++++++++++++++++++ demo/guide-python/{REAMDE.md => READMDE.md} | 0 2 files changed, 25 insertions(+) create mode 100644 demo/READMDE.md rename demo/guide-python/{REAMDE.md => READMDE.md} (100%) diff --git a/demo/READMDE.md b/demo/READMDE.md new file mode 100644 index 000000000..916a5cea1 --- /dev/null +++ b/demo/READMDE.md @@ -0,0 +1,25 @@ +XGBoost Examples +==== +This folder contains all the example code for xgboost. +Contributions of examples and benchmarks are more than welcome! +If you would like to share how you use xgboost to solve your problem, send a pull request :) + +Start Examples by Tasks +==== +* [Binary classification](binary_classification) +* [Multiclass classification](multiclass_classification) +* [Regression](regression) +* [Learning to Rank](rank) + +Features Walkthrough +==== +This is a list of short codes introducing different functionalities of xgboost and its wrapper. +* Basic walkthrough of wrappers. [python](guide-python/basic.py) [R](guide-R/basic.R) +* Customize loss function and evaluation metric. [python](guide-python/custom_objective.py) [R](guide-R/custom_objective.R) +* Boosting from existing prediction. [python](guide-python/boost_from_prediction.py) [R](guide-R/boost_from_prediction.R) +* Predicting using first n trees. [python](guide-python/predict_first_ntree.py) [R](guide-R/predict_first_ntree.R) +* Cross validation (to come) + +Benchmarks +==== +* [Starter script for Kaggle Higgs Boson](kaggle-higgs) diff --git a/demo/guide-python/REAMDE.md b/demo/guide-python/READMDE.md similarity index 100% rename from demo/guide-python/REAMDE.md rename to demo/guide-python/READMDE.md 
From 7a61f0dca2949dceb0a1de7fd1e81ec319d926d7 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 3 Sep 2014 13:18:36 -0700 Subject: [PATCH 17/22] ok --- demo/READMDE.md | 2 +- demo/guide-python/READMDE.md | 8 ++++---- demo/guide-python/runall.sh | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/demo/READMDE.md b/demo/READMDE.md index 916a5cea1..433324947 100644 --- a/demo/READMDE.md +++ b/demo/READMDE.md @@ -14,7 +14,7 @@ Start Examples by Tasks Features Walkthrough ==== This is a list of short codes introducing different functionalities of xgboost and its wrapper. -* Basic walkthrough of wrappers. [python](guide-python/basic.py) [R](guide-R/basic.R) +* Basic walkthrough of wrappers. [python](guide-python/basic_walkthrough.py) [R](guide-R/basic_walkthrough.R) * Customize loss function and evaluation metric. [python](guide-python/custom_objective.py) [R](guide-R/custom_objective.R) * Boosting from existing prediction. [python](guide-python/boost_from_prediction.py) [R](guide-R/boost_from_prediction.R) * Predicting using first n trees. 
[python](guide-python/predict_first_ntree.py) [R](guide-R/predict_first_ntree.R) diff --git a/demo/guide-python/READMDE.md b/demo/guide-python/READMDE.md index 7eaec6155..b2cad6b54 100644 --- a/demo/guide-python/READMDE.md +++ b/demo/guide-python/READMDE.md @@ -1,6 +1,6 @@ XGBoost Python Feature Walkthrough ==== -* [Basic walkthrough of wrappers](guide-python/basic.py) -* [Customize loss function and evaluation metric](guide-python/custom_objective.py) -* [Boosting from existing prediction](guide-python/boost_from_prediction.py) -* [Predicting using first n trees](guide-python/predict_first_ntree.py) +* [Basic walkthrough of wrappers](basic_walkthrough.py) +* [Customize loss function and evaluation metric](custom_objective.py) +* [Boosting from existing prediction](boost_from_prediction.py) +* [Predicting using first n trees](predict_first_ntree.py) diff --git a/demo/guide-python/runall.sh b/demo/guide-python/runall.sh index 6b37c68ca..4386cf262 100755 --- a/demo/guide-python/runall.sh +++ b/demo/guide-python/runall.sh @@ -2,5 +2,4 @@ python basic_walkthrough.py python custom_objective.py python boost_from_prediction.py -python boost_from_prediction.py rm *~ *.model *.buffer \ No newline at end of file 
From 60e1167b56d3dfe47a79fc7a9d4fe16d813942fd Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 3 Sep 2014 13:20:23 -0700 Subject: [PATCH 18/22] fix doc --- demo/{READMDE.md => README.md} | 0 demo/guide-R/README.md | 3 +++ demo/guide-python/{READMDE.md => README.md} | 0 3 files changed, 3 insertions(+) rename demo/{READMDE.md => README.md} (100%) create mode 100644 demo/guide-R/README.md rename demo/guide-python/{READMDE.md => README.md} (100%) diff --git a/demo/READMDE.md b/demo/README.md similarity index 100% rename from demo/READMDE.md rename to demo/README.md diff --git a/demo/guide-R/README.md b/demo/guide-R/README.md new file mode 100644 index 000000000..0c87198bc --- /dev/null +++ b/demo/guide-R/README.md @@ -0,0 +1,3 @@ +XGBoost R Feature Walkthrough +==== +To be finished diff --git a/demo/guide-python/READMDE.md b/demo/guide-python/README.md similarity index 100% rename from demo/guide-python/READMDE.md rename to demo/guide-python/README.md 
From e6359b54845f11f00f4c6c4e9d41f01ef985a775 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 3 Sep 2014 13:23:36 -0700 Subject: [PATCH 19/22] ok --- demo/guide-R/runall.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 demo/guide-R/runall.sh diff --git a/demo/guide-R/runall.sh b/demo/guide-R/runall.sh new file mode 100755 index 000000000..2d6cabcb2 --- /dev/null +++ b/demo/guide-R/runall.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# todo +Rscript basic_walkthrough.R +Rscript custom_objective.R +Rscript boost_from_prediction.R 
From 5cd92e33f6730206fa8918537081ef02b4bc2228 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 3 Sep 2014 13:24:34 -0700 Subject: [PATCH 20/22] remove R for now --- demo/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/demo/README.md b/demo/README.md index 433324947..f5606aff1 100644 --- a/demo/README.md +++ b/demo/README.md @@ -14,10 +14,10 @@ Features Walkthrough ==== This is a list of short codes introducing different functionalities of xgboost and its wrapper. -* Basic walkthrough of wrappers. [python](guide-python/basic_walkthrough.py) [R](guide-R/basic_walkthrough.R) -* Customize loss function and evaluation metric. [python](guide-python/custom_objective.py) [R](guide-R/custom_objective.R) -* Boosting from existing prediction. 
[python](guide-python/boost_from_prediction.py) [R](guide-R/boost_from_prediction.R) -* Predicting using first n trees. [python](guide-python/predict_first_ntree.py) [R](guide-R/predict_first_ntree.R) +* Basic walkthrough of wrappers. [python](guide-python/basic_walkthrough.py) +* Customize loss function and evaluation metric. [python](guide-python/custom_objective.py) +* Boosting from existing prediction. [python](guide-python/boost_from_prediction.py) +* Predicting using first n trees. [python](guide-python/predict_first_ntree.py) * Cross validation (to come) Benchmarks ==== * [Starter script for Kaggle Higgs Boson](kaggle-higgs) 
From b2586b6130b3e6b90601b84aa32ba9fe19f53356 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 3 Sep 2014 13:27:06 -0700 Subject: [PATCH 21/22] ok --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 38291b09d..ea741072d 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ Tutorial and Documentation: https://github.com/tqchen/xgboost/wiki Questions and Issues: [https://github.com/tqchen/xgboost/issues](https://github.com/tqchen/xgboost/issues?q=is%3Aissue+label%3Aquestion) +Example Code: [demo folder](demo) + Notes on the Code: [Code Guide](src) Features 
From 8952d9c3576307580a79d0017e53c5cefb57f267 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 3 Sep 2014 13:28:03 -0700 Subject: [PATCH 22/22] fix --- demo/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/demo/README.md b/demo/README.md index f5606aff1..c9145d29c 100644 --- a/demo/README.md +++ b/demo/README.md @@ -4,13 +4,6 @@ This folder contains all the example code for xgboost. Contributions of examples and benchmarks are more than welcome! If you would like to share how you use xgboost to solve your problem, send a pull request :) -Start Examples by Tasks -==== -* [Binary classification](binary_classification) -* [Multiclass classification](multiclass_classification) -* [Regression](regression) -* [Learning to Rank](rank) - Features Walkthrough ==== This is a list of short codes introducing different functionalities of xgboost and its wrapper. @@ -20,6 +13,13 @@ This is a list of short codes introducing different functionalities of xgboost and its wrapper. * Predicting using first n trees. [python](guide-python/predict_first_ntree.py) * Cross validation (to come) +Basic Examples by Tasks +==== +* [Binary classification](binary_classification) +* [Multiclass classification](multiclass_classification) +* [Regression](regression) +* [Learning to Rank](rank) + Benchmarks ==== * [Starter script for Kaggle Higgs Boson](kaggle-higgs)
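The demo README still lists cross validation as "(to come)". A minimal sketch of what such a demo could look like against the cv interface this series converges on; the file name and the data paths are assumptions modeled on the other demos, not files added by these patches:

```python
#!/usr/bin/python
# hypothetical demo/guide-python/cross_validation.py (not part of this series)
import sys
sys.path.append('../../wrapper')
import xgboost as xgb

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
# run 3-fold cross validation for 10 rounds; each round writes one
# '[i] metric:mean+std' line to stderr via aggcv
xgb.cv(param, dtrain, num_boost_round=10, nfold=3, eval_metric=['error'])
```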