Merge branch 'dev' of https://github.com/tqchen/xgboost into dev

Conflicts:
	regrank/xgboost_regrank_obj.hpp
kalenhaha, 2014-05-07 22:15:59 +08:00
commit 8b3fc78999
17 changed files with 8756 additions and 109 deletions

.gitignore

@@ -16,4 +16,5 @@
*conf
*buffer
*model
xgboost
*pyc


@@ -88,8 +88,8 @@ namespace xgboost{
}
}
if (mparam.num_pbuffer != 0){
pred_buffer.resize(mparam.num_pbuffer);
pred_counter.resize(mparam.num_pbuffer);
pred_buffer.resize(mparam.PredBufferSize());
pred_counter.resize(mparam.PredBufferSize());
utils::Assert(fi.Read(&pred_buffer[0], pred_buffer.size()*sizeof(float)) != 0);
utils::Assert(fi.Read(&pred_counter[0], pred_counter.size()*sizeof(unsigned)) != 0);
}
@@ -117,8 +117,8 @@ namespace xgboost{
*/
inline void InitModel(void){
pred_buffer.clear(); pred_counter.clear();
pred_buffer.resize(mparam.num_pbuffer, 0.0);
pred_counter.resize(mparam.num_pbuffer, 0);
pred_buffer.resize(mparam.PredBufferSize(), 0.0);
pred_counter.resize(mparam.PredBufferSize(), 0);
utils::Assert(mparam.num_boosters == 0);
utils::Assert(boosters.size() == 0);
}
@@ -130,6 +130,7 @@ namespace xgboost{
if (tparam.nthread != 0){
omp_set_num_threads(tparam.nthread);
}
if (mparam.num_booster_group == 0) mparam.num_booster_group = 1;
// make sure all the boosters get the latest parameters
for (size_t i = 0; i < this->boosters.size(); i++){
this->ConfigBooster(this->boosters[i]);
@@ -175,12 +176,14 @@ namespace xgboost{
* \param feats features of each instance
* \param root_index pre-partitioned root index of each instance,
* root_index.size() can be 0, which indicates that no pre-partitioning is involved
* \param bst_group which booster group it belongs to; by default there is only one booster group, so this parameter can be left as the default
*/
inline void DoBoost(std::vector<float> &grad,
std::vector<float> &hess,
const booster::FMatrixS &feats,
const std::vector<unsigned> &root_index) {
booster::IBooster *bst = this->GetUpdateBooster();
const std::vector<unsigned> &root_index,
int bst_group = 0 ) {
booster::IBooster *bst = this->GetUpdateBooster( bst_group );
bst->DoBoost(grad, hess, feats, root_index);
}
/*!
@@ -190,26 +193,30 @@ namespace xgboost{
* \param row_index row index in the feature matrix
* \param buffer_index the buffer index of the current feature line, default -1 means no buffer assigned
* \param root_index root id of current instance, default = 0
* \param bst_group booster group index
* \return prediction
*/
inline float Predict(const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0){
size_t istart = 0;
inline float Predict(const FMatrixS &feats, bst_uint row_index,
int buffer_index = -1, unsigned root_index = 0, int bst_group = 0 ){
size_t itop = 0;
float psum = 0.0f;
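// map (buffer_index, bst_group) to a slot in the group-major prediction buffer; bid < 0 means no buffering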
const int bid = mparam.BufferOffset(buffer_index, bst_group);
// load buffered results if any
if (mparam.do_reboost == 0 && buffer_index >= 0){
utils::Assert(buffer_index < mparam.num_pbuffer, "buffer index exceed num_pbuffer");
istart = this->pred_counter[buffer_index];
psum = this->pred_buffer[buffer_index];
if (mparam.do_reboost == 0 && bid >= 0){
itop = this->pred_counter[bid];
psum = this->pred_buffer[bid];
}
for (size_t i = istart; i < this->boosters.size(); i++){
psum += this->boosters[i]->Predict(feats, row_index, root_index);
for (size_t i = itop; i < this->boosters.size(); ++i ){
if( booster_info[i] == bst_group ){
psum += this->boosters[i]->Predict(feats, row_index, root_index);
}
}
// update the buffered results
if (mparam.do_reboost == 0 && buffer_index >= 0){
this->pred_counter[buffer_index] = static_cast<unsigned>(boosters.size());
this->pred_buffer[buffer_index] = psum;
if (mparam.do_reboost == 0 && bid >= 0){
this->pred_counter[bid] = static_cast<unsigned>(boosters.size());
this->pred_buffer[bid] = psum;
}
return psum;
}
@@ -217,6 +224,11 @@ namespace xgboost{
inline int NumBoosters(void) const{
return mparam.num_boosters;
}
/*! \return number of booster groups */
inline int NumBoosterGroup(void) const{
if( mparam.num_booster_group == 0 ) return 1;
return mparam.num_booster_group;
}
public:
//--------trial code for interactively updating an existing booster------
//-------- usually not needed, ignore this region ---------
@@ -224,14 +236,17 @@ namespace xgboost{
* \brief same as Predict, but removes the prediction of the booster to be updated
* this function must be called exactly once for every data point that has a prediction buffer
*/
inline float InteractPredict(const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0){
inline float InteractPredict(const FMatrixS &feats, bst_uint row_index,
int buffer_index = -1, unsigned root_index = 0, int bst_group = 0){
float psum = this->Predict(feats, row_index, buffer_index, root_index);
if (tparam.reupdate_booster != -1){
const int bid = tparam.reupdate_booster;
utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
psum -= boosters[bid]->Predict(feats, row_index, root_index);
if( bst_group == booster_info[bid] ){
psum -= boosters[bid]->Predict(feats, row_index, root_index);
}
if (mparam.do_reboost == 0 && buffer_index >= 0){
this->pred_buffer[buffer_index] = psum;
this->pred_buffer[mparam.BufferOffset(buffer_index,bst_group)] = psum;
}
}
return psum;
@@ -246,15 +261,21 @@ namespace xgboost{
booster_info[i - 1] = booster_info[i];
}
boosters.resize(mparam.num_boosters -= 1);
booster_info.resize(boosters.size());
// update pred counter
for( size_t i = 0; i < pred_counter.size(); ++ i ){
if( pred_counter[i] > (unsigned)bid ) pred_counter[i] -= 1;
}
}
/*! \brief update the prediction buffer after the booster has been updated */
inline void InteractRePredict(const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0){
inline void InteractRePredict(const FMatrixS &feats, bst_uint row_index,
int buffer_index = -1, unsigned root_index = 0, int bst_group = 0 ){
if (tparam.reupdate_booster != -1){
const int bid = tparam.reupdate_booster;
if( booster_info[bid] != bst_group ) return;
utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
if (mparam.do_reboost == 0 && buffer_index >= 0){
this->pred_buffer[buffer_index] += boosters[bid]->Predict(feats, row_index, root_index);
this->pred_buffer[mparam.BufferOffset(buffer_index,bst_group)] += boosters[bid]->Predict(feats, row_index, root_index);
}
}
}
@@ -278,18 +299,19 @@ namespace xgboost{
* \brief get a booster to update
* \return the booster created
*/
inline booster::IBooster *GetUpdateBooster(void){
inline booster::IBooster *GetUpdateBooster(int bst_group){
if (tparam.reupdate_booster != -1){
const int bid = tparam.reupdate_booster;
utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
this->ConfigBooster(boosters[bid]);
utils::Assert( bst_group == booster_info[bid], "booster group must match existing reupdate booster");
return boosters[bid];
}
if (mparam.do_reboost == 0 || boosters.size() == 0){
mparam.num_boosters += 1;
boosters.push_back(booster::CreateBooster<FMatrixS>(mparam.booster_type));
booster_info.push_back(0);
booster_info.push_back(bst_group);
this->ConfigBooster(boosters.back());
boosters.back()->InitModel();
}
@@ -316,8 +338,13 @@ namespace xgboost{
* set to 1 for linear booster, so that regularization term can be considered
*/
int do_reboost;
/*!
* \brief number of booster groups, i.e. how many predictions a single
* input instance corresponds to
*/
int num_booster_group;
/*! \brief reserved parameters */
int reserved[32];
int reserved[31];
/*! \brief constructor */
ModelParam(void){
num_boosters = 0;
@@ -325,6 +352,7 @@ namespace xgboost{
num_roots = num_feature = 0;
do_reboost = 0;
num_pbuffer = 0;
num_booster_group = 1;
memset(reserved, 0, sizeof(reserved));
}
/*!
@@ -338,10 +366,21 @@ namespace xgboost{
// linear booster automatically sets do_reboost
if (booster_type == 1) do_reboost = 1;
}
if (!strcmp("num_pbuffer", name)) num_pbuffer = atoi(val);
if (!strcmp("do_reboost", name)) do_reboost = atoi(val);
if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
if (!strcmp("num_pbuffer", name)) num_pbuffer = atoi(val);
if (!strcmp("do_reboost", name)) do_reboost = atoi(val);
if (!strcmp("num_booster_group", name)) num_booster_group = atoi(val);
if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
}
inline int PredBufferSize(void) const{
if (num_booster_group == 0) return num_pbuffer;
else return num_booster_group * num_pbuffer;
}
inline int BufferOffset( int buffer_index, int bst_group ) const{
if( buffer_index < 0 ) return -1;
utils::Assert( buffer_index < num_pbuffer, "buffer_index exceeds num_pbuffer" );
return buffer_index + num_pbuffer * bst_group;
}
};
/*! \brief training parameters */
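The two helpers above fix the prediction-buffer layout for multi-group models: the buffer is a num_booster_group x num_pbuffer grid stored group-major, and BufferOffset maps an (instance, group) pair to its slot. A minimal standalone sketch of that addressing scheme, with hypothetical sizes (not part of the actual class):

#include <cassert>
#include <cstdio>

// Sketch of ModelParam::BufferOffset addressing: the slots for booster
// group g occupy the range [g * num_pbuffer, (g + 1) * num_pbuffer).
int main(void){
    const int num_pbuffer = 4;        // hypothetical: 4 buffered instances
    const int num_booster_group = 3;  // hypothetical: e.g. a 3-class model
    const int buf_size = num_booster_group * num_pbuffer; // PredBufferSize()
    for (int g = 0; g < num_booster_group; ++g){
        for (int i = 0; i < num_pbuffer; ++i){
            const int bid = i + num_pbuffer * g; // BufferOffset(i, g)
            assert(bid < buf_size);
            std::printf("instance %d, group %d -> slot %d\n", i, g, bid);
        }
    }
    return 0;
}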


@@ -1,2 +1,4 @@
beta version:
python wrapper for xgboost using ctypes
see example for usage

python/example/README.md

@@ -0,0 +1,3 @@
example of using python xgboost; the data is generated from demo/binary_classification, in libsvm format
for usage, see demo.py and the comments in it

File diff suppressed because it is too large

File diff suppressed because it is too large

python/example/demo.py

@@ -0,0 +1,101 @@
#!/usr/bin/python
import sys
import numpy as np
import scipy.sparse
# append the path to xgboost
sys.path.append('../')
import xgboost as xgb
### simple example
# load data from a text file; a binary buffer generated by xgboost also works
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')
# specify parameters via a dict; definitions are the same as in the C++ version
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'loss_type':2 }
# specify validation sets to watch performance
evallist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2
bst = xgb.train( param, dtrain, num_round, evallist )
# run prediction
preds = bst.predict( dtest )
labels = dtest.get_label()
print 'error=%f' % ( sum(1 for i in xrange(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds)))
bst.save_model('0001.model')
# dump model
bst.dump_model('dump.raw.txt')
# dump model with feature map
bst.dump_model('dump.raw.txt','featmap.txt')
# beta: interact mode
bst.set_param('bst:interact:expand',4)
bst.update_interact( dtrain, 'update', 0)
bst.dump_model('dump.raw2.txt')
###
# build dmatrix in python iteratively
#
print 'start running example of building DMatrix in python'
dtrain = xgb.DMatrix()
labels = []
for l in open('agaricus.txt.train'):
arr = l.split()
labels.append( int(arr[0]))
feats = []
for it in arr[1:]:
k,v = it.split(':')
feats.append( (int(k), float(v)) )
dtrain.add_row( feats )
dtrain.set_label( labels )
evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )
###
# build dmatrix from scipy.sparse
print 'start running example of building DMatrix from scipy.sparse'
labels = []
row = []; col = []; dat = []
i = 0
for l in open('agaricus.txt.train'):
arr = l.split()
labels.append( int(arr[0]))
for it in arr[1:]:
k,v = it.split(':')
row.append(i); col.append(int(k)); dat.append(float(v))
i += 1
csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix( csr )
dtrain.set_label(labels)
evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )
print 'start running example of building DMatrix from numpy array'
# NOTE: npymat is a numpy array; internally it is converted to scipy.sparse.csr_matrix and then to DMatrix
npymat = csr.todense()
dtrain = xgb.DMatrix( npymat )
dtrain.set_label(labels)
evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )
###
# customized loss function; set loss_type to 0 so that predict returns the untransformed score
#
print 'start running example of using a customized objective function'
# note: set loss_type properly; loss_type=2 means the prediction gets logistic-transformed
# in most cases we may want loss_type = 0, to get the untransformed score for computing the gradient
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'loss_type':2 }
# user-defined objective function: given predictions, return gradient and second-order gradient
def logregobj( preds, dtrain ):
labels = dtrain.get_label()
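# with loss_type=2 the predictions are already sigmoid-transformed probabilities,
# so p - y is the logistic-loss gradient w.r.t. the raw score and p*(1-p) its second derivative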
grad = preds - labels
hess = preds * (1.0-preds)
return grad, hess
# training with a customized objective; step-by-step training is also possible, see xgboost.py's implementation of train
bst = xgb.train( param, dtrain, num_round, evallist, logregobj )

python/example/featmap.txt

@@ -0,0 +1,126 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i

python/xgboost.py

@@ -1,10 +1,13 @@
# module for xgboost
import ctypes
import os
# optionally use scipy.sparse, though it is not necessary
import numpy as np
import numpy
import numpy.ctypeslib
import scipy.sparse as scp
# set this line correctly
XGBOOST_PATH = './libxgboostpy.so'
XGBOOST_PATH = os.path.dirname(__file__)+'/libxgboostpy.so'
# entry type of sparse matrix
class REntry(ctypes.Structure):
@@ -15,7 +18,7 @@ xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH)
xglib.XGDMatrixCreate.restype = ctypes.c_void_p
xglib.XGDMatrixNumRow.restype = ctypes.c_ulong
xglib.XGDMatrixGetLabel.restype = ctypes.POINTER( ctypes.c_float )
xglib.XGDMatrixGetRow.restype = ctypes.POINTER( REntry )
xglib.XGBoosterPredict.restype = ctypes.POINTER( ctypes.c_float )
@@ -34,9 +37,9 @@ class DMatrix:
else:
try:
csr = scp.csr_matrix(data)
self.__init_from_csr(data)
self.__init_from_csr(csr)
except:
raise "DMatrix", "can not intialize DMatrix from"+type(data)
raise Exception, "cannot initialize DMatrix from "+str(type(data))
if label != None:
self.set_label(label)
@@ -69,8 +72,8 @@ class DMatrix:
# get label from dmatrix
def get_label(self):
length = ctypes.c_ulong()
labels = xglib.XGDMatrixGetLabel(self.handle, ctypes.byref(length));
return [ labels[i] for i in xrange(length.value) ]
labels = xglib.XGDMatrixGetLabel(self.handle, ctypes.byref(length))
return numpy.array( [labels[i] for i in xrange(length.value)] )
# clear everything
def clear(self):
xglib.XGDMatrixClear(self.handle)
@@ -93,12 +96,36 @@ class Booster:
assert isinstance(d,DMatrix)
dmats = ( ctypes.c_void_p * len(cache) )(*[ ctypes.c_void_p(d.handle) for d in cache])
self.handle = xglib.XGBoosterCreate( dmats, len(cache) )
for k, v in params.iteritems():
xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(k), ctypes.c_char_p(str(v)) )
self.set_param( params )
def __del__(self):
xglib.XGBoosterFree(self.handle)
def set_param(self, params,pv=None):
if isinstance(params,dict):
for k, v in params.iteritems():
xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(k), ctypes.c_char_p(str(v)) )
elif isinstance(params,str) and pv != None:
xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(params), ctypes.c_char_p(str(pv)) )
else:
for k, v in params:
xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(k), ctypes.c_char_p(str(v)) )
def update(self, dtrain):
""" update """
assert isinstance(dtrain, DMatrix)
xglib.XGBoosterUpdateOneIter( self.handle, dtrain.handle )
def boost(self, dtrain, grad, hess, bst_group = -1):
""" update """
assert len(grad) == len(hess)
assert isinstance(dtrain, DMatrix)
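# pack the python lists into the C float arrays expected by the C API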
xglib.XGBoosterBoostOneIter( self.handle, dtrain.handle,
(ctypes.c_float*len(grad))(*grad),
(ctypes.c_float*len(hess))(*hess),
len(grad), bst_group )
def update_interact(self, dtrain, action, booster_index=None):
""" beta: update with specified action"""
assert isinstance(dtrain, DMatrix)
if booster_index != None:
self.set_param('interact:booster_index', str(booster_index))
xglib.XGBoosterUpdateInteract( self.handle, dtrain.handle, ctypes.c_char_p(str(action)) )
def eval_set(self, evals, it = 0):
for d in evals:
assert isinstance(d[0], DMatrix)
@@ -108,10 +135,10 @@ class Booster:
xglib.XGBoosterEvalOneIter( self.handle, it, dmats, evnames, len(evals) )
def eval(self, mat, name = 'eval', it = 0 ):
self.eval_set( [(mat,name)], it)
def predict(self, data):
def predict(self, data, bst_group = -1):
length = ctypes.c_ulong()
preds = xglib.XGBoosterPredict( self.handle, data.handle, ctypes.byref(length))
return [ preds[i] for i in xrange(length.value) ]
preds = xglib.XGBoosterPredict( self.handle, data.handle, ctypes.byref(length), bst_group)
return numpy.array( [ preds[i] for i in xrange(length.value)])
def save_model(self, fname):
""" save model to file """
xglib.XGBoosterSaveModel( self.handle, ctypes.c_char_p(fname) )
@@ -122,12 +149,21 @@ class Booster:
"""dump model into text file"""
xglib.XGBoosterDumpModel( self.handle, ctypes.c_char_p(fname), ctypes.c_char_p(fmap) )
def train(params, dtrain, num_boost_round = 10, evals = []):
def train(params, dtrain, num_boost_round = 10, evals = [], obj=None):
""" train a booster with given paramaters """
bst = Booster(params, [dtrain] )
for i in xrange(num_boost_round):
bst.update( dtrain )
if len(evals) != 0:
bst.eval_set( evals, i )
if obj == None:
for i in xrange(num_boost_round):
bst.update( dtrain )
if len(evals) != 0:
bst.eval_set( evals, i )
else:
# try customized objective function
for i in xrange(num_boost_round):
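# predict with the current model, let the user objective turn the predictions
# into gradient statistics, then take one boosting step with them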
pred = bst.predict( dtrain )
grad, hess = obj( pred, dtrain )
bst.boost( dtrain, grad, hess )
if len(evals) != 0:
bst.eval_set( evals, i )
return bst


@@ -32,6 +32,7 @@ namespace xgboost{
mat.row_data_.resize( mat.row_ptr_.back() + len );
memcpy( &mat.row_data_[mat.row_ptr_.back()], data, sizeof(XGEntry)*len );
mat.row_ptr_.push_back( mat.row_ptr_.back() + len );
init_col_ = false;
}
inline const XGEntry* GetRow(unsigned ridx, size_t* len) const{
const xgboost::booster::FMatrixS &mat = this->data;
@@ -72,7 +73,7 @@ namespace xgboost{
return &(this->info.labels[0]);
}
inline void CheckInit(void){
if(!this->data.HaveColAccess()){
if(!init_col_){
this->data.InitData();
}
utils::Assert( this->data.NumRow() == this->info.labels.size(), "DMatrix: number of labels must match number of rows in matrix");
@@ -101,11 +102,34 @@ namespace xgboost{
xgboost::regrank::RegRankBoostLearner::LoadModel(fname);
this->init_model = true;
}
const float *Pred( const DMatrix &dmat, size_t *len ){
this->Predict( this->preds_, dmat );
const float *Pred( const DMatrix &dmat, size_t *len, int bst_group ){
this->CheckInit();
this->Predict( this->preds_, dmat, bst_group );
*len = this->preds_.size();
return &this->preds_[0];
}
inline void BoostOneIter( const DMatrix &train,
float *grad, float *hess, size_t len, int bst_group ){
this->grad_.resize( len ); this->hess_.resize( len );
memcpy( &this->grad_[0], grad, sizeof(float)*len );
memcpy( &this->hess_[0], hess, sizeof(float)*len );
if( grad_.size() == train.Size() ){
if( bst_group < 0 ) bst_group = 0;
base_gbm.DoBoost(grad_, hess_, train.data, train.info.root_index, bst_group);
}else{
utils::Assert( bst_group == -1, "must set bst_group to -1 to support all group boosting" );
int ngroup = base_gbm.NumBoosterGroup();
utils::Assert( grad_.size() == train.Size() * (size_t)ngroup, "BUG: BoostOneIter: mclass" );
std::vector<float> tgrad( train.Size() ), thess( train.Size() );
for( int g = 0; g < ngroup; ++ g ){
memcpy( &tgrad[0], &grad_[g*tgrad.size()], sizeof(float)*tgrad.size() );
memcpy( &thess[0], &hess_[g*tgrad.size()], sizeof(float)*tgrad.size() );
base_gbm.DoBoost(tgrad, thess, train.data, train.info.root_index, g );
}
}
}
};
};
};
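For reference, when bst_group = -1 the BoostOneIter above expects grad and hess laid out group-major: ngroup consecutive blocks of train.Size() entries each, one block per booster group. A small sketch of that slicing, with hypothetical sizes:

#include <cstring>
#include <vector>

// Sketch of the layout BoostOneIter consumes when bst_group == -1:
// grad[g * nrow + j] holds the gradient of row j for booster group g.
int main(void){
    const size_t nrow = 5;  // hypothetical number of training rows
    const int ngroup = 3;   // hypothetical num_booster_group
    std::vector<float> grad(nrow * ngroup, 0.0f), tgrad(nrow);
    for (int g = 0; g < ngroup; ++g){
        // slice out the block belonging to group g, as BoostOneIter does;
        // the slice would then be handed to base_gbm.DoBoost(..., g)
        std::memcpy(&tgrad[0], &grad[g * nrow], sizeof(float) * nrow);
    }
    return 0;
}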
@@ -163,10 +187,15 @@ extern "C"{
void *XGBoosterCreate( void *dmats[], size_t len ){
std::vector<const xgboost::regrank::DMatrix*> mats;
for( size_t i = 0; i < len; ++i ){
mats.push_back( static_cast<DMatrix*>(dmats[i]) );
DMatrix *dtr = static_cast<DMatrix*>(dmats[i]);
dtr->CheckInit();
mats.push_back( dtr );
}
return new Booster( mats );
}
void XGBoosterFree( void *handle ){
delete static_cast<Booster*>(handle);
}
void XGBoosterSetParam( void *handle, const char *name, const char *value ){
static_cast<Booster*>(handle)->SetParam( name, value );
}
@@ -176,6 +205,13 @@ extern "C"{
bst->CheckInit(); dtr->CheckInit();
bst->UpdateOneIter( *dtr );
}
void XGBoosterBoostOneIter( void *handle, void *dtrain,
float *grad, float *hess, size_t len, int bst_group ){
Booster *bst = static_cast<Booster*>(handle);
DMatrix *dtr = static_cast<DMatrix*>(dtrain);
bst->CheckInit(); dtr->CheckInit();
bst->BoostOneIter( *dtr, grad, hess, len, bst_group );
}
void XGBoosterEvalOneIter( void *handle, int iter, void *dmats[], const char *evnames[], size_t len ){
Booster *bst = static_cast<Booster*>(handle);
bst->CheckInit();
@@ -188,8 +224,8 @@ extern "C"{
}
bst->EvalOneIter( iter, mats, names, stdout );
}
const float *XGBoosterPredict( void *handle, void *dmat, size_t *len ){
return static_cast<Booster*>(handle)->Pred( *static_cast<DMatrix*>(dmat), len );
const float *XGBoosterPredict( void *handle, void *dmat, size_t *len, int bst_group ){
return static_cast<Booster*>(handle)->Pred( *static_cast<DMatrix*>(dmat), len, bst_group );
}
void XGBoosterLoadModel( void *handle, const char *fname ){
static_cast<Booster*>(handle)->LoadModel( fname );
@@ -207,5 +243,13 @@ extern "C"{
static_cast<Booster*>(handle)->DumpModel( fo, featmap, false );
fclose( fo );
}
void XGBoosterUpdateInteract( void *handle, void *dtrain, const char *action ){
Booster *bst = static_cast<Booster*>(handle);
DMatrix *dtr = static_cast<DMatrix*>(dtrain);
bst->CheckInit(); dtr->CheckInit();
std::string act( action );
bst->UpdateInteract( act, *dtr );
}
};


@@ -109,6 +109,11 @@ extern "C"{
* \return handle to the created booster
*/
void *XGBoosterCreate( void* dmats[], size_t len );
/*!
* \brief free the booster object in handle
* \param handle handle to be freed
*/
void XGBoosterFree( void* handle );
/*!
* \brief set parameters
* \param handle handle
@@ -122,6 +127,19 @@ extern "C"{
* \param dtrain training data
*/
void XGBoosterUpdateOneIter( void *handle, void *dtrain );
/*!
* \brief update the model by directly specifying the gradient and second-order gradient;
* this can be used instead of UpdateOneIter to support customized loss functions
* \param handle handle
* \param dtrain training data
* \param grad gradient statistics
* \param hess second order gradient statistics
* \param len length of grad/hess array
* \param bst_group booster group to boost; default = -1 boosts all groups
*/
void XGBoosterBoostOneIter( void *handle, void *dtrain,
float *grad, float *hess, size_t len, int bst_group );
/*!
* \brief print evaluation statistics to stdout for xgboost
* \param handle handle
@@ -136,8 +154,9 @@ extern "C"{
* \param handle handle
* \param dmat data matrix
* \param len used to store length of returning result
* \param bst_group booster group; if the model contains multiple booster groups, default = -1 means predict for all groups
*/
const float *XGBoosterPredict( void *handle, void *dmat, size_t *len );
const float *XGBoosterPredict( void *handle, void *dmat, size_t *len, int bst_group );
/*!
* \brief load model from existing file
* \param handle handle
@@ -157,6 +176,13 @@ extern "C"{
* \param fmap name to fmap can be empty string
*/
void XGBoosterDumpModel( void *handle, const char *fname, const char *fmap );
/*!
* \brief interactively update model: beta
* \param handle handle
* \param dtrain training data
* \param action action name
*/
void XGBoosterUpdateInteract( void *handle, void *dtrain, const char* action );
};
#endif
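A minimal sketch of the call sequence this header exposes, assuming a DMatrix handle dtrain created and populated elsewhere (for instance through the XGDMatrix calls used by the Python wrapper); the function train_and_predict is illustrative, not part of the API:

#include <cstddef>

// Declarations from the header above (normally obtained by including it).
extern "C" {
    void *XGBoosterCreate( void *dmats[], size_t len );
    void XGBoosterFree( void *handle );
    void XGBoosterSetParam( void *handle, const char *name, const char *value );
    void XGBoosterUpdateOneIter( void *handle, void *dtrain );
    const float *XGBoosterPredict( void *handle, void *dmat, size_t *len, int bst_group );
}

// Train a booster for a few rounds, then return its predictions;
// dtrain is a DMatrix handle created and populated elsewhere.
const float *train_and_predict( void *dtrain, size_t *len ){
    void *dmats[1] = { dtrain };
    void *bst = XGBoosterCreate( dmats, 1 );
    XGBoosterSetParam( bst, "bst:max_depth", "2" );
    XGBoosterSetParam( bst, "bst:eta", "1" );
    for (int i = 0; i < 10; ++i){
        XGBoosterUpdateOneIter( bst, dtrain );
    }
    // bst_group = -1: predict over all booster groups
    const float *preds = XGBoosterPredict( bst, dtrain, len, -1 );
    // note: preds points into the booster's internal buffer, so bst is not
    // freed here; calling XGBoosterFree(bst) would invalidate preds.
    return preds;
}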


@@ -86,6 +86,7 @@ namespace xgboost{
if (!strcmp(name, "silent")) silent = atoi(val);
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
if (!strcmp(name, "objective") ) name_obj_ = val;
if (!strcmp(name, "num_class") ) base_gbm.SetParam("num_booster_group", val );
mparam.SetParam(name, val);
base_gbm.SetParam(name, val);
cfg_.push_back( std::make_pair( std::string(name), std::string(val) ) );
@@ -95,7 +96,13 @@ namespace xgboost{
* this function is reserved for solver to allocate necessary space and do other preparation
*/
inline void InitTrainer(void){
base_gbm.InitTrainer();
if( mparam.num_class != 0 ){
if( name_obj_ != "softmax" ){
name_obj_ = "softmax";
printf("auto select objective=softmax to support multi-class classification\n" );
}
}
base_gbm.InitTrainer();
obj_ = CreateObjFunction( name_obj_.c_str() );
for( size_t i = 0; i < cfg_.size(); ++ i ){
obj_->SetParam( cfg_[i].first.c_str(), cfg_[i].second.c_str() );
@@ -166,9 +173,18 @@ namespace xgboost{
inline void UpdateOneIter(const DMatrix &train){
this->PredictRaw(preds_, train);
obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_);
// do boost
std::vector<unsigned> root_index;
base_gbm.DoBoost(grad_, hess_, train.data, root_index);
if( grad_.size() == train.Size() ){
base_gbm.DoBoost(grad_, hess_, train.data, train.info.root_index);
}else{
int ngroup = base_gbm.NumBoosterGroup();
utils::Assert( grad_.size() == train.Size() * (size_t)ngroup, "BUG: UpdateOneIter: mclass" );
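// grad_/hess_ hold ngroup consecutive blocks of train.Size() entries; slice out and boost each group's block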
std::vector<float> tgrad( train.Size() ), thess( train.Size() );
for( int g = 0; g < ngroup; ++ g ){
memcpy( &tgrad[0], &grad_[g*tgrad.size()], sizeof(float)*tgrad.size() );
memcpy( &thess[0], &hess_[g*tgrad.size()], sizeof(float)*tgrad.size() );
base_gbm.DoBoost(tgrad, thess, train.data, train.info.root_index, g );
}
}
}
/*!
* \brief evaluate the model for specific iteration
@@ -190,9 +206,14 @@ namespace xgboost{
fprintf(fo, "\n");
fflush(fo);
}
/*! \brief get prediction, without buffering */
inline void Predict(std::vector<float> &preds, const DMatrix &data){
this->PredictRaw(preds,data);
/*!
* \brief get prediction
* \param preds storage to store the prediction
* \param data input data
* \param bst_group booster group to predict for; -1 means all groups
*/
inline void Predict(std::vector<float> &preds, const DMatrix &data, int bst_group = -1){
this->PredictRaw( preds, data, bst_group );
obj_->PredTransform( preds );
}
public:
@@ -241,24 +262,32 @@ namespace xgboost{
base_gbm.InteractRePredict(data.data, j, buffer_offset + j);
}
}
private:
/*! \brief get un-transformed prediction*/
inline void PredictRaw(std::vector<float> &preds, const DMatrix &data){
this->PredictBuffer(preds, data, this->FindBufferOffset(data) );
inline void PredictRaw(std::vector<float> &preds, const DMatrix &data, int bst_group = -1 ){
int buffer_offset = this->FindBufferOffset(data);
if( bst_group < 0 ){
int ngroup = base_gbm.NumBoosterGroup();
preds.resize( data.Size() * ngroup );
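// multi-group output is laid out group-major: the block for group g starts at data.Size() * g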
for( int g = 0; g < ngroup; ++ g ){
this->PredictBuffer(&preds[ data.Size() * g ], data, buffer_offset, g );
}
}else{
preds.resize( data.Size() );
this->PredictBuffer(&preds[0], data, buffer_offset, bst_group );
}
}
/*! \brief get the un-transformed predictions, given data */
inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, int buffer_offset){
preds.resize(data.Size());
inline void PredictBuffer(float *preds, const DMatrix &data, int buffer_offset, int bst_group ){
const unsigned ndata = static_cast<unsigned>(data.Size());
if( buffer_offset >= 0 ){
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j);
preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j, data.info.GetRoot(j), bst_group );
}
}else{
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1);
preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1, data.info.GetRoot(j), bst_group );
}
}
}
@@ -270,14 +299,17 @@ namespace xgboost{
/* \brief type of loss function */
int loss_type;
/* \brief number of features */
int num_feature;
/* \brief number of classes, if this is multi-class classification */
int num_class;
/*! \brief reserved field */
int reserved[16];
int reserved[15];
/*! \brief constructor */
ModelParam(void){
base_score = 0.5f;
loss_type = 0;
num_feature = 0;
num_class = 0;
memset(reserved, 0, sizeof(reserved));
}
/*!
@@ -288,6 +320,7 @@ namespace xgboost{
inline void SetParam(const char *name, const char *val){
if (!strcmp("base_score", name)) base_score = (float)atof(val);
if (!strcmp("loss_type", name)) loss_type = atoi(val);
if (!strcmp("num_class", name)) num_class = atoi(val);
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
}
/*!


@@ -35,11 +35,17 @@ namespace xgboost{
std::vector<unsigned> group_ptr;
/*! \brief weights of each instance, optional */
std::vector<float> weights;
/*! \brief specified root index of each instance; can be used for a multi-task setting */
std::vector<unsigned> root_index;
/*! \brief get the weight of an instance */
inline float GetWeight( size_t i ) const{
if( weights.size() != 0 ) return weights[i];
else return 1.0f;
}
inline unsigned GetRoot( size_t i ) const{
if( root_index.size() != 0 ) return root_index[i];
else return 0;
}
};
public:
/*! \brief feature data content */
@@ -112,7 +118,10 @@ namespace xgboost{
unsigned ngptr;
if( fs.Read(&ngptr, sizeof(unsigned) ) != 0 ){
info.group_ptr.resize( ngptr );
utils::Assert( fs.Read(&info.group_ptr[0], sizeof(unsigned) * ngptr) != 0, "Load group file");
if( ngptr != 0 ){
utils::Assert( fs.Read(&info.group_ptr[0], sizeof(unsigned) * ngptr) != 0, "Load group file");
utils::Assert( info.group_ptr.back() == data.NumRow(), "number of groups must match number of records" );
}
}
}
fs.Close();
@@ -121,7 +130,7 @@ namespace xgboost{
printf("%ux%u matrix with %lu entries is loaded from %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
if( info.group_ptr.size() != 0 ){
printf("data contains %u groups\n", (unsigned)info.group_ptr.size() );
printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1 );
}
}
this->TryLoadWeight(fname, silent);
@@ -141,16 +150,18 @@ namespace xgboost{
utils::Assert( info.labels.size() == data.NumRow(), "label size is not consistent with feature matrix size" );
fs.Write(&info.labels[0], sizeof(float) * data.NumRow());
{// write out group ptr
unsigned ngptr = static_cast<unsigned>( info.group_ptr.size() );
fs.Write(&ngptr, sizeof(unsigned) );
fs.Write(&info.group_ptr[0], sizeof(unsigned) * ngptr);
if( ngptr != 0 ){
fs.Write(&info.group_ptr[0], sizeof(unsigned) * ngptr);
}
}
fs.Close();
if (!silent){
printf("%ux%u matrix with %lu entries is saved to %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
if( info.group_ptr.size() != 0 ){
printf("data contains %u groups\n", (unsigned)info.group_ptr.size() );
printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1 );
}
}
}


@@ -13,6 +13,7 @@
#include "../utils/xgboost_omp.h"
#include "../utils/xgboost_random.h"
#include "xgboost_regrank_data.h"
#include "xgboost_regrank_utils.h"
namespace xgboost{
namespace regrank{
@@ -31,17 +32,11 @@ namespace xgboost{
virtual ~IEvaluator(void){}
};
inline static bool CmpFirst(const std::pair<float, unsigned> &a, const std::pair<float, unsigned> &b){
return a.first > b.first;
}
inline static bool CmpSecond(const std::pair<float, unsigned> &a, const std::pair<float, unsigned> &b){
return a.second > b.second;
}
/*! \brief RMSE */
struct EvalRMSE : public IEvaluator{
virtual float Eval(const std::vector<float> &preds,
const DMatrix::Info &info) const {
utils::Assert( preds.size() == info.labels.size(), "label size and prediction size do not match" );
const unsigned ndata = static_cast<unsigned>(preds.size());
float sum = 0.0, wsum = 0.0;
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
@@ -62,6 +57,7 @@ namespace xgboost{
struct EvalLogLoss : public IEvaluator{
virtual float Eval(const std::vector<float> &preds,
const DMatrix::Info &info) const {
utils::Assert( preds.size() == info.labels.size(), "label size and prediction size do not match" );
const unsigned ndata = static_cast<unsigned>(preds.size());
float sum = 0.0f, wsum = 0.0f;
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
@@ -106,7 +102,8 @@ namespace xgboost{
/*! \brief Area under curve, for both classification and rank */
struct EvalAuc : public IEvaluator{
virtual float Eval(const std::vector<float> &preds,
const DMatrix::Info &info) const {
utils::Assert( preds.size() == info.labels.size(), "label size and prediction size do not match" );
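// when no group information is given, treat the whole prediction vector as a single group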
std::vector<unsigned> tgptr(2, 0); tgptr[1] = preds.size();
const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
utils::Assert(gptr.back() == preds.size(), "EvalAuc: group structure must match number of prediction");
@@ -159,8 +156,10 @@ namespace xgboost{
public:
virtual float Eval(const std::vector<float> &preds,
const DMatrix::Info &info) const {
utils::Assert( preds.size() == info.labels.size(), "label size and prediction size do not match" );
const std::vector<unsigned> &gptr = info.group_ptr;
utils::Assert(gptr.size() != 0 && gptr.back() == preds.size(), "EvalAuc: group structure must match number of prediction");
utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
utils::Assert( gptr.back() == preds.size(), "EvalRanklist: group structure must match number of prediction");
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
double sum_metric = 0.0f;


@@ -106,8 +106,9 @@ namespace xgboost{
namespace regrank{
IObjFunction* CreateObjFunction( const char *name ){
if( !strcmp("reg", name ) ) return new RegressionObj();
if( !strcmp("rank", name ) ) return new PairwiseRankObj();
if( !strcmp("softmax", name ) ) return new SoftmaxObj();
if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj();
if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj();
if( !strcmp("softmax", name ) ) return new SoftmaxMultiClassObj();
utils::Error("unknown objective function type");
return NULL;
}

regrank/xgboost_regrank_obj.hpp

@@ -1,7 +1,7 @@
#ifndef XGBOOST_REGRANK_OBJ_HPP
#define XGBOOST_REGRANK_OBJ_HPP
/*!
* \file xgboost_regrank_obj.h
* \file xgboost_regrank_obj.hpp
* \brief implementation of objective functions
* \author Tianqi Chen, Kailong Chen
*/
@@ -9,6 +9,8 @@
#include <vector>
#include <functional>
#include "xgboost_regrank_sample.h"
#include "xgboost_regrank_utils.h"
namespace xgboost{
namespace regrank{
class RegressionObj : public IObjFunction{
@@ -25,6 +27,7 @@ namespace xgboost{
int iter,
std::vector<float> &grad,
std::vector<float> &hess ) {
utils::Assert( preds.size() == info.labels.size(), "label size and prediction size do not match" );
grad.resize(preds.size()); hess.resize(preds.size());
const unsigned ndata = static_cast<unsigned>(preds.size());
@@ -53,11 +56,11 @@ namespace xgboost{
namespace regrank{
// simple softmax rank
class SoftmaxObj : public IObjFunction{
class SoftmaxRankObj : public IObjFunction{
public:
SoftmaxObj(void){
SoftmaxRankObj(void){
}
virtual ~SoftmaxObj(){}
virtual ~SoftmaxRankObj(){}
virtual void SetParam(const char *name, const char *val){
}
virtual void GetGradient(const std::vector<float>& preds,
@@ -65,6 +68,7 @@ namespace xgboost{
int iter,
std::vector<float> &grad,
std::vector<float> &hess ) {
utils::Assert( preds.size() == info.labels.size(), "label size and prediction size do not match" );
grad.resize(preds.size()); hess.resize(preds.size());
const std::vector<unsigned> &gptr = info.group_ptr;
utils::Assert( gptr.size() != 0 && gptr.back() == preds.size(), "rank loss must have group file" );
@@ -97,23 +101,76 @@ namespace xgboost{
}
virtual const char* DefaultEvalMetric(void) {
return "pre@1";
}
private:
inline static void Softmax( std::vector<float>& rec ){
float wmax = rec[0];
for( size_t i = 1; i < rec.size(); ++ i ){
wmax = std::max( rec[i], wmax );
}
double wsum = 0.0f;
for( size_t i = 0; i < rec.size(); ++ i ){
rec[i] = expf(rec[i]-wmax);
wsum += rec[i];
}
for( size_t i = 0; i < rec.size(); ++ i ){
rec[i] /= wsum;
}
}
};
// simple softmax multi-class classification
class SoftmaxMultiClassObj : public IObjFunction{
public:
SoftmaxMultiClassObj(void){
nclass = 0;
}
virtual ~SoftmaxMultiClassObj(){}
virtual void SetParam(const char *name, const char *val){
if( !strcmp( "num_class", name ) ) nclass = atoi(val);
}
virtual void GetGradient(const std::vector<float>& preds,
const DMatrix::Info &info,
int iter,
std::vector<float> &grad,
std::vector<float> &hess ) {
utils::Assert( nclass != 0, "must set num_class to use softmax" );
utils::Assert( preds.size() == (size_t)nclass * info.labels.size(), "SoftmaxMultiClassObj: label size and pred size do not match" );
grad.resize(preds.size()); hess.resize(preds.size());
const unsigned ndata = static_cast<unsigned>(info.labels.size());
#pragma omp parallel
{
std::vector<float> rec(nclass);
#pragma omp for schedule(static)
for (unsigned j = 0; j < ndata; ++j){
for( int k = 0; k < nclass; ++ k ){
rec[k] = preds[j + k * ndata];
}
Softmax( rec );
int label = static_cast<int>(info.labels[j]);
utils::Assert( label < nclass, "SoftmaxMultiClassObj: label exceeds num_class" );
for( int k = 0; k < nclass; ++ k ){
float p = rec[ k ];
if( label == k ){
grad[j+k*ndata] = p - 1.0f;
}else{
grad[j+k*ndata] = p;
}
hess[j+k*ndata] = 2.0f * p * ( 1.0f - p );
}
}
}
}
virtual void PredTransform(std::vector<float> &preds){
utils::Assert( nclass != 0, "must set num_class to use softmax" );
utils::Assert( preds.size() % nclass == 0, "SoftmaxMultiClassObj: pred size must be a multiple of num_class" );
const unsigned ndata = static_cast<unsigned>(preds.size()/nclass);
#pragma omp parallel
{
std::vector<float> rec(nclass);
#pragma omp for schedule(static)
for (unsigned j = 0; j < ndata; ++j){
for( int k = 0; k < nclass; ++ k ){
rec[k] = preds[j + k * ndata];
}
Softmax( rec );
preds[j] = FindMaxIndex( rec );
}
}
preds.resize( ndata );
}
virtual const char* DefaultEvalMetric(void) {
return "error";
}
private:
int nclass;
};
};
namespace regrank{
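For reference, the gradient and hessian filled in by SoftmaxMultiClassObj::GetGradient above follow from the multi-class cross-entropy loss; with raw scores z_k and softmax outputs p_k:

L = -\log p_y, \qquad p_k = \frac{e^{z_k}}{\sum_j e^{z_j}}, \qquad
\frac{\partial L}{\partial z_k} = p_k - \mathbf{1}[k = y], \qquad
\frac{\partial^2 L}{\partial z_k^2} = p_k(1 - p_k)

The code matches the first-order term exactly; its hess value of 2p(1-p) scales the diagonal second derivative by a factor of two.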
@@ -134,6 +191,7 @@ namespace xgboost{
int iter,
std::vector<float> &grad,
std::vector<float> &hess ) {
utils::Assert( preds.size() == info.labels.size(), "label size and prediction size do not match" );
grad.resize(preds.size()); hess.resize(preds.size());
const std::vector<unsigned> &gptr = info.group_ptr;
utils::Assert( gptr.size() != 0 && gptr.back() == preds.size(), "rank loss must have group file" );

regrank/xgboost_regrank_utils.h

@@ -0,0 +1,43 @@
#ifndef XGBOOST_REGRANK_UTILS_H
#define XGBOOST_REGRANK_UTILS_H
/*!
* \file xgboost_regrank_utils.h
* \brief useful helper functions
* \author Tianqi Chen, Kailong Chen
*/
namespace xgboost{
namespace regrank{
// simple helper function to do softmax
inline static void Softmax( std::vector<float>& rec ){
float wmax = rec[0];
for( size_t i = 1; i < rec.size(); ++ i ){
wmax = std::max( rec[i], wmax );
}
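// subtracting the maximum before exponentiating keeps expf from overflowing (numerical stability)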
double wsum = 0.0f;
for( size_t i = 0; i < rec.size(); ++ i ){
rec[i] = expf(rec[i]-wmax);
wsum += rec[i];
}
for( size_t i = 0; i < rec.size(); ++ i ){
rec[i] /= wsum;
}
}
// simple helper function to find the index of the maximum element
inline static int FindMaxIndex( std::vector<float>& rec ){
size_t mxid = 0;
for( size_t i = 1; i < rec.size(); ++ i ){
if( rec[i] > rec[mxid] ) mxid = i;
}
return (int)mxid;
}
inline static bool CmpFirst(const std::pair<float, unsigned> &a, const std::pair<float, unsigned> &b){
return a.first > b.first;
}
inline static bool CmpSecond(const std::pair<float, unsigned> &a, const std::pair<float, unsigned> &b){
return a.second > b.second;
}
};
};
#endif