commit a57fbe091a
Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev

Conflicts:
	regrank/xgboost_regrank_data.h

.gitignore (vendored)
@@ -16,4 +16,5 @@
 *conf
 *buffer
 *model
 xgboost
+*pyc
regrank/xgboost_regrank_data.h

@@ -321,6 +321,8 @@ namespace xgboost{
             fi.Read(&col_access, sizeof(int));
             if (col_access != 0){
                 FMatrixS::LoadBinary(fi, col_ptr_, col_data_);
+            }else{
+                this->InitData();
             }
         }
         /*!
booster/xgboost_gbmbase.h

@@ -88,8 +88,8 @@ namespace xgboost{
                 }
             }
             if (mparam.num_pbuffer != 0){
-                pred_buffer.resize(mparam.num_pbuffer);
-                pred_counter.resize(mparam.num_pbuffer);
+                pred_buffer.resize(mparam.PredBufferSize());
+                pred_counter.resize(mparam.PredBufferSize());
                 utils::Assert(fi.Read(&pred_buffer[0], pred_buffer.size()*sizeof(float)) != 0);
                 utils::Assert(fi.Read(&pred_counter[0], pred_counter.size()*sizeof(unsigned)) != 0);
             }
@@ -117,8 +117,8 @@ namespace xgboost{
             */
            inline void InitModel(void){
                pred_buffer.clear(); pred_counter.clear();
-               pred_buffer.resize(mparam.num_pbuffer, 0.0);
-               pred_counter.resize(mparam.num_pbuffer, 0);
+               pred_buffer.resize(mparam.PredBufferSize(), 0.0);
+               pred_counter.resize(mparam.PredBufferSize(), 0);
                utils::Assert(mparam.num_boosters == 0);
                utils::Assert(boosters.size() == 0);
            }
@@ -130,6 +130,7 @@ namespace xgboost{
            if (tparam.nthread != 0){
                omp_set_num_threads(tparam.nthread);
            }
+           if (mparam.num_booster_group == 0) mparam.num_booster_group = 1;
            // make sure all the boosters get the latest parameters
            for (size_t i = 0; i < this->boosters.size(); i++){
                this->ConfigBooster(this->boosters[i]);
@@ -175,12 +176,14 @@ namespace xgboost{
            * \param feats features of each instance
            * \param root_index pre-partitioned root index of each instance,
            *        root_index.size() can be 0 which indicates that no pre-partition involved
+           * \param bst_group which booster group it belongs to; by default we only have 1 booster group, and leave this parameter as default
            */
            inline void DoBoost(std::vector<float> &grad,
                                std::vector<float> &hess,
                                const booster::FMatrixS &feats,
-                               const std::vector<unsigned> &root_index) {
-               booster::IBooster *bst = this->GetUpdateBooster();
+                               const std::vector<unsigned> &root_index,
+                               int bst_group = 0) {
+               booster::IBooster *bst = this->GetUpdateBooster( bst_group );
                bst->DoBoost(grad, hess, feats, root_index);
            }
            /*!
@@ -190,26 +193,30 @@ namespace xgboost{
            * \param row_index row index in the feature matrix
            * \param buffer_index the buffer index of the current feature line, default -1 means no buffer assigned
            * \param root_index root id of current instance, default = 0
+           * \param bst_group booster group index
            * \return prediction
            */
-           inline float Predict(const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0){
-               size_t istart = 0;
+           inline float Predict(const FMatrixS &feats, bst_uint row_index,
+                                int buffer_index = -1, unsigned root_index = 0, int bst_group = 0 ){
+               size_t itop = 0;
                float psum = 0.0f;
+               const int bid = mparam.BufferOffset(buffer_index, bst_group);
+
                // load buffered results if any
-               if (mparam.do_reboost == 0 && buffer_index >= 0){
-                   utils::Assert(buffer_index < mparam.num_pbuffer, "buffer index exceed num_pbuffer");
-                   istart = this->pred_counter[buffer_index];
-                   psum = this->pred_buffer[buffer_index];
+               if (mparam.do_reboost == 0 && bid >= 0){
+                   itop = this->pred_counter[bid];
+                   psum = this->pred_buffer[bid];
                }

-               for (size_t i = istart; i < this->boosters.size(); i++){
-                   psum += this->boosters[i]->Predict(feats, row_index, root_index);
+               for (size_t i = itop; i < this->boosters.size(); ++i ){
+                   if( booster_info[i] == bst_group ){
+                       psum += this->boosters[i]->Predict(feats, row_index, root_index);
+                   }
                }
                // updated the buffered results
-               if (mparam.do_reboost == 0 && buffer_index >= 0){
-                   this->pred_counter[buffer_index] = static_cast<unsigned>(boosters.size());
-                   this->pred_buffer[buffer_index] = psum;
+               if (mparam.do_reboost == 0 && bid >= 0){
+                   this->pred_counter[bid] = static_cast<unsigned>(boosters.size());
+                   this->pred_buffer[bid] = psum;
                }
                return psum;
            }
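Note: the buffered Predict logic above is dense, so here is a minimal Python sketch of the same bookkeeping (an illustrative stand-in, not the real class: boosters are plain callables, and model I/O is omitted). The idea is that pred_buffer caches the partial sum per (instance, group) slot, so each call only evaluates boosters added since the last call, and only those tagged with the matching group.

    class GroupedPredictor(object):
        def __init__(self, num_pbuffer, num_booster_group):
            self.num_pbuffer = num_pbuffer
            size = num_pbuffer * num_booster_group      # PredBufferSize()
            self.pred_buffer = [0.0] * size
            self.pred_counter = [0] * size
            self.boosters = []                          # list of f(row) -> float
            self.booster_info = []                      # group id of each booster

        def buffer_offset(self, buffer_index, bst_group):
            # BufferOffset(): each (instance, group) pair owns one buffer slot
            if buffer_index < 0:
                return -1
            assert buffer_index < self.num_pbuffer
            return buffer_index + self.num_pbuffer * bst_group

        def predict(self, row, buffer_index=-1, bst_group=0):
            bid = self.buffer_offset(buffer_index, bst_group)
            itop, psum = 0, 0.0
            if bid >= 0:                                # load buffered partial sum
                itop = self.pred_counter[bid]
                psum = self.pred_buffer[bid]
            for i in range(itop, len(self.boosters)):   # only the new boosters
                if self.booster_info[i] == bst_group:   # ... of the right group
                    psum += self.boosters[i](row)
            if bid >= 0:                                # store updated partial sum
                self.pred_counter[bid] = len(self.boosters)
                self.pred_buffer[bid] = psum
            return psum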
@@ -217,6 +224,11 @@ namespace xgboost{
            inline int NumBoosters(void) const{
                return mparam.num_boosters;
            }
+           /*! \return number of booster groups */
+           inline int NumBoosterGroup(void) const{
+               if( mparam.num_booster_group == 0 ) return 1;
+               return mparam.num_booster_group;
+           }
        public:
            //--------trial code for interactive update an existing booster------
            //-------- usually not needed, ignore this region ---------
@@ -224,14 +236,17 @@ namespace xgboost{
            * \brief same as Predict, but removes the prediction of booster to be updated
            * this function must be called once and only once for every data with pbuffer
            */
-           inline float InteractPredict(const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0){
+           inline float InteractPredict(const FMatrixS &feats, bst_uint row_index,
+                                        int buffer_index = -1, unsigned root_index = 0, int bst_group = 0){
                float psum = this->Predict(feats, row_index, buffer_index, root_index);
                if (tparam.reupdate_booster != -1){
                    const int bid = tparam.reupdate_booster;
                    utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
-                   psum -= boosters[bid]->Predict(feats, row_index, root_index);
+                   if( bst_group == booster_info[bid] ){
+                       psum -= boosters[bid]->Predict(feats, row_index, root_index);
+                   }
                    if (mparam.do_reboost == 0 && buffer_index >= 0){
-                       this->pred_buffer[buffer_index] = psum;
+                       this->pred_buffer[mparam.BufferOffset(buffer_index,bst_group)] = psum;
                    }
                }
                return psum;
@@ -246,15 +261,21 @@ namespace xgboost{
                    booster_info[i - 1] = booster_info[i];
                }
                boosters.resize(mparam.num_boosters -= 1);
                booster_info.resize(boosters.size());
+               // update pred counter
+               for( size_t i = 0; i < pred_counter.size(); ++ i ){
+                   if( pred_counter[i] > (unsigned)bid ) pred_counter[i] -= 1;
+               }
            }
            /*! \brief update the prediction buffer, after booster have been updated */
-           inline void InteractRePredict(const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0){
+           inline void InteractRePredict(const FMatrixS &feats, bst_uint row_index,
+                                         int buffer_index = -1, unsigned root_index = 0, int bst_group = 0 ){
                if (tparam.reupdate_booster != -1){
                    const int bid = tparam.reupdate_booster;
+                   if( booster_info[bid] != bst_group ) return;
                    utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
                    if (mparam.do_reboost == 0 && buffer_index >= 0){
-                       this->pred_buffer[buffer_index] += boosters[bid]->Predict(feats, row_index, root_index);
+                       this->pred_buffer[mparam.BufferOffset(buffer_index,bst_group)] += boosters[bid]->Predict(feats, row_index, root_index);
                    }
                }
            }
@@ -278,18 +299,19 @@ namespace xgboost{
            * \brief get a booster to update
            * \return the booster created
            */
-           inline booster::IBooster *GetUpdateBooster(void){
+           inline booster::IBooster *GetUpdateBooster(int bst_group){
                if (tparam.reupdate_booster != -1){
                    const int bid = tparam.reupdate_booster;
                    utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
                    this->ConfigBooster(boosters[bid]);
+                   utils::Assert( bst_group == booster_info[bid], "booster group must match existing reupdate booster");
                    return boosters[bid];
                }

                if (mparam.do_reboost == 0 || boosters.size() == 0){
                    mparam.num_boosters += 1;
                    boosters.push_back(booster::CreateBooster<FMatrixS>(mparam.booster_type));
-                   booster_info.push_back(0);
+                   booster_info.push_back(bst_group);
                    this->ConfigBooster(boosters.back());
                    boosters.back()->InitModel();
                }
@@ -316,8 +338,13 @@ namespace xgboost{
             * set to 1 for linear booster, so that regularization term can be considered
             */
            int do_reboost;
+           /*!
+            * \brief number of booster groups: how many predictions a single
+            *        input instance could correspond to
+            */
+           int num_booster_group;
            /*! \brief reserved parameters */
-           int reserved[32];
+           int reserved[31];
            /*! \brief constructor */
            ModelParam(void){
                num_boosters = 0;
@@ -325,6 +352,7 @@ namespace xgboost{
                num_roots = num_feature = 0;
                do_reboost = 0;
                num_pbuffer = 0;
+               num_booster_group = 1;
                memset(reserved, 0, sizeof(reserved));
            }
            /*!
@@ -338,10 +366,21 @@ namespace xgboost{
                    // linear boost automatically set do reboost
                    if (booster_type == 1) do_reboost = 1;
                }
                if (!strcmp("num_pbuffer", name)) num_pbuffer = atoi(val);
                if (!strcmp("do_reboost", name)) do_reboost = atoi(val);
+               if (!strcmp("num_booster_group", name)) num_booster_group = atoi(val);
                if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
                if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
            }
+           inline int PredBufferSize(void) const{
+               if (num_booster_group == 0) return num_pbuffer;
+               else return num_booster_group * num_pbuffer;
+           }
+           inline int BufferOffset( int buffer_index, int bst_group ) const{
+               if( buffer_index < 0 ) return -1;
+               utils::Assert( buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer" );
+               return buffer_index + num_pbuffer * bst_group;
+           }
        };
        /*! \brief training parameters */
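Note: ModelParam is read and written as one raw sizeof(ModelParam) block (see the Load hunk above), which is why the new num_booster_group field is paid for by shrinking reserved[32] to reserved[31]: the struct size, and hence old model files, stay compatible. A hedged ctypes sketch of that invariant, with the field list abbreviated to the members relevant here:

    import ctypes

    class OldParam(ctypes.Structure):
        _fields_ = [("num_pbuffer", ctypes.c_int),
                    ("do_reboost", ctypes.c_int),
                    ("reserved", ctypes.c_int * 32)]

    class NewParam(ctypes.Structure):
        _fields_ = [("num_pbuffer", ctypes.c_int),
                    ("do_reboost", ctypes.c_int),
                    ("num_booster_group", ctypes.c_int),  # new field...
                    ("reserved", ctypes.c_int * 31)]      # ...paid for from reserved

    assert ctypes.sizeof(OldParam) == ctypes.sizeof(NewParam)  # binary compatible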
@@ -23,5 +23,7 @@ save_period = 0
 data = "agaricus.txt.train"
 # The path of validation data, used to monitor training process, here [test] sets name of the validation set
 eval[test] = "agaricus.txt.test"
+# evaluate on training data as well each round
+eval_train = 1
 # The path of test data
 test:data = "agaricus.txt.test"
python/Makefile (new file)

@@ -0,0 +1,26 @@
+export CC  = gcc
+export CXX = g++
+export CFLAGS = -Wall -msse2 -Wno-unknown-pragmas -fopenmp
+
+# specify tensor path
+SLIB = libxgboostpy.so
+.PHONY: clean all
+
+all: $(SLIB)
+export LDFLAGS= -pthread -lm
+
+libxgboostpy.so: xgboost_python.cpp ../regrank/*.h ../booster/*.h ../booster/*/*.hpp ../booster/*.hpp
+
+$(SLIB) :
+	$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
+
+$(BIN) :
+	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
+
+$(OBJ) :
+	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+
+install:
+	cp -f -r $(BIN) $(INSTALL_PATH)
+
+clean:
+	$(RM) $(OBJ) $(BIN) $(SLIB) *~
python/README.md (new file)

@@ -0,0 +1,4 @@
+python wrapper for xgboost using ctypes
+
+see example for usage
+
python/example/README.md (new file)

@@ -0,0 +1,3 @@
+example to use python xgboost, the data is generated from demo/binary_classification, in libsvm format
+
+for usage: see demo.py and comments in demo.py
python/example/agaricus.txt.test (new file, 1611 lines)
File diff suppressed because it is too large.

python/example/agaricus.txt.train (new file, 6513 lines)
File diff suppressed because it is too large.
python/example/demo.py (new executable file)

@@ -0,0 +1,101 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+import scipy.sparse
+# append the path to xgboost
+sys.path.append('../')
+import xgboost as xgb
+
+### simple example
+# load file from text file, also binary buffer generated by xgboost
+dtrain = xgb.DMatrix('agaricus.txt.train')
+dtest = xgb.DMatrix('agaricus.txt.test')
+
+# specify parameters via map, definitions are the same as the c++ version
+param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'loss_type':2 }
+
+# specify validation sets to watch performance
+evallist = [(dtest,'eval'), (dtrain,'train')]
+num_round = 2
+bst = xgb.train( param, dtrain, num_round, evallist )
+
+# this is prediction
+preds = bst.predict( dtest )
+labels = dtest.get_label()
+print 'error=%f' % ( sum(1 for i in xrange(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds)))
+bst.save_model('0001.model')
+# dump model
+bst.dump_model('dump.raw.txt')
+# dump model with feature map
+bst.dump_model('dump.raw.txt','featmap.txt')
+
+# beta: interact mode
+bst.set_param('bst:interact:expand',4)
+bst.update_interact( dtrain, 'update', 0)
+bst.dump_model('dump.raw2.txt')
+
+###
+# build dmatrix in python iteratively
+#
+print 'start running example of build DMatrix in python'
+dtrain = xgb.DMatrix()
+labels = []
+for l in open('agaricus.txt.train'):
+    arr = l.split()
+    labels.append( int(arr[0]))
+    feats = []
+    for it in arr[1:]:
+        k,v = it.split(':')
+        feats.append( (int(k), float(v)) )
+    dtrain.add_row( feats )
+dtrain.set_label( labels )
+evallist = [(dtest,'eval'), (dtrain,'train')]
+
+bst = xgb.train( param, dtrain, num_round, evallist )
+
+###
+# build dmatrix from scipy.sparse
+print 'start running example of build DMatrix from scipy.sparse'
+labels = []
+row = []; col = []; dat = []
+i = 0
+for l in open('agaricus.txt.train'):
+    arr = l.split()
+    labels.append( int(arr[0]))
+    for it in arr[1:]:
+        k,v = it.split(':')
+        row.append(i); col.append(int(k)); dat.append(float(v))
+    i += 1
+
+csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
+dtrain = xgb.DMatrix( csr )
+dtrain.set_label(labels)
+evallist = [(dtest,'eval'), (dtrain,'train')]
+bst = xgb.train( param, dtrain, num_round, evallist )
+
+print 'start running example of build DMatrix from numpy array'
+# NOTE: npymat is a numpy array; internally we convert it into scipy.sparse.csr_matrix, then to DMatrix
+npymat = csr.todense()
+dtrain = xgb.DMatrix( npymat )
+dtrain.set_label(labels)
+evallist = [(dtest,'eval'), (dtrain,'train')]
+bst = xgb.train( param, dtrain, num_round, evallist )
+
+###
+# customized loss function; set loss_type to 0 so that predict returns untransformed scores
+#
+print 'start running example to use customized objective function'
+
+# note: set loss_type properly; loss_type=2 means the prediction gets logistic transformed
+# in most cases we want loss_type = 0, to get the untransformed score to compute the gradient
+param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'loss_type':2 }
+
+# user-defined objective function: given prediction, return gradient and second order gradient
+def logregobj( preds, dtrain ):
+    labels = dtrain.get_label()
+    grad = preds - labels
+    hess = preds * (1.0-preds)
+    return grad, hess
+
+# training with customized objective; we can also do step-by-step training, see xgboost.py's implementation of train
+bst = xgb.train( param, dtrain, num_round, evallist, logregobj )
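Note: logregobj above returns the textbook logistic-loss derivatives: with margin x, p = sigmoid(x), and label y, the first and second derivatives of -y*x + log(1+exp(x)) are p - y and p*(1-p). A quick standalone numerical check (the values here are hypothetical):

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def loss(x, y):
        # negative log-likelihood at margin x, label y
        return -y * x + np.log(1.0 + np.exp(x))

    x, y, eps = 0.7, 1.0, 1e-4
    p = sigmoid(x)
    num_grad = (loss(x + eps, y) - loss(x - eps, y)) / (2 * eps)
    num_hess = (loss(x + eps, y) - 2 * loss(x, y) + loss(x - eps, y)) / eps ** 2
    assert abs(num_grad - (p - y)) < 1e-6
    assert abs(num_hess - p * (1 - p)) < 1e-5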
python/example/featmap.txt (new file)

@@ -0,0 +1,126 @@
+0 cap-shape=bell i
+1 cap-shape=conical i
+2 cap-shape=convex i
+3 cap-shape=flat i
+4 cap-shape=knobbed i
+5 cap-shape=sunken i
+6 cap-surface=fibrous i
+7 cap-surface=grooves i
+8 cap-surface=scaly i
+9 cap-surface=smooth i
+10 cap-color=brown i
+11 cap-color=buff i
+12 cap-color=cinnamon i
+13 cap-color=gray i
+14 cap-color=green i
+15 cap-color=pink i
+16 cap-color=purple i
+17 cap-color=red i
+18 cap-color=white i
+19 cap-color=yellow i
+20 bruises?=bruises i
+21 bruises?=no i
+22 odor=almond i
+23 odor=anise i
+24 odor=creosote i
+25 odor=fishy i
+26 odor=foul i
+27 odor=musty i
+28 odor=none i
+29 odor=pungent i
+30 odor=spicy i
+31 gill-attachment=attached i
+32 gill-attachment=descending i
+33 gill-attachment=free i
+34 gill-attachment=notched i
+35 gill-spacing=close i
+36 gill-spacing=crowded i
+37 gill-spacing=distant i
+38 gill-size=broad i
+39 gill-size=narrow i
+40 gill-color=black i
+41 gill-color=brown i
+42 gill-color=buff i
+43 gill-color=chocolate i
+44 gill-color=gray i
+45 gill-color=green i
+46 gill-color=orange i
+47 gill-color=pink i
+48 gill-color=purple i
+49 gill-color=red i
+50 gill-color=white i
+51 gill-color=yellow i
+52 stalk-shape=enlarging i
+53 stalk-shape=tapering i
+54 stalk-root=bulbous i
+55 stalk-root=club i
+56 stalk-root=cup i
+57 stalk-root=equal i
+58 stalk-root=rhizomorphs i
+59 stalk-root=rooted i
+60 stalk-root=missing i
+61 stalk-surface-above-ring=fibrous i
+62 stalk-surface-above-ring=scaly i
+63 stalk-surface-above-ring=silky i
+64 stalk-surface-above-ring=smooth i
+65 stalk-surface-below-ring=fibrous i
+66 stalk-surface-below-ring=scaly i
+67 stalk-surface-below-ring=silky i
+68 stalk-surface-below-ring=smooth i
+69 stalk-color-above-ring=brown i
+70 stalk-color-above-ring=buff i
+71 stalk-color-above-ring=cinnamon i
+72 stalk-color-above-ring=gray i
+73 stalk-color-above-ring=orange i
+74 stalk-color-above-ring=pink i
+75 stalk-color-above-ring=red i
+76 stalk-color-above-ring=white i
+77 stalk-color-above-ring=yellow i
+78 stalk-color-below-ring=brown i
+79 stalk-color-below-ring=buff i
+80 stalk-color-below-ring=cinnamon i
+81 stalk-color-below-ring=gray i
+82 stalk-color-below-ring=orange i
+83 stalk-color-below-ring=pink i
+84 stalk-color-below-ring=red i
+85 stalk-color-below-ring=white i
+86 stalk-color-below-ring=yellow i
+87 veil-type=partial i
+88 veil-type=universal i
+89 veil-color=brown i
+90 veil-color=orange i
+91 veil-color=white i
+92 veil-color=yellow i
+93 ring-number=none i
+94 ring-number=one i
+95 ring-number=two i
+96 ring-type=cobwebby i
+97 ring-type=evanescent i
+98 ring-type=flaring i
+99 ring-type=large i
+100 ring-type=none i
+101 ring-type=pendant i
+102 ring-type=sheathing i
+103 ring-type=zone i
+104 spore-print-color=black i
+105 spore-print-color=brown i
+106 spore-print-color=buff i
+107 spore-print-color=chocolate i
+108 spore-print-color=green i
+109 spore-print-color=orange i
+110 spore-print-color=purple i
+111 spore-print-color=white i
+112 spore-print-color=yellow i
+113 population=abundant i
+114 population=clustered i
+115 population=numerous i
+116 population=scattered i
+117 population=several i
+118 population=solitary i
+119 habitat=grasses i
+120 habitat=leaves i
+121 habitat=meadows i
+122 habitat=paths i
+123 habitat=urban i
+124 habitat=waste i
+125 habitat=woods i
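Note: each featmap.txt row is "feature_index feature_name type", where type "i" marks an indicator (0/1) feature; demo.py passes this file to dump_model so splits print with readable names instead of bare indices. A tiny illustrative parser (a hypothetical helper, not part of the commit):

    def load_featmap(path):
        # map feature index -> (name, feature type)
        fmap = {}
        for line in open(path):
            idx, name, ftype = line.split()
            fmap[int(idx)] = (name, ftype)
        return fmap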
python/xgboost.py (new file)

@@ -0,0 +1,169 @@
+# module for xgboost
+import ctypes
+import os
+# optionally have scipy sparse, though not necessary
+import numpy
+import numpy.ctypeslib
+import scipy.sparse as scp
+
+# set this line correctly
+XGBOOST_PATH = os.path.dirname(__file__)+'/libxgboostpy.so'
+
+# entry type of sparse matrix
+class REntry(ctypes.Structure):
+    _fields_ = [("findex", ctypes.c_uint), ("fvalue", ctypes.c_float) ]
+
+# load in xgboost library
+xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH)
+
+xglib.XGDMatrixCreate.restype = ctypes.c_void_p
+xglib.XGDMatrixNumRow.restype = ctypes.c_ulong
+xglib.XGDMatrixGetLabel.restype = ctypes.POINTER( ctypes.c_float )
+xglib.XGDMatrixGetRow.restype = ctypes.POINTER( REntry )
+xglib.XGBoosterPredict.restype = ctypes.POINTER( ctypes.c_float )
+
+# data matrix used in xgboost
+class DMatrix:
+    # constructor
+    def __init__(self, data=None, label=None):
+        self.handle = xglib.XGDMatrixCreate()
+        if data == None:
+            return
+        if isinstance(data,str):
+            xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(data), 1)
+        elif isinstance(data,scp.csr_matrix):
+            self.__init_from_csr(data)
+        else:
+            try:
+                csr = scp.csr_matrix(data)
+                self.__init_from_csr(csr)
+            except:
+                raise Exception, "can not initialize DMatrix from "+str(type(data))
+        if label != None:
+            self.set_label(label)
+
+    # convert data from csr matrix
+    def __init_from_csr(self,csr):
+        assert len(csr.indices) == len(csr.data)
+        xglib.XGDMatrixParseCSR( self.handle,
+                                 ( ctypes.c_ulong * len(csr.indptr) )(*csr.indptr),
+                                 ( ctypes.c_uint * len(csr.indices) )(*csr.indices),
+                                 ( ctypes.c_float * len(csr.data) )(*csr.data),
+                                 len(csr.indptr), len(csr.data) )
+    # destructor
+    def __del__(self):
+        xglib.XGDMatrixFree(self.handle)
+    # load data from file
+    def load(self, fname, silent=True):
+        xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(fname), int(silent))
+    # save data to binary file
+    def save_binary(self, fname, silent=True):
+        xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname), int(silent))
+    # set label of dmatrix
+    def set_label(self, label):
+        xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float*len(label))(*label), len(label) )
+    # set group size of dmatrix, used for rank
+    def set_group(self, group):
+        xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group) )
+    # set weight of each instance
+    def set_weight(self, weight):
+        xglib.XGDMatrixSetWeight(self.handle, (ctypes.c_float*len(weight))(*weight), len(weight) )
+    # get label from dmatrix
+    def get_label(self):
+        length = ctypes.c_ulong()
+        labels = xglib.XGDMatrixGetLabel(self.handle, ctypes.byref(length))
+        return numpy.array( [labels[i] for i in xrange(length.value)] )
+    # clear everything
+    def clear(self):
+        xglib.XGDMatrixClear(self.handle)
+    def num_row(self):
+        return xglib.XGDMatrixNumRow(self.handle)
+    # append a row to DMatrix
+    def add_row(self, row):
+        xglib.XGDMatrixAddRow(self.handle, (REntry*len(row))(*row), len(row) )
+    # get n-th row from DMatrix
+    def __getitem__(self, ridx):
+        length = ctypes.c_ulong()
+        row = xglib.XGDMatrixGetRow(self.handle, ridx, ctypes.byref(length) )
+        return [ (int(row[i].findex),row[i].fvalue) for i in xrange(length.value) ]
+
+class Booster:
+    """learner class """
+    def __init__(self, params, cache=[]):
+        """ constructor, param: """
+        for d in cache:
+            assert isinstance(d,DMatrix)
+        dmats = ( ctypes.c_void_p * len(cache) )(*[ ctypes.c_void_p(d.handle) for d in cache])
+        self.handle = xglib.XGBoosterCreate( dmats, len(cache) )
+        self.set_param( params )
+    def __del__(self):
+        xglib.XGBoosterFree(self.handle)
+    def set_param(self, params,pv=None):
+        if isinstance(params,dict):
+            for k, v in params.iteritems():
+                xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(k), ctypes.c_char_p(str(v)) )
+        elif isinstance(params,str) and pv != None:
+            xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(params), ctypes.c_char_p(str(pv)) )
+        else:
+            for k, v in params:
+                xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(k), ctypes.c_char_p(str(v)) )
+    def update(self, dtrain):
+        """ update """
+        assert isinstance(dtrain, DMatrix)
+        xglib.XGBoosterUpdateOneIter( self.handle, dtrain.handle )
+    def boost(self, dtrain, grad, hess, bst_group = -1):
+        """ update """
+        assert len(grad) == len(hess)
+        assert isinstance(dtrain, DMatrix)
+        xglib.XGBoosterBoostOneIter( self.handle, dtrain.handle,
+                                     (ctypes.c_float*len(grad))(*grad),
+                                     (ctypes.c_float*len(hess))(*hess),
+                                     len(grad), bst_group )
+    def update_interact(self, dtrain, action, booster_index=None):
+        """ beta: update with specified action"""
+        assert isinstance(dtrain, DMatrix)
+        if booster_index != None:
+            self.set_param('interact:booster_index', str(booster_index))
+        xglib.XGBoosterUpdateInteract( self.handle, dtrain.handle, ctypes.c_char_p(str(action)) )
+    def eval_set(self, evals, it = 0):
+        for d in evals:
+            assert isinstance(d[0], DMatrix)
+            assert isinstance(d[1], str)
+        dmats = ( ctypes.c_void_p * len(evals) )(*[ ctypes.c_void_p(d[0].handle) for d in evals])
+        evnames = ( ctypes.c_char_p * len(evals) )(*[ ctypes.c_char_p(d[1]) for d in evals])
+        xglib.XGBoosterEvalOneIter( self.handle, it, dmats, evnames, len(evals) )
+    def eval(self, mat, name = 'eval', it = 0 ):
+        self.eval_set( [(mat,name)], it)
+    def predict(self, data, bst_group = -1):
+        length = ctypes.c_ulong()
+        preds = xglib.XGBoosterPredict( self.handle, data.handle, ctypes.byref(length), bst_group)
+        return numpy.array( [ preds[i] for i in xrange(length.value)])
+    def save_model(self, fname):
+        """ save model to file """
+        xglib.XGBoosterSaveModel( self.handle, ctypes.c_char_p(fname) )
+    def load_model(self, fname):
+        """load model from file"""
+        xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname) )
+    def dump_model(self, fname, fmap=''):
+        """dump model into text file"""
+        xglib.XGBoosterDumpModel( self.handle, ctypes.c_char_p(fname), ctypes.c_char_p(fmap) )
+
+def train(params, dtrain, num_boost_round = 10, evals = [], obj=None):
+    """ train a booster with given parameters """
+    bst = Booster(params, [dtrain] )
+    if obj == None:
+        for i in xrange(num_boost_round):
+            bst.update( dtrain )
+            if len(evals) != 0:
+                bst.eval_set( evals, i )
+    else:
+        # try customized objective function
+        for i in xrange(num_boost_round):
+            pred = bst.predict( dtrain )
+            grad, hess = obj( pred, dtrain )
+            bst.boost( dtrain, grad, hess )
+            if len(evals) != 0:
+                bst.eval_set( evals, i )
+    return bst
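Note: the restype assignments near the top of xgboost.py matter because ctypes assumes C functions return int, so the 64-bit pointers returned by XGDMatrixGetLabel or XGBoosterPredict would otherwise be truncated. A self-contained sketch of the same pattern against libc (assumes a standard C library can be located; not xgboost-specific):

    import ctypes, ctypes.util

    libc = ctypes.CDLL(ctypes.util.find_library("c"))
    libc.strchr.restype = ctypes.c_char_p                # declare the return type...
    libc.strchr.argtypes = [ctypes.c_char_p, ctypes.c_int]
    assert libc.strchr(b"agaricus.txt", ord(".")) == b".txt"  # ...so ctypes converts it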
python/xgboost_python.cpp (new file)

@@ -0,0 +1,255 @@
+#include "xgboost_python.h"
+#include "../regrank/xgboost_regrank.h"
+#include "../regrank/xgboost_regrank_data.h"
+
+namespace xgboost{
+    namespace python{
+        class DMatrix: public regrank::DMatrix{
+        public:
+            // whether column is initialized
+            bool init_col_;
+        public:
+            DMatrix(void){
+                init_col_ = false;
+            }
+            ~DMatrix(void){}
+        public:
+            inline void Load(const char *fname, bool silent){
+                this->CacheLoad(fname, silent);
+                init_col_ = this->data.HaveColAccess();
+            }
+            inline void Clear( void ){
+                this->data.Clear();
+                this->info.labels.clear();
+                this->info.weights.clear();
+                this->info.group_ptr.clear();
+            }
+            inline size_t NumRow( void ) const{
+                return this->data.NumRow();
+            }
+            inline void AddRow( const XGEntry *data, size_t len ){
+                xgboost::booster::FMatrixS &mat = this->data;
+                mat.row_data_.resize( mat.row_ptr_.back() + len );
+                memcpy( &mat.row_data_[mat.row_ptr_.back()], data, sizeof(XGEntry)*len );
+                mat.row_ptr_.push_back( mat.row_ptr_.back() + len );
+                init_col_ = false;
+            }
+            inline const XGEntry* GetRow(unsigned ridx, size_t* len) const{
+                const xgboost::booster::FMatrixS &mat = this->data;
+
+                *len = mat.row_ptr_[ridx+1] - mat.row_ptr_[ridx];
+                return &mat.row_data_[ mat.row_ptr_[ridx] ];
+            }
+            inline void ParseCSR( const size_t *indptr,
+                                  const unsigned *indices,
+                                  const float *data,
+                                  size_t nindptr,
+                                  size_t nelem ){
+                xgboost::booster::FMatrixS &mat = this->data;
+                mat.row_ptr_.resize( nindptr );
+                memcpy( &mat.row_ptr_[0], indptr, sizeof(size_t)*nindptr );
+                mat.row_data_.resize( nelem );
+                for( size_t i = 0; i < nelem; ++ i ){
+                    mat.row_data_[i] = XGEntry(indices[i], data[i]);
+                }
+            }
+            inline void SetLabel( const float *label, size_t len ){
+                this->info.labels.resize( len );
+                memcpy( &(this->info).labels[0], label, sizeof(float)*len );
+            }
+            inline void SetGroup( const unsigned *group, size_t len ){
+                this->info.group_ptr.resize( len + 1 );
+                this->info.group_ptr[0] = 0;
+                for( size_t i = 0; i < len; ++ i ){
+                    this->info.group_ptr[i+1] = this->info.group_ptr[i]+group[i];
+                }
+            }
+            inline void SetWeight( const float *weight, size_t len ){
+                this->info.weights.resize( len );
+                memcpy( &(this->info).weights[0], weight, sizeof(float)*len );
+            }
+            inline const float* GetLabel( size_t* len ) const{
+                *len = this->info.labels.size();
+                return &(this->info.labels[0]);
+            }
+            inline void CheckInit(void){
+                if(!init_col_){
+                    this->data.InitData();
+                }
+                utils::Assert( this->data.NumRow() == this->info.labels.size(), "DMatrix: number of labels must match number of rows in matrix");
+            }
+        };
+
+        class Booster: public xgboost::regrank::RegRankBoostLearner{
+        private:
+            bool init_trainer, init_model;
+        public:
+            Booster(const std::vector<const regrank::DMatrix *> mats){
+                silent = 1;
+                init_trainer = false;
+                init_model = false;
+                this->SetCacheData(mats);
+            }
+            inline void CheckInit(void){
+                if( !init_trainer ){
+                    this->InitTrainer(); init_trainer = true;
+                }
+                if( !init_model ){
+                    this->InitModel(); init_model = true;
+                }
+            }
+            inline void LoadModel( const char *fname ){
+                xgboost::regrank::RegRankBoostLearner::LoadModel(fname);
+                this->init_model = true;
+            }
+            const float *Pred( const DMatrix &dmat, size_t *len, int bst_group ){
+                this->CheckInit();
+
+                this->Predict( this->preds_, dmat, bst_group );
+                *len = this->preds_.size();
+                return &this->preds_[0];
+            }
+            inline void BoostOneIter( const DMatrix &train,
+                                      float *grad, float *hess, size_t len, int bst_group ){
+                this->grad_.resize( len ); this->hess_.resize( len );
+                memcpy( &this->grad_[0], grad, sizeof(float)*len );
+                memcpy( &this->hess_[0], hess, sizeof(float)*len );
+
+                if( grad_.size() == train.Size() ){
+                    if( bst_group < 0 ) bst_group = 0;
+                    base_gbm.DoBoost(grad_, hess_, train.data, train.info.root_index, bst_group);
+                }else{
+                    utils::Assert( bst_group == -1, "must set bst_group to -1 to support all group boosting" );
+                    int ngroup = base_gbm.NumBoosterGroup();
+                    utils::Assert( grad_.size() == train.Size() * (size_t)ngroup, "BUG: UpdateOneIter: mclass" );
+                    std::vector<float> tgrad( train.Size() ), thess( train.Size() );
+                    for( int g = 0; g < ngroup; ++ g ){
+                        memcpy( &tgrad[0], &grad_[g*tgrad.size()], sizeof(float)*tgrad.size() );
+                        memcpy( &thess[0], &hess_[g*tgrad.size()], sizeof(float)*tgrad.size() );
+                        base_gbm.DoBoost(tgrad, thess, train.data, train.info.root_index, g );
+                    }
+                }
+            }
+        };
+    };
+};
+
+using namespace xgboost::python;
+
+extern "C"{
+    void* XGDMatrixCreate( void ){
+        return new DMatrix();
+    }
+    void XGDMatrixFree( void *handle ){
+        delete static_cast<DMatrix*>(handle);
+    }
+    void XGDMatrixLoad( void *handle, const char *fname, int silent ){
+        static_cast<DMatrix*>(handle)->Load(fname, silent!=0);
+    }
+    void XGDMatrixSaveBinary( void *handle, const char *fname, int silent ){
+        static_cast<DMatrix*>(handle)->SaveBinary(fname, silent!=0);
+    }
+    void XGDMatrixParseCSR( void *handle,
+                            const size_t *indptr,
+                            const unsigned *indices,
+                            const float *data,
+                            size_t nindptr,
+                            size_t nelem ){
+        static_cast<DMatrix*>(handle)->ParseCSR(indptr, indices, data, nindptr, nelem);
+    }
+    void XGDMatrixSetLabel( void *handle, const float *label, size_t len ){
+        static_cast<DMatrix*>(handle)->SetLabel(label,len);
+    }
+    void XGDMatrixSetWeight( void *handle, const float *weight, size_t len ){
+        static_cast<DMatrix*>(handle)->SetWeight(weight,len);
+    }
+    void XGDMatrixSetGroup( void *handle, const unsigned *group, size_t len ){
+        static_cast<DMatrix*>(handle)->SetGroup(group,len);
+    }
+    const float* XGDMatrixGetLabel( const void *handle, size_t* len ){
+        return static_cast<const DMatrix*>(handle)->GetLabel(len);
+    }
+    void XGDMatrixClear(void *handle){
+        static_cast<DMatrix*>(handle)->Clear();
+    }
+    void XGDMatrixAddRow( void *handle, const XGEntry *data, size_t len ){
+        static_cast<DMatrix*>(handle)->AddRow(data, len);
+    }
+    size_t XGDMatrixNumRow(const void *handle){
+        return static_cast<const DMatrix*>(handle)->NumRow();
+    }
+    const XGEntry* XGDMatrixGetRow(void *handle, unsigned ridx, size_t* len){
+        return static_cast<DMatrix*>(handle)->GetRow(ridx, len);
+    }
+
+    // xgboost implementation
+    void *XGBoosterCreate( void *dmats[], size_t len ){
+        std::vector<const xgboost::regrank::DMatrix*> mats;
+        for( size_t i = 0; i < len; ++i ){
+            DMatrix *dtr = static_cast<DMatrix*>(dmats[i]);
+            dtr->CheckInit();
+            mats.push_back( dtr );
+        }
+        return new Booster( mats );
+    }
+    void XGBoosterFree( void *handle ){
+        delete static_cast<Booster*>(handle);
+    }
+    void XGBoosterSetParam( void *handle, const char *name, const char *value ){
+        static_cast<Booster*>(handle)->SetParam( name, value );
+    }
+    void XGBoosterUpdateOneIter( void *handle, void *dtrain ){
+        Booster *bst = static_cast<Booster*>(handle);
+        DMatrix *dtr = static_cast<DMatrix*>(dtrain);
+        bst->CheckInit(); dtr->CheckInit();
+        bst->UpdateOneIter( *dtr );
+    }
+    void XGBoosterBoostOneIter( void *handle, void *dtrain,
+                                float *grad, float *hess, size_t len, int bst_group ){
+        Booster *bst = static_cast<Booster*>(handle);
+        DMatrix *dtr = static_cast<DMatrix*>(dtrain);
+        bst->CheckInit(); dtr->CheckInit();
+        bst->BoostOneIter( *dtr, grad, hess, len, bst_group );
+    }
+    void XGBoosterEvalOneIter( void *handle, int iter, void *dmats[], const char *evnames[], size_t len ){
+        Booster *bst = static_cast<Booster*>(handle);
+        bst->CheckInit();
+
+        std::vector<std::string> names;
+        std::vector<const xgboost::regrank::DMatrix*> mats;
+        for( size_t i = 0; i < len; ++i ){
+            mats.push_back( static_cast<DMatrix*>(dmats[i]) );
+            names.push_back( std::string( evnames[i]) );
+        }
+        bst->EvalOneIter( iter, mats, names, stdout );
+    }
+    const float *XGBoosterPredict( void *handle, void *dmat, size_t *len, int bst_group ){
+        return static_cast<Booster*>(handle)->Pred( *static_cast<DMatrix*>(dmat), len, bst_group );
+    }
+    void XGBoosterLoadModel( void *handle, const char *fname ){
+        static_cast<Booster*>(handle)->LoadModel( fname );
+    }
+    void XGBoosterSaveModel( const void *handle, const char *fname ){
+        static_cast<const Booster*>(handle)->SaveModel( fname );
+    }
+    void XGBoosterDumpModel( void *handle, const char *fname, const char *fmap ){
+        using namespace xgboost::utils;
+        FILE *fo = FopenCheck( fname, "w" );
+        FeatMap featmap;
+        if( strlen(fmap) != 0 ){
+            featmap.LoadText( fmap );
+        }
+        static_cast<Booster*>(handle)->DumpModel( fo, featmap, false );
+        fclose( fo );
+    }
+
+    void XGBoosterUpdateInteract( void *handle, void *dtrain, const char *action ){
+        Booster *bst = static_cast<Booster*>(handle);
+        DMatrix *dtr = static_cast<DMatrix*>(dtrain);
+        bst->CheckInit(); dtr->CheckInit();
+        std::string act( action );
+        bst->UpdateInteract( act, *dtr );
+    }
+};
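Note: XGDMatrixParseCSR above consumes the standard CSR triplet: indptr of length nrow+1 plus parallel indices/data arrays, with row i occupying the half-open slice [indptr[i], indptr[i+1]). A small worked example of the layout it expects (values hypothetical):

    indptr  = [0, 2, 3, 5]          # 3 rows
    indices = [0, 4, 1, 0, 3]       # feature indices
    data    = [1.0, 2.5, 7.0, 0.5, 3.0]

    def row(i):
        return list(zip(indices[indptr[i]:indptr[i+1]], data[indptr[i]:indptr[i+1]]))

    assert row(0) == [(0, 1.0), (4, 2.5)]
    assert row(2) == [(0, 0.5), (3, 3.0)]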
python/xgboost_python.h (new file)

@@ -0,0 +1,188 @@
+#ifndef XGBOOST_PYTHON_H
+#define XGBOOST_PYTHON_H
+/*!
+ * \file xgboost_python.h
+ * \brief python wrapper for xgboost, using ctypes,
+ *        hides everything behind functions,
+ *        uses a c style interface
+ */
+#include "../booster/xgboost_data.h"
+extern "C"{
+    /*! \brief type of row entry */
+    typedef xgboost::booster::FMatrixS::REntry XGEntry;
+
+    /*!
+     * \brief create a data matrix
+     * \return a new data matrix
+     */
+    void* XGDMatrixCreate(void);
+    /*!
+     * \brief free space in data matrix
+     */
+    void XGDMatrixFree(void *handle);
+    /*!
+     * \brief load a data matrix from text file or buffer (if it exists)
+     * \param handle an instance of data matrix
+     * \param fname file name
+     * \param silent print statistics when loading
+     */
+    void XGDMatrixLoad(void *handle, const char *fname, int silent);
+    /*!
+     * \brief save a data matrix into binary file
+     * \param handle an instance of data matrix
+     * \param fname file name
+     * \param silent print statistics when saving
+     */
+    void XGDMatrixSaveBinary(void *handle, const char *fname, int silent);
+    /*!
+     * \brief set matrix content from csr format
+     * \param handle an instance of data matrix
+     * \param indptr pointer to row headers
+     * \param indices findex
+     * \param data fvalue
+     * \param nindptr number of rows in the matrix + 1
+     * \param nelem number of nonzero elements in the matrix
+     */
+    void XGDMatrixParseCSR( void *handle,
+                            const size_t *indptr,
+                            const unsigned *indices,
+                            const float *data,
+                            size_t nindptr,
+                            size_t nelem );
+    /*!
+     * \brief set label of the training matrix
+     * \param handle an instance of data matrix
+     * \param label pointer to label
+     * \param len length of array
+     */
+    void XGDMatrixSetLabel( void *handle, const float *label, size_t len );
+    /*!
+     * \brief set group sizes of the training matrix
+     * \param handle an instance of data matrix
+     * \param group pointer to group size
+     * \param len length of array
+     */
+    void XGDMatrixSetGroup( void *handle, const unsigned *group, size_t len );
+    /*!
+     * \brief set weight of each instance
+     * \param handle an instance of data matrix
+     * \param weight pointer to weights
+     * \param len length of array
+     */
+    void XGDMatrixSetWeight( void *handle, const float *weight, size_t len );
+    /*!
+     * \brief get label set from matrix
+     * \param handle an instance of data matrix
+     * \param len used to set result length
+     * \return pointer to the label array
+     */
+    const float* XGDMatrixGetLabel( const void *handle, size_t* len );
+    /*!
+     * \brief clear all the records, including feature matrix and label
+     * \param handle an instance of data matrix
+     */
+    void XGDMatrixClear(void *handle);
+    /*!
+     * \brief return number of rows
+     */
+    size_t XGDMatrixNumRow(const void *handle);
+    /*!
+     * \brief add row
+     * \param handle an instance of data matrix
+     * \param data array of row content
+     * \param len length of array
+     */
+    void XGDMatrixAddRow(void *handle, const XGEntry *data, size_t len);
+    /*!
+     * \brief get ridx-th row of sparse matrix
+     * \param handle handle
+     * \param ridx row index
+     * \param len used to set result length
+     * \return pointer to the row
+     */
+    const XGEntry* XGDMatrixGetRow(void *handle, unsigned ridx, size_t* len);
+
+    // --- start XGBoost class
+    /*!
+     * \brief create xgboost learner
+     * \param dmats matrices that are set to be cached
+     * \return handle of the created booster
+     */
+    void *XGBoosterCreate( void* dmats[], size_t len );
+    /*!
+     * \brief free obj in handle
+     * \param handle handle to be freed
+     */
+    void XGBoosterFree( void* handle );
+    /*!
+     * \brief set parameters
+     * \param handle handle
+     * \param name parameter name
+     * \param value value of parameter
+     */
+    void XGBoosterSetParam( void *handle, const char *name, const char *value );
+    /*!
+     * \brief update the model in one round using dtrain
+     * \param handle handle
+     * \param dtrain training data
+     */
+    void XGBoosterUpdateOneIter( void *handle, void *dtrain );
+
+    /*!
+     * \brief update the model, by directly specifying gradient and second order gradient;
+     *        this can be used to replace UpdateOneIter, to support customized loss functions
+     * \param handle handle
+     * \param dtrain training data
+     * \param grad gradient statistics
+     * \param hess second order gradient statistics
+     * \param len length of grad/hess array
+     * \param bst_group boost group we are working at, default = -1
+     */
+    void XGBoosterBoostOneIter( void *handle, void *dtrain,
+                                float *grad, float *hess, size_t len, int bst_group );
+    /*!
+     * \brief print evaluation statistics to stdout for xgboost
+     * \param handle handle
+     * \param iter current iteration rounds
+     * \param dmats pointers to data to be evaluated
+     * \param evnames pointers to names of each data
+     * \param len length of dmats
+     */
+    void XGBoosterEvalOneIter( void *handle, int iter, void *dmats[], const char *evnames[], size_t len );
+    /*!
+     * \brief make prediction based on dmat
+     * \param handle handle
+     * \param dmat data matrix
+     * \param len used to store length of returning result
+     * \param bst_group booster group; if the model contains multiple booster groups, default = -1 means predict for all groups
+     */
+    const float *XGBoosterPredict( void *handle, void *dmat, size_t *len, int bst_group );
+    /*!
+     * \brief load model from existing file
+     * \param handle handle
+     * \param fname file name
+     */
+    void XGBoosterLoadModel( void *handle, const char *fname );
+    /*!
+     * \brief save model into existing file
+     * \param handle handle
+     * \param fname file name
+     */
+    void XGBoosterSaveModel( const void *handle, const char *fname );
+    /*!
+     * \brief dump model into text file
+     * \param handle handle
+     * \param fname file name
+     * \param fmap path to feature map, can be empty string
+     */
+    void XGBoosterDumpModel( void *handle, const char *fname, const char *fmap );
+    /*!
+     * \brief interactively update model: beta
+     * \param handle handle
+     * \param dtrain training data
+     * \param action action name
+     */
+    void XGBoosterUpdateInteract( void *handle, void *dtrain, const char* action );
+};
+#endif
regrank/xgboost_regrank.h

@@ -28,40 +28,41 @@ namespace xgboost{
                name_obj_ = "reg";
            }
            /*!
            * \brief a regression booster associated with training and evaluating data
-           * \param train pointer to the training data
-           * \param evals array of evaluating data
-           * \param evname name of evaluation data, used print statistics
-           */
-           RegRankBoostLearner(const DMatrix *train,
-                               const std::vector<DMatrix *> &evals,
-                               const std::vector<std::string> &evname){
+           * \param mats array of pointers to matrix whose prediction result need to be cached
+           */
+           RegRankBoostLearner(const std::vector<const DMatrix *>& mats){
                silent = 0;
-               this->SetData(train, evals, evname);
+               obj_ = NULL;
+               name_obj_ = "reg";
+               this->SetCacheData(mats);
            }
            /*!
-           * \brief associate regression booster with training and evaluating data
-           * \param train pointer to the training data
-           * \param evals array of evaluating data
-           * \param evname name of evaluation data, used print statistics
-           */
-           inline void SetData(const DMatrix *train,
-                               const std::vector<DMatrix *> &evals,
-                               const std::vector<std::string> &evname){
-               this->train_ = train;
-               this->evals_ = evals;
-               this->evname_ = evname;
+           * \brief add internal cache space for mat, this can speed up prediction for matrix;
+           *        please cache prediction for training and eval data
+           *        warning: if the model is loaded from a file from some previous training history,
+           *        SetCacheData must be called with exactly the SAME data matrices
+           *        to continue training, otherwise it will cause an error
+           * \param mats array of pointers to matrix whose prediction result need to be cached
+           */
+           inline void SetCacheData(const std::vector<const DMatrix *>& mats){
                // estimate feature bound
-               int num_feature = (int)(train->data.NumCol());
+               int num_feature = 0;
                // assign buffer index
-               unsigned buffer_size = static_cast<unsigned>(train->Size());
+               unsigned buffer_size = 0;

-               for (size_t i = 0; i < evals.size(); ++i){
-                   buffer_size += static_cast<unsigned>(evals[i]->Size());
-                   num_feature = std::max(num_feature, (int)(evals[i]->data.NumCol()));
+               utils::Assert( cache_.size() == 0, "can only call cache data once" );
+               for( size_t i = 0; i < mats.size(); ++i ){
+                   bool duplicate = false;
+                   for( size_t j = 0; j < i; ++ j ){
+                       if( mats[i] == mats[j] ) duplicate = true;
+                   }
+                   if( duplicate ) continue;
+                   cache_.push_back( CacheEntry( mats[i], buffer_size ) );
+                   buffer_size += static_cast<unsigned>(mats[i]->Size());
+                   num_feature = std::max(num_feature, (int)(mats[i]->data.NumCol()));
                }

                char str_temp[25];
                if (num_feature > mparam.num_feature){
                    mparam.num_feature = num_feature;
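Note: SetCacheData above assigns each distinct cached matrix a contiguous block of prediction-buffer indices, starting at the running row-count total. A compact sketch of that bookkeeping (row counts borrowed from the agaricus files in this commit; names are just labels):

    mats = [("train", 6513), ("test", 1611)]   # (name, number of rows)

    cache, buffer_size = [], 0
    for name, nrow in mats:
        cache.append((name, buffer_size))      # CacheEntry(mat, buffer_start)
        buffer_size += nrow

    assert cache == [("train", 0), ("test", 6513)] and buffer_size == 8124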
@@ -74,19 +75,18 @@ namespace xgboost{
            if (!silent){
                printf("buffer_size=%u\n", buffer_size);
            }
-           // set eval_preds tmp space
-           this->eval_preds_.resize(evals.size(), std::vector<float>());
        }

        /*!
        * \brief set parameters from outside
        * \param name name of the parameter
        * \param val value of the parameter
        */
        inline void SetParam(const char *name, const char *val){
            if (!strcmp(name, "silent")) silent = atoi(val);
            if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
            if (!strcmp(name, "objective") ) name_obj_ = val;
+           if (!strcmp(name, "num_class") ) base_gbm.SetParam("num_booster_group", val );
            mparam.SetParam(name, val);
            base_gbm.SetParam(name, val);
            cfg_.push_back( std::make_pair( std::string(name), std::string(val) ) );
@@ -96,7 +96,13 @@ namespace xgboost{
      * this function is reserved for solver to allocate necessary space and do other preparation
      */
     inline void InitTrainer(void){
+        if( mparam.num_class != 0 ){
+            if( name_obj_ != "softmax" ){
+                name_obj_ = "softmax";
+                printf("auto select objective=softmax to support multi-class classification\n" );
+            }
+        }
         base_gbm.InitTrainer();
         obj_ = CreateObjFunction( name_obj_.c_str() );
         for( size_t i = 0; i < cfg_.size(); ++i ){
             obj_->SetParam( cfg_[i].first.c_str(), cfg_[i].second.c_str() );
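
A short sketch of how these pieces interact for multi-class training (hypothetical calls, using only the methods shown in this diff): setting num_class fans out to the gradient booster as num_booster_group, and InitTrainer then forces the softmax objective if none was chosen:

    learner.SetParam("num_class", "3");       // also sets base_gbm num_booster_group = 3
    learner.SetParam("objective", "softmax"); // optional: InitTrainer auto-selects it anyway
    learner.InitTrainer();                    // creates SoftmaxMultiClassObj via the factory
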
@@ -104,16 +110,25 @@ namespace xgboost{
         evaluator_.AddEval( obj_->DefaultEvalMetric() );
     }
     /*!
     * \brief initialize the current data storage for the model; if the model is used for the first time, call this function
     */
     inline void InitModel(void){
         base_gbm.InitModel();
         mparam.AdjustBase();
     }
     /*!
-    * \brief load model from stream
-    * \param fi input stream
+    * \brief load model from file
+    * \param fname file name
     */
+    inline void LoadModel(const char *fname){
+        utils::FileStream fi(utils::FopenCheck(fname, "rb"));
+        this->LoadModel(fi);
+        fi.Close();
+    }
+    /*!
+    * \brief load model from stream
+    * \param fi input stream
+    */
     inline void LoadModel(utils::IStream &fi){
         base_gbm.LoadModel(fi);
         utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0);
@@ -144,77 +159,91 @@ namespace xgboost{
         fo.Write(&mparam, sizeof(ModelParam));
     }
     /*!
-    * \brief update the model for one iteration
-    * \param iter iteration number
+    * \brief save model into file
+    * \param fname file name
     */
-    inline void UpdateOneIter(int iter){
-        this->PredictBuffer(preds_, *train_, 0);
-        obj_->GetGradient(preds_, train_->info, base_gbm.NumBoosters(), grad_, hess_);
-        std::vector<unsigned> root_index;
-        base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
+    inline void SaveModel(const char *fname) const{
+        utils::FileStream fo(utils::FopenCheck(fname, "wb"));
+        this->SaveModel(fo);
+        fo.Close();
+    }
+    /*!
+    * \brief update the model for one iteration
+    * \param train the training data
+    */
+    inline void UpdateOneIter(const DMatrix &train){
+        this->PredictRaw(preds_, train);
+        obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_);
+        if( grad_.size() == train.Size() ){
+            // single booster group: boost directly on the full gradient
+            base_gbm.DoBoost(grad_, hess_, train.data, train.info.root_index);
+        }else{
+            // multi-class: the gradient is laid out group by group
+            int ngroup = base_gbm.NumBoosterGroup();
+            utils::Assert( grad_.size() == train.Size() * (size_t)ngroup, "BUG: UpdateOneIter: mclass" );
+            std::vector<float> tgrad( train.Size() ), thess( train.Size() );
+            for( int g = 0; g < ngroup; ++g ){
+                memcpy( &tgrad[0], &grad_[g*tgrad.size()], sizeof(float)*tgrad.size() );
+                memcpy( &thess[0], &hess_[g*tgrad.size()], sizeof(float)*tgrad.size() );
+                base_gbm.DoBoost(tgrad, thess, train.data, train.info.root_index, g );
+            }
+        }
     }
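
The multi-class branch above relies on a group-major layout: for n = train.Size() instances and ngroup classes, the gradient for class g of instance j lives at grad_[g * n + j], so each memcpy slices out one contiguous per-class block. A tiny index helper makes the convention explicit (hypothetical, not in the source):

    // gradient/hessian entry of instance j under booster group (class) g,
    // given n training instances; class blocks are contiguous
    inline size_t GradIndex(size_t j, int g, size_t n){
        return static_cast<size_t>(g) * n + j;
    }
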
     /*!
     * \brief evaluate the model for a specific iteration
     * \param iter iteration number
+    * \param evals data sets we want to evaluate
+    * \param evname name of each data set
     * \param fo file to output log
     */
-    inline void EvalOneIter(int iter, FILE *fo = stderr){
+    inline void EvalOneIter(int iter,
+                            const std::vector<const DMatrix*> &evals,
+                            const std::vector<std::string> &evname,
+                            FILE *fo = stderr){
         fprintf(fo, "[%d]", iter);
-        int buffer_offset = static_cast<int>(train_->Size());
-        for (size_t i = 0; i < evals_.size(); ++i){
-            std::vector<float> &preds = this->eval_preds_[i];
-            this->PredictBuffer(preds, *evals_[i], buffer_offset);
-            obj_->PredTransform(preds);
-            evaluator_.Eval(fo, evname_[i].c_str(), preds, evals_[i]->info);
-            buffer_offset += static_cast<int>(evals_[i]->Size());
+        for (size_t i = 0; i < evals.size(); ++i){
+            this->PredictRaw(preds_, *evals[i]);
+            obj_->PredTransform(preds_);
+            evaluator_.Eval(fo, evname[i].c_str(), preds_, evals[i]->info);
         }
         fprintf(fo, "\n");
         fflush(fo);
     }
-    /*! \brief get prediction, without buffering */
-    inline void Predict(std::vector<float> &preds, const DMatrix &data){
-        preds.resize(data.Size());
-        const unsigned ndata = static_cast<unsigned>(data.Size());
-        #pragma omp parallel for schedule( static )
-        for (unsigned j = 0; j < ndata; ++j){
-            preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1);
-        }
+    /*!
+    * \brief get prediction
+    * \param preds storage to store the prediction
+    * \param data input data
+    * \param bst_group booster group we are in
+    */
+    inline void Predict(std::vector<float> &preds, const DMatrix &data, int bst_group = -1){
+        this->PredictRaw( preds, data, bst_group );
         obj_->PredTransform( preds );
     }
     public:
     /*!
     * \brief interactive update
     * \param action action type
+    * \param train training data
     */
-    inline void UpdateInteract(std::string action){
-        this->InteractPredict(preds_, *train_, 0);
-        int buffer_offset = static_cast<int>(train_->Size());
-        for (size_t i = 0; i < evals_.size(); ++i){
-            std::vector<float> &preds = this->eval_preds_[i];
-            this->InteractPredict(preds, *evals_[i], buffer_offset);
-            buffer_offset += static_cast<int>(evals_[i]->Size());
+    inline void UpdateInteract(std::string action, const DMatrix& train){
+        for(size_t i = 0; i < cache_.size(); ++i){
+            this->InteractPredict(preds_, *cache_[i].mat_);
         }

         if (action == "remove"){
             base_gbm.DelteBooster(); return;
         }

-        obj_->GetGradient(preds_, train_->info, base_gbm.NumBoosters(), grad_, hess_);
+        obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_);
         std::vector<unsigned> root_index;
-        base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
+        base_gbm.DoBoost(grad_, hess_, train.data, root_index);

-        this->InteractRePredict(*train_, 0);
-        buffer_offset = static_cast<int>(train_->Size());
-        for (size_t i = 0; i < evals_.size(); ++i){
-            this->InteractRePredict(*evals_[i], buffer_offset);
-            buffer_offset += static_cast<int>(evals_[i]->Size());
+        for(size_t i = 0; i < cache_.size(); ++i){
+            this->InteractRePredict(*cache_[i].mat_);
         }
     }
     private:
     /*! \brief get the transformed predictions, given data */
-    inline void InteractPredict(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset){
+    inline void InteractPredict(std::vector<float> &preds, const DMatrix &data){
+        int buffer_offset = this->FindBufferOffset(data);
+        utils::Assert( buffer_offset >= 0, "interact mode must cache training data" );
         preds.resize(data.Size());
         const unsigned ndata = static_cast<unsigned>(data.Size());
         #pragma omp parallel for schedule( static )
@@ -224,21 +253,42 @@ namespace xgboost{
         obj_->PredTransform( preds );
     }
     /*! \brief repredict trial */
-    inline void InteractRePredict(const DMatrix &data, unsigned buffer_offset){
+    inline void InteractRePredict(const DMatrix &data){
+        int buffer_offset = this->FindBufferOffset(data);
+        utils::Assert( buffer_offset >= 0, "interact mode must cache training data" );
         const unsigned ndata = static_cast<unsigned>(data.Size());
         #pragma omp parallel for schedule( static )
         for (unsigned j = 0; j < ndata; ++j){
             base_gbm.InteractRePredict(data.data, j, buffer_offset + j);
         }
     }
-    private:
-    /*! \brief get the transformed predictions, given data */
-    inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset){
-        preds.resize(data.Size());
-        const unsigned ndata = static_cast<unsigned>(data.Size());
-        #pragma omp parallel for schedule( static )
-        for (unsigned j = 0; j < ndata; ++j){
-            preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j);
-        }
-    }
+    /*! \brief get un-transformed prediction */
+    inline void PredictRaw(std::vector<float> &preds, const DMatrix &data, int bst_group = -1 ){
+        int buffer_offset = this->FindBufferOffset(data);
+        if( bst_group < 0 ){
+            // predict for all booster groups, laying results out group by group
+            int ngroup = base_gbm.NumBoosterGroup();
+            preds.resize( data.Size() * ngroup );
+            for( int g = 0; g < ngroup; ++g ){
+                this->PredictBuffer(&preds[ data.Size() * g ], data, buffer_offset, g );
+            }
+        }else{
+            preds.resize( data.Size() );
+            this->PredictBuffer(&preds[0], data, buffer_offset, bst_group );
+        }
+    }
+    /*! \brief get the un-transformed predictions, given data */
+    inline void PredictBuffer(float *preds, const DMatrix &data, int buffer_offset, int bst_group ){
+        const unsigned ndata = static_cast<unsigned>(data.Size());
+        if( buffer_offset >= 0 ){
+            // cached matrix: reuse the prediction buffer at the recorded offset
+            #pragma omp parallel for schedule( static )
+            for (unsigned j = 0; j < ndata; ++j){
+                preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j, data.info.GetRoot(j), bst_group );
+            }
+        }else{
+            // uncached matrix: predict from scratch (buffer index -1)
+            #pragma omp parallel for schedule( static )
+            for (unsigned j = 0; j < ndata; ++j){
+                preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1, data.info.GetRoot(j), bst_group );
+            }
+        }
+    }
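
Note how the two branches tie back to SetCacheData: FindBufferOffset returns -1 for any matrix that was never registered, and PredictBuffer then passes -1 as the buffer index so GBMBase predicts from scratch instead of reusing the stored running prediction (pred_buffer/pred_counter). A sketch of the calling pattern (hypothetical matrices):

    std::vector<float> out;
    learner.Predict(out, cached_valid); // registered matrix: fast path, buffer_offset >= 0
    learner.Predict(out, fresh_test);   // unregistered: still correct, falls back to -1
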
     private:
@@ -249,24 +299,28 @@ namespace xgboost{
         /* \brief type of loss function */
         int loss_type;
         /* \brief number of features */
         int num_feature;
+        /* \brief number of classes, if it is multi-class classification */
+        int num_class;
         /*! \brief reserved field */
-        int reserved[16];
+        int reserved[15];  // one reserved slot traded for num_class, keeping sizeof(ModelParam) unchanged for old model files
        /*! \brief constructor */
         ModelParam(void){
             base_score = 0.5f;
             loss_type = 0;
             num_feature = 0;
+            num_class = 0;
             memset(reserved, 0, sizeof(reserved));
         }
         /*!
         * \brief set parameters from outside
         * \param name name of the parameter
         * \param val value of the parameter
         */
         inline void SetParam(const char *name, const char *val){
             if (!strcmp("base_score", name)) base_score = (float)atof(val);
             if (!strcmp("loss_type", name)) loss_type = atoi(val);
+            if (!strcmp("num_class", name)) num_class = atoi(val);
             if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
         }
         /*!
@@ -280,22 +334,34 @@ namespace xgboost{
         }
     };
     private:
+        // one cache entry per registered matrix: the matrix and the start of
+        // its rows inside the shared prediction buffer
+        struct CacheEntry{
+            const DMatrix *mat_;
+            int buffer_offset_;
+            CacheEntry(const DMatrix *mat, int buffer_offset)
+                :mat_(mat), buffer_offset_(buffer_offset){}
+        };
+        /*! \brief the entries indicate that we have an internal prediction cache */
+        std::vector<CacheEntry> cache_;
+    private:
+        // find the internal buffer offset for a certain matrix; if it does not exist, return -1
+        inline int FindBufferOffset(const DMatrix &mat){
+            for(size_t i = 0; i < cache_.size(); ++i){
+                if( cache_[i].mat_ == &mat ) return cache_[i].buffer_offset_;
+            }
+            return -1;
+        }
+    protected:
         int silent;
         EvalSet evaluator_;
         booster::GBMBase base_gbm;
         ModelParam mparam;
-        const DMatrix *train_;
-        std::vector<DMatrix *> evals_;
-        std::vector<std::string> evname_;
-        std::vector<unsigned> buffer_index_;
         // objective function
         IObjFunction *obj_;
         // name of objective function
         std::string name_obj_;
         std::vector< std::pair<std::string, std::string> > cfg_;
-    private:
+    protected:
         std::vector<float> grad_, hess_, preds_;
-        std::vector< std::vector<float> > eval_preds_;
     };
 }
 };
@@ -35,11 +35,17 @@ namespace xgboost{
         std::vector<unsigned> group_ptr;
         /*! \brief weights of each instance, optional */
         std::vector<float> weights;
+        /*! \brief specified root index of each instance, can be used for multi-task setting */
+        std::vector<unsigned> root_index;
         /*! \brief get weight of each instance */
         inline float GetWeight( size_t i ) const{
             if( weights.size() != 0 ) return weights[i];
             else return 1.0f;
         }
+        /*! \brief get root of the i-th instance, defaulting to 0 (returned as unsigned to match root_index) */
+        inline unsigned GetRoot( size_t i ) const{
+            if( root_index.size() != 0 ) return root_index[i];
+            else return 0;
+        }
     };
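
A hedged sketch of how a caller could use the new field for pre-partitioned multi-task training (the per-instance task ids are hypothetical; only root_index itself comes from this diff):

    xgboost::regrank::DMatrix train;
    // ... load data ...
    train.info.root_index.resize(train.Size());
    for (size_t j = 0; j < train.Size(); ++j){
        train.info.root_index[j] = task_id_of[j]; // hypothetical per-instance task id
    }
    // prediction and boosting will now start from root task_id_of[j] for instance j
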
     public:
     /*! \brief feature data content */
@@ -113,12 +119,13 @@ namespace xgboost{
         if( fs.Read(&ngptr, sizeof(unsigned) ) != 0 ){
             info.group_ptr.resize( ngptr );
-            utils::Assert( fs.Read(&info.group_ptr[0], sizeof(unsigned) * ngptr) != 0, "Load group file");
-            utils::Assert( info.group_ptr.back() == data.NumRow(), "number of group must match number of record" );
+            if( ngptr != 0 ){
+                utils::Assert( fs.Read(&info.group_ptr[0], sizeof(unsigned) * ngptr) != 0, "Load group file");
+                utils::Assert( info.group_ptr.back() == data.NumRow(), "number of group must match number of record" );
+            }
         }
     }
     fs.Close();
-    // initialize column support as well
-    data.InitData();
     if (!silent){
         printf("%ux%u matrix with %lu entries is loaded from %s\n",
@@ -146,7 +153,9 @@ namespace xgboost{
     { // write out group ptr
         unsigned ngptr = static_cast<unsigned>( info.group_ptr.size() );
         fs.Write(&ngptr, sizeof(unsigned) );
-        fs.Write(&info.group_ptr[0], sizeof(unsigned) * ngptr);
+        if( ngptr != 0 ){
+            fs.Write(&info.group_ptr[0], sizeof(unsigned) * ngptr);
+        }
     }
     fs.Close();
     if (!silent){
@@ -169,7 +178,11 @@ namespace xgboost{
     inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
         int len = strlen(fname);
         if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
-            this->LoadBinary(fname, silent); return;
+            if( !this->LoadBinary(fname, silent) ){
+                fprintf(stderr, "can not open file \"%s\"\n", fname);
+                utils::Error("DMatrix::CacheLoad failed");
+            }
+            return;
         }
         char bname[1024];
         sprintf(bname, "%s.buffer", fname);
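
The .buffer convention keeps one call path for cold and warm starts: given a text file, the rest of the function (outside this hunk) presumably writes a binary `fname.buffer` next to it and prefers that buffer on later runs, while a path that already ends in .buffer is now loaded directly and fails loudly instead of silently continuing. A minimal usage sketch (hypothetical file names):

    xgboost::regrank::DMatrix dtrain;
    dtrain.CacheLoad("train.txt");        // first run: parse text, save train.txt.buffer
    // later runs reuse the buffer automatically, or load it explicitly:
    // dtrain.CacheLoad("train.txt.buffer");
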
@@ -13,6 +13,7 @@
 #include "../utils/xgboost_omp.h"
 #include "../utils/xgboost_random.h"
 #include "xgboost_regrank_data.h"
+#include "xgboost_regrank_utils.h"

 namespace xgboost{
     namespace regrank{
@@ -31,17 +32,11 @@ namespace xgboost{
         virtual ~IEvaluator(void){}
     };

-    inline static bool CmpFirst(const std::pair<float, unsigned> &a, const std::pair<float, unsigned> &b){
-        return a.first > b.first;
-    }
-    inline static bool CmpSecond(const std::pair<float, unsigned> &a, const std::pair<float, unsigned> &b){
-        return a.second > b.second;
-    }

     /*! \brief RMSE */
     struct EvalRMSE : public IEvaluator{
         virtual float Eval(const std::vector<float> &preds,
                            const DMatrix::Info &info) const {
+            utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
             const unsigned ndata = static_cast<unsigned>(preds.size());
             float sum = 0.0, wsum = 0.0;
             #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
@@ -62,6 +57,7 @@ namespace xgboost{
     struct EvalLogLoss : public IEvaluator{
         virtual float Eval(const std::vector<float> &preds,
                            const DMatrix::Info &info) const {
+            utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
             const unsigned ndata = static_cast<unsigned>(preds.size());
             float sum = 0.0f, wsum = 0.0f;
             #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
@@ -106,7 +102,8 @@ namespace xgboost{
     /*! \brief Area under curve, for both classification and rank */
     struct EvalAuc : public IEvaluator{
         virtual float Eval(const std::vector<float> &preds,
                            const DMatrix::Info &info) const {
+            utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
             std::vector<unsigned> tgptr(2, 0); tgptr[1] = preds.size();
             const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
             utils::Assert(gptr.back() == preds.size(), "EvalAuc: group structure must match number of prediction");
@@ -159,6 +156,7 @@ namespace xgboost{
     public:
         virtual float Eval(const std::vector<float> &preds,
                            const DMatrix::Info &info) const {
+            utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
             const std::vector<unsigned> &gptr = info.group_ptr;
             utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
             utils::Assert( gptr.back() == preds.size(), "EvalRanklist: group structure must match number of prediction");
@@ -62,6 +62,7 @@ namespace xgboost{
     if (!strcmp("seed", name)) random::Seed(atoi(val));
     if (!strcmp("num_round", name)) num_round = atoi(val);
     if (!strcmp("save_period", name)) save_period = atoi(val);
+    if (!strcmp("eval_train", name)) eval_train = atoi(val);
     if (!strcmp("task", name)) task = val;
     if (!strcmp("data", name)) train_path = val;
     if (!strcmp("test:data", name)) test_path = val;
@@ -92,6 +93,7 @@ namespace xgboost{
     use_buffer = 1;
     num_round = 10;
     save_period = 0;
+    eval_train = 0;
     dump_model_stats = 0;
     task = "train";
     model_in = "NULL";
@@ -122,9 +124,22 @@ namespace xgboost{
     for (size_t i = 0; i < eval_data_names.size(); ++i){
         deval.push_back(new DMatrix());
         deval.back()->CacheLoad(eval_data_paths[i].c_str(), silent != 0, use_buffer != 0);
+        devalall.push_back(deval.back());
     }
+    std::vector<const DMatrix *> dcache(1, &data);
+    for( size_t i = 0; i < deval.size(); ++i ){
+        dcache.push_back( deval[i] );
+    }
+    // set cache data to be all training and evaluation data
+    learner.SetCacheData(dcache);
+
+    // add training set to evaluation set if needed
+    if( eval_train != 0 ){
+        devalall.push_back( &data );
+        eval_data_names.push_back( std::string("train") );
+    }
+
     }
-    learner.SetData(&data, deval, eval_data_names);
     }
     inline void InitLearner(void){
         cfg.BeforeFirst();
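
The effect of the new eval_train switch: with `eval_train = 1` in the configuration (the usual name = value conf format), the training matrix is appended to devalall under the name "train", so every round's log line also reports training-set metrics next to the validation sets — the cheapest way to watch for over-fitting.
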
@@ -148,8 +163,8 @@ namespace xgboost{
     for (int i = 0; i < num_round; ++i){
         elapsed = (unsigned long)(time(NULL) - start);
         if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
-        learner.UpdateOneIter(i);
-        learner.EvalOneIter(i);
+        learner.UpdateOneIter(data);
+        learner.EvalOneIter(i, devalall, eval_data_names);
         if (save_period != 0 && (i + 1) % save_period == 0){
             this->SaveModel(i);
         }
@@ -169,7 +184,7 @@ namespace xgboost{
         }
     }
     inline void TaskEval(void){
-        learner.EvalOneIter(0);
+        learner.EvalOneIter(0, devalall, eval_data_names);
     }
     inline void TaskInteractive(void){
         const time_t start = time(NULL);
@@ -179,7 +194,7 @@ namespace xgboost{
     cfg_batch.BeforeFirst();
     while (cfg_batch.Next()){
         if (!strcmp(cfg_batch.name(), "run")){
-            learner.UpdateInteract(interact_action);
+            learner.UpdateInteract(interact_action, data);
             batch_action += 1;
         }
         else{
@@ -188,7 +203,7 @@ namespace xgboost{
     }

     if (batch_action == 0){
-        learner.UpdateInteract(interact_action);
+        learner.UpdateInteract(interact_action, data);
     }
     utils::Assert(model_out != "NULL", "interactive mode must specify model_out");
     this->SaveModel(model_out.c_str());
@@ -235,6 +250,8 @@ namespace xgboost{
     int silent;
     /* \brief whether to use the automatic binary buffer */
     int use_buffer;
+    /* \brief whether to evaluate training statistics */
+    int eval_train;
     /* \brief number of boosting iterations */
     int num_round;
     /* \brief the period to save the model; 0 means only save the final round model */
@@ -272,6 +289,7 @@ namespace xgboost{
     private:
         DMatrix data;
         std::vector<DMatrix*> deval;
+        std::vector<const DMatrix*> devalall;
         utils::FeatMap fmap;
         RegRankBoostLearner learner;
     };
@@ -106,8 +106,9 @@ namespace xgboost{
     namespace regrank{
         IObjFunction* CreateObjFunction( const char *name ){
             if( !strcmp("reg", name ) ) return new RegressionObj();
-            if( !strcmp("rank", name ) ) return new PairwiseRankObj();
-            if( !strcmp("softmax", name ) ) return new SoftmaxObj();
+            if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj();
+            if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj();
+            if( !strcmp("softmax", name ) ) return new SoftmaxMultiClassObj();
             utils::Error("unknown objective function type");
             return NULL;
         }
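
The factory now separates the two meanings of softmax, so existing configurations need to migrate their objective names; a quick sketch of the mapping (using only the factory shown above):

    IObjFunction *pairwise = CreateObjFunction("rank:pairwise"); // formerly "rank"
    IObjFunction *listwise = CreateObjFunction("rank:softmax");  // formerly "softmax" (ranking)
    IObjFunction *mclass   = CreateObjFunction("softmax");       // new: multi-class classification
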
@@ -1,12 +1,13 @@
 #ifndef XGBOOST_REGRANK_OBJ_HPP
 #define XGBOOST_REGRANK_OBJ_HPP
 /*!
- * \file xgboost_regrank_obj.h
+ * \file xgboost_regrank_obj.hpp
  * \brief implementation of objective functions
  * \author Tianqi Chen, Kailong Chen
  */
 //#include "xgboost_regrank_sample.h"
 #include <vector>
+#include "xgboost_regrank_utils.h"

 namespace xgboost{
     namespace regrank{
@@ -24,6 +25,7 @@ namespace xgboost{
                    int iter,
                    std::vector<float> &grad,
                    std::vector<float> &hess ) {
+            utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
             grad.resize(preds.size()); hess.resize(preds.size());

             const unsigned ndata = static_cast<unsigned>(preds.size());
@@ -52,11 +54,11 @@ namespace xgboost{

     namespace regrank{
         // simple softmax rank
-        class SoftmaxObj : public IObjFunction{
+        class SoftmaxRankObj : public IObjFunction{
         public:
-            SoftmaxObj(void){
+            SoftmaxRankObj(void){
             }
-            virtual ~SoftmaxObj(){}
+            virtual ~SoftmaxRankObj(){}
             virtual void SetParam(const char *name, const char *val){
             }
             virtual void GetGradient(const std::vector<float>& preds,
@@ -64,6 +66,7 @@ namespace xgboost{
                    int iter,
                    std::vector<float> &grad,
                    std::vector<float> &hess ) {
+            utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
             grad.resize(preds.size()); hess.resize(preds.size());
             const std::vector<unsigned> &gptr = info.group_ptr;
             utils::Assert( gptr.size() != 0 && gptr.back() == preds.size(), "rank loss must have group file" );
@@ -96,23 +99,76 @@ namespace xgboost{
     }
     virtual const char* DefaultEvalMetric(void) {
         return "pre@1";
     }
-private:
-    inline static void Softmax( std::vector<float>& rec ){
-        float wmax = rec[0];
-        for( size_t i = 1; i < rec.size(); ++i ){
-            wmax = std::max( rec[i], wmax );
-        }
-        double wsum = 0.0;
-        for( size_t i = 0; i < rec.size(); ++i ){
-            rec[i] = expf(rec[i]-wmax);
-            wsum += rec[i];
-        }
-        for( size_t i = 0; i < rec.size(); ++i ){
-            rec[i] /= wsum;
-        }
-    }
 };

+// simple softmax multi-class classification
+class SoftmaxMultiClassObj : public IObjFunction{
+public:
+    SoftmaxMultiClassObj(void){
+        nclass = 0;
+    }
+    virtual ~SoftmaxMultiClassObj(){}
+    virtual void SetParam(const char *name, const char *val){
+        if( !strcmp( "num_class", name ) ) nclass = atoi(val);
+    }
+    virtual void GetGradient(const std::vector<float>& preds,
+                             const DMatrix::Info &info,
+                             int iter,
+                             std::vector<float> &grad,
+                             std::vector<float> &hess ) {
+        utils::Assert( nclass != 0, "must set num_class to use softmax" );
+        utils::Assert( preds.size() == (size_t)nclass * info.labels.size(), "SoftmaxMultiClassObj: label size and pred size does not match" );
+        grad.resize(preds.size()); hess.resize(preds.size());
+
+        const unsigned ndata = static_cast<unsigned>(info.labels.size());
+        #pragma omp parallel
+        {
+            std::vector<float> rec(nclass);
+            #pragma omp for schedule(static)
+            for (unsigned j = 0; j < ndata; ++j){
+                // gather the per-class margins of instance j (group-major layout)
+                for( int k = 0; k < nclass; ++k ){
+                    rec[k] = preds[j + k * ndata];
+                }
+                Softmax( rec );
+                int label = static_cast<int>(info.labels[j]);
+                utils::Assert( label < nclass, "SoftmaxMultiClassObj: label exceed num_class" );
+                for( int k = 0; k < nclass; ++k ){
+                    float p = rec[ k ];
+                    if( label == k ){
+                        grad[j+k*ndata] = p - 1.0f;
+                    }else{
+                        grad[j+k*ndata] = p;
+                    }
+                    hess[j+k*ndata] = 2.0f * p * ( 1.0f - p );
+                }
+            }
+        }
+    }
+    virtual void PredTransform(std::vector<float> &preds){
+        utils::Assert( nclass != 0, "must set num_class to use softmax" );
+        utils::Assert( preds.size() % nclass == 0, "SoftmaxMultiClassObj: label size and pred size does not match" );
+        const unsigned ndata = static_cast<unsigned>(preds.size()/nclass);
+        #pragma omp parallel
+        {
+            std::vector<float> rec(nclass);
+            #pragma omp for schedule(static)
+            for (unsigned j = 0; j < ndata; ++j){
+                for( int k = 0; k < nclass; ++k ){
+                    rec[k] = preds[j + k * ndata];
+                }
+                Softmax( rec );
+                // report the class with the highest probability
+                preds[j] = FindMaxIndex( rec );
+            }
+        }
+        preds.resize( ndata );
+    }
+    virtual const char* DefaultEvalMetric(void) {
+        return "error";
+    }
+private:
+    int nclass;
+};
 };
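
For reference, the gradient block above follows from the multi-class cross-entropy; with per-class margins z_k and true class y (a standard derivation, stated here for clarity rather than taken from the diff):

    \ell(z, y) = -\log p_y, \qquad p_k = \frac{e^{z_k - m}}{\sum_j e^{z_j - m}}, \quad m = \max_j z_j
    \frac{\partial \ell}{\partial z_k} = p_k - \mathbb{1}[k = y], \qquad \frac{\partial^2 \ell}{\partial z_k^2} = p_k (1 - p_k)

The max-shift by m is exactly what the removed Softmax helper computes (it presumably now lives in xgboost_regrank_utils.h, alongside FindMaxIndex and the moved comparators), and the code scales the diagonal second derivative by a factor of 2, a damping choice that keeps the boosting steps conservative.
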

     namespace regrank{
@@ -133,6 +189,7 @@ namespace xgboost{
                    int iter,
                    std::vector<float> &grad,
                    std::vector<float> &hess ) {
+            utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
             grad.resize(preds.size()); hess.resize(preds.size());
             const std::vector<unsigned> &gptr = info.group_ptr;
             utils::Assert( gptr.size() != 0 && gptr.back() == preds.size(), "rank loss must have group file" );
@@ -31,7 +31,7 @@ namespace xgboost{
     /*! \brief load feature map from text format */
     inline void LoadText(FILE *fi){
         int fid;
-        char fname[256], ftype[256];
+        char fname[1256], ftype[1256];
         while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3){
             utils::Assert(fid == (int)names_.size(), "invalid fmap format");
             names_.push_back(std::string(fname));
@@ -38,6 +38,7 @@ namespace xgboost{
     namespace utils{
         inline void Error(const char *msg){
             fprintf(stderr, "Error:%s\n", msg);
+            fflush(stderr);
             exit(-1);
         }

@@ -57,7 +58,8 @@ namespace xgboost{
         inline FILE *FopenCheck(const char *fname, const char *flag){
             FILE *fp = fopen64(fname, flag);
             if (fp == NULL){
                 fprintf(stderr, "can not open file \"%s\"\n", fname);
+                fflush(stderr);
                 exit(-1);
             }
             return fp;