Compare commits
166 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
56b1a3301f | ||
|
|
920f9f3565 | ||
|
|
c1a868e7ff | ||
|
|
63c4025656 | ||
|
|
4a622da67b | ||
|
|
b10efa2e4b | ||
|
|
0d6b977395 | ||
|
|
ca4b3b7541 | ||
|
|
4a98205ef1 | ||
|
|
982d16b2b6 | ||
|
|
fde318716f | ||
|
|
094d0a4497 | ||
|
|
d8b0edf133 | ||
|
|
bf5fcec8e8 | ||
|
|
278b788b34 | ||
|
|
76c44072d1 | ||
|
|
ccde443590 | ||
|
|
cf710bfa59 | ||
|
|
be2c3d299e | ||
|
|
2eba59000a | ||
|
|
a958fe8d52 | ||
|
|
96667b8bad | ||
|
|
95f4052aae | ||
|
|
e9e3e0281d | ||
|
|
c23d8c8b88 | ||
|
|
e59f4d5a18 | ||
|
|
e267f4c5f9 | ||
|
|
505e65ac08 | ||
|
|
13fc48623e | ||
|
|
591a43ac0e | ||
|
|
5375ac5c23 | ||
|
|
6930758294 | ||
|
|
e09d6ab9de | ||
|
|
db4a100f6b | ||
|
|
495e37e0dc | ||
|
|
b56b34944e | ||
|
|
d4530b7a47 | ||
|
|
334cf5de9b | ||
|
|
004e8d811e | ||
|
|
4baefd857e | ||
|
|
b52f01d61d | ||
|
|
35f9ef684a | ||
|
|
6f34096613 | ||
|
|
31c5d7843f | ||
|
|
f60dbe299e | ||
|
|
a77debc0c5 | ||
|
|
dc2b9c86e6 | ||
|
|
73bc8c0de4 | ||
|
|
ad8eb21fcd | ||
|
|
416050d5c0 | ||
|
|
d5f6fba82d | ||
|
|
23f4c41035 | ||
|
|
7ea988a76b | ||
|
|
d3c0ed14f3 | ||
|
|
2fcd875675 | ||
|
|
615074efb6 | ||
|
|
945b336fc6 | ||
|
|
8e8b8a8ee3 | ||
|
|
42267807f5 | ||
|
|
df23464a20 | ||
|
|
2ea8d9c511 | ||
|
|
3206235a5e | ||
|
|
956fc09da0 | ||
|
|
da482500c7 | ||
|
|
b19f2bfda8 | ||
|
|
21b21e69de | ||
|
|
b90d1dc92b | ||
|
|
3429ab3447 | ||
|
|
ebcce4a2bf | ||
|
|
1839e6efe9 | ||
|
|
9bc6e83afe | ||
|
|
fd2774e133 | ||
|
|
72d3a6a3cc | ||
|
|
5febbecd88 | ||
|
|
b3c3ecd9c9 | ||
|
|
c28a1be34c | ||
|
|
ae70b9b152 | ||
|
|
e0a0343ae6 | ||
|
|
0e0d3efd6a | ||
|
|
a3bd5000ba | ||
|
|
dd71c0e070 | ||
|
|
d9ea324057 | ||
|
|
0d29610c40 | ||
|
|
0af2c92d3b | ||
|
|
f9cdce077b | ||
|
|
59183b9ed8 | ||
|
|
6ff272eec6 | ||
|
|
c8073e13e4 | ||
|
|
698fa87bc3 | ||
|
|
8f56671901 | ||
|
|
9ea9a7a01e | ||
|
|
d59940f1d5 | ||
|
|
6aa190e10c | ||
|
|
54c486bcf1 | ||
|
|
88ff293de5 | ||
|
|
50af92e29e | ||
|
|
bbe4957cd2 | ||
|
|
789ad18d36 | ||
|
|
2b34d5a25e | ||
|
|
bd574e4967 | ||
|
|
e8d81c1da5 | ||
|
|
c84bbc91d1 | ||
|
|
61e3d1562c | ||
|
|
97db8c29f2 | ||
|
|
f2552f8ef2 | ||
|
|
2563b6d2d6 | ||
|
|
e90ffece67 | ||
|
|
85f92681f9 | ||
|
|
5e0d52cb8c | ||
|
|
c9d156d99e | ||
|
|
ecf6e8f49f | ||
|
|
93778aa4aa | ||
|
|
f8cacc7308 | ||
|
|
c0e1e9fe7a | ||
|
|
fa5afe2141 | ||
|
|
f7789ecf14 | ||
|
|
a57fbe091a | ||
|
|
9f82b53366 | ||
|
|
248b2cf74d | ||
|
|
5fb9376af4 | ||
|
|
9c2bb12cd1 | ||
|
|
ebde99bde8 | ||
|
|
ef7be5398d | ||
|
|
2ef61bf982 | ||
|
|
d4d141347a | ||
|
|
e18ba04751 | ||
|
|
3388d1a8b5 | ||
|
|
65917bb831 | ||
|
|
140499ac9e | ||
|
|
ccd037292d | ||
|
|
59939d0b14 | ||
|
|
9a2c00554d | ||
|
|
ee30c1728b | ||
|
|
8f75b0ef75 | ||
|
|
3128e718e2 | ||
|
|
657c617215 | ||
|
|
439d4725a0 | ||
|
|
8491bb3651 | ||
|
|
cce96e8f41 | ||
|
|
f02dd68713 | ||
|
|
ec14d32756 | ||
|
|
38577d45b0 | ||
|
|
ab0e7a3ddc | ||
|
|
bbd952a021 | ||
|
|
77e3051b1d | ||
|
|
924e164c14 | ||
|
|
25ff5ef169 | ||
|
|
3ea29eccae | ||
|
|
0f8a3d21a5 | ||
|
|
7487c2f668 | ||
|
|
88787b8573 | ||
|
|
17559a90f9 | ||
|
|
24696071a8 | ||
|
|
cca67af8d7 | ||
|
|
2beb92745f | ||
|
|
d6b582dc70 | ||
|
|
218320daf2 | ||
|
|
f83942d3e9 | ||
|
|
60d79eb2e7 | ||
|
|
1136c71e64 | ||
|
|
1bbbb0cf7f | ||
|
|
1756fde0c6 | ||
|
|
7f30fc1468 | ||
|
|
d5607fbb55 | ||
|
|
05d984d83d | ||
|
|
1110ae7421 |
7
.gitignore
vendored
7
.gitignore
vendored
@@ -17,3 +17,10 @@
|
|||||||
*buffer
|
*buffer
|
||||||
*model
|
*model
|
||||||
xgboost
|
xgboost
|
||||||
|
*pyc
|
||||||
|
*train
|
||||||
|
*test
|
||||||
|
*group
|
||||||
|
*rar
|
||||||
|
*vali
|
||||||
|
*data
|
||||||
|
|||||||
2
LICENSE
2
LICENSE
@@ -1,4 +1,4 @@
|
|||||||
Copyright (c) 2014 Tianqi Chen
|
Copyright (c) 2014 by Tianqi Chen and Contributors
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
|
|||||||
3
Makefile
3
Makefile
@@ -10,7 +10,8 @@ OBJ =
|
|||||||
all: $(BIN) $(OBJ)
|
all: $(BIN) $(OBJ)
|
||||||
export LDFLAGS= -pthread -lm
|
export LDFLAGS= -pthread -lm
|
||||||
|
|
||||||
xgboost: regression/xgboost_reg_main.cpp regression/*.h booster/*.h booster/*/*.hpp booster/*.hpp
|
xgboost: regrank/xgboost_regrank_main.cpp regrank/*.h regrank/*.hpp booster/*.h booster/*/*.hpp booster/*.hpp
|
||||||
|
|
||||||
|
|
||||||
$(BIN) :
|
$(BIN) :
|
||||||
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
|
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
|
||||||
|
|||||||
19
README.md
19
README.md
@@ -1,20 +1,23 @@
|
|||||||
xgboost: eXtreme Gradient Boosting
|
xgboost: eXtreme Gradient Boosting
|
||||||
=======
|
=======
|
||||||
A General purpose gradient boosting (tree) library.
|
An optimized general purpose gradient boosting (tree) library.
|
||||||
|
|
||||||
Authors:
|
Contributors: https://github.com/tqchen/xgboost/graphs/contributors
|
||||||
* Tianqi Chen, project creater
|
|
||||||
* Kailong Chen, contributes regression module
|
|
||||||
|
|
||||||
Turorial and Documentation: https://github.com/tqchen/xgboost/wiki
|
Turorial and Documentation: https://github.com/tqchen/xgboost/wiki
|
||||||
|
|
||||||
|
Questions and Issues: [https://github.com/tqchen/xgboost/issues](https://github.com/tqchen/xgboost/issues?q=is%3Aissue+label%3Aquestion)
|
||||||
|
|
||||||
Features
|
Features
|
||||||
=======
|
=======
|
||||||
* Sparse feature format:
|
* Sparse feature format:
|
||||||
- Sparse feature format allows easy handling of missing values, and improve computation efficiency.
|
- Sparse feature format allows easy handling of missing values, and improve computation efficiency.
|
||||||
* Push the limit on single machine:
|
* Push the limit on single machine:
|
||||||
- Efficient implementation that optimizes memory and computation.
|
- Efficient implementation that optimizes memory and computation.
|
||||||
* Layout of gradient boosting algorithm to support generic tasks, see project wiki.
|
* Speed: XGBoost is very fast
|
||||||
|
- IN [demo/higgs/speedtest.py](demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier
|
||||||
|
* Layout of gradient boosting algorithm to support user defined objective
|
||||||
|
* Python interface, works with numpy and scipy.sparse matrix
|
||||||
|
|
||||||
Supported key components
|
Supported key components
|
||||||
=======
|
=======
|
||||||
@@ -33,6 +36,12 @@ Planned components
|
|||||||
- matrix factorization
|
- matrix factorization
|
||||||
- structured prediction
|
- structured prediction
|
||||||
|
|
||||||
|
Build
|
||||||
|
======
|
||||||
|
* Simply type make
|
||||||
|
* If your compiler does not come with OpenMP support, it will fire an warning telling you that the code will compile into single thread mode, and you will get single thread xgboost
|
||||||
|
- You may get a error: -lgomp is not found, you can remove -fopenmp flag in Makefile to get single thread xgboost, or upgrade your compiler to compile multi-thread version
|
||||||
|
|
||||||
File extension convention
|
File extension convention
|
||||||
=======
|
=======
|
||||||
* .h are interface, utils and data structures, with detailed comment;
|
* .h are interface, utils and data structures, with detailed comment;
|
||||||
|
|||||||
@@ -49,9 +49,8 @@ namespace xgboost{
|
|||||||
};
|
};
|
||||||
private:
|
private:
|
||||||
Entry best_entry;
|
Entry best_entry;
|
||||||
const TreeParamTrain ¶m;
|
|
||||||
public:
|
public:
|
||||||
RTSelecter( const TreeParamTrain &p ):param( p ){
|
RTSelecter( void ){
|
||||||
memset( &best_entry, 0, sizeof(best_entry) );
|
memset( &best_entry, 0, sizeof(best_entry) );
|
||||||
best_entry.loss_chg = 0.0f;
|
best_entry.loss_chg = 0.0f;
|
||||||
}
|
}
|
||||||
@@ -211,7 +210,7 @@ namespace xgboost{
|
|||||||
const SCEntry *entry, size_t start, size_t end,
|
const SCEntry *entry, size_t start, size_t end,
|
||||||
int findex, float parent_base_weight ){
|
int findex, float parent_base_weight ){
|
||||||
// local selecter
|
// local selecter
|
||||||
RTSelecter slocal( param );
|
RTSelecter slocal;
|
||||||
|
|
||||||
if( param.need_forward_search() ){
|
if( param.need_forward_search() ){
|
||||||
// forward process, default right
|
// forward process, default right
|
||||||
@@ -320,7 +319,7 @@ namespace xgboost{
|
|||||||
// after this point, tmp_rptr and entry is ready to use
|
// after this point, tmp_rptr and entry is ready to use
|
||||||
|
|
||||||
// global selecter
|
// global selecter
|
||||||
RTSelecter sglobal( param );
|
RTSelecter sglobal;
|
||||||
// gain root
|
// gain root
|
||||||
const double root_gain = param.CalcRootGain( rsum_grad, rsum_hess );
|
const double root_gain = param.CalcRootGain( rsum_grad, rsum_hess );
|
||||||
// KEY: layerwise, weight of current node if it is leaf
|
// KEY: layerwise, weight of current node if it is leaf
|
||||||
|
|||||||
@@ -290,6 +290,7 @@ namespace xgboost{
|
|||||||
}
|
}
|
||||||
// sort columns
|
// sort columns
|
||||||
unsigned ncol = static_cast<unsigned>(this->NumCol());
|
unsigned ncol = static_cast<unsigned>(this->NumCol());
|
||||||
|
#pragma omp parallel for schedule(static)
|
||||||
for (unsigned i = 0; i < ncol; i++){
|
for (unsigned i = 0; i < ncol; i++){
|
||||||
std::sort(&col_data_[col_ptr_[i]], &col_data_[col_ptr_[i + 1]], REntry::cmp_fvalue);
|
std::sort(&col_data_[col_ptr_[i]], &col_data_[col_ptr_[i + 1]], REntry::cmp_fvalue);
|
||||||
}
|
}
|
||||||
@@ -320,6 +321,8 @@ namespace xgboost{
|
|||||||
fi.Read(&col_access, sizeof(int));
|
fi.Read(&col_access, sizeof(int));
|
||||||
if (col_access != 0){
|
if (col_access != 0){
|
||||||
FMatrixS::LoadBinary(fi, col_ptr_, col_data_);
|
FMatrixS::LoadBinary(fi, col_ptr_, col_data_);
|
||||||
|
}else{
|
||||||
|
this->InitData();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
@@ -371,14 +374,14 @@ namespace xgboost{
|
|||||||
size_t nrow;
|
size_t nrow;
|
||||||
utils::Assert(fi.Read(&nrow, sizeof(size_t)) != 0, "Load FMatrixS");
|
utils::Assert(fi.Read(&nrow, sizeof(size_t)) != 0, "Load FMatrixS");
|
||||||
ptr.resize(nrow + 1);
|
ptr.resize(nrow + 1);
|
||||||
utils::Assert( fi.Read( &ptr[0], ptr.size() * sizeof(size_t) ), "Load FMatrixS" );
|
utils::Assert(fi.Read(&ptr[0], ptr.size() * sizeof(size_t)) != 0, "Load FMatrixS");
|
||||||
|
|
||||||
data.resize(ptr.back());
|
data.resize(ptr.back());
|
||||||
if (data.size() != 0){
|
if (data.size() != 0){
|
||||||
utils::Assert( fi.Read( &data[0] , data.size() * sizeof(REntry) ) , "Load FMatrixS" );
|
utils::Assert(fi.Read(&data[0], data.size() * sizeof(REntry)) != 0, "Load FMatrixS");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
protected:
|
public:
|
||||||
/*! \brief row pointer of CSR sparse storage */
|
/*! \brief row pointer of CSR sparse storage */
|
||||||
std::vector<size_t> row_ptr_;
|
std::vector<size_t> row_ptr_;
|
||||||
/*! \brief data in the row */
|
/*! \brief data in the row */
|
||||||
|
|||||||
@@ -88,8 +88,8 @@ namespace xgboost{
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (mparam.num_pbuffer != 0){
|
if (mparam.num_pbuffer != 0){
|
||||||
pred_buffer.resize ( mparam.num_pbuffer );
|
pred_buffer.resize(mparam.PredBufferSize());
|
||||||
pred_counter.resize( mparam.num_pbuffer );
|
pred_counter.resize(mparam.PredBufferSize());
|
||||||
utils::Assert(fi.Read(&pred_buffer[0], pred_buffer.size()*sizeof(float)) != 0);
|
utils::Assert(fi.Read(&pred_buffer[0], pred_buffer.size()*sizeof(float)) != 0);
|
||||||
utils::Assert(fi.Read(&pred_counter[0], pred_counter.size()*sizeof(unsigned)) != 0);
|
utils::Assert(fi.Read(&pred_counter[0], pred_counter.size()*sizeof(unsigned)) != 0);
|
||||||
}
|
}
|
||||||
@@ -117,8 +117,8 @@ namespace xgboost{
|
|||||||
*/
|
*/
|
||||||
inline void InitModel(void){
|
inline void InitModel(void){
|
||||||
pred_buffer.clear(); pred_counter.clear();
|
pred_buffer.clear(); pred_counter.clear();
|
||||||
pred_buffer.resize ( mparam.num_pbuffer, 0.0 );
|
pred_buffer.resize(mparam.PredBufferSize(), 0.0);
|
||||||
pred_counter.resize( mparam.num_pbuffer, 0 );
|
pred_counter.resize(mparam.PredBufferSize(), 0);
|
||||||
utils::Assert(mparam.num_boosters == 0);
|
utils::Assert(mparam.num_boosters == 0);
|
||||||
utils::Assert(boosters.size() == 0);
|
utils::Assert(boosters.size() == 0);
|
||||||
}
|
}
|
||||||
@@ -130,6 +130,7 @@ namespace xgboost{
|
|||||||
if (tparam.nthread != 0){
|
if (tparam.nthread != 0){
|
||||||
omp_set_num_threads(tparam.nthread);
|
omp_set_num_threads(tparam.nthread);
|
||||||
}
|
}
|
||||||
|
if (mparam.num_booster_group == 0) mparam.num_booster_group = 1;
|
||||||
// make sure all the boosters get the latest parameters
|
// make sure all the boosters get the latest parameters
|
||||||
for (size_t i = 0; i < this->boosters.size(); i++){
|
for (size_t i = 0; i < this->boosters.size(); i++){
|
||||||
this->ConfigBooster(this->boosters[i]);
|
this->ConfigBooster(this->boosters[i]);
|
||||||
@@ -175,12 +176,14 @@ namespace xgboost{
|
|||||||
* \param feats features of each instance
|
* \param feats features of each instance
|
||||||
* \param root_index pre-partitioned root index of each instance,
|
* \param root_index pre-partitioned root index of each instance,
|
||||||
* root_index.size() can be 0 which indicates that no pre-partition involved
|
* root_index.size() can be 0 which indicates that no pre-partition involved
|
||||||
|
* \param bst_group which booster group it belongs to, by default, we only have 1 booster group, and leave this parameter as default
|
||||||
*/
|
*/
|
||||||
inline void DoBoost(std::vector<float> &grad,
|
inline void DoBoost(std::vector<float> &grad,
|
||||||
std::vector<float> &hess,
|
std::vector<float> &hess,
|
||||||
const booster::FMatrixS &feats,
|
const booster::FMatrixS &feats,
|
||||||
const std::vector<unsigned> &root_index ) {
|
const std::vector<unsigned> &root_index,
|
||||||
booster::IBooster *bst = this->GetUpdateBooster();
|
int bst_group = 0 ) {
|
||||||
|
booster::IBooster *bst = this->GetUpdateBooster( bst_group );
|
||||||
bst->DoBoost(grad, hess, feats, root_index);
|
bst->DoBoost(grad, hess, feats, root_index);
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
@@ -190,29 +193,42 @@ namespace xgboost{
|
|||||||
* \param row_index row index in the feature matrix
|
* \param row_index row index in the feature matrix
|
||||||
* \param buffer_index the buffer index of the current feature line, default -1 means no buffer assigned
|
* \param buffer_index the buffer index of the current feature line, default -1 means no buffer assigned
|
||||||
* \param root_index root id of current instance, default = 0
|
* \param root_index root id of current instance, default = 0
|
||||||
|
* \param bst_group booster group index
|
||||||
* \return prediction
|
* \return prediction
|
||||||
*/
|
*/
|
||||||
inline float Predict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
|
inline float Predict(const FMatrixS &feats, bst_uint row_index,
|
||||||
size_t istart = 0;
|
int buffer_index = -1, unsigned root_index = 0, int bst_group = 0 ){
|
||||||
|
size_t itop = 0;
|
||||||
float psum = 0.0f;
|
float psum = 0.0f;
|
||||||
|
const int bid = mparam.BufferOffset(buffer_index, bst_group);
|
||||||
|
|
||||||
// load buffered results if any
|
// load buffered results if any
|
||||||
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
|
if (mparam.do_reboost == 0 && bid >= 0){
|
||||||
utils::Assert( buffer_index < mparam.num_pbuffer, "buffer index exceed num_pbuffer" );
|
itop = this->pred_counter[bid];
|
||||||
istart = this->pred_counter[ buffer_index ];
|
psum = this->pred_buffer[bid];
|
||||||
psum = this->pred_buffer [ buffer_index ];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for( size_t i = istart; i < this->boosters.size(); i ++ ){
|
for (size_t i = itop; i < this->boosters.size(); ++i ){
|
||||||
|
if( booster_info[i] == bst_group ){
|
||||||
psum += this->boosters[i]->Predict(feats, row_index, root_index);
|
psum += this->boosters[i]->Predict(feats, row_index, root_index);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
// updated the buffered results
|
// updated the buffered results
|
||||||
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
|
if (mparam.do_reboost == 0 && bid >= 0){
|
||||||
this->pred_counter[ buffer_index ] = static_cast<unsigned>( boosters.size() );
|
this->pred_counter[bid] = static_cast<unsigned>(boosters.size());
|
||||||
this->pred_buffer [ buffer_index ] = psum;
|
this->pred_buffer[bid] = psum;
|
||||||
}
|
}
|
||||||
return psum;
|
return psum;
|
||||||
}
|
}
|
||||||
|
/*! \return number of boosters so far */
|
||||||
|
inline int NumBoosters(void) const{
|
||||||
|
return mparam.num_boosters;
|
||||||
|
}
|
||||||
|
/*! \return number of booster groups */
|
||||||
|
inline int NumBoosterGroup(void) const{
|
||||||
|
if( mparam.num_booster_group == 0 ) return 1;
|
||||||
|
return mparam.num_booster_group;
|
||||||
|
}
|
||||||
public:
|
public:
|
||||||
//--------trial code for interactive update an existing booster------
|
//--------trial code for interactive update an existing booster------
|
||||||
//-------- usually not needed, ignore this region ---------
|
//-------- usually not needed, ignore this region ---------
|
||||||
@@ -220,14 +236,17 @@ namespace xgboost{
|
|||||||
* \brief same as Predict, but removes the prediction of booster to be updated
|
* \brief same as Predict, but removes the prediction of booster to be updated
|
||||||
* this function must be called once and only once for every data with pbuffer
|
* this function must be called once and only once for every data with pbuffer
|
||||||
*/
|
*/
|
||||||
inline float InteractPredict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
|
inline float InteractPredict(const FMatrixS &feats, bst_uint row_index,
|
||||||
|
int buffer_index = -1, unsigned root_index = 0, int bst_group = 0){
|
||||||
float psum = this->Predict(feats, row_index, buffer_index, root_index);
|
float psum = this->Predict(feats, row_index, buffer_index, root_index);
|
||||||
if (tparam.reupdate_booster != -1){
|
if (tparam.reupdate_booster != -1){
|
||||||
const int bid = tparam.reupdate_booster;
|
const int bid = tparam.reupdate_booster;
|
||||||
utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
|
utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
|
||||||
|
if( bst_group == booster_info[bid] ){
|
||||||
psum -= boosters[bid]->Predict(feats, row_index, root_index);
|
psum -= boosters[bid]->Predict(feats, row_index, root_index);
|
||||||
|
}
|
||||||
if (mparam.do_reboost == 0 && buffer_index >= 0){
|
if (mparam.do_reboost == 0 && buffer_index >= 0){
|
||||||
this->pred_buffer[ buffer_index ] = psum;
|
this->pred_buffer[mparam.BufferOffset(buffer_index,bst_group)] = psum;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return psum;
|
return psum;
|
||||||
@@ -243,14 +262,20 @@ namespace xgboost{
|
|||||||
}
|
}
|
||||||
boosters.resize(mparam.num_boosters -= 1);
|
boosters.resize(mparam.num_boosters -= 1);
|
||||||
booster_info.resize(boosters.size());
|
booster_info.resize(boosters.size());
|
||||||
|
// update pred counter
|
||||||
|
for( size_t i = 0; i < pred_counter.size(); ++ i ){
|
||||||
|
if( pred_counter[i] > (unsigned)bid ) pred_counter[i] -= 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
/*! \brief update the prediction buffer, after booster have been updated */
|
/*! \brief update the prediction buffer, after booster have been updated */
|
||||||
inline void InteractRePredict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
|
inline void InteractRePredict(const FMatrixS &feats, bst_uint row_index,
|
||||||
|
int buffer_index = -1, unsigned root_index = 0, int bst_group = 0 ){
|
||||||
if (tparam.reupdate_booster != -1){
|
if (tparam.reupdate_booster != -1){
|
||||||
const int bid = tparam.reupdate_booster;
|
const int bid = tparam.reupdate_booster;
|
||||||
|
if( booster_info[bid] != bst_group ) return;
|
||||||
utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
|
utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
|
||||||
if (mparam.do_reboost == 0 && buffer_index >= 0){
|
if (mparam.do_reboost == 0 && buffer_index >= 0){
|
||||||
this->pred_buffer[ buffer_index ] += boosters[ bid ]->Predict( feats, row_index, root_index );
|
this->pred_buffer[mparam.BufferOffset(buffer_index,bst_group)] += boosters[bid]->Predict(feats, row_index, root_index);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -274,21 +299,23 @@ namespace xgboost{
|
|||||||
* \brief get a booster to update
|
* \brief get a booster to update
|
||||||
* \return the booster created
|
* \return the booster created
|
||||||
*/
|
*/
|
||||||
inline booster::IBooster *GetUpdateBooster( void ){
|
inline booster::IBooster *GetUpdateBooster(int bst_group){
|
||||||
if (tparam.reupdate_booster != -1){
|
if (tparam.reupdate_booster != -1){
|
||||||
const int bid = tparam.reupdate_booster;
|
const int bid = tparam.reupdate_booster;
|
||||||
utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
|
utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
|
||||||
this->ConfigBooster(boosters[bid]);
|
this->ConfigBooster(boosters[bid]);
|
||||||
|
utils::Assert( bst_group == booster_info[bid], "booster group must match existing reupdate booster");
|
||||||
return boosters[bid];
|
return boosters[bid];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mparam.do_reboost == 0 || boosters.size() == 0){
|
if (mparam.do_reboost == 0 || boosters.size() == 0){
|
||||||
mparam.num_boosters += 1;
|
mparam.num_boosters += 1;
|
||||||
boosters.push_back(booster::CreateBooster<FMatrixS>(mparam.booster_type));
|
boosters.push_back(booster::CreateBooster<FMatrixS>(mparam.booster_type));
|
||||||
booster_info.push_back( 0 );
|
booster_info.push_back(bst_group);
|
||||||
this->ConfigBooster(boosters.back());
|
this->ConfigBooster(boosters.back());
|
||||||
boosters.back()->InitModel();
|
boosters.back()->InitModel();
|
||||||
}else{
|
}
|
||||||
|
else{
|
||||||
this->ConfigBooster(boosters.back());
|
this->ConfigBooster(boosters.back());
|
||||||
}
|
}
|
||||||
return boosters.back();
|
return boosters.back();
|
||||||
@@ -311,8 +338,13 @@ namespace xgboost{
|
|||||||
* set to 1 for linear booster, so that regularization term can be considered
|
* set to 1 for linear booster, so that regularization term can be considered
|
||||||
*/
|
*/
|
||||||
int do_reboost;
|
int do_reboost;
|
||||||
|
/*!
|
||||||
|
* \brief number of booster group, how many predictions a single
|
||||||
|
* input instance could corresponds to
|
||||||
|
*/
|
||||||
|
int num_booster_group;
|
||||||
/*! \brief reserved parameters */
|
/*! \brief reserved parameters */
|
||||||
int reserved[ 32 ];
|
int reserved[31];
|
||||||
/*! \brief constructor */
|
/*! \brief constructor */
|
||||||
ModelParam(void){
|
ModelParam(void){
|
||||||
num_boosters = 0;
|
num_boosters = 0;
|
||||||
@@ -320,6 +352,7 @@ namespace xgboost{
|
|||||||
num_roots = num_feature = 0;
|
num_roots = num_feature = 0;
|
||||||
do_reboost = 0;
|
do_reboost = 0;
|
||||||
num_pbuffer = 0;
|
num_pbuffer = 0;
|
||||||
|
num_booster_group = 1;
|
||||||
memset(reserved, 0, sizeof(reserved));
|
memset(reserved, 0, sizeof(reserved));
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
@@ -335,9 +368,20 @@ namespace xgboost{
|
|||||||
}
|
}
|
||||||
if (!strcmp("num_pbuffer", name)) num_pbuffer = atoi(val);
|
if (!strcmp("num_pbuffer", name)) num_pbuffer = atoi(val);
|
||||||
if (!strcmp("do_reboost", name)) do_reboost = atoi(val);
|
if (!strcmp("do_reboost", name)) do_reboost = atoi(val);
|
||||||
|
if (!strcmp("num_booster_group", name)) num_booster_group = atoi(val);
|
||||||
if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
|
if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
|
||||||
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
|
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
|
||||||
}
|
}
|
||||||
|
inline int PredBufferSize(void) const{
|
||||||
|
if (num_booster_group == 0) return num_pbuffer;
|
||||||
|
else return num_booster_group * num_pbuffer;
|
||||||
|
}
|
||||||
|
inline int BufferOffset( int buffer_index, int bst_group ) const{
|
||||||
|
if( buffer_index < 0 ) return -1;
|
||||||
|
utils::Assert( buffer_index < num_pbuffer, "buffer_indexexceed num_pbuffer" );
|
||||||
|
return buffer_index + num_pbuffer * bst_group;
|
||||||
|
|
||||||
|
}
|
||||||
};
|
};
|
||||||
/*! \brief training parameters */
|
/*! \brief training parameters */
|
||||||
struct TrainParam{
|
struct TrainParam{
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ def loadfmap( fname ):
|
|||||||
return fmap, nmap
|
return fmap, nmap
|
||||||
|
|
||||||
def write_nmap( fo, nmap ):
|
def write_nmap( fo, nmap ):
|
||||||
for i in xrange( len(nmap) ):
|
for i in range( len(nmap) ):
|
||||||
fo.write('%d\t%s\ti\n' % (i, nmap[i]) )
|
fo.write('%d\t%s\ti\n' % (i, nmap[i]) )
|
||||||
|
|
||||||
# start here
|
# start here
|
||||||
@@ -41,7 +41,7 @@ for l in open( 'agaricus-lepiota.data' ):
|
|||||||
else:
|
else:
|
||||||
assert arr[0] == 'e'
|
assert arr[0] == 'e'
|
||||||
fo.write('0')
|
fo.write('0')
|
||||||
for i in xrange( 1,len(arr) ):
|
for i in range( 1,len(arr) ):
|
||||||
fo.write( ' %d:1' % fmap[i][arr[i].strip()] )
|
fo.write( ' %d:1' % fmap[i][arr[i].strip()] )
|
||||||
fo.write('\n')
|
fo.write('\n')
|
||||||
|
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import sys
|
|||||||
import random
|
import random
|
||||||
|
|
||||||
if len(sys.argv) < 2:
|
if len(sys.argv) < 2:
|
||||||
print 'Usage:<filename> <k> [nfold = 5]'
|
print ('Usage:<filename> <k> [nfold = 5]')
|
||||||
exit(0)
|
exit(0)
|
||||||
|
|
||||||
random.seed( 10 )
|
random.seed( 10 )
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# choose the tree booster, 0: tree, 1: linear
|
# choose the tree booster, 0: tree, 1: linear
|
||||||
booster_type = 0
|
booster_type = 0
|
||||||
# choose logistic regression loss function for binary classification
|
# choose logistic regression loss function for binary classification
|
||||||
loss_type = 2
|
objective = binary:logistic
|
||||||
|
|
||||||
# Tree Booster Parameters
|
# Tree Booster Parameters
|
||||||
# step size shrinkage
|
# step size shrinkage
|
||||||
@@ -23,5 +23,7 @@ save_period = 0
|
|||||||
data = "agaricus.txt.train"
|
data = "agaricus.txt.train"
|
||||||
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
|
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
|
||||||
eval[test] = "agaricus.txt.test"
|
eval[test] = "agaricus.txt.test"
|
||||||
|
# evaluate on training data as well each round
|
||||||
|
eval_train = 1
|
||||||
# The path of test data
|
# The path of test data
|
||||||
test:data = "agaricus.txt.test"
|
test:data = "agaricus.txt.test"
|
||||||
|
|||||||
19
demo/kaggle-higgs/README.md
Normal file
19
demo/kaggle-higgs/README.md
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
Guide for Kaggle Higgs Challenge
|
||||||
|
=====
|
||||||
|
|
||||||
|
This is the folder giving example of how to use XGBoost Python Module to run Kaggle Higgs competition
|
||||||
|
|
||||||
|
This script will achieve about 3.600 AMS score in public leadboard. To get start, you need do following step:
|
||||||
|
|
||||||
|
1. Compile the XGBoost python lib
|
||||||
|
```bash
|
||||||
|
cd ../../python
|
||||||
|
make
|
||||||
|
```
|
||||||
|
2. Put training.csv test.csv on folder './data' (you can create a symbolic link)
|
||||||
|
|
||||||
|
3. Run ./run.sh
|
||||||
|
|
||||||
|
Speed
|
||||||
|
=====
|
||||||
|
speedtest.py compares xgboost's speed on this dataset with sklearn.GBM
|
||||||
62
demo/kaggle-higgs/higgs-numpy.py
Executable file
62
demo/kaggle-higgs/higgs-numpy.py
Executable file
@@ -0,0 +1,62 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
# this is the example script to use xgboost to train
|
||||||
|
import inspect
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import numpy as np
|
||||||
|
# add path of xgboost python module
|
||||||
|
code_path = os.path.join(
|
||||||
|
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../python")
|
||||||
|
|
||||||
|
sys.path.append(code_path)
|
||||||
|
|
||||||
|
import xgboost as xgb
|
||||||
|
|
||||||
|
test_size = 550000
|
||||||
|
|
||||||
|
# path to where the data lies
|
||||||
|
dpath = 'data'
|
||||||
|
|
||||||
|
# load in training data, directly use numpy
|
||||||
|
dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } )
|
||||||
|
print ('finish loading from csv ')
|
||||||
|
|
||||||
|
label = dtrain[:,32]
|
||||||
|
data = dtrain[:,1:31]
|
||||||
|
# rescale weight to make it same as test set
|
||||||
|
weight = dtrain[:,31] * float(test_size) / len(label)
|
||||||
|
|
||||||
|
sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 )
|
||||||
|
sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0 )
|
||||||
|
|
||||||
|
# print weight statistics
|
||||||
|
print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))
|
||||||
|
|
||||||
|
# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
|
||||||
|
xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
|
||||||
|
|
||||||
|
# setup parameters for xgboost
|
||||||
|
param = {}
|
||||||
|
# use logistic regression loss, use raw prediction before logistic transformation
|
||||||
|
# since we only need the rank
|
||||||
|
param['objective'] = 'binary:logitraw'
|
||||||
|
# scale weight of positive examples
|
||||||
|
param['scale_pos_weight'] = sum_wneg/sum_wpos
|
||||||
|
param['bst:eta'] = 0.1
|
||||||
|
param['bst:max_depth'] = 6
|
||||||
|
param['eval_metric'] = 'auc'
|
||||||
|
param['silent'] = 1
|
||||||
|
param['nthread'] = 16
|
||||||
|
|
||||||
|
# you can directly throw param in, though we want to watch multiple metrics here
|
||||||
|
plst = list(param.items())+[('eval_metric', 'ams@0.15')]
|
||||||
|
|
||||||
|
watchlist = [ (xgmat,'train') ]
|
||||||
|
# boost 120 tres
|
||||||
|
num_round = 120
|
||||||
|
print ('loading data end, start to boost trees')
|
||||||
|
bst = xgb.train( plst, xgmat, num_round, watchlist );
|
||||||
|
# save out model
|
||||||
|
bst.save_model('higgs.model')
|
||||||
|
|
||||||
|
print ('finish training')
|
||||||
54
demo/kaggle-higgs/higgs-pred.py
Executable file
54
demo/kaggle-higgs/higgs-pred.py
Executable file
@@ -0,0 +1,54 @@
|
|||||||
|
#!/usr/bin/python
# Make predictions for the Kaggle Higgs test set with a trained xgboost
# model and write a submission file in the Kaggle CSV format.
import sys
import numpy as np
# add path of xgboost python module
sys.path.append('../../python/')
import xgboost as xgb

# path to where the data lies
dpath = 'data'

modelfile = 'higgs.model'
outfile = 'higgs.pred.csv'
# make top 15% as positive
threshold_ratio = 0.15

# load in test data, directly use numpy
dtest = np.loadtxt( dpath+'/test.csv', delimiter=',', skiprows=1 )
data = dtest[:,1:31]
idx = dtest[:,0]

print ('finish loading from csv ')
xgmat = xgb.DMatrix( data, missing = -999.0 )
bst = xgb.Booster({'nthread':16})
bst.load_model( modelfile )
ypred = bst.predict( xgmat )

# pair each event id with its raw prediction score
res = [ ( int(idx[i]), ypred[i] ) for i in range(len(ypred)) ]

# rank events by descending score: rank 1 = highest score
rorder = {}
for k, v in sorted( res, key = lambda x:-x[1] ):
    rorder[ k ] = len(rorder) + 1

# write out predictions: label the top threshold_ratio fraction as signal 's'
ntop = int( threshold_ratio * len(rorder) )
nhit = 0
ntot = 0
# use a context manager so the output file is closed even if writing fails
with open(outfile, 'w') as fo:
    fo.write('EventId,RankOrder,Class\n')
    for k, v in res:
        if rorder[k] <= ntop:
            lb = 's'
            nhit += 1
        else:
            lb = 'b'
        # change output rank order to follow Kaggle convention
        fo.write('%s,%d,%s\n' % ( k, len(rorder)+1-rorder[k], lb ) )
        ntot += 1

print ('finished writing into prediction file')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
14
demo/kaggle-higgs/run.sh
Executable file
14
demo/kaggle-higgs/run.sh
Executable file
@@ -0,0 +1,14 @@
|
|||||||
|
#!/bin/bash
# Train the Higgs model, then produce the submission file.
# Abort and propagate the exit status if either step fails.

python -u higgs-numpy.py
status=$?
if [ "$status" -ne 0 ]; then
    echo "ERROR in higgs-numpy.py"
    exit "$status"
fi

python -u higgs-pred.py
status=$?
if [ "$status" -ne 0 ]; then
    echo "ERROR in higgs-pred.py"
    exit "$status"
fi
|
||||||
66
demo/kaggle-higgs/speedtest.py
Executable file
66
demo/kaggle-higgs/speedtest.py
Executable file
@@ -0,0 +1,66 @@
|
|||||||
|
#!/usr/bin/python
# Speed comparison on the Kaggle Higgs training data:
# sklearn GradientBoostingClassifier vs xgboost, sweeping thread counts.
import sys
import numpy as np
# add path of xgboost python module
sys.path.append('../../python/')
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
import time

test_size = 550000

# path to where the data lies
dpath = 'data'

# load in training data, directly use numpy; the 's'/'b' label column is
# converted to 1/0 on the fly
dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } )
print ('finish loading from csv ')

label = dtrain[:,32]
data = dtrain[:,1:31]
# rescale weight to make it same as test set
weight = dtrain[:,31] * float(test_size) / len(label)

# total weight of positive / negative examples (vectorized)
sum_wpos = weight[label == 1.0].sum()
sum_wneg = weight[label == 0.0].sum()

# print weight statistics
print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))

# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )

# setup parameters for xgboost
param = {}
# use logistic regression loss
param['objective'] = 'binary:logitraw'
# scale weight of positive examples
param['scale_pos_weight'] = sum_wneg/sum_wpos
param['bst:eta'] = 0.1
param['bst:max_depth'] = 6
param['eval_metric'] = 'auc'
param['silent'] = 1
param['nthread'] = 4

# NOTE: wrap dict.items() in list() -- under Python 3 it is a view object
# and concatenating it with a list raises TypeError
plst = list(param.items())+[('eval_metric', 'ams@0.15')]

watchlist = [ (xgmat,'train') ]
# boost 10 trees
num_round = 10
print ('loading data end, start to boost trees')
print ("training GBM from sklearn")
tmp = time.time()
gbm = GradientBoostingClassifier(n_estimators=num_round, max_depth=6, verbose=2)
gbm.fit(data, label)
print ("sklearn.GBM costs: %s seconds" % str(time.time() - tmp))
#raw_input()
print ("training xgboost")
threads = [1, 2, 4, 16]
for i in threads:
    param['nthread'] = i
    tmp = time.time()
    # rebuild the parameter list so the new nthread value is picked up
    plst = list(param.items())+[('eval_metric', 'ams@0.15')]
    bst = xgb.train( plst, xgmat, num_round, watchlist )
    print ("XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp)))

print ('finish training')
|
||||||
10
demo/multiclass_classification/README.md
Normal file
10
demo/multiclass_classification/README.md
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
Demonstrating how to use XGBoost to accomplish a multi-class classification task on the [UCI Dermatology dataset](https://archive.ics.uci.edu/ml/datasets/Dermatology)
|
||||||
|
|
||||||
|
Make sure you have built the xgboost python module in ../../python
|
||||||
|
|
||||||
|
1. Run runexp.sh
|
||||||
|
```bash
|
||||||
|
./runexp.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Explanations can be found in the [wiki](https://github.com/tqchen/xgboost/wiki)
|
||||||
9
demo/multiclass_classification/runexp.sh
Executable file
9
demo/multiclass_classification/runexp.sh
Executable file
@@ -0,0 +1,9 @@
|
|||||||
|
#!/bin/bash
# Fetch the UCI Dermatology dataset if it is not present, then run the
# multi-class classification demo.
if [ ! -f dermatology.data ]; then
    echo "getting data from uci, make sure you are connected to internet"
    wget https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data
else
    echo "use existing data to run multi class classification"
fi
python train.py
|
||||||
49
demo/multiclass_classification/train.py
Executable file
49
demo/multiclass_classification/train.py
Executable file
@@ -0,0 +1,49 @@
|
|||||||
|
#! /usr/bin/python
# Multi-class classification demo on the UCI Dermatology dataset.
import sys
import numpy as np
sys.path.append('../../python/')
import xgboost as xgb

# label need to be 0 to num_class -1: '?' in column 33 maps to 1,
# the 1-based class id in column 34 shifts to 0-based
data = np.loadtxt('./dermatology.data', delimiter=',',
                  converters={33: lambda x: int(x == '?'), 34: lambda x: int(x) - 1})
nrow = data.shape[0]
cut = int(nrow * 0.7)

# 70/30 train/test split
train = data[:cut, :]
test = data[cut:, :]

train_X, train_Y = train[:, 0:33], train[:, 34]
test_X, test_Y = test[:, 0:33], test[:, 34]

xg_train = xgb.DMatrix(train_X, label=train_Y)
xg_test = xgb.DMatrix(test_X, label=test_Y)

# setup parameters for xgboost: softmax multi-class classification
param = {
    'objective': 'multi:softmax',
    'bst:eta': 0.1,
    'bst:max_depth': 6,
    'silent': 1,
    'nthread': 4,
    'num_class': 6,
}

watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 5
bst = xgb.train(param, xg_train, num_round, watchlist)
# get prediction
pred = bst.predict(xg_test)

err = sum(int(pred[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y))
print ('predicting, classification error=%f' % err)

# do the same thing again, but output probabilities
param['objective'] = 'multi:softprob'
bst = xgb.train(param, xg_train, num_round, watchlist)
# prediction comes back as a 1D array, reshape to (nclass, ndata)
yprob = bst.predict(xg_test).reshape(6, test_Y.shape[0])
ylabel = np.argmax(yprob, axis=0)

err = sum(int(ylabel[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y))
print ('predicting, classification error=%f' % err)
|
||||||
13
demo/rank/README
Normal file
13
demo/rank/README
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
Instructions:
|
||||||
|
The dataset for ranking demo is from LETOR04 MQ2008 fold1,
|
||||||
|
You can use the following command to run the example
|
||||||
|
|
||||||
|
|
||||||
|
Get the data: ./wgetdata.sh
|
||||||
|
Run the example: ./runexp.sh
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
30
demo/rank/mq2008.conf
Normal file
30
demo/rank/mq2008.conf
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
# General Parameters, see comment for each definition
|
||||||
|
# choose the tree booster, 0: tree, 1: linear
|
||||||
|
booster_type = 0
|
||||||
|
|
||||||
|
# specify objective
|
||||||
|
objective="rank:pairwise"
|
||||||
|
|
||||||
|
# Tree Booster Parameters
|
||||||
|
# step size shrinkage
|
||||||
|
bst:eta = 0.1
|
||||||
|
# minimum loss reduction required to make a further partition
|
||||||
|
bst:gamma = 1.0
|
||||||
|
# minimum sum of instance weight(hessian) needed in a child
|
||||||
|
bst:min_child_weight = 0.1
|
||||||
|
# maximum depth of a tree
|
||||||
|
bst:max_depth = 6
|
||||||
|
|
||||||
|
# Task parameters
|
||||||
|
# the number of round to do boosting
|
||||||
|
num_round = 4
|
||||||
|
# 0 means do not save any model except the final round model
|
||||||
|
save_period = 0
|
||||||
|
# The path of training data
|
||||||
|
data = "mq2008.train"
|
||||||
|
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
|
||||||
|
eval[test] = "mq2008.vali"
|
||||||
|
# The path of test data
|
||||||
|
test:data = "mq2008.test"
|
||||||
|
|
||||||
|
|
||||||
11
demo/rank/runexp.sh
Executable file
11
demo/rank/runexp.sh
Executable file
@@ -0,0 +1,11 @@
|
|||||||
|
# Convert each LETOR split from RankSVM format into xgboost feature +
# group files, then train and predict with the ranking config.
for split in train test vali; do
    python trans_data.py ${split}.txt mq2008.${split} mq2008.${split}.group
done

../../xgboost mq2008.conf

../../xgboost mq2008.conf task=pred model_in=0004.model
|
||||||
|
|
||||||
|
|
||||||
41
demo/rank/trans_data.py
Normal file
41
demo/rank/trans_data.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
import sys
|
||||||
|
|
||||||
|
def save_data(group_data,output_feature,output_group):
    """Write one query group: its size goes to the group file, and one
    feature line per instance goes to the feature file (features whose
    value is zero are dropped)."""
    if not group_data:
        return

    output_group.write("%d\n" % len(group_data))
    for row in group_data:
        # only include nonzero features
        kept = [feat for feat in row[2:] if float(feat.split(':')[1]) != 0.0]
        output_feature.write(row[0] + " " + " ".join(kept) + "\n")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    if len(sys.argv) != 4:
        print ("Usage: python trans_data.py [Ranksvm Format Input] [Output Feature File] [Output Group File]")
        sys.exit(0)

    fi = open(sys.argv[1])
    output_feature = open(sys.argv[2],"w")
    output_group = open(sys.argv[3],"w")

    group_data = []
    group = ""
    for line in fi:
        if not line:
            break
        # strip trailing comment
        if "#" in line:
            line = line[:line.index("#")]
        splits = line.strip().split(" ")
        # skip blank or comment-only lines: they have no qid field and
        # would raise IndexError on splits[1]
        if len(splits) < 2:
            continue
        # a new qid starts a new group: flush the previous one
        if splits[1] != group:
            save_data(group_data,output_feature,output_group)
            group_data = []
        group = splits[1]
        group_data.append(splits)

    # flush the final group
    save_data(group_data,output_feature,output_group)

    fi.close()
    output_feature.close()
    output_group.close()
|
||||||
|
|
||||||
4
demo/rank/wgetdata.sh
Executable file
4
demo/rank/wgetdata.sh
Executable file
@@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/bash
# Download the LETOR 4.0 MQ2008 archive, unpack it, and move the
# fold-1 split files into the current directory.
wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2008.rar
unrar x MQ2008.rar
mv -f MQ2008/Fold1/*.txt .
|
||||||
@@ -1,9 +1,9 @@
|
|||||||
# General Parameters, see comment for each definition
|
# General Parameters, see comment for each definition
|
||||||
# choose the tree booster, 0: tree, 1: linear
|
# choose the tree booster, 0: tree, 1: linear
|
||||||
booster_type = 0
|
booster_type = 0
|
||||||
# this is the only difference with classification, use 0: linear regression
|
# this is the only difference with classification, use reg:linear to do linear classification
|
||||||
# when labels are in [0,1] we can also use 1: logistic regression
|
# when labels are in [0,1] we can also use reg:logistic
|
||||||
loss_type = 0
|
objective = reg:linear
|
||||||
|
|
||||||
# Tree Booster Parameters
|
# Tree Booster Parameters
|
||||||
# step size shrinkage
|
# step size shrinkage
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ fmap = {}
|
|||||||
for l in open( 'machine.data' ):
|
for l in open( 'machine.data' ):
|
||||||
arr = l.split(',')
|
arr = l.split(',')
|
||||||
fo.write(arr[8])
|
fo.write(arr[8])
|
||||||
for i in xrange( 0,6 ):
|
for i in range( 0,6 ):
|
||||||
fo.write( ' %d:%s' %(i,arr[i+2]) )
|
fo.write( ' %d:%s' %(i,arr[i+2]) )
|
||||||
|
|
||||||
if arr[0] not in fmap:
|
if arr[0] not in fmap:
|
||||||
@@ -24,9 +24,9 @@ fo = open('featmap.txt', 'w')
|
|||||||
# list from machine.names
|
# list from machine.names
|
||||||
names = ['vendor','MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP' ];
|
names = ['vendor','MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP' ];
|
||||||
|
|
||||||
for i in xrange(0,6):
|
for i in range(0,6):
|
||||||
fo.write( '%d\t%s\tint\n' % (i, names[i+1]))
|
fo.write( '%d\t%s\tint\n' % (i, names[i+1]))
|
||||||
|
|
||||||
for v, k in sorted( fmap.iteritems(), key = lambda x:x[1] ):
|
for v, k in sorted( fmap.items(), key = lambda x:x[1] ):
|
||||||
fo.write( '%d\tvendor=%s\ti\n' % (k, v))
|
fo.write( '%d\tvendor=%s\ti\n' % (k, v))
|
||||||
fo.close()
|
fo.close()
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import sys
|
|||||||
import random
|
import random
|
||||||
|
|
||||||
if len(sys.argv) < 2:
|
if len(sys.argv) < 2:
|
||||||
print 'Usage:<filename> <k> [nfold = 5]'
|
print ('Usage:<filename> <k> [nfold = 5]')
|
||||||
exit(0)
|
exit(0)
|
||||||
|
|
||||||
random.seed( 10 )
|
random.seed( 10 )
|
||||||
|
|||||||
26
python/Makefile
Normal file
26
python/Makefile
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
export CC = gcc
export CXX = g++
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp

# name of the python wrapper shared library built by this Makefile
SLIB = libxgboostpy.so
.PHONY: clean all

all: $(SLIB)
export LDFLAGS= -pthread -lm

# dependency-only rule: relink the shared library whenever the wrapper
# source or any xgboost header changes (the recipe is under $(SLIB) below)
libxgboostpy.so: xgboost_python.cpp ../regrank/*.h ../booster/*.h ../booster/*/*.hpp ../booster/*.hpp

# generic recipe for shared libraries: position-independent code, -shared
$(SLIB) :
	$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
# generic recipe for executables
$(BIN) :
	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)

# generic recipe for object files (compiles the first source prerequisite)
$(OBJ) :
	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

install:
	cp -f -r $(BIN) $(INSTALL_PATH)

clean:
	$(RM) $(OBJ) $(BIN) $(SLIB) *~
||||||
3
python/README.md
Normal file
3
python/README.md
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
python wrapper for xgboost using ctypes
|
||||||
|
|
||||||
|
see example for usage
|
||||||
3
python/example/README.md
Normal file
3
python/example/README.md
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
example to use python xgboost, the data is generated from demo/binary_classification, in libsvm format
|
||||||
|
|
||||||
|
for usage: see demo.py and comments in demo.py
|
||||||
1611
python/example/agaricus.txt.test
Normal file
1611
python/example/agaricus.txt.test
Normal file
File diff suppressed because it is too large
Load Diff
6513
python/example/agaricus.txt.train
Normal file
6513
python/example/agaricus.txt.train
Normal file
File diff suppressed because it is too large
Load Diff
96
python/example/demo.py
Executable file
96
python/example/demo.py
Executable file
@@ -0,0 +1,96 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
import sys
|
||||||
|
import numpy as np
|
||||||
|
import scipy.sparse
|
||||||
|
# append the path to xgboost, you may need to change the following line
|
||||||
|
sys.path.append('../')
|
||||||
|
import xgboost as xgb
|
||||||
|
|
||||||
|
### simple example
|
||||||
|
# load file from text file, also binary buffer generated by xgboost
|
||||||
|
dtrain = xgb.DMatrix('agaricus.txt.train')
|
||||||
|
dtest = xgb.DMatrix('agaricus.txt.test')
|
||||||
|
|
||||||
|
# specify parameters via map, definition are same as c++ version
|
||||||
|
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
|
||||||
|
|
||||||
|
# specify validations set to watch performance
|
||||||
|
evallist = [(dtest,'eval'), (dtrain,'train')]
|
||||||
|
num_round = 2
|
||||||
|
bst = xgb.train( param, dtrain, num_round, evallist )
|
||||||
|
|
||||||
|
# this is prediction
|
||||||
|
preds = bst.predict( dtest )
|
||||||
|
labels = dtest.get_label()
|
||||||
|
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
|
||||||
|
bst.save_model('0001.model')
|
||||||
|
# dump model
|
||||||
|
bst.dump_model('dump.raw.txt')
|
||||||
|
# dump model with feature map
|
||||||
|
bst.dump_model('dump.raw.txt','featmap.txt')
|
||||||
|
|
||||||
|
###
|
||||||
|
# build dmatrix in python iteratively
|
||||||
|
#
|
||||||
|
print ('start running example of build DMatrix in python')
|
||||||
|
dtrain = xgb.DMatrix()
|
||||||
|
labels = []
|
||||||
|
for l in open('agaricus.txt.train'):
|
||||||
|
arr = l.split()
|
||||||
|
labels.append( int(arr[0]))
|
||||||
|
feats = []
|
||||||
|
for it in arr[1:]:
|
||||||
|
k,v = it.split(':')
|
||||||
|
feats.append( (int(k), float(v)) )
|
||||||
|
dtrain.add_row( feats )
|
||||||
|
dtrain.set_label( labels )
|
||||||
|
evallist = [(dtest,'eval'), (dtrain,'train')]
|
||||||
|
|
||||||
|
bst = xgb.train( param, dtrain, num_round, evallist )
|
||||||
|
|
||||||
|
###
|
||||||
|
# build dmatrix from scipy.sparse
|
||||||
|
print ('start running example of build DMatrix from scipy.sparse')
|
||||||
|
labels = []
|
||||||
|
row = []; col = []; dat = []
|
||||||
|
i = 0
|
||||||
|
for l in open('agaricus.txt.train'):
|
||||||
|
arr = l.split()
|
||||||
|
labels.append( int(arr[0]))
|
||||||
|
for it in arr[1:]:
|
||||||
|
k,v = it.split(':')
|
||||||
|
row.append(i); col.append(int(k)); dat.append(float(v))
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
|
||||||
|
dtrain = xgb.DMatrix( csr )
|
||||||
|
dtrain.set_label(labels)
|
||||||
|
evallist = [(dtest,'eval'), (dtrain,'train')]
|
||||||
|
bst = xgb.train( param, dtrain, num_round, evallist )
|
||||||
|
|
||||||
|
print ('start running example of build DMatrix from numpy array')
|
||||||
|
# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation,then convert to DMatrix
|
||||||
|
npymat = csr.todense()
|
||||||
|
dtrain = xgb.DMatrix( npymat )
|
||||||
|
dtrain.set_label(labels)
|
||||||
|
evallist = [(dtest,'eval'), (dtrain,'train')]
|
||||||
|
bst = xgb.train( param, dtrain, num_round, evallist )
|
||||||
|
|
||||||
|
###
|
||||||
|
# advanced: customized loss function, set loss_type to 0, so that predict gets the untransformed score
|
||||||
|
#
|
||||||
|
print ('start running example to used cutomized objective function')
|
||||||
|
|
||||||
|
# note: set objective= binary:logistic means the prediction will get logistic transformed
|
||||||
|
# in most case, we may want to leave it as default
|
||||||
|
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
|
||||||
|
|
||||||
|
# user define objective function, given prediction, return gradient and second order gradient
|
||||||
|
def logregobj( preds, dtrain ):
|
||||||
|
labels = dtrain.get_label()
|
||||||
|
grad = preds - labels
|
||||||
|
hess = preds * (1.0-preds)
|
||||||
|
return grad, hess
|
||||||
|
|
||||||
|
# training with customized objective, we can also do step by step training, simply look at xgboost.py's implementation of train
|
||||||
|
bst = xgb.train( param, dtrain, num_round, evallist, logregobj )
|
||||||
126
python/example/featmap.txt
Normal file
126
python/example/featmap.txt
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
0 cap-shape=bell i
|
||||||
|
1 cap-shape=conical i
|
||||||
|
2 cap-shape=convex i
|
||||||
|
3 cap-shape=flat i
|
||||||
|
4 cap-shape=knobbed i
|
||||||
|
5 cap-shape=sunken i
|
||||||
|
6 cap-surface=fibrous i
|
||||||
|
7 cap-surface=grooves i
|
||||||
|
8 cap-surface=scaly i
|
||||||
|
9 cap-surface=smooth i
|
||||||
|
10 cap-color=brown i
|
||||||
|
11 cap-color=buff i
|
||||||
|
12 cap-color=cinnamon i
|
||||||
|
13 cap-color=gray i
|
||||||
|
14 cap-color=green i
|
||||||
|
15 cap-color=pink i
|
||||||
|
16 cap-color=purple i
|
||||||
|
17 cap-color=red i
|
||||||
|
18 cap-color=white i
|
||||||
|
19 cap-color=yellow i
|
||||||
|
20 bruises?=bruises i
|
||||||
|
21 bruises?=no i
|
||||||
|
22 odor=almond i
|
||||||
|
23 odor=anise i
|
||||||
|
24 odor=creosote i
|
||||||
|
25 odor=fishy i
|
||||||
|
26 odor=foul i
|
||||||
|
27 odor=musty i
|
||||||
|
28 odor=none i
|
||||||
|
29 odor=pungent i
|
||||||
|
30 odor=spicy i
|
||||||
|
31 gill-attachment=attached i
|
||||||
|
32 gill-attachment=descending i
|
||||||
|
33 gill-attachment=free i
|
||||||
|
34 gill-attachment=notched i
|
||||||
|
35 gill-spacing=close i
|
||||||
|
36 gill-spacing=crowded i
|
||||||
|
37 gill-spacing=distant i
|
||||||
|
38 gill-size=broad i
|
||||||
|
39 gill-size=narrow i
|
||||||
|
40 gill-color=black i
|
||||||
|
41 gill-color=brown i
|
||||||
|
42 gill-color=buff i
|
||||||
|
43 gill-color=chocolate i
|
||||||
|
44 gill-color=gray i
|
||||||
|
45 gill-color=green i
|
||||||
|
46 gill-color=orange i
|
||||||
|
47 gill-color=pink i
|
||||||
|
48 gill-color=purple i
|
||||||
|
49 gill-color=red i
|
||||||
|
50 gill-color=white i
|
||||||
|
51 gill-color=yellow i
|
||||||
|
52 stalk-shape=enlarging i
|
||||||
|
53 stalk-shape=tapering i
|
||||||
|
54 stalk-root=bulbous i
|
||||||
|
55 stalk-root=club i
|
||||||
|
56 stalk-root=cup i
|
||||||
|
57 stalk-root=equal i
|
||||||
|
58 stalk-root=rhizomorphs i
|
||||||
|
59 stalk-root=rooted i
|
||||||
|
60 stalk-root=missing i
|
||||||
|
61 stalk-surface-above-ring=fibrous i
|
||||||
|
62 stalk-surface-above-ring=scaly i
|
||||||
|
63 stalk-surface-above-ring=silky i
|
||||||
|
64 stalk-surface-above-ring=smooth i
|
||||||
|
65 stalk-surface-below-ring=fibrous i
|
||||||
|
66 stalk-surface-below-ring=scaly i
|
||||||
|
67 stalk-surface-below-ring=silky i
|
||||||
|
68 stalk-surface-below-ring=smooth i
|
||||||
|
69 stalk-color-above-ring=brown i
|
||||||
|
70 stalk-color-above-ring=buff i
|
||||||
|
71 stalk-color-above-ring=cinnamon i
|
||||||
|
72 stalk-color-above-ring=gray i
|
||||||
|
73 stalk-color-above-ring=orange i
|
||||||
|
74 stalk-color-above-ring=pink i
|
||||||
|
75 stalk-color-above-ring=red i
|
||||||
|
76 stalk-color-above-ring=white i
|
||||||
|
77 stalk-color-above-ring=yellow i
|
||||||
|
78 stalk-color-below-ring=brown i
|
||||||
|
79 stalk-color-below-ring=buff i
|
||||||
|
80 stalk-color-below-ring=cinnamon i
|
||||||
|
81 stalk-color-below-ring=gray i
|
||||||
|
82 stalk-color-below-ring=orange i
|
||||||
|
83 stalk-color-below-ring=pink i
|
||||||
|
84 stalk-color-below-ring=red i
|
||||||
|
85 stalk-color-below-ring=white i
|
||||||
|
86 stalk-color-below-ring=yellow i
|
||||||
|
87 veil-type=partial i
|
||||||
|
88 veil-type=universal i
|
||||||
|
89 veil-color=brown i
|
||||||
|
90 veil-color=orange i
|
||||||
|
91 veil-color=white i
|
||||||
|
92 veil-color=yellow i
|
||||||
|
93 ring-number=none i
|
||||||
|
94 ring-number=one i
|
||||||
|
95 ring-number=two i
|
||||||
|
96 ring-type=cobwebby i
|
||||||
|
97 ring-type=evanescent i
|
||||||
|
98 ring-type=flaring i
|
||||||
|
99 ring-type=large i
|
||||||
|
100 ring-type=none i
|
||||||
|
101 ring-type=pendant i
|
||||||
|
102 ring-type=sheathing i
|
||||||
|
103 ring-type=zone i
|
||||||
|
104 spore-print-color=black i
|
||||||
|
105 spore-print-color=brown i
|
||||||
|
106 spore-print-color=buff i
|
||||||
|
107 spore-print-color=chocolate i
|
||||||
|
108 spore-print-color=green i
|
||||||
|
109 spore-print-color=orange i
|
||||||
|
110 spore-print-color=purple i
|
||||||
|
111 spore-print-color=white i
|
||||||
|
112 spore-print-color=yellow i
|
||||||
|
113 population=abundant i
|
||||||
|
114 population=clustered i
|
||||||
|
115 population=numerous i
|
||||||
|
116 population=scattered i
|
||||||
|
117 population=several i
|
||||||
|
118 population=solitary i
|
||||||
|
119 habitat=grasses i
|
||||||
|
120 habitat=leaves i
|
||||||
|
121 habitat=meadows i
|
||||||
|
122 habitat=paths i
|
||||||
|
123 habitat=urban i
|
||||||
|
124 habitat=waste i
|
||||||
|
125 habitat=woods i
|
||||||
205
python/xgboost.py
Normal file
205
python/xgboost.py
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
# Author: Tianqi Chen, Bing Xu
|
||||||
|
# module for xgboost
|
||||||
|
import ctypes
|
||||||
|
import os
|
||||||
|
# optionally have scipy sparse, though not necessary
|
||||||
|
import numpy
|
||||||
|
import numpy.ctypeslib
|
||||||
|
import scipy.sparse as scp
|
||||||
|
|
||||||
|
# set this line correctly
|
||||||
|
XGBOOST_PATH = os.path.dirname(__file__)+'/libxgboostpy.so'
|
||||||
|
|
||||||
|
# entry type of sparse matrix
|
||||||
|
class REntry(ctypes.Structure):
|
||||||
|
_fields_ = [("findex", ctypes.c_uint), ("fvalue", ctypes.c_float) ]
|
||||||
|
|
||||||
|
# load in xgboost library
|
||||||
|
xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH)
|
||||||
|
|
||||||
|
xglib.XGDMatrixCreate.restype = ctypes.c_void_p
|
||||||
|
xglib.XGDMatrixNumRow.restype = ctypes.c_ulong
|
||||||
|
xglib.XGDMatrixGetLabel.restype = ctypes.POINTER( ctypes.c_float )
|
||||||
|
xglib.XGDMatrixGetWeight.restype = ctypes.POINTER( ctypes.c_float )
|
||||||
|
xglib.XGDMatrixGetRow.restype = ctypes.POINTER( REntry )
|
||||||
|
xglib.XGBoosterCreate.restype = ctypes.c_void_p
|
||||||
|
xglib.XGBoosterPredict.restype = ctypes.POINTER( ctypes.c_float )
|
||||||
|
|
||||||
|
def ctypes2numpy( cptr, length ):
    """Copy `length` elements from a ctypes float pointer into a new
    1-D numpy float32 array and return it."""
    assert isinstance( cptr, ctypes.POINTER( ctypes.c_float ) )
    res = numpy.zeros( length, dtype='float32' )
    # keep the copy OUTSIDE of assert: an assert with a side effect is
    # silently skipped when Python runs with -O, losing the data copy
    if not ctypes.memmove( res.ctypes.data, cptr, length * res.strides[0] ):
        raise RuntimeError('memmove failed to copy the result buffer')
    return res
|
||||||
|
|
||||||
|
# data matrix used in xgboost
|
||||||
|
# data matrix used in xgboost
class DMatrix:
    """Data matrix used in xgboost: a thin ctypes wrapper around a
    handle into the C++ library."""

    def __init__(self, data=None, label=None, missing=0.0, weight=None):
        """Construct a DMatrix.

        data    -- file path, scipy.sparse.csr_matrix, 2-D numpy array,
                   or anything scipy can convert to csr; None creates an
                   empty matrix to be filled via add_row
        label   -- optional label vector
        missing -- value treated as missing in dense numpy input
        weight  -- optional per-instance weight vector
        """
        # force into void_p, mac need to pass things in as void_p
        self.handle = ctypes.c_void_p( xglib.XGDMatrixCreate() )
        # NOTE: use 'is None' rather than '== None': when data is a numpy
        # array, '== None' compares elementwise and does not yield a bool
        if data is None:
            return
        if isinstance(data, str):
            xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(data.encode('utf-8')), 1)
        elif isinstance(data, scp.csr_matrix):
            self.__init_from_csr(data)
        elif isinstance(data, numpy.ndarray) and len(data.shape) == 2:
            self.__init_from_npy2d(data, missing)
        else:
            try:
                csr = scp.csr_matrix(data)
                self.__init_from_csr(csr)
            except Exception:
                raise Exception("can not intialize DMatrix from"+str(type(data)))
        if label is not None:
            self.set_label(label)
        if weight is not None:
            self.set_weight(weight)

    # convert data from csr matrix
    def __init_from_csr(self, csr):
        assert len(csr.indices) == len(csr.data)
        xglib.XGDMatrixParseCSR( self.handle,
                                 ( ctypes.c_ulong * len(csr.indptr) )(*csr.indptr),
                                 ( ctypes.c_uint * len(csr.indices) )(*csr.indices),
                                 ( ctypes.c_float * len(csr.data) )(*csr.data),
                                 len(csr.indptr), len(csr.data) )

    # convert data from a dense numpy matrix; `missing` marks absent entries
    def __init_from_npy2d(self, mat, missing):
        data = numpy.array( mat.reshape(mat.size), dtype='float32' )
        xglib.XGDMatrixParseMat( self.handle,
                                 data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
                                 mat.shape[0], mat.shape[1], ctypes.c_float(missing) )

    # destructor: release the underlying C++ handle
    def __del__(self):
        xglib.XGDMatrixFree(self.handle)

    # load data from file (text or binary buffer)
    def load(self, fname, silent=True):
        xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent))

    # save data into a binary buffer file for fast reloading
    def save_binary(self, fname, silent=True):
        xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent))

    # set label of dmatrix
    def set_label(self, label):
        xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float*len(label))(*label), len(label) )

    # set group size of dmatrix, used for rank
    def set_group(self, group):
        xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group) )

    # set weight of each instance
    def set_weight(self, weight):
        xglib.XGDMatrixSetWeight(self.handle, (ctypes.c_float*len(weight))(*weight), len(weight) )

    # get label from dmatrix as a numpy float32 array
    def get_label(self):
        length = ctypes.c_ulong()
        labels = xglib.XGDMatrixGetLabel(self.handle, ctypes.byref(length))
        return ctypes2numpy( labels, length.value )

    # get weight from dmatrix as a numpy float32 array
    def get_weight(self):
        length = ctypes.c_ulong()
        weights = xglib.XGDMatrixGetWeight(self.handle, ctypes.byref(length))
        return ctypes2numpy( weights, length.value )

    # clear everything
    def clear(self):
        xglib.XGDMatrixClear(self.handle)

    # number of rows currently stored
    def num_row(self):
        return xglib.XGDMatrixNumRow(self.handle)

    # append a row, given as a list of (findex, fvalue) pairs
    def add_row(self, row):
        xglib.XGDMatrixAddRow(self.handle, (REntry*len(row))(*row), len(row) )

    # get the ridx-th row as a list of (findex, fvalue) pairs
    def __getitem__(self, ridx):
        length = ctypes.c_ulong()
        row = xglib.XGDMatrixGetRow(self.handle, ridx, ctypes.byref(length) )
        return [ (int(row[i].findex), row[i].fvalue) for i in range(length.value) ]
||||||
|
|
||||||
|
class Booster:
    """Learner class wrapping an xgboost booster handle from the C API."""

    def __init__(self, params=None, cache=None):
        """Construct a booster.

        params: booster parameters, as a dict, or anything set_param accepts
        cache: list of DMatrix whose prediction results should be cached
        """
        # use None defaults: mutable default arguments ({} / []) are shared
        # across calls and would leak state between Booster instances
        if params is None:
            params = {}
        if cache is None:
            cache = []
        for d in cache:
            assert isinstance(d, DMatrix)
        dmats = (ctypes.c_void_p * len(cache))(*[d.handle for d in cache])
        self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache)))
        # set seed first so a user-supplied 'seed' in params can override it
        self.set_param({'seed': 0})
        self.set_param(params)

    def __del__(self):
        xglib.XGBoosterFree(self.handle)

    def set_param(self, params, pv=None):
        """Set parameter(s).

        params: a dict of name -> value, a single name (with pv as its
                value), or an iterable of (name, value) pairs.
        pv: value used when params is a single parameter name
        """
        if isinstance(params, dict):
            for k, v in params.items():
                xglib.XGBoosterSetParam(
                    self.handle, ctypes.c_char_p(k.encode('utf-8')),
                    ctypes.c_char_p(str(v).encode('utf-8')))
        elif isinstance(params, str) and pv is not None:
            # identity comparison with None ('is not') instead of '!='
            xglib.XGBoosterSetParam(
                self.handle, ctypes.c_char_p(params.encode('utf-8')),
                ctypes.c_char_p(str(pv).encode('utf-8')))
        else:
            for k, v in params:
                xglib.XGBoosterSetParam(
                    self.handle, ctypes.c_char_p(k.encode('utf-8')),
                    ctypes.c_char_p(str(v).encode('utf-8')))

    def update(self, dtrain):
        """Update the model for one iteration using the configured objective."""
        assert isinstance(dtrain, DMatrix)
        xglib.XGBoosterUpdateOneIter(self.handle, dtrain.handle)

    def boost(self, dtrain, grad, hess, bst_group=-1):
        """Boost one iteration with user supplied first/second order gradients."""
        assert len(grad) == len(hess)
        assert isinstance(dtrain, DMatrix)
        xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle,
                                    (ctypes.c_float * len(grad))(*grad),
                                    (ctypes.c_float * len(hess))(*hess),
                                    len(grad), bst_group)

    def update_interact(self, dtrain, action, booster_index=None):
        """beta: update with specified action"""
        assert isinstance(dtrain, DMatrix)
        if booster_index is not None:
            self.set_param('interact:booster_index', str(booster_index))
        # encode to bytes before wrapping in c_char_p: every other c_char_p
        # call in this class passes utf-8 bytes, and a bare str fails on Py3
        xglib.XGBoosterUpdateInteract(
            self.handle, dtrain.handle,
            ctypes.c_char_p(str(action).encode('utf-8')))

    def eval_set(self, evals, it=0):
        """Evaluate a sequence of (DMatrix, name) pairs at iteration it."""
        for d in evals:
            assert isinstance(d[0], DMatrix)
            assert isinstance(d[1], str)
        dmats = (ctypes.c_void_p * len(evals))(*[d[0].handle for d in evals])
        evnames = (ctypes.c_char_p * len(evals))(
            *[ctypes.c_char_p(d[1].encode('utf-8')) for d in evals])
        xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals))

    def eval(self, mat, name='eval', it=0):
        """Evaluate a single DMatrix."""
        self.eval_set([(mat, name)], it)

    def predict(self, data, bst_group=-1):
        """Predict for data; returns the result as a numpy array."""
        length = ctypes.c_ulong()
        preds = xglib.XGBoosterPredict(self.handle, data.handle,
                                       ctypes.byref(length), bst_group)
        return ctypes2numpy(preds, length.value)

    def save_model(self, fname):
        """ save model to file """
        xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname.encode('utf-8')))

    def load_model(self, fname):
        """load model from file"""
        xglib.XGBoosterLoadModel(self.handle, ctypes.c_char_p(fname.encode('utf-8')))

    def dump_model(self, fname, fmap=''):
        """dump model into text file"""
        xglib.XGBoosterDumpModel(
            self.handle, ctypes.c_char_p(fname.encode('utf-8')),
            ctypes.c_char_p(fmap.encode('utf-8')))
|
||||||
|
|
||||||
|
def train(params, dtrain, num_boost_round=10, evals=(), obj=None):
    """Train a booster with given parameters.

    params: booster parameters (dict or (name, value) pair list)
    dtrain: training DMatrix
    num_boost_round: number of boosting iterations
    evals: sequence of (DMatrix, name) pairs evaluated each iteration
    obj: optional customized objective; called as obj(pred, dtrain) and
         expected to return (grad, hess)
    returns: the trained Booster
    """
    # immutable default for evals avoids the shared mutable-default pitfall
    bst = Booster(params, [dtrain] + [d[0] for d in evals])
    if obj is None:
        # built-in objective selected through params
        for i in range(num_boost_round):
            bst.update(dtrain)
            if len(evals) != 0:
                bst.eval_set(evals, i)
    else:
        # customized objective: user code computes gradients each round
        for i in range(num_boost_round):
            pred = bst.predict(dtrain)
            grad, hess = obj(pred, dtrain)
            bst.boost(dtrain, grad, hess)
            if len(evals) != 0:
                bst.eval_set(evals, i)
    return bst
|
||||||
|
|
||||||
297
python/xgboost_python.cpp
Normal file
297
python/xgboost_python.cpp
Normal file
@@ -0,0 +1,297 @@
|
|||||||
|
// implementations in ctypes
|
||||||
|
#include "xgboost_python.h"
|
||||||
|
#include "../regrank/xgboost_regrank.h"
|
||||||
|
#include "../regrank/xgboost_regrank_data.h"
|
||||||
|
|
||||||
|
namespace xgboost{
|
||||||
|
namespace python{
|
||||||
|
// python-facing data matrix: extends regrank::DMatrix with incremental
// construction (AddRow / ParseCSR / ParseMat) and raw accessors for ctypes
class DMatrix: public regrank::DMatrix{
public:
    // whether column is initialized
    bool init_col_;
public:
    DMatrix(void){
        init_col_ = false;
    }
    ~DMatrix(void){}
public:
    // load from text file, or from a cached binary if one exists
    inline void Load(const char *fname, bool silent){
        this->CacheLoad(fname, silent);
        // a cached binary may already carry column access structure
        init_col_ = this->data.HaveColAccess();
    }
    // drop all feature data and meta info (labels, weights, groups)
    inline void Clear( void ){
        this->data.Clear();
        this->info.labels.clear();
        this->info.weights.clear();
        this->info.group_ptr.clear();
    }
    // number of rows currently stored
    inline size_t NumRow( void ) const{
        return this->data.NumRow();
    }
    // append one sparse row of len entries; invalidates column structure
    // NOTE(review): assumes mat.row_ptr_ is non-empty (leading 0 sentinel)
    // -- confirm this FMatrixS invariant holds after Clear()
    inline void AddRow( const XGEntry *data, size_t len ){
        xgboost::booster::FMatrixS &mat = this->data;
        mat.row_data_.resize( mat.row_ptr_.back() + len );
        memcpy( &mat.row_data_[mat.row_ptr_.back()], data, sizeof(XGEntry)*len );
        mat.row_ptr_.push_back( mat.row_ptr_.back() + len );
        init_col_ = false;
    }
    // pointer to row ridx; *len receives the number of entries in the row
    // NOTE(review): ridx is not bounds-checked
    inline const XGEntry* GetRow(unsigned ridx, size_t* len) const{
        const xgboost::booster::FMatrixS &mat = this->data;

        *len = mat.row_ptr_[ridx+1] - mat.row_ptr_[ridx];
        return &mat.row_data_[ mat.row_ptr_[ridx] ];
    }
    // replace contents with a CSR matrix; indptr has nindptr entries,
    // indices/data have nelem entries
    inline void ParseCSR( const size_t *indptr,
                          const unsigned *indices,
                          const float *data,
                          size_t nindptr,
                          size_t nelem ){
        xgboost::booster::FMatrixS &mat = this->data;
        mat.row_ptr_.resize( nindptr );
        memcpy( &mat.row_ptr_[0], indptr, sizeof(size_t)*nindptr );
        mat.row_data_.resize( nelem );
        for( size_t i = 0; i < nelem; ++ i ){
            mat.row_data_[i] = XGEntry(indices[i], data[i]);
        }
        // build derived structure right away so no lazy init is needed
        this->data.InitData();
        this->init_col_ = true;
    }

    // replace contents with a dense nrow x ncol matrix, skipping entries
    // equal to `missing`
    // NOTE(review): exact float comparison, so NaN is NOT treated as
    // missing here -- confirm callers never pass NaN as the sentinel
    inline void ParseMat( const float *data,
                          size_t nrow,
                          size_t ncol,
                          float missing ){
        xgboost::booster::FMatrixS &mat = this->data;
        mat.Clear();
        for( size_t i = 0; i < nrow; ++i, data += ncol ){
            size_t nelem = 0;
            for( size_t j = 0; j < ncol; ++j ){
                if( data[j] != missing ){
                    mat.row_data_.push_back( XGEntry(j, data[j]) );
                    ++ nelem;
                }
            }
            mat.row_ptr_.push_back( mat.row_ptr_.back() + nelem );
        }
        this->data.InitData();
        this->init_col_ = true;
    }
    // copy len labels into the matrix meta info
    inline void SetLabel( const float *label, size_t len ){
        this->info.labels.resize( len );
        memcpy( &(this->info).labels[0], label, sizeof(float)*len );
    }
    // set group sizes (ranking); stored as a cumulative group pointer
    // where group_ptr[i]..group_ptr[i+1] spans the i-th group's rows
    inline void SetGroup( const unsigned *group, size_t len ){
        this->info.group_ptr.resize( len + 1 );
        this->info.group_ptr[0] = 0;
        for( size_t i = 0; i < len; ++ i ){
            this->info.group_ptr[i+1] = this->info.group_ptr[i]+group[i];
        }
    }
    // copy len instance weights into the matrix meta info
    inline void SetWeight( const float *weight, size_t len ){
        this->info.weights.resize( len );
        memcpy( &(this->info).weights[0], weight, sizeof(float)*len );
    }
    // expose the label array; *len receives its length
    inline const float* GetLabel( size_t* len ) const{
        *len = this->info.labels.size();
        return &(this->info.labels[0]);
    }
    // expose the weight array; *len receives its length
    inline const float* GetWeight( size_t* len ) const{
        *len = this->info.weights.size();
        return &(this->info.weights[0]);
    }
    // lazily build column structure, then validate label/row consistency
    inline void CheckInit(void){
        if(!init_col_){
            this->data.InitData();
            init_col_ = true;
        }
        utils::Assert( this->data.NumRow() == this->info.labels.size(), "DMatrix: number of labels must match number of rows in matrix");
    }
};
|
||||||
|
|
||||||
|
class Booster: public xgboost::regrank::RegRankBoostLearner{
|
||||||
|
private:
|
||||||
|
bool init_trainer, init_model;
|
||||||
|
public:
|
||||||
|
Booster(const std::vector<regrank::DMatrix *> mats){
|
||||||
|
silent = 1;
|
||||||
|
init_trainer = false;
|
||||||
|
init_model = false;
|
||||||
|
this->SetCacheData(mats);
|
||||||
|
}
|
||||||
|
inline void CheckInit(void){
|
||||||
|
if( !init_trainer ){
|
||||||
|
this->InitTrainer(); init_trainer = true;
|
||||||
|
}
|
||||||
|
if( !init_model ){
|
||||||
|
this->InitModel(); init_model = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
inline void LoadModel( const char *fname ){
|
||||||
|
xgboost::regrank::RegRankBoostLearner::LoadModel(fname);
|
||||||
|
this->init_model = true;
|
||||||
|
}
|
||||||
|
inline void SetParam( const char *name, const char *val ){
|
||||||
|
if( !strcmp( name, "seed" ) ) random::Seed(atoi(val));
|
||||||
|
xgboost::regrank::RegRankBoostLearner::SetParam( name, val );
|
||||||
|
}
|
||||||
|
const float *Pred( const DMatrix &dmat, size_t *len, int bst_group ){
|
||||||
|
this->CheckInit();
|
||||||
|
|
||||||
|
this->Predict( this->preds_, dmat, bst_group );
|
||||||
|
*len = this->preds_.size();
|
||||||
|
return &this->preds_[0];
|
||||||
|
}
|
||||||
|
inline void BoostOneIter( const DMatrix &train,
|
||||||
|
float *grad, float *hess, size_t len, int bst_group ){
|
||||||
|
this->grad_.resize( len ); this->hess_.resize( len );
|
||||||
|
memcpy( &this->grad_[0], grad, sizeof(float)*len );
|
||||||
|
memcpy( &this->hess_[0], hess, sizeof(float)*len );
|
||||||
|
|
||||||
|
if( grad_.size() == train.Size() ){
|
||||||
|
if( bst_group < 0 ) bst_group = 0;
|
||||||
|
base_gbm.DoBoost(grad_, hess_, train.data, train.info.root_index, bst_group);
|
||||||
|
}else{
|
||||||
|
utils::Assert( bst_group == -1, "must set bst_group to -1 to support all group boosting" );
|
||||||
|
int ngroup = base_gbm.NumBoosterGroup();
|
||||||
|
utils::Assert( grad_.size() == train.Size() * (size_t)ngroup, "BUG: UpdateOneIter: mclass" );
|
||||||
|
std::vector<float> tgrad( train.Size() ), thess( train.Size() );
|
||||||
|
for( int g = 0; g < ngroup; ++ g ){
|
||||||
|
memcpy( &tgrad[0], &grad_[g*tgrad.size()], sizeof(float)*tgrad.size() );
|
||||||
|
memcpy( &thess[0], &hess_[g*tgrad.size()], sizeof(float)*tgrad.size() );
|
||||||
|
base_gbm.DoBoost(tgrad, thess, train.data, train.info.root_index, g );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
using namespace xgboost::python;
|
||||||
|
|
||||||
|
|
||||||
|
extern "C"{
    // ---------------- DMatrix C API ----------------
    // every handle below is an opaque pointer to python::DMatrix
    void* XGDMatrixCreate( void ){
        return new DMatrix();
    }
    void XGDMatrixFree( void *handle ){
        delete static_cast<DMatrix*>(handle);
    }
    void XGDMatrixLoad( void *handle, const char *fname, int silent ){
        static_cast<DMatrix*>(handle)->Load(fname, silent!=0);
    }
    void XGDMatrixSaveBinary( void *handle, const char *fname, int silent ){
        static_cast<DMatrix*>(handle)->SaveBinary(fname, silent!=0);
    }
    void XGDMatrixParseCSR( void *handle,
                            const size_t *indptr,
                            const unsigned *indices,
                            const float *data,
                            size_t nindptr,
                            size_t nelem ){
        static_cast<DMatrix*>(handle)->ParseCSR(indptr, indices, data, nindptr, nelem);
    }
    void XGDMatrixParseMat( void *handle,
                            const float *data,
                            size_t nrow,
                            size_t ncol,
                            float missing ){
        static_cast<DMatrix*>(handle)->ParseMat(data, nrow, ncol, missing);
    }
    void XGDMatrixSetLabel( void *handle, const float *label, size_t len ){
        static_cast<DMatrix*>(handle)->SetLabel(label,len);
    }
    void XGDMatrixSetWeight( void *handle, const float *weight, size_t len ){
        static_cast<DMatrix*>(handle)->SetWeight(weight,len);
    }
    void XGDMatrixSetGroup( void *handle, const unsigned *group, size_t len ){
        static_cast<DMatrix*>(handle)->SetGroup(group,len);
    }
    const float* XGDMatrixGetLabel( const void *handle, size_t* len ){
        return static_cast<const DMatrix*>(handle)->GetLabel(len);
    }
    const float* XGDMatrixGetWeight( const void *handle, size_t* len ){
        return static_cast<const DMatrix*>(handle)->GetWeight(len);
    }
    void XGDMatrixClear(void *handle){
        static_cast<DMatrix*>(handle)->Clear();
    }
    void XGDMatrixAddRow( void *handle, const XGEntry *data, size_t len ){
        static_cast<DMatrix*>(handle)->AddRow(data, len);
    }
    size_t XGDMatrixNumRow(const void *handle){
        return static_cast<const DMatrix*>(handle)->NumRow();
    }
    const XGEntry* XGDMatrixGetRow(void *handle, unsigned ridx, size_t* len){
        return static_cast<DMatrix*>(handle)->GetRow(ridx, len);
    }

    // ---------------- Booster C API ----------------
    // xgboost implementation
    void *XGBoosterCreate( void *dmats[], size_t len ){
        std::vector<xgboost::regrank::DMatrix*> mats;
        for( size_t i = 0; i < len; ++i ){
            DMatrix *dtr = static_cast<DMatrix*>(dmats[i]);
            // validate each matrix (column init + label count) before caching
            dtr->CheckInit();
            mats.push_back( dtr );
        }
        return new Booster( mats );
    }
    void XGBoosterFree( void *handle ){
        delete static_cast<Booster*>(handle);
    }
    void XGBoosterSetParam( void *handle, const char *name, const char *value ){
        static_cast<Booster*>(handle)->SetParam( name, value );
    }
    void XGBoosterUpdateOneIter( void *handle, void *dtrain ){
        Booster *bst = static_cast<Booster*>(handle);
        DMatrix *dtr = static_cast<DMatrix*>(dtrain);
        bst->CheckInit(); dtr->CheckInit();
        bst->UpdateOneIter( *dtr );
    }
    void XGBoosterBoostOneIter( void *handle, void *dtrain,
                                float *grad, float *hess, size_t len, int bst_group ){
        Booster *bst = static_cast<Booster*>(handle);
        DMatrix *dtr = static_cast<DMatrix*>(dtrain);
        bst->CheckInit(); dtr->CheckInit();
        bst->BoostOneIter( *dtr, grad, hess, len, bst_group );
    }
    void XGBoosterEvalOneIter( void *handle, int iter, void *dmats[], const char *evnames[], size_t len ){
        Booster *bst = static_cast<Booster*>(handle);
        bst->CheckInit();

        std::vector<std::string> names;
        std::vector<const xgboost::regrank::DMatrix*> mats;
        for( size_t i = 0; i < len; ++i ){
            mats.push_back( static_cast<DMatrix*>(dmats[i]) );
            names.push_back( std::string( evnames[i]) );
        }
        // evaluation log is written to stderr
        bst->EvalOneIter( iter, mats, names, stderr );
    }
    const float *XGBoosterPredict( void *handle, void *dmat, size_t *len, int bst_group ){
        return static_cast<Booster*>(handle)->Pred( *static_cast<DMatrix*>(dmat), len, bst_group );
    }
    void XGBoosterLoadModel( void *handle, const char *fname ){
        static_cast<Booster*>(handle)->LoadModel( fname );
    }
    void XGBoosterSaveModel( const void *handle, const char *fname ){
        static_cast<const Booster*>(handle)->SaveModel( fname );
    }
    void XGBoosterDumpModel( void *handle, const char *fname, const char *fmap ){
        using namespace xgboost::utils;
        FILE *fo = FopenCheck( fname, "w" );
        FeatMap featmap;
        // optional feature map file; empty string means no name mapping
        if( strlen(fmap) != 0 ){
            featmap.LoadText( fmap );
        }
        static_cast<Booster*>(handle)->DumpModel( fo, featmap, false );
        fclose( fo );
    }

    void XGBoosterUpdateInteract( void *handle, void *dtrain, const char *action ){
        Booster *bst = static_cast<Booster*>(handle);
        DMatrix *dtr = static_cast<DMatrix*>(dtrain);
        bst->CheckInit(); dtr->CheckInit();
        std::string act( action );
        bst->UpdateInteract( act, *dtr );
    }
};
|
||||||
|
|
||||||
209
python/xgboost_python.h
Normal file
209
python/xgboost_python.h
Normal file
@@ -0,0 +1,209 @@
|
|||||||
|
#ifndef XGBOOST_PYTHON_H
#define XGBOOST_PYTHON_H
/*!
 * \file xgboost_python.h
 * \author Tianqi Chen
 * \brief python wrapper for xgboost, using ctypes;
 *        hides everything behind functions and
 *        uses a C style interface
 */
#include "../booster/xgboost_data.h"
extern "C"{
    /*! \brief type of row entry */
    typedef xgboost::booster::FMatrixS::REntry XGEntry;

    /*!
     * \brief create a data matrix
     * \return a new data matrix
     */
    void* XGDMatrixCreate(void);
    /*!
     * \brief free space in data matrix
     */
    void XGDMatrixFree(void *handle);
    /*!
     * \brief load a data matrix from text file or buffer (if it exists)
     * \param handle an instance of data matrix
     * \param fname file name
     * \param silent whether to print statistics when loading
     */
    void XGDMatrixLoad(void *handle, const char *fname, int silent);
    /*!
     * \brief save a data matrix into a binary file
     * \param handle an instance of data matrix
     * \param fname file name
     * \param silent whether to print statistics when saving
     */
    void XGDMatrixSaveBinary(void *handle, const char *fname, int silent);
    /*!
     * \brief set matrix content from CSR format
     * \param handle an instance of data matrix
     * \param indptr pointer to row headers
     * \param indices feature indices (findex)
     * \param data feature values (fvalue)
     * \param nindptr number of rows in the matrix + 1
     * \param nelem number of nonzero elements in the matrix
     */
    void XGDMatrixParseCSR( void *handle,
                            const size_t *indptr,
                            const unsigned *indices,
                            const float *data,
                            size_t nindptr,
                            size_t nelem );
    /*!
     * \brief set matrix content from dense matrix data
     * \param handle an instance of data matrix
     * \param data pointer to the data space
     * \param nrow number of rows
     * \param ncol number of columns
     * \param missing which value to represent missing value
     */
    void XGDMatrixParseMat( void *handle,
                            const float *data,
                            size_t nrow,
                            size_t ncol,
                            float missing );
    /*!
     * \brief set label of the training matrix
     * \param handle an instance of data matrix
     * \param label pointer to label
     * \param len length of array
     */
    void XGDMatrixSetLabel( void *handle, const float *label, size_t len );
    /*!
     * \brief set group sizes of the training matrix (for ranking)
     * \param handle an instance of data matrix
     * \param group pointer to group size
     * \param len length of array
     */
    void XGDMatrixSetGroup( void *handle, const unsigned *group, size_t len );
    /*!
     * \brief set weight of each instance
     * \param handle an instance of data matrix
     * \param weight data pointer to weights
     * \param len length of array
     */
    void XGDMatrixSetWeight( void *handle, const float *weight, size_t len );
    /*!
     * \brief get label set from matrix
     * \param handle an instance of data matrix
     * \param len used to set result length
     * \return pointer to the label
     */
    const float* XGDMatrixGetLabel( const void *handle, size_t* len );
    /*!
     * \brief get weight set from matrix
     * \param handle an instance of data matrix
     * \param len used to set result length
     * \return pointer to the weight
     */
    const float* XGDMatrixGetWeight( const void *handle, size_t* len );
    /*!
     * \brief clear all the records, including feature matrix and label
     * \param handle an instance of data matrix
     */
    void XGDMatrixClear(void *handle);
    /*!
     * \brief return number of rows
     */
    size_t XGDMatrixNumRow(const void *handle);
    /*!
     * \brief add a row
     * \param handle an instance of data matrix
     * \param data array of row content
     * \param len length of array
     */
    void XGDMatrixAddRow(void *handle, const XGEntry *data, size_t len);
    /*!
     * \brief get the ridx-th row of the sparse matrix
     * \param handle handle
     * \param ridx row index
     * \param len used to set result length
     * \return pointer to the row
     */
    const XGEntry* XGDMatrixGetRow(void *handle, unsigned ridx, size_t* len);

    // --- start XGBoost class
    /*!
     * \brief create xgboost learner
     * \param dmats matrices that are set to be cached
     * \param len number of matrices
     * \return handle to the created booster
     */
    void *XGBoosterCreate( void* dmats[], size_t len );
    /*!
     * \brief free obj in handle
     * \param handle handle to be freed
     */
    void XGBoosterFree( void* handle );
    /*!
     * \brief set parameters
     * \param handle handle
     * \param name parameter name
     * \param value value of parameter
     */
    void XGBoosterSetParam( void *handle, const char *name, const char *value );
    /*!
     * \brief update the model in one round using dtrain
     * \param handle handle
     * \param dtrain training data
     */
    void XGBoosterUpdateOneIter( void *handle, void *dtrain );

    /*!
     * \brief update the model by directly specifying gradient and second order gradient;
     *        this can be used to replace UpdateOneIter, to support customized loss functions
     * \param handle handle
     * \param dtrain training data
     * \param grad gradient statistics
     * \param hess second order gradient statistics
     * \param len length of grad/hess array
     * \param bst_group boost group we are working at, default = -1
     */
    void XGBoosterBoostOneIter( void *handle, void *dtrain,
                                float *grad, float *hess, size_t len, int bst_group );
    /*!
     * \brief print evaluation statistics to stderr for xgboost
     * \param handle handle
     * \param iter current iteration round
     * \param dmats pointers to data to be evaluated
     * \param evnames pointers to names of each data
     * \param len length of dmats
     */
    void XGBoosterEvalOneIter( void *handle, int iter, void *dmats[], const char *evnames[], size_t len );
    /*!
     * \brief make prediction based on dmat
     * \param handle handle
     * \param dmat data matrix
     * \param len used to store length of returning result
     * \param bst_group booster group; if model contains multiple booster groups, default = -1 means predict for all groups
     */
    const float *XGBoosterPredict( void *handle, void *dmat, size_t *len, int bst_group );
    /*!
     * \brief load model from existing file
     * \param handle handle
     * \param fname file name
     */
    void XGBoosterLoadModel( void *handle, const char *fname );
    /*!
     * \brief save model into existing file
     * \param handle handle
     * \param fname file name
     */
    void XGBoosterSaveModel( const void *handle, const char *fname );
    /*!
     * \brief dump model into text file
     * \param handle handle
     * \param fname file name
     * \param fmap path to the feature map; can be an empty string
     */
    void XGBoosterDumpModel( void *handle, const char *fname, const char *fmap );
    /*!
     * \brief interactively update model: beta
     * \param handle handle
     * \param dtrain training data
     * \param action action name
     */
    void XGBoosterUpdateInteract( void *handle, void *dtrain, const char* action );
};
#endif
|
||||||
|
|
||||||
401
regrank/xgboost_regrank.h
Normal file
401
regrank/xgboost_regrank.h
Normal file
@@ -0,0 +1,401 @@
|
|||||||
|
#ifndef XGBOOST_REGRANK_H
|
||||||
|
#define XGBOOST_REGRANK_H
|
||||||
|
/*!
|
||||||
|
* \file xgboost_regrank.h
|
||||||
|
* \brief class for gradient boosted regression and ranking
|
||||||
|
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
||||||
|
*/
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
|
#include "xgboost_regrank_data.h"
|
||||||
|
#include "xgboost_regrank_eval.h"
|
||||||
|
#include "xgboost_regrank_obj.h"
|
||||||
|
#include "../utils/xgboost_omp.h"
|
||||||
|
#include "../booster/xgboost_gbmbase.h"
|
||||||
|
#include "../utils/xgboost_utils.h"
|
||||||
|
#include "../utils/xgboost_stream.h"
|
||||||
|
|
||||||
|
namespace xgboost{
|
||||||
|
namespace regrank{
|
||||||
|
/*! \brief class for gradient boosted regression and ranking */
|
||||||
|
class RegRankBoostLearner{
|
||||||
|
public:
|
||||||
|
/*! \brief constructor: verbose by default; objective is created lazily in InitTrainer from name_obj_ */
RegRankBoostLearner(void){
    silent = 0;
    obj_ = NULL;
    name_obj_ = "reg:linear";
}
|
||||||
|
/*! \brief destructor: releases the objective if one was created */
~RegRankBoostLearner(void){
    if( obj_ != NULL ) delete obj_;
}
|
||||||
|
/*!
 * \brief a regression booster associated with training and evaluating data
 * \param mats array of pointers to matrices whose prediction results need to be cached
 */
RegRankBoostLearner(const std::vector<DMatrix *>& mats){
    silent = 0;
    obj_ = NULL;
    name_obj_ = "reg:linear";
    this->SetCacheData(mats);
}
|
||||||
|
/*!
 * \brief add internal cache space for the given matrices; this can speed up
 *        prediction for them, so cache the training and eval data here
 * warning: if the model is loaded from a file from some previous training history,
 *          SetCacheData must be called with exactly the SAME data matrices to
 *          continue training, otherwise it will cause an error
 * \param mats array of pointers to matrices whose prediction results need to be cached
 */
inline void SetCacheData(const std::vector<DMatrix *>& mats){
    // estimate feature bound
    int num_feature = 0;
    // assign buffer index
    unsigned buffer_size = 0;

    utils::Assert( cache_.size() == 0, "can only call cache data once" );
    for( size_t i = 0; i < mats.size(); ++i ){
        // skip matrices already seen (O(n^2) scan; n is small here)
        bool dupilicate = false;
        for( size_t j = 0; j < i; ++ j ){
            if( mats[i] == mats[j] ) dupilicate = true;
        }
        if( dupilicate ) continue;
        // set mats[i]'s cache learner pointer to this
        mats[i]->cache_learner_ptr_ = this;
        // each matrix owns the slice [buffer_size, buffer_size + Size()) of the prediction buffer
        cache_.push_back( CacheEntry( mats[i], buffer_size, mats[i]->Size() ) );
        buffer_size += static_cast<unsigned>(mats[i]->Size());
        num_feature = std::max(num_feature, (int)(mats[i]->data.NumCol()));
    }

    char str_temp[25];
    if (num_feature > mparam.num_feature){
        // widen the model's feature count to cover the cached data
        mparam.num_feature = num_feature;
        sprintf(str_temp, "%d", num_feature);
        base_gbm.SetParam("bst:num_feature", str_temp);
    }

    sprintf(str_temp, "%u", buffer_size);
    base_gbm.SetParam("num_pbuffer", str_temp);
    if (!silent){
        printf("buffer_size=%u\n", buffer_size);
    }
}
|
||||||
|
|
||||||
|
/*!
 * \brief set parameters from outside
 * \param name name of the parameter
 * \param val value of the parameter
 */
inline void SetParam(const char *name, const char *val){
    if (!strcmp(name, "silent")) silent = atoi(val);
    if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
    if (!strcmp(name, "objective") ) name_obj_ = val;
    if (!strcmp(name, "num_class") ) base_gbm.SetParam("num_booster_group", val );
    // deliberately no else: every parameter is also broadcast to the
    // sub-components and recorded so the lazily-created objective can
    // replay it in InitTrainer
    mparam.SetParam(name, val);
    base_gbm.SetParam(name, val);
    cfg_.push_back( std::make_pair( std::string(name), std::string(val) ) );
}
|
||||||
|
/*!
 * \brief initialize solver before training, called before training;
 * this function is reserved for the solver to allocate necessary space
 * and do other preparation
 */
inline void InitTrainer(void){
    if( mparam.num_class != 0 ){
        // multi-class: force a softmax-family objective if none was chosen
        if( name_obj_ != "multi:softmax" && name_obj_ != "multi:softprob"){
            name_obj_ = "multi:softmax";
            printf("auto select objective=softmax to support multi-class classification\n" );
        }
    }
    base_gbm.InitTrainer();
    obj_ = CreateObjFunction( name_obj_.c_str() );
    // replay all recorded parameters into the freshly created objective
    for( size_t i = 0; i < cfg_.size(); ++ i ){
        obj_->SetParam( cfg_[i].first.c_str(), cfg_[i].second.c_str() );
    }
    evaluator_.AddEval( obj_->DefaultEvalMetric() );
}
|
||||||
|
/*!
 * \brief initialize the current data storage for the model; if the model
 * is used for the first time, call this function
 */
inline void InitModel(void){
    base_gbm.InitModel();
    // objective-dependent adjustment of the base prediction --
    // see ModelParam::AdjustBase for the exact semantics
    mparam.AdjustBase(name_obj_.c_str());
}
|
||||||
|
/*!
 * \brief load model from file
 * \param fname file name
 */
inline void LoadModel(const char *fname){
    utils::FileStream fi(utils::FopenCheck(fname, "rb"));
    this->LoadModel(fi);
    fi.Close();
}
|
||||||
|
/*!
 * \brief load model from stream
 * \param fi input stream
 */
inline void LoadModel(utils::IStream &fi){
    base_gbm.LoadModel(fi);
    utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0);
    // load the objective name (length-prefixed); if the record is absent
    // (presumably an older model format) the default name_obj_ is kept
    size_t len;
    if( fi.Read(&len, sizeof(len)) != 0 ){
        name_obj_.resize( len );
        if( len != 0 ){
            utils::Assert( fi.Read(&name_obj_[0], len*sizeof(char)) != 0 );
        }
    }
}
|
||||||
|
/*!
 * \brief DumpModel
 * \param fo text file
 * \param fmap feature map that may help give interpretations of features
 * \param with_stats whether to print statistics as well
 */
inline void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats){
    base_gbm.DumpModel(fo, fmap, with_stats);
}
|
||||||
|
/*!
 * \brief dump the path of all trees
 * \param fo text file
 * \param data input data
 */
inline void DumpPath(FILE *fo, const DMatrix &data){
    base_gbm.DumpPath(fo, data.data);
}
|
||||||
|
/*!
 * \brief save model to stream
 * \param fo output stream
 */
inline void SaveModel(utils::IStream &fo) const{
    base_gbm.SaveModel(fo);
    fo.Write(&mparam, sizeof(ModelParam));
    // save the objective name, length-prefixed
    size_t len = name_obj_.length();
    fo.Write(&len, sizeof(len));
    // NOTE(review): &name_obj_[0] on an empty string is dubious pre-C++11;
    // guard this write if len can legitimately be 0
    fo.Write(&name_obj_[0], len*sizeof(char));
}
|
||||||
|
/*!
 * \brief save model into file
 * \param fname file name
 */
inline void SaveModel(const char *fname) const{
    utils::FileStream fo(utils::FopenCheck(fname, "wb"));
    this->SaveModel(fo);
    fo.Close();
}
|
||||||
|
/*!
 * \brief update the model for one iteration: predict, get gradients from
 *        the objective, then boost
 */
inline void UpdateOneIter(const DMatrix &train){
    this->PredictRaw(preds_, train);
    obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_);
    if( grad_.size() == train.Size() ){
        // single booster group
        base_gbm.DoBoost(grad_, hess_, train.data, train.info.root_index);
    }else{
        // multi-group (e.g. multi-class): gradients are laid out group by group
        int ngroup = base_gbm.NumBoosterGroup();
        utils::Assert( grad_.size() == train.Size() * (size_t)ngroup, "BUG: UpdateOneIter: mclass" );
        std::vector<float> tgrad( train.Size() ), thess( train.Size() );
        for( int g = 0; g < ngroup; ++ g ){
            // slice out this group's statistics and boost its boosters
            memcpy( &tgrad[0], &grad_[g*tgrad.size()], sizeof(float)*tgrad.size() );
            memcpy( &thess[0], &hess_[g*tgrad.size()], sizeof(float)*tgrad.size() );
            base_gbm.DoBoost(tgrad, thess, train.data, train.info.root_index, g );
        }
    }
}
|
||||||
|
/*!
|
||||||
|
* \brief evaluate the model for specific iteration
|
||||||
|
* \param iter iteration number
|
||||||
|
* \param evals datas i want to evaluate
|
||||||
|
* \param evname name of each dataset
|
||||||
|
* \param fo file to output log
|
||||||
|
*/
|
||||||
|
inline void EvalOneIter(int iter,
|
||||||
|
const std::vector<const DMatrix*> &evals,
|
||||||
|
const std::vector<std::string> &evname,
|
||||||
|
FILE *fo=stderr ){
|
||||||
|
fprintf(fo, "[%d]", iter);
|
||||||
|
for (size_t i = 0; i < evals.size(); ++i){
|
||||||
|
this->PredictRaw(preds_, *evals[i]);
|
||||||
|
obj_->EvalTransform(preds_);
|
||||||
|
evaluator_.Eval(fo, evname[i].c_str(), preds_, evals[i]->info);
|
||||||
|
}
|
||||||
|
fprintf(fo, "\n");
|
||||||
|
fflush(fo);
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief get prediction
|
||||||
|
* \param storage to store prediction
|
||||||
|
* \param data input data
|
||||||
|
* \param bst_group booster group we are in
|
||||||
|
*/
|
||||||
|
inline void Predict(std::vector<float> &preds, const DMatrix &data, int bst_group = -1){
|
||||||
|
this->PredictRaw( preds, data, bst_group );
|
||||||
|
obj_->PredTransform( preds );
|
||||||
|
}
|
||||||
|
public:
    /*!
     * \brief interactive update: refresh the prediction cache of every cached
     *        matrix, then either remove the last booster or boost a new one
     * \param action action type; "remove" deletes the most recent booster
     * \param train training data used when a booster is added
     */
    inline void UpdateInteract(std::string action, const DMatrix& train){
        // bring every cached prediction buffer up to date first
        for(size_t i = 0; i < cache_.size(); ++i){
            this->InteractPredict(preds_, *cache_[i].mat_);
        }

        if (action == "remove"){
            // NOTE: spelling follows the GBMBase API
            base_gbm.DelteBooster();
            return;
        }

        obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_);
        // interactive boosting does not use per-instance roots
        std::vector<unsigned> empty_root;
        base_gbm.DoBoost(grad_, hess_, train.data, empty_root);

        // re-predict so the caches stay consistent with the new booster
        for(size_t i = 0; i < cache_.size(); ++i){
            this->InteractRePredict(*cache_[i].mat_);
        }
    }
|
||||||
|
private:
    /*! \brief get the transformed predictions for a matrix that must be cached */
    inline void InteractPredict(std::vector<float> &preds, const DMatrix &data){
        const int buffer_offset = this->FindBufferOffset(data);
        utils::Assert( buffer_offset >=0, "interact mode must cache training data" );
        preds.resize(data.Size());
        const unsigned ndata = static_cast<unsigned>(data.Size());
        #pragma omp parallel for schedule( static )
        for (unsigned j = 0; j < ndata; ++j){
            preds[j] = mparam.base_score + base_gbm.InteractPredict(data.data, j, buffer_offset + j);
        }
        obj_->PredTransform( preds );
    }
|
||||||
|
/*! \brief repredict trial */
|
||||||
|
inline void InteractRePredict(const DMatrix &data){
|
||||||
|
int buffer_offset = this->FindBufferOffset(data);
|
||||||
|
utils::Assert( buffer_offset >=0, "interact mode must cache training data" );
|
||||||
|
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||||
|
#pragma omp parallel for schedule( static )
|
||||||
|
for (unsigned j = 0; j < ndata; ++j){
|
||||||
|
base_gbm.InteractRePredict(data.data, j, buffer_offset + j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*! \brief get un-transformed prediction*/
|
||||||
|
inline void PredictRaw(std::vector<float> &preds, const DMatrix &data, int bst_group = -1 ){
|
||||||
|
int buffer_offset = this->FindBufferOffset(data);
|
||||||
|
if( bst_group < 0 ){
|
||||||
|
int ngroup = base_gbm.NumBoosterGroup();
|
||||||
|
preds.resize( data.Size() * ngroup );
|
||||||
|
for( int g = 0; g < ngroup; ++ g ){
|
||||||
|
this->PredictBuffer(&preds[ data.Size() * g ], data, buffer_offset, g );
|
||||||
|
}
|
||||||
|
}else{
|
||||||
|
preds.resize( data.Size() );
|
||||||
|
this->PredictBuffer(&preds[0], data, buffer_offset, bst_group );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*! \brief get the un-transformed predictions, given data */
|
||||||
|
inline void PredictBuffer(float *preds, const DMatrix &data, int buffer_offset, int bst_group ){
|
||||||
|
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||||
|
if( buffer_offset >= 0 ){
|
||||||
|
#pragma omp parallel for schedule( static )
|
||||||
|
for (unsigned j = 0; j < ndata; ++j){
|
||||||
|
preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j, data.info.GetRoot(j), bst_group );
|
||||||
|
|
||||||
|
}
|
||||||
|
}else
|
||||||
|
#pragma omp parallel for schedule( static )
|
||||||
|
for (unsigned j = 0; j < ndata; ++j){
|
||||||
|
preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1, data.info.GetRoot(j), bst_group );
|
||||||
|
}{
|
||||||
|
}
|
||||||
|
}
|
||||||
|
private:
    /*! \brief training parameter for regression, stored verbatim in the model file */
    struct ModelParam{
        /*! \brief global bias */
        float base_score;
        /*! \brief type of loss function */
        int loss_type;
        /*! \brief number of features */
        int num_feature;
        /*! \brief number of classes, when doing multi-class classification */
        int num_class;
        /*! \brief reserved fields, kept zeroed for binary compatibility */
        int reserved[15];
        /*! \brief constructor: set defaults and clear the reserved area */
        ModelParam(void){
            base_score = 0.5f;
            loss_type = -1;
            num_feature = 0;
            num_class = 0;
            memset(reserved, 0, sizeof(reserved));
        }
        /*!
         * \brief set parameters from outside
         * \param name name of the parameter
         * \param val value of the parameter
         */
        inline void SetParam(const char *name, const char *val){
            // the keys are mutually exclusive, so an else-if chain is equivalent
            if (!strcmp("base_score", name)) base_score = (float)atof(val);
            else if (!strcmp("num_class", name)) num_class = atoi(val);
            else if (!strcmp("loss_type", name)) loss_type = atoi(val);
            else if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
        }
        /*!
         * \brief adjust base_score based on loss type and objective function:
         *  logistic-type losses keep the bias in margin (logit) space
         */
        inline void AdjustBase(const char *obj){
            if( loss_type == -1 ){
                // default to logistic unless the objective is linear regression
                loss_type = 1;
                if( !strcmp("reg:linear", obj ) ) loss_type = 0;
            }
            if (loss_type == 1 || loss_type == 2 || loss_type == 3){
                utils::Assert(base_score > 0.0f && base_score < 1.0f, "sigmoid range constrain");
                base_score = -logf(1.0f / base_score - 1.0f);  // inverse sigmoid
            }
        }
    };
|
||||||
|
private:
|
||||||
|
struct CacheEntry{
|
||||||
|
const DMatrix *mat_;
|
||||||
|
int buffer_offset_;
|
||||||
|
size_t num_row_;
|
||||||
|
CacheEntry(const DMatrix *mat, int buffer_offset, size_t num_row)
|
||||||
|
:mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row){}
|
||||||
|
};
|
||||||
|
/*! \brief the entries indicates that we have internal prediction cache */
|
||||||
|
std::vector<CacheEntry> cache_;
|
||||||
|
private:
    // find the internal buffer offset for a given matrix; returns -1 when the
    // matrix is not cached (or its cached row count no longer matches)
    inline int FindBufferOffset(const DMatrix &mat){
        for(size_t i = 0; i < cache_.size(); ++i){
            // double-checked: both the cache entry and the matrix itself must agree
            if( cache_[i].mat_ != &mat || mat.cache_learner_ptr_ != this ) continue;
            if( cache_[i].num_row_ == mat.Size() ){
                return cache_[i].buffer_offset_;
            }
            fprintf( stderr, "warning: number of rows in input matrix changed as remembered in cachelist, ignore cached results\n" );
            fflush( stderr );
        }
        return -1;
    }
|
||||||
|
protected:
|
||||||
|
int silent;
|
||||||
|
EvalSet evaluator_;
|
||||||
|
booster::GBMBase base_gbm;
|
||||||
|
ModelParam mparam;
|
||||||
|
// objective fnction
|
||||||
|
IObjFunction *obj_;
|
||||||
|
// name of objective function
|
||||||
|
std::string name_obj_;
|
||||||
|
std::vector< std::pair<std::string, std::string> > cfg_;
|
||||||
|
protected:
|
||||||
|
std::vector<float> grad_, hess_, preds_;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
260
regrank/xgboost_regrank_data.h
Normal file
260
regrank/xgboost_regrank_data.h
Normal file
@@ -0,0 +1,260 @@
|
|||||||
|
#ifndef XGBOOST_REGRANK_DATA_H
|
||||||
|
#define XGBOOST_REGRANK_DATA_H
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \file xgboost_regrank_data.h
|
||||||
|
* \brief input data structure for regression, binary classification, and rankning.
|
||||||
|
* Format:
|
||||||
|
* The data should contain each data instance in each line.
|
||||||
|
* The format of line data is as below:
|
||||||
|
* label <nonzero feature dimension> [feature index:feature value]+
|
||||||
|
* When using rank, an addtional group file with suffix group must be provided, giving the number of instances in each group
|
||||||
|
* When using weighted aware classification(regression), an addtional weight file must be provided, giving the weight of each instance
|
||||||
|
*
|
||||||
|
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
||||||
|
*/
|
||||||
|
#include <cstdio>
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
#include <cstring>
|
||||||
|
#include "../booster/xgboost_data.h"
|
||||||
|
#include "../utils/xgboost_utils.h"
|
||||||
|
#include "../utils/xgboost_stream.h"
|
||||||
|
|
||||||
|
namespace xgboost{
|
||||||
|
/*! \brief namespace to handle regression and rank */
|
||||||
|
namespace regrank{
|
||||||
|
/*! \brief data matrix for regression content */
|
||||||
|
struct DMatrix{
|
||||||
|
public:
|
||||||
|
/*! \brief data information besides the features */
|
||||||
|
struct Info{
|
||||||
|
/*! \brief label of each instance */
|
||||||
|
std::vector<float> labels;
|
||||||
|
/*! \brief the index of begin and end of a groupneeded when the learning task is ranking */
|
||||||
|
std::vector<unsigned> group_ptr;
|
||||||
|
/*! \brief weights of each instance, optional */
|
||||||
|
std::vector<float> weights;
|
||||||
|
/*! \brief specified root index of each instance, can be used for multi task setting*/
|
||||||
|
std::vector<unsigned> root_index;
|
||||||
|
/*! \brief get weight of each instances */
|
||||||
|
inline float GetWeight( size_t i ) const{
|
||||||
|
if( weights.size() != 0 ) return weights[i];
|
||||||
|
else return 1.0f;
|
||||||
|
}
|
||||||
|
inline float GetRoot( size_t i ) const{
|
||||||
|
if( root_index.size() != 0 ) return static_cast<float>(root_index[i]);
|
||||||
|
else return 0;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
public:
|
||||||
|
/*! \brief feature data content */
|
||||||
|
booster::FMatrixS data;
|
||||||
|
/*! \brief information fields */
|
||||||
|
Info info;
|
||||||
|
/*!
|
||||||
|
* \brief cache pointer to verify if the data structure is cached in some learner
|
||||||
|
* this is a bit ugly, we need to have double check verification, so if one side get deleted,
|
||||||
|
* and some strange re-allocation gets the same pointer we will still be fine
|
||||||
|
*/
|
||||||
|
void *cache_learner_ptr_;
|
||||||
|
public:
|
||||||
|
/*! \brief default constructor */
|
||||||
|
DMatrix(void):cache_learner_ptr_(NULL){}
|
||||||
|
/*! \brief get the number of instances */
|
||||||
|
inline size_t Size() const{
|
||||||
|
return data.NumRow();
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief load from text file
|
||||||
|
* \param fname name of text data
|
||||||
|
* \param silent whether print information or not
|
||||||
|
*/
|
||||||
|
inline void LoadText(const char* fname, bool silent = false){
|
||||||
|
data.Clear();
|
||||||
|
FILE* file = utils::FopenCheck(fname, "r");
|
||||||
|
float label; bool init = true;
|
||||||
|
char tmp[1024];
|
||||||
|
std::vector<booster::bst_uint> findex;
|
||||||
|
std::vector<booster::bst_float> fvalue;
|
||||||
|
|
||||||
|
while (fscanf(file, "%s", tmp) == 1){
|
||||||
|
unsigned index; float value;
|
||||||
|
if (sscanf(tmp, "%u:%f", &index, &value) == 2){
|
||||||
|
findex.push_back(index); fvalue.push_back(value);
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
if (!init){
|
||||||
|
info.labels.push_back(label);
|
||||||
|
data.AddRow(findex, fvalue);
|
||||||
|
}
|
||||||
|
findex.clear(); fvalue.clear();
|
||||||
|
utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format");
|
||||||
|
init = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info.labels.push_back(label);
|
||||||
|
data.AddRow(findex, fvalue);
|
||||||
|
// initialize column support as well
|
||||||
|
data.InitData();
|
||||||
|
|
||||||
|
if (!silent){
|
||||||
|
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||||
|
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||||
|
}
|
||||||
|
fclose(file);
|
||||||
|
this->TryLoadGroup(fname, silent);
|
||||||
|
this->TryLoadWeight(fname, silent);
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief load from binary file
|
||||||
|
* \param fname name of binary data
|
||||||
|
* \param silent whether print information or not
|
||||||
|
* \return whether loading is success
|
||||||
|
*/
|
||||||
|
inline bool LoadBinary(const char* fname, bool silent = false){
|
||||||
|
FILE *fp = fopen64(fname, "rb");
|
||||||
|
if (fp == NULL) return false;
|
||||||
|
utils::FileStream fs(fp);
|
||||||
|
data.LoadBinary(fs);
|
||||||
|
info.labels.resize(data.NumRow());
|
||||||
|
utils::Assert(fs.Read(&info.labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary");
|
||||||
|
{// load in group ptr
|
||||||
|
unsigned ngptr;
|
||||||
|
if( fs.Read(&ngptr, sizeof(unsigned) ) != 0 ){
|
||||||
|
info.group_ptr.resize( ngptr );
|
||||||
|
if( ngptr != 0 ){
|
||||||
|
utils::Assert( fs.Read(&info.group_ptr[0], sizeof(unsigned) * ngptr) != 0, "Load group file");
|
||||||
|
utils::Assert( info.group_ptr.back() == data.NumRow(), "number of group must match number of record" );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
{// load in weight
|
||||||
|
unsigned nwt;
|
||||||
|
if( fs.Read(&nwt, sizeof(unsigned) ) != 0 ){
|
||||||
|
utils::Assert( nwt == 0 || nwt == data.NumRow(), "invalid weight" );
|
||||||
|
info.weights.resize( nwt );
|
||||||
|
if( nwt != 0 ){
|
||||||
|
utils::Assert( fs.Read(&info.weights[0], sizeof(unsigned) * nwt) != 0, "Load weight file");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fs.Close();
|
||||||
|
|
||||||
|
if (!silent){
|
||||||
|
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||||
|
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||||
|
if( info.group_ptr.size() != 0 ){
|
||||||
|
printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1 );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief save to binary file
|
||||||
|
* \param fname name of binary data
|
||||||
|
* \param silent whether print information or not
|
||||||
|
*/
|
||||||
|
inline void SaveBinary(const char* fname, bool silent = false){
|
||||||
|
// initialize column support as well
|
||||||
|
data.InitData();
|
||||||
|
|
||||||
|
utils::FileStream fs(utils::FopenCheck(fname, "wb"));
|
||||||
|
data.SaveBinary(fs);
|
||||||
|
utils::Assert( info.labels.size() == data.NumRow(), "label size is not consistent with feature matrix size" );
|
||||||
|
fs.Write(&info.labels[0], sizeof(float) * data.NumRow());
|
||||||
|
{// write out group ptr
|
||||||
|
unsigned ngptr = static_cast<unsigned>( info.group_ptr.size() );
|
||||||
|
fs.Write(&ngptr, sizeof(unsigned) );
|
||||||
|
if( ngptr != 0 ){
|
||||||
|
fs.Write(&info.group_ptr[0], sizeof(unsigned) * ngptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
{// write out weight
|
||||||
|
unsigned nwt = static_cast<unsigned>( info.weights.size() );
|
||||||
|
fs.Write( &nwt, sizeof(unsigned) );
|
||||||
|
if( nwt != 0 ){
|
||||||
|
fs.Write(&info.weights[0], sizeof(float) * nwt);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fs.Close();
|
||||||
|
if (!silent){
|
||||||
|
printf("%ux%u matrix with %lu entries is saved to %s\n",
|
||||||
|
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||||
|
if( info.group_ptr.size() != 0 ){
|
||||||
|
printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1 );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief cache load data given a file name, if filename ends with .buffer, direct load binary
|
||||||
|
* otherwise the function will first check if fname + '.buffer' exists,
|
||||||
|
* if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
|
||||||
|
* and try to create a buffer file
|
||||||
|
* \param fname name of binary data
|
||||||
|
* \param silent whether print information or not
|
||||||
|
* \param savebuffer whether do save binary buffer if it is text
|
||||||
|
*/
|
||||||
|
inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
|
||||||
|
int len = strlen(fname);
|
||||||
|
if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
|
||||||
|
if( !this->LoadBinary(fname, silent) ){
|
||||||
|
fprintf(stderr,"can not open file \"%s\"", fname);
|
||||||
|
utils::Error("DMatrix::CacheLoad failed");
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
char bname[1024];
|
||||||
|
sprintf(bname, "%s.buffer", fname);
|
||||||
|
if (!this->LoadBinary(bname, silent)){
|
||||||
|
this->LoadText(fname, silent);
|
||||||
|
if (savebuffer) this->SaveBinary(bname, silent);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
inline bool TryLoadGroup(const char* fname, bool silent = false){
|
||||||
|
std::string name = fname;
|
||||||
|
if (name.length() > 8 && !strcmp(fname + name.length() - 7, ".buffer")){
|
||||||
|
name.resize( name.length() - 7 );
|
||||||
|
}
|
||||||
|
name += ".group";
|
||||||
|
//if exists group data load it in
|
||||||
|
FILE *fi = fopen64(name.c_str(), "r");
|
||||||
|
if (fi == NULL) return false;
|
||||||
|
info.group_ptr.push_back(0);
|
||||||
|
unsigned nline;
|
||||||
|
while (fscanf(fi, "%u", &nline) == 1){
|
||||||
|
info.group_ptr.push_back(info.group_ptr.back()+nline);
|
||||||
|
}
|
||||||
|
if(!silent){
|
||||||
|
printf("%lu groups are loaded from %s\n", info.group_ptr.size()-1, name.c_str());
|
||||||
|
}
|
||||||
|
fclose(fi);
|
||||||
|
utils::Assert( info.group_ptr.back() == data.NumRow(), "DMatrix: group data does not match the number of rows in feature matrix" );
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
inline bool TryLoadWeight(const char* fname, bool silent = false){
|
||||||
|
std::string name = fname;
|
||||||
|
if (name.length() > 8 && !strcmp(fname + name.length() - 7, ".buffer")){
|
||||||
|
name.resize( name.length() - 7 );
|
||||||
|
}
|
||||||
|
name += ".weight";
|
||||||
|
//if exists group data load it in
|
||||||
|
FILE *fi = fopen64(name.c_str(), "r");
|
||||||
|
if (fi == NULL) return false;
|
||||||
|
float wt;
|
||||||
|
while (fscanf(fi, "%f", &wt) == 1){
|
||||||
|
info.weights.push_back( wt );
|
||||||
|
}
|
||||||
|
if(!silent){
|
||||||
|
printf("loading weight from %s\n", name.c_str());
|
||||||
|
}
|
||||||
|
fclose(fi);
|
||||||
|
utils::Assert( info.weights.size() == data.NumRow(), "DMatrix: weight data does not match the number of rows in feature matrix" );
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
#endif
|
||||||
375
regrank/xgboost_regrank_eval.h
Normal file
375
regrank/xgboost_regrank_eval.h
Normal file
@@ -0,0 +1,375 @@
|
|||||||
|
#ifndef XGBOOST_REGRANK_EVAL_H
|
||||||
|
#define XGBOOST_REGRANK_EVAL_H
|
||||||
|
/*!
|
||||||
|
* \file xgboost_regrank_eval.h
|
||||||
|
* \brief evaluation metrics for regression and classification and rank
|
||||||
|
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm>
|
||||||
|
#include "../utils/xgboost_utils.h"
|
||||||
|
#include "../utils/xgboost_omp.h"
|
||||||
|
#include "../utils/xgboost_random.h"
|
||||||
|
#include "xgboost_regrank_data.h"
|
||||||
|
#include "xgboost_regrank_utils.h"
|
||||||
|
|
||||||
|
namespace xgboost{
|
||||||
|
namespace regrank{
|
||||||
|
/*! \brief evaluator that evaluates the loss metrics */
|
||||||
|
struct IEvaluator{
|
||||||
|
/*!
|
||||||
|
* \brief evaluate a specific metric
|
||||||
|
* \param preds prediction
|
||||||
|
* \param info information, including label etc.
|
||||||
|
*/
|
||||||
|
virtual float Eval(const std::vector<float> &preds,
|
||||||
|
const DMatrix::Info &info) const = 0;
|
||||||
|
/*! \return name of metric */
|
||||||
|
virtual const char *Name(void) const = 0;
|
||||||
|
/*! \brief virtual destructor */
|
||||||
|
virtual ~IEvaluator(void){}
|
||||||
|
};
|
||||||
|
|
||||||
|
/*! \brief RMSE */
|
||||||
|
struct EvalRMSE : public IEvaluator{
|
||||||
|
virtual float Eval(const std::vector<float> &preds,
|
||||||
|
const DMatrix::Info &info) const {
|
||||||
|
utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
|
||||||
|
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||||
|
float sum = 0.0, wsum = 0.0;
|
||||||
|
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
|
||||||
|
for (unsigned i = 0; i < ndata; ++i){
|
||||||
|
const float wt = info.GetWeight(i);
|
||||||
|
const float diff = info.labels[i] - preds[i];
|
||||||
|
sum += diff*diff * wt;
|
||||||
|
wsum += wt;
|
||||||
|
}
|
||||||
|
return sqrtf(sum / wsum);
|
||||||
|
}
|
||||||
|
virtual const char *Name(void) const{
|
||||||
|
return "rmse";
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/*! \brief Error */
|
||||||
|
struct EvalLogLoss : public IEvaluator{
|
||||||
|
virtual float Eval(const std::vector<float> &preds,
|
||||||
|
const DMatrix::Info &info) const {
|
||||||
|
utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
|
||||||
|
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||||
|
float sum = 0.0f, wsum = 0.0f;
|
||||||
|
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
|
||||||
|
for (unsigned i = 0; i < ndata; ++i){
|
||||||
|
const float y = info.labels[i];
|
||||||
|
const float py = preds[i];
|
||||||
|
const float wt = info.GetWeight(i);
|
||||||
|
sum -= wt * (y * std::log(py) + (1.0f - y)*std::log(1 - py));
|
||||||
|
wsum += wt;
|
||||||
|
}
|
||||||
|
return sum / wsum;
|
||||||
|
}
|
||||||
|
virtual const char *Name(void) const{
|
||||||
|
return "negllik";
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/*! \brief Error */
|
||||||
|
struct EvalError : public IEvaluator{
|
||||||
|
virtual float Eval(const std::vector<float> &preds,
|
||||||
|
const DMatrix::Info &info) const {
|
||||||
|
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||||
|
float sum = 0.0f, wsum = 0.0f;
|
||||||
|
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
|
||||||
|
for (unsigned i = 0; i < ndata; ++i){
|
||||||
|
const float wt = info.GetWeight(i);
|
||||||
|
if (preds[i] > 0.5f){
|
||||||
|
if (info.labels[i] < 0.5f) sum += wt;
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
if (info.labels[i] >= 0.5f) sum += wt;
|
||||||
|
}
|
||||||
|
wsum += wt;
|
||||||
|
}
|
||||||
|
return sum / wsum;
|
||||||
|
}
|
||||||
|
virtual const char *Name(void) const{
|
||||||
|
return "error";
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/*! \brief AMS: also records best threshold */
|
||||||
|
struct EvalAMS : public IEvaluator{
|
||||||
|
public:
|
||||||
|
EvalAMS(const char *name){
|
||||||
|
name_ = name;
|
||||||
|
// note: ams@0 will automatically select which ratio to go
|
||||||
|
utils::Assert( sscanf(name, "ams@%f", &ratio_ ) == 1, "invalid ams format" );
|
||||||
|
}
|
||||||
|
virtual float Eval(const std::vector<float> &preds,
|
||||||
|
const DMatrix::Info &info) const {
|
||||||
|
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||||
|
utils::Assert( info.weights.size() == ndata, "we need weight to evaluate ams");
|
||||||
|
std::vector< std::pair<float, unsigned> > rec(ndata);
|
||||||
|
|
||||||
|
#pragma omp parallel for schedule( static )
|
||||||
|
for (unsigned i = 0; i < ndata; ++i){
|
||||||
|
rec[i] = std::make_pair( preds[i], i );
|
||||||
|
}
|
||||||
|
std::sort( rec.begin(), rec.end(), CmpFirst );
|
||||||
|
unsigned ntop = static_cast<unsigned>( ratio_ * ndata );
|
||||||
|
if( ntop == 0 ) ntop = ndata;
|
||||||
|
const double br = 10.0;
|
||||||
|
unsigned thresindex = 0;
|
||||||
|
double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
|
||||||
|
for (unsigned i = 0; i < ndata-1 && i < ntop; ++i){
|
||||||
|
const unsigned ridx = rec[i].second;
|
||||||
|
const float wt = info.weights[ridx];
|
||||||
|
if( info.labels[ridx] > 0.5f ){
|
||||||
|
s_tp += wt;
|
||||||
|
}else{
|
||||||
|
b_fp += wt;
|
||||||
|
}
|
||||||
|
if( rec[i].first != rec[i+1].first ){
|
||||||
|
double ams = sqrtf( 2*((s_tp+b_fp+br) * log( 1.0 + s_tp/(b_fp+br) ) - s_tp) );
|
||||||
|
if( tams < ams ){
|
||||||
|
thresindex = i;
|
||||||
|
tams = ams;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if( ntop == ndata ){
|
||||||
|
fprintf( stderr, "\tams-ratio=%g", float(thresindex)/ndata );
|
||||||
|
return tams;
|
||||||
|
}else{
|
||||||
|
return sqrtf( 2*((s_tp+b_fp+br) * log( 1.0 + s_tp/(b_fp+br) ) - s_tp) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
virtual const char *Name(void) const{
|
||||||
|
return name_.c_str();
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
std::string name_;
|
||||||
|
float ratio_;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*! \brief Error for multi-class classification, need exact match */
|
||||||
|
struct EvalMatchError : public IEvaluator{
|
||||||
|
public:
|
||||||
|
virtual float Eval(const std::vector<float> &preds,
|
||||||
|
const DMatrix::Info &info) const {
|
||||||
|
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||||
|
float sum = 0.0f, wsum = 0.0f;
|
||||||
|
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
|
||||||
|
for (unsigned i = 0; i < ndata; ++i){
|
||||||
|
const float wt = info.GetWeight(i);
|
||||||
|
int label = static_cast<int>(info.labels[i]);
|
||||||
|
if (static_cast<int>(preds[i]) != label ) sum += wt;
|
||||||
|
wsum += wt;
|
||||||
|
}
|
||||||
|
return sum / wsum;
|
||||||
|
}
|
||||||
|
virtual const char *Name(void) const{
|
||||||
|
return "merror";
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/*! \brief Area under curve, for both classification and rank */
|
||||||
|
struct EvalAuc : public IEvaluator{
|
||||||
|
virtual float Eval(const std::vector<float> &preds,
|
||||||
|
const DMatrix::Info &info) const {
|
||||||
|
utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
|
||||||
|
std::vector<unsigned> tgptr(2, 0); tgptr[1] = preds.size();
|
||||||
|
const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
|
||||||
|
utils::Assert(gptr.back() == preds.size(), "EvalAuc: group structure must match number of prediction");
|
||||||
|
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
|
||||||
|
|
||||||
|
double sum_auc = 0.0f;
|
||||||
|
#pragma omp parallel reduction(+:sum_auc)
|
||||||
|
{
|
||||||
|
// each thread takes a local rec
|
||||||
|
std::vector< std::pair<float, unsigned> > rec;
|
||||||
|
#pragma omp for schedule(static)
|
||||||
|
for (unsigned k = 0; k < ngroup; ++k){
|
||||||
|
rec.clear();
|
||||||
|
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j){
|
||||||
|
rec.push_back(std::make_pair(preds[j], j));
|
||||||
|
}
|
||||||
|
std::sort(rec.begin(), rec.end(), CmpFirst);
|
||||||
|
// calculate AUC
|
||||||
|
double sum_pospair = 0.0;
|
||||||
|
double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
|
||||||
|
for (size_t j = 0; j < rec.size(); ++j){
|
||||||
|
const float wt = info.GetWeight(rec[j].second);
|
||||||
|
const float ctr = info.labels[rec[j].second];
|
||||||
|
// keep bucketing predictions in same bucket
|
||||||
|
if (j != 0 && rec[j].first != rec[j - 1].first){
|
||||||
|
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
|
||||||
|
sum_npos += buf_pos; sum_nneg += buf_neg;
|
||||||
|
buf_neg = buf_pos = 0.0f;
|
||||||
|
}
|
||||||
|
buf_pos += ctr * wt; buf_neg += (1.0f - ctr) * wt;
|
||||||
|
}
|
||||||
|
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
|
||||||
|
sum_npos += buf_pos; sum_nneg += buf_neg;
|
||||||
|
//
|
||||||
|
utils::Assert(sum_npos > 0.0 && sum_nneg > 0.0, "the dataset only contains pos or neg samples");
|
||||||
|
// this is the AUC
|
||||||
|
sum_auc += sum_pospair / (sum_npos*sum_nneg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// return average AUC over list
|
||||||
|
return static_cast<float>(sum_auc) / ngroup;
|
||||||
|
}
|
||||||
|
virtual const char *Name(void) const{
|
||||||
|
return "auc";
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/*! \brief Evaluate rank list */
|
||||||
|
struct EvalRankList : public IEvaluator{
|
||||||
|
public:
|
||||||
|
virtual float Eval(const std::vector<float> &preds,
|
||||||
|
const DMatrix::Info &info) const {
|
||||||
|
utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
|
||||||
|
const std::vector<unsigned> &gptr = info.group_ptr;
|
||||||
|
utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
|
||||||
|
utils::Assert( gptr.back() == preds.size(), "EvalRanklist: group structure must match number of prediction");
|
||||||
|
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
|
||||||
|
|
||||||
|
double sum_metric = 0.0f;
|
||||||
|
#pragma omp parallel reduction(+:sum_metric)
|
||||||
|
{
|
||||||
|
// each thread takes a local rec
|
||||||
|
std::vector< std::pair<float, unsigned> > rec;
|
||||||
|
#pragma omp for schedule(static)
|
||||||
|
for (unsigned k = 0; k < ngroup; ++k){
|
||||||
|
rec.clear();
|
||||||
|
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j){
|
||||||
|
rec.push_back(std::make_pair(preds[j], (int)info.labels[j]));
|
||||||
|
}
|
||||||
|
sum_metric += this->EvalMetric( rec );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return static_cast<float>(sum_metric) / ngroup;
|
||||||
|
}
|
||||||
|
virtual const char *Name(void) const{
|
||||||
|
return name_.c_str();
|
||||||
|
}
|
||||||
|
protected:
|
||||||
|
EvalRankList(const char *name){
|
||||||
|
name_ = name;
|
||||||
|
if( sscanf(name, "%*[^@]@%u", &topn_) != 1 ){
|
||||||
|
topn_ = UINT_MAX;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*! \return evaluation metric, given the pair_sort record, (pred,label) */
|
||||||
|
virtual float EvalMetric( std::vector< std::pair<float, unsigned> > &pair_sort ) const = 0;
|
||||||
|
protected:
|
||||||
|
unsigned topn_;
|
||||||
|
std::string name_;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*! \brief Precison at N, for both classification and rank */
|
||||||
|
struct EvalPrecision : public EvalRankList{
|
||||||
|
public:
|
||||||
|
EvalPrecision(const char *name):EvalRankList(name){}
|
||||||
|
protected:
|
||||||
|
virtual float EvalMetric( std::vector< std::pair<float, unsigned> > &rec ) const {
|
||||||
|
// calculate Preicsion
|
||||||
|
std::sort(rec.begin(), rec.end(), CmpFirst);
|
||||||
|
unsigned nhit = 0;
|
||||||
|
for (size_t j = 0; j < rec.size() && j < this->topn_; ++j){
|
||||||
|
nhit += (rec[j].second != 0 );
|
||||||
|
}
|
||||||
|
return static_cast<float>( nhit ) / topn_;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief NDCG */
|
||||||
|
struct EvalNDCG : public EvalRankList{
|
||||||
|
public:
|
||||||
|
EvalNDCG(const char *name):EvalRankList(name){}
|
||||||
|
protected:
|
||||||
|
inline float CalcDCG( const std::vector< std::pair<float,unsigned> > &rec ) const {
|
||||||
|
double sumdcg = 0.0;
|
||||||
|
for( size_t i = 0; i < rec.size() && i < this->topn_; i ++ ){
|
||||||
|
const unsigned rel = rec[i].second;
|
||||||
|
if( rel != 0 ){
|
||||||
|
sumdcg += logf(2.0f) * ((1<<rel)-1) / logf( i + 2 );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return static_cast<float>(sumdcg);
|
||||||
|
}
|
||||||
|
virtual float EvalMetric( std::vector< std::pair<float, unsigned> > &rec ) const {
|
||||||
|
std::sort(rec.begin(), rec.end(), CmpSecond);
|
||||||
|
float idcg = this->CalcDCG(rec);
|
||||||
|
std::sort(rec.begin(), rec.end(), CmpFirst);
|
||||||
|
float dcg = this->CalcDCG(rec);
|
||||||
|
if( idcg == 0.0f ) return 0.0f;
|
||||||
|
else return dcg/idcg;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/*! \brief Precison at N, for both classification and rank */
|
||||||
|
struct EvalMAP : public EvalRankList{
|
||||||
|
public:
|
||||||
|
EvalMAP(const char *name):EvalRankList(name){}
|
||||||
|
protected:
|
||||||
|
virtual float EvalMetric( std::vector< std::pair<float, unsigned> > &rec ) const {
|
||||||
|
std::sort(rec.begin(), rec.end(), CmpFirst);
|
||||||
|
unsigned nhits = 0;
|
||||||
|
double sumap = 0.0;
|
||||||
|
for( size_t i = 0; i < rec.size(); ++i){
|
||||||
|
if( rec[i].second != 0 ){
|
||||||
|
nhits += 1;
|
||||||
|
if( i < this->topn_ ){
|
||||||
|
sumap += static_cast<float>(nhits) / (i+1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (nhits != 0) sumap /= nhits;
|
||||||
|
return static_cast<float>(sumap);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace regrank{
|
||||||
|
/*! \brief a set of evaluators */
|
||||||
|
struct EvalSet{
|
||||||
|
public:
|
||||||
|
inline void AddEval(const char *name){
|
||||||
|
for (size_t i = 0; i < evals_.size(); ++i){
|
||||||
|
if (!strcmp(name, evals_[i]->Name())) return;
|
||||||
|
}
|
||||||
|
if (!strcmp(name, "rmse")) evals_.push_back(new EvalRMSE());
|
||||||
|
if (!strcmp(name, "error")) evals_.push_back(new EvalError());
|
||||||
|
if (!strcmp(name, "merror")) evals_.push_back(new EvalMatchError());
|
||||||
|
if (!strcmp(name, "logloss")) evals_.push_back(new EvalLogLoss());
|
||||||
|
if (!strcmp(name, "auc")) evals_.push_back(new EvalAuc());
|
||||||
|
if (!strncmp(name, "ams@",4)) evals_.push_back(new EvalAMS(name));
|
||||||
|
if (!strncmp(name, "pre@", 4)) evals_.push_back(new EvalPrecision(name));
|
||||||
|
if (!strncmp(name, "map", 3)) evals_.push_back(new EvalMAP(name));
|
||||||
|
if (!strncmp(name, "ndcg", 3)) evals_.push_back(new EvalNDCG(name));
|
||||||
|
}
|
||||||
|
~EvalSet(){
|
||||||
|
for (size_t i = 0; i < evals_.size(); ++i){
|
||||||
|
delete evals_[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
inline void Eval(FILE *fo, const char *evname,
|
||||||
|
const std::vector<float> &preds,
|
||||||
|
const DMatrix::Info &info) const{
|
||||||
|
for (size_t i = 0; i < evals_.size(); ++i){
|
||||||
|
float res = evals_[i]->Eval(preds, info);
|
||||||
|
fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
std::vector<const IEvaluator*> evals_;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
#endif
|
||||||
303
regrank/xgboost_regrank_main.cpp
Normal file
303
regrank/xgboost_regrank_main.cpp
Normal file
@@ -0,0 +1,303 @@
|
|||||||
|
#define _CRT_SECURE_NO_WARNINGS
|
||||||
|
#define _CRT_SECURE_NO_DEPRECATE
|
||||||
|
|
||||||
|
#include <ctime>
|
||||||
|
#include <string>
|
||||||
|
#include <cstring>
|
||||||
|
#include "xgboost_regrank.h"
|
||||||
|
#include "../utils/xgboost_fmap.h"
|
||||||
|
#include "../utils/xgboost_random.h"
|
||||||
|
#include "../utils/xgboost_config.h"
|
||||||
|
|
||||||
|
namespace xgboost{
|
||||||
|
namespace regrank{
|
||||||
|
/*!
 * \brief wrapping the training process of the gradient boosting regression model,
 *   given the configuration
 * \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.chen@gmail.com
 */
class RegBoostTask{
public:
    /*!
     * \brief entry point: load configuration, then dispatch on the "task" parameter
     * \param argc argument count; argv[1] is the configuration file,
     *        argv[2..] are extra "name=value" overrides
     * \return process exit code (always 0 once arguments are accepted)
     */
    inline int Run(int argc, char *argv[]){
        if (argc < 2){
            printf("Usage: <config>\n");
            return 0;
        }
        // parameters from the config file are applied first ...
        utils::ConfigIterator itr(argv[1]);
        while (itr.Next()){
            this->SetParam(itr.name(), itr.val());
        }
        // ... then command-line "name=value" pairs override them
        for (int i = 2; i < argc; i++){
            char name[256], val[256];
            if (sscanf(argv[i], "%[^=]=%s", name, val) == 2){
                this->SetParam(name, val);
            }
        }
        this->InitData();
        this->InitLearner();
        if (task == "dump"){
            this->TaskDump();
            return 0;
        }
        if (task == "interact"){
            this->TaskInteractive(); return 0;
        }
        if (task == "dumppath"){
            this->TaskDumpPath(); return 0;
        }
        if (task == "eval"){
            this->TaskEval(); return 0;
        }
        if (task == "pred"){
            this->TaskPred();
        }
        else{
            // any other task value (default "train") trains the model
            this->TaskTrain();
        }
        return 0;
    }
    /*!
     * \brief set a single configuration parameter; every pair is also recorded
     *        in cfg and replayed to the learner in InitLearner()
     * \param name parameter name
     * \param val parameter value
     */
    inline void SetParam(const char *name, const char *val){
        if (!strcmp("silent", name)) silent = atoi(val);
        if (!strcmp("use_buffer", name)) use_buffer = atoi(val);
        if (!strcmp("seed", name)) random::Seed(atoi(val));
        if (!strcmp("num_round", name)) num_round = atoi(val);
        if (!strcmp("save_period", name)) save_period = atoi(val);
        if (!strcmp("eval_train", name)) eval_train = atoi(val);
        if (!strcmp("task", name)) task = val;
        if (!strcmp("data", name)) train_path = val;
        if (!strcmp("test:data", name)) test_path = val;
        if (!strcmp("model_in", name)) model_in = val;
        if (!strcmp("model_out", name)) model_out = val;
        if (!strcmp("model_dir", name)) model_dir_path = val;
        if (!strcmp("fmap", name)) name_fmap = val;
        if (!strcmp("name_dump", name)) name_dump = val;
        if (!strcmp("name_dumppath", name)) name_dumppath = val;
        if (!strcmp("name_pred", name)) name_pred = val;
        if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
        if (!strcmp("interact:action", name)) interact_action = val;
        // "batch:xxx" parameters are stored separately and replayed in TaskInteractive
        if (!strncmp("batch:", name, 6)){
            cfg_batch.PushBack(name + 6, val);
        }
        // "eval[tag]=path" registers an evaluation dataset named "tag"
        if (!strncmp("eval[", name, 5)) {
            char evname[256];
            utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display");
            eval_data_names.push_back(std::string(evname));
            eval_data_paths.push_back(std::string(val));
        }
        cfg.PushBack(name, val);
    }
public:
    /*! \brief set the default parameter values; "NULL" marks string options as unset */
    RegBoostTask(void){
        // default parameters
        silent = 0;
        use_buffer = 1;
        num_round = 10;
        save_period = 0;
        eval_train = 0;
        dump_model_stats = 0;
        task = "train";
        model_in = "NULL";
        model_out = "NULL";
        name_fmap = "NULL";
        name_pred = "pred.txt";
        name_dump = "dump.txt";
        name_dumppath = "dump.path.txt";
        model_dir_path = "./";
        interact_action = "update";
    }
    /*! \brief release the evaluation matrices allocated in InitData() */
    ~RegBoostTask(void){
        for (size_t i = 0; i < deval.size(); i++){
            delete deval[i];
        }
    }
private:
    /*!
     * \brief load the data needed by the current task: the test set for
     *        pred/dumppath, the training set plus all eval[...] sets otherwise
     */
    inline void InitData(void){
        if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str());
        if (task == "dump") return;
        if (task == "pred" || task == "dumppath"){
            data.CacheLoad(test_path.c_str(), silent != 0, use_buffer != 0);
        }
        else{
            // training
            data.CacheLoad(train_path.c_str(), silent != 0, use_buffer != 0);
            utils::Assert(eval_data_names.size() == eval_data_paths.size());
            for (size_t i = 0; i < eval_data_names.size(); ++i){
                deval.push_back(new DMatrix());
                deval.back()->CacheLoad(eval_data_paths[i].c_str(), silent != 0, use_buffer != 0);
                devalall.push_back(deval.back());
            }
            std::vector<DMatrix *> dcache(1, &data);
            for( size_t i = 0; i < deval.size(); ++ i){
                dcache.push_back( deval[i] );
            }
            // set cache data to be all training and evaluation data
            learner.SetCacheData(dcache);

            // add training set to evaluation set if needed
            if( eval_train != 0 ){
                devalall.push_back( &data );
                eval_data_names.push_back( std::string("train") );
            }

        }
    }
    /*!
     * \brief forward all recorded parameters to the learner, then either load
     *        an existing model (model_in) or initialize a fresh one (train only)
     */
    inline void InitLearner(void){
        cfg.BeforeFirst();
        while (cfg.Next()){
            learner.SetParam(cfg.name(), cfg.val());
        }
        if (model_in != "NULL"){
            utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb"));
            learner.LoadModel(fi);
            fi.Close();
        }
        else{
            utils::Assert(task == "train", "model_in not specified");
            learner.InitModel();
        }
        learner.InitTrainer();
    }
    /*!
     * \brief run num_round boosting iterations, evaluating after each one and
     *        saving snapshots every save_period rounds; model_out=="NONE"
     *        suppresses the final save, model_out=="NULL" uses a numbered name
     */
    inline void TaskTrain(void){
        const time_t start = time(NULL);
        unsigned long elapsed = 0;
        for (int i = 0; i < num_round; ++i){
            elapsed = (unsigned long)(time(NULL) - start);
            if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
            learner.UpdateOneIter(data);
            learner.EvalOneIter(i, devalall, eval_data_names);
            if (save_period != 0 && (i + 1) % save_period == 0){
                this->SaveModel(i);
            }
            elapsed = (unsigned long)(time(NULL) - start);
        }
        // always save final round
        if ((save_period == 0 || num_round % save_period != 0) && model_out != "NONE"){
            if (model_out == "NULL"){
                this->SaveModel(num_round - 1);
            }
            else{
                this->SaveModel(model_out.c_str());
            }
        }
        if (!silent){
            printf("\nupdating end, %lu sec in all\n", elapsed);
        }
    }
    /*! \brief evaluate the loaded model once over all registered eval sets */
    inline void TaskEval(void){
        learner.EvalOneIter(0, devalall, eval_data_names);
    }
    /*!
     * \brief interactive update: replay batch:* parameters, running one
     *        UpdateInteract per "run" entry (or once if none), then save
     */
    inline void TaskInteractive(void){
        const time_t start = time(NULL);
        unsigned long elapsed = 0;
        int batch_action = 0;

        cfg_batch.BeforeFirst();
        while (cfg_batch.Next()){
            if (!strcmp(cfg_batch.name(), "run")){
                learner.UpdateInteract(interact_action, data);
                batch_action += 1;
            }
            else{
                learner.SetParam(cfg_batch.name(), cfg_batch.val());
            }
        }

        if (batch_action == 0){
            learner.UpdateInteract(interact_action, data);
        }
        utils::Assert(model_out != "NULL", "interactive mode must specify model_out");
        this->SaveModel(model_out.c_str());
        elapsed = (unsigned long)(time(NULL) - start);

        if (!silent){
            printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed);
        }
    }

    /*! \brief dump the model in readable text form to name_dump */
    inline void TaskDump(void){
        FILE *fo = utils::FopenCheck(name_dump.c_str(), "w");
        learner.DumpModel(fo, fmap, dump_model_stats != 0);
        fclose(fo);
    }
    /*! \brief dump the prediction paths of the test data to name_dumppath */
    inline void TaskDumpPath(void){
        FILE *fo = utils::FopenCheck(name_dumppath.c_str(), "w");
        learner.DumpPath(fo, data);
        fclose(fo);
    }
    /*! \brief save the current model to an explicitly named binary file */
    inline void SaveModel(const char *fname) const{
        utils::FileStream fo(utils::FopenCheck(fname, "wb"));
        learner.SaveModel(fo);
        fo.Close();
    }
    /*! \brief save the model for round i as "<model_dir>/<i+1 padded>.model" */
    inline void SaveModel(int i) const{
        char fname[256];
        sprintf(fname, "%s/%04d.model", model_dir_path.c_str(), i + 1);
        this->SaveModel(fname);
    }
    /*! \brief predict on the test data and write one value per line to name_pred */
    inline void TaskPred(void){
        std::vector<float> preds;
        if (!silent) printf("start prediction...\n");
        learner.Predict(preds, data);
        if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
        FILE *fo = utils::FopenCheck(name_pred.c_str(), "w");
        for (size_t i = 0; i < preds.size(); i++){
            fprintf(fo, "%f\n", preds[i]);
        }
        fclose(fo);
    }
private:
    /* \brief whether silent */
    int silent;
    /* \brief whether use auto binary buffer */
    int use_buffer;
    /* \brief whether evaluate training statistics */
    int eval_train;
    /* \brief number of boosting iterations */
    int num_round;
    /* \brief the period to save the model, 0 means only save the final round model */
    int save_period;
    /*! \brief interact action */
    std::string interact_action;
    /* \brief the path of training/test data set */
    std::string train_path, test_path;
    /* \brief the path of test model file, or file to restart training */
    std::string model_in;
    /* \brief the path of final model file, to be saved */
    std::string model_out;
    /* \brief the path of directory containing the saved models */
    std::string model_dir_path;
    /* \brief task to perform */
    std::string task;
    /* \brief name of predict file */
    std::string name_pred;
    /* \brief whether dump statistics along with model */
    int dump_model_stats;
    /* \brief name of feature map */
    std::string name_fmap;
    /* \brief name of dump file */
    std::string name_dump;
    /* \brief name of dump path file */
    std::string name_dumppath;
    /* \brief the paths of validation data sets */
    std::vector<std::string> eval_data_paths;
    /* \brief the names of the evaluation data used in output log */
    std::vector<std::string> eval_data_names;
    /*! \brief saves configurations */
    utils::ConfigSaver cfg;
    /*! \brief batch configurations */
    utils::ConfigSaver cfg_batch;
private:
    // training/prediction data and owned evaluation matrices
    DMatrix data;
    std::vector<DMatrix*> deval;
    // evaluation view: deval plus (optionally) the training data itself
    std::vector<const DMatrix*> devalall;
    utils::FeatMap fmap;
    RegRankBoostLearner learner;
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
int main( int argc, char *argv[] ){
|
||||||
|
xgboost::random::Seed( 0 );
|
||||||
|
xgboost::regrank::RegBoostTask tsk;
|
||||||
|
return tsk.Run( argc, argv );
|
||||||
|
}
|
||||||
131
regrank/xgboost_regrank_obj.h
Normal file
131
regrank/xgboost_regrank_obj.h
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
#ifndef XGBOOST_REGRANK_OBJ_H
|
||||||
|
#define XGBOOST_REGRANK_OBJ_H
|
||||||
|
/*!
|
||||||
|
* \file xgboost_regrank_obj.h
|
||||||
|
* \brief defines objective function interface used in xgboost for regression and rank
|
||||||
|
* \author Tianqi Chen, Kailong Chen
|
||||||
|
*/
|
||||||
|
#include "xgboost_regrank_data.h"
|
||||||
|
|
||||||
|
namespace xgboost{
|
||||||
|
namespace regrank{
|
||||||
|
/*! \brief interface of objective function */
class IObjFunction{
public:
    /*! \brief virtual destructor */
    virtual ~IObjFunction(void){}
    /*!
     * \brief set parameters from outside
     * \param name name of the parameter
     * \param val value of the parameter
     */
    virtual void SetParam(const char *name, const char *val) = 0;

    /*!
     * \brief get gradient over each of predictions, given existing information
     * \param preds prediction of current round
     * \param info information about labels, weights, groups in rank
     * \param iter current iteration number
     * \param grad gradient over each preds (output; implementations in this
     *        file resize it to preds.size())
     * \param hess second order gradient over each preds (output; resized the
     *        same way as grad)
     */
    virtual void GetGradient(const std::vector<float>& preds,
                             const DMatrix::Info &info,
                             int iter,
                             std::vector<float> &grad,
                             std::vector<float> &hess ) = 0;
    /*! \return the default evaluation metric for the problem */
    virtual const char* DefaultEvalMetric(void) = 0;
    /*!
     * \brief transform prediction values, this is only called when Prediction is called
     * \param preds prediction values, saves to this vector as well
     */
    virtual void PredTransform(std::vector<float> &preds){}
    /*!
     * \brief transform prediction values, this is only called when Eval is called, usually it redirect to PredTransform
     * \param preds prediction values, saves to this vector as well
     */
    virtual void EvalTransform(std::vector<float> &preds){ this->PredTransform(preds); }
};
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace regrank{
|
||||||
|
/*! \brief defines functions to calculate some commonly used functions */
struct LossType{
public:
    // supported loss types; loss_type below holds one of these constants
    const static int kLinearSquare = 0;
    const static int kLogisticNeglik = 1;
    const static int kLogisticClassify = 2;
    const static int kLogisticRaw = 3;
public:
    /*! \brief indicate which type we are using */
    int loss_type;
public:
    /*!
     * \brief transform the linear sum to prediction
     * \param x linear sum of boosting ensemble
     * \return transformed prediction
     */
    inline float PredTransform(float x){
        switch (loss_type){
        // raw logistic outputs the margin untransformed, like linear regression
        case kLogisticRaw:
        case kLinearSquare: return x;
        // both logistic variants output the sigmoid probability
        case kLogisticClassify:
        case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
        default: utils::Error("unknown loss_type"); return 0.0f;
        }
    }

    /*!
     * \brief calculate first order gradient of loss, given transformed prediction
     * \param predt transformed prediction
     * \param label true label
     * \return first order gradient
     */
    inline float FirstOrderGradient(float predt, float label) const{
        switch (loss_type){
        case kLinearSquare: return predt - label;
        // kLogisticRaw receives the raw margin, so apply the sigmoid first and
        // then deliberately fall through to the logistic gradient below
        case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
        case kLogisticClassify:
        case kLogisticNeglik: return predt - label;
        default: utils::Error("unknown loss_type"); return 0.0f;
        }
    }
    /*!
     * \brief calculate second order gradient of loss, given transformed prediction
     * \param predt transformed prediction
     * \param label true label
     * \return second order gradient
     */
    inline float SecondOrderGradient(float predt, float label) const{
        switch (loss_type){
        case kLinearSquare: return 1.0f;
        // intentional fall-through, same as FirstOrderGradient: sigmoid first,
        // then the shared logistic hessian p * (1 - p)
        case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
        case kLogisticClassify:
        case kLogisticNeglik: return predt * (1 - predt);
        default: utils::Error("unknown loss_type"); return 0.0f;
        }
    }
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
#include "xgboost_regrank_obj.hpp"
|
||||||
|
|
||||||
|
namespace xgboost{
|
||||||
|
namespace regrank{
|
||||||
|
/*!
 * \brief factory: create an objective function from its configuration name;
 *        calls utils::Error (and returns NULL) on an unknown name
 */
inline IObjFunction* CreateObjFunction( const char *name ){
    if( strcmp("reg:linear", name) == 0 ) return new RegressionObj( LossType::kLinearSquare );
    if( strcmp("reg:logistic", name) == 0 ) return new RegressionObj( LossType::kLogisticNeglik );
    if( strcmp("binary:logistic", name) == 0 ) return new RegressionObj( LossType::kLogisticClassify );
    if( strcmp("binary:logitraw", name) == 0 ) return new RegressionObj( LossType::kLogisticRaw );
    if( strcmp("multi:softmax", name) == 0 ) return new SoftmaxMultiClassObj(0);
    if( strcmp("multi:softprob", name) == 0 ) return new SoftmaxMultiClassObj(1);
    if( strcmp("rank:pairwise", name) == 0 ) return new PairwiseRankObj();
    if( strcmp("rank:softmax", name) == 0 ) return new SoftmaxRankObj();
    utils::Error("unknown objective function type");
    return NULL;
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
#endif
|
||||||
353
regrank/xgboost_regrank_obj.hpp
Normal file
353
regrank/xgboost_regrank_obj.hpp
Normal file
@@ -0,0 +1,353 @@
|
|||||||
|
#ifndef XGBOOST_REGRANK_OBJ_HPP
|
||||||
|
#define XGBOOST_REGRANK_OBJ_HPP
|
||||||
|
/*!
|
||||||
|
* \file xgboost_regrank_obj.hpp
|
||||||
|
* \brief implementation of objective functions
|
||||||
|
* \author Tianqi Chen, Kailong Chen
|
||||||
|
*/
|
||||||
|
//#include "xgboost_regrank_sample.h"
|
||||||
|
#include <vector>
|
||||||
|
#include <functional>
|
||||||
|
#include "xgboost_regrank_utils.h"
|
||||||
|
|
||||||
|
namespace xgboost{
|
||||||
|
namespace regrank{
|
||||||
|
/*! \brief regression (and binary classification) objective, parameterized by LossType */
class RegressionObj : public IObjFunction{
public:
    // \param loss_type one of the LossType::k* constants
    RegressionObj( int loss_type ){
        loss.loss_type = loss_type;
        scale_pos_weight = 1.0f;
    }
    virtual ~RegressionObj(){}
    virtual void SetParam(const char *name, const char *val){
        if( !strcmp( "loss_type", name ) ) loss.loss_type = atoi( val );
        // multiplies the weight of instances whose label equals exactly 1.0
        if( !strcmp( "scale_pos_weight", name ) ) scale_pos_weight = (float)atof( val );
    }
    // per-instance first/second order gradients, computed in parallel;
    // each instance is independent so the omp for needs no synchronization
    virtual void GetGradient(const std::vector<float>& preds,
                             const DMatrix::Info &info,
                             int iter,
                             std::vector<float> &grad,
                             std::vector<float> &hess ) {
        utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
        grad.resize(preds.size()); hess.resize(preds.size());

        const unsigned ndata = static_cast<unsigned>(preds.size());
        #pragma omp parallel for schedule( static )
        for (unsigned j = 0; j < ndata; ++j){
            float p = loss.PredTransform(preds[j]);
            float w = info.GetWeight(j);
            // up-weight positive instances when scale_pos_weight is set
            if( info.labels[j] == 1.0f ) w *= scale_pos_weight;
            grad[j] = loss.FirstOrderGradient(p, info.labels[j]) * w;
            hess[j] = loss.SecondOrderGradient(p, info.labels[j]) * w;
        }
    }
    virtual const char* DefaultEvalMetric(void) {
        if( loss.loss_type == LossType::kLogisticClassify ) return "error";
        if( loss.loss_type == LossType::kLogisticRaw ) return "auc";
        return "rmse";
    }
    // map raw boosting scores through the loss's output transform, in place
    virtual void PredTransform(std::vector<float> &preds){
        const unsigned ndata = static_cast<unsigned>(preds.size());
        #pragma omp parallel for schedule( static )
        for (unsigned j = 0; j < ndata; ++j){
            preds[j] = loss.PredTransform( preds[j] );
        }
    }
private:
    // extra weight multiplier applied to instances with label == 1.0
    float scale_pos_weight;
    // loss definition (transform + gradients)
    LossType loss;
};
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace regrank{
|
||||||
|
// simple softmax rank: treats each query group as one multinomial choice
class SoftmaxRankObj : public IObjFunction{
public:
    SoftmaxRankObj(void){
    }
    virtual ~SoftmaxRankObj(){}
    // no tunable parameters for this objective
    virtual void SetParam(const char *name, const char *val){
    }
    // softmax gradient per group; requires group_ptr and that each group has
    // exactly one positive label (a group with no positives gets zero gradients)
    virtual void GetGradient(const std::vector<float>& preds,
                             const DMatrix::Info &info,
                             int iter,
                             std::vector<float> &grad,
                             std::vector<float> &hess ) {
        utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
        grad.resize(preds.size()); hess.resize(preds.size());
        const std::vector<unsigned> &gptr = info.group_ptr;
        utils::Assert( gptr.size() != 0 && gptr.back() == preds.size(), "rank loss must have group file" );
        const unsigned ngroup = static_cast<unsigned>( gptr.size() - 1 );

        #pragma omp parallel
        {
            // thread-local score buffer, reused across groups
            std::vector< float > rec;
            #pragma omp for schedule(static)
            for (unsigned k = 0; k < ngroup; ++k){
                rec.clear();
                // nhit accumulates the (0/1) labels of the group; float labels
                // are truncated to int here
                int nhit = 0;
                for(unsigned j = gptr[k]; j < gptr[k+1]; ++j ){
                    rec.push_back( preds[j] );
                    grad[j] = hess[j] = 0.0f;
                    nhit += info.labels[j];
                }
                Softmax( rec );
                if( nhit == 1 ){
                    for(unsigned j = gptr[k]; j < gptr[k+1]; ++j ){
                        float p = rec[ j - gptr[k] ];
                        grad[j] = p - info.labels[j];
                        // hessian scaled by 2 (conservative update, as in the
                        // multiclass softmax objective below)
                        hess[j] = 2.0f * p * ( 1.0f - p );
                    }
                }else{
                    utils::Assert( nhit == 0, "softmax does not allow multiple labels" );
                }
            }
        }
    }
    virtual const char* DefaultEvalMetric(void) {
        return "pre@1";
    }
};
|
||||||
|
|
||||||
|
// simple softmax multi-class classification
// predictions are laid out class-major: score of class k for instance j is
// preds[j + k * ndata]
class SoftmaxMultiClassObj : public IObjFunction{
public:
    // \param output_prob nonzero: PredTransform outputs per-class probabilities;
    //        zero: it outputs the argmax class index
    SoftmaxMultiClassObj(int output_prob):output_prob(output_prob){
        nclass = 0;
    }
    virtual ~SoftmaxMultiClassObj(){}
    virtual void SetParam(const char *name, const char *val){
        // num_class must be set before GetGradient/Transform are called
        if( !strcmp( "num_class", name ) ) nclass = atoi(val);
    }
    virtual void GetGradient(const std::vector<float>& preds,
                             const DMatrix::Info &info,
                             int iter,
                             std::vector<float> &grad,
                             std::vector<float> &hess ) {
        utils::Assert( nclass != 0, "must set num_class to use softmax" );
        utils::Assert( preds.size() == (size_t)nclass * info.labels.size(), "SoftmaxMultiClassObj: label size and pred size does not match" );
        grad.resize(preds.size()); hess.resize(preds.size());

        const unsigned ndata = static_cast<unsigned>(info.labels.size());
        #pragma omp parallel
        {
            // thread-local score buffer holding one instance's class scores
            std::vector<float> rec(nclass);
            #pragma omp for schedule(static)
            for (unsigned j = 0; j < ndata; ++j){
                for( int k = 0; k < nclass; ++ k ){
                    rec[k] = preds[j + k * ndata];
                }
                Softmax( rec );
                int label = static_cast<int>(info.labels[j]);
                // negative labels are interpreted as encoded (-label - 1)
                if( label < 0 ){
                    label = -label - 1;
                }
                utils::Assert( label < nclass, "SoftmaxMultiClassObj: label exceed num_class" );
                for( int k = 0; k < nclass; ++ k ){
                    float p = rec[ k ];
                    if( label == k ){
                        grad[j+k*ndata] = p - 1.0f;
                    }else{
                        grad[j+k*ndata] = p;
                    }
                    // hessian scaled by 2 for a conservative update
                    hess[j+k*ndata] = 2.0f * p * ( 1.0f - p );
                }
            }
        }
    }
    virtual void PredTransform(std::vector<float> &preds){
        this->Transform(preds, output_prob);
    }
    // evaluation always uses class indices, regardless of output_prob
    virtual void EvalTransform(std::vector<float> &preds){
        this->Transform(preds, 0);
    }
private:
    // \param prob nonzero: replace scores with softmax probabilities in place;
    //        zero: write the argmax class index into preds[j] and shrink preds
    //        to ndata entries
    inline void Transform(std::vector<float> &preds, int prob){
        utils::Assert( nclass != 0, "must set num_class to use softmax" );
        utils::Assert( preds.size() % nclass == 0, "SoftmaxMultiClassObj: label size and pred size does not match" );
        const unsigned ndata = static_cast<unsigned>(preds.size()/nclass);

        #pragma omp parallel
        {
            std::vector<float> rec(nclass);
            #pragma omp for schedule(static)
            for (unsigned j = 0; j < ndata; ++j){
                for( int k = 0; k < nclass; ++ k ){
                    rec[k] = preds[j + k * ndata];
                }
                if( prob == 0 ){
                    // each thread writes only preds[j] for its own j, after
                    // reading that slot into rec, so there is no data race
                    preds[j] = FindMaxIndex( rec );
                }else{
                    Softmax( rec );
                    for( int k = 0; k < nclass; ++ k ){
                        preds[j + k * ndata] = rec[k];
                    }
                }
            }
        }
        if( prob == 0 ){
            preds.resize( ndata );
        }
    }
    // NOTE(review): declared in a private section, but virtual dispatch through
    // the IObjFunction base still reaches it (access is checked statically)
    virtual const char* DefaultEvalMetric(void) {
        return "merror";
    }
private:
    // number of classes; must be configured via "num_class"
    int nclass;
    // see constructor: controls PredTransform output format
    int output_prob;
};
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
namespace regrank{
|
||||||
|
/*! \brief objective for lambda rank */
|
||||||
|
class LambdaRankObj : public IObjFunction{
|
||||||
|
public:
|
||||||
|
LambdaRankObj(void){
|
||||||
|
loss.loss_type = LossType::kLogisticRaw;
|
||||||
|
fix_list_weight = 0.0f;
|
||||||
|
num_pairsample = 1;
|
||||||
|
}
|
||||||
|
virtual ~LambdaRankObj(){}
|
||||||
|
virtual void SetParam(const char *name, const char *val){
|
||||||
|
if( !strcmp( "loss_type", name ) ) loss.loss_type = atoi( val );
|
||||||
|
if( !strcmp( "fix_list_weight", name ) ) fix_list_weight = (float)atof( val );
|
||||||
|
if( !strcmp( "num_pairsample", name ) ) num_pairsample = atoi( val );
|
||||||
|
}
|
||||||
|
public:
|
||||||
|
virtual void GetGradient(const std::vector<float>& preds,
|
||||||
|
const DMatrix::Info &info,
|
||||||
|
int iter,
|
||||||
|
std::vector<float> &grad,
|
||||||
|
std::vector<float> &hess ) {
|
||||||
|
utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
|
||||||
|
grad.resize(preds.size()); hess.resize(preds.size());
|
||||||
|
const std::vector<unsigned> &gptr = info.group_ptr;
|
||||||
|
utils::Assert( gptr.size() != 0 && gptr.back() == preds.size(), "rank loss must have group file" );
|
||||||
|
const unsigned ngroup = static_cast<unsigned>( gptr.size() - 1 );
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
// parall construct, declare random number generator here, so that each
|
||||||
|
// thread use its own random number generator, seed by thread id and current iteration
|
||||||
|
random::Random rnd; rnd.Seed( iter * 1111 + omp_get_thread_num() );
|
||||||
|
std::vector<LambdaPair> pairs;
|
||||||
|
std::vector<ListEntry> lst;
|
||||||
|
std::vector< std::pair<float,unsigned> > rec;
|
||||||
|
|
||||||
|
#pragma omp for schedule(static)
|
||||||
|
for (unsigned k = 0; k < ngroup; ++k){
|
||||||
|
lst.clear(); pairs.clear();
|
||||||
|
for(unsigned j = gptr[k]; j < gptr[k+1]; ++j ){
|
||||||
|
lst.push_back( ListEntry(preds[j], info.labels[j], j ) );
|
||||||
|
grad[j] = hess[j] = 0.0f;
|
||||||
|
}
|
||||||
|
std::sort( lst.begin(), lst.end(), ListEntry::CmpPred );
|
||||||
|
rec.resize( lst.size() );
|
||||||
|
for( unsigned i = 0; i < lst.size(); ++i ){
|
||||||
|
rec[i] = std::make_pair( lst[i].label, i );
|
||||||
|
}
|
||||||
|
std::sort( rec.begin(), rec.end(), CmpFirst );
|
||||||
|
// enumerate buckets with same label, for each item in the lst, grab another sample randomly
|
||||||
|
for( unsigned i = 0; i < rec.size(); ){
|
||||||
|
unsigned j = i + 1;
|
||||||
|
while( j < rec.size() && rec[j].first == rec[i].first ) ++ j;
|
||||||
|
// bucket in [i,j), get a sample outside bucket
|
||||||
|
unsigned nleft = i, nright = rec.size() - j;
|
||||||
|
if( nleft + nright != 0 ){
|
||||||
|
int nsample = num_pairsample;
|
||||||
|
while( nsample -- ){
|
||||||
|
for( unsigned pid = i; pid < j; ++ pid ){
|
||||||
|
unsigned ridx = static_cast<unsigned>( rnd.RandDouble() * (nleft+nright) );
|
||||||
|
if( ridx < nleft ){
|
||||||
|
pairs.push_back( LambdaPair( rec[ridx].second, rec[pid].second ) );
|
||||||
|
}else{
|
||||||
|
pairs.push_back( LambdaPair( rec[pid].second, rec[ridx+j-i].second ) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i = j;
|
||||||
|
}
|
||||||
|
// get lambda weight for the pairs
|
||||||
|
this->GetLambdaWeight( lst, pairs );
|
||||||
|
// rescale each gradient and hessian so that the lst have constant weighted
|
||||||
|
float scale = 1.0f / num_pairsample;
|
||||||
|
if( fix_list_weight != 0.0f ){
|
||||||
|
scale *= fix_list_weight / (gptr[k+1] - gptr[k]);
|
||||||
|
}
|
||||||
|
for( size_t i = 0; i < pairs.size(); ++ i ){
|
||||||
|
const ListEntry &pos = lst[ pairs[i].pos_index ];
|
||||||
|
const ListEntry &neg = lst[ pairs[i].neg_index ];
|
||||||
|
const float w = pairs[i].weight * scale;
|
||||||
|
float p = loss.PredTransform( pos.pred - neg.pred );
|
||||||
|
float g = loss.FirstOrderGradient( p, 1.0f );
|
||||||
|
float h = loss.SecondOrderGradient( p, 1.0f );
|
||||||
|
// accumulate gradient and hessian in both pid, and nid,
|
||||||
|
grad[ pos.rindex ] += g * w;
|
||||||
|
grad[ neg.rindex ] -= g * w;
|
||||||
|
// take conservative update, scale hessian by 2
|
||||||
|
hess[ pos.rindex ] += 2.0f * h * w;
|
||||||
|
hess[ neg.rindex ] += 2.0f * h * w;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// default evaluation metric for ranking objectives: mean average precision
virtual const char* DefaultEvalMetric(void) {
    return "map";
}
|
||||||
|
private:
|
||||||
|
// loss function
|
||||||
|
LossType loss;
|
||||||
|
// number of samples peformed for each instance
|
||||||
|
int num_pairsample;
|
||||||
|
// fix weight of each elements in list
|
||||||
|
float fix_list_weight;
|
||||||
|
protected:
|
||||||
|
/*! \brief helper entry describing one instance inside a ranking list */
struct ListEntry{
    /*! \brief predicted score of this instance */
    float pred;
    /*! \brief ground-truth relevance label of this instance */
    float label;
    /*! \brief row index of this instance in the data matrix, used to write gradients back */
    unsigned rindex;
    // constructor
    ListEntry(float pred, float label, unsigned rindex): pred(pred),label(label),rindex(rindex){}
    // comparator: order entries by prediction score, descending
    inline static bool CmpPred(const ListEntry &a, const ListEntry &b){
        return a.pred > b.pred;
    }
    // comparator: order entries by label, descending
    inline static bool CmpLabel(const ListEntry &a, const ListEntry &b){
        return a.label > b.label;
    }
};
|
||||||
|
/*! \brief a (positive, negative) pair in the lambda rank objective */
struct LambdaPair{
    /*! \brief positive index: position in the sorted list of the higher-ranked item */
    unsigned pos_index;
    /*! \brief negative index: position in the sorted list of the lower-ranked item */
    unsigned neg_index;
    /*! \brief pair weight, defaults to 1 and is filled in later by GetLambdaWeight */
    float weight;
    // constructor: weight starts at 1.0f so unweighted objectives need no extra pass
    LambdaPair( unsigned pos_index, unsigned neg_index ):pos_index(pos_index),neg_index(neg_index),weight(1.0f){}
};
|
||||||
|
/*!
|
||||||
|
* \brief get lambda weight for existing pairs
|
||||||
|
* \param list a list that is sorted by pred score
|
||||||
|
* \param pairs record of pairs, containing the pairs to fill in weights
|
||||||
|
*/
|
||||||
|
virtual void GetLambdaWeight( const std::vector<ListEntry> &sorted_list, std::vector<LambdaPair> &pairs ) = 0;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace regrank{
|
||||||
|
/*! \brief plain pairwise ranking objective */
class PairwiseRankObj: public LambdaRankObj{
public:
    virtual ~PairwiseRankObj(void){}
    // intentionally empty: every LambdaPair keeps its constructor weight (1.0f),
    // which reduces LambdaRank to an unweighted pairwise logistic ranking loss
    virtual void GetLambdaWeight( const std::vector<ListEntry> &sorted_list, std::vector<LambdaPair> &pairs ){}
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
#endif
|
||||||
45
regrank/xgboost_regrank_utils.h
Normal file
45
regrank/xgboost_regrank_utils.h
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
#ifndef XGBOOST_REGRANK_UTILS_H
|
||||||
|
#define XGBOOST_REGRANK_UTILS_H
|
||||||
|
/*!
|
||||||
|
* \file xgboost_regrank_utils.h
|
||||||
|
* \brief useful helper functions
|
||||||
|
* \author Tianqi Chen, Kailong Chen
|
||||||
|
*/
|
||||||
|
namespace xgboost{
|
||||||
|
namespace regrank{
|
||||||
|
/*!
 * \brief in-place softmax transform: rec[i] <- exp(rec[i]) / sum_j exp(rec[j])
 *        subtracts the maximum element before exponentiation for numerical stability
 * \param rec vector of raw scores, overwritten with the resulting probabilities;
 *            a no-op when the vector is empty
 */
inline static void Softmax( std::vector<float>& rec ){
    // BUGFIX: guard the empty case — rec[0] below would be undefined behaviour
    if( rec.empty() ) return;
    float wmax = rec[0];
    for( size_t i = 1; i < rec.size(); ++ i ){
        wmax = std::max( rec[i], wmax );
    }
    // accumulate the normalizer in double to reduce rounding error on long vectors
    double wsum = 0.0;
    for( size_t i = 0; i < rec.size(); ++ i ){
        rec[i] = expf( rec[i] - wmax );
        wsum += rec[i];
    }
    for( size_t i = 0; i < rec.size(); ++ i ){
        rec[i] /= static_cast<float>( wsum );
    }
}
|
||||||
|
/*!
 * \brief locate the index of the largest element of rec
 *        a candidate must exceed the current best by more than 1e-6 to replace it,
 *        so near-ties resolve to the earliest index (0 for an empty vector)
 */
inline static int FindMaxIndex( std::vector<float>& rec ){
    size_t best = 0;
    for( size_t pos = 1; pos < rec.size(); ++ pos ){
        if( rec[pos] > rec[best] + 1e-6f ) best = pos;
    }
    return static_cast<int>( best );
}
|
||||||
|
// order (score, index) records by score, largest first
inline static bool CmpFirst(const std::pair<float, unsigned> &a, const std::pair<float, unsigned> &b){
    return b.first < a.first;
}
|
||||||
|
// order (score, index) records by index, largest first
inline static bool CmpSecond(const std::pair<float, unsigned> &a, const std::pair<float, unsigned> &b){
    return b.second < a.second;
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
@@ -1,403 +0,0 @@
|
|||||||
#ifndef XGBOOST_REG_H
|
|
||||||
#define XGBOOST_REG_H
|
|
||||||
/*!
|
|
||||||
* \file xgboost_reg.h
|
|
||||||
* \brief class for gradient boosted regression
|
|
||||||
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
|
||||||
*/
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cstring>
|
|
||||||
#include "xgboost_reg_data.h"
|
|
||||||
#include "xgboost_reg_eval.h"
|
|
||||||
#include "../utils/xgboost_omp.h"
|
|
||||||
#include "../booster/xgboost_gbmbase.h"
|
|
||||||
#include "../utils/xgboost_utils.h"
|
|
||||||
#include "../utils/xgboost_stream.h"
|
|
||||||
|
|
||||||
namespace xgboost{
|
|
||||||
namespace regression{
|
|
||||||
/*! \brief class for gradient boosted regression */
class RegBoostLearner{
public:
    /*! \brief default constructor; data must be attached later via SetData */
    RegBoostLearner( void ){
        silent = 0;
    }
    /*!
     * \brief a regression booster associated with training and evaluating data
     * \param train pointer to the training data
     * \param evals array of evaluating data
     * \param evname name of evaluation data, used to print statistics
     */
    RegBoostLearner( const DMatrix *train,
                     const std::vector<DMatrix *> &evals,
                     const std::vector<std::string> &evname ){
        silent = 0;
        this->SetData(train,evals,evname);
    }

    /*!
     * \brief associate regression booster with training and evaluating data
     * \param train pointer to the training data
     * \param evals array of evaluating data
     * \param evname name of evaluation data, used to print statistics
     */
    inline void SetData( const DMatrix *train,
                         const std::vector<DMatrix *> &evals,
                         const std::vector<std::string> &evname ){
        this->train_ = train;
        this->evals_ = evals;
        this->evname_ = evname;
        // estimate feature bound across train and all evaluation sets
        int num_feature = (int)(train->data.NumCol());
        // prediction buffer has one slot per instance across all data sets
        unsigned buffer_size = static_cast<unsigned>( train->Size() );

        for( size_t i = 0; i < evals.size(); ++ i ){
            buffer_size += static_cast<unsigned>( evals[i]->Size() );
            num_feature = std::max( num_feature, (int)(evals[i]->data.NumCol()) );
        }

        char str_temp[25];
        // only grow the recorded feature count, never shrink a loaded model's
        if( num_feature > mparam.num_feature ){
            mparam.num_feature = num_feature;
            sprintf( str_temp, "%d", num_feature );
            base_gbm.SetParam( "bst:num_feature", str_temp );
        }

        sprintf( str_temp, "%u", buffer_size );
        base_gbm.SetParam( "num_pbuffer", str_temp );
        if( !silent ){
            printf( "buffer_size=%u\n", buffer_size );
        }

        // set eval_preds tmp space: one prediction vector per evaluation set
        this->eval_preds_.resize( evals.size(), std::vector<float>() );
    }
    /*!
     * \brief set parameters from outside
     * \param name name of the parameter
     * \param val value of the parameter
     */
    inline void SetParam( const char *name, const char *val ){
        if( !strcmp( name, "silent") ) silent = atoi( val );
        if( !strcmp( name, "eval_metric") ) evaluator_.AddEval( val );
        // forward every parameter to the model param and the base booster too
        mparam.SetParam( name, val );
        base_gbm.SetParam( name, val );
    }
    /*!
     * \brief initialize solver before training, called before training
     * this function is reserved for solver to allocate necessary space and do other preparation
     */
    inline void InitTrainer( void ){
        base_gbm.InitTrainer();
        // register a default metric matching the configured loss
        if( mparam.loss_type == kLogisticClassify ){
            evaluator_.AddEval( "error" );
        }else{
            evaluator_.AddEval( "rmse" );
        }
        evaluator_.Init();
    }
    /*!
     * \brief initialize the current data storage for model, if the model is used first time, call this function
     */
    inline void InitModel( void ){
        base_gbm.InitModel();
        mparam.AdjustBase();
    }
    /*!
     * \brief load model from stream
     * \param fi input stream
     */
    inline void LoadModel( utils::IStream &fi ){
        base_gbm.LoadModel( fi );
        utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 );
    }
    /*!
     * \brief DumpModel
     * \param fo text file
     * \param fmap feature map that may help give interpretations of feature
     * \param with_stats whether print statistics as well
     */
    inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
        base_gbm.DumpModel( fo, fmap, with_stats );
    }
    /*!
     * \brief Dump path of all trees
     * \param fo text file
     * \param data input data
     */
    inline void DumpPath( FILE *fo, const DMatrix &data ){
        base_gbm.DumpPath( fo, data.data );
    }
    /*!
     * \brief save model to stream
     * \param fo output stream
     */
    inline void SaveModel( utils::IStream &fo ) const{
        base_gbm.SaveModel( fo );
        fo.Write( &mparam, sizeof(ModelParam) );
    }
    /*!
     * \brief update the model for one iteration
     * \param iter iteration number (currently unused by the update itself)
     */
    inline void UpdateOneIter( int iter ){
        // predict -> gradient/hessian -> boost one round
        this->PredictBuffer( preds_, *train_, 0 );
        this->GetGradient( preds_, train_->labels, grad_, hess_ );
        std::vector<unsigned> root_index;
        base_gbm.DoBoost( grad_, hess_, train_->data, root_index );
    }
    /*!
     * \brief evaluate the model for specific iteration
     * \param iter iteration number
     * \param fo file to output log
     */
    inline void EvalOneIter( int iter, FILE *fo = stderr ){
        fprintf( fo, "[%d]", iter );
        // eval-set buffer slots start right after the training slots
        int buffer_offset = static_cast<int>( train_->Size() );

        for( size_t i = 0; i < evals_.size(); ++i ){
            std::vector<float> &preds = this->eval_preds_[ i ];
            this->PredictBuffer( preds, *evals_[i], buffer_offset);
            evaluator_.Eval( fo, evname_[i].c_str(), preds, (*evals_[i]).labels );
            buffer_offset += static_cast<int>( evals_[i]->Size() );
        }
        fprintf( fo,"\n" );
    }
    /*! \brief get prediction, without buffering */
    inline void Predict( std::vector<float> &preds, const DMatrix &data ){
        preds.resize( data.Size() );

        const unsigned ndata = static_cast<unsigned>( data.Size() );
        #pragma omp parallel for schedule( static )
        for( unsigned j = 0; j < ndata; ++ j ){
            // buffer index -1 disables the prediction cache
            preds[j] = mparam.PredTransform
                ( mparam.base_score + base_gbm.Predict( data.data, j, -1 ) );
        }
    }
public:
    /*!
     * \brief interactively update the model: try one boosting round and
     *        re-predict, or remove the last booster when action == "remove"
     * \param action either grow (default) or "remove"
     */
    inline void UpdateInteract( std::string action ){
        this->InteractPredict( preds_, *train_, 0 );

        int buffer_offset = static_cast<int>( train_->Size() );
        for( size_t i = 0; i < evals_.size(); ++i ){
            std::vector<float> &preds = this->eval_preds_[ i ];
            this->InteractPredict( preds, *evals_[i], buffer_offset );
            buffer_offset += static_cast<int>( evals_[i]->Size() );
        }

        if( action == "remove" ){
            base_gbm.DelteBooster(); return;
        }

        this->GetGradient( preds_, train_->labels, grad_, hess_ );
        std::vector<unsigned> root_index;
        base_gbm.DoBoost( grad_, hess_, train_->data, root_index );

        // refresh cached predictions for train and all evaluation sets
        this->InteractRePredict( *train_, 0 );
        buffer_offset = static_cast<int>( train_->Size() );
        for( size_t i = 0; i < evals_.size(); ++i ){
            this->InteractRePredict( *evals_[i], buffer_offset );
            buffer_offset += static_cast<int>( evals_[i]->Size() );
        }
    }
private:
    /*! \brief get the transformed predictions, given data */
    inline void InteractPredict( std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset ){
        preds.resize( data.Size() );
        const unsigned ndata = static_cast<unsigned>( data.Size() );
        #pragma omp parallel for schedule( static )
        for( unsigned j = 0; j < ndata; ++ j ){
            preds[j] = mparam.PredTransform
                ( mparam.base_score + base_gbm.InteractPredict( data.data, j, buffer_offset + j ) );
        }
    }
    /*! \brief repredict trial */
    inline void InteractRePredict( const DMatrix &data, unsigned buffer_offset ){
        const unsigned ndata = static_cast<unsigned>( data.Size() );
        #pragma omp parallel for schedule( static )
        for( unsigned j = 0; j < ndata; ++ j ){
            base_gbm.InteractRePredict( data.data, j, buffer_offset + j );
        }
    }
private:
    /*! \brief get the transformed predictions, given data, using the prediction buffer */
    inline void PredictBuffer( std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset ){
        preds.resize( data.Size() );

        const unsigned ndata = static_cast<unsigned>( data.Size() );
        #pragma omp parallel for schedule( static )
        for( unsigned j = 0; j < ndata; ++ j ){
            preds[j] = mparam.PredTransform
                ( mparam.base_score + base_gbm.Predict( data.data, j, buffer_offset + j ) );
        }
    }

    /*! \brief get the first order and second order gradient, given the transformed predictions and labels */
    inline void GetGradient( const std::vector<float> &preds,
                             const std::vector<float> &labels,
                             std::vector<float> &grad,
                             std::vector<float> &hess ){
        grad.resize( preds.size() ); hess.resize( preds.size() );

        const unsigned ndata = static_cast<unsigned>( preds.size() );
        #pragma omp parallel for schedule( static )
        for( unsigned j = 0; j < ndata; ++ j ){
            grad[j] = mparam.FirstOrderGradient( preds[j], labels[j] );
            hess[j] = mparam.SecondOrderGradient( preds[j], labels[j] );
        }
    }

private:
    // supported loss functions; values are persisted in ModelParam::loss_type
    enum LossType{
        kLinearSquare = 0,
        kLogisticNeglik = 1,
        kLogisticClassify = 2
    };

    /*! \brief training parameter for regression; POD so it is (de)serialized with raw Read/Write */
    struct ModelParam{
        /* \brief global bias */
        float base_score;
        /* \brief type of loss function */
        int loss_type;
        /* \brief number of features */
        int num_feature;
        /*! \brief reserved field, kept zeroed for forward-compatible serialization */
        int reserved[ 16 ];
        /*! \brief constructor */
        ModelParam( void ){
            base_score = 0.5f;
            loss_type = 0;
            num_feature = 0;
            memset( reserved, 0, sizeof( reserved ) );
        }
        /*!
         * \brief set parameters from outside
         * \param name name of the parameter
         * \param val value of the parameter
         */
        inline void SetParam( const char *name, const char *val ){
            if( !strcmp("base_score", name ) ) base_score = (float)atof( val );
            if( !strcmp("loss_type", name ) ) loss_type = atoi( val );
            if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val );
        }
        /*!
         * \brief adjust base_score: for logistic losses, convert the configured
         *        probability into the equivalent raw margin (inverse sigmoid)
         */
        inline void AdjustBase( void ){
            if( loss_type == 1 || loss_type == 2 ){
                utils::Assert( base_score > 0.0f && base_score < 1.0f, "sigmoid range constrain" );
                base_score = - logf( 1.0f / base_score - 1.0f );
            }
        }

        /*!
         * \brief transform the linear sum to prediction
         * \param x linear sum of boosting ensemble
         * \return transformed prediction
         */
        inline float PredTransform( float x ){
            switch( loss_type ){
            case kLinearSquare: return x;
            case kLogisticClassify:
            case kLogisticNeglik: return 1.0f/(1.0f + expf(-x));
            default: utils::Error("unknown loss_type"); return 0.0f;
            }
        }

        /*!
         * \brief calculate first order gradient of loss, given transformed prediction
         * \param predt transformed prediction
         * \param label true label
         * \return first order gradient
         */
        inline float FirstOrderGradient( float predt, float label ) const{
            switch( loss_type ){
            case kLinearSquare: return predt - label;
            case kLogisticClassify:
            case kLogisticNeglik: return predt - label;
            default: utils::Error("unknown loss_type"); return 0.0f;
            }
        }
        /*!
         * \brief calculate second order gradient of loss, given transformed prediction
         * \param predt transformed prediction
         * \param label true label
         * \return second order gradient
         */
        inline float SecondOrderGradient( float predt, float label ) const{
            switch( loss_type ){
            case kLinearSquare: return 1.0f;
            case kLogisticClassify:
            case kLogisticNeglik: return predt * ( 1 - predt );
            default: utils::Error("unknown loss_type"); return 0.0f;
            }
        }

        /*!
         * \brief calculating the loss, given the predictions, labels and the loss type
         * \param preds the given predictions
         * \param labels the given labels
         * \return the specified loss
         */
        inline float Loss(const std::vector<float> &preds, const std::vector<float> &labels) const{
            switch( loss_type ){
            case kLinearSquare: return SquareLoss(preds,labels);
            case kLogisticNeglik:
            case kLogisticClassify: return NegLoglikelihoodLoss(preds,labels);
            default: utils::Error("unknown loss_type"); return 0.0f;
            }
        }

        /*!
         * \brief calculating the square loss, given the predictions and labels
         * \param preds the given predictions
         * \param labels the given labels
         * \return the summation of square loss
         */
        inline float SquareLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
            float ans = 0.0;
            for(size_t i = 0; i < preds.size(); i++){
                float dif = preds[i] - labels[i];
                ans += dif * dif;
            }
            return ans;
        }

        /*!
         * \brief calculating the negative log-likelihood loss, given the predictions and labels
         * \param preds the given predictions
         * \param labels the given labels
         * \return the summation of negative log-likelihood loss
         */
        inline float NegLoglikelihoodLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
            float ans = 0.0;
            for(size_t i = 0; i < preds.size(); i++)
                ans -= labels[i] * logf(preds[i]) + ( 1 - labels[i] ) * logf(1 - preds[i]);
            return ans;
        }
    };
private:
    // whether to suppress informational printing
    int silent;
    // evaluation metric set printed by EvalOneIter
    EvalSet evaluator_;
    // underlying gradient boosting machine
    booster::GBMBase base_gbm;
    // persisted model parameters (loss type, base score, feature count)
    ModelParam mparam;
    // training data (not owned)
    const DMatrix *train_;
    // evaluation data sets (not owned) and their display names
    std::vector<DMatrix *> evals_;
    std::vector<std::string> evname_;
    std::vector<unsigned> buffer_index_;
private:
    // per-iteration work buffers: gradients, hessians, cached predictions
    std::vector<float> grad_, hess_, preds_;
    std::vector< std::vector<float> > eval_preds_;
};
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
|
||||||
@@ -1,155 +0,0 @@
|
|||||||
#ifndef XGBOOST_REG_DATA_H
|
|
||||||
#define XGBOOST_REG_DATA_H
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* \file xgboost_reg_data.h
|
|
||||||
* \brief input data structure for regression and binary classification task.
|
|
||||||
* Format:
|
|
||||||
* The data should contain each data instance in each line.
|
|
||||||
* The format of line data is as below:
|
|
||||||
* label <nonzero feature dimension> [feature index:feature value]+
|
|
||||||
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
|
||||||
*/
|
|
||||||
#include <cstdio>
|
|
||||||
#include <vector>
|
|
||||||
#include "../booster/xgboost_data.h"
|
|
||||||
#include "../utils/xgboost_utils.h"
|
|
||||||
#include "../utils/xgboost_stream.h"
|
|
||||||
|
|
||||||
namespace xgboost{
|
|
||||||
namespace regression{
|
|
||||||
/*! \brief data matrix for regression content */
struct DMatrix{
public:
    /*! \brief maximum feature dimension */
    unsigned num_feature;
    /*! \brief feature data content */
    booster::FMatrixS data;
    /*! \brief label of each instance */
    std::vector<float> labels;
public:
    /*! \brief default constructor */
    DMatrix( void ){}

    /*! \brief get the number of instances */
    inline size_t Size() const{
        return labels.size();
    }
    /*!
     * \brief load from text file
     *        format: one instance per line, "label [findex:fvalue]+" tokens
     * \param fname name of text data
     * \param silent whether print information or not
     */
    inline void LoadText( const char* fname, bool silent = false ){
        data.Clear();
        FILE* file = utils::FopenCheck( fname, "r" );
        float label; bool init = true;
        char tmp[ 1024 ];
        std::vector<booster::bst_uint> findex;
        std::vector<booster::bst_float> fvalue;

        while( fscanf( file, "%s", tmp ) == 1 ){
            unsigned index; float value;
            // a token of the form "index:value" is a feature of the current row
            if( sscanf( tmp, "%u:%f", &index, &value ) == 2 ){
                findex.push_back( index ); fvalue.push_back( value );
            }else{
                // otherwise the token starts a new row: flush the previous row first
                if( !init ){
                    labels.push_back( label );
                    data.AddRow( findex, fvalue );
                }
                findex.clear(); fvalue.clear();
                utils::Assert( sscanf( tmp, "%f", &label ) == 1, "invalid format" );
                init = false;
            }
        }

        // flush the final row
        // NOTE(review): on a completely empty file this pushes an uninitialized
        // `label` — confirm callers never load empty files
        labels.push_back( label );
        data.AddRow( findex, fvalue );
        // initialize column support as well
        data.InitData();

        if( !silent ){
            printf("%ux%u matrix with %lu entries is loaded from %s\n",
                   (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
        }
        fclose(file);
    }
    /*!
     * \brief load from binary file
     * \param fname name of binary data
     * \param silent whether print information or not
     * \return whether loading is success
     */
    inline bool LoadBinary( const char* fname, bool silent = false ){
        // NOTE(review): fopen64 is a glibc extension — presumably for >2GB files;
        // confirm portability requirements before building on other platforms
        FILE *fp = fopen64( fname, "rb" );
        if( fp == NULL ) return false;
        utils::FileStream fs( fp );
        data.LoadBinary( fs );
        labels.resize( data.NumRow() );
        utils::Assert( fs.Read( &labels[0], sizeof(float) * data.NumRow() ) != 0, "DMatrix LoadBinary" );
        fs.Close();
        // initialize column support as well
        data.InitData();

        if( !silent ){
            printf("%ux%u matrix with %lu entries is loaded from %s\n",
                   (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
        }
        return true;
    }
    /*!
     * \brief save to binary file
     * \param fname name of binary data
     * \param silent whether print information or not
     */
    inline void SaveBinary( const char* fname, bool silent = false ){
        // initialize column support as well
        data.InitData();

        utils::FileStream fs( utils::FopenCheck( fname, "wb" ) );
        data.SaveBinary( fs );
        fs.Write( &labels[0], sizeof(float) * data.NumRow() );
        fs.Close();
        if( !silent ){
            printf("%ux%u matrix with %lu entries is saved to %s\n",
                   (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
        }
    }
    /*!
     * \brief cache load data given a file name, if filename ends with .buffer, direct load binary
     *        otherwise the function will first check if fname + '.buffer' exists,
     *        if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
     *        and try to create a buffer file
     * \param fname name of binary data
     * \param silent whether print information or not
     * \param savebuffer whether do save binary buffer if it is text
     */
    inline void CacheLoad( const char *fname, bool silent = false, bool savebuffer = true ){
        int len = strlen( fname );
        if( len > 8 && !strcmp( fname + len - 7, ".buffer") ){
            this->LoadBinary( fname, silent ); return;
        }
        char bname[ 1024 ];
        sprintf( bname, "%s.buffer", fname );
        if( !this->LoadBinary( bname, silent ) ){
            this->LoadText( fname, silent );
            if( savebuffer ) this->SaveBinary( bname, silent );
        }
    }
private:
    /*! \brief update num_feature info by scanning the maximum feature index seen */
    inline void UpdateInfo( void ){
        this->num_feature = 0;
        for( size_t i = 0; i < data.NumRow(); i ++ ){
            booster::FMatrixS::Line sp = data[i];
            for( unsigned j = 0; j < sp.len; j ++ ){
                if( num_feature <= sp[j].findex ){
                    num_feature = sp[j].findex + 1;
                }
            }
        }
    }
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
@@ -1,119 +0,0 @@
|
|||||||
#ifndef XGBOOST_REG_EVAL_H
|
|
||||||
#define XGBOOST_REG_EVAL_H
|
|
||||||
/*!
|
|
||||||
* \file xgboost_reg_eval.h
|
|
||||||
* \brief evaluation metrics for regression and classification
|
|
||||||
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <cmath>
|
|
||||||
#include <vector>
|
|
||||||
#include <algorithm>
|
|
||||||
#include "../utils/xgboost_utils.h"
|
|
||||||
#include "../utils/xgboost_omp.h"
|
|
||||||
|
|
||||||
namespace xgboost{
|
|
||||||
namespace regression{
|
|
||||||
/*! \brief abstract evaluator interface for loss/accuracy metrics */
struct IEvaluator{
    /*!
     * \brief evaluate a specific metric over a data set
     * \param preds prediction vector, one value per instance
     * \param labels label vector, same length as preds
     * \return the metric value
     */
    virtual float Eval( const std::vector<float> &preds,
                        const std::vector<float> &labels ) const= 0;
    /*! \return name of metric, used when printing evaluation logs */
    virtual const char *Name( void ) const= 0;
};
|
|
||||||
|
|
||||||
/*! \brief root mean square error between predictions and labels */
struct EvalRMSE : public IEvaluator{
    virtual float Eval( const std::vector<float> &preds,
                        const std::vector<float> &labels ) const{
        const unsigned ndata = static_cast<unsigned>( preds.size() );
        float sum = 0.0;
        // squared residuals are accumulated in parallel via OpenMP reduction
        #pragma omp parallel for reduction(+:sum) schedule( static )
        for( unsigned i = 0; i < ndata; ++ i ){
            float diff = preds[i] - labels[i];
            sum += diff * diff;
        }
        return sqrtf( sum / ndata );
    }
    virtual const char *Name( void ) const{
        return "rmse";
    }
};
|
|
||||||
|
|
||||||
/*! \brief Error */
|
|
||||||
struct EvalError : public IEvaluator{
|
|
||||||
virtual float Eval( const std::vector<float> &preds,
|
|
||||||
const std::vector<float> &labels ) const{
|
|
||||||
const unsigned ndata = static_cast<unsigned>( preds.size() );
|
|
||||||
unsigned nerr = 0;
|
|
||||||
#pragma omp parallel for reduction(+:nerr) schedule( static )
|
|
||||||
for( unsigned i = 0; i < ndata; ++ i ){
|
|
||||||
if( preds[i] > 0.5f ){
|
|
||||||
if( labels[i] < 0.5f ) nerr += 1;
|
|
||||||
}else{
|
|
||||||
if( labels[i] > 0.5f ) nerr += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return static_cast<float>(nerr) / ndata;
|
|
||||||
}
|
|
||||||
virtual const char *Name( void ) const{
|
|
||||||
return "error";
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/*! \brief Error */
|
|
||||||
struct EvalLogLoss : public IEvaluator{
|
|
||||||
virtual float Eval( const std::vector<float> &preds,
|
|
||||||
const std::vector<float> &labels ) const{
|
|
||||||
const unsigned ndata = static_cast<unsigned>( preds.size() );
|
|
||||||
unsigned nerr = 0;
|
|
||||||
#pragma omp parallel for reduction(+:nerr) schedule( static )
|
|
||||||
for( unsigned i = 0; i < ndata; ++ i ){
|
|
||||||
const float y = labels[i];
|
|
||||||
const float py = preds[i];
|
|
||||||
nerr -= y * std::log(py) + (1.0f-y)*std::log(1-py);
|
|
||||||
}
|
|
||||||
return static_cast<float>(nerr) / ndata;
|
|
||||||
}
|
|
||||||
virtual const char *Name( void ) const{
|
|
||||||
return "negllik";
|
|
||||||
}
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
namespace regression{
|
|
||||||
/*! \brief a set of evaluators, owning one instance of each known metric */
struct EvalSet{
public:
    // register the metric with the given name; unknown names are silently ignored
    inline void AddEval( const char *name ){
        if( !strcmp( name, "rmse") ) evals_.push_back( &rmse_ );
        if( !strcmp( name, "error") ) evals_.push_back( &error_ );
        if( !strcmp( name, "logloss") ) evals_.push_back( &logloss_ );
    }
    // deduplicate registered metrics (AddEval may be called repeatedly)
    inline void Init( void ){
        std::sort( evals_.begin(), evals_.end() );
        evals_.resize( std::unique( evals_.begin(), evals_.end() ) - evals_.begin() );
    }
    /*!
     * \brief run every registered metric and append "\t<evname>-<metric>:<value>"
     *        entries to the log line in fo
     */
    inline void Eval( FILE *fo, const char *evname,
                      const std::vector<float> &preds,
                      const std::vector<float> &labels ) const{
        for( size_t i = 0; i < evals_.size(); ++ i ){
            float res = evals_[i]->Eval( preds, labels );
            fprintf( fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res );
        }
    }
private:
    // concrete metric instances; evals_ holds non-owning pointers into these
    EvalRMSE rmse_;
    EvalError error_;
    EvalLogLoss logloss_;
    std::vector<const IEvaluator*> evals_;
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
@@ -1,280 +0,0 @@
|
|||||||
#define _CRT_SECURE_NO_WARNINGS
|
|
||||||
#define _CRT_SECURE_NO_DEPRECATE
|
|
||||||
|
|
||||||
#include <ctime>
|
|
||||||
#include <string>
|
|
||||||
#include <cstring>
|
|
||||||
#include "xgboost_reg.h"
|
|
||||||
#include "../utils/xgboost_fmap.h"
|
|
||||||
#include "../utils/xgboost_random.h"
|
|
||||||
#include "../utils/xgboost_config.h"
|
|
||||||
|
|
||||||
namespace xgboost{
|
|
||||||
namespace regression{
|
|
||||||
/*!
 * \brief wrapping the training process of the gradient boosting regression model,
 *        given the configuation
 * \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.chen@gmail.com
 */
class RegBoostTask{
public:
    /*!
     * \brief program entry of the task
     * argv[1] is the configuration file; later arguments of the form
     * name=value override settings loaded from the file
     * \return process exit code (always 0)
     */
    inline int Run( int argc, char *argv[] ){
        if( argc < 2 ){
            printf("Usage: <config>\n");
            return 0;
        }
        // load settings from the configuration file first ...
        utils::ConfigIterator itr( argv[1] );
        while( itr.Next() ){
            this->SetParam( itr.name(), itr.val() );
        }
        // ... then let command line name=value pairs override them
        for( int i = 2; i < argc; i ++ ){
            char name[256], val[256];
            if( sscanf( argv[i], "%[^=]=%s", name, val ) == 2 ){
                this->SetParam( name, val );
            }
        }
        this->InitData();
        this->InitLearner();
        // dispatch on the configured task; "train" is the fallback
        if( task == "dump" ){
            this->TaskDump();
            return 0;
        }
        if( task == "interact" ){
            this->TaskInteractive(); return 0;
        }
        if( task == "dumppath" ){
            this->TaskDumpPath(); return 0;
        }
        if( task == "eval" ){
            this->TaskEval(); return 0;
        }
        if( task == "pred" ){
            this->TaskPred();
        }else{
            this->TaskTrain();
        }
        return 0;
    }
    /*!
     * \brief set one parameter by name; recognized names update task state,
     *        and every pair is also recorded in cfg so the learner
     *        receives it during InitLearner
     */
    inline void SetParam( const char *name, const char *val ){
        if( !strcmp("silent", name ) ) silent = atoi( val );
        if( !strcmp("use_buffer", name ) ) use_buffer = atoi( val );
        if( !strcmp("seed", name ) ) random::Seed( atoi(val) );
        if( !strcmp("num_round", name ) ) num_round = atoi( val );
        if( !strcmp("save_period", name ) ) save_period = atoi( val );
        if( !strcmp("task", name ) ) task = val;
        if( !strcmp("data", name ) ) train_path = val;
        if( !strcmp("test:data", name ) ) test_path = val;
        if( !strcmp("model_in", name ) ) model_in = val;
        if( !strcmp("model_out", name ) ) model_out = val;
        if( !strcmp("model_dir", name ) ) model_dir_path = val;
        if( !strcmp("fmap", name ) ) name_fmap = val;
        if( !strcmp("name_dump", name ) ) name_dump = val;
        if( !strcmp("name_dumppath", name ) ) name_dumppath = val;
        if( !strcmp("name_pred", name ) ) name_pred = val;
        if( !strcmp("dump_stats", name ) ) dump_model_stats = atoi( val );
        if( !strcmp("interact:action", name ) ) interact_action = val;
        // settings prefixed "batch:" are replayed later by TaskInteractive
        if( !strncmp("batch:", name, 6 ) ){
            cfg_batch.PushBack( name + 6, val );
        }
        // "eval[<name>]=<path>" registers an extra evaluation data set
        if( !strncmp("eval[", name, 5 ) ) {
            char evname[ 256 ];
            utils::Assert( sscanf( name, "eval[%[^]]", evname ) == 1, "must specify evaluation name for display");
            eval_data_names.push_back( std::string( evname ) );
            eval_data_paths.push_back( std::string( val ) );
        }
        cfg.PushBack( name, val );
    }
public:
    RegBoostTask( void ){
        // default parameters
        silent = 0;
        use_buffer = 1;
        num_round = 10;
        save_period = 0;
        dump_model_stats = 0;
        task = "train";
        model_in = "NULL";
        model_out = "NULL";
        name_fmap = "NULL";
        name_pred = "pred.txt";
        name_dump = "dump.txt";
        name_dumppath = "dump.path.txt";
        model_dir_path = "./";
        interact_action = "update";
    }
    ~RegBoostTask( void ){
        // release the evaluation matrices allocated in InitData
        for( size_t i = 0; i < deval.size(); i ++ ){
            delete deval[i];
        }
    }
private:
    // load the data sets required by the current task
    inline void InitData( void ){
        if( name_fmap != "NULL" ) fmap.LoadText( name_fmap.c_str() );
        if( task == "dump" ) return;
        if( task == "pred" || task == "dumppath" ){
            data.CacheLoad( test_path.c_str(), silent!=0, use_buffer!=0 );
        }else{
            // training
            data.CacheLoad( train_path.c_str(), silent!=0, use_buffer!=0 );
            utils::Assert( eval_data_names.size() == eval_data_paths.size() );
            for( size_t i = 0; i < eval_data_names.size(); ++ i ){
                // deleted in the destructor
                deval.push_back( new DMatrix() );
                deval.back()->CacheLoad( eval_data_paths[i].c_str(), silent!=0, use_buffer!=0 );
            }
        }
        learner.SetData( &data, deval, eval_data_names );
    }
    // push configuration into the learner, then load or initialize the model
    inline void InitLearner( void ){
        cfg.BeforeFirst();
        while( cfg.Next() ){
            learner.SetParam( cfg.name(), cfg.val() );
        }
        if( model_in != "NULL" ){
            utils::FileStream fi( utils::FopenCheck( model_in.c_str(), "rb") );
            learner.LoadModel( fi );
            fi.Close();
        }else{
            // only training may start from scratch
            utils::Assert( task == "train", "model_in not specified" );
            learner.InitModel();
        }
        learner.InitTrainer();
    }
    // run num_round boosting iterations, snapshotting every save_period rounds
    inline void TaskTrain( void ){
        const time_t start = time( NULL );
        unsigned long elapsed = 0;
        for( int i = 0; i < num_round; ++ i ){
            elapsed = (unsigned long)(time(NULL) - start);
            if( !silent ) printf("boosting round %d, %lu sec elapsed\n", i , elapsed );
            learner.UpdateOneIter( i );
            learner.EvalOneIter( i );
            if( save_period != 0 && (i+1) % save_period == 0 ){
                this->SaveModel( i );
            }
            elapsed = (unsigned long)(time(NULL) - start);
        }
        // always save final round
        if( save_period == 0 || num_round % save_period != 0 ){
            if( model_out == "NULL" ){
                this->SaveModel( num_round - 1 );
            }else{
                this->SaveModel( model_out.c_str() );
            }
        }
        if( !silent ){
            printf("\nupdating end, %lu sec in all\n", elapsed );
        }
    }
    // evaluate the loaded model on the configured evaluation sets
    inline void TaskEval( void ){
        learner.EvalOneIter( 0 );
    }
    // apply interactive updates driven by the recorded "batch:" settings
    inline void TaskInteractive( void ){
        const time_t start = time( NULL );
        unsigned long elapsed = 0;
        int batch_action = 0;

        cfg_batch.BeforeFirst();
        while( cfg_batch.Next() ){
            if( !strcmp( cfg_batch.name(), "run" ) ){
                // each "run" entry triggers one interactive update
                learner.UpdateInteract( interact_action );
                batch_action += 1;
            } else{
                learner.SetParam( cfg_batch.name(), cfg_batch.val() );
            }
        }

        // no explicit batch "run": perform a single interactive update
        if( batch_action == 0 ){
            learner.UpdateInteract( interact_action );
        }
        utils::Assert( model_out != "NULL", "interactive mode must specify model_out" );
        this->SaveModel( model_out.c_str() );
        elapsed = (unsigned long)(time(NULL) - start);

        if( !silent ){
            printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed );
        }
    }

    // write a readable dump of the model to name_dump
    inline void TaskDump( void ){
        FILE *fo = utils::FopenCheck( name_dump.c_str(), "w" );
        learner.DumpModel( fo, fmap, dump_model_stats != 0 );
        fclose( fo );
    }
    // write per-instance prediction paths to name_dumppath
    inline void TaskDumpPath( void ){
        FILE *fo = utils::FopenCheck( name_dumppath.c_str(), "w" );
        learner.DumpPath( fo, data );
        fclose( fo );
    }
    // save the current model to the given file name
    inline void SaveModel( const char *fname ) const{
        utils::FileStream fo( utils::FopenCheck( fname, "wb" ) );
        learner.SaveModel( fo );
        fo.Close();
    }
    // save a numbered snapshot ("<model_dir>/%04d.model", 1-based) for round i
    inline void SaveModel( int i ) const{
        char fname[256];
        sprintf( fname ,"%s/%04d.model", model_dir_path.c_str(), i+1 );
        this->SaveModel( fname );
    }
    // predict on the loaded data and write one value per line to name_pred
    inline void TaskPred( void ){
        std::vector<float> preds;
        if( !silent ) printf("start prediction...\n");
        learner.Predict( preds, data );
        if( !silent ) printf("writing prediction to %s\n", name_pred.c_str() );
        FILE *fo = utils::FopenCheck( name_pred.c_str(), "w" );
        for( size_t i = 0; i < preds.size(); i ++ ){
            fprintf( fo, "%f\n", preds[i] );
        }
        fclose( fo );
    }
private:
    /* \brief whether silent */
    int silent;
    /* \brief whether use auto binary buffer */
    int use_buffer;
    /* \brief number of boosting iterations */
    int num_round;
    /* \brief the period to save the model, 0 means only save the final round model */
    int save_period;
    /*! \brief interact action */
    std::string interact_action;
    /* \brief the path of training/test data set */
    std::string train_path, test_path;
    /* \brief the path of test model file, or file to restart training */
    std::string model_in;
    /* \brief the path of final model file, to be saved */
    std::string model_out;
    /* \brief the path of directory containing the saved models */
    std::string model_dir_path;
    /* \brief task to perform */
    std::string task;
    /* \brief name of predict file */
    std::string name_pred;
    /* \brief whether dump statistics along with model */
    int dump_model_stats;
    /* \brief name of feature map */
    std::string name_fmap;
    /* \brief name of dump file */
    std::string name_dump;
    /* \brief name of dump path file */
    std::string name_dumppath;
    /* \brief the paths of validation data sets */
    std::vector<std::string> eval_data_paths;
    /* \brief the names of the evaluation data used in output log */
    std::vector<std::string> eval_data_names;
    /*! \brief saves configurations */
    utils::ConfigSaver cfg;
    /*! \brief batch configurations */
    utils::ConfigSaver cfg_batch;
private:
    /* \brief training/prediction data matrix */
    DMatrix data;
    /* \brief evaluation matrices, owned by this object (freed in destructor) */
    std::vector<DMatrix*> deval;
    /* \brief feature map used when dumping models */
    utils::FeatMap fmap;
    /* \brief the gradient boosting learner */
    RegBoostLearner learner;
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
// program entry: fix the global random seed for reproducibility,
// then delegate everything to the task driver
int main( int argc, char *argv[] ){
    xgboost::random::Seed( 0 );
    xgboost::regression::RegBoostTask tsk;
    return tsk.Run( argc, argv );
}
|
|
||||||
26
tools/Makefile
Normal file
26
tools/Makefile
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# toolchain configuration
export CC = gcc
export CXX = g++
# warnings, optimization, SSE2 and OpenMP; unknown-pragma warnings silenced
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp

# specify tensor path
BIN = xgcombine_buffer
OBJ =
.PHONY: clean all

all: $(BIN) $(OBJ)
export LDFLAGS= -pthread -lm

# per-target source prerequisites
xgcombine_buffer : xgcombine_buffer.cpp

# generic link rule: compile and link every .cpp/.o/.c prerequisite
$(BIN) :
	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)

# generic compile rule for intermediate objects
$(OBJ) :
	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

install:
	cp -f -r $(BIN) $(INSTALL_PATH)

clean:
	$(RM) $(OBJ) $(BIN) *~
|
||||||
248
tools/xgcombine_buffer.cpp
Normal file
248
tools/xgcombine_buffer.cpp
Normal file
@@ -0,0 +1,248 @@
|
|||||||
|
/*!
|
||||||
|
* a tool to combine different set of features into binary buffer
|
||||||
|
 * not well organized code, but does its job
|
||||||
|
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||||
|
*/
|
||||||
|
#define _CRT_SECURE_NO_WARNINGS
|
||||||
|
#define _CRT_SECURE_NO_DEPRECATE
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <ctime>
|
||||||
|
#include <cmath>
|
||||||
|
#include "../regrank/xgboost_regrank_data.h"
|
||||||
|
#include "../utils/xgboost_utils.h"
|
||||||
|
|
||||||
|
using namespace xgboost;
|
||||||
|
using namespace xgboost::booster;
|
||||||
|
using namespace xgboost::regrank;
|
||||||
|
|
||||||
|
// header in dataset
|
||||||
|
// per-input-file header describing one feature source of the combined dataset
struct Header{
    FILE *fi;       // handle of the feature file
    int tmp_num;    // feature count announced for the current line
    int base;       // offset added to this file's local feature indices
    int num_feat;   // declared number of features in this file
    // whether it's dense format
    bool is_dense;
    bool warned;    // set once an out-of-bound index has been reported

    Header( void ){
        warned   = false;
        is_dense = false;
    }

    // warn (at most once per file) when a feature index is >= num_feat
    inline void CheckBase( unsigned findex ){
        if( warned ) return;
        if( findex >= (unsigned)num_feat ){
            fprintf( stderr, "warning:some feature exceed bound, num_feat=%d\n", num_feat );
            warned = true;
        }
    }
};
|
||||||
|
|
||||||
|
|
||||||
|
inline int norm( std::vector<Header> &vec, int base = 0 ){
|
||||||
|
int n = base;
|
||||||
|
for( size_t i = 0; i < vec.size(); i ++ ){
|
||||||
|
if( vec[i].is_dense ) vec[i].num_feat = 1;
|
||||||
|
vec[i].base = n; n += vec[i].num_feat;
|
||||||
|
}
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void vclose( std::vector<Header> &vec ){
|
||||||
|
for( size_t i = 0; i < vec.size(); i ++ ){
|
||||||
|
fclose( vec[i].fi );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline int readnum( std::vector<Header> &vec ){
|
||||||
|
int n = 0;
|
||||||
|
for( size_t i = 0; i < vec.size(); i ++ ){
|
||||||
|
if( !vec[i].is_dense ){
|
||||||
|
utils::Assert( fscanf( vec[i].fi, "%d", &vec[i].tmp_num ) == 1, "load num" );
|
||||||
|
n += vec[i].tmp_num;
|
||||||
|
}else{
|
||||||
|
n ++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void vskip( std::vector<Header> &vec ){
|
||||||
|
for( size_t i = 0; i < vec.size(); i ++ ){
|
||||||
|
if( !vec[i].is_dense ){
|
||||||
|
utils::Assert( fscanf( vec[i].fi, "%*d%*[^\n]\n" ) >= 0 );
|
||||||
|
}else{
|
||||||
|
utils::Assert( fscanf( vec[i].fi, "%*f\n" ) >= 0 );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// loads label/feature/group/weight streams and assembles the in-memory matrix
class DataLoader: public DMatrix{
public:
    // whether to do node and edge feature renormalization
    // (L2-normalizes each instance's feature values when set)
    int rescale;
    // stop after this many input lines; negative means unlimited
    int linelimit;
public:
    // label stream, plus optional whitelist / group-size / weight streams
    // (fwlist, fgroup, fweight may stay NULL when the option is unused)
    FILE *fp, *fwlist, *fgroup, *fweight;
    // one header per feature source file
    std::vector<Header> fheader;
    // NOTE(review): appears unused within this class — confirm before removal
    std::vector<FMatrixS::REntry> entry;
    DataLoader( void ){
        rescale = 0;
        linelimit = -1;
        fp = NULL; fwlist = NULL; fgroup = NULL; fweight = NULL;
    }
private:
    // read one instance's features from every source file, shifting each
    // local index by that file's base offset; appends to findex/fvalue
    inline void Load( std::vector<unsigned> &findex, std::vector<float> &fvalue, std::vector<Header> &vec ){
        unsigned fidx; float fv;
        for( size_t i = 0; i < vec.size(); i ++ ){
            if( !vec[i].is_dense ) {
                // sparse: tmp_num "index:value" pairs (set by readnum earlier)
                for( int j = 0; j < vec[i].tmp_num; j ++ ){
                    utils::Assert( fscanf ( vec[i].fi, "%u:%f", &fidx, &fv ) == 2, "Error when load feat" );
                    vec[i].CheckBase( fidx );
                    fidx += vec[i].base;
                    findex.push_back( fidx ); fvalue.push_back( fv );
                }
            }else{
                // dense: exactly one value; its index is the file's base
                utils::Assert( fscanf ( vec[i].fi, "%f", &fv ) == 1, "load feat" );
                fidx = vec[i].base;
                findex.push_back( fidx ); fvalue.push_back( fv );
            }
        }
    }
    // L2-normalize the value vector in place
    // NOTE(review): divides by zero when all values are 0 — confirm inputs
    inline void DoRescale( std::vector<float> &vec ){
        double sum = 0.0;
        for( size_t i = 0; i < vec.size(); i ++ ){
            sum += vec[i] * vec[i];
        }
        sum = sqrt( sum );
        for( size_t i = 0; i < vec.size(); i ++ ){
            vec[i] /= sum;
        }
    }
public:
    // basically we are loading all the data inside
    inline void Load( void ){
        this->data.Clear();
        float label, weight = 0.0f;

        // ngleft: instances remaining in the current group
        // ngacc:  instances actually accepted into the current group
        unsigned ngleft = 0, ngacc = 0;
        if( fgroup != NULL ){
            info.group_ptr.clear();
            info.group_ptr.push_back(0);
        }

        // one loop iteration per instance, driven by the label stream
        while( fscanf( fp, "%f", &label ) == 1 ){
            if( ngleft == 0 && fgroup != NULL ){
                utils::Assert( fscanf( fgroup, "%u", &ngleft ) == 1 );
            }
            if( fweight != NULL ){
                utils::Assert( fscanf( fweight, "%f", &weight ) == 1 );
            }

            // NOTE(review): when fgroup==NULL this decrements an unsigned 0
            // and wraps; ngleft is only consulted when fgroup!=NULL so it
            // looks benign — confirm
            ngleft -= 1; ngacc += 1;

            // pass==0 means the whitelist excludes this instance
            int pass = 1;
            if( fwlist != NULL ){
                utils::Assert( fscanf( fwlist, "%u", &pass ) ==1 );
            }
            if( pass == 0 ){
                // skip the feature line(s) and undo the group membership
                vskip( fheader ); ngacc -= 1;
            }else{
                const int nfeat = readnum( fheader );
                std::vector<unsigned> findex;
                std::vector<float> fvalue;
                // pairs
                this->Load( findex, fvalue, fheader );
                utils::Assert( findex.size() == (unsigned)nfeat );
                if( rescale != 0 ) this->DoRescale( fvalue );
                // push back data :)
                this->info.labels.push_back( label );
                // push back weight if any
                if( fweight != NULL ){
                    this->info.weights.push_back( weight );
                }
                this->data.AddRow( findex, fvalue );
            }
            // close the group once its declared size has been consumed
            if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){
                info.group_ptr.push_back( info.group_ptr.back() + ngacc );
                utils::Assert( info.group_ptr.back() == data.NumRow(), "group size must match num rows" );
                ngacc = 0;
            }
            // linelimit
            if( linelimit >= 0 ) {
                if( -- linelimit <= 0 ) break;
            }
        }
        // flush the final group if the loop ended mid-group via linelimit
        if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){
            info.group_ptr.push_back( info.group_ptr.back() + ngacc );
            utils::Assert( info.group_ptr.back() == data.NumRow(), "group size must match num rows" );
        }
        this->data.InitData();
    }
};
|
||||||
|
|
||||||
|
// directory holding the per-feature input files named on the command line
const char *folder = "features";
|
||||||
|
|
||||||
|
int main( int argc, char *argv[] ){
|
||||||
|
if( argc < 3 ){
|
||||||
|
printf("Usage:xgcombine_buffer <inname> <outname> [options] -f [features] -fd [densefeatures]\n"\
|
||||||
|
"options: -rescale -linelimit -fgroup <groupfilename> -wlist <whitelistinstance>\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
DataLoader loader;
|
||||||
|
time_t start = time( NULL );
|
||||||
|
|
||||||
|
int mode = 0;
|
||||||
|
for( int i = 3; i < argc; i ++ ){
|
||||||
|
if( !strcmp( argv[i], "-f") ){
|
||||||
|
mode = 0; continue;
|
||||||
|
}
|
||||||
|
if( !strcmp( argv[i], "-fd") ){
|
||||||
|
mode = 2; continue;
|
||||||
|
}
|
||||||
|
if( !strcmp( argv[i], "-rescale") ){
|
||||||
|
loader.rescale = 1; continue;
|
||||||
|
}
|
||||||
|
if( !strcmp( argv[i], "-wlist") ){
|
||||||
|
loader.fwlist = utils::FopenCheck( argv[ ++i ], "r" ); continue;
|
||||||
|
}
|
||||||
|
if( !strcmp( argv[i], "-fgroup") ){
|
||||||
|
loader.fgroup = utils::FopenCheck( argv[ ++i ], "r" ); continue;
|
||||||
|
}
|
||||||
|
if( !strcmp( argv[i], "-fweight") ){
|
||||||
|
loader.fweight = utils::FopenCheck( argv[ ++i ], "r" ); continue;
|
||||||
|
}
|
||||||
|
if( !strcmp( argv[i], "-linelimit") ){
|
||||||
|
loader.linelimit = atoi( argv[ ++i ] ); continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
char name[ 256 ];
|
||||||
|
sprintf( name, "%s/%s.%s", folder, argv[1], argv[i] );
|
||||||
|
Header h;
|
||||||
|
h.fi = utils::FopenCheck( name, "r" );
|
||||||
|
|
||||||
|
if( mode == 2 ){
|
||||||
|
h.is_dense = true; h.num_feat = 1;
|
||||||
|
loader.fheader.push_back( h );
|
||||||
|
}else{
|
||||||
|
utils::Assert( fscanf( h.fi, "%d", &h.num_feat ) == 1, "num feat" );
|
||||||
|
switch( mode ){
|
||||||
|
case 0: loader.fheader.push_back( h ); break;
|
||||||
|
default: ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
loader.fp = utils::FopenCheck( argv[1], "r" );
|
||||||
|
|
||||||
|
printf("num_features=%d\n", norm( loader.fheader ) );
|
||||||
|
printf("start creating buffer...\n");
|
||||||
|
loader.Load();
|
||||||
|
loader.SaveBinary( argv[2] );
|
||||||
|
// close files
|
||||||
|
fclose( loader.fp );
|
||||||
|
if( loader.fwlist != NULL ) fclose( loader.fwlist );
|
||||||
|
if( loader.fgroup != NULL ) fclose( loader.fgroup );
|
||||||
|
vclose( loader.fheader );
|
||||||
|
printf("all generation end, %lu sec used\n", (unsigned long)(time(NULL) - start) );
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
@@ -94,7 +94,8 @@ namespace xgboost{
|
|||||||
case '\"':
|
case '\"':
|
||||||
if (i == 0){
|
if (i == 0){
|
||||||
ParseStr(tok); ch_buf = fgetc(fi); return new_line;
|
ParseStr(tok); ch_buf = fgetc(fi); return new_line;
|
||||||
}else{
|
}
|
||||||
|
else{
|
||||||
Error("token followed directly by string");
|
Error("token followed directly by string");
|
||||||
}
|
}
|
||||||
case '=':
|
case '=':
|
||||||
@@ -102,7 +103,8 @@ namespace xgboost{
|
|||||||
ch_buf = fgetc(fi);
|
ch_buf = fgetc(fi);
|
||||||
tok[0] = '=';
|
tok[0] = '=';
|
||||||
tok[1] = '\0';
|
tok[1] = '\0';
|
||||||
}else{
|
}
|
||||||
|
else{
|
||||||
tok[i] = '\0';
|
tok[i] = '\0';
|
||||||
}
|
}
|
||||||
return new_line;
|
return new_line;
|
||||||
@@ -155,7 +157,8 @@ namespace xgboost{
|
|||||||
if (priority == 0){
|
if (priority == 0){
|
||||||
names.push_back(std::string(name));
|
names.push_back(std::string(name));
|
||||||
values.push_back(std::string(val));
|
values.push_back(std::string(val));
|
||||||
}else{
|
}
|
||||||
|
else{
|
||||||
names_high.push_back(std::string(name));
|
names_high.push_back(std::string(name));
|
||||||
values_high.push_back(std::string(val));
|
values_high.push_back(std::string(val));
|
||||||
}
|
}
|
||||||
@@ -184,7 +187,8 @@ namespace xgboost{
|
|||||||
size_t i = idx - 1;
|
size_t i = idx - 1;
|
||||||
if (i >= names.size()){
|
if (i >= names.size()){
|
||||||
return names_high[i - names.size()].c_str();
|
return names_high[i - names.size()].c_str();
|
||||||
}else{
|
}
|
||||||
|
else{
|
||||||
return names[i].c_str();
|
return names[i].c_str();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -197,7 +201,8 @@ namespace xgboost{
|
|||||||
size_t i = idx - 1;
|
size_t i = idx - 1;
|
||||||
if (i >= values.size()){
|
if (i >= values.size()){
|
||||||
return values_high[i - values.size()].c_str();
|
return values_high[i - values.size()].c_str();
|
||||||
}else{
|
}
|
||||||
|
else{
|
||||||
return values[i].c_str();
|
return values[i].c_str();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,8 +31,8 @@ namespace xgboost{
|
|||||||
/*! \brief load feature map from text format */
|
/*! \brief load feature map from text format */
|
||||||
inline void LoadText(FILE *fi){
|
inline void LoadText(FILE *fi){
|
||||||
int fid;
|
int fid;
|
||||||
char fname[256], ftype[256];
|
char fname[1256], ftype[1256];
|
||||||
while( fscanf( fi, "%d%s%s", &fid, fname, ftype ) == 3 ){
|
while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3){
|
||||||
utils::Assert(fid == (int)names_.size(), "invalid fmap format");
|
utils::Assert(fid == (int)names_.size(), "invalid fmap format");
|
||||||
names_.push_back(std::string(fname));
|
names_.push_back(std::string(fname));
|
||||||
types_.push_back(GetType(ftype));
|
types_.push_back(GetType(ftype));
|
||||||
|
|||||||
@@ -50,7 +50,8 @@ namespace xgboost{
|
|||||||
if (!UseAcList){
|
if (!UseAcList){
|
||||||
rptr.clear();
|
rptr.clear();
|
||||||
rptr.resize(nrows + 1, 0);
|
rptr.resize(nrows + 1, 0);
|
||||||
}else{
|
}
|
||||||
|
else{
|
||||||
Assert(nrows + 1 == rptr.size(), "rptr must be initialized already");
|
Assert(nrows + 1 == rptr.size(), "rptr must be initialized already");
|
||||||
this->Cleanup();
|
this->Cleanup();
|
||||||
}
|
}
|
||||||
@@ -79,7 +80,8 @@ namespace xgboost{
|
|||||||
rptr[i] = start;
|
rptr[i] = start;
|
||||||
start += rlen;
|
start += rlen;
|
||||||
}
|
}
|
||||||
}else{
|
}
|
||||||
|
else{
|
||||||
// case with active list
|
// case with active list
|
||||||
std::sort(aclist.begin(), aclist.end());
|
std::sort(aclist.begin(), aclist.end());
|
||||||
|
|
||||||
|
|||||||
@@ -88,7 +88,8 @@ namespace xgboost{
|
|||||||
u = NextDouble();
|
u = NextDouble();
|
||||||
} while (u == 0.0);
|
} while (u == 0.0);
|
||||||
return SampleGamma(alpha + 1.0, beta) * pow(u, 1.0 / alpha);
|
return SampleGamma(alpha + 1.0, beta) * pow(u, 1.0 / alpha);
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
double d, c, x, v, u;
|
double d, c, x, v, u;
|
||||||
d = alpha - 1.0 / 3.0;
|
d = alpha - 1.0 / 3.0;
|
||||||
c = 1.0 / sqrt(9.0 * d);
|
c = 1.0 / sqrt(9.0 * d);
|
||||||
@@ -126,6 +127,22 @@ namespace xgboost{
|
|||||||
Shuffle(&data[0], data.size());
|
Shuffle(&data[0], data.size());
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
namespace random{
|
||||||
|
/*! \brief random number generator with independent random number seed*/
|
||||||
|
struct Random{
|
||||||
|
/*! \brief set random number seed */
|
||||||
|
inline void Seed( unsigned sd ){
|
||||||
|
this->rseed = sd;
|
||||||
|
}
|
||||||
|
/*! \brief return a real number uniform in [0,1) */
|
||||||
|
inline double RandDouble( void ){
|
||||||
|
return static_cast<double>( rand_r( &rseed ) ) / (static_cast<double>( RAND_MAX )+1.0);
|
||||||
|
}
|
||||||
|
// random number seed
|
||||||
|
unsigned rseed;
|
||||||
|
};
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ namespace xgboost{
|
|||||||
namespace utils{
|
namespace utils{
|
||||||
inline void Error(const char *msg){
|
inline void Error(const char *msg){
|
||||||
fprintf(stderr, "Error:%s\n", msg);
|
fprintf(stderr, "Error:%s\n", msg);
|
||||||
|
fflush(stderr);
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -58,6 +59,7 @@ namespace xgboost{
|
|||||||
FILE *fp = fopen64(fname, flag);
|
FILE *fp = fopen64(fname, flag);
|
||||||
if (fp == NULL){
|
if (fp == NULL){
|
||||||
fprintf(stderr, "can not open file \"%s\" \n", fname);
|
fprintf(stderr, "can not open file \"%s\" \n", fname);
|
||||||
|
fflush(stderr);
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
return fp;
|
return fp;
|
||||||
|
|||||||
Reference in New Issue
Block a user