rank pass toy
This commit is contained in:
parent
40c380e40a
commit
a10f594644
2
Makefile
2
Makefile
@ -12,6 +12,8 @@ export LDFLAGS= -pthread -lm
|
||||
|
||||
xgboost: regression/xgboost_reg_main.cpp regression/*.h booster/*.h booster/*/*.hpp booster/*.hpp
|
||||
|
||||
#xgboost: rank/xgboost_rank_main.cpp base/*.h rank/*.h booster/*.h booster/*/*.hpp booster/*.hpp
|
||||
|
||||
$(BIN) :
|
||||
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
|
||||
|
||||
|
||||
@ -288,7 +288,8 @@ namespace xgboost{
|
||||
booster_info.push_back(0);
|
||||
this->ConfigBooster(boosters.back());
|
||||
boosters.back()->InitModel();
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
this->ConfigBooster(boosters.back());
|
||||
}
|
||||
return boosters.back();
|
||||
|
||||
13
demo/rank/README
Normal file
13
demo/rank/README
Normal file
@ -0,0 +1,13 @@
|
||||
Demonstrating how to use XGBoost to accomplish regression tasks on the computer hardware dataset https://archive.ics.uci.edu/ml/datasets/Computer+Hardware
|
||||
|
||||
Run: ./runexp.sh
|
||||
|
||||
Format of input: LIBSVM format
|
||||
|
||||
Format of `featmap.txt`: `<featureid> <featurename> <q or i or int>\n`:
|
||||
- Feature id must be from 0 to number of features, in sorted order.
|
||||
- i means this feature is a binary indicator feature
|
||||
- q means this feature is a quantitative value, such as age, time, can be missing
|
||||
- int means this feature is an integer value (when int is hinted, the decision boundary will be an integer)
|
||||
|
||||
Explanations: https://github.com/tqchen/xgboost/wiki/Regression
|
||||
16
demo/rank/runexp.sh
Normal file
16
demo/rank/runexp.sh
Normal file
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
|
||||
# map the data to features. For convenience we only use 7 original attributes and encode them as features in a trivial way
|
||||
python mapfeat.py
|
||||
# split train and test
|
||||
python mknfold.py machine.txt 1
|
||||
# training and output the models
|
||||
../../xgboost machine.conf
|
||||
# output predictions of test data
|
||||
../../xgboost machine.conf task=pred model_in=0002.model
|
||||
# print the boosters of 0002.model in dump.raw.txt
|
||||
../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
|
||||
# print the boosters of 0002.model in dump.nice.txt with feature map
|
||||
../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
|
||||
|
||||
# cat the result
|
||||
cat dump.nice.txt
|
||||
5
demo/rank/toy.eval
Normal file
5
demo/rank/toy.eval
Normal file
@ -0,0 +1,5 @@
|
||||
1 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
1 0:2 1:3 2:2
|
||||
2
demo/rank/toy.eval.group
Normal file
2
demo/rank/toy.eval.group
Normal file
@ -0,0 +1,2 @@
|
||||
2
|
||||
3
|
||||
5
demo/rank/toy.test
Normal file
5
demo/rank/toy.test
Normal file
@ -0,0 +1,5 @@
|
||||
1 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
1 0:2 1:3 2:2
|
||||
2
demo/rank/toy.test.group
Normal file
2
demo/rank/toy.test.group
Normal file
@ -0,0 +1,2 @@
|
||||
2
|
||||
3
|
||||
5
demo/rank/toy.train
Normal file
5
demo/rank/toy.train
Normal file
@ -0,0 +1,5 @@
|
||||
1 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
1 0:2 1:3 2:2
|
||||
2
demo/rank/toy.train.group
Normal file
2
demo/rank/toy.train.group
Normal file
@ -0,0 +1,2 @@
|
||||
2
|
||||
3
|
||||
0
demo/rank/train
Normal file
0
demo/rank/train
Normal file
@ -20,6 +20,7 @@ namespace xgboost{
|
||||
class BoostTask{
|
||||
public:
|
||||
inline int Run(int argc, char *argv[]){
|
||||
|
||||
if (argc < 2){
|
||||
printf("Usage: <config>\n");
|
||||
return 0;
|
||||
@ -34,6 +35,7 @@ namespace xgboost{
|
||||
this->SetParam(name, val);
|
||||
}
|
||||
}
|
||||
|
||||
this->InitData();
|
||||
this->InitLearner();
|
||||
if (task == "dump"){
|
||||
@ -128,6 +130,7 @@ namespace xgboost{
|
||||
|
||||
|
||||
inline void InitData(void){
|
||||
|
||||
if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str());
|
||||
if (task == "dump") return;
|
||||
if (learning_task == RANKING){
|
||||
@ -140,6 +143,7 @@ namespace xgboost{
|
||||
// training
|
||||
sscanf(train_path.c_str(), "%[^;];%s", instance_path, group_path);
|
||||
data.CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0);
|
||||
|
||||
utils::Assert(eval_data_names.size() == eval_data_paths.size());
|
||||
for (size_t i = 0; i < eval_data_names.size(); ++i){
|
||||
deval.push_back(new DMatrix());
|
||||
@ -147,8 +151,6 @@ namespace xgboost{
|
||||
deval.back()->CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else{
|
||||
if (task == "pred" || task == "dumppath"){
|
||||
@ -166,7 +168,9 @@ namespace xgboost{
|
||||
}
|
||||
|
||||
learner_->SetData(&data, deval, eval_data_names);
|
||||
if(!silent) printf("BoostTask:Data Initiation Done!\n");
|
||||
}
|
||||
|
||||
inline void InitLearner(void){
|
||||
cfg.BeforeFirst();
|
||||
while (cfg.Next()){
|
||||
@ -182,6 +186,7 @@ namespace xgboost{
|
||||
learner_->InitModel();
|
||||
}
|
||||
learner_->InitTrainer();
|
||||
if(!silent) printf("BoostTask:InitLearner Done!\n");
|
||||
}
|
||||
|
||||
inline void TaskTrain(void){
|
||||
|
||||
@ -70,17 +70,27 @@ namespace xgboost{
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||
}
|
||||
fclose(file);
|
||||
LoadGroup(fgroup,silent);
|
||||
}
|
||||
|
||||
inline void LoadGroup(const char* fgroup, bool silent = false){
|
||||
//if exists group data load it in
|
||||
FILE *file_group = fopen64(fgroup, "r");
|
||||
|
||||
if (file_group != NULL){
|
||||
group_index.push_back(0);
|
||||
int tmp = 0, acc = 0;
|
||||
while (fscanf(file_group, "%d", tmp) == 1){
|
||||
int tmp = 0, acc = 0,cnt = 0;
|
||||
while (fscanf(file_group, "%d", &tmp) == 1){
|
||||
acc += tmp;
|
||||
group_index.push_back(acc);
|
||||
cnt++;
|
||||
}
|
||||
if(!silent) printf("%d groups are loaded from %s\n",cnt,fgroup);
|
||||
fclose(file_group);
|
||||
}else{
|
||||
if(!silent) printf("There is no group file\n");
|
||||
}
|
||||
|
||||
}
|
||||
/*!
|
||||
* \brief load from binary file
|
||||
@ -100,24 +110,11 @@ namespace xgboost{
|
||||
data.InitData();
|
||||
|
||||
if (!silent){
|
||||
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||
printf("%ux%u matrix with %lu entries is loaded from %s as binary\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||
}
|
||||
|
||||
//if group data exists load it in
|
||||
FILE *file_group = fopen64(fgroup, "r");
|
||||
if (file_group != NULL){
|
||||
int group_index_size = 0;
|
||||
utils::FileStream group_stream(file_group);
|
||||
utils::Assert(group_stream.Read(&group_index_size, sizeof(int)) != 0, "Load group indice size");
|
||||
group_index.resize(group_index_size);
|
||||
utils::Assert(group_stream.Read(&group_index, sizeof(int)* group_index_size) != 0, "Load group indice");
|
||||
|
||||
if (!silent){
|
||||
printf("the group index of %d groups is loaded from %s\n",
|
||||
group_index_size - 1, fgroup);
|
||||
}
|
||||
}
|
||||
LoadGroupBinary(fgroup,silent);
|
||||
return true;
|
||||
}
|
||||
/*!
|
||||
@ -134,16 +131,42 @@ namespace xgboost{
|
||||
fs.Write(&labels[0], sizeof(float)* data.NumRow());
|
||||
fs.Close();
|
||||
if (!silent){
|
||||
printf("%ux%u matrix with %lu entries is saved to %s\n",
|
||||
printf("%ux%u matrix with %lu entries is saved to %s as binary\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||
}
|
||||
|
||||
SaveGroupBinary(fgroup,silent);
|
||||
}
|
||||
|
||||
inline void SaveGroupBinary(const char* fgroup, bool silent = false){
|
||||
//save group data
|
||||
if (group_index.size() > 0){
|
||||
utils::FileStream file_group(utils::FopenCheck(fgroup, "wb"));
|
||||
int group_index_size = group_index.size();
|
||||
file_group.Write(&(group_index_size), sizeof(int));
|
||||
file_group.Write(&group_index[0], sizeof(int) * group_index_size);
|
||||
file_group.Close();
|
||||
if(!silent){printf("Index info of %d groups is saved to %s as binary\n",group_index_size-1,fgroup);}
|
||||
}
|
||||
}
|
||||
|
||||
inline void LoadGroupBinary(const char* fgroup, bool silent = false){
|
||||
//if group data exists load it in
|
||||
FILE *file_group = fopen64(fgroup, "r");
|
||||
if (file_group != NULL){
|
||||
int group_index_size = 0;
|
||||
utils::FileStream group_stream(file_group);
|
||||
utils::Assert(group_stream.Read(&group_index_size, sizeof(int)) != 0, "Load group indice size");
|
||||
group_index.resize(group_index_size);
|
||||
utils::Assert(group_stream.Read(&group_index[0], sizeof(int) * group_index_size) != 0, "Load group indice");
|
||||
|
||||
if (!silent){
|
||||
printf("Index info of %d groups is loaded from %s as binary\n",
|
||||
group_index.size() - 1, fgroup);
|
||||
}
|
||||
fclose(file_group);
|
||||
}else{
|
||||
if(!silent){printf("The binary file of group info not exists");}
|
||||
}
|
||||
|
||||
}
|
||||
@ -161,11 +184,13 @@ namespace xgboost{
|
||||
if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
|
||||
this->LoadBinary(fname, fgroup, silent); return;
|
||||
}
|
||||
char bname[1024];
|
||||
char bname[1024],bgroup[1024];
|
||||
sprintf(bname, "%s.buffer", fname);
|
||||
if (!this->LoadBinary(bname, fgroup, silent)){
|
||||
sprintf(bgroup, "%s.buffer", fgroup);
|
||||
if (!this->LoadBinary(bname, bgroup, silent))
|
||||
{
|
||||
this->LoadText(fname, fgroup, silent);
|
||||
if (savebuffer) this->SaveBinary(bname, fgroup, silent);
|
||||
if (savebuffer) this->SaveBinary(bname, bgroup, silent);
|
||||
}
|
||||
}
|
||||
private:
|
||||
|
||||
@ -96,6 +96,7 @@ namespace xgboost {
|
||||
*/
|
||||
inline void InitModel(void) {
|
||||
base_gbm.InitModel();
|
||||
if(!silent) printf("BoostLearner:InitModel Done!\n");
|
||||
}
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
@ -210,7 +211,6 @@ namespace xgboost {
|
||||
/*! \brief get intransformed predictions, given data */
|
||||
virtual inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset) {
|
||||
preds.resize(data.Size());
|
||||
|
||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j) {
|
||||
|
||||
@ -11,20 +11,11 @@
|
||||
#include "../base/xgboost_boost_task.h"
|
||||
#include "xgboost_rank.h"
|
||||
#include "../regression/xgboost_reg.h"
|
||||
|
||||
#include "../regression/xgboost_reg_main.cpp"
|
||||
#include "../base/xgboost_data_instance.h"
|
||||
/*!
 * \brief entry point of the ranking tool: runs a BoostTask with a RankBoostLearner
 *
 *  argv is handed to BoostTask::Run unchanged and is not dereferenced here,
 *  so argv[1] is never read before Run has had a chance to check argc.
 */
int main(int argc, char *argv[]) {
    // fixed seed so that repeated runs use the same random sequence
    xgboost::random::Seed(0);
    xgboost::base::BoostTask rank_tsk;
    // NOTE(review): ownership of the learner after SetLearner is not visible here - confirm who deletes it
    rank_tsk.SetLearner(new xgboost::rank::RankBoostLearner);
    return rank_tsk.Run(argc, argv);
}
|
||||
|
||||
@ -115,6 +115,7 @@ namespace xgboost {
|
||||
Pairs GenPairs(const std::vector<float> &preds,
               const std::vector<float> &labels,
               int start, int end){
    // a sampler must have been configured before pairs can be generated
    const bool sampler_ready = (sampler_ != NULL);
    utils::Assert(sampler_ready, "Not config the sampler yet. Add rank:sampler in the config file\n");
    // the actual pair generation is delegated to the sampler
    return sampler_->GenPairs(preds, labels, start, end);
}
|
||||
private:
|
||||
|
||||
@ -94,7 +94,8 @@ namespace xgboost{
|
||||
base_gbm.InitTrainer();
|
||||
if (mparam.loss_type == kLogisticClassify){
|
||||
evaluator_.AddEval("error");
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
evaluator_.AddEval("rmse");
|
||||
}
|
||||
evaluator_.Init();
|
||||
|
||||
@ -52,7 +52,8 @@ namespace xgboost{
|
||||
unsigned index; float value;
|
||||
if (sscanf(tmp, "%u:%f", &index, &value) == 2){
|
||||
findex.push_back(index); fvalue.push_back(value);
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
if (!init){
|
||||
labels.push_back(label);
|
||||
data.AddRow(findex, fvalue);
|
||||
|
||||
@ -55,7 +55,8 @@ namespace xgboost{
|
||||
for (unsigned i = 0; i < ndata; ++i){
|
||||
if (preds[i] > 0.5f){
|
||||
if (labels[i] < 0.5f) nerr += 1;
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
if (labels[i] > 0.5f) nerr += 1;
|
||||
}
|
||||
}
|
||||
|
||||
@ -50,7 +50,8 @@ namespace xgboost{
|
||||
}
|
||||
if (task == "pred"){
|
||||
this->TaskPred();
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
this->TaskTrain();
|
||||
}
|
||||
return 0;
|
||||
@ -113,7 +114,8 @@ namespace xgboost{
|
||||
if (task == "dump") return;
|
||||
if (task == "pred" || task == "dumppath"){
|
||||
data.CacheLoad(test_path.c_str(), silent != 0, use_buffer != 0);
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
// training
|
||||
data.CacheLoad(train_path.c_str(), silent != 0, use_buffer != 0);
|
||||
utils::Assert(eval_data_names.size() == eval_data_paths.size());
|
||||
@ -133,7 +135,8 @@ namespace xgboost{
|
||||
utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb"));
|
||||
learner.LoadModel(fi);
|
||||
fi.Close();
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
utils::Assert(task == "train", "model_in not specified");
|
||||
learner.InitModel();
|
||||
}
|
||||
@ -156,7 +159,8 @@ namespace xgboost{
|
||||
if (save_period == 0 || num_round % save_period != 0){
|
||||
if (model_out == "NULL"){
|
||||
this->SaveModel(num_round - 1);
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
this->SaveModel(model_out.c_str());
|
||||
}
|
||||
}
|
||||
@ -177,7 +181,8 @@ namespace xgboost{
|
||||
if (!strcmp(cfg_batch.name(), "run")){
|
||||
learner.UpdateInteract(interact_action);
|
||||
batch_action += 1;
|
||||
} else{
|
||||
}
|
||||
else{
|
||||
learner.SetParam(cfg_batch.name(), cfg_batch.val());
|
||||
}
|
||||
}
|
||||
@ -273,8 +278,8 @@ namespace xgboost{
|
||||
};
|
||||
};
|
||||
|
||||
int main( int argc, char *argv[] ){
|
||||
xgboost::random::Seed( 0 );
|
||||
xgboost::regression::RegBoostTask tsk;
|
||||
return tsk.Run( argc, argv );
|
||||
}
|
||||
// int main( int argc, char *argv[] ){
|
||||
// xgboost::random::Seed( 0 );
|
||||
// xgboost::regression::RegBoostTask tsk;
|
||||
// return tsk.Run( argc, argv );
|
||||
// }
|
||||
|
||||
@ -94,7 +94,8 @@ namespace xgboost{
|
||||
case '\"':
|
||||
if (i == 0){
|
||||
ParseStr(tok); ch_buf = fgetc(fi); return new_line;
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
Error("token followed directly by string");
|
||||
}
|
||||
case '=':
|
||||
@ -102,7 +103,8 @@ namespace xgboost{
|
||||
ch_buf = fgetc(fi);
|
||||
tok[0] = '=';
|
||||
tok[1] = '\0';
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
tok[i] = '\0';
|
||||
}
|
||||
return new_line;
|
||||
@ -155,7 +157,8 @@ namespace xgboost{
|
||||
if (priority == 0){
|
||||
names.push_back(std::string(name));
|
||||
values.push_back(std::string(val));
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
names_high.push_back(std::string(name));
|
||||
values_high.push_back(std::string(val));
|
||||
}
|
||||
@ -184,7 +187,8 @@ namespace xgboost{
|
||||
size_t i = idx - 1;
|
||||
if (i >= names.size()){
|
||||
return names_high[i - names.size()].c_str();
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
return names[i].c_str();
|
||||
}
|
||||
}
|
||||
@ -197,7 +201,8 @@ namespace xgboost{
|
||||
size_t i = idx - 1;
|
||||
if (i >= values.size()){
|
||||
return values_high[i - values.size()].c_str();
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
return values[i].c_str();
|
||||
}
|
||||
}
|
||||
|
||||
@ -50,7 +50,8 @@ namespace xgboost{
|
||||
if (!UseAcList){
|
||||
rptr.clear();
|
||||
rptr.resize(nrows + 1, 0);
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
Assert(nrows + 1 == rptr.size(), "rptr must be initialized already");
|
||||
this->Cleanup();
|
||||
}
|
||||
@ -79,7 +80,8 @@ namespace xgboost{
|
||||
rptr[i] = start;
|
||||
start += rlen;
|
||||
}
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
// case with active list
|
||||
std::sort(aclist.begin(), aclist.end());
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@
|
||||
#if defined(_OPENMP)
|
||||
#include <omp.h>
|
||||
#else
|
||||
//#warning "OpenMP is not available, compile to single thread code"
|
||||
#warning "OpenMP is not available, compile to single thread code"
|
||||
inline int omp_get_thread_num() { return 0; }
|
||||
inline int omp_get_num_threads() { return 1; }
|
||||
inline void omp_set_num_threads(int nthread) {}
|
||||
|
||||
@ -88,7 +88,8 @@ namespace xgboost{
|
||||
u = NextDouble();
|
||||
} while (u == 0.0);
|
||||
return SampleGamma(alpha + 1.0, beta) * pow(u, 1.0 / alpha);
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
double d, c, x, v, u;
|
||||
d = alpha - 1.0 / 3.0;
|
||||
c = 1.0 / sqrt(9.0 * d);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user