make xgcombine buffer work
This commit is contained in:
parent
4ed4b08146
commit
0d9a8c042c
6
Makefile
6
Makefile
@ -1,6 +1,6 @@
|
|||||||
export CC = clang
|
export CC = gcc
|
||||||
export CXX = clang++
|
export CXX = g++
|
||||||
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas
|
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
|
||||||
|
|
||||||
# specify tensor path
|
# specify tensor path
|
||||||
BIN = xgboost
|
BIN = xgboost
|
||||||
|
|||||||
@ -140,7 +140,7 @@ extern "C"{
|
|||||||
for (size_t i = 0; i < len; ++i) {
|
for (size_t i = 0; i < len; ++i) {
|
||||||
const int ridx = idxset[i];
|
const int ridx = idxset[i];
|
||||||
SparseBatch::Inst inst = batch[ridx];
|
SparseBatch::Inst inst = batch[ridx];
|
||||||
utils::Check(ridx < batch.size, "slice index exceed number of rows");
|
utils::Check(static_cast<size_t>(ridx) < batch.size, "slice index exceed number of rows");
|
||||||
ret.row_data_.resize(ret.row_data_.size() + inst.length);
|
ret.row_data_.resize(ret.row_data_.size() + inst.length);
|
||||||
memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data,
|
memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data,
|
||||||
sizeof(SparseBatch::Entry) * inst.length);
|
sizeof(SparseBatch::Entry) * inst.length);
|
||||||
|
|||||||
@ -171,7 +171,7 @@ class SoftmaxMultiClassObj : public IObjFunction {
|
|||||||
rec[k] = preds[j * nclass + k];
|
rec[k] = preds[j * nclass + k];
|
||||||
}
|
}
|
||||||
Softmax(&rec);
|
Softmax(&rec);
|
||||||
unsigned label = static_cast<unsigned>(info.labels[j]);
|
int label = static_cast<int>(info.labels[j]);
|
||||||
utils::Check(label < nclass, "SoftmaxMultiClassObj: label exceed num_class");
|
utils::Check(label < nclass, "SoftmaxMultiClassObj: label exceed num_class");
|
||||||
const float wt = info.GetWeight(j);
|
const float wt = info.GetWeight(j);
|
||||||
for (int k = 0; k < nclass; ++k) {
|
for (int k = 0; k < nclass; ++k) {
|
||||||
|
|||||||
@ -12,7 +12,6 @@ export LDFLAGS= -pthread -lm
|
|||||||
|
|
||||||
xgcombine_buffer : xgcombine_buffer.cpp
|
xgcombine_buffer : xgcombine_buffer.cpp
|
||||||
|
|
||||||
|
|
||||||
$(BIN) :
|
$(BIN) :
|
||||||
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
|
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
|
||||||
|
|
||||||
|
|||||||
@ -10,239 +10,238 @@
|
|||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include "../regrank/xgboost_regrank_data.h"
|
#include "../src/io/simple_dmatrix-inl.hpp"
|
||||||
#include "../utils/xgboost_utils.h"
|
#include "../src/utils/utils.h"
|
||||||
|
|
||||||
using namespace xgboost;
|
using namespace xgboost;
|
||||||
using namespace xgboost::booster;
|
using namespace xgboost::io;
|
||||||
using namespace xgboost::regrank;
|
|
||||||
|
|
||||||
// header in dataset
|
// header in dataset
|
||||||
struct Header{
|
struct Header{
|
||||||
FILE *fi;
|
FILE *fi;
|
||||||
int tmp_num;
|
int tmp_num;
|
||||||
int base;
|
int base;
|
||||||
int num_feat;
|
int num_feat;
|
||||||
// whether it's dense format
|
// whether it's dense format
|
||||||
bool is_dense;
|
bool is_dense;
|
||||||
bool warned;
|
bool warned;
|
||||||
|
|
||||||
Header( void ){ this->warned = false; this->is_dense = false; }
|
Header( void ){ this->warned = false; this->is_dense = false; }
|
||||||
|
|
||||||
inline void CheckBase( unsigned findex ){
|
inline void CheckBase( unsigned findex ){
|
||||||
if( findex >= (unsigned)num_feat && ! warned ) {
|
if( findex >= (unsigned)num_feat && ! warned ) {
|
||||||
fprintf( stderr, "warning:some feature exceed bound, num_feat=%d\n", num_feat );
|
fprintf( stderr, "warning:some feature exceed bound, num_feat=%d\n", num_feat );
|
||||||
warned = true;
|
warned = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
inline int norm( std::vector<Header> &vec, int base = 0 ){
|
inline int norm( std::vector<Header> &vec, int base = 0 ){
|
||||||
int n = base;
|
int n = base;
|
||||||
for( size_t i = 0; i < vec.size(); i ++ ){
|
for( size_t i = 0; i < vec.size(); i ++ ){
|
||||||
if( vec[i].is_dense ) vec[i].num_feat = 1;
|
if( vec[i].is_dense ) vec[i].num_feat = 1;
|
||||||
vec[i].base = n; n += vec[i].num_feat;
|
vec[i].base = n; n += vec[i].num_feat;
|
||||||
}
|
}
|
||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void vclose( std::vector<Header> &vec ){
|
inline void vclose( std::vector<Header> &vec ){
|
||||||
for( size_t i = 0; i < vec.size(); i ++ ){
|
for( size_t i = 0; i < vec.size(); i ++ ){
|
||||||
fclose( vec[i].fi );
|
fclose( vec[i].fi );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int readnum( std::vector<Header> &vec ){
|
inline int readnum( std::vector<Header> &vec ){
|
||||||
int n = 0;
|
int n = 0;
|
||||||
for( size_t i = 0; i < vec.size(); i ++ ){
|
for( size_t i = 0; i < vec.size(); i ++ ){
|
||||||
if( !vec[i].is_dense ){
|
if( !vec[i].is_dense ){
|
||||||
utils::Assert( fscanf( vec[i].fi, "%d", &vec[i].tmp_num ) == 1, "load num" );
|
utils::Assert( fscanf( vec[i].fi, "%d", &vec[i].tmp_num ) == 1, "load num" );
|
||||||
n += vec[i].tmp_num;
|
n += vec[i].tmp_num;
|
||||||
}else{
|
}else{
|
||||||
n ++;
|
n ++;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return n;
|
}
|
||||||
|
return n;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void vskip( std::vector<Header> &vec ){
|
inline void vskip( std::vector<Header> &vec ){
|
||||||
for( size_t i = 0; i < vec.size(); i ++ ){
|
for( size_t i = 0; i < vec.size(); i ++ ){
|
||||||
if( !vec[i].is_dense ){
|
if( !vec[i].is_dense ){
|
||||||
utils::Assert( fscanf( vec[i].fi, "%*d%*[^\n]\n" ) >= 0 );
|
utils::Assert( fscanf( vec[i].fi, "%*d%*[^\n]\n" ) >= 0, "sparse" );
|
||||||
}else{
|
}else{
|
||||||
utils::Assert( fscanf( vec[i].fi, "%*f\n" ) >= 0 );
|
utils::Assert( fscanf( vec[i].fi, "%*f\n" ) >= 0, "dense" );
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class DataLoader: public DMatrix{
|
class DataLoader: public DMatrixSimple {
|
||||||
public:
|
public:
|
||||||
// whether to do node and edge feature renormalization
|
// whether to do node and edge feature renormalization
|
||||||
int rescale;
|
int rescale;
|
||||||
int linelimit;
|
int linelimit;
|
||||||
public:
|
public:
|
||||||
FILE *fp, *fwlist, *fgroup, *fweight;
|
FILE *fp, *fwlist, *fgroup, *fweight;
|
||||||
std::vector<Header> fheader;
|
std::vector<Header> fheader;
|
||||||
std::vector<FMatrixS::REntry> entry;
|
DataLoader( void ){
|
||||||
DataLoader( void ){
|
rescale = 0;
|
||||||
rescale = 0;
|
linelimit = -1;
|
||||||
linelimit = -1;
|
fp = NULL; fwlist = NULL; fgroup = NULL; fweight = NULL;
|
||||||
fp = NULL; fwlist = NULL; fgroup = NULL; fweight = NULL;
|
}
|
||||||
|
private:
|
||||||
|
inline void Load( std::vector<SparseBatch::Entry> &feats, std::vector<Header> &vec ){
|
||||||
|
SparseBatch::Entry e;
|
||||||
|
for( size_t i = 0; i < vec.size(); i ++ ){
|
||||||
|
if( !vec[i].is_dense ) {
|
||||||
|
for( int j = 0; j < vec[i].tmp_num; j ++ ){
|
||||||
|
utils::Assert( fscanf ( vec[i].fi, "%u:%f", &e.findex, &e.fvalue ) == 2, "Error when load feat" );
|
||||||
|
vec[i].CheckBase( e.findex );
|
||||||
|
e.findex += vec[i].base;
|
||||||
|
feats.push_back(e);
|
||||||
|
}
|
||||||
|
}else{
|
||||||
|
utils::Assert( fscanf ( vec[i].fi, "%f", &e.fvalue ) == 1, "load feat" );
|
||||||
|
e.findex = vec[i].base;
|
||||||
|
feats.push_back(e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
private:
|
}
|
||||||
inline void Load( std::vector<unsigned> &findex, std::vector<float> &fvalue, std::vector<Header> &vec ){
|
inline void DoRescale( std::vector<SparseBatch::Entry> &vec ){
|
||||||
unsigned fidx; float fv;
|
double sum = 0.0;
|
||||||
for( size_t i = 0; i < vec.size(); i ++ ){
|
for( size_t i = 0; i < vec.size(); i ++ ){
|
||||||
if( !vec[i].is_dense ) {
|
sum += vec[i].fvalue * vec[i].fvalue;
|
||||||
for( int j = 0; j < vec[i].tmp_num; j ++ ){
|
}
|
||||||
utils::Assert( fscanf ( vec[i].fi, "%u:%f", &fidx, &fv ) == 2, "Error when load feat" );
|
sum = sqrt( sum );
|
||||||
vec[i].CheckBase( fidx );
|
for( size_t i = 0; i < vec.size(); i ++ ){
|
||||||
fidx += vec[i].base;
|
vec[i].fvalue /= sum;
|
||||||
findex.push_back( fidx ); fvalue.push_back( fv );
|
}
|
||||||
}
|
}
|
||||||
}else{
|
public:
|
||||||
utils::Assert( fscanf ( vec[i].fi, "%f", &fv ) == 1, "load feat" );
|
// basically we are loading all the data inside
|
||||||
fidx = vec[i].base;
|
inline void Load( void ){
|
||||||
findex.push_back( fidx ); fvalue.push_back( fv );
|
this->Clear();
|
||||||
}
|
float label, weight = 0.0f;
|
||||||
}
|
|
||||||
|
unsigned ngleft = 0, ngacc = 0;
|
||||||
|
if( fgroup != NULL ){
|
||||||
|
info.group_ptr.clear();
|
||||||
|
info.group_ptr.push_back(0);
|
||||||
}
|
}
|
||||||
inline void DoRescale( std::vector<float> &vec ){
|
|
||||||
double sum = 0.0;
|
while( fscanf( fp, "%f", &label ) == 1 ){
|
||||||
for( size_t i = 0; i < vec.size(); i ++ ){
|
if( ngleft == 0 && fgroup != NULL ){
|
||||||
sum += vec[i] * vec[i];
|
utils::Assert( fscanf( fgroup, "%u", &ngleft ) == 1, "group" );
|
||||||
}
|
}
|
||||||
sum = sqrt( sum );
|
if( fweight != NULL ){
|
||||||
for( size_t i = 0; i < vec.size(); i ++ ){
|
utils::Assert( fscanf( fweight, "%f", &weight ) == 1, "weight" );
|
||||||
vec[i] /= sum;
|
}
|
||||||
}
|
|
||||||
}
|
ngleft -= 1; ngacc += 1;
|
||||||
public:
|
|
||||||
// basically we are loading all the data inside
|
int pass = 1;
|
||||||
inline void Load( void ){
|
if( fwlist != NULL ){
|
||||||
this->data.Clear();
|
utils::Assert( fscanf( fwlist, "%u", &pass ) ==1, "pass" );
|
||||||
float label, weight = 0.0f;
|
}
|
||||||
|
if( pass == 0 ){
|
||||||
unsigned ngleft = 0, ngacc = 0;
|
vskip( fheader ); ngacc -= 1;
|
||||||
if( fgroup != NULL ){
|
}else{
|
||||||
info.group_ptr.clear();
|
const int nfeat = readnum( fheader );
|
||||||
info.group_ptr.push_back(0);
|
|
||||||
|
std::vector<SparseBatch::Entry> feats;
|
||||||
|
|
||||||
|
// pairs
|
||||||
|
this->Load( feats, fheader );
|
||||||
|
utils::Assert( feats.size() == (unsigned)nfeat, "nfeat" );
|
||||||
|
if( rescale != 0 ) this->DoRescale( feats );
|
||||||
|
// push back data :)
|
||||||
|
this->info.labels.push_back( label );
|
||||||
|
// push back weight if any
|
||||||
|
if( fweight != NULL ){
|
||||||
|
this->info.weights.push_back( weight );
|
||||||
}
|
}
|
||||||
|
this->AddRow( feats );
|
||||||
while( fscanf( fp, "%f", &label ) == 1 ){
|
}
|
||||||
if( ngleft == 0 && fgroup != NULL ){
|
if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){
|
||||||
utils::Assert( fscanf( fgroup, "%u", &ngleft ) == 1 );
|
info.group_ptr.push_back( info.group_ptr.back() + ngacc );
|
||||||
}
|
utils::Assert( info.group_ptr.back() == info.num_row, "group size must match num rows" );
|
||||||
if( fweight != NULL ){
|
ngacc = 0;
|
||||||
utils::Assert( fscanf( fweight, "%f", &weight ) == 1 );
|
}
|
||||||
}
|
// linelimit
|
||||||
|
if( linelimit >= 0 ) {
|
||||||
ngleft -= 1; ngacc += 1;
|
if( -- linelimit <= 0 ) break;
|
||||||
|
}
|
||||||
int pass = 1;
|
|
||||||
if( fwlist != NULL ){
|
|
||||||
utils::Assert( fscanf( fwlist, "%u", &pass ) ==1 );
|
|
||||||
}
|
|
||||||
if( pass == 0 ){
|
|
||||||
vskip( fheader ); ngacc -= 1;
|
|
||||||
}else{
|
|
||||||
const int nfeat = readnum( fheader );
|
|
||||||
std::vector<unsigned> findex;
|
|
||||||
std::vector<float> fvalue;
|
|
||||||
// pairs
|
|
||||||
this->Load( findex, fvalue, fheader );
|
|
||||||
utils::Assert( findex.size() == (unsigned)nfeat );
|
|
||||||
if( rescale != 0 ) this->DoRescale( fvalue );
|
|
||||||
// push back data :)
|
|
||||||
this->info.labels.push_back( label );
|
|
||||||
// push back weight if any
|
|
||||||
if( fweight != NULL ){
|
|
||||||
this->info.weights.push_back( weight );
|
|
||||||
}
|
|
||||||
this->data.AddRow( findex, fvalue );
|
|
||||||
}
|
|
||||||
if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){
|
|
||||||
info.group_ptr.push_back( info.group_ptr.back() + ngacc );
|
|
||||||
utils::Assert( info.group_ptr.back() == data.NumRow(), "group size must match num rows" );
|
|
||||||
ngacc = 0;
|
|
||||||
}
|
|
||||||
// linelimit
|
|
||||||
if( linelimit >= 0 ) {
|
|
||||||
if( -- linelimit <= 0 ) break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){
|
|
||||||
info.group_ptr.push_back( info.group_ptr.back() + ngacc );
|
|
||||||
utils::Assert( info.group_ptr.back() == data.NumRow(), "group size must match num rows" );
|
|
||||||
}
|
|
||||||
this->data.InitData();
|
|
||||||
}
|
}
|
||||||
|
if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){
|
||||||
|
info.group_ptr.push_back( info.group_ptr.back() + ngacc );
|
||||||
|
utils::Assert( info.group_ptr.back() == info.num_row, "group size must match num rows" );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const char *folder = "features";
|
const char *folder = "features";
|
||||||
|
|
||||||
int main( int argc, char *argv[] ){
|
int main( int argc, char *argv[] ){
|
||||||
if( argc < 3 ){
|
if( argc < 3 ){
|
||||||
printf("Usage:xgcombine_buffer <inname> <outname> [options] -f [features] -fd [densefeatures]\n"\
|
printf("Usage:xgcombine_buffer <inname> <outname> [options] -f [features] -fd [densefeatures]\n" \
|
||||||
"options: -rescale -linelimit -fgroup <groupfilename> -wlist <whitelistinstance>\n");
|
"options: -rescale -linelimit -fgroup <groupfilename> -wlist <whitelistinstance>\n");
|
||||||
return 0;
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
DataLoader loader;
|
||||||
|
time_t start = time( NULL );
|
||||||
|
|
||||||
|
int mode = 0;
|
||||||
|
for( int i = 3; i < argc; i ++ ){
|
||||||
|
if( !strcmp( argv[i], "-f") ){
|
||||||
|
mode = 0; continue;
|
||||||
}
|
}
|
||||||
|
if( !strcmp( argv[i], "-fd") ){
|
||||||
DataLoader loader;
|
mode = 2; continue;
|
||||||
time_t start = time( NULL );
|
}
|
||||||
|
if( !strcmp( argv[i], "-rescale") ){
|
||||||
int mode = 0;
|
loader.rescale = 1; continue;
|
||||||
for( int i = 3; i < argc; i ++ ){
|
}
|
||||||
if( !strcmp( argv[i], "-f") ){
|
if( !strcmp( argv[i], "-wlist") ){
|
||||||
mode = 0; continue;
|
loader.fwlist = utils::FopenCheck( argv[ ++i ], "r" ); continue;
|
||||||
}
|
}
|
||||||
if( !strcmp( argv[i], "-fd") ){
|
if( !strcmp( argv[i], "-fgroup") ){
|
||||||
mode = 2; continue;
|
loader.fgroup = utils::FopenCheck( argv[ ++i ], "r" ); continue;
|
||||||
}
|
}
|
||||||
if( !strcmp( argv[i], "-rescale") ){
|
if( !strcmp( argv[i], "-fweight") ){
|
||||||
loader.rescale = 1; continue;
|
loader.fweight = utils::FopenCheck( argv[ ++i ], "r" ); continue;
|
||||||
}
|
}
|
||||||
if( !strcmp( argv[i], "-wlist") ){
|
if( !strcmp( argv[i], "-linelimit") ){
|
||||||
loader.fwlist = utils::FopenCheck( argv[ ++i ], "r" ); continue;
|
loader.linelimit = atoi( argv[ ++i ] ); continue;
|
||||||
}
|
|
||||||
if( !strcmp( argv[i], "-fgroup") ){
|
|
||||||
loader.fgroup = utils::FopenCheck( argv[ ++i ], "r" ); continue;
|
|
||||||
}
|
|
||||||
if( !strcmp( argv[i], "-fweight") ){
|
|
||||||
loader.fweight = utils::FopenCheck( argv[ ++i ], "r" ); continue;
|
|
||||||
}
|
|
||||||
if( !strcmp( argv[i], "-linelimit") ){
|
|
||||||
loader.linelimit = atoi( argv[ ++i ] ); continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
char name[ 256 ];
|
|
||||||
sprintf( name, "%s/%s.%s", folder, argv[1], argv[i] );
|
|
||||||
Header h;
|
|
||||||
h.fi = utils::FopenCheck( name, "r" );
|
|
||||||
|
|
||||||
if( mode == 2 ){
|
|
||||||
h.is_dense = true; h.num_feat = 1;
|
|
||||||
loader.fheader.push_back( h );
|
|
||||||
}else{
|
|
||||||
utils::Assert( fscanf( h.fi, "%d", &h.num_feat ) == 1, "num feat" );
|
|
||||||
switch( mode ){
|
|
||||||
case 0: loader.fheader.push_back( h ); break;
|
|
||||||
default: ;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
loader.fp = utils::FopenCheck( argv[1], "r" );
|
|
||||||
|
|
||||||
printf("num_features=%d\n", norm( loader.fheader ) );
|
char name[ 256 ];
|
||||||
printf("start creating buffer...\n");
|
sprintf( name, "%s/%s.%s", folder, argv[1], argv[i] );
|
||||||
loader.Load();
|
Header h;
|
||||||
loader.SaveBinary( argv[2] );
|
h.fi = utils::FopenCheck( name, "r" );
|
||||||
// close files
|
|
||||||
fclose( loader.fp );
|
if( mode == 2 ){
|
||||||
if( loader.fwlist != NULL ) fclose( loader.fwlist );
|
h.is_dense = true; h.num_feat = 1;
|
||||||
if( loader.fgroup != NULL ) fclose( loader.fgroup );
|
loader.fheader.push_back( h );
|
||||||
vclose( loader.fheader );
|
}else{
|
||||||
printf("all generation end, %lu sec used\n", (unsigned long)(time(NULL) - start) );
|
utils::Assert( fscanf( h.fi, "%d", &h.num_feat ) == 1, "num feat" );
|
||||||
return 0;
|
switch( mode ){
|
||||||
|
case 0: loader.fheader.push_back( h ); break;
|
||||||
|
default: ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
loader.fp = utils::FopenCheck( argv[1], "r" );
|
||||||
|
|
||||||
|
printf("num_features=%d\n", norm( loader.fheader ) );
|
||||||
|
printf("start creating buffer...\n");
|
||||||
|
loader.Load();
|
||||||
|
loader.SaveBinary( argv[2] );
|
||||||
|
// close files
|
||||||
|
fclose( loader.fp );
|
||||||
|
if( loader.fwlist != NULL ) fclose( loader.fwlist );
|
||||||
|
if( loader.fgroup != NULL ) fclose( loader.fgroup );
|
||||||
|
vclose( loader.fheader );
|
||||||
|
printf("all generation end, %lu sec used\n", (unsigned long)(time(NULL) - start) );
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user