diff --git a/Makefile b/Makefile index 8c9980ac1..83a927e8c 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ -export CC = clang -export CXX = clang++ -export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas +export CC = gcc +export CXX = g++ +export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp # specify tensor path BIN = xgboost diff --git a/python/xgboost_wrapper.cpp b/python/xgboost_wrapper.cpp index 8b89d1d25..edda96c29 100644 --- a/python/xgboost_wrapper.cpp +++ b/python/xgboost_wrapper.cpp @@ -140,7 +140,7 @@ extern "C"{ for (size_t i = 0; i < len; ++i) { const int ridx = idxset[i]; SparseBatch::Inst inst = batch[ridx]; - utils::Check(ridx < batch.size, "slice index exceed number of rows"); + utils::Check(static_cast(ridx) < batch.size, "slice index exceed number of rows"); ret.row_data_.resize(ret.row_data_.size() + inst.length); memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data, sizeof(SparseBatch::Entry) * inst.length); diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp index 29e6d3393..163d0d283 100644 --- a/src/learner/objective-inl.hpp +++ b/src/learner/objective-inl.hpp @@ -171,7 +171,7 @@ class SoftmaxMultiClassObj : public IObjFunction { rec[k] = preds[j * nclass + k]; } Softmax(&rec); - unsigned label = static_cast(info.labels[j]); + int label = static_cast(info.labels[j]); utils::Check(label < nclass, "SoftmaxMultiClassObj: label exceed num_class"); const float wt = info.GetWeight(j); for (int k = 0; k < nclass; ++k) { diff --git a/tools/Makefile b/tools/Makefile index fec764b5a..b35277aff 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -12,7 +12,6 @@ export LDFLAGS= -pthread -lm xgcombine_buffer : xgcombine_buffer.cpp - $(BIN) : $(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) diff --git a/tools/xgcombine_buffer.cpp b/tools/xgcombine_buffer.cpp index 16bc12894..84bc996a4 100644 --- a/tools/xgcombine_buffer.cpp +++ b/tools/xgcombine_buffer.cpp @@ -10,239 +10,238 @@ #include #include #include -#include "../regrank/xgboost_regrank_data.h" -#include "../utils/xgboost_utils.h" +#include "../src/io/simple_dmatrix-inl.hpp" +#include "../src/utils/utils.h" using namespace xgboost; -using namespace xgboost::booster; -using namespace xgboost::regrank; +using namespace xgboost::io; // header in dataset struct Header{ - FILE *fi; - int tmp_num; - int base; - int num_feat; - // whether it's dense format - bool is_dense; - bool warned; - - Header( void ){ this->warned = false; this->is_dense = false; } - - inline void CheckBase( unsigned findex ){ - if( findex >= (unsigned)num_feat && ! warned ) { - fprintf( stderr, "warning:some feature exceed bound, num_feat=%d\n", num_feat ); - warned = true; - } - } + FILE *fi; + int tmp_num; + int base; + int num_feat; + // whether it's dense format + bool is_dense; + bool warned; + + Header( void ){ this->warned = false; this->is_dense = false; } + + inline void CheckBase( unsigned findex ){ + if( findex >= (unsigned)num_feat && ! warned ) { + fprintf( stderr, "warning:some feature exceed bound, num_feat=%d\n", num_feat ); + warned = true; + } + } }; inline int norm( std::vector
&vec, int base = 0 ){ - int n = base; - for( size_t i = 0; i < vec.size(); i ++ ){ - if( vec[i].is_dense ) vec[i].num_feat = 1; - vec[i].base = n; n += vec[i].num_feat; - } - return n; + int n = base; + for( size_t i = 0; i < vec.size(); i ++ ){ + if( vec[i].is_dense ) vec[i].num_feat = 1; + vec[i].base = n; n += vec[i].num_feat; + } + return n; } inline void vclose( std::vector
&vec ){ - for( size_t i = 0; i < vec.size(); i ++ ){ - fclose( vec[i].fi ); - } + for( size_t i = 0; i < vec.size(); i ++ ){ + fclose( vec[i].fi ); + } } inline int readnum( std::vector
&vec ){ - int n = 0; - for( size_t i = 0; i < vec.size(); i ++ ){ - if( !vec[i].is_dense ){ - utils::Assert( fscanf( vec[i].fi, "%d", &vec[i].tmp_num ) == 1, "load num" ); - n += vec[i].tmp_num; - }else{ - n ++; - } + int n = 0; + for( size_t i = 0; i < vec.size(); i ++ ){ + if( !vec[i].is_dense ){ + utils::Assert( fscanf( vec[i].fi, "%d", &vec[i].tmp_num ) == 1, "load num" ); + n += vec[i].tmp_num; + }else{ + n ++; } - return n; + } + return n; } inline void vskip( std::vector
&vec ){ - for( size_t i = 0; i < vec.size(); i ++ ){ - if( !vec[i].is_dense ){ - utils::Assert( fscanf( vec[i].fi, "%*d%*[^\n]\n" ) >= 0 ); - }else{ - utils::Assert( fscanf( vec[i].fi, "%*f\n" ) >= 0 ); - } + for( size_t i = 0; i < vec.size(); i ++ ){ + if( !vec[i].is_dense ){ + utils::Assert( fscanf( vec[i].fi, "%*d%*[^\n]\n" ) >= 0, "sparse" ); + }else{ + utils::Assert( fscanf( vec[i].fi, "%*f\n" ) >= 0, "dense" ); } + } } -class DataLoader: public DMatrix{ -public: - // whether to do node and edge feature renormalization - int rescale; - int linelimit; -public: - FILE *fp, *fwlist, *fgroup, *fweight; - std::vector
fheader; - std::vector entry; - DataLoader( void ){ - rescale = 0; - linelimit = -1; - fp = NULL; fwlist = NULL; fgroup = NULL; fweight = NULL; +class DataLoader: public DMatrixSimple { + public: + // whether to do node and edge feature renormalization + int rescale; + int linelimit; + public: + FILE *fp, *fwlist, *fgroup, *fweight; + std::vector
fheader; + DataLoader( void ){ + rescale = 0; + linelimit = -1; + fp = NULL; fwlist = NULL; fgroup = NULL; fweight = NULL; + } + private: + inline void Load( std::vector &feats, std::vector
&vec ){ + SparseBatch::Entry e; + for( size_t i = 0; i < vec.size(); i ++ ){ + if( !vec[i].is_dense ) { + for( int j = 0; j < vec[i].tmp_num; j ++ ){ + utils::Assert( fscanf ( vec[i].fi, "%u:%f", &e.findex, &e.fvalue ) == 2, "Error when load feat" ); + vec[i].CheckBase( e.findex ); + e.findex += vec[i].base; + feats.push_back(e); + } + }else{ + utils::Assert( fscanf ( vec[i].fi, "%f", &e.fvalue ) == 1, "load feat" ); + e.findex = vec[i].base; + feats.push_back(e); + } } -private: - inline void Load( std::vector &findex, std::vector &fvalue, std::vector
&vec ){ - unsigned fidx; float fv; - for( size_t i = 0; i < vec.size(); i ++ ){ - if( !vec[i].is_dense ) { - for( int j = 0; j < vec[i].tmp_num; j ++ ){ - utils::Assert( fscanf ( vec[i].fi, "%u:%f", &fidx, &fv ) == 2, "Error when load feat" ); - vec[i].CheckBase( fidx ); - fidx += vec[i].base; - findex.push_back( fidx ); fvalue.push_back( fv ); - } - }else{ - utils::Assert( fscanf ( vec[i].fi, "%f", &fv ) == 1, "load feat" ); - fidx = vec[i].base; - findex.push_back( fidx ); fvalue.push_back( fv ); - } - } + } + inline void DoRescale( std::vector &vec ){ + double sum = 0.0; + for( size_t i = 0; i < vec.size(); i ++ ){ + sum += vec[i].fvalue * vec[i].fvalue; + } + sum = sqrt( sum ); + for( size_t i = 0; i < vec.size(); i ++ ){ + vec[i].fvalue /= sum; + } + } + public: + // basically we are loading all the data inside + inline void Load( void ){ + this->Clear(); + float label, weight = 0.0f; + + unsigned ngleft = 0, ngacc = 0; + if( fgroup != NULL ){ + info.group_ptr.clear(); + info.group_ptr.push_back(0); } - inline void DoRescale( std::vector &vec ){ - double sum = 0.0; - for( size_t i = 0; i < vec.size(); i ++ ){ - sum += vec[i] * vec[i]; - } - sum = sqrt( sum ); - for( size_t i = 0; i < vec.size(); i ++ ){ - vec[i] /= sum; - } - } -public: - // basically we are loading all the data inside - inline void Load( void ){ - this->data.Clear(); - float label, weight = 0.0f; - - unsigned ngleft = 0, ngacc = 0; - if( fgroup != NULL ){ - info.group_ptr.clear(); - info.group_ptr.push_back(0); + + while( fscanf( fp, "%f", &label ) == 1 ){ + if( ngleft == 0 && fgroup != NULL ){ + utils::Assert( fscanf( fgroup, "%u", &ngleft ) == 1, "group" ); + } + if( fweight != NULL ){ + utils::Assert( fscanf( fweight, "%f", &weight ) == 1, "weight" ); + } + + ngleft -= 1; ngacc += 1; + + int pass = 1; + if( fwlist != NULL ){ + utils::Assert( fscanf( fwlist, "%u", &pass ) ==1, "pass" ); + } + if( pass == 0 ){ + vskip( fheader ); ngacc -= 1; + }else{ + const int nfeat = readnum( fheader ); + + std::vector feats; + + // pairs + this->Load( feats, fheader ); + utils::Assert( feats.size() == (unsigned)nfeat, "nfeat" ); + if( rescale != 0 ) this->DoRescale( feats ); + // push back data :) + this->info.labels.push_back( label ); + // push back weight if any + if( fweight != NULL ){ + this->info.weights.push_back( weight ); } - - while( fscanf( fp, "%f", &label ) == 1 ){ - if( ngleft == 0 && fgroup != NULL ){ - utils::Assert( fscanf( fgroup, "%u", &ngleft ) == 1 ); - } - if( fweight != NULL ){ - utils::Assert( fscanf( fweight, "%f", &weight ) == 1 ); - } - - ngleft -= 1; ngacc += 1; - - int pass = 1; - if( fwlist != NULL ){ - utils::Assert( fscanf( fwlist, "%u", &pass ) ==1 ); - } - if( pass == 0 ){ - vskip( fheader ); ngacc -= 1; - }else{ - const int nfeat = readnum( fheader ); - std::vector findex; - std::vector fvalue; - // pairs - this->Load( findex, fvalue, fheader ); - utils::Assert( findex.size() == (unsigned)nfeat ); - if( rescale != 0 ) this->DoRescale( fvalue ); - // push back data :) - this->info.labels.push_back( label ); - // push back weight if any - if( fweight != NULL ){ - this->info.weights.push_back( weight ); - } - this->data.AddRow( findex, fvalue ); - } - if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){ - info.group_ptr.push_back( info.group_ptr.back() + ngacc ); - utils::Assert( info.group_ptr.back() == data.NumRow(), "group size must match num rows" ); - ngacc = 0; - } - // linelimit - if( linelimit >= 0 ) { - if( -- linelimit <= 0 ) break; - } - } - if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){ - info.group_ptr.push_back( info.group_ptr.back() + ngacc ); - utils::Assert( info.group_ptr.back() == data.NumRow(), "group size must match num rows" ); - } - this->data.InitData(); + this->AddRow( feats ); + } + if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){ + info.group_ptr.push_back( info.group_ptr.back() + ngacc ); + utils::Assert( info.group_ptr.back() == info.num_row, "group size must match num rows" ); + ngacc = 0; + } + // linelimit + if( linelimit >= 0 ) { + if( -- linelimit <= 0 ) break; + } } + if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){ + info.group_ptr.push_back( info.group_ptr.back() + ngacc ); + utils::Assert( info.group_ptr.back() == info.num_row, "group size must match num rows" ); + } + } + }; const char *folder = "features"; int main( int argc, char *argv[] ){ - if( argc < 3 ){ - printf("Usage:xgcombine_buffer [options] -f [features] -fd [densefeatures]\n"\ - "options: -rescale -linelimit -fgroup -wlist \n"); - return 0; + if( argc < 3 ){ + printf("Usage:xgcombine_buffer [options] -f [features] -fd [densefeatures]\n" \ + "options: -rescale -linelimit -fgroup -wlist \n"); + return 0; + } + + DataLoader loader; + time_t start = time( NULL ); + + int mode = 0; + for( int i = 3; i < argc; i ++ ){ + if( !strcmp( argv[i], "-f") ){ + mode = 0; continue; } - - DataLoader loader; - time_t start = time( NULL ); - - int mode = 0; - for( int i = 3; i < argc; i ++ ){ - if( !strcmp( argv[i], "-f") ){ - mode = 0; continue; - } - if( !strcmp( argv[i], "-fd") ){ - mode = 2; continue; - } - if( !strcmp( argv[i], "-rescale") ){ - loader.rescale = 1; continue; - } - if( !strcmp( argv[i], "-wlist") ){ - loader.fwlist = utils::FopenCheck( argv[ ++i ], "r" ); continue; - } - if( !strcmp( argv[i], "-fgroup") ){ - loader.fgroup = utils::FopenCheck( argv[ ++i ], "r" ); continue; - } - if( !strcmp( argv[i], "-fweight") ){ - loader.fweight = utils::FopenCheck( argv[ ++i ], "r" ); continue; - } - if( !strcmp( argv[i], "-linelimit") ){ - loader.linelimit = atoi( argv[ ++i ] ); continue; - } - - char name[ 256 ]; - sprintf( name, "%s/%s.%s", folder, argv[1], argv[i] ); - Header h; - h.fi = utils::FopenCheck( name, "r" ); - - if( mode == 2 ){ - h.is_dense = true; h.num_feat = 1; - loader.fheader.push_back( h ); - }else{ - utils::Assert( fscanf( h.fi, "%d", &h.num_feat ) == 1, "num feat" ); - switch( mode ){ - case 0: loader.fheader.push_back( h ); break; - default: ; - } - } + if( !strcmp( argv[i], "-fd") ){ + mode = 2; continue; + } + if( !strcmp( argv[i], "-rescale") ){ + loader.rescale = 1; continue; + } + if( !strcmp( argv[i], "-wlist") ){ + loader.fwlist = utils::FopenCheck( argv[ ++i ], "r" ); continue; + } + if( !strcmp( argv[i], "-fgroup") ){ + loader.fgroup = utils::FopenCheck( argv[ ++i ], "r" ); continue; + } + if( !strcmp( argv[i], "-fweight") ){ + loader.fweight = utils::FopenCheck( argv[ ++i ], "r" ); continue; + } + if( !strcmp( argv[i], "-linelimit") ){ + loader.linelimit = atoi( argv[ ++i ] ); continue; } - loader.fp = utils::FopenCheck( argv[1], "r" ); - printf("num_features=%d\n", norm( loader.fheader ) ); - printf("start creating buffer...\n"); - loader.Load(); - loader.SaveBinary( argv[2] ); - // close files - fclose( loader.fp ); - if( loader.fwlist != NULL ) fclose( loader.fwlist ); - if( loader.fgroup != NULL ) fclose( loader.fgroup ); - vclose( loader.fheader ); - printf("all generation end, %lu sec used\n", (unsigned long)(time(NULL) - start) ); - return 0; + char name[ 256 ]; + sprintf( name, "%s/%s.%s", folder, argv[1], argv[i] ); + Header h; + h.fi = utils::FopenCheck( name, "r" ); + + if( mode == 2 ){ + h.is_dense = true; h.num_feat = 1; + loader.fheader.push_back( h ); + }else{ + utils::Assert( fscanf( h.fi, "%d", &h.num_feat ) == 1, "num feat" ); + switch( mode ){ + case 0: loader.fheader.push_back( h ); break; + default: ; + } + } + } + loader.fp = utils::FopenCheck( argv[1], "r" ); + + printf("num_features=%d\n", norm( loader.fheader ) ); + printf("start creating buffer...\n"); + loader.Load(); + loader.SaveBinary( argv[2] ); + // close files + fclose( loader.fp ); + if( loader.fwlist != NULL ) fclose( loader.fwlist ); + if( loader.fgroup != NULL ) fclose( loader.fgroup ); + vclose( loader.fheader ); + printf("all generation end, %lu sec used\n", (unsigned long)(time(NULL) - start) ); + return 0; }