diff --git a/regrank/xgboost_regrank_data.h b/regrank/xgboost_regrank_data.h
index 344c13115..3653021c5 100644
--- a/regrank/xgboost_regrank_data.h
+++ b/regrank/xgboost_regrank_data.h
@@ -15,6 +15,8 @@
  */
 #include <cstdio>
 #include <vector>
+#include <limits>
+#include <climits>
 #include "../booster/xgboost_data.h"
 #include "../utils/xgboost_utils.h"
 #include "../utils/xgboost_stream.h"
@@ -106,6 +108,16 @@ namespace xgboost{
                 data.LoadBinary(fs);
                 info.labels.resize(data.NumRow());
                 utils::Assert(fs.Read(&info.labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary");
+                {// load in group ptr, a failed read of the count is tolerated and leaves group_ptr untouched
+                    unsigned ngptr;
+                    if( fs.Read(&ngptr, sizeof(unsigned) ) != 0 ){
+                        info.group_ptr.resize( ngptr );
+                        // an empty group_ptr has no element 0 to read into
+                        if( ngptr != 0 ){
+                            utils::Assert( fs.Read(&info.group_ptr[0], sizeof(unsigned) * ngptr) != 0, "Load group file");
+                        }
+                    }
+                }
                 fs.Close();
                 // initialize column support as well
                 data.InitData();
@@ -113,8 +125,10 @@ namespace xgboost{
                 if (!silent){
                     printf("%ux%u matrix with %lu entries is loaded from %s\n",
                            (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
+                    if( info.group_ptr.size() != 0 ){
+                        printf("data contains %u groups\n", (unsigned)info.group_ptr.size() - 1 );
+                    }
                 }
-                this->TryLoadGroup(fname, silent);
                 this->TryLoadWeight(fname, silent);
                 return true;
             }
@@ -129,11 +143,23 @@ namespace xgboost{
 
                 utils::FileStream fs(utils::FopenCheck(fname, "wb"));
                 data.SaveBinary(fs);
-                fs.Write(&info.labels[0], sizeof(float)* data.NumRow());
+                utils::Assert( info.labels.size() == data.NumRow(), "label size is not consistent with feature matrix size" );
+                fs.Write(&info.labels[0], sizeof(float) * data.NumRow());
+                {// write out group ptr
+                    unsigned ngptr = static_cast<unsigned>( info.group_ptr.size() );
+                    fs.Write(&ngptr, sizeof(unsigned) );
+                    // only the count is written when there are no groups
+                    if( ngptr != 0 ){
+                        fs.Write(&info.group_ptr[0], sizeof(unsigned) * ngptr);
+                    }
+                }
                 fs.Close();
                 if (!silent){
                     printf("%ux%u matrix with %lu entries is saved to %s\n",
                            (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
+                    if( info.group_ptr.size() != 0 ){
+                        printf("data contains %u groups\n", (unsigned)info.group_ptr.size() - 1 );
+                    }
                 }
             }
             /*!
diff --git a/tools/Makefile b/tools/Makefile
new file mode 100644
index 000000000..fec764b5a
--- /dev/null
+++ b/tools/Makefile
@@ -0,0 +1,26 @@
+export CC = gcc
+export CXX = g++
+export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
+
+# tools to build
+BIN = xgcombine_buffer
+OBJ =
+.PHONY: clean all install
+
+all: $(BIN) $(OBJ)
+export LDFLAGS= -pthread -lm
+
+xgcombine_buffer : xgcombine_buffer.cpp
+
+
+$(BIN) :
+	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS)
+
+$(OBJ) :
+	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+
+install:
+	cp -f -r $(BIN) $(INSTALL_PATH)
+
+clean:
+	$(RM) $(OBJ) $(BIN) *~
diff --git a/tools/xgcombine_buffer.cpp b/tools/xgcombine_buffer.cpp
new file mode 100644
index 000000000..878408894
--- /dev/null
+++ b/tools/xgcombine_buffer.cpp
@@ -0,0 +1,283 @@
+/*!
+ * a tool to combine different sets of features into a binary buffer
+ * not well organized code, but does its job
+ * \author Tianqi Chen: tianqi.tchen@gmail.com
+ */
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <string>
+#include <vector>
+#include "../regrank/xgboost_regrank_data.h"
+#include "../utils/xgboost_utils.h"
+
+using namespace xgboost;
+using namespace xgboost::booster;
+using namespace xgboost::regrank;
+
+/*! \brief one input feature file together with its read state */
+struct Header{
+    /*! \brief handle of the feature file */
+    FILE *fi;
+    /*! \brief number of index:value pairs of the current row, set by readnum (sparse files only) */
+    int tmp_num;
+    /*! \brief offset added to every feature index of this file, set by norm */
+    int base;
+    /*! \brief number of features of this file; 1 for dense files */
+    int num_feat;
+    /*! \brief whether it's dense format: one value per row, no index */
+    bool is_dense;
+    /*! \brief whether the out-of-bound warning was already printed */
+    bool warned;
+
+    Header( void ){
+        this->fi = NULL;
+        this->tmp_num = 0; this->base = 0; this->num_feat = 0;
+        this->is_dense = false; this->warned = false;
+    }
+    /*! \brief warn once when a file-local feature index is not below num_feat */
+    inline void CheckBase( unsigned findex ){
+        if( findex >= (unsigned)num_feat && ! warned ) {
+            fprintf( stderr, "warning:some feature exceed bound, num_feat=%d\n", num_feat );
+            warned = true;
+        }
+    }
+};
+
+/*!
+ * \brief assign consecutive, non-overlapping index ranges to the feature files
+ * \param vec feature files, base is set for each and num_feat is forced to 1 for dense ones
+ * \param base first index to hand out
+ * \return base plus the number of features of all files
+ */
+inline int norm( std::vector<Header> &vec, int base = 0 ){
+    int n = base;
+    for( size_t i = 0; i < vec.size(); i ++ ){
+        if( vec[i].is_dense ) vec[i].num_feat = 1;
+        vec[i].base = n; n += vec[i].num_feat;
+    }
+    return n;
+}
+
+/*! \brief close all feature files */
+inline void vclose( std::vector<Header> &vec ){
+    for( size_t i = 0; i < vec.size(); i ++ ){
+        fclose( vec[i].fi );
+    }
+}
+
+/*!
+ * \brief read the entry count of the next row from every sparse file into tmp_num
+ * \return number of entries of the next row over all files, a dense file counts as one
+ */
+inline int readnum( std::vector<Header> &vec ){
+    int n = 0;
+    for( size_t i = 0; i < vec.size(); i ++ ){
+        if( !vec[i].is_dense ){
+            utils::Assert( fscanf( vec[i].fi, "%d", &vec[i].tmp_num ) == 1, "load num" );
+            n += vec[i].tmp_num;
+        }else{
+            n ++;
+        }
+    }
+    return n;
+}
+
+/*! \brief skip the next row in every feature file */
+inline void vskip( std::vector<Header> &vec ){
+    for( size_t i = 0; i < vec.size(); i ++ ){
+        if( !vec[i].is_dense ){
+            utils::Assert( fscanf( vec[i].fi, "%*d%*[^\n]\n" ) >= 0 );
+        }else{
+            utils::Assert( fscanf( vec[i].fi, "%*f\n" ) >= 0 );
+        }
+    }
+}
+
+/*! \brief reads labels and feature files row by row and merges them into one DMatrix */
+class DataLoader: public DMatrix{
+public:
+    // whether to rescale the feature values of each row to unit L2 norm
+    int rescale;
+    // stop after this many lines of the label file, negative means no limit
+    int linelimit;
+public:
+    // label file, optional white list file and optional group file
+    FILE *fp, *fwlist, *fgroup;
+    std::vector<Header> fheader;
+    DataLoader( void ){
+        rescale = 0;
+        linelimit = -1;
+        fp = NULL; fwlist = NULL; fgroup = NULL;
+    }
+private:
+    // append the next row of every feature file to findex/fvalue, readnum must be called first
+    inline void Load( std::vector<unsigned> &findex, std::vector<float> &fvalue, std::vector<Header> &vec ){
+        unsigned fidx; float fv;
+        for( size_t i = 0; i < vec.size(); i ++ ){
+            if( !vec[i].is_dense ) {
+                for( int j = 0; j < vec[i].tmp_num; j ++ ){
+                    utils::Assert( fscanf ( vec[i].fi, "%u:%f", &fidx, &fv ) == 2, "Error when load feat" );
+                    vec[i].CheckBase( fidx );
+                    fidx += vec[i].base;
+                    findex.push_back( fidx ); fvalue.push_back( fv );
+                }
+            }else{
+                utils::Assert( fscanf ( vec[i].fi, "%f", &fv ) == 1, "load feat" );
+                fidx = vec[i].base;
+                findex.push_back( fidx ); fvalue.push_back( fv );
+            }
+        }
+    }
+    // divide the values by their L2 norm
+    inline void DoRescale( std::vector<float> &vec ){
+        double sum = 0.0;
+        for( size_t i = 0; i < vec.size(); i ++ ){
+            sum += vec[i] * vec[i];
+        }
+        sum = sqrt( sum );
+        // an all-zero (or empty) row cannot be normalized, dividing would turn it into NaN
+        if( sum == 0.0 ) return;
+        for( size_t i = 0; i < vec.size(); i ++ ){
+            vec[i] /= sum;
+        }
+    }
+public:
+    // load all the data; fp and fheader (after norm) must be set, fwlist and fgroup are optional
+    inline void Load( void ){
+        this->data.Clear();
+        this->info.labels.clear();
+        float label;
+
+        const bool has_group = ( fgroup != NULL );
+        // ngleft: lines still to come in the current group, ngacc: kept rows of the current group
+        unsigned ngleft = 0, ngacc = 0;
+        if( has_group ){
+            info.group_ptr.clear();
+            info.group_ptr.push_back(0);
+        }
+
+        while( fscanf( fp, "%f", &label ) == 1 ){
+            if( has_group ){
+                if( ngleft == 0 ){
+                    utils::Assert( fscanf( fgroup, "%u", &ngleft ) == 1, "invalid group file" );
+                    // a zero would wrap around in the decrement below
+                    utils::Assert( ngleft != 0, "group size must be positive" );
+                }
+                ngleft -= 1;
+            }
+
+            unsigned pass = 1;
+            if( fwlist != NULL ){
+                utils::Assert( fscanf( fwlist, "%u", &pass ) == 1, "invalid whitelist file" );
+            }
+
+            if( pass == 0 ){
+                // filtered out: consume the feature lines so that the files stay aligned with the labels
+                vskip( fheader );
+            }else{
+                const int nfeat = readnum( fheader );
+                std::vector<unsigned> findex;
+                std::vector<float> fvalue;
+                // pairs
+                this->Load( findex, fvalue, fheader );
+                utils::Assert( findex.size() == (unsigned)nfeat );
+                if( rescale != 0 ) this->DoRescale( fvalue );
+                // push back data :)
+                this->info.labels.push_back( label );
+                this->data.AddRow( findex, fvalue );
+                ngacc += 1;
+            }
+
+            // group complete: record its end, a group whose rows were all filtered out is dropped
+            if( has_group && ngleft == 0 && ngacc != 0 ){
+                info.group_ptr.push_back( info.group_ptr.back() + ngacc );
+                utils::Assert( info.group_ptr.back() == data.NumRow(), "group size must match num rows" );
+                ngacc = 0;
+            }
+            // linelimit
+            if( linelimit >= 0 ) {
+                if( -- linelimit <= 0 ) break;
+            }
+        }
+        // input ended (or linelimit hit) inside a group: close it so that group_ptr covers all rows
+        if( has_group && ngacc != 0 ){
+            info.group_ptr.push_back( info.group_ptr.back() + ngacc );
+            utils::Assert( info.group_ptr.back() == data.NumRow(), "group size must match num rows" );
+        }
+        this->data.InitData();
+    }
+};
+
+const char *folder = "features";
+
+/*! \brief return the value following option argv[i] and advance i, abort when it is missing */
+inline const char *NextArg( int argc, char *argv[], int &i ){
+    utils::Assert( i + 1 < argc, "option requires an argument" );
+    return argv[ ++i ];
+}
+
+int main( int argc, char *argv[] ){
+    if( argc < 3 ){
+        printf("Usage:xgcombine_buffer <inname> <outname> [options] -f [features] -fd [densefeatures]\n"
+               "options: -rescale -linelimit <linelimit> -fgroup <groupfilename> -wlist <whitelistinstance>\n");
+        return 0;
+    }
+
+    DataLoader loader;
+    time_t start = time( NULL );
+
+    // whether the feature names that follow are dense (-fd) or sparse (-f)
+    bool dense = false;
+    for( int i = 3; i < argc; i ++ ){
+        if( !strcmp( argv[i], "-f") ){
+            dense = false; continue;
+        }
+        if( !strcmp( argv[i], "-fd") ){
+            dense = true; continue;
+        }
+        if( !strcmp( argv[i], "-rescale") ){
+            loader.rescale = 1; continue;
+        }
+        if( !strcmp( argv[i], "-wlist") ){
+            loader.fwlist = utils::FopenCheck( NextArg( argc, argv, i ), "r" ); continue;
+        }
+        if( !strcmp( argv[i], "-fgroup") ){
+            loader.fgroup = utils::FopenCheck( NextArg( argc, argv, i ), "r" ); continue;
+        }
+        if( !strcmp( argv[i], "-linelimit") ){
+            loader.linelimit = atoi( NextArg( argc, argv, i ) ); continue;
+        }
+
+        // everything else names a feature file: <folder>/<inname>.<feature>
+        const std::string name = std::string( folder ) + "/" + argv[1] + "." + argv[i];
+        Header h;
+        h.fi = utils::FopenCheck( name.c_str(), "r" );
+
+        if( dense ){
+            h.is_dense = true; h.num_feat = 1;
+        }else{
+            // a sparse feature file starts with its number of features
+            utils::Assert( fscanf( h.fi, "%d", &h.num_feat ) == 1, "num feat" );
+        }
+        loader.fheader.push_back( h );
+    }
+    loader.fp = utils::FopenCheck( argv[1], "r" );
+
+    printf("num_features=%d\n", norm( loader.fheader ) );
+    printf("start creating buffer...\n");
+    loader.Load();
+    loader.SaveBinary( argv[2] );
+    // close files
+    fclose( loader.fp );
+    if( loader.fwlist != NULL ) fclose( loader.fwlist );
+    if( loader.fgroup != NULL ) fclose( loader.fgroup );
+    vclose( loader.fheader );
+    printf("all generation end, %lu sec used\n", (unsigned long)(time(NULL) - start) );
+    return 0;
+}