diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h index f56a5bc8f..7f5b5f25f 100644 --- a/booster/xgboost_data.h +++ b/booster/xgboost_data.h @@ -78,6 +78,13 @@ namespace xgboost{ inline size_t NumRow( void ) const{ return row_ptr.size() - 1; } + /*! + * \brief get number of nonzero entries + * \return number of nonzero entries + */ + inline size_t NumEntry( void ) const{ + return findex.size(); + } /*! \brief clear the storage */ inline void Clear( void ){ row_ptr.resize( 0 ); @@ -164,6 +171,7 @@ namespace xgboost{ } } }; - }; + }; }; + #endif diff --git a/booster/xgboost_regression_data_reader.h b/booster/xgboost_regression_data_reader.h deleted file mode 100644 index 13c7b0d8d..000000000 --- a/booster/xgboost_regression_data_reader.h +++ /dev/null @@ -1,76 +0,0 @@ -#include"xgboost_data.h" -#include -#include - -using namespace xgboost::booster; -/*! - * \file xgboost_gbmbase.h - * \brief A reader to read the data for regression task from a specified file - * The data should contain each data instance in each line. 
- * The format of line data is as below: - * label nonzero feature dimension[ feature index:feature value]+ - * \author Kailong Chen: chenkl198812@gmail.com - */ - -class xgboost_regression_data_reader{ - -public: - xgboost_regression_data_reader(const char* file_path){ - Load(file_path); - } - - void Load(const char* file_path){ - data_matrix.Clear(); - FILE* file = fopen(file_path,"r"); - if(file == NULL){ - printf("The file is missing at %s",file_path); - return; - } - float label; - int nonzero_dimension,index,value,num_row = 0; - std::vector findex; - std::vector fvalue; - - while(fscanf(file,"%f %i",label,nonzero_dimension)){ - findex.clear(); - fvalue.clear(); - findex.resize(nonzero_dimension); - fvalue.resize(nonzero_dimension); - for(int i = 0; i < nonzero_dimension; i++){ - if(!fscanf(file," %i:%f",index,value)){ - printf("The feature dimension is not coincident \ - with the indicated one"); - return; - } - findex.push_back(index); - fvalue.push_back(value); - } - data_matrix.AddRow(findex, fvalue); - labels.push_back(label); - num_row++; - } - printf("%i rows of data is loaded from %s",num_row,file_path); - fclose(file); - } - - - float GetLabel(int index){ - return labels[index]; - } - - FMatrixS::Line GetLine(int index){ - return data_matrix[index]; - } - - int InsNum(){ - return labels.size(); - } - - FMatrixS::Image GetImage(){ - return FMatrixS::Image(data_matrix); - } - -private: - FMatrixS data_matrix; - std::vector labels; -}; \ No newline at end of file diff --git a/regression/xgboost_regdata.h b/regression/xgboost_regdata.h new file mode 100644 index 000000000..318e1e4bb --- /dev/null +++ b/regression/xgboost_regdata.h @@ -0,0 +1,132 @@ +#ifndef _XGBOOST_REGDATA_H_ +#define _XGBOOST_REGDATA_H_ + +/*! + * \file xgboost_regdata.h + * \brief input data structure for regression and binary classification task. + * Format: + * The data should contain each data instance in each line. 
+ * The format of line data is as below:
+ *      label num_nonzero [feature index:feature value]{num_nonzero}
+ *  i.e. each instance gives its label, then the number of index:value pairs
+ *  that follow, then exactly that many pairs (this is what LoadText parses).
+ * \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
+ */
+#include <cstdio>
+#include <string>
+#include <vector>
+#include "../booster/xgboost_data.h"
+#include "../utils/xgboost_utils.h"
+#include "../utils/xgboost_stream.h"
+
+namespace xgboost{
+    namespace regression{
+        /*!
+         * \brief data matrix for regression content:
+         *  a sparse feature matrix plus one label per row
+         */
+        struct DMatrix{
+        public:
+            /*!
+             * \brief one past the largest feature index stored in data,
+             *  0 when no entry is stored; recomputed by UpdateInfo after every load
+             */
+            unsigned num_feature;
+            /*! \brief feature data content */
+            booster::FMatrixS data;
+            /*! \brief label of each instance, labels[i] belongs to row i of data */
+            // NOTE(review): element type reconstructed as float from the float label
+            // pushed in LoadText and the sizeof(float) used by the binary I/O.
+            std::vector<float> labels;
+        public:
+            /*! \brief default constructor, creates an empty matrix */
+            DMatrix( void ) : num_feature( 0 ){}
+            /*!
+             * \brief load from text file, replacing the current content
+             * \param fname name of text data
+             * \param silent whether print information or not
+             */
+            inline void LoadText( const char* fname, bool silent = false ){
+                data.Clear();
+                // labels must be reset together with data, otherwise a second load
+                // appends to the old labels and they no longer line up with the rows
+                labels.clear();
+                // presumably FopenCheck aborts when the file cannot be opened - TODO confirm
+                FILE* file = utils::FopenCheck( fname, "r" );
+                float label;
+                int nonzero_dimension;
+                // NOTE(review): element types reconstructed from the locals pushed below;
+                // they must match what FMatrixS::AddRow expects - confirm against xgboost_data.h
+                std::vector<unsigned> findex;
+                std::vector<float> fvalue;
+
+                // parsing stops at end of file or at the first line that does not start
+                // with "label count"
+                while( fscanf( file, "%f %d", &label, &nonzero_dimension ) == 2 ){
+                    findex.clear(); fvalue.clear();
+                    for( int i = 0; i < nonzero_dimension; i ++ ){
+                        unsigned index; float value;
+                        // %u matches the unsigned destination; the read is kept outside the
+                        // Assert call so that it does not depend on how Assert evaluates its argument
+                        const int nread = fscanf( file, "%u:%f", &index, &value );
+                        utils::Assert( nread == 2,
+                                       "The feature dimension is not coincident with the indicated one" );
+                        findex.push_back( index ); fvalue.push_back( value );
+                    }
+                    data.AddRow( findex, fvalue );
+                    labels.push_back( label );
+                }
+                fclose( file );
+                this->UpdateInfo();
+                if( !silent ){
+                    printf("%ux%u matrix with %lu entries is loaded from %s\n",
+                           (unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
+                }
+            }
+            /*!
+             * \brief load from binary file, replacing the current content;
+             *  the file layout is the one written by SaveBinary:
+             *  the feature matrix followed by one float label per row
+             * \param fname name of binary data
+             * \param silent whether print information or not
+             * \return whether loading is success, false when the file cannot be opened
+             */
+            inline bool LoadBinary( const char* fname, bool silent = false ){
+                // NOTE(review): fopen64 is not standard C++; presumably supplied by the
+                // platform or by the utils headers - confirm
+                FILE *fp = fopen64( fname, "rb" );
+                if( fp == NULL ) return false;
+                utils::FileStream fs( fp );
+                data.LoadBinary( fs );
+                labels.resize( data.NumRow() );
+                // &labels[0] is only valid for a non-empty vector, and an empty matrix
+                // has no label bytes to read
+                if( !labels.empty() ){
+                    utils::Assert( fs.Read( &labels[0], sizeof(float) * labels.size() ) != 0, "DMatrix LoadBinary" );
+                }
+                fs.Close();
+                this->UpdateInfo();
+                if( !silent ){
+                    printf("%ux%u matrix with %lu entries is loaded from %s\n",
+                           (unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
+                }
+                return true;
+            }
+            /*!
+             * \brief save to binary file: the feature matrix followed by one float label per row
+             * \param fname name of binary data
+             * \param silent whether print information or not
+             */
+            inline void SaveBinary( const char* fname, bool silent = false ){
+                // the label block is sized by the row count, so a shorter labels vector
+                // would be read past its end
+                utils::Assert( labels.size() == data.NumRow(), "DMatrix SaveBinary: number of labels and rows mismatch" );
+                utils::FileStream fs( utils::FopenCheck( fname, "wb" ) );
+                data.SaveBinary( fs );
+                // &labels[0] is only valid for a non-empty vector
+                if( !labels.empty() ){
+                    fs.Write( &labels[0], sizeof(float) * labels.size() );
+                }
+                fs.Close();
+                if( !silent ){
+                    printf("%ux%u matrix with %lu entries is saved to %s\n",
+                           (unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
+                }
+            }
+            /*!
+             * \brief cache load data given a file name, the function will first check if fname + '.buffer' exists,
+             *  if binary buffer exists, it will read from binary buffer, otherwise, it will load from text file,
+             *  and try to create the buffer file
+             * \param fname name of text data; the buffer name is derived from it
+             * \param silent whether print information or not
+             */
+            inline void CacheLoad( const char *fname, bool silent = false ){
+                // std::string instead of a fixed char[1024], so long paths cannot overflow
+                const std::string bname = std::string( fname ) + ".buffer";
+                if( !this->LoadBinary( bname.c_str(), silent ) ){
+                    this->LoadText( fname, silent );
+                    // write the cache to the buffer file, never over the text file itself
+                    this->SaveBinary( bname.c_str(), silent );
+                }
+            }
+        private:
+            /*! \brief update num_feature info: scan every stored entry for the largest feature index */
+            inline void UpdateInfo( void ){
+                this->num_feature = 0;
+                for( size_t i = 0; i < data.NumRow(); i ++ ){
+                    booster::FMatrixS::Line sp = data[i];
+                    for( unsigned j = 0; j < sp.len; j ++ ){
+                        if( num_feature <= sp.findex[j] ){
+                            num_feature = sp.findex[j] + 1;
+                        }
+                    }
+                }
+            }
+        };
+    };
+};
+#endif