add regression data
This commit is contained in:
parent
51a63d80d0
commit
cb0fa75252
@ -78,6 +78,13 @@ namespace xgboost{
|
|||||||
inline size_t NumRow( void ) const{
|
inline size_t NumRow( void ) const{
|
||||||
return row_ptr.size() - 1;
|
return row_ptr.size() - 1;
|
||||||
}
|
}
|
||||||
|
/*!
|
||||||
|
* \brief get number of nonzero entries
|
||||||
|
* \return number of nonzero entries
|
||||||
|
*/
|
||||||
|
inline size_t NumEntry( void ) const{
|
||||||
|
return findex.size();
|
||||||
|
}
|
||||||
/*! \brief clear the storage */
|
/*! \brief clear the storage */
|
||||||
inline void Clear( void ){
|
inline void Clear( void ){
|
||||||
row_ptr.resize( 0 );
|
row_ptr.resize( 0 );
|
||||||
@ -164,6 +171,7 @@ namespace xgboost{
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -1,76 +0,0 @@
|
|||||||
#include"xgboost_data.h"
|
|
||||||
#include<stdio.h>
|
|
||||||
#include<vector>
|
|
||||||
|
|
||||||
using namespace xgboost::booster;
|
|
||||||
/*!
|
|
||||||
* \file xgboost_gbmbase.h
|
|
||||||
* \brief A reader to read the data for regression task from a specified file
|
|
||||||
* The data should contain each data instance in each line.
|
|
||||||
* The format of line data is as below:
|
|
||||||
* label nonzero feature dimension[ feature index:feature value]+
|
|
||||||
* \author Kailong Chen: chenkl198812@gmail.com
|
|
||||||
*/
|
|
||||||
|
|
||||||
class xgboost_regression_data_reader{
|
|
||||||
|
|
||||||
public:
|
|
||||||
xgboost_regression_data_reader(const char* file_path){
|
|
||||||
Load(file_path);
|
|
||||||
}
|
|
||||||
|
|
||||||
void Load(const char* file_path){
|
|
||||||
data_matrix.Clear();
|
|
||||||
FILE* file = fopen(file_path,"r");
|
|
||||||
if(file == NULL){
|
|
||||||
printf("The file is missing at %s",file_path);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
float label;
|
|
||||||
int nonzero_dimension,index,value,num_row = 0;
|
|
||||||
std::vector<bst_uint> findex;
|
|
||||||
std::vector<bst_float> fvalue;
|
|
||||||
|
|
||||||
while(fscanf(file,"%f %i",label,nonzero_dimension)){
|
|
||||||
findex.clear();
|
|
||||||
fvalue.clear();
|
|
||||||
findex.resize(nonzero_dimension);
|
|
||||||
fvalue.resize(nonzero_dimension);
|
|
||||||
for(int i = 0; i < nonzero_dimension; i++){
|
|
||||||
if(!fscanf(file," %i:%f",index,value)){
|
|
||||||
printf("The feature dimension is not coincident \
|
|
||||||
with the indicated one");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
findex.push_back(index);
|
|
||||||
fvalue.push_back(value);
|
|
||||||
}
|
|
||||||
data_matrix.AddRow(findex, fvalue);
|
|
||||||
labels.push_back(label);
|
|
||||||
num_row++;
|
|
||||||
}
|
|
||||||
printf("%i rows of data is loaded from %s",num_row,file_path);
|
|
||||||
fclose(file);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
float GetLabel(int index){
|
|
||||||
return labels[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
FMatrixS::Line GetLine(int index){
|
|
||||||
return data_matrix[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
int InsNum(){
|
|
||||||
return labels.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
FMatrixS::Image GetImage(){
|
|
||||||
return FMatrixS::Image(data_matrix);
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
FMatrixS data_matrix;
|
|
||||||
std::vector<float> labels;
|
|
||||||
};
|
|
||||||
132
regression/xgboost_regdata.h
Normal file
132
regression/xgboost_regdata.h
Normal file
@ -0,0 +1,132 @@
|
|||||||
|
#ifndef _XGBOOST_REGDATA_H_
|
||||||
|
#define _XGBOOST_REGDATA_H_
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \file xgboost_regdata.h
|
||||||
|
* \brief input data structure for regression and binary classification task.
|
||||||
|
* Format:
|
||||||
|
* The data should contain each data instance in each line.
|
||||||
|
* The format of line data is as below:
|
||||||
|
* label <nonzero feature dimension> [feature index:feature value]+
|
||||||
|
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
||||||
|
*/
|
||||||
|
#include <cstdio>
|
||||||
|
#include <vector>
|
||||||
|
#include "../booster/xgboost_data.h"
|
||||||
|
#include "../utils/xgboost_utils.h"
|
||||||
|
#include "../utils/xgboost_stream.h"
|
||||||
|
|
||||||
|
namespace xgboost{
|
||||||
|
namespace regression{
|
||||||
|
/*! \brief data matrix for regression content */
|
||||||
|
struct DMatrix{
|
||||||
|
public:
|
||||||
|
/*! \brief maximum feature dimension */
|
||||||
|
unsigned num_feature;
|
||||||
|
/*! \brief feature data content */
|
||||||
|
booster::FMatrixS data;
|
||||||
|
/*! \brief label of each instance */
|
||||||
|
std::vector<float> labels;
|
||||||
|
public:
|
||||||
|
/*! \brief default constructor */
|
||||||
|
DMatrix( void ){}
|
||||||
|
/*!
|
||||||
|
* \brief load from text file
|
||||||
|
* \param fname name of text data
|
||||||
|
* \param silent whether print information or not
|
||||||
|
*/
|
||||||
|
inline void LoadText( const char* fname, bool silent = false ){
|
||||||
|
data.Clear();
|
||||||
|
FILE* file = utils::FopenCheck( fname, "r" );
|
||||||
|
float label;
|
||||||
|
int nonzero_dimension;
|
||||||
|
std::vector<booster::bst_uint> findex;
|
||||||
|
std::vector<booster::bst_float> fvalue;
|
||||||
|
|
||||||
|
while( fscanf(file,"%f %d",&label,&nonzero_dimension) == 2 ){
|
||||||
|
findex.clear(); fvalue.clear();
|
||||||
|
for( int i = 0; i < nonzero_dimension; i++ ){
|
||||||
|
unsigned index; float value;
|
||||||
|
utils::Assert( fscanf(file, "%d:%f", &index, &value ) == 2,
|
||||||
|
"The feature dimension is not coincident with the indicated one" );
|
||||||
|
findex.push_back(index); fvalue.push_back(value);
|
||||||
|
}
|
||||||
|
data.AddRow( findex, fvalue );
|
||||||
|
labels.push_back( label );
|
||||||
|
}
|
||||||
|
this->UpdateInfo();
|
||||||
|
if( !silent ){
|
||||||
|
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||||
|
(unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
|
||||||
|
}
|
||||||
|
fclose(file);
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief load from binary file
|
||||||
|
* \param fname name of binary data
|
||||||
|
* \param silent whether print information or not
|
||||||
|
* \return whether loading is success
|
||||||
|
*/
|
||||||
|
inline bool LoadBinary( const char* fname, bool silent = false ){
|
||||||
|
FILE *fp = fopen64( fname, "rb" );
|
||||||
|
if( fp == NULL ) return false;
|
||||||
|
utils::FileStream fs( fp );
|
||||||
|
data.LoadBinary( fs );
|
||||||
|
labels.resize( data.NumRow() );
|
||||||
|
utils::Assert( fs.Read( &labels[0], sizeof(float) * data.NumRow() ) != 0, "DMatrix LoadBinary" );
|
||||||
|
fs.Close();
|
||||||
|
this->UpdateInfo();
|
||||||
|
if( !silent ){
|
||||||
|
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||||
|
(unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief save to binary file
|
||||||
|
* \param fname name of binary data
|
||||||
|
* \param silent whether print information or not
|
||||||
|
*/
|
||||||
|
inline void SaveBinary( const char* fname, bool silent = false ){
|
||||||
|
utils::FileStream fs( utils::FopenCheck( fname, "wb" ) );
|
||||||
|
data.SaveBinary( fs );
|
||||||
|
fs.Write( &labels[0], sizeof(float) * data.NumRow() );
|
||||||
|
fs.Close();
|
||||||
|
if( !silent ){
|
||||||
|
printf("%ux%u matrix with %lu entries is saved to %s\n",
|
||||||
|
(unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief cache load data given a file name, the function will first check if fname + '.xgbuffer' exists,
|
||||||
|
* if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
|
||||||
|
* and try to create a buffer file
|
||||||
|
* \param fname name of binary data
|
||||||
|
* \param silent whether print information or not
|
||||||
|
* \return whether loading is success
|
||||||
|
*/
|
||||||
|
inline void CacheLoad( const char *fname, bool silent = false ){
|
||||||
|
char bname[ 1024 ];
|
||||||
|
sprintf( bname, "%s.buffer", fname );
|
||||||
|
if( !this->LoadBinary( bname, silent ) ){
|
||||||
|
this->LoadText( fname, silent );
|
||||||
|
this->SaveBinary( fname, silent );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
/*! \brief update num_feature info */
|
||||||
|
inline void UpdateInfo( void ){
|
||||||
|
this->num_feature = 0;
|
||||||
|
for( size_t i = 0; i < data.NumRow(); i ++ ){
|
||||||
|
booster::FMatrixS::Line sp = data[i];
|
||||||
|
for( unsigned j = 0; j < sp.len; j ++ ){
|
||||||
|
if( num_feature <= sp.findex[j] ){
|
||||||
|
num_feature = sp.findex[j] + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
#endif
|
||||||
Loading…
x
Reference in New Issue
Block a user