xgboost/booster/xgboost_regression_data_reader.h
2014-02-10 23:40:38 +08:00

76 lines
1.7 KiB
C++

#include"xgboost_data.h"
#include<stdio.h>
#include<vector>
using namespace xgboost::booster;
/*!
* \file xgboost_gbmbase.h
* \brief A reader to read the data for regression task from a specified file
* The data should contain each data instance in each line.
* The format of line data is as below:
* label nonzero feature dimension[ feature index:feature value]+
* \author Kailong Chen: chenkl198812@gmail.com
*/
class xgboost_regression_data_reader{
public:
xgboost_regression_data_reader(const char* file_path){
Load(file_path);
}
void Load(const char* file_path){
data_matrix.Clear();
FILE* file = fopen(file_path,"r");
if(file == NULL){
printf("The file is missing at %s",file_path);
return;
}
float label;
int nonzero_dimension,index,value,num_row = 0;
std::vector<bst_uint> findex;
std::vector<bst_float> fvalue;
while(fscanf(file,"%f %i",label,nonzero_dimension)){
findex.clear();
fvalue.clear();
findex.resize(nonzero_dimension);
fvalue.resize(nonzero_dimension);
for(int i = 0; i < nonzero_dimension; i++){
if(!fscanf(file," %i:%f",index,value)){
printf("The feature dimension is not coincident \
with the indicated one");
return;
}
findex.push_back(index);
fvalue.push_back(value);
}
data_matrix.AddRow(findex, fvalue);
labels.push_back(label);
num_row++;
}
printf("%i rows of data is loaded from %s",num_row,file_path);
fclose(file);
}
float GetLabel(int index){
return labels[index];
}
FMatrixS::Line GetLine(int index){
return data_matrix[index];
}
int InsNum(){
return labels.size();
}
FMatrixS::Image GetImage(){
return FMatrixS::Image(data_matrix);
}
private:
FMatrixS data_matrix;
std::vector<float> labels;
};