reupdate data

This commit is contained in:
tqchen
2014-03-04 22:47:39 -08:00
parent 1479adba58
commit 0fdda29470
3 changed files with 239 additions and 137 deletions

View File

@@ -60,13 +60,14 @@ namespace xgboost{
}
char str_temp[25];
if( num_feature > base_model.param.num_feature ){
if( num_feature > mparam.num_feature ){
mparam.num_feature = num_feature;
sprintf( str_temp, "%d", num_feature );
base_model.SetParam( "bst:num_feature", str_temp );
base_gbm.SetParam( "bst:num_feature", str_temp );
}
sprintf( str_temp, "%u", buffer_size );
base_model.SetParam( "num_pbuffer", str_temp );
base_gbm.SetParam( "num_pbuffer", str_temp );
if( !silent ){
printf( "buffer_size=%u\n", buffer_size );
}
@@ -81,16 +82,16 @@ namespace xgboost{
*/
inline void SetParam( const char *name, const char *val ){
// "silent" is parsed with atoi into the silent flag
if( !strcmp( name, "silent") ) silent = atoi( val );
// "eval_metric" registers val with the evaluator
// NOTE(review): this line appears twice in this view; if both copies are live the metric is added twice - confirm
if( !strcmp( name, "eval_metric") ) evaluator_.AddEval( val );
if( !strcmp( name, "eval_metric") ) evaluator_.AddEval( val );
// every name/value pair is additionally forwarded, unfiltered, to the model parameters and to the booster
mparam.SetParam( name, val );
// NOTE(review): both the base_model and the base_gbm spelling of this call are present here - confirm which one is current
base_model.SetParam( name, val );
base_gbm.SetParam( name, val );
}
/*!
* \brief initialize solver before training, called before training
* this function is reserved for solver to allocate necessary space and do other preparation
*/
inline void InitTrainer( void ){
base_model.InitTrainer();
base_gbm.InitTrainer();
if( mparam.loss_type == kLogisticClassify ){
evaluator_.AddEval( "error" );
}else{
@@ -102,7 +103,7 @@ namespace xgboost{
* \brief initialize the current data storage for model, if the model is used first time, call this function
*/
inline void InitModel( void ){
// initialize the booster first
// NOTE(review): both the base_model and the base_gbm spelling of this call are present here - confirm which one is current
base_model.InitModel();
base_gbm.InitModel();
// then let the learner-level parameters adjust themselves (see ModelParam::AdjustBase)
mparam.AdjustBase();
}
/*!
@@ -110,7 +111,7 @@ namespace xgboost{
* \param fi input stream
*/
inline void LoadModel( utils::IStream &fi ){
// the booster loads itself from the stream first
// NOTE(review): both the base_model and the base_gbm spelling of this call are present here - confirm which one is current
base_model.LoadModel( fi );
base_gbm.LoadModel( fi );
// afterwards the learner's ModelParam is read as sizeof(ModelParam) raw bytes;
// a Read result of 0 trips the assertion
utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 );
}
/*!
@@ -120,7 +121,7 @@ namespace xgboost{
* \param with_stats whether print statistics as well
*/
inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
// pure forwarding: all three arguments are handed to the booster unchanged
// NOTE(review): both the base_model and the base_gbm spelling of this call are present here - confirm which one is current
base_model.DumpModel( fo, fmap, with_stats );
base_gbm.DumpModel( fo, fmap, with_stats );
}
/*!
* \brief Dump path of all trees
@@ -128,14 +129,14 @@ namespace xgboost{
* \param data input data
*/
inline void DumpPath( FILE *fo, const DMatrix &data ){
// forwards to the booster, passing only the data member of the DMatrix
// NOTE(review): both the base_model and the base_gbm spelling of this call are present here - confirm which one is current
base_model.DumpPath( fo, data.data );
base_gbm.DumpPath( fo, data.data );
}
/*!
* \brief save model to stream
* \param fo output stream
*/
inline void SaveModel( utils::IStream &fo ) const{
// the booster writes itself to the stream first
// NOTE(review): both the base_model and the base_gbm spelling of this call are present here - confirm which one is current
base_model.SaveModel( fo );
base_gbm.SaveModel( fo );
// then the learner's ModelParam is written as sizeof(ModelParam) raw bytes
fo.Write( &mparam, sizeof(ModelParam) );
}
/*!
@@ -146,7 +147,7 @@ namespace xgboost{
this->PredictBuffer( preds_, *train_, 0 );
this->GetGradient( preds_, train_->labels, grad_, hess_ );
std::vector<unsigned> root_index;
base_model.DoBoost( grad_, hess_, train_->data, root_index );
base_gbm.DoBoost( grad_, hess_, train_->data, root_index );
}
/*!
* \brief evaluate the model for specific iteration
@@ -165,7 +166,6 @@ namespace xgboost{
}
fprintf( fo,"\n" );
}
/*! \brief get prediction, without buffering */
inline void Predict( std::vector<float> &preds, const DMatrix &data ){
preds.resize( data.Size() );
@@ -174,7 +174,51 @@ namespace xgboost{
#pragma omp parallel for schedule( static )
for( unsigned j = 0; j < ndata; ++ j ){
preds[j] = mparam.PredTransform
( mparam.base_score + base_model.Predict( data.data, j, -1 ) );
( mparam.base_score + base_gbm.Predict( data.data, j, -1 ) );
}
}
public:
/*!
 * \brief run one interactive update step
 *  The step has three phases:
 *   1. InteractPredict on the training data and on every evaluation set
 *   2. compute gradient statistics from the training predictions and boost once
 *   3. InteractRePredict on the training data and on every evaluation set
 *  Buffer offsets are laid out consecutively: the training data starts at 0 and
 *  each evaluation set starts where the previous data set ended.
 */
inline void UpdateInteract( void ){
    // phase 1: predictions before boosting
    this->InteractPredict( preds_, *train_, 0 );
    // kept unsigned to match the buffer_offset parameter of the Interact* helpers
    unsigned buffer_offset = static_cast<unsigned>( train_->Size() );
    for( size_t i = 0; i < evals_.size(); ++i ){
        std::vector<float> &preds = this->eval_preds_[ i ];
        this->InteractPredict( preds, *evals_[i], buffer_offset );
        buffer_offset += static_cast<unsigned>( evals_[i]->Size() );
    }

    // phase 2: only the training predictions feed the gradient computation
    this->GetGradient( preds_, train_->labels, grad_, hess_ );
    std::vector<unsigned> root_index;
    base_gbm.DoBoost( grad_, hess_, train_->data, root_index );

    // phase 3: walk the same offset layout again for the re-prediction pass
    this->InteractRePredict( *train_, 0 );
    buffer_offset = static_cast<unsigned>( train_->Size() );
    for( size_t i = 0; i < evals_.size(); ++i ){
        this->InteractRePredict( *evals_[i], buffer_offset );
        buffer_offset += static_cast<unsigned>( evals_[i]->Size() );
    }
}
private:
/*!
 * \brief fill preds with the transformed interactive prediction of every row in data
 * \param preds output vector, resized to data.Size()
 * \param data input data
 * \param buffer_offset value added to the row index to form the third argument
 *        handed to the booster's InteractPredict
 */
inline void InteractPredict( std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset ){
    const unsigned num_rows = static_cast<unsigned>( data.Size() );
    preds.resize( data.Size() );
    #pragma omp parallel for schedule( static )
    for( unsigned row = 0; row < num_rows; ++row ){
        // raw booster output is shifted by base_score before the transform is applied
        preds[ row ] = mparam.PredTransform(
            mparam.base_score + base_gbm.InteractPredict( data.data, row, buffer_offset + row ) );
    }
}
/*!
 * \brief call the booster's InteractRePredict once for every row in data
 * \param data input data
 * \param buffer_offset value added to the row index to form the third argument
 *        handed to the booster
 */
inline void InteractRePredict( const DMatrix &data, unsigned buffer_offset ){
    const unsigned num_rows = static_cast<unsigned>( data.Size() );
    #pragma omp parallel for schedule( static )
    for( unsigned row = 0; row < num_rows; ++row ){
        base_gbm.InteractRePredict( data.data, row, buffer_offset + row );
    }
}
private:
@@ -186,7 +230,7 @@ namespace xgboost{
#pragma omp parallel for schedule( static )
for( unsigned j = 0; j < ndata; ++ j ){
preds[j] = mparam.PredTransform
( mparam.base_score + base_model.Predict( data.data, j, buffer_offset + j ) );
( mparam.base_score + base_gbm.Predict( data.data, j, buffer_offset + j ) );
}
}
@@ -218,9 +262,16 @@ namespace xgboost{
float base_score;
/* \brief type of loss function */
int loss_type;
/* \brief number of features */
int num_feature;
/*! \brief reserved field */
int reserved[ 16 ];
/*! \brief constructor */
ModelParam( void ){
base_score = 0.5f;
loss_type = 0;
num_feature = 0;
memset( reserved, 0, sizeof( reserved ) );
}
/*!
* \brief set parameters from outside
@@ -230,6 +281,7 @@ namespace xgboost{
inline void SetParam( const char *name, const char *val ){
    // each recognised name overwrites exactly one field; unknown names are ignored
    if( strcmp( name, "base_score" ) == 0 ){
        base_score = static_cast<float>( atof( val ) );
    }
    if( strcmp( name, "loss_type" ) == 0 ){
        loss_type = atoi( val );
    }
    if( strcmp( name, "bst:num_feature" ) == 0 ){
        num_feature = atoi( val );
    }
}
/*!
* \brief adjust base_score
@@ -330,7 +382,7 @@ namespace xgboost{
private:
int silent;
EvalSet evaluator_;
booster::GBMBaseModel base_model;
booster::GBMBase base_gbm;
ModelParam mparam;
const DMatrix *train_;
std::vector<DMatrix *> evals_;

View File

@@ -39,6 +39,10 @@ namespace xgboost{
this->TaskDump();
return 0;
}
if( task == "interactive" ){
this->TaskInteractive();
return 0;
}
if( task == "dumppath" ){
this->TaskDumpPath();
return 0;
@@ -60,6 +64,7 @@ namespace xgboost{
if( !strcmp("data", name ) ) train_path = val;
if( !strcmp("test:data", name ) ) test_path = val;
if( !strcmp("model_in", name ) ) model_in = val;
if( !strcmp("model_out", name ) ) model_out = val;
if( !strcmp("model_dir", name ) ) model_dir_path = val;
if( !strcmp("fmap", name ) ) name_fmap = val;
if( !strcmp("name_dump", name ) ) name_dump = val;
@@ -141,13 +146,30 @@ namespace xgboost{
}
// always save final round
if( save_period == 0 || num_round % save_period != 0 ){
this->SaveModel( num_round );
if( model_out == "NULL" ){
this->SaveModel( num_round );
}else{
this->SaveModel( model_out.c_str() );
}
}
if( !silent ){
printf("\nupdating end, %lu sec in all\n", elapsed );
}
}
/*!
 * \brief run a single interactive update on the learner and save the result to model_out
 *  model_out must be set to something other than "NULL", otherwise the assertion fails
 */
inline void TaskInteractive( void ){
    // validate the configuration up front, so a missing model_out is reported
    // before the update is run instead of after its result can no longer be saved
    utils::Assert( model_out != "NULL", "interactive mode must specify model_out" );
    const time_t start = time( NULL );
    learner.UpdateInteract();
    this->SaveModel( model_out.c_str() );
    // elapsed time covers both the update and the save
    const unsigned long elapsed = (unsigned long)(time(NULL) - start);
    if( !silent ){
        printf("\ninteractive update, %lu sec in all\n", elapsed );
    }
}
inline void TaskDump( void ){
FILE *fo = utils::FopenCheck( name_dump.c_str(), "w" );
learner.DumpModel( fo, fmap, dump_model_stats != 0 );
@@ -158,13 +180,16 @@ namespace xgboost{
learner.DumpPath( fo, data );
fclose( fo );
}
inline void SaveModel( int i ) const{
char fname[256];
sprintf( fname ,"%s/%04d.model", model_dir_path.c_str(), i+1 );
inline void SaveModel( const char *fname ) const{
utils::FileStream fo( utils::FopenCheck( fname, "wb" ) );
learner.SaveModel( fo );
fo.Close();
}
/*!
 * \brief save the learner to a numbered file inside model_dir_path
 * \param i round counter; the file is named "<model_dir_path>/<i+1>.model",
 *        with the number zero padded to at least 4 digits
 */
inline void SaveModel( int i ) const{
    // only the bounded numeric part goes through sprintf; the directory path has
    // no length limit, so it is joined via std::string instead of being written
    // into a fixed-size buffer where a long path would overflow
    char round_tag[32];
    sprintf( round_tag, "/%04d.model", i+1 );
    const std::string fname = model_dir_path + round_tag;
    this->SaveModel( fname.c_str() );
}
inline void TaskPred( void ){
std::vector<float> preds;
if( !silent ) printf("start prediction...\n");
@@ -189,6 +214,8 @@ namespace xgboost{
std::string train_path, test_path;
/* \brief the path of test model file, or file to restart training */
std::string model_in;
/* \brief the path of final model file, to be saved */
std::string model_out;
/* \brief the path of directory containing the saved models */
std::string model_dir_path;
/* \brief task to perform */