Fix intermittent Python cache-list problem

This commit is contained in:
tqchen 2014-05-20 15:42:19 -07:00
parent ccde443590
commit 76c44072d1
4 changed files with 27 additions and 11 deletions

View File

@ -112,7 +112,7 @@ namespace xgboost{
private:
bool init_trainer, init_model;
public:
Booster(const std::vector<const regrank::DMatrix *> mats){
Booster(const std::vector<regrank::DMatrix *> mats){
silent = 1;
init_trainer = false;
init_model = false;
@ -223,7 +223,7 @@ extern "C"{
// xgboost implementation
void *XGBoosterCreate( void *dmats[], size_t len ){
std::vector<const xgboost::regrank::DMatrix*> mats;
std::vector<xgboost::regrank::DMatrix*> mats;
for( size_t i = 0; i < len; ++i ){
DMatrix *dtr = static_cast<DMatrix*>(dmats[i]);
dtr->CheckInit();

View File

@ -31,7 +31,7 @@ namespace xgboost{
* \brief a regression booster associated with training and evaluation data
* \param mats array of pointers to matrices whose prediction results need to be cached
*/
RegRankBoostLearner(const std::vector<const DMatrix *>& mats){
RegRankBoostLearner(const std::vector<DMatrix *>& mats){
silent = 0;
obj_ = NULL;
name_obj_ = "reg:linear";
@ -45,7 +45,7 @@ namespace xgboost{
* data matrices to continue training otherwise it will cause error
* \param mats array of pointers to matrices whose prediction results need to be cached
*/
inline void SetCacheData(const std::vector<const DMatrix *>& mats){
inline void SetCacheData(const std::vector<DMatrix *>& mats){
// estimate feature bound
int num_feature = 0;
// assign buffer index
@ -58,7 +58,9 @@ namespace xgboost{
if( mats[i] == mats[j] ) dupilicate = true;
}
if( dupilicate ) continue;
cache_.push_back( CacheEntry( mats[i], buffer_size ) );
// set mats[i]'s cache learner pointer to this
mats[i]->cache_learner_ptr_ = this;
cache_.push_back( CacheEntry( mats[i], buffer_size, mats[i]->Size() ) );
buffer_size += static_cast<unsigned>(mats[i]->Size());
num_feature = std::max(num_feature, (int)(mats[i]->data.NumCol()));
}
@ -342,9 +344,10 @@ namespace xgboost{
private:
struct CacheEntry{
const DMatrix *mat_;
int buffer_offset_;
CacheEntry(const DMatrix *mat, int buffer_offset)
:mat_(mat), buffer_offset_(buffer_offset){}
int buffer_offset_;
size_t num_row_;
CacheEntry(const DMatrix *mat, int buffer_offset, size_t num_row)
:mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row){}
};
/*! \brief the entries indicates that we have internal prediction cache */
std::vector<CacheEntry> cache_;
@ -352,7 +355,14 @@ namespace xgboost{
// find the internal buffer offset for a given matrix; return -1 if it is not in the cache
inline int FindBufferOffset(const DMatrix &mat){
for(size_t i = 0; i < cache_.size(); ++i){
if( cache_[i].mat_ == &mat ) return cache_[i].buffer_offset_;
if( cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this ) {
if( cache_[i].num_row_ == mat.Size() ){
return cache_[i].buffer_offset_;
}else{
fprintf( stderr, "warning: number of rows in input matrix changed as remembered in cachelist, ignore cached results\n" );
fflush( stderr );
}
}
}
return -1;
}

View File

@ -52,9 +52,15 @@ namespace xgboost{
booster::FMatrixS data;
/*! \brief information fields */
Info info;
/*!
* \brief pointer to the learner that has cached this matrix, used to verify cache entries.
* This is a bit ugly, but we need a double check: if one side gets deleted
* and a later re-allocation happens to reuse the same address, we will still be fine.
*/
void *cache_learner_ptr_;
public:
/*! \brief default constructor */
DMatrix(void){}
DMatrix(void):cache_learner_ptr_(NULL){}
/*! \brief get the number of instances */
inline size_t Size() const{
return data.NumRow();

View File

@ -126,7 +126,7 @@ namespace xgboost{
deval.back()->CacheLoad(eval_data_paths[i].c_str(), silent != 0, use_buffer != 0);
devalall.push_back(deval.back());
}
std::vector<const DMatrix *> dcache(1, &data);
std::vector<DMatrix *> dcache(1, &data);
for( size_t i = 0; i < deval.size(); ++ i){
dcache.push_back( deval[i] );
}