[CORE] Refactor cache mechanism (#1540)

Author: Tianqi Chen
Date: 2016-09-02 20:39:07 -07:00 (committed by GitHub)
Parent: 6dabdd33e3
Commit: ecec5f7959
9 changed files with 320 additions and 421 deletions

@@ -118,20 +118,8 @@ DMLC_REGISTER_PARAMETER(LearnerTrainParam);
  */
 class LearnerImpl : public Learner {
  public:
-  explicit LearnerImpl(const std::vector<DMatrix*>& cache_mats)
-      noexcept(false) {
-    // setup the cache setting in constructor.
-    CHECK_EQ(cache_.size(), 0);
-    size_t buffer_size = 0;
-    for (auto it = cache_mats.begin(); it != cache_mats.end(); ++it) {
-      // avoid duplication.
-      if (std::find(cache_mats.begin(), it, *it) != it) continue;
-      DMatrix* pmat = *it;
-      pmat->cache_learner_ptr_ = this;
-      cache_.push_back(CacheEntry(pmat, buffer_size, pmat->info().num_row));
-      buffer_size += pmat->info().num_row;
-    }
-    pred_buffer_size_ = buffer_size;
+  explicit LearnerImpl(const std::vector<std::shared_ptr<DMatrix> >& cache)
+      : cache_(cache) {
     // boosted tree
     name_obj_ = "reg:linear";
     name_gbm_ = "gbtree";
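The constructor change is the heart of the refactor: instead of assigning each cached DMatrix a row offset into one flat prediction buffer (and tagging the matrix with a `cache_learner_ptr_` back-pointer), the learner now just retains shared ownership of the cached matrices and leaves prediction caching to the booster layer. A minimal sketch of the idea, assuming a pointer-keyed cache; the names here are illustrative, not the actual xgboost internals:

```cpp
#include <memory>
#include <unordered_map>
#include <vector>

struct DMatrix {};  // stand-in for xgboost's DMatrix

// Hypothetical per-booster prediction cache: entries are keyed by the
// matrix address and hold shared ownership, so no global offset table
// (and no back-pointer on the matrix) is needed.
class PredictionCacheSketch {
 public:
  explicit PredictionCacheSketch(
      const std::vector<std::shared_ptr<DMatrix> >& cache) {
    for (const auto& m : cache) {
      if (m != nullptr) entries_[m.get()].data = m;
    }
  }
  // Returns the cached prediction vector for `m`, or nullptr when `m`
  // was never registered (the old offset scheme returned -1 instead).
  std::vector<float>* Lookup(const DMatrix* m) {
    auto it = entries_.find(m);
    return it == entries_.end() ? nullptr : &it->second.preds;
  }

 private:
  struct Entry {
    std::shared_ptr<DMatrix> data;  // keeps the matrix alive
    std::vector<float> preds;       // sized lazily on first use
  };
  std::unordered_map<const DMatrix*, Entry> entries_;
};
```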
@@ -257,7 +245,7 @@ class LearnerImpl : public Learner {
         << "BoostLearner: wrong model format";
     // duplicated code with LazyInitModel
     obj_.reset(ObjFunction::Create(name_obj_));
-    gbm_.reset(GradientBooster::Create(name_gbm_));
+    gbm_.reset(GradientBooster::Create(name_gbm_, cache_, mparam.base_score));
     gbm_->Load(fi);
     if (mparam.contain_extra_attrs != 0) {
       std::vector<std::pair<std::string, std::string> > attr;
@@ -265,8 +253,6 @@ class LearnerImpl : public Learner {
       attributes_ = std::map<std::string, std::string>(
           attr.begin(), attr.end());
     }
-    this->base_score_ = mparam.base_score;
-    gbm_->ResetPredBuffer(pred_buffer_size_);
     cfg_["num_class"] = common::ToString(mparam.num_class);
     cfg_["num_feature"] = common::ToString(mparam.num_feature);
     obj_->Configure(cfg_.begin(), cfg_.end());
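Taken together, the two `Load()` hunks move prediction-buffer setup out of the learner: `GradientBooster::Create` now receives the cached matrices and the stored base score up front, so a freshly deserialized booster can initialize its own prediction state, and the learner-side `base_score_` copy and `ResetPredBuffer` call become unnecessary. Roughly, the widened factory contract looks like the sketch below (illustrative names, not xgboost's registry code):

```cpp
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct DMatrix {};  // stand-in

class BoosterSketch {
 public:
  BoosterSketch(std::vector<std::shared_ptr<DMatrix> > cache, float base_margin)
      : cache_(std::move(cache)), base_margin_(base_margin) {}

 private:
  std::vector<std::shared_ptr<DMatrix> > cache_;  // shared with the learner
  float base_margin_;  // margin-space base score added to raw predictions
};

// The factory forwards everything the booster needs at construction time;
// a real implementation would dispatch on `name` ("gbtree", "gblinear", ...).
BoosterSketch* CreateBooster(const std::string& name,
                             const std::vector<std::shared_ptr<DMatrix> >& cache,
                             float base_score) {
  (void)name;  // dispatch elided in this sketch
  return new BoosterSketch(cache, base_score);
}
```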
@@ -294,7 +280,7 @@ class LearnerImpl : public Learner {
     this->LazyInitDMatrix(train);
     this->PredictRaw(train, &preds_);
     obj_->GetGradient(preds_, train->info(), iter, &gpair_);
-    gbm_->DoBoost(train, this->FindBufferOffset(train), &gpair_);
+    gbm_->DoBoost(train, &gpair_, obj_.get());
   }

   void BoostOneIter(int iter,
@@ -304,7 +290,7 @@ class LearnerImpl : public Learner {
       common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter);
     }
     this->LazyInitDMatrix(train);
-    gbm_->DoBoost(train, this->FindBufferOffset(train), in_gpair);
+    gbm_->DoBoost(train, in_gpair);
   }

   std::string EvalOneIter(int iter,
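Both boosting paths drop the `FindBufferOffset` argument: a booster that owns its cache can locate the entry for `train` from the matrix pointer alone. The `UpdateOneIter` path additionally hands the objective to the booster, while `BoostOneIter` calls with two arguments, which suggests the objective parameter is defaulted. A sketch of that inferred call shape (an assumption, not the confirmed xgboost signature):

```cpp
#include <vector>

struct DMatrix {};
struct GradientPair { float grad = 0.0f, hess = 0.0f; };
class ObjFunctionSketch {};  // stand-in for xgboost's ObjFunction

class BoosterSketch {
 public:
  // Old shape: DoBoost(DMatrix*, int64_t buffer_offset, std::vector<...>*).
  // New shape: the matrix pointer itself identifies any cache entry, and
  // the objective travels with the call, defaulted here as a guess at why
  // the two call sites differ in arity.
  void DoBoost(DMatrix* train, std::vector<GradientPair>* gpair,
               ObjFunctionSketch* obj = nullptr) {
    // a real booster would grow one round of trees from *gpair here, then
    // refresh the prediction cache entry keyed on `train`, if one exists
    (void)train; (void)gpair; (void)obj;
  }
};
```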
@@ -435,28 +421,24 @@ class LearnerImpl : public Learner {
     // estimate feature bound
     unsigned num_feature = 0;
     for (size_t i = 0; i < cache_.size(); ++i) {
+      CHECK(cache_[i] != nullptr);
       num_feature = std::max(num_feature,
-                             static_cast<unsigned>(cache_[i].mat_->info().num_col));
+                             static_cast<unsigned>(cache_[i]->info().num_col));
     }
     // run allreduce on num_feature to find the maximum value
     rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
     if (num_feature > mparam.num_feature) {
       mparam.num_feature = num_feature;
     }
     // setup
     cfg_["num_feature"] = common::ToString(mparam.num_feature);
     CHECK(obj_.get() == nullptr && gbm_.get() == nullptr);
     obj_.reset(ObjFunction::Create(name_obj_));
-    gbm_.reset(GradientBooster::Create(name_gbm_));
-    gbm_->Configure(cfg_.begin(), cfg_.end());
     obj_->Configure(cfg_.begin(), cfg_.end());
     // reset the base score
     mparam.base_score = obj_->ProbToMargin(mparam.base_score);
-    this->base_score_ = mparam.base_score;
-    gbm_->ResetPredBuffer(pred_buffer_size_);
+    gbm_.reset(GradientBooster::Create(name_gbm_, cache_, mparam.base_score));
+    gbm_->Configure(cfg_.begin(), cfg_.end());
   }

   /*!
    * \brief get un-transformed prediction
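The reordering in `LazyInitModel` is load-bearing: the booster is now constructed with the margin-space base score, so the objective must be configured and `ProbToMargin` applied before `GradientBooster::Create` runs. For a logistic objective, for example, the prob-to-margin transform is the logit, which maps the default base_score of 0.5 to a raw margin of 0. A tiny worked check of that transform (a sketch, not xgboost's actual ObjFunction code):

```cpp
#include <cmath>
#include <cstdio>

// the logit transform a logistic objective applies to its base score
float ProbToMarginLogit(float p) {
  return -std::log(1.0f / p - 1.0f);
}

int main() {
  // base_score 0.5 in probability space is margin 0 in raw-score space,
  // which is the value the booster bakes into every raw prediction
  std::printf("%g\n", ProbToMarginLogit(0.5f));  // prints 0
}
```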
@@ -471,29 +453,9 @@ class LearnerImpl : public Learner {
     CHECK(gbm_.get() != nullptr)
         << "Predict must happen after Load or InitModel";
     gbm_->Predict(data,
-                  this->FindBufferOffset(data),
                   out_preds,
                   ntree_limit);
-    // add base margin
-    std::vector<float>& preds = *out_preds;
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
-    const std::vector<bst_float>& base_margin = data->info().base_margin;
-    if (base_margin.size() != 0) {
-      CHECK_EQ(preds.size(), base_margin.size())
-          << "base_margin.size does not match with prediction size";
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint j = 0; j < ndata; ++j) {
-        preds[j] += base_margin[j];
-      }
-    } else {
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint j = 0; j < ndata; ++j) {
-        preds[j] += this->base_score_;
-      }
-    }
   }
-  // cached size of predict buffer
-  size_t pred_buffer_size_;
   // model parameter
   LearnerModelParam mparam;
   // training parameter
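The deleted block is not lost behavior: since the booster now holds both the cache and the base score, the base-margin addition can happen where predictions are produced. A hedged sketch of that relocated step, assuming the same precedence rule as the removed learner code (a per-row `base_margin` supplied with the DMatrix overrides the global base score); `AddBaseMargin` is a hypothetical helper, not an xgboost function:

```cpp
#include <cassert>
#include <vector>

void AddBaseMargin(const std::vector<float>& base_margin,  // may be empty
                   float base_score,
                   std::vector<float>* out_preds) {
  std::vector<float>& preds = *out_preds;
  const long ndata = static_cast<long>(preds.size());
  if (!base_margin.empty()) {
    // per-instance margins win, exactly as in the old learner loop,
    // which CHECK_EQ'd the two sizes before adding
    assert(base_margin.size() == preds.size());
    #pragma omp parallel for schedule(static)
    for (long j = 0; j < ndata; ++j) {
      preds[j] += base_margin[j];
    }
  } else {
    #pragma omp parallel for schedule(static)
    for (long j = 0; j < ndata; ++j) {
      preds[j] += base_score;
    }
  }
}
```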
@@ -514,31 +476,11 @@ class LearnerImpl : public Learner {
  private:
   /*! \brief random number transformation seed. */
   static const int kRandSeedMagic = 127;
-  // cache entry object that helps handle feature caching
-  struct CacheEntry {
-    const DMatrix* mat_;
-    size_t buffer_offset_;
-    size_t num_row_;
-    CacheEntry(const DMatrix* mat, size_t buffer_offset, size_t num_row)
-        :mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
-  };
-  // find internal buffer offset for certain matrix, if not exist, return -1
-  inline int64_t FindBufferOffset(const DMatrix* mat) const {
-    for (size_t i = 0; i < cache_.size(); ++i) {
-      if (cache_[i].mat_ == mat && mat->cache_learner_ptr_ == this) {
-        if (cache_[i].num_row_ == mat->info().num_row) {
-          return static_cast<int64_t>(cache_[i].buffer_offset_);
-        }
-      }
-    }
-    return -1;
-  }
-  /*! \brief the entries indicates that we have internal prediction cache */
-  std::vector<CacheEntry> cache_;
+  // internal cached dmatrix
+  std::vector<std::shared_ptr<DMatrix> > cache_;
 };

-Learner* Learner::Create(const std::vector<DMatrix*>& cache_data) {
+Learner* Learner::Create(
+    const std::vector<std::shared_ptr<DMatrix> >& cache_data) {
   return new LearnerImpl(cache_data);
 }
 } // namespace xgboost
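Net effect at the API boundary: `Learner::Create` and its callers now share ownership of the training data instead of trading raw `DMatrix*` pointers, and the `CacheEntry`/`FindBufferOffset` machinery, together with the `cache_learner_ptr_` back-pointer protocol it relied on, disappears. A self-contained sketch of the new lifetime contract (the real `Learner::Create` returns a raw pointer; the class here is a stand-in):

```cpp
#include <memory>
#include <vector>

struct DMatrix {};  // stand-in

class LearnerSketch {
 public:
  explicit LearnerSketch(std::vector<std::shared_ptr<DMatrix> > cache)
      : cache_(std::move(cache)) {}

 private:
  std::vector<std::shared_ptr<DMatrix> > cache_;
};

int main() {
  auto dtrain = std::make_shared<DMatrix>();
  LearnerSketch learner({dtrain});
  // the caller may drop its handle; the learner's shared_ptr keeps the
  // cached matrix alive, with no dangling raw pointer to invalidate
  dtrain.reset();
}
```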