/*!
* Copyright 2014 by Contributors
* \file learner.cc
* \brief Implementation of learning algorithm.
* \author Tianqi Chen
*/
#include <xgboost/logging.h>
#include <xgboost/learner.h>
#include <dmlc/io.h>
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <limits>
#include <map>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "./common/io.h"
#include "./common/common.h"
#include "./common/random.h"
namespace xgboost {
// implementation of base learner.
bool Learner::AllowLazyCheckPoint() const {
return gbm_->AllowLazyCheckPoint();
}
std::vector<std::string>
Learner::Dump2Text(const FeatureMap& fmap, int option) const {
return gbm_->Dump2Text(fmap, option);
}
/*! \brief training parameter for regression */
struct LearnerModelParam
: public dmlc::Parameter<LearnerModelParam> {
/*! \brief global bias */
float base_score;
/*! \brief number of features */
unsigned num_feature;
/*! \brief number of classes; nonzero for multi-class classification */
int num_class;
/*! \brief whether the model contains additional attributes */
int contain_extra_attrs;
/*! \brief reserved field */
int reserved[30];
/*! \brief constructor */
LearnerModelParam() {
std::memset(this, 0, sizeof(LearnerModelParam));
base_score = 0.5f;
}
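// NOTE: zeroing via std::memset is only valid while this struct stays
// trivially copyable; that property also matters because Save() writes the
// struct to disk verbatim with fo->Write(&mparam, sizeof(LearnerModelParam)).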
// declare parameters
DMLC_DECLARE_PARAMETER(LearnerModelParam) {
DMLC_DECLARE_FIELD(base_score).set_default(0.5f)
.describe("Global bias of the model.");
DMLC_DECLARE_FIELD(num_feature).set_default(0)
.describe("Number of features in the training data;"\
" this parameter is detected automatically by the learner.");
DMLC_DECLARE_FIELD(num_class).set_default(0).set_lower_bound(0)
.describe("Number of classes for multi-class classification. "\
" The default of 0 corresponds to a binary classifier.");
}
};
struct LearnerTrainParam
: public dmlc::Parameter<LearnerTrainParam> {
// stored random seed
int seed;
// whether seed the PRNG each iteration
bool seed_per_iteration;
// data split mode, can be row, col, or none.
int dsplit;
// tree construction method
int tree_method;
// internal test flag
std::string test_flag;
// maximum fraction of rows to buffer for column access
float prob_buffer_row;
// maximum number of rows per batch
size_t max_row_perbatch;
// declare parameters
DMLC_DECLARE_PARAMETER(LearnerTrainParam) {
DMLC_DECLARE_FIELD(seed).set_default(0)
.describe("Random number seed during training.");
DMLC_DECLARE_FIELD(seed_per_iteration).set_default(false)
.describe("Seed PRNG determnisticly via iterator number, "\
"this option will be switched on automatically on distributed mode.");
DMLC_DECLARE_FIELD(dsplit).set_default(0)
.add_enum("auto", 0)
.add_enum("col", 1)
.add_enum("row", 2)
.describe("Data split mode for distributed trainig. ");
DMLC_DECLARE_FIELD(tree_method).set_default(0)
.add_enum("auto", 0)
.add_enum("approx", 1)
.add_enum("exact", 2)
.describe("Choice of tree construction method.");
DMLC_DECLARE_FIELD(test_flag).set_default("")
.describe("Internal test flag");
DMLC_DECLARE_FIELD(prob_buffer_row).set_default(1.0f).set_range(0.0f, 1.0f)
.describe("Maximum buffered row portion");
DMLC_DECLARE_FIELD(max_row_perbatch).set_default(std::numeric_limits<size_t>::max())
.describe("maximum row per batch.");
}
};
DMLC_REGISTER_PARAMETER(LearnerModelParam);
DMLC_REGISTER_PARAMETER(LearnerTrainParam);
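// Illustrative use (hypothetical values): the fields declared above are
// filled from the key/value pairs handed to Learner::Configure, e.g.
//   learner->Configure({{"seed", "42"}, {"tree_method", "approx"}});
// unknown keys are tolerated by InitAllowUnknown and forwarded via cfg_.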
/*!
* \brief learner that performs gradient boosting for a specific objective function.
* It does training and prediction.
*/
class LearnerImpl : public Learner {
public:
explicit LearnerImpl(const std::vector<DMatrix*>& cache_mats)
noexcept(false) {
// set up the prediction cache entries in the constructor.
CHECK_EQ(cache_.size(), 0);
size_t buffer_size = 0;
for (auto it = cache_mats.begin(); it != cache_mats.end(); ++it) {
// avoid duplication.
if (std::find(cache_mats.begin(), it, *it) != it) continue;
DMatrix* pmat = *it;
pmat->cache_learner_ptr_ = this;
cache_.push_back(CacheEntry(pmat, buffer_size, pmat->info().num_row));
buffer_size += pmat->info().num_row;
}
pred_buffer_size_ = buffer_size;
// boosted tree
name_obj_ = "reg:linear";
name_gbm_ = "gbtree";
}
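// The cache entries partition one contiguous prediction buffer: with two
// cached matrices of, say, 100 and 50 rows, their offsets are 0 and 100 and
// pred_buffer_size_ ends up as 150 (illustrative numbers).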
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
// add to configurations
tparam.InitAllowUnknown(args);
cfg_.clear();
for (const auto& kv : args) {
if (kv.first == "eval_metric") {
// check duplication
auto dup_check = [&kv](const std::unique_ptr<Metric>&m) {
return m->Name() != kv.second;
};
if (std::all_of(metrics_.begin(), metrics_.end(), dup_check)) {
metrics_.emplace_back(Metric::Create(kv.second));
}
} else {
cfg_[kv.first] = kv.second;
}
}
// add additional parameters;
// these are constraints that need to be satisfied
if (tparam.dsplit == 0 && rabit::IsDistributed()) {
tparam.dsplit = 2;
}
if (cfg_.count("num_class") != 0) {
cfg_["num_output_group"] = cfg_["num_class"];
if (atoi(cfg_["num_class"].c_str()) > 1 && cfg_.count("objective") == 0) {
cfg_["objective"] = "multi:softmax";
}
}
if (cfg_.count("max_delta_step") == 0 &&
cfg_.count("objective") != 0 &&
cfg_["objective"] == "count:poisson") {
cfg_["max_delta_step"] = "0.7";
}
if (cfg_.count("updater") == 0) {
if (tparam.dsplit == 1) {
cfg_["updater"] = "distcol";
} else if (tparam.dsplit == 2) {
cfg_["updater"] = "grow_histmaker,prune";
}
if (tparam.prob_buffer_row != 1.0f) {
cfg_["updater"] = "grow_histmaker,refresh,prune";
}
}
if (cfg_.count("objective") == 0) {
cfg_["objective"] = "reg:linear";
}
if (cfg_.count("booster") == 0) {
cfg_["booster"] = "gbtree";
}
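// e.g. Configure({{"num_class", "3"}}) yields objective="multi:softmax" and
// num_output_group="3" through the defaulting rules above (illustrative).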
if (!this->ModelInitialized()) {
mparam.InitAllowUnknown(args);
name_obj_ = cfg_["objective"];
name_gbm_ = cfg_["booster"];
// set seed only before the model is initialized
common::GlobalRandom().seed(tparam.seed);
}
// set number of features correctly.
cfg_["num_feature"] = common::ToString(mparam.num_feature);
cfg_["num_class"] = common::ToString(mparam.num_class);
if (gbm_.get() != nullptr) {
gbm_->Configure(cfg_.begin(), cfg_.end());
}
if (obj_.get() != nullptr) {
obj_->Configure(cfg_.begin(), cfg_.end());
}
}
void InitModel() override {
this->LazyInitModel();
}
void Load(dmlc::Stream* fi) override {
// TODO(tqchen) mark deprecation of old format.
common::PeekableInStream fp(fi);
// backward compatible header check.
std::string header;
header.resize(4);
if (fp.PeekRead(&header[0], 4) == 4) {
CHECK_NE(header, "bs64")
<< "Base64 format is no longer supported in brick.";
if (header == "binf") {
CHECK_EQ(fp.Read(&header[0], 4), 4);
}
}
// use the peekable reader.
fi = &fp;
// read parameter
CHECK_EQ(fi->Read(&mparam, sizeof(mparam)), sizeof(mparam))
<< "BoostLearner: wrong model format";
{
// backward compatibility code for loading the old model format;
// for the new format, Read(&name_obj_) suffices
uint64_t len;
CHECK_EQ(fi->Read(&len, sizeof(len)), sizeof(len));
if (len >= std::numeric_limits<unsigned>::max()) {
int gap;
CHECK_EQ(fi->Read(&gap, sizeof(gap)), sizeof(gap))
<< "BoostLearner: wrong model format";
len = len >> static_cast<uint64_t>(32UL);
}
if (len != 0) {
name_obj_.resize(len);
CHECK_EQ(fi->Read(&name_obj_[0], len), len)
<<"BoostLearner: wrong model format";
}
}
CHECK(fi->Read(&name_gbm_))
<< "BoostLearner: wrong model format";
// duplicated code with LazyInitModel
obj_.reset(ObjFunction::Create(name_obj_));
gbm_.reset(GradientBooster::Create(name_gbm_));
gbm_->Load(fi);
if (mparam.contain_extra_attrs != 0) {
std::vector<std::pair<std::string, std::string> > attr;
fi->Read(&attr);
attributes_ = std::map<std::string, std::string>(
attr.begin(), attr.end());
}
if (metrics_.size() == 0) {
metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric()));
}
this->base_score_ = mparam.base_score;
gbm_->ResetPredBuffer(pred_buffer_size_);
cfg_["num_class"] = common::ToString(mparam.num_class);
cfg_["num_feature"] = common::ToString(mparam.num_feature);
obj_->Configure(cfg_.begin(), cfg_.end());
}
// save the model (also used for rabit checkpoints);
// the write order mirrors the read order in Load
void Save(dmlc::Stream *fo) const override {
fo->Write(&mparam, sizeof(LearnerModelParam));
fo->Write(name_obj_);
fo->Write(name_gbm_);
gbm_->Save(fo);
if (mparam.contain_extra_attrs != 0) {
std::vector<std::pair<std::string, std::string> > attr(
attributes_.begin(), attributes_.end());
fo->Write(attr);
}
}
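// One boosting iteration: compute current raw predictions on the training
// matrix, let the objective turn them into gradient/hessian pairs, then have
// the booster fit new ensemble members against those pairs.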
void UpdateOneIter(int iter, DMatrix* train) override {
CHECK(ModelInitialized())
<< "Always call InitModel or LoadModel before update";
if (tparam.seed_per_iteration || rabit::IsDistributed()) {
common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter);
}
this->LazyInitDMatrix(train);
this->PredictRaw(train, &preds_);
obj_->GetGradient(preds_, train->info(), iter, &gpair_);
gbm_->DoBoost(train, this->FindBufferOffset(train), &gpair_);
}
void BoostOneIter(int iter,
DMatrix* train,
std::vector<bst_gpair>* in_gpair) override {
if (tparam.seed_per_iteration || rabit::IsDistributed()) {
common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter);
}
this->LazyInitDMatrix(train);
gbm_->DoBoost(train, this->FindBufferOffset(train), in_gpair);
}
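// Evaluate every metric on every data set and format a monitoring string,
// e.g. (illustrative numbers): "[3]\ttrain-rmse:0.2537\ttest-rmse:0.3421".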
std::string EvalOneIter(int iter,
const std::vector<DMatrix*>& data_sets,
const std::vector<std::string>& data_names) override {
std::ostringstream os;
os << '[' << iter << ']'
<< std::setiosflags(std::ios::fixed);
for (size_t i = 0; i < data_sets.size(); ++i) {
this->PredictRaw(data_sets[i], &preds_);
obj_->EvalTransform(&preds_);
for (auto& ev : metrics_) {
os << '\t' << data_names[i] << '-' << ev->Name() << ':'
<< ev->Eval(preds_, data_sets[i]->info(), tparam.dsplit == 2);
}
}
return os.str();
}
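// Attributes are free-form metadata that round-trip through Save/Load;
// callers might use them to record, for example, the best iteration found
// by early stopping (an illustrative use, not something enforced here).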
void SetAttr(const std::string& key, const std::string& value) override {
attributes_[key] = value;
mparam.contain_extra_attrs = 1;
}
bool GetAttr(const std::string& key, std::string* out) const override {
auto it = attributes_.find(key);
if (it == attributes_.end()) return false;
*out = it->second;
return true;
}
std::pair<std::string, float> Evaluate(DMatrix* data, std::string metric) {
if (metric == "auto") metric = obj_->DefaultEvalMetric();
std::unique_ptr<Metric> ev(Metric::Create(metric.c_str()));
this->PredictRaw(data, &preds_);
obj_->EvalTransform(&preds_);
return std::make_pair(metric, ev->Eval(preds_, data->info(), tparam.dsplit == 2));
}
void Predict(DMatrix* data,
bool output_margin,
std::vector<float> *out_preds,
unsigned ntree_limit,
bool pred_leaf) const override {
if (pred_leaf) {
gbm_->PredictLeaf(data, out_preds, ntree_limit);
} else {
this->PredictRaw(data, out_preds, ntree_limit);
if (!output_margin) {
obj_->PredTransform(out_preds);
}
}
}
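// Illustrative call (hypothetical objects): raw margin scores from all trees:
//   std::vector<float> out;
//   learner->Predict(dmat, /*output_margin=*/true, &out,
//                    /*ntree_limit=*/0, /*pred_leaf=*/false);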
protected:
// check if p_train is ready to be used for training;
// if not, initialize the column access
inline void LazyInitDMatrix(DMatrix *p_train) {
if (!p_train->HaveColAccess()) {
int ncol = static_cast<int>(p_train->info().num_col);
std::vector<bool> enabled(ncol, true);
// cap the number of rows per batch at a safe value when using the
// approximate method, in distributed row-split mode, or on very large
// data; otherwise keep the user-supplied setting
size_t max_row_perbatch = tparam.max_row_perbatch;
const size_t safe_max_row = static_cast<size_t>(32UL << 10UL);
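// 32UL << 10UL == 32768 rows per batch; the automatic switch below triggers
// once the training matrix holds at least 4UL << 20UL == 4194304 rows.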
if (tparam.tree_method == 0 &&
p_train->info().num_row >= (4UL << 20UL)) {
LOG(CONSOLE) << "Tree method is automatically selected to be \'approx\'"
<< " for faster speed."
<< " to use old behavior(exact greedy algorithm on single machine),"
<< " set tree_method to \'exact\'";
max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
}
if (tparam.tree_method == 1) {
LOG(CONSOLE) << "Tree method is selected to be \'approx\'";
max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
}
if (tparam.test_flag == "block" || tparam.dsplit == 2) {
max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
}
// initialize column access
p_train->InitColAccess(enabled,
tparam.prob_buffer_row,
max_row_perbatch);
}
if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) {
if (tparam.tree_method == 2) {
LOG(CONSOLE) << "tree method is set to be 'exact',"
<< " but currently we are only able to proceed with approximate algorithm";
}
cfg_["updater"] = "grow_histmaker,prune";
if (gbm_.get() != nullptr) {
gbm_->Configure(cfg_.begin(), cfg_.end());
}
}
}
// return whether model is already initialized.
inline bool ModelInitialized() const {
return gbm_.get() != nullptr;
}
// lazily initialize the model if it has not yet been initialized.
inline void LazyInitModel() {
if (this->ModelInitialized()) return;
// estimate feature bound
unsigned num_feature = 0;
for (size_t i = 0; i < cache_.size(); ++i) {
num_feature = std::max(num_feature,
static_cast<unsigned>(cache_[i].mat_->info().num_col));
}
// run allreduce on num_feature to find the maximum value
rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
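// workers may see different column counts (e.g. under column-wise data
// splits), so take the maximum across all of them.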
if (num_feature > mparam.num_feature) {
mparam.num_feature = num_feature;
}
// setup
cfg_["num_feature"] = common::ToString(mparam.num_feature);
CHECK(obj_.get() == nullptr && gbm_.get() == nullptr);
obj_.reset(ObjFunction::Create(name_obj_));
gbm_.reset(GradientBooster::Create(name_gbm_));
gbm_->Configure(cfg_.begin(), cfg_.end());
obj_->Configure(cfg_.begin(), cfg_.end());
// reset the base score: the user supplies it in probability space, so
// convert it once into the objective's margin space
mparam.base_score = obj_->ProbToMargin(mparam.base_score);
if (metrics_.size() == 0) {
metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric()));
}
this->base_score_ = mparam.base_score;
gbm_->ResetPredBuffer(pred_buffer_size_);
}
/*!
* \brief get un-transformed prediction
* \param data training data matrix
* \param out_preds output vector that stores the prediction
* \param ntree_limit limit on the number of trees used by the boosted tree
* predictor; when it equals 0, all trees are used
*/
inline void PredictRaw(DMatrix* data,
std::vector<float>* out_preds,
unsigned ntree_limit = 0) const {
CHECK(gbm_.get() != nullptr)
<< "Predict must happen after Load or InitModel";
gbm_->Predict(data,
this->FindBufferOffset(data),
out_preds,
ntree_limit);
// add the per-row base margin if provided, otherwise the global base score
std::vector<float>& preds = *out_preds;
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
const std::vector<bst_float>& base_margin = data->info().base_margin;
if (base_margin.size() != 0) {
CHECK_EQ(preds.size(), base_margin.size())
<< "base_margin.size does not match with prediction size";
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
preds[j] += base_margin[j];
}
} else {
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
preds[j] += this->base_score_;
}
}
}
// cached size of predict buffer
size_t pred_buffer_size_;
// model parameter
LearnerModelParam mparam;
// training parameter
LearnerTrainParam tparam;
// configurations
std::map<std::string, std::string> cfg_;
// attributes
std::map<std::string, std::string> attributes_;
// name of gbm
std::string name_gbm_;
// name of the objective function
std::string name_obj_;
// temporary storage for predictions
std::vector<float> preds_;
// gradient pairs
std::vector<bst_gpair> gpair_;
private:
/*! \brief magic multiplier used to derive per-iteration random seeds */
static const int kRandSeedMagic = 127;
// cache entry object that helps handle prediction caching
struct CacheEntry {
const DMatrix* mat_;
size_t buffer_offset_;
size_t num_row_;
CacheEntry(const DMatrix* mat, size_t buffer_offset, size_t num_row)
:mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
};
// find the internal buffer offset for a given matrix; return -1 if it is not cached
inline int64_t FindBufferOffset(const DMatrix* mat) const {
for (size_t i = 0; i < cache_.size(); ++i) {
if (cache_[i].mat_ == mat && mat->cache_learner_ptr_ == this) {
if (cache_[i].num_row_ == mat->info().num_row) {
return static_cast<int64_t>(cache_[i].buffer_offset_);
}
}
}
return -1;
}
/*! \brief entries for the matrices that have an internal prediction cache */
std::vector<CacheEntry> cache_;
};
Learner* Learner::Create(const std::vector<DMatrix*>& cache_data) {
return new LearnerImpl(cache_data);
}
} // namespace xgboost