From 82ceb4de0af8877403517f5044861cf60fbd6ce8 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sun, 3 Jan 2016 05:16:05 -0800
Subject: [PATCH] [LEARNER] Init learner interface

---
 Makefile                                      |   2 +-
 include/xgboost/gbm.h                         |   4 +-
 include/xgboost/learner.h                     | 155 ++++++++++++++++++
 src/gbm/gblinear.cc                           |  10 +-
 src/gbm/gbtree.cc                             |  17 +-
 .../learner/learner-inl.hpp => src/learner.cc |  10 ++
 6 files changed, 191 insertions(+), 7 deletions(-)
 create mode 100644 include/xgboost/learner.h
 rename old_src/learner/learner-inl.hpp => src/learner.cc (98%)

diff --git a/Makefile b/Makefile
index 5fdf20021..3c88041ce 100644
--- a/Makefile
+++ b/Makefile
@@ -89,7 +89,7 @@ lint:
 clean:
 	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~
 
-clean: clean_all
+clean_all: clean
 	cd $(DMLC_CORE); make clean; cd -
 	cd $(RABIT); make clean; cd -
 
diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h
index de5b94be6..852906ef7 100644
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -118,12 +118,12 @@ class GradientBooster {
                        std::vector<bst_float>* out_preds,
                        unsigned ntree_limit = 0) = 0;
   /*!
-   * \brief dump the model in text format
+   * \brief dump the model to text format
    * \param fmap feature map that may help give interpretations of features
    * \param option extra option of the dump model
    * \return a vector of dumps for the boosters.
    */
-  virtual std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) = 0;
+  virtual std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) const = 0;
   /*!
    * \brief create a gradient booster from given name
    * \param name name of gradient booster
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
new file mode 100644
index 000000000..67035514e
--- /dev/null
+++ b/include/xgboost/learner.h
@@ -0,0 +1,155 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file learner.h
+ * \brief Learner interface that integrates objective, gbm and evaluation together.
+ *  This is the user facing XGBoost training module.
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_LEARNER_H_
+#define XGBOOST_LEARNER_H_
+
+#include <rabit/rabit.h>
+#include <utility>
+#include <string>
+#include <vector>
+#include "./base.h"
+#include "./gbm.h"
+#include "./metric.h"
+#include "./objective.h"
+
+namespace xgboost {
+/*!
+ * \brief Learner class that does training and prediction.
+ *  This is the user facing module of xgboost training.
+ *  The Load/Save function corresponds to the model used in python/R.
+ * \code
+ *
+ *  std::unique_ptr<Learner> learner(Learner::Create(cache_mats));
+ *  learner->Configure(configs);
+ *
+ *  for (int iter = 0; iter < max_iter; ++iter) {
+ *    learner->UpdateOneIter(iter, train_mat);
+ *    LOG(INFO) << learner->EvalOneIter(iter, data_sets, data_names);
+ *  }
+ *
+ * \endcode
+ */
+class Learner : public rabit::Serializable {
+ public:
+  /*!
+   * \brief Set the configuration of gradient boosting.
+   *  User must call Configure once before InitModel and training.
+   *
+   * \param cfg configurations on both training and model parameters.
+   */
+  virtual void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) = 0;
+  /*!
+   * \brief load model from stream
+   * \param fi input stream.
+   */
+  virtual void Load(dmlc::Stream* fi) = 0;
+  /*!
+   * \brief save model to stream.
+   * \param fo output stream
+   */
+  virtual void Save(dmlc::Stream* fo) const = 0;
+  /*!
+   * \brief update the model for one iteration
+   *  with the specified objective function.
+   * \param iter current iteration number
+   * \param train reference to the data matrix.
+   */
+  void UpdateOneIter(int iter, DMatrix* train);
+  /*!
+   * \brief Do customized gradient boosting with in_gpair.
+   *  in_gpair can be mutated after this call.
+   * \param iter current iteration number
+   * \param train reference to the data matrix.
+   * \param in_gpair The input gradient statistics.
+   */
+  void BoostOneIter(int iter,
+                    DMatrix* train,
+                    std::vector<bst_gpair>* in_gpair);
+  /*!
+   * \brief evaluate the model for specific iteration using the configured metrics.
+   * \param iter iteration number
+   * \param data_sets datasets to be evaluated.
+   * \param data_names name of each dataset
+   * \return a string corresponding to the evaluation result
+   */
+  std::string EvalOneIter(int iter,
+                          const std::vector<DMatrix*>& data_sets,
+                          const std::vector<std::string>& data_names);
+  /*!
+   * \brief get prediction given the model.
+   * \param data input data
+   * \param output_margin whether to only predict margin value instead of transformed prediction
+   * \param out_preds output vector that stores the prediction
+   * \param ntree_limit limit number of trees used for boosted tree
+   *  predictor; when it equals 0, all the trees are used
+   * \param pred_leaf whether to only predict the leaf index of each tree in a boosted tree predictor
+   */
+  void Predict(DMatrix* data,
+               bool output_margin,
+               std::vector<bst_float>* out_preds,
+               unsigned ntree_limit = 0,
+               bool pred_leaf = false) const;
+  /*!
+   * \return whether the model allows lazy checkpoint in rabit.
+   */
+  bool AllowLazyCheckPoint() const;
+  /*!
+   * \brief dump the model in text format
+   * \param fmap feature map that may help give interpretations of features
+   * \param option extra option of the dump model
+   * \return a vector of dumps for the boosters.
+   */
+  std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) const;
+  /*!
+   * \brief online prediction function, predict score for one instance at a time.
+   *  NOTE: use the batch prediction interface if possible; batch prediction is
+   *        usually more efficient than online prediction.
+   *  This function is NOT thread-safe; make sure you only call it from one thread.
+   *
+   * \param inst the instance you want to predict
+   * \param output_margin whether to only predict margin value instead of transformed prediction
+   * \param out_preds output vector to hold the predictions
+   * \param ntree_limit limit the number of trees used in prediction
+   */
+  inline void Predict(const SparseBatch::Inst& inst,
+                      bool output_margin,
+                      std::vector<bst_float>* out_preds,
+                      unsigned ntree_limit = 0) const;
+  /*!
+   * \brief Create a new instance of learner.
+   * \param cache_data The matrix to cache the prediction.
+   * \return Created learner.
+   */
+  static Learner* Create(const std::vector<DMatrix*>& cache_data);
+
+ protected:
+  /*! \brief internal base score of the model */
+  bst_float base_score_;
+  /*! \brief objective function */
+  std::unique_ptr<ObjFunction> obj_;
+  /*! \brief The gradient booster used by the model. */
+  std::unique_ptr<GradientBooster> gbm_;
+  /*! \brief The evaluation metrics used to evaluate the model. */
+  std::vector<std::unique_ptr<Metric> > metrics_;
+};
+
+// implementation of inline functions.
+inline void Learner::Predict(const SparseBatch::Inst& inst,
+                             bool output_margin,
+                             std::vector<bst_float>* out_preds,
+                             unsigned ntree_limit) const {
+  gbm_->Predict(inst, out_preds, ntree_limit);
+  if (out_preds->size() == 1) {
+    (*out_preds)[0] += base_score_;
+  }
+  if (!output_margin) {
+    obj_->PredTransform(out_preds);
+  }
+}
+}  // namespace xgboost
+#endif  // XGBOOST_LEARNER_H_
diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc
index 60c675764..d78756516 100644
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -206,7 +206,7 @@ class GBLinear : public GradientBooster {
     LOG(FATAL) << "gblinear does not support predict leaf index";
   }
 
-  std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) override {
+  std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) const override {
     std::stringstream fo("");
     fo << "bias:\n";
     for (int i = 0; i < model.param.num_output_group; ++i) {
@@ -258,13 +258,19 @@ class GBLinear : public GradientBooster {
       fi->Read(&weight);
     }
     // model bias
-    inline float* bias(void) {
+    inline float* bias() {
+      return &weight[param.num_feature * param.num_output_group];
+    }
+    inline const float* bias() const {
       return &weight[param.num_feature * param.num_output_group];
     }
     // get i-th weight
     inline float* operator[](size_t i) {
       return &weight[i * param.num_output_group];
     }
+    inline const float* operator[](size_t i) const {
+      return &weight[i * param.num_output_group];
+    }
   };
   // model field
   Model model;
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 9d0b22293..ef75e5de8 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -113,7 +113,11 @@ class GBTree : public GradientBooster {
     for (const auto& up : updaters) {
       up->Init(cfg);
     }
+    if (tparam.nthread != 0) {
+      omp_set_num_threads(tparam.nthread);
+    }
   }
+
   void LoadModel(dmlc::Stream* fi) override {
     CHECK_EQ(fi->Read(&mparam, sizeof(mparam)), sizeof(mparam))
         << "GBTree: invalid model file";
@@ -130,6 +134,7 @@ class GBTree : public GradientBooster {
     }
     this->ResetPredBuffer(0);
   }
+
   void SaveModel(dmlc::Stream* fo) const override {
     CHECK_EQ(mparam.num_trees, static_cast<int>(trees.size()));
     // do not save the predict buffer.
@@ -143,6 +148,7 @@ class GBTree : public GradientBooster {
       fo->Write(dmlc::BeginPtr(tree_info), sizeof(int) * tree_info.size());
     }
   }
+
   void InitModel() override {
     CHECK(mparam.num_trees == 0 && trees.size() == 0)
         << "Model has already been initialized.";
@@ -151,6 +157,7 @@ class GBTree : public GradientBooster {
     pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
     pred_counter.resize(mparam.PredBufferSize(), 0);
   }
+
   void ResetPredBuffer(size_t num_pbuffer) override {
     mparam.num_pbuffer = static_cast<int64_t>(num_pbuffer);
     pred_buffer.clear();
@@ -158,10 +165,12 @@ class GBTree : public GradientBooster {
     pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
     pred_counter.resize(mparam.PredBufferSize(), 0);
   }
+
   bool AllowLazyCheckPoint() const override {
     return mparam.num_output_group == 1 ||
         tparam.updater_seq.find("distcol") != std::string::npos;
   }
+
   void DoBoost(DMatrix* p_fmat,
                int64_t buffer_offset,
                std::vector<bst_gpair>* in_gpair) override {
@@ -191,6 +200,7 @@ class GBTree : public GradientBooster {
       this->CommitModel(std::move(new_trees[gid]), gid);
     }
   }
+
   void Predict(DMatrix* p_fmat,
                int64_t buffer_offset,
                std::vector<bst_float>* out_preds,
@@ -230,6 +240,7 @@ class GBTree : public GradientBooster {
       }
     }
   }
+
   void Predict(const SparseBatch::Inst& inst,
                std::vector<bst_float>* out_preds,
                unsigned ntree_limit,
@@ -246,9 +257,10 @@ class GBTree : public GradientBooster {
                     ntree_limit);
     }
   }
+
   void PredictLeaf(DMatrix* p_fmat,
                    std::vector<bst_float>* out_preds,
-                   unsigned ntree_limit) {
+                   unsigned ntree_limit) override {
     int nthread;
     #pragma omp parallel
     {
@@ -257,7 +269,8 @@ class GBTree : public GradientBooster {
       nthread = omp_get_num_threads();
     }
     InitThreadTemp(nthread);
     this->PredPath(p_fmat, out_preds, ntree_limit);
   }
-  std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) {
+
+  std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) const override {
     std::vector<std::string> dump;
     for (size_t i = 0; i < trees.size(); i++) {
       dump.push_back(trees[i]->Dump2Text(fmap, option & 1));
diff --git a/old_src/learner/learner-inl.hpp b/src/learner.cc
similarity index 98%
rename from old_src/learner/learner-inl.hpp
rename to src/learner.cc
index 0e8480663..220d6d1dd 100644
--- a/old_src/learner/learner-inl.hpp
+++ b/src/learner.cc
@@ -21,6 +21,16 @@
 namespace xgboost {
 /*! \brief namespace for learning algorithm */
 namespace learner {
+
+inline bool Learner::AllowLazyCheckPoint() const {
+  return gbm_->AllowLazyCheckPoint();
+}
+
+inline std::vector<std::string>
+Learner::Dump2Text(const FeatureMap& fmap, int option) const {
+  return gbm_->Dump2Text(fmap, option);
+}
+
 /*!
  * \brief learner that performs gradient boosting for a specific objective function.
  *  It does training and prediction.
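
Usage note: the \code block in learner.h above compresses the intended calling pattern; below is a minimal sketch of it in compilable form. The parameter keys ("objective", "eta"), the way the DMatrix pointers are obtained, and the dmlc/logging and xgboost/data.h includes are assumptions for illustration; only the Learner calls themselves (Create, Configure, UpdateOneIter, EvalOneIter, Predict) are declared by this patch.

// Sketch of driving the new Learner interface. Config keys and data setup
// are placeholder assumptions, not part of this patch.
#include <dmlc/logging.h>
#include <xgboost/data.h>
#include <xgboost/learner.h>
#include <memory>
#include <string>
#include <utility>
#include <vector>

void TrainAndPredict(xgboost::DMatrix* dtrain, xgboost::DMatrix* dvalid) {
  using xgboost::Learner;
  // Matrices passed to Create get a prediction cache inside the learner.
  std::vector<xgboost::DMatrix*> cache_mats = {dtrain, dvalid};
  std::unique_ptr<Learner> learner(Learner::Create(cache_mats));

  // Configure must be called once before training.
  std::vector<std::pair<std::string, std::string> > cfg;
  cfg.push_back(std::make_pair("objective", "binary:logistic"));  // assumed key
  cfg.push_back(std::make_pair("eta", "0.1"));                    // assumed key
  learner->Configure(cfg);

  // One boosting round per iteration, then evaluate the configured metrics.
  std::vector<xgboost::DMatrix*> eval_mats = {dtrain, dvalid};
  std::vector<std::string> eval_names = {"train", "valid"};
  for (int iter = 0; iter < 10; ++iter) {
    learner->UpdateOneIter(iter, dtrain);
    LOG(INFO) << learner->EvalOneIter(iter, eval_mats, eval_names);
  }

  // Batch prediction: transformed scores (output_margin = false),
  // using all trees (ntree_limit = 0).
  std::vector<xgboost::bst_float> preds;
  learner->Predict(dvalid, false, &preds);
}

BoostOneIter follows the same shape when the caller supplies its own gradient statistics in place of the configured objective.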