// xgboost/learner/learner-inl.hpp

#ifndef XGBOOST_LEARNER_LEARNER_INL_HPP_
#define XGBOOST_LEARNER_LEARNER_INL_HPP_
/*!
 * \file learner-inl.hpp
 * \brief learning algorithm
 * \author Tianqi Chen
 */
#include <algorithm>
#include <vector>
#include <utility>
#include <string>
#include "./objective.h"
#include "./evaluation.h"
#include "../gbm/gbm.h"

namespace xgboost {
/*! \brief namespace for learning algorithm */
namespace learner {
/*!
 * \brief learner that performs gradient boosting on a specific objective function
 *  and handles training, evaluation and prediction
 * \tparam FMatrix type of the feature matrix held by DMatrix
 */
template<typename FMatrix>
class BoostLearner {
 public:
  /*! \brief constructor, defaults to objective "reg:linear" and booster "gbtree" */
  BoostLearner(void) {
    obj_ = NULL;
    gbm_ = NULL;
    // silent is read by SetCacheData even if the "silent" parameter
    // was never set, so it must start from a defined value
    silent = 0;
    name_obj_ = "reg:linear";
    name_gbm_ = "gbtree";
  }
  /*! \brief destructor, releases the objective function and the booster */
  ~BoostLearner(void) {
    // deleting a NULL pointer is a no-op, so no guard is needed
    delete obj_;
    delete gbm_;
  }
  /*!
   * \brief add internal cache space for mat, this can speedup prediction for matrix,
   *        please cache prediction for training and eval data
   *    warning: if the model is loaded from file from some previous training history
   *             set cache data must be called with exactly SAME
   *             data matrices to continue training otherwise it will cause error
   *    This function can only be called once per learner.
   * \param mats array of pointers to matrix whose prediction result need to be cached;
   *             a pointer that appears more than once is only registered once
   */
  inline void SetCacheData(const std::vector<DMatrix<FMatrix>*>& mats) {
    // estimate feature bound
    unsigned num_feature = 0;
    // running total of rows, also the buffer offset of the next matrix
    size_t buffer_size = 0;
    utils::Assert(cache_.size() == 0, "can only call cache data once");
    for (size_t i = 0; i < mats.size(); ++i) {
      bool duplicate = false;
      for (size_t j = 0; j < i; ++j) {
        if (mats[i] == mats[j]) {
          duplicate = true;
          break;
        }
      }
      if (duplicate) continue;
      // set mats[i]'s cache learner pointer to this
      mats[i]->cache_learner_ptr_ = this;
      cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->num_row));
      buffer_size += mats[i]->num_row;
      num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->num_col));
    }
    char str_temp[25];
    // only ever grow the recorded number of features
    if (num_feature > mparam.num_feature) {
      snprintf(str_temp, sizeof(str_temp), "%u", num_feature);
      this->SetParam("bst:num_feature", str_temp);
    }
    // size_t has no portable pre-C99 printf specifier: cast explicitly so the
    // argument type always matches "%lu"
    snprintf(str_temp, sizeof(str_temp), "%lu",
             static_cast<unsigned long>(buffer_size));  // NOLINT(runtime/int)
    this->SetParam("num_pbuffer", str_temp);
    if (!silent) {
      printf("buffer_size=%lu\n",
             static_cast<unsigned long>(buffer_size));  // NOLINT(runtime/int)
    }
  }
  /*!
   * \brief set parameters from outside
   *  The pair is always recorded in cfg_ so that it can be replayed to the
   *  objective and the booster when they are created; the objective name,
   *  booster name and model parameters are only updated while the booster
   *  has not been created yet.
   * \param name name of the parameter
   * \param val  value of the parameter
   */
  inline void SetParam(const char *name, const char *val) {
    if (!strcmp(name, "silent")) silent = atoi(val);
    if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
    if (gbm_ == NULL) {
      if (!strcmp(name, "objective")) name_obj_ = val;
      if (!strcmp(name, "booster")) name_gbm_ = val;
      mparam.SetParam(name, val);
    }
    cfg_.push_back(std::make_pair(std::string(name), std::string(val)));
  }
  /*!
   * \brief initialize the model
   */
  inline void InitModel(void) {
    this->InitObjGBM();
    // adapt the base score
    mparam.base_score = obj_->ProbToMargin(mparam.base_score);
    gbm_->InitModel();
  }
  /*!
   * \brief load model from stream
   *  Any previously created objective and booster are destroyed and
   *  re-created from the names stored in the stream.
   * \param fi input stream
   */
  inline void LoadModel(utils::IStream &fi) {
    utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
                 "BoostLearner: wrong model format");
    utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
    utils::Check(fi.Read(&name_gbm_), "BoostLearner: wrong model format");
    // delete existing obj/gbm if any; the pointers must be reset to NULL,
    // otherwise InitObjGBM returns early and the freed objects get reused
    delete obj_;
    obj_ = NULL;
    delete gbm_;
    gbm_ = NULL;
    this->InitObjGBM();
    gbm_->LoadModel(fi);
  }
  /*!
   * \brief load model from file
   * \param fname file name
   */
  inline void LoadModel(const char *fname) {
    utils::FileStream fi(utils::FopenCheck(fname, "rb"));
    this->LoadModel(fi);
    fi.Close();
  }
  /*!
   * \brief save model into stream: model parameter, objective name,
   *  booster name, followed by the booster's own content
   * \param fo output stream
   */
  inline void SaveModel(utils::IStream &fo) const {
    fo.Write(&mparam, sizeof(ModelParam));
    fo.Write(&name_obj_);
    fo.Write(&name_gbm_);
    gbm_->SaveModel(fo);
  }
  /*!
   * \brief save model into file
   * \param fname file name
   */
  inline void SaveModel(const char *fname) const {
    utils::FileStream fo(utils::FopenCheck(fname, "wb"));
    this->SaveModel(fo);
    fo.Close();
  }
  /*!
   * \brief update the model for one iteration
   * \param iter current iteration number
   * \param p_train pointer to the data matrix
   */
  inline void UpdateOneIter(int iter, DMatrix<FMatrix> *p_train) {
    // PredictRaw takes (matrix, output pointer)
    this->PredictRaw(*p_train, &preds_);
    obj_->GetGradient(preds_, p_train->info, iter, &gpair_);
    gbm_->DoBoost(gpair_, p_train->fmat, p_train->info.root_index);
  }
  /*!
   * \brief evaluate the model for specific iteration
   * \param iter iteration number
   * \param evals data matrices to evaluate
   * \param evname name of each dataset, must have at least one entry per matrix in evals
   * \return a string corresponding to the evaluation result, starting with "[iter]"
   */
  inline std::string EvalOneIter(int iter,
                                 const std::vector<const DMatrix<FMatrix>*> &evals,
                                 const std::vector<std::string> &evname) {
    utils::Assert(evname.size() >= evals.size(),
                  "EvalOneIter: every evaluation matrix needs a name");
    std::string res;
    char tmp[256];
    snprintf(tmp, sizeof(tmp), "[%d]", iter);
    res = tmp;
    for (size_t i = 0; i < evals.size(); ++i) {
      this->PredictRaw(*evals[i], &preds_);
      obj_->EvalTransform(&preds_);
      res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info);
    }
    return res;
  }
  /*!
   * \brief simple evaluation function, with a specified metric
   * \param data input data
   * \param metric name of metric, "auto" selects the default metric of the objective
   * \return a pair of <evaluation name, result>
   */
  std::pair<std::string, float> Evaluate(const DMatrix<FMatrix> &data, std::string metric) {
    if (metric == "auto") metric = obj_->DefaultEvalMetric();
    IEvaluator *ev = CreateEvaluator(metric.c_str());
    this->PredictRaw(data, &preds_);
    obj_->EvalTransform(&preds_);
    float res = ev->Eval(preds_, data.info);
    delete ev;
    return std::make_pair(metric, res);
  }
  /*!
   * \brief get prediction
   * \param data input data
   * \param out_preds output vector that stores the prediction
   */
  inline void Predict(const DMatrix<FMatrix> &data,
                      std::vector<float> *out_preds) const {
    this->PredictRaw(data, out_preds);
    obj_->PredTransform(out_preds);
  }

 protected:
  /*!
   * \brief initialize the objective function and GBM,
   * if not yet done; all recorded configurations are replayed to both
   */
  inline void InitObjGBM(void) {
    if (obj_ != NULL) return;
    utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
    obj_ = CreateObjFunction(name_obj_.c_str());
    gbm_ = gbm::CreateGradBooster<FMatrix>(name_gbm_.c_str());
    for (size_t i = 0; i < cfg_.size(); ++i) {
      obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
      gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
    }
    evaluator_.AddEval(obj_->DefaultEvalMetric());
  }
  /*!
   * \brief get un-transformed prediction
   *  declared const so that it can be used from the const Predict
   * \param data data matrix to predict on
   * \param out_preds output vector that stores the prediction
   */
  inline void PredictRaw(const DMatrix<FMatrix> &data,
                         std::vector<float> *out_preds) const {
    gbm_->Predict(data.fmat, this->FindBufferOffset(data),
                  data.info, out_preds);
  }

  /*!
   * \brief model parameter; saved and loaded as a raw block of
   *  sizeof(ModelParam) bytes, so the field layout is part of the model format
   */
  struct ModelParam{
    /*! \brief global bias */
    float base_score;
    /*! \brief number of features */
    unsigned num_feature;
    /*! \brief number of class, if it is multi-class classification */
    int num_class;
    /*! \brief reserved field */
    int reserved[32];
    /*! \brief constructor */
    ModelParam(void) {
      base_score = 0.5f;
      num_feature = 0;
      num_class = 0;
      memset(reserved, 0, sizeof(reserved));
    }
    /*!
     * \brief set parameters from outside
     * \param name name of the parameter
     * \param val value of the parameter
     */
    inline void SetParam(const char *name, const char *val) {
      if (!strcmp("base_score", name)) base_score = static_cast<float>(atof(val));
      if (!strcmp("num_class", name)) num_class = atoi(val);
      if (!strcmp("bst:num_feature", name)) num_feature = static_cast<unsigned>(atoi(val));
    }
  };
  // data fields
  // silent during training
  int silent;
  // evaluation set
  EvalSet evaluator_;
  // model parameter
  ModelParam   mparam;
  // gbm model that backs everything
  gbm::IGradBooster<FMatrix> *gbm_;
  // name of gbm model used for training
  std::string name_gbm_;
  // objective function
  IObjFunction *obj_;
  // name of objective function
  std::string name_obj_;
  // configurations
  std::vector< std::pair<std::string, std::string> > cfg_;
  // temporary storage for prediction
  std::vector<float> preds_;
  // gradient pairs
  std::vector<bst_gpair> gpair_;

 private:
  // cache entry object that helps handle feature caching
  struct CacheEntry {
    // matrix this entry was registered for
    const DMatrix<FMatrix> *mat_;
    // offset of the matrix in the prediction buffer
    size_t buffer_offset_;
    // number of rows of the matrix at registration time
    size_t num_row_;
    CacheEntry(const DMatrix<FMatrix> *mat, size_t buffer_offset, size_t num_row)
        :mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
  };
  // find internal buffer offset for certain matrix, if not exist, return -1
  // a matrix only matches when it is the registered object, it still points
  // back to this learner and its number of rows is unchanged
  inline int64_t FindBufferOffset(const DMatrix<FMatrix> &mat) const {
    for (size_t i = 0; i < cache_.size(); ++i) {
      if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
        if (cache_[i].num_row_ == mat.num_row) {
          return static_cast<int64_t>(cache_[i].buffer_offset_);
        }
      }
    }
    return -1;
  }
  // data structure field
  /*! \brief the entries indicates that we have internal prediction cache */
  std::vector<CacheEntry> cache_;
};
}  // namespace learner
}  // namespace xgboost
#endif  // XGBOOST_LEARNER_LEARNER_INL_HPP_