[LEARNER] Init learner interface

2016-01-03 05:16:05 -08:00
parent 084f5f4715
commit 82ceb4de0a
6 changed files with 191 additions and 7 deletions
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -0,0 +1,155 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file learner.h
+ * \brief Learner interface that integrates objective, gbm and evaluation together.
+ *  This is the user facing XGBoost training module.
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_LEARNER_H_
+#define XGBOOST_LEARNER_H_
+
+#include <rabit.h>
+#include <memory>
+#include <utility>
+#include <string>
+#include <vector>
+#include "./base.h"
+#include "./gbm.h"
+#include "./metric.h"
+#include "./objective.h"
+
+namespace xgboost {
+/*!
+ * \brief Learner class that does training and prediction.
+ *  This is the user facing module of xgboost training.
+ *  The Load/Save function corresponds to the model used in python/R.
+ *  \code
+ *
+ *  std::unique_ptr<Learner> learner(Learner::Create(cache_mats));
+ *  learner->Configure(configs);
+ *
+ *  for (int iter = 0; iter < max_iter; ++iter) {
+ *    learner->UpdateOneIter(iter, train_mat);
+ *    LOG(INFO) << learner->EvalOneIter(iter, data_sets, data_names);
+ *  }
+ *
+ *  \endcode
+ */
+class Learner : public rabit::Serializable {
+ public:
+  /*!
+   * \brief virtual destructor.
+   *  Learner is an abstract base that is created through Create() and used
+   *  through a base pointer, so destruction must dispatch to the derived type.
+   */
+  virtual ~Learner() {}
+  /*!
+   * \brief Set the configuration of gradient boosting.
+   *  User must call configure once before InitModel and Training.
+   *
+   * \param cfg configurations on both training and model parameters,
+   *  given as a list of (name, value) string pairs.
+   */
+  virtual void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) = 0;
+  /*!
+   * \brief load model from stream
+   * \param fi input stream.
+   */
+  virtual void Load(dmlc::Stream* fi) = 0;
+  /*!
+   * \brief save model to stream.
+   * \param fo output stream
+   */
+  virtual void Save(dmlc::Stream* fo) const = 0;
+  /*!
+   * \brief update the model for one iteration
+   *  With the specified objective function.
+   * \param iter current iteration number
+   * \param train reference to the data matrix.
+   */
+  void UpdateOneIter(int iter, DMatrix* train);
+  /*!
+   * \brief Do customized gradient boosting with in_gpair.
+   *  in_gpair can be mutated after this call.
+   * \param iter current iteration number
+   * \param train reference to the data matrix.
+   * \param in_gpair The input gradient statistics.
+   */
+  void BoostOneIter(int iter,
+                    DMatrix* train,
+                    std::vector<bst_gpair>* in_gpair);
+  /*!
+   * \brief evaluate the model for specific iteration using the configured metrics.
+   * \param iter iteration number
+   * \param data_sets datasets to be evaluated.
+   * \param data_names name of each dataset
+   * \return a string corresponding to the evaluation result
+   */
+  std::string EvalOneIter(int iter,
+                          const std::vector<DMatrix*>& data_sets,
+                          const std::vector<std::string>& data_names);
+  /*!
+   * \brief get prediction given the model.
+   * \param data input data
+   * \param output_margin whether to only predict margin value instead of transformed prediction
+   * \param out_preds output vector that stores the prediction
+   * \param ntree_limit limit number of trees used for boosted tree
+   *   predictor, when it equals 0, this means we are using all the trees
+   * \param pred_leaf whether to only predict the leaf index of each tree in a boosted tree predictor
+   */
+  void Predict(DMatrix* data,
+               bool output_margin,
+               std::vector<float> *out_preds,
+               unsigned ntree_limit = 0,
+               bool pred_leaf = false) const;
+  /*!
+   * \return whether the model allows lazy checkpoint in rabit.
+   */
+  bool AllowLazyCheckPoint() const;
+  /*!
+   * \brief dump the model in text format
+   * \param fmap feature map that may help give interpretations of feature
+   * \param option extra option of the dump model
+   * \return a vector of dump for boosters.
+   */
+  std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) const;
+  /*!
+   * \brief online prediction function, predict score for one instance at a time
+   *  NOTE: use the batch prediction interface if possible, batch prediction is usually
+   *        more efficient than online prediction
+   *        This function is NOT threadsafe, make sure you only call from one thread.
+   *
+   * \param inst the instance you want to predict
+   * \param output_margin whether to only predict margin value instead of transformed prediction
+   * \param out_preds output vector to hold the predictions
+   * \param ntree_limit limit the number of trees used in prediction
+   */
+  inline void Predict(const SparseBatch::Inst &inst,
+                      bool output_margin,
+                      std::vector<float> *out_preds,
+                      unsigned ntree_limit = 0) const;
+  /*!
+   * \brief Create a new instance of learner.
+   * \param cache_data The matrix to cache the prediction.
+   * \return Created learner. The usage example in this header wraps the
+   *  result in std::unique_ptr, so the caller presumably takes ownership
+   *  (confirm against the implementation).
+   */
+  static Learner* Create(const std::vector<DMatrix*>& cache_data);
+
+ protected:
+  /*! \brief internal base score of the model */
+  bst_float base_score_;
+  /*! \brief objective function */
+  std::unique_ptr<ObjFunction> obj_;
+  /*! \brief The gradient booster used by the model */
+  std::unique_ptr<GradientBooster> gbm_;
+  /*! \brief The evaluation metrics used to evaluate the model. */
+  std::vector<std::unique_ptr<Metric> > metrics_;
+};
+
+// implementation of inline functions.
+/*!
+ * Single-instance prediction: asks the booster for the raw score(s) of
+ * inst, shifts a lone output by base_score_, and, unless the caller asked
+ * for the margin, passes the result through the objective's PredTransform.
+ */
+inline void Learner::Predict(const SparseBatch::Inst& inst,
+                             bool output_margin,
+                             std::vector<float>* out_preds,
+                             unsigned ntree_limit) const {
+  // Let the booster fill out_preds with its raw output for this instance.
+  gbm_->Predict(inst, out_preds, ntree_limit);
+  // The base score is only applied when exactly one value was produced.
+  if (out_preds->size() == 1) {
+    out_preds->front() += base_score_;
+  }
+  // Margin requested: hand back the untransformed value(s).
+  if (output_margin) return;
+  obj_->PredTransform(out_preds);
+}
+}  // namespace xgboost
+#endif  // XGBOOST_LEARNER_H_