complete refactor of data.h, now relies on iterators to access columns

This commit is contained in:
tqchen@graphlab.com
2014-08-27 17:00:21 -07:00
parent a59f8945dc
commit 605269133e
15 changed files with 216 additions and 492 deletions

View File

@@ -8,7 +8,7 @@
*/
#include <vector>
#include "../data.h"
#include "../utils/io.h"
namespace xgboost {
namespace learner {
/*!
@@ -142,7 +142,6 @@ struct MetaInfo {
* \brief data object used for learning,
* \tparam FMatrix type of feature data source
*/
template<typename FMatrix>
struct DMatrix {
/*!
* \brief magic number associated with this object
@@ -152,7 +151,7 @@ struct DMatrix {
/*! \brief meta information about the dataset */
MetaInfo info;
/*! \brief feature matrix about data content */
FMatrix fmat;
IFMatrix *fmat;
/*!
* \brief cache pointer to verify if the data structure is cached in some learner
* used to verify if DMatrix is cached
@@ -161,7 +160,9 @@ struct DMatrix {
/*! \brief default constructor */
explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {}
// virtual destructor
virtual ~DMatrix(void){}
virtual ~DMatrix(void){
delete fmat;
}
};
} // namespace learner

View File

@@ -21,7 +21,6 @@ namespace learner {
 * \brief learner that does gradient boosting on specific objective functions
 *  and performs training and prediction
*/
template<typename FMatrix>
class BoostLearner {
public:
BoostLearner(void) {
@@ -44,7 +43,7 @@ class BoostLearner {
* data matrices to continue training otherwise it will cause error
* \param mats array of pointers to matrix whose prediction result need to be cached
*/
inline void SetCacheData(const std::vector<DMatrix<FMatrix>*>& mats) {
inline void SetCacheData(const std::vector<DMatrix*>& mats) {
// estimate feature bound
unsigned num_feature = 0;
// assign buffer index
@@ -158,15 +157,15 @@ class BoostLearner {
 * if not, initialize it
* \param p_train pointer to the matrix used by training
*/
inline void CheckInit(DMatrix<FMatrix> *p_train) {
p_train->fmat.InitColAccess(prob_buffer_row);
inline void CheckInit(DMatrix *p_train) {
p_train->fmat->InitColAccess(prob_buffer_row);
}
/*!
* \brief update the model for one iteration
* \param iter current iteration number
* \param p_train pointer to the data matrix
*/
inline void UpdateOneIter(int iter, const DMatrix<FMatrix> &train) {
inline void UpdateOneIter(int iter, const DMatrix &train) {
this->PredictRaw(train, &preds_);
obj_->GetGradient(preds_, train.info, iter, &gpair_);
gbm_->DoBoost(train.fmat, train.info.info, &gpair_);
@@ -179,7 +178,7 @@ class BoostLearner {
* \return a string corresponding to the evaluation result
*/
inline std::string EvalOneIter(int iter,
const std::vector<const DMatrix<FMatrix>*> &evals,
const std::vector<const DMatrix*> &evals,
const std::vector<std::string> &evname) {
std::string res;
char tmp[256];
@@ -198,7 +197,7 @@ class BoostLearner {
* \param metric name of metric
* \return a pair of <evaluation name, result>
*/
std::pair<std::string, float> Evaluate(const DMatrix<FMatrix> &data, std::string metric) {
std::pair<std::string, float> Evaluate(const DMatrix &data, std::string metric) {
if (metric == "auto") metric = obj_->DefaultEvalMetric();
IEvaluator *ev = CreateEvaluator(metric.c_str());
this->PredictRaw(data, &preds_);
@@ -213,7 +212,7 @@ class BoostLearner {
* \param output_margin whether to only predict margin value instead of transformed prediction
* \param out_preds output vector that stores the prediction
*/
inline void Predict(const DMatrix<FMatrix> &data,
inline void Predict(const DMatrix &data,
bool output_margin,
std::vector<float> *out_preds) const {
this->PredictRaw(data, out_preds);
@@ -235,7 +234,7 @@ class BoostLearner {
if (obj_ != NULL) return;
utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
obj_ = CreateObjFunction(name_obj_.c_str());
gbm_ = gbm::CreateGradBooster<FMatrix>(name_gbm_.c_str());
gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
for (size_t i = 0; i < cfg_.size(); ++i) {
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
@@ -247,7 +246,7 @@ class BoostLearner {
* \param data training data matrix
* \param out_preds output vector that stores the prediction
*/
inline void PredictRaw(const DMatrix<FMatrix> &data,
inline void PredictRaw(const DMatrix &data,
std::vector<float> *out_preds) const {
gbm_->Predict(data.fmat, this->FindBufferOffset(data),
data.info.info, out_preds);
@@ -307,7 +306,7 @@ class BoostLearner {
// model parameter
ModelParam mparam;
// gbm model that back everything
gbm::IGradBooster<FMatrix> *gbm_;
gbm::IGradBooster *gbm_;
// name of gbm model used for training
std::string name_gbm_;
// objective function
@@ -324,14 +323,14 @@ class BoostLearner {
private:
// cache entry object that helps handle feature caching
struct CacheEntry {
const DMatrix<FMatrix> *mat_;
const DMatrix *mat_;
size_t buffer_offset_;
size_t num_row_;
CacheEntry(const DMatrix<FMatrix> *mat, size_t buffer_offset, size_t num_row)
CacheEntry(const DMatrix *mat, size_t buffer_offset, size_t num_row)
:mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
};
// find internal buffer offset for a certain matrix; if it does not exist, return -1
inline int64_t FindBufferOffset(const DMatrix<FMatrix> &mat) const {
inline int64_t FindBufferOffset(const DMatrix &mat) const {
for (size_t i = 0; i < cache_.size(); ++i) {
if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
if (cache_[i].num_row_ == mat.info.num_row()) {