complete refactor of data.h, now relies on iterator to access column

2014-08-27 17:00:21 -07:00
parent a59f8945dc
commit 605269133e
15 changed files with 216 additions and 492 deletions
--- a/src/gbm/gblinear-inl.hpp
+++ b/src/gbm/gblinear-inl.hpp
@@ -18,8 +18,7 @@ namespace gbm {
 * \brief gradient boosted linear model
 * \tparam FMatrix the data type updater taking
 */
-template<typename FMatrix>
-class GBLinear : public IGradBooster<FMatrix> {
+class GBLinear : public IGradBooster {
 public:
  virtual ~GBLinear(void) {
  }
@@ -41,13 +40,12 @@ class GBLinear : public IGradBooster<FMatrix> {
  virtual void InitModel(void) {
    model.InitModel();
  }
-  virtual void DoBoost(const FMatrix &fmat,
+  virtual void DoBoost(IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       std::vector<bst_gpair> *in_gpair) {
-    this->InitFeatIndex(fmat);
    std::vector<bst_gpair> &gpair = *in_gpair;
    const int ngroup = model.param.num_output_group;
-    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
    // for all the output group
    for (int gid = 0; gid < ngroup; ++gid) {
      double sum_grad = 0.0, sum_hess = 0.0;
@@ -72,42 +70,46 @@ class GBLinear : public IGradBooster<FMatrix> {
        }
      }
    }
-    // number of features
-    const bst_omp_uint nfeat = static_cast<bst_omp_uint>(feat_index.size());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < nfeat; ++i) {
-      const bst_uint fid = feat_index[i];
-      for (int gid = 0; gid < ngroup; ++gid) {
-        double sum_grad = 0.0, sum_hess = 0.0;
-        for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
-          const float v = it.fvalue();
-          bst_gpair &p = gpair[it.rindex() * ngroup + gid];
-          if (p.hess < 0.0f) continue;
-          sum_grad += p.grad * v;
-          sum_hess += p.hess * v * v;
-        }
-        float &w = model[fid][gid];
-        bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
-        w += dw;
-        // update grad value
-        for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
-          bst_gpair &p = gpair[it.rindex() * ngroup + gid];
-          if (p.hess < 0.0f) continue;
-          p.grad += p.hess * it.fvalue() * dw;
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+    while (iter->Next()) {
+      // number of features
+      const ColBatch &batch = iter->Value();
+      const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nfeat; ++i) {
+        const bst_uint fid = batch.col_index[i];
+        ColBatch::Inst col = batch[i];
+        for (int gid = 0; gid < ngroup; ++gid) {
+          double sum_grad = 0.0, sum_hess = 0.0;
+          for (bst_uint j = 0; j < col.length; ++j) {
+            const float v = col[j].fvalue;
+            bst_gpair &p = gpair[col[j].index * ngroup + gid];
+            if (p.hess < 0.0f) continue;
+            sum_grad += p.grad * v;
+            sum_hess += p.hess * v * v;
+          }
+          float &w = model[fid][gid];
+          bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
+          w += dw;
+          // update grad value
+          for (bst_uint j = 0; j < col.length; ++j) {
+            bst_gpair &p = gpair[col[j].index * ngroup + gid];
+            if (p.hess < 0.0f) continue;
+            p.grad += p.hess * col[j].fvalue * dw;
+          }
        }
      }
    }
  }

-  virtual void Predict(const FMatrix &fmat,
+  virtual void Predict(IFMatrix *p_fmat,
                       int64_t buffer_offset,
                       const BoosterInfo &info,
                       std::vector<float> *out_preds) {
    std::vector<float> &preds = *out_preds;
    preds.resize(0);
    // start collecting the prediction
-    utils::IIterator<RowBatch> *iter = fmat.RowIterator();
-    iter->BeforeFirst();
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
    const int ngroup = model.param.num_output_group;
    while (iter->Next()) {
      const RowBatch &batch = iter->Value();
@@ -134,18 +136,6 @@ class GBLinear : public IGradBooster<FMatrix> {
  }

 protected:
-  inline void InitFeatIndex(const FMatrix &fmat) {
-    if (feat_index.size() != 0) return;
-    // initialize feature index
-    unsigned ncol = static_cast<unsigned>(fmat.NumCol());
-    feat_index.reserve(ncol);
-    for (unsigned i = 0; i < ncol; ++i) {
-      if (fmat.GetColSize(i) != 0) {
-        feat_index.push_back(i);
-      }
-    }
-    random::Shuffle(feat_index);
-  }
  inline void Pred(const RowBatch::Inst &inst, float *preds) {
    for (int gid = 0; gid < model.param.num_output_group; ++gid) {
      float psum = model.bias()[gid];
--- a/src/gbm/gbm.h
+++ b/src/gbm/gbm.h
@@ -7,6 +7,7 @@
 */
 #include <vector>
 #include "../data.h"
+#include "../utils/io.h"
 #include "../utils/fmap.h"

 namespace xgboost {
@@ -14,9 +15,7 @@ namespace xgboost {
 namespace gbm {
 /*! 
 * \brief interface of gradient boosting model
- * \tparam FMatrix the data type updater taking
 */
-template<typename FMatrix>
 class IGradBooster {
 public:
  /*!
@@ -41,17 +40,17 @@ class IGradBooster {
  virtual void InitModel(void) = 0;
  /*!
   * \brief peform update to the model(boosting)
-   * \param fmat feature matrix that provide access to features
+   * \param p_fmat feature matrix that provide access to features
   * \param info meta information about training
   * \param in_gpair address of the gradient pair statistics of the data
   * the booster may change content of gpair
   */
-  virtual void DoBoost(const FMatrix &fmat,
+  virtual void DoBoost(IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       std::vector<bst_gpair> *in_gpair) = 0;
  /*!
   * \brief generate predictions for given feature matrix
-   * \param fmat feature matrix
+   * \param p_fmat feature matrix
   * \param buffer_offset buffer index offset of these instances, if equals -1
   *        this means we do not have buffer index allocated to the gbm
   *  a buffer index is assigned to each instance that requires repeative prediction
@@ -59,7 +58,7 @@ class IGradBooster {
   * \param info extra side information that may be needed for prediction
   * \param out_preds output vector to hold the predictions
   */
-  virtual void Predict(const FMatrix &fmat,
+  virtual void Predict(IFMatrix *p_fmat,
                       int64_t buffer_offset,
                       const BoosterInfo &info,
                       std::vector<float> *out_preds) = 0;
@@ -73,21 +72,11 @@ class IGradBooster {
  // destrcutor
  virtual ~IGradBooster(void){}
 };
-}  // namespace gbm
-}  // namespace xgboost
-
-#include "gbtree-inl.hpp"
-#include "gblinear-inl.hpp"
-
-namespace xgboost {
-namespace gbm {
-template<typename FMatrix>
-inline IGradBooster<FMatrix>* CreateGradBooster(const char *name) {
-  if (!strcmp("gbtree", name)) return new GBTree<FMatrix>();
-  if (!strcmp("gblinear", name)) return new GBLinear<FMatrix>();
-  utils::Error("unknown booster type: %s", name);
-  return NULL;
-}
+/*!
+ * \brief create a gradient booster from given name
+ * \param name name of gradient booster
+ */
+IGradBooster* CreateGradBooster(const char *name);
 }  // namespace gbm
 }  // namespace xgboost
 #endif  // XGBOOST_GBM_GBM_H_
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -9,16 +9,15 @@
 #include <utility>
 #include <string>
 #include "./gbm.h"
+#include "../utils/omp.h"
 #include "../tree/updater.h"

 namespace xgboost {
 namespace gbm {
 /*!
 * \brief gradient boosted tree
- * \tparam FMatrix the data type updater taking
 */
-template<typename FMatrix>
-class GBTree : public IGradBooster<FMatrix> {
+class GBTree : public IGradBooster {
 public:
  virtual ~GBTree(void) {
    this->Clear();
@@ -82,12 +81,12 @@ class GBTree : public IGradBooster<FMatrix> {
    utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
    utils::Assert(trees.size() == 0, "GBTree: model already initialized");
  }
-  virtual void DoBoost(const FMatrix &fmat,
+  virtual void DoBoost(IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       std::vector<bst_gpair> *in_gpair) {
    const std::vector<bst_gpair> &gpair = *in_gpair;
    if (mparam.num_output_group == 1) {
-      this->BoostNewTrees(gpair, fmat, info, 0);
+      this->BoostNewTrees(gpair, p_fmat, info, 0);
    } else {
      const int ngroup = mparam.num_output_group;
      utils::Check(gpair.size() % ngroup == 0,
@@ -99,11 +98,11 @@ class GBTree : public IGradBooster<FMatrix> {
        for (bst_omp_uint i = 0; i < nsize; ++i) {
          tmp[i] = gpair[i * ngroup + gid];
        }
-        this->BoostNewTrees(tmp, fmat, info, gid);
+        this->BoostNewTrees(tmp, p_fmat, info, gid);
      }
    }
  }
-  virtual void Predict(const FMatrix &fmat,
+  virtual void Predict(IFMatrix *p_fmat,
                       int64_t buffer_offset,
                       const BoosterInfo &info,
                       std::vector<float> *out_preds) {
@@ -121,7 +120,7 @@ class GBTree : public IGradBooster<FMatrix> {
    const size_t stride = info.num_row * mparam.num_output_group;
    preds.resize(stride * (mparam.size_leaf_vector+1));
    // start collecting the prediction
-    utils::IIterator<RowBatch> *iter = fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
    iter->BeforeFirst();
    while (iter->Next()) {
      const RowBatch &batch = iter->Value();
@@ -172,7 +171,7 @@ class GBTree : public IGradBooster<FMatrix> {
    char *pstr;
    pstr = strtok(&tval[0], ",");
    while (pstr != NULL) {
-      updaters.push_back(tree::CreateUpdater<FMatrix>(pstr));
+      updaters.push_back(tree::CreateUpdater(pstr));
      for (size_t j = 0; j < cfg.size(); ++j) {
        // set parameters
        updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
@@ -183,7 +182,7 @@ class GBTree : public IGradBooster<FMatrix> {
  }
  // do group specific group
  inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
-                            const FMatrix &fmat,
+                            IFMatrix *p_fmat,
                            const BoosterInfo &info,
                            int bst_group) {
    this->InitUpdater();
@@ -198,7 +197,7 @@ class GBTree : public IGradBooster<FMatrix> {
    }
    // update the trees
    for (size_t i = 0; i < updaters.size(); ++i) {
-      updaters[i]->Update(gpair, fmat, info, new_trees);
+      updaters[i]->Update(gpair, p_fmat, info, new_trees);
    }
    // push back to model
    for (size_t i = 0; i < new_trees.size(); ++i) {
@@ -361,7 +360,7 @@ class GBTree : public IGradBooster<FMatrix> {
  // temporal storage for per thread
  std::vector<tree::RegTree::FVec> thread_temp;
  // the updaters that can be applied to each of tree
-  std::vector< tree::IUpdater<FMatrix>* > updaters;
+  std::vector<tree::IUpdater*> updaters;
 };

 }  // namespace gbm