Dmatrix refactor stage 1 (#3301)

* Use sparse page as singular CSR matrix representation * Simplify dmatrix methods * Reduce statefullness of batch iterators * BREAKING CHANGE: Remove prob_buffer_row parameter. Users are instead recommended to sample their dataset as a preprocessing step before using XGBoost.
2018-06-07 10:25:58 +12:00
parent 286dccb8e8
commit a96039141a
47 changed files with 650 additions and 1036 deletions
--- a/src/linear/coordinate_common.h
+++ b/src/linear/coordinate_common.h
@@ -65,10 +65,10 @@ inline std::pair<double, double> GetGradient(int group_idx, int num_group, int f
                                             const std::vector<GradientPair> &gpair,
                                             DMatrix *p_fmat) {
  double sum_grad = 0.0, sum_hess = 0.0;
-  dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator({static_cast<bst_uint>(fidx)});
+  auto iter = p_fmat->ColIterator();
  while (iter->Next()) {
-    const ColBatch &batch = iter->Value();
-    ColBatch::Inst col = batch[0];
+    auto batch = iter->Value();
+    auto col = batch[fidx];
    const auto ndata = static_cast<bst_omp_uint>(col.length);
    for (bst_omp_uint j = 0; j < ndata; ++j) {
      const bst_float v = col[j].fvalue;
@@ -96,10 +96,10 @@ inline std::pair<double, double> GetGradientParallel(int group_idx, int num_grou
                                                     const std::vector<GradientPair> &gpair,
                                                     DMatrix *p_fmat) {
  double sum_grad = 0.0, sum_hess = 0.0;
-  dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator({static_cast<bst_uint>(fidx)});
+  auto iter = p_fmat->ColIterator();
  while (iter->Next()) {
-    const ColBatch &batch = iter->Value();
-    ColBatch::Inst col = batch[0];
+    auto batch = iter->Value();
+    auto col = batch[fidx];
    const auto ndata = static_cast<bst_omp_uint>(col.length);
 #pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
    for (bst_omp_uint j = 0; j < ndata; ++j) {
@@ -154,10 +154,10 @@ inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
                                   float dw, std::vector<GradientPair> *in_gpair,
                                   DMatrix *p_fmat) {
  if (dw == 0.0f) return;
-  dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator({static_cast<bst_uint>(fidx)});
+  auto iter = p_fmat->ColIterator();
  while (iter->Next()) {
-    const ColBatch &batch = iter->Value();
-    ColBatch::Inst col = batch[0];
+    auto batch = iter->Value();
+    auto col = batch[fidx];
    // update grad value
    const auto num_row = static_cast<bst_omp_uint>(col.length);
 #pragma omp parallel for schedule(static)
@@ -325,12 +325,12 @@ class GreedyFeatureSelector : public FeatureSelector {
    const bst_omp_uint nfeat = model.param.num_feature;
    // Calculate univariate gradient sums
    std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
-    dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
+    auto iter = p_fmat->ColIterator();
    while (iter->Next()) {
-      const ColBatch &batch = iter->Value();
+      auto batch = iter->Value();
      #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nfeat; ++i) {
-        const ColBatch::Inst col = batch[i];
+        const auto col = batch[i];
        const bst_uint ndata = col.length;
        auto &sums = gpair_sums_[group_idx * nfeat + i];
        for (bst_uint j = 0u; j < ndata; ++j) {
@@ -392,13 +392,13 @@ class ThriftyFeatureSelector : public FeatureSelector {
    }
    // Calculate univariate gradient sums
    std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
-    dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
+    auto iter = p_fmat->ColIterator();
    while (iter->Next()) {
-      const ColBatch &batch = iter->Value();
+      auto batch = iter->Value();
      // column-parallel is usually faster than row-parallel
      #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nfeat; ++i) {
-        const ColBatch::Inst col = batch[i];
+        const auto col = batch[i];
        const bst_uint ndata = col.length;
        for (bst_uint gid = 0u; gid < ngroup; ++gid) {
          auto &sums = gpair_sums_[gid * nfeat + i];