DMatrix refactor stage 1 (#3301)

* Use sparse page as singular CSR matrix representation

* Simplify dmatrix methods

* Reduce statefulness of batch iterators

* BREAKING CHANGE: Remove prob_buffer_row parameter. Users are instead recommended to sample their dataset as a preprocessing step before using XGBoost.
This commit is contained in:
Rory Mitchell
2018-06-07 10:25:58 +12:00
committed by GitHub
parent 286dccb8e8
commit a96039141a
47 changed files with 650 additions and 1036 deletions

View File

@@ -65,10 +65,10 @@ inline std::pair<double, double> GetGradient(int group_idx, int num_group, int f
const std::vector<GradientPair> &gpair,
DMatrix *p_fmat) {
double sum_grad = 0.0, sum_hess = 0.0;
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator({static_cast<bst_uint>(fidx)});
auto iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
ColBatch::Inst col = batch[0];
auto batch = iter->Value();
auto col = batch[fidx];
const auto ndata = static_cast<bst_omp_uint>(col.length);
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_float v = col[j].fvalue;
@@ -96,10 +96,10 @@ inline std::pair<double, double> GetGradientParallel(int group_idx, int num_grou
const std::vector<GradientPair> &gpair,
DMatrix *p_fmat) {
double sum_grad = 0.0, sum_hess = 0.0;
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator({static_cast<bst_uint>(fidx)});
auto iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
ColBatch::Inst col = batch[0];
auto batch = iter->Value();
auto col = batch[fidx];
const auto ndata = static_cast<bst_omp_uint>(col.length);
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
for (bst_omp_uint j = 0; j < ndata; ++j) {
@@ -154,10 +154,10 @@ inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
float dw, std::vector<GradientPair> *in_gpair,
DMatrix *p_fmat) {
if (dw == 0.0f) return;
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator({static_cast<bst_uint>(fidx)});
auto iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
ColBatch::Inst col = batch[0];
auto batch = iter->Value();
auto col = batch[fidx];
// update grad value
const auto num_row = static_cast<bst_omp_uint>(col.length);
#pragma omp parallel for schedule(static)
@@ -325,12 +325,12 @@ class GreedyFeatureSelector : public FeatureSelector {
const bst_omp_uint nfeat = model.param.num_feature;
// Calculate univariate gradient sums
std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
auto iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
auto batch = iter->Value();
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nfeat; ++i) {
const ColBatch::Inst col = batch[i];
const auto col = batch[i];
const bst_uint ndata = col.length;
auto &sums = gpair_sums_[group_idx * nfeat + i];
for (bst_uint j = 0u; j < ndata; ++j) {
@@ -392,13 +392,13 @@ class ThriftyFeatureSelector : public FeatureSelector {
}
// Calculate univariate gradient sums
std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
auto iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
auto batch = iter->Value();
// column-parallel is usually faster than row-parallel
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nfeat; ++i) {
const ColBatch::Inst col = batch[i];
const auto col = batch[i];
const bst_uint ndata = col.length;
for (bst_uint gid = 0u; gid < ngroup; ++gid) {
auto &sums = gpair_sums_[gid * nfeat + i];

View File

@@ -81,7 +81,7 @@ struct GPUCoordinateTrainParam
float reg_alpha_denorm;
};
void RescaleIndices(size_t ridx_begin, dh::DVec<SparseBatch::Entry> *data) {
void RescaleIndices(size_t ridx_begin, dh::DVec<Entry> *data) {
auto d_data = data->Data();
dh::LaunchN(data->DeviceIdx(), data->Size(),
[=] __device__(size_t idx) { d_data[idx].index -= ridx_begin; });
@@ -92,14 +92,14 @@ class DeviceShard {
int normalised_device_idx_; // Device index counting from param.gpu_id
dh::BulkAllocator<dh::MemoryType::kDevice> ba_;
std::vector<size_t> row_ptr_;
dh::DVec<SparseBatch::Entry> data_;
dh::DVec<Entry> data_;
dh::DVec<GradientPair> gpair_;
dh::CubMemory temp_;
size_t ridx_begin_;
size_t ridx_end_;
public:
DeviceShard(int device_idx, int normalised_device_idx, const ColBatch &batch,
DeviceShard(int device_idx, int normalised_device_idx, const SparsePage &batch,
bst_uint row_begin, bst_uint row_end,
const GPUCoordinateTrainParam &param,
const gbm::GBLinearModelParam &model_param)
@@ -112,17 +112,17 @@ class DeviceShard {
// this shard
std::vector<std::pair<bst_uint, bst_uint>> column_segments;
row_ptr_ = {0};
for (auto fidx = 0; fidx < batch.size; fidx++) {
for (auto fidx = 0; fidx < batch.Size(); fidx++) {
auto col = batch[fidx];
auto cmp = [](SparseBatch::Entry e1, SparseBatch::Entry e2) {
auto cmp = [](Entry e1, Entry e2) {
return e1.index < e2.index;
};
auto column_begin =
std::lower_bound(col.data, col.data + col.length,
SparseBatch::Entry(row_begin, 0.0f), cmp);
Entry(row_begin, 0.0f), cmp);
auto column_end =
std::upper_bound(col.data, col.data + col.length,
SparseBatch::Entry(row_end, 0.0f), cmp);
Entry(row_end, 0.0f), cmp);
column_segments.push_back(
std::make_pair(column_begin - col.data, column_end - col.data));
row_ptr_.push_back(row_ptr_.back() + column_end - column_begin);
@@ -130,8 +130,8 @@ class DeviceShard {
ba_.Allocate(device_idx, param.silent, &data_, row_ptr_.back(), &gpair_,
(row_end - row_begin) * model_param.num_output_group);
for (int fidx = 0; fidx < batch.size; fidx++) {
ColBatch::Inst col = batch[fidx];
for (int fidx = 0; fidx < batch.Size(); fidx++) {
auto col = batch[fidx];
thrust::copy(col.data + column_segments[fidx].first,
col.data + column_segments[fidx].second,
data_.tbegin() + row_ptr_[fidx]);
@@ -233,7 +233,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
row_begin = row_end;
}
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
auto iter = p_fmat->ColIterator();
CHECK(p_fmat->SingleColBlock());
iter->Next();
auto batch = iter->Value();

View File

@@ -79,17 +79,17 @@ class ShotgunUpdater : public LinearUpdater {
// lock-free parallel updates of weights
selector_->Setup(*model, in_gpair->HostVector(), p_fmat,
param_.reg_alpha_denorm, param_.reg_lambda_denorm, 0);
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
auto iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
const auto nfeat = static_cast<bst_omp_uint>(batch.size);
auto batch = iter->Value();
const auto nfeat = static_cast<bst_omp_uint>(batch.Size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nfeat; ++i) {
int ii = selector_->NextFeature(i, *model, 0, in_gpair->HostVector(), p_fmat,
param_.reg_alpha_denorm, param_.reg_lambda_denorm);
if (ii < 0) continue;
const bst_uint fid = batch.col_index[ii];
ColBatch::Inst col = batch[ii];
const bst_uint fid = ii;
auto col = batch[ii];
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
for (bst_uint j = 0; j < col.length; ++j) {