- training with external memory - part 2 of 2 (#4526)

* - training with external memory - part 2 of 2 - when external memory support is enabled, the histogram indices are built incrementally for every sparse page - the entire set of input data is divided across multiple GPUs, and the relative row positions within each device are tracked when building the compressed histogram buffer - this was tested using a mortgage dataset containing ~670m rows before 4x T4s could be saturated
2019-06-11 14:52:56 -07:00
parent 4591039eba
commit a2042b685a
4 changed files with 292 additions and 61 deletions
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -6,6 +6,7 @@
 #include <random>
 #include <cinttypes>
 #include <dmlc/filesystem.h>
+#include "../../src/data/simple_csr_source.h"

 bool FileExists(const std::string& filename) {
  struct stat st;
@@ -165,6 +166,71 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, size_t page_s
  return dmat;
 }

+std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
+                                                       size_t page_size, bool deterministic) {
+  if (!n_rows || !n_cols) {
+    return nullptr;  // nothing can be generated for an empty shape
+  }
+
+  // Write the generated rows as a libsvm text file inside a temporary directory
+  dmlc::TemporaryDirectory tempdir;
+  const std::string tmp_file = tempdir.path + "/big.libsvm";
+
+  std::ofstream fo(tmp_file.c_str());
+  size_t cols_per_row = ((std::max(n_rows, n_cols) - 1) / std::min(n_rows, n_cols)) + 1;
+  int64_t rem_cols = n_cols;
+  size_t col_idx = 0;
+
+  // Random generator for the number of features placed in each of the later rows
+  std::random_device rdev;
+  std::unique_ptr<std::mt19937> gen;
+  if (deterministic) {
+     // Seed it with a constant value for this configuration - without getting too fancy
+     // with ordered pairing functions and the like to make it truly unique
+     gen.reset(new std::mt19937(n_rows * n_cols));
+  } else {
+     gen.reset(new std::mt19937(rdev()));
+  }
+  std::uniform_int_distribution<size_t> dis(1, n_cols);
+
+  for (size_t i = 0; i < n_rows; ++i) {
+    // Make sure that every column is slotted into the first few rows; the remaining rows
+    // each receive a random number of columns
+    std::stringstream row_data;
+    fo << i;  // the row index is written as the leading field of the line
+    size_t j = 0;
+    if (rem_cols > 0) {
+       for (; j < std::min(static_cast<size_t>(rem_cols), cols_per_row); ++j) {
+         row_data << " " << (col_idx+j) << ":" << (col_idx+j+1)*10;  // value is (fid+1)*10
+       }
+       rem_cols -= cols_per_row;  // may drop below zero, hence the signed counter
+    } else {
+       // Take some random number of columns in [1, n_cols] and slot them here
+       size_t ncols = dis(*gen);
+       for (; j < ncols; ++j) {
+         size_t fid = (col_idx+j) % n_cols;  // wrap around to stay within [0, n_cols)
+         row_data << " " << fid << ":" << (fid+1)*10;
+       }
+    }
+    col_idx += j;
+
+    fo << row_data.str() << "\n";
+  }
+  fo.close();
+
+  std::unique_ptr<DMatrix> dmat(DMatrix::Load(
+    tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size));
+  EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
+
+  if (!page_size) {  // no paging requested: hand back a copy held by a SimpleCSRSource
+    std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource);
+    source->CopyFrom(dmat.get());
+    return std::unique_ptr<DMatrix>(DMatrix::Create(std::move(source)));
+  } else {
+    return dmat;  // NOTE(review): tempdir goes out of scope here - confirm cache files stay usable
+  }
+}
+
 gbm::GBTreeModel CreateTestModel() {
  std::vector<std::unique_ptr<RegTree>> trees;
  trees.push_back(std::unique_ptr<RegTree>(new RegTree));
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -165,6 +165,27 @@ std::shared_ptr<xgboost::DMatrix> *CreateDMatrix(int rows, int columns,

 std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, size_t page_size);

+/**
+ * \fn std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
+ *                                              size_t page_size, bool deterministic);
+ *
+ * \brief Creates dmatrix whose leading records together cover every feature once; each
+ *        remaining record contains a random number of features in [1, n_cols]
+ *
+ * \param n_rows      Number of records to create.
+ * \param n_cols      Max number of features within that record.
+ * \param page_size   Sparse page size for the pages within the dmatrix. If page size is 0
+ *                    then the entire dmatrix is resident in memory; else, multiple sparse pages
+ *                    of page size are created and backed to disk, which would have to be
+ *                    streamed in at point of use.
+ * \param deterministic The content inside the dmatrix is constant for this configuration, if true;
+ *                      else, the content changes every time this method is invoked
+ *
+ * \return The new dmatrix, or nullptr when n_rows or n_cols is 0.
+ */
+std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
+                                                       size_t page_size, bool deterministic);
+
 gbm::GBTreeModel CreateTestModel();

 inline LearnerTrainParam CreateEmptyGenericParam(int gpu_id, int n_gpus) {
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -77,7 +77,14 @@ void BuildGidx(DeviceShard<GradientSumT>* shard, int n_rows, int n_cols,

  auto is_dense = (*dmat)->Info().num_nonzero_ ==
                  (*dmat)->Info().num_row_ * (*dmat)->Info().num_col_;
-  shard->InitCompressedData(cmat, batch, is_dense);
+  size_t row_stride = 0;
+  const auto &offset_vec = batch.offset.ConstHostVector();
+  for (size_t i = 1; i < offset_vec.size(); ++i) {
+    row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
+  }
+  shard->InitCompressedData(cmat, row_stride, is_dense);
+  shard->CreateHistIndices(
+    batch, cmat, RowStateOnDevice(batch.Size(), batch.Size()), -1);

  delete dmat;
 }
@@ -469,5 +476,46 @@ TEST(GpuHist, SortPosition) {
  TestSortPosition({2, 2, 2, 2}, 1, 2);
  TestSortPosition({1, 2, 1, 2, 3}, 1, 2);
 }
+
+TEST(GpuHist, TestHistogramIndex) {
+  // Verify that the compressed histogram index built from a sparse dmatrix is
+  // the same whether or not external memory is used
+
+  int constexpr kNRows = 1000, kNCols = 10;
+
+  // Build 2 matrices of the same shape (deterministic content) and a histogram maker for each
+  tree::GPUHistMakerSpecialised<GradientPairPrecise> hist_maker, hist_maker_ext;
+  std::unique_ptr<DMatrix> hist_maker_dmat(
+    CreateSparsePageDMatrixWithRC(kNRows, kNCols, 0, true));  // page size 0: in memory
+  std::unique_ptr<DMatrix> hist_maker_ext_dmat(
+    CreateSparsePageDMatrixWithRC(kNRows, kNCols, 128UL, true));  // paged, backed to disk
+
+  std::vector<std::pair<std::string, std::string>> training_params = {
+    {"max_depth", "1"},
+    {"max_leaves", "0"}
+  };
+
+  LearnerTrainParam learner_param(CreateEmptyGenericParam(0, 1));
+  hist_maker.Init(training_params, &learner_param);
+  hist_maker.InitDataOnce(hist_maker_dmat.get());
+  hist_maker_ext.Init(training_params, &learner_param);
+  hist_maker_ext.InitDataOnce(hist_maker_ext_dmat.get());
+
+  // Take the first device shard of each histogram maker and copy its compressed
+  // histogram index into a std::vector for comparison
+  const auto &dev_shard = hist_maker.shards_[0];
+  std::vector<common::CompressedByteT> h_gidx_buffer(dev_shard->gidx_buffer.size());
+  dh::CopyDeviceSpanToVector(&h_gidx_buffer, dev_shard->gidx_buffer);
+
+  const auto &dev_shard_ext = hist_maker_ext.shards_[0];
+  std::vector<common::CompressedByteT> h_gidx_buffer_ext(dev_shard_ext->gidx_buffer.size());
+  dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, dev_shard_ext->gidx_buffer);
+
+  ASSERT_EQ(dev_shard->n_bins, dev_shard_ext->n_bins);
+  ASSERT_EQ(dev_shard->gidx_buffer.size(), dev_shard_ext->gidx_buffer.size());
+
+  ASSERT_EQ(h_gidx_buffer, h_gidx_buffer_ext);  // element-wise comparison of both indices
+}
+
 }  // namespace tree
 }  // namespace xgboost