Optimizations for quantisation on device (#4572)

* do not create device vectors for the entire sparse page while computing histograms
   - while creating the compressed histogram indices, the row vector was previously allocated for
     the entire sparse page batch; this is unnecessary, since we only process one chunk at a time,
     sized to a slice of the total GPU memory
   - this PR allocates only as much device memory as is needed to hold the appropriate row indices
     and entries for the chunk being processed (a sketch of the idea follows after these notes)

* do not dereference row_ptrs on the host once the device_vector has been created, to avoid
  device-to-host copies of those counts
   - instead, read the entry counts directly from the SparsePage (see the second sketch below)
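
Below is a minimal sketch of the chunked-allocation idea from the first note. RowChunk,
BuildCompressedIndex and the other names are hypothetical stand-ins, not the actual DeviceShard
code; the point is only that the device buffers are sized per chunk rather than per batch:

    #include <thrust/device_vector.h>
    #include <cstddef>
    #include <vector>

    struct Entry { int index; float fvalue; };   // stand-in for xgboost::Entry

    struct RowChunk {                  // one slice of the sparse page that fits in GPU memory
      size_t row_begin, row_end;       // [row_begin, row_end) rows covered by this chunk
      size_t entry_begin, entry_end;   // [entry_begin, entry_end) entries covered by this chunk
    };

    void BuildCompressedIndex(const std::vector<size_t>& row_ptr,
                              const std::vector<Entry>& entries,
                              const std::vector<RowChunk>& chunks) {
      for (const RowChunk& c : chunks) {
        // Size the device buffers for this chunk only, not for the whole batch.
        thrust::device_vector<size_t> d_row_ptr(row_ptr.begin() + c.row_begin,
                                                row_ptr.begin() + c.row_end + 1);
        thrust::device_vector<Entry> d_entries(entries.begin() + c.entry_begin,
                                               entries.begin() + c.entry_end);
        // ... launch the index-compression kernel over d_row_ptr / d_entries ...
      }
    }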
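
And a hedged sketch of the second note: indexing a thrust::device_vector from host code issues
one device-to-host copy per access, so the counts can instead be computed from the page's
host-side row offsets. page_offset and both function names are illustrative only, not the actual
updater code:

    #include <thrust/device_vector.h>
    #include <cstddef>
    #include <vector>

    // Before: each operator[] on a thrust::device_vector from host code triggers a cudaMemcpy.
    size_t EntriesInRangeSlow(const thrust::device_vector<size_t>& d_row_ptrs,
                              size_t row_begin, size_t row_end) {
      return d_row_ptrs[row_end] - d_row_ptrs[row_begin];  // two single-value device reads
    }

    // After: compute the same count from the host-side row offsets of the sparse page.
    size_t EntriesInRange(const std::vector<size_t>& page_offset,
                          size_t row_begin, size_t row_end) {
      return page_offset[row_end] - page_offset[row_begin];  // pure host arithmetic
    }
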
Author: sriramch
Date: 2019-06-18 15:50:25 -07:00
Committed by: Rory Mitchell
Parent: ba1d848767
Commit: 6757654337
2 changed files with 46 additions and 37 deletions


@@ -477,7 +477,7 @@ TEST(GpuHist, SortPosition) {
   TestSortPosition({1, 2, 1, 2, 3}, 1, 2);
 }
 
-TEST(GpuHist, TestHistogramIndex) {
+void TestHistogramIndexImpl(int n_gpus) {
   // Test if the compressed histogram index matches when using a sparse
   // dmatrix with and without using external memory
@@ -491,31 +491,47 @@ TEST(GpuHist, TestHistogramIndex) {
       CreateSparsePageDMatrixWithRC(kNRows, kNCols, 128UL, true));
 
   std::vector<std::pair<std::string, std::string>> training_params = {
-    {"max_depth", "1"},
+    {"max_depth", "10"},
     {"max_leaves", "0"}
   };
 
-  LearnerTrainParam learner_param(CreateEmptyGenericParam(0, 1));
+  LearnerTrainParam learner_param(CreateEmptyGenericParam(0, n_gpus));
   hist_maker.Init(training_params, &learner_param);
   hist_maker.InitDataOnce(hist_maker_dmat.get());
   hist_maker_ext.Init(training_params, &learner_param);
   hist_maker_ext.InitDataOnce(hist_maker_ext_dmat.get());
 
+  ASSERT_EQ(hist_maker.shards_.size(), hist_maker_ext.shards_.size());
+
   // Extract the device shards from the histogram makers and from that its compressed
   // histogram index
-  const auto &dev_shard = hist_maker.shards_[0];
-  std::vector<common::CompressedByteT> h_gidx_buffer(dev_shard->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&h_gidx_buffer, dev_shard->gidx_buffer);
+  for (size_t i = 0; i < hist_maker.shards_.size(); ++i) {
+    const auto &dev_shard = hist_maker.shards_[i];
+    std::vector<common::CompressedByteT> h_gidx_buffer(dev_shard->gidx_buffer.size());
+    dh::CopyDeviceSpanToVector(&h_gidx_buffer, dev_shard->gidx_buffer);
 
-  const auto &dev_shard_ext = hist_maker_ext.shards_[0];
-  std::vector<common::CompressedByteT> h_gidx_buffer_ext(dev_shard_ext->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, dev_shard_ext->gidx_buffer);
+    const auto &dev_shard_ext = hist_maker_ext.shards_[i];
+    std::vector<common::CompressedByteT> h_gidx_buffer_ext(dev_shard_ext->gidx_buffer.size());
+    dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, dev_shard_ext->gidx_buffer);
 
-  ASSERT_EQ(dev_shard->n_bins, dev_shard_ext->n_bins);
-  ASSERT_EQ(dev_shard->gidx_buffer.size(), dev_shard_ext->gidx_buffer.size());
+    ASSERT_EQ(dev_shard->n_bins, dev_shard_ext->n_bins);
+    ASSERT_EQ(dev_shard->gidx_buffer.size(), dev_shard_ext->gidx_buffer.size());
 
-  ASSERT_EQ(h_gidx_buffer, h_gidx_buffer_ext);
+    ASSERT_EQ(h_gidx_buffer, h_gidx_buffer_ext);
+  }
 }
+
+TEST(GpuHist, TestHistogramIndex) {
+  TestHistogramIndexImpl(1);
+}
+
+#if defined(XGBOOST_USE_NCCL)
+TEST(GpuHist, MGPU_TestHistogramIndex) {
+  auto devices = GPUSet::AllVisible();
+  CHECK_GT(devices.Size(), 1);
+  TestHistogramIndexImpl(-1);
+}
+#endif
 
 }  // namespace tree
 }  // namespace xgboost