GPU binning and compression. (#3319)
* GPU binning and compression: binning and index compression are now performed inside the DeviceShard constructor. If a DMatrix has multiple row batches, it is first converted into a single row batch.
This commit is contained in:
committed by
Rory Mitchell
parent
3f7696ff53
commit
286dccb8e8
73
tests/cpp/common/test_gpu_compressed_iterator.cu
Normal file
73
tests/cpp/common/test_gpu_compressed_iterator.cu
Normal file
@@ -0,0 +1,73 @@
|
||||
#include "../../../src/common/compressed_iterator.h"
|
||||
#include "../../../src/common/device_helpers.cuh"
|
||||
#include "gtest/gtest.h"
|
||||
#include <algorithm>
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
// Device functor: stores one raw symbol into the shared compressed buffer.
// Uses AtomicWriteSymbol because adjacent symbols may share bytes when the
// symbol width is not a multiple of 8 bits.
struct WriteSymbolFunction {
  CompressedBufferWriter cbw;    // writer configured for the alphabet size
  unsigned char* buffer_data_d;  // destination compressed buffer (device)
  int* input_data_d;             // raw symbols to compress (device)

  WriteSymbolFunction(CompressedBufferWriter writer, unsigned char* buffer,
                      int* input)
      : cbw(writer), buffer_data_d(buffer), input_data_d(input) {}

  // Compress the i-th input symbol into position i of the buffer.
  __device__ void operator()(size_t i) {
    cbw.AtomicWriteSymbol(buffer_data_d, input_data_d[i], i);
  }
};
|
||||
|
||||
// Device functor: decodes one symbol from the compressed buffer into the
// output array via the CompressedIterator's random-access operator[].
struct ReadSymbolFunction {
  CompressedIterator<int> ci;  // iterator over the compressed buffer
  int* output_data_d;          // decompressed symbols destination (device)

  ReadSymbolFunction(CompressedIterator<int> iter, int* output)
      : ci(iter), output_data_d(output) {}

  // Decode the i-th symbol.
  __device__ void operator()(size_t i) {
    int symbol = ci[i];
    output_data_d[i] = symbol;
  }
};
|
||||
|
||||
// Round-trip test: compress random symbols on the GPU, decompress them on
// the GPU, and check the result equals the original host input. Covers
// alphabet sizes from 1 bit up to the full 32-bit range.
TEST(CompressedIterator, TestGPU) {
  std::vector<int> test_cases = {1, 3, 426, 21, 64, 256, 100000, INT32_MAX};
  const int num_elements = 1000;
  const int repetitions = 1000;
  srand(9);  // fixed seed for reproducibility

  for (auto alphabet_size : test_cases) {
    for (int rep = 0; rep < repetitions; ++rep) {
      // Generate host-side random symbols in [0, alphabet_size).
      std::vector<int> input(num_elements);
      std::generate(input.begin(), input.end(),
                    [=]() { return rand() % alphabet_size; });

      CompressedBufferWriter cbw(alphabet_size);
      thrust::device_vector<int> input_d(input);
      thrust::device_vector<unsigned char> buffer_d(
          CompressedBufferWriter::CalculateBufferSize(input.size(),
                                                      alphabet_size));

      // Compress on the device, one thread per symbol.
      dh::LaunchN(0, input_d.size(),
                  WriteSymbolFunction(cbw, buffer_d.data().get(),
                                      input_d.data().get()));

      // Decompress on the device, one thread per symbol.
      CompressedIterator<int> ci(buffer_d.data().get(), alphabet_size);
      thrust::device_vector<int> output_d(input.size());
      dh::LaunchN(0, output_d.size(),
                  ReadSymbolFunction(ci, output_d.data().get()));

      // Copy back to the host and compare with the original data.
      std::vector<int> output(output_d.size());
      thrust::copy(output_d.begin(), output_d.end(), output.begin());
      ASSERT_TRUE(input == output);
    }
  }
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
@@ -18,11 +18,19 @@ long GetFileSize(const std::string filename) {
|
||||
}
|
||||
|
||||
std::string CreateSimpleTestData() {
|
||||
return CreateBigTestData(6);
|
||||
}
|
||||
|
||||
std::string CreateBigTestData(size_t n_entries) {
|
||||
std::string tmp_file = TempFileName();
|
||||
std::ofstream fo;
|
||||
fo.open(tmp_file);
|
||||
fo << "0 0:0 1:10 2:20\n";
|
||||
fo << "1 0:0 3:30 4:40\n";
|
||||
const size_t entries_per_row = 3;
|
||||
size_t n_rows = (n_entries + entries_per_row - 1) / entries_per_row;
|
||||
for (size_t i = 0; i < n_rows; ++i) {
|
||||
const char* row = i % 2 == 0 ? " 0:0 1:10 2:20\n" : " 0:0 3:30 4:40\n";
|
||||
fo << i << row;
|
||||
}
|
||||
fo.close();
|
||||
return tmp_file;
|
||||
}
|
||||
|
||||
@@ -23,6 +23,8 @@ long GetFileSize(const std::string filename);
|
||||
|
||||
std::string CreateSimpleTestData();
|
||||
|
||||
std::string CreateBigTestData(size_t n_entries);
|
||||
|
||||
void CheckObjFunction(xgboost::ObjFunction * obj,
|
||||
std::vector<xgboost::bst_float> preds,
|
||||
std::vector<xgboost::bst_float> labels,
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "../helpers.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "../../../src/data/sparse_page_source.h"
|
||||
#include "../../../src/gbm/gbtree_model.h"
|
||||
#include "../../../src/tree/updater_gpu_hist.cu"
|
||||
|
||||
@@ -24,8 +25,14 @@ TEST(gpu_hist_experimental, TestSparseShard) {
|
||||
gmat.Init(dmat.get());
|
||||
TrainParam p;
|
||||
p.max_depth = 6;
|
||||
DeviceShard shard(0, 0, gmat, 0, rows, hmat.row_ptr.back(),
|
||||
p);
|
||||
|
||||
dmlc::DataIter<RowBatch>* iter = dmat->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
CHECK(iter->Next());
|
||||
const RowBatch& batch = iter->Value();
|
||||
DeviceShard shard(0, 0, 0, rows, hmat.row_ptr.back(), p);
|
||||
shard.Init(hmat, batch);
|
||||
CHECK(!iter->Next());
|
||||
|
||||
ASSERT_LT(shard.row_stride, columns);
|
||||
|
||||
@@ -59,8 +66,15 @@ TEST(gpu_hist_experimental, TestDenseShard) {
|
||||
gmat.Init(dmat.get());
|
||||
TrainParam p;
|
||||
p.max_depth = 6;
|
||||
DeviceShard shard(0, 0, gmat, 0, rows, hmat.row_ptr.back(),
|
||||
p);
|
||||
|
||||
dmlc::DataIter<RowBatch>* iter = dmat->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
CHECK(iter->Next());
|
||||
const RowBatch& batch = iter->Value();
|
||||
|
||||
DeviceShard shard(0, 0, 0, rows, hmat.row_ptr.back(), p);
|
||||
shard.Init(hmat, batch);
|
||||
CHECK(!iter->Next());
|
||||
|
||||
ASSERT_EQ(shard.row_stride, columns);
|
||||
|
||||
@@ -75,4 +89,4 @@ TEST(gpu_hist_experimental, TestDenseShard) {
|
||||
}
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost
|
||||
|
||||
Reference in New Issue
Block a user