Support categorical data in GPU sketching. (#6137)

2020-09-21 13:53:06 +08:00
parent c932fb50a1
commit 210c131ce7
6 changed files with 196 additions and 62 deletions
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -108,7 +108,7 @@ TEST(HistUtil, DeviceSketchDeterminism) {
  }
 }

-TEST(HistUtil, DeviceSketchCategorical) {
+TEST(HistUtil, DeviceSketchCategoricalAsNumeric) {
  int categorical_sizes[] = {2, 6, 8, 12};
  int num_bins = 256;
  int sizes[] = {25, 100, 1000};
@@ -122,6 +122,33 @@ TEST(HistUtil, DeviceSketchCategorical) {
  }
 }

+void TestCategoricalSketch(size_t n, size_t num_categories, int32_t num_bins) {
+  auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
+  auto dmat = GetDMatrixFromData(x, n, 1);
+  dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
+  ASSERT_EQ(dmat->Info().feature_types.Size(), 1);
+  auto cuts = DeviceSketch(0, dmat.get(), num_bins);
+  std::sort(x.begin(), x.end());
+  auto n_uniques = std::unique(x.begin(), x.end()) - x.begin();
+  ASSERT_NE(n_uniques, x.size());
+  ASSERT_EQ(cuts.TotalBins(), n_uniques);
+  ASSERT_EQ(n_uniques, num_categories);
+
+  auto& values = cuts.cut_values_.HostVector();
+  ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
+  auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
+  ASSERT_TRUE(is_unique);
+
+  x.resize(n_uniques);
+  for (size_t i = 0; i < n_uniques; ++i) {
+    ASSERT_EQ(x[i], values[i]);
+  }
+}
+
+TEST(HistUtil, DeviceSketchCategoricalFeatures) {
+  TestCategoricalSketch(1000, 256, 32);
+}
+
 TEST(HistUtil, DeviceSketchMultipleColumns) {
  int bin_sizes[] = {2, 16, 256, 512};
  int sizes[] = {100, 1000, 1500};
@@ -237,7 +264,8 @@ TEST(HistUtil, DeviceSketchExternalMemoryWithWeights) {
 template <typename Adapter>
 auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) {
  common::HistogramCuts batched_cuts;
-  SketchContainer sketch_container(num_bins, adapter.NumColumns(), adapter.NumRows(), 0);
+  HostDeviceVector<FeatureType> ft;
+  SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), 0);
  MetaInfo info;
  AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits<float>::quiet_NaN(),
                      &sketch_container);
@@ -305,7 +333,8 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) {
  dh::GlobalMemoryLogger().Clear();
  ConsoleLogger::Configure({{"verbosity", "3"}});
  common::HistogramCuts batched_cuts;
-  SketchContainer sketch_container(num_bins, num_columns, num_rows, 0);
+  HostDeviceVector<FeatureType> ft;
+  SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0);
  AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits<float>::quiet_NaN(),
                      &sketch_container);
  HistogramCuts cuts;
@@ -332,10 +361,12 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
  dh::GlobalMemoryLogger().Clear();
  ConsoleLogger::Configure({{"verbosity", "3"}});
  common::HistogramCuts batched_cuts;
-  SketchContainer sketch_container(num_bins, num_columns, num_rows, 0);
+  HostDeviceVector<FeatureType> ft;
+  SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0);
  AdapterDeviceSketch(adapter.Value(), num_bins, info,
                      std::numeric_limits<float>::quiet_NaN(),
                      &sketch_container);
+
  HistogramCuts cuts;
  sketch_container.MakeCuts(&cuts);
  ConsoleLogger::Configure({{"verbosity", "0"}});
@@ -477,9 +508,11 @@ void TestAdapterSketchFromWeights(bool with_group) {

  data::CupyAdapter adapter(m);
  auto const& batch = adapter.Value();
-  SketchContainer sketch_container(kBins, kCols, kRows, 0);
+  HostDeviceVector<FeatureType> ft;
+  SketchContainer sketch_container(ft, kBins, kCols, kRows, 0);
  AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
                      &sketch_container);
+
  common::HistogramCuts cuts;
  sketch_container.MakeCuts(&cuts);

--- a/tests/cpp/common/test_quantile.cu
+++ b/tests/cpp/common/test_quantile.cu
@@ -8,7 +8,8 @@ namespace xgboost {
 namespace common {
 TEST(GPUQuantile, Basic) {
  constexpr size_t kRows = 1000, kCols = 100, kBins = 256;
-  SketchContainer sketch(kBins, kCols, kRows, 0);
+  HostDeviceVector<FeatureType> ft;
+  SketchContainer sketch(ft, kBins, kCols, kRows, 0);
  dh::caching_device_vector<Entry> entries;
  dh::device_vector<bst_row_t> cuts_ptr(kCols+1);
  thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0);
@@ -20,7 +21,8 @@ TEST(GPUQuantile, Basic) {
 void TestSketchUnique(float sparsity) {
  constexpr size_t kRows = 1000, kCols = 100;
  RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) {
-    SketchContainer sketch(n_bins, kCols, kRows, 0);
+    HostDeviceVector<FeatureType> ft;
+    SketchContainer sketch(ft, n_bins, kCols, kRows, 0);

    HostDeviceVector<float> storage;
    std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity}
@@ -93,8 +95,10 @@ void TestQuantileElemRank(int32_t device, Span<SketchEntry const> in,

 TEST(GPUQuantile, Prune) {
  constexpr size_t kRows = 1000, kCols = 100;
-  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
-    SketchContainer sketch(n_bins, kCols, kRows, 0);
+  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
+                                 MetaInfo const &info) {
+    HostDeviceVector<FeatureType> ft;
+    SketchContainer sketch(ft, n_bins, kCols, kRows, 0);

    HostDeviceVector<float> storage;
    std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
@@ -111,8 +115,8 @@ TEST(GPUQuantile, Prune) {
    if (n_bins <= kRows) {
      ASSERT_EQ(sketch.Data().size(), n_bins * kCols);
    } else {
-      // LE because kRows * kCols is pushed into sketch, after removing duplicated entries
-      // we might not have that much inputs for prune.
+      // LE because kRows * kCols is pushed into sketch, after removing
+      // duplicated entries we might not have that much inputs for prune.
      ASSERT_LE(sketch.Data().size(), kRows * kCols);
    }
    // This is not necessarily true for all inputs without calling unique after
@@ -127,7 +131,8 @@ TEST(GPUQuantile, Prune) {
 TEST(GPUQuantile, MergeEmpty) {
  constexpr size_t kRows = 1000, kCols = 100;
  size_t n_bins = 10;
-  SketchContainer sketch_0(n_bins, kCols, kRows, 0);
+  HostDeviceVector<FeatureType> ft;
+  SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0);
  HostDeviceVector<float> storage_0;
  std::string interface_str_0 =
      RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface(
@@ -166,7 +171,8 @@ TEST(GPUQuantile, MergeEmpty) {
 TEST(GPUQuantile, MergeBasic) {
  constexpr size_t kRows = 1000, kCols = 100;
  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
-    SketchContainer sketch_0(n_bins, kCols, kRows, 0);
+    HostDeviceVector<FeatureType> ft;
+    SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0);
    HostDeviceVector<float> storage_0;
    std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0}
                                      .Device(0)
@@ -176,7 +182,7 @@ TEST(GPUQuantile, MergeBasic) {
    AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
                        std::numeric_limits<float>::quiet_NaN(), &sketch_0);

-    SketchContainer sketch_1(n_bins, kCols, kRows * kRows, 0);
+    SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, 0);
    HostDeviceVector<float> storage_1;
    std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0}
                                      .Device(0)
@@ -212,7 +218,8 @@ TEST(GPUQuantile, MergeBasic) {
 void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
  MetaInfo info;
  int32_t seed = 0;
-  SketchContainer sketch_0(n_bins, cols, rows, 0);
+  HostDeviceVector<FeatureType> ft;
+  SketchContainer sketch_0(ft, n_bins, cols, rows, 0);
  HostDeviceVector<float> storage_0;
  std::string interface_str_0 = RandomDataGenerator{rows, cols, 0}
                                    .Device(0)
@@ -224,7 +231,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
                      &sketch_0);

  size_t f_rows = rows * frac;
-  SketchContainer sketch_1(n_bins, cols, f_rows, 0);
+  SketchContainer sketch_1(ft, n_bins, cols, f_rows, 0);
  HostDeviceVector<float> storage_1;
  std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0}
                                    .Device(0)
@@ -286,12 +293,14 @@ TEST(GPUQuantile, AllReduceBasic) {
  }

  constexpr size_t kRows = 1000, kCols = 100;
-  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
-    // Set up single node version;
-    SketchContainer sketch_on_single_node(n_bins, kCols, kRows, 0);
+  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
+                                 MetaInfo const &info) {
+    // Set up single node version
+    HostDeviceVector<FeatureType> ft;
+    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0);

-    size_t intermediate_num_cuts =
-        std::min(kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
+    size_t intermediate_num_cuts = std::min(
+        kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
    std::vector<SketchContainer> containers;
    for (auto rank = 0; rank < world; ++rank) {
      HostDeviceVector<float> storage;
@@ -300,12 +309,13 @@ TEST(GPUQuantile, AllReduceBasic) {
                                      .Seed(rank + seed)
                                      .GenerateArrayInterface(&storage);
      data::CupyAdapter adapter(interface_str);
-      containers.emplace_back(n_bins, kCols, kRows, 0);
+      HostDeviceVector<FeatureType> ft;
+      containers.emplace_back(ft, n_bins, kCols, kRows, 0);
      AdapterDeviceSketch(adapter.Value(), n_bins, info,
                          std::numeric_limits<float>::quiet_NaN(),
                          &containers.back());
    }
-    for (auto& sketch : containers) {
+    for (auto &sketch : containers) {
      sketch.Prune(intermediate_num_cuts);
      sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data());
      sketch_on_single_node.FixError();
@@ -317,7 +327,7 @@ TEST(GPUQuantile, AllReduceBasic) {
    // Set up distributed version.  We rely on using rank as seed to generate
    // the exact same copy of data.
    auto rank = rabit::GetRank();
-    SketchContainer sketch_distributed(n_bins, kCols, kRows, 0);
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
    HostDeviceVector<float> storage;
    std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
                                    .Device(0)
@@ -376,7 +386,8 @@ TEST(GPUQuantile, SameOnAllWorkers) {
  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
                                 MetaInfo const &info) {
    auto rank = rabit::GetRank();
-    SketchContainer sketch_distributed(n_bins, kCols, kRows, 0);
+    HostDeviceVector<FeatureType> ft;
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
    HostDeviceVector<float> storage;
    std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
                                    .Device(0)