Categorical data support in CPU sketching. (#7221)

This commit is contained in:
Jiaming Yuan 2021-09-17 04:37:09 +08:00 committed by GitHub
parent 9f63d6fead
commit 31c1e13f90
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 129 additions and 57 deletions

View File

@ -1,5 +1,5 @@
/*! /*!
* Copyright 2017-2020 by Contributors * Copyright 2017-2021 by Contributors
* \file hist_util.h * \file hist_util.h
* \brief Utility for fast histogram aggregation * \brief Utility for fast histogram aggregation
* \author Philip Cho, Tianqi Chen * \author Philip Cho, Tianqi Chen
@ -128,6 +128,7 @@ inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins,
} }
} }
HostSketchContainer container(reduced, max_bins, HostSketchContainer container(reduced, max_bins,
m->Info().feature_types.ConstHostSpan(),
HostSketchContainer::UseGroup(info), threads); HostSketchContainer::UseGroup(info), threads);
for (auto const &page : m->GetBatches<SparsePage>()) { for (auto const &page : m->GetBatches<SparsePage>()) {
container.PushRowPage(page, info, hessian); container.PushRowPage(page, info, hessian);

View File

@ -1,29 +1,35 @@
/*! /*!
* Copyright 2020 by XGBoost Contributors * Copyright 2020-2021 by XGBoost Contributors
*/ */
#include <limits> #include <limits>
#include <utility> #include <utility>
#include "quantile.h" #include "quantile.h"
#include "hist_util.h" #include "hist_util.h"
#include "categorical.h"
namespace xgboost { namespace xgboost {
namespace common { namespace common {
HostSketchContainer::HostSketchContainer(std::vector<bst_row_t> columns_size, HostSketchContainer::HostSketchContainer(
int32_t max_bins, bool use_group, std::vector<bst_row_t> columns_size, int32_t max_bins,
int32_t n_threads) common::Span<FeatureType const> feature_types, bool use_group,
: columns_size_{std::move(columns_size)}, max_bins_{max_bins}, int32_t n_threads)
: feature_types_(feature_types.cbegin(), feature_types.cend()),
columns_size_{std::move(columns_size)}, max_bins_{max_bins},
use_group_ind_{use_group}, n_threads_{n_threads} { use_group_ind_{use_group}, n_threads_{n_threads} {
monitor_.Init(__func__); monitor_.Init(__func__);
CHECK_NE(columns_size_.size(), 0); CHECK_NE(columns_size_.size(), 0);
sketches_.resize(columns_size_.size()); sketches_.resize(columns_size_.size());
CHECK_GE(n_threads_, 1); CHECK_GE(n_threads_, 1);
categories_.resize(columns_size_.size());
ParallelFor(sketches_.size(), n_threads_, Sched::Auto(), [&](auto i) { ParallelFor(sketches_.size(), n_threads_, Sched::Auto(), [&](auto i) {
auto n_bins = std::min(static_cast<size_t>(max_bins_), columns_size_[i]); auto n_bins = std::min(static_cast<size_t>(max_bins_), columns_size_[i]);
n_bins = std::max(n_bins, static_cast<decltype(n_bins)>(1)); n_bins = std::max(n_bins, static_cast<decltype(n_bins)>(1));
auto eps = 1.0 / (static_cast<float>(n_bins) * WQSketch::kFactor); auto eps = 1.0 / (static_cast<float>(n_bins) * WQSketch::kFactor);
sketches_[i].Init(columns_size_[i], eps); if (!IsCat(this->feature_types_, i)) {
sketches_[i].inqueue.queue.resize(sketches_[i].limit_size * 2); sketches_[i].Init(columns_size_[i], eps);
sketches_[i].inqueue.queue.resize(sketches_[i].limit_size * 2);
}
}); });
} }
@ -182,13 +188,21 @@ void HostSketchContainer::PushRowPage(
auto p_inst = inst.data(); auto p_inst = inst.data();
if (is_dense) { if (is_dense) {
for (size_t ii = begin; ii < end; ii++) { for (size_t ii = begin; ii < end; ii++) {
sketches_[ii].Push(p_inst[ii].fvalue, w); if (IsCat(feature_types_, ii)) {
categories_[ii].emplace(p_inst[ii].fvalue);
} else {
sketches_[ii].Push(p_inst[ii].fvalue, w);
}
} }
} else { } else {
for (size_t i = 0; i < inst.size(); ++i) { for (size_t i = 0; i < inst.size(); ++i) {
auto const& entry = p_inst[i]; auto const& entry = p_inst[i];
if (entry.index >= begin && entry.index < end) { if (entry.index >= begin && entry.index < end) {
sketches_[entry.index].Push(entry.fvalue, w); if (IsCat(feature_types_, entry.index)) {
categories_[entry.index].emplace(entry.fvalue);
} else {
sketches_[entry.index].Push(entry.fvalue, w);
}
} }
} }
} }
@ -338,6 +352,13 @@ void AddCutPoint(WQuantileSketch<float, float>::SummaryContainer const &summary,
} }
} }
void AddCategories(std::set<bst_cat_t> const &categories, HistogramCuts *cuts) {
auto &cut_values = cuts->cut_values_.HostVector();
for (auto const &v : categories) {
cut_values.push_back(v);
}
}
void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { void HostSketchContainer::MakeCuts(HistogramCuts* cuts) {
monitor_.Start(__func__); monitor_.Start(__func__);
std::vector<WQSketch::SummaryContainer> reduced; std::vector<WQSketch::SummaryContainer> reduced;
@ -348,6 +369,9 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) {
std::vector<WQSketch::SummaryContainer> final_summaries(reduced.size()); std::vector<WQSketch::SummaryContainer> final_summaries(reduced.size());
ParallelFor(reduced.size(), n_threads_, Sched::Guided(), [&](size_t fidx) { ParallelFor(reduced.size(), n_threads_, Sched::Guided(), [&](size_t fidx) {
if (IsCat(feature_types_, fidx)) {
return;
}
WQSketch::SummaryContainer &a = final_summaries[fidx]; WQSketch::SummaryContainer &a = final_summaries[fidx];
size_t max_num_bins = std::min(num_cuts[fidx], max_bins_); size_t max_num_bins = std::min(num_cuts[fidx], max_bins_);
a.Reserve(max_num_bins + 1); a.Reserve(max_num_bins + 1);
@ -367,13 +391,17 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) {
for (size_t fid = 0; fid < reduced.size(); ++fid) { for (size_t fid = 0; fid < reduced.size(); ++fid) {
size_t max_num_bins = std::min(num_cuts[fid], max_bins_); size_t max_num_bins = std::min(num_cuts[fid], max_bins_);
WQSketch::SummaryContainer const& a = final_summaries[fid]; WQSketch::SummaryContainer const& a = final_summaries[fid];
AddCutPoint(a, max_num_bins, cuts); if (IsCat(feature_types_, fid)) {
// push a value that is greater than anything AddCategories(categories_.at(fid), cuts);
const bst_float cpt } else {
= (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid]; AddCutPoint(a, max_num_bins, cuts);
// this must be bigger than last value in a scale // push a value that is greater than anything
const bst_float last = cpt + (fabs(cpt) + 1e-5f); const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value
cuts->cut_values_.HostVector().push_back(last); : cuts->min_vals_.HostVector()[fid];
// this must be bigger than last value in a scale
const bst_float last = cpt + (fabs(cpt) + 1e-5f);
cuts->cut_values_.HostVector().push_back(last);
}
// Ensure that every feature gets at least one quantile point // Ensure that every feature gets at least one quantile point
CHECK_LE(cuts->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max()); CHECK_LE(cuts->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max());

View File

@ -1,5 +1,5 @@
/*! /*!
* Copyright 2014 by Contributors * Copyright 2014-2021 by Contributors
* \file quantile.h * \file quantile.h
* \brief util to compute quantiles * \brief util to compute quantiles
* \author Tianqi Chen * \author Tianqi Chen
@ -15,6 +15,7 @@
#include <cstring> #include <cstring>
#include <algorithm> #include <algorithm>
#include <iostream> #include <iostream>
#include <set>
#include "timer.h" #include "timer.h"
@ -707,6 +708,9 @@ class HostSketchContainer {
private: private:
std::vector<WQSketch> sketches_; std::vector<WQSketch> sketches_;
std::vector<std::set<bst_cat_t>> categories_;
std::vector<FeatureType> const feature_types_;
std::vector<bst_row_t> columns_size_; std::vector<bst_row_t> columns_size_;
int32_t max_bins_; int32_t max_bins_;
bool use_group_ind_{false}; bool use_group_ind_{false};
@ -721,7 +725,8 @@ class HostSketchContainer {
* \param use_group whether is assigned to group to data instance. * \param use_group whether is assigned to group to data instance.
*/ */
HostSketchContainer(std::vector<bst_row_t> columns_size, int32_t max_bins, HostSketchContainer(std::vector<bst_row_t> columns_size, int32_t max_bins,
bool use_group, int32_t n_threads); common::Span<FeatureType const> feature_types, bool use_group,
int32_t n_threads);
static bool UseGroup(MetaInfo const &info) { static bool UseGroup(MetaInfo const &info) {
size_t const num_groups = size_t const num_groups =

View File

@ -1,3 +1,6 @@
/*!
* Copyright 2019-2021 by XGBoost Contributors
*/
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <vector> #include <vector>
#include <string> #include <string>
@ -388,5 +391,16 @@ TEST(HistUtil, SketchFromWeights) {
TestSketchFromWeights(true); TestSketchFromWeights(true);
TestSketchFromWeights(false); TestSketchFromWeights(false);
} }
// Runs the shared categorical sketch check against SketchOnDMatrix, first
// without sample weights and then with them.
TEST(HistUtil, SketchCategoricalFeatures) {
  auto host_sketch = [](DMatrix *p_fmat, int32_t num_bins) {
    return SketchOnDMatrix(p_fmat, num_bins);
  };
  TestCategoricalSketch(1000, 256, 32, /*weighted=*/false, host_sketch);
  TestCategoricalSketch(1000, 256, 32, /*weighted=*/true, host_sketch);
}
} // namespace common } // namespace common
} // namespace xgboost } // namespace xgboost

View File

@ -1,3 +1,6 @@
/*!
* Copyright 2019-2021 by XGBoost Contributors
*/
#include <dmlc/filesystem.h> #include <dmlc/filesystem.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
@ -126,43 +129,15 @@ TEST(HistUtil, DeviceSketchCategoricalAsNumeric) {
} }
} }
void TestCategoricalSketch(size_t n, size_t num_categories, int32_t num_bins, bool weighted) {
auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
auto dmat = GetDMatrixFromData(x, n, 1);
dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
if (weighted) {
std::vector<float> weights(n, 0);
SimpleLCG lcg;
SimpleRealUniformDistribution<float> dist(0, 1);
for (auto& v : weights) {
v = dist(&lcg);
}
dmat->Info().weights_.HostVector() = weights;
}
ASSERT_EQ(dmat->Info().feature_types.Size(), 1);
auto cuts = DeviceSketch(0, dmat.get(), num_bins);
std::sort(x.begin(), x.end());
auto n_uniques = std::unique(x.begin(), x.end()) - x.begin();
ASSERT_NE(n_uniques, x.size());
ASSERT_EQ(cuts.TotalBins(), n_uniques);
ASSERT_EQ(n_uniques, num_categories);
auto& values = cuts.cut_values_.HostVector();
ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
ASSERT_TRUE(is_unique);
x.resize(n_uniques);
for (size_t i = 0; i < n_uniques; ++i) {
ASSERT_EQ(x[i], values[i]);
}
}
TEST(HistUtil, DeviceSketchCategoricalFeatures) { TEST(HistUtil, DeviceSketchCategoricalFeatures) {
TestCategoricalSketch(1000, 256, 32, false); TestCategoricalSketch(1000, 256, 32, false,
TestCategoricalSketch(1000, 256, 32, true); [](DMatrix *p_fmat, int32_t num_bins) {
return DeviceSketch(0, p_fmat, num_bins);
});
TestCategoricalSketch(1000, 256, 32, true,
[](DMatrix *p_fmat, int32_t num_bins) {
return DeviceSketch(0, p_fmat, num_bins);
});
} }
void TestMixedSketch() { void TestMixedSketch() {

View File

@ -1,3 +1,6 @@
/*!
* Copyright 2019-2021 by XGBoost Contributors
*/
#pragma once #pragma once
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <dmlc/filesystem.h> #include <dmlc/filesystem.h>
@ -5,6 +8,8 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include <fstream> #include <fstream>
#include "../helpers.h"
#include "../../../src/common/hist_util.h" #include "../../../src/common/hist_util.h"
#include "../../../src/data/simple_dmatrix.h" #include "../../../src/data/simple_dmatrix.h"
#include "../../../src/data/adapter.h" #include "../../../src/data/adapter.h"
@ -206,5 +211,45 @@ inline void ValidateCuts(const HistogramCuts& cuts, DMatrix* dmat,
} }
} }
/**
* \brief Test for sketching on categorical data.
*
* \param sketch Sketch function, can be on device or on host.
*/
template <typename Fn>
void TestCategoricalSketch(size_t n, size_t num_categories, int32_t num_bins,
bool weighted, Fn sketch) {
auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
auto dmat = GetDMatrixFromData(x, n, 1);
dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
if (weighted) {
std::vector<float> weights(n, 0);
SimpleLCG lcg;
SimpleRealUniformDistribution<float> dist(0, 1);
for (auto& v : weights) {
v = dist(&lcg);
}
dmat->Info().weights_.HostVector() = weights;
}
ASSERT_EQ(dmat->Info().feature_types.Size(), 1);
auto cuts = sketch(dmat.get(), num_bins);
std::sort(x.begin(), x.end());
auto n_uniques = std::unique(x.begin(), x.end()) - x.begin();
ASSERT_NE(n_uniques, x.size());
ASSERT_EQ(cuts.TotalBins(), n_uniques);
ASSERT_EQ(n_uniques, num_categories);
auto& values = cuts.cut_values_.HostVector();
ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
ASSERT_TRUE(is_unique);
x.resize(n_uniques);
for (size_t i = 0; i < n_uniques; ++i) {
ASSERT_EQ(x[i], values[i]);
}
}
} // namespace common } // namespace common
} // namespace xgboost } // namespace xgboost

View File

@ -43,12 +43,14 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
// Generate cuts for distributed environment. // Generate cuts for distributed environment.
auto sparsity = 0.5f; auto sparsity = 0.5f;
auto rank = rabit::GetRank(); auto rank = rabit::GetRank();
HostSketchContainer sketch_distributed(column_size, n_bins, false, OmpGetNumThreads(0));
auto m = RandomDataGenerator{rows, cols, sparsity} auto m = RandomDataGenerator{rows, cols, sparsity}
.Seed(rank) .Seed(rank)
.Lower(.0f) .Lower(.0f)
.Upper(1.0f) .Upper(1.0f)
.GenerateDMatrix(); .GenerateDMatrix();
HostSketchContainer sketch_distributed(
column_size, n_bins, m->Info().feature_types.ConstHostSpan(), false,
OmpGetNumThreads(0));
for (auto const &page : m->GetBatches<SparsePage>()) { for (auto const &page : m->GetBatches<SparsePage>()) {
sketch_distributed.PushRowPage(page, m->Info()); sketch_distributed.PushRowPage(page, m->Info());
} }
@ -59,7 +61,9 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
rabit::Finalize(); rabit::Finalize();
CHECK_EQ(rabit::GetWorldSize(), 1); CHECK_EQ(rabit::GetWorldSize(), 1);
std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; }); std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; });
HostSketchContainer sketch_on_single_node(column_size, n_bins, false, OmpGetNumThreads(0)); HostSketchContainer sketch_on_single_node(
column_size, n_bins, m->Info().feature_types.ConstHostSpan(), false,
OmpGetNumThreads(0));
for (auto rank = 0; rank < world; ++rank) { for (auto rank = 0; rank < world; ++rank) {
auto m = RandomDataGenerator{rows, cols, sparsity} auto m = RandomDataGenerator{rows, cols, sparsity}
.Seed(rank) .Seed(rank)