Categorical data support in CPU sketching. (#7221)
This commit is contained in:
parent
9f63d6fead
commit
31c1e13f90
@ -1,5 +1,5 @@
|
|||||||
/*!
|
/*!
|
||||||
* Copyright 2017-2020 by Contributors
|
* Copyright 2017-2021 by Contributors
|
||||||
* \file hist_util.h
|
* \file hist_util.h
|
||||||
* \brief Utility for fast histogram aggregation
|
* \brief Utility for fast histogram aggregation
|
||||||
* \author Philip Cho, Tianqi Chen
|
* \author Philip Cho, Tianqi Chen
|
||||||
@ -128,6 +128,7 @@ inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
HostSketchContainer container(reduced, max_bins,
|
HostSketchContainer container(reduced, max_bins,
|
||||||
|
m->Info().feature_types.ConstHostSpan(),
|
||||||
HostSketchContainer::UseGroup(info), threads);
|
HostSketchContainer::UseGroup(info), threads);
|
||||||
for (auto const &page : m->GetBatches<SparsePage>()) {
|
for (auto const &page : m->GetBatches<SparsePage>()) {
|
||||||
container.PushRowPage(page, info, hessian);
|
container.PushRowPage(page, info, hessian);
|
||||||
|
|||||||
@ -1,29 +1,35 @@
|
|||||||
/*!
|
/*!
|
||||||
* Copyright 2020 by XGBoost Contributors
|
* Copyright 2020-2021 by XGBoost Contributors
|
||||||
*/
|
*/
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include "quantile.h"
|
#include "quantile.h"
|
||||||
#include "hist_util.h"
|
#include "hist_util.h"
|
||||||
|
#include "categorical.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace common {
|
namespace common {
|
||||||
|
|
||||||
HostSketchContainer::HostSketchContainer(std::vector<bst_row_t> columns_size,
|
HostSketchContainer::HostSketchContainer(
|
||||||
int32_t max_bins, bool use_group,
|
std::vector<bst_row_t> columns_size, int32_t max_bins,
|
||||||
int32_t n_threads)
|
common::Span<FeatureType const> feature_types, bool use_group,
|
||||||
: columns_size_{std::move(columns_size)}, max_bins_{max_bins},
|
int32_t n_threads)
|
||||||
|
: feature_types_(feature_types.cbegin(), feature_types.cend()),
|
||||||
|
columns_size_{std::move(columns_size)}, max_bins_{max_bins},
|
||||||
use_group_ind_{use_group}, n_threads_{n_threads} {
|
use_group_ind_{use_group}, n_threads_{n_threads} {
|
||||||
monitor_.Init(__func__);
|
monitor_.Init(__func__);
|
||||||
CHECK_NE(columns_size_.size(), 0);
|
CHECK_NE(columns_size_.size(), 0);
|
||||||
sketches_.resize(columns_size_.size());
|
sketches_.resize(columns_size_.size());
|
||||||
CHECK_GE(n_threads_, 1);
|
CHECK_GE(n_threads_, 1);
|
||||||
|
categories_.resize(columns_size_.size());
|
||||||
ParallelFor(sketches_.size(), n_threads_, Sched::Auto(), [&](auto i) {
|
ParallelFor(sketches_.size(), n_threads_, Sched::Auto(), [&](auto i) {
|
||||||
auto n_bins = std::min(static_cast<size_t>(max_bins_), columns_size_[i]);
|
auto n_bins = std::min(static_cast<size_t>(max_bins_), columns_size_[i]);
|
||||||
n_bins = std::max(n_bins, static_cast<decltype(n_bins)>(1));
|
n_bins = std::max(n_bins, static_cast<decltype(n_bins)>(1));
|
||||||
auto eps = 1.0 / (static_cast<float>(n_bins) * WQSketch::kFactor);
|
auto eps = 1.0 / (static_cast<float>(n_bins) * WQSketch::kFactor);
|
||||||
sketches_[i].Init(columns_size_[i], eps);
|
if (!IsCat(this->feature_types_, i)) {
|
||||||
sketches_[i].inqueue.queue.resize(sketches_[i].limit_size * 2);
|
sketches_[i].Init(columns_size_[i], eps);
|
||||||
|
sketches_[i].inqueue.queue.resize(sketches_[i].limit_size * 2);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -182,13 +188,21 @@ void HostSketchContainer::PushRowPage(
|
|||||||
auto p_inst = inst.data();
|
auto p_inst = inst.data();
|
||||||
if (is_dense) {
|
if (is_dense) {
|
||||||
for (size_t ii = begin; ii < end; ii++) {
|
for (size_t ii = begin; ii < end; ii++) {
|
||||||
sketches_[ii].Push(p_inst[ii].fvalue, w);
|
if (IsCat(feature_types_, ii)) {
|
||||||
|
categories_[ii].emplace(p_inst[ii].fvalue);
|
||||||
|
} else {
|
||||||
|
sketches_[ii].Push(p_inst[ii].fvalue, w);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (size_t i = 0; i < inst.size(); ++i) {
|
for (size_t i = 0; i < inst.size(); ++i) {
|
||||||
auto const& entry = p_inst[i];
|
auto const& entry = p_inst[i];
|
||||||
if (entry.index >= begin && entry.index < end) {
|
if (entry.index >= begin && entry.index < end) {
|
||||||
sketches_[entry.index].Push(entry.fvalue, w);
|
if (IsCat(feature_types_, entry.index)) {
|
||||||
|
categories_[entry.index].emplace(entry.fvalue);
|
||||||
|
} else {
|
||||||
|
sketches_[entry.index].Push(entry.fvalue, w);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -338,6 +352,13 @@ void AddCutPoint(WQuantileSketch<float, float>::SummaryContainer const &summary,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void AddCategories(std::set<bst_cat_t> const &categories, HistogramCuts *cuts) {
|
||||||
|
auto &cut_values = cuts->cut_values_.HostVector();
|
||||||
|
for (auto const &v : categories) {
|
||||||
|
cut_values.push_back(v);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void HostSketchContainer::MakeCuts(HistogramCuts* cuts) {
|
void HostSketchContainer::MakeCuts(HistogramCuts* cuts) {
|
||||||
monitor_.Start(__func__);
|
monitor_.Start(__func__);
|
||||||
std::vector<WQSketch::SummaryContainer> reduced;
|
std::vector<WQSketch::SummaryContainer> reduced;
|
||||||
@ -348,6 +369,9 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) {
|
|||||||
std::vector<WQSketch::SummaryContainer> final_summaries(reduced.size());
|
std::vector<WQSketch::SummaryContainer> final_summaries(reduced.size());
|
||||||
|
|
||||||
ParallelFor(reduced.size(), n_threads_, Sched::Guided(), [&](size_t fidx) {
|
ParallelFor(reduced.size(), n_threads_, Sched::Guided(), [&](size_t fidx) {
|
||||||
|
if (IsCat(feature_types_, fidx)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
WQSketch::SummaryContainer &a = final_summaries[fidx];
|
WQSketch::SummaryContainer &a = final_summaries[fidx];
|
||||||
size_t max_num_bins = std::min(num_cuts[fidx], max_bins_);
|
size_t max_num_bins = std::min(num_cuts[fidx], max_bins_);
|
||||||
a.Reserve(max_num_bins + 1);
|
a.Reserve(max_num_bins + 1);
|
||||||
@ -367,13 +391,17 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) {
|
|||||||
for (size_t fid = 0; fid < reduced.size(); ++fid) {
|
for (size_t fid = 0; fid < reduced.size(); ++fid) {
|
||||||
size_t max_num_bins = std::min(num_cuts[fid], max_bins_);
|
size_t max_num_bins = std::min(num_cuts[fid], max_bins_);
|
||||||
WQSketch::SummaryContainer const& a = final_summaries[fid];
|
WQSketch::SummaryContainer const& a = final_summaries[fid];
|
||||||
AddCutPoint(a, max_num_bins, cuts);
|
if (IsCat(feature_types_, fid)) {
|
||||||
// push a value that is greater than anything
|
AddCategories(categories_.at(fid), cuts);
|
||||||
const bst_float cpt
|
} else {
|
||||||
= (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid];
|
AddCutPoint(a, max_num_bins, cuts);
|
||||||
// this must be bigger than last value in a scale
|
// push a value that is greater than anything
|
||||||
const bst_float last = cpt + (fabs(cpt) + 1e-5f);
|
const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value
|
||||||
cuts->cut_values_.HostVector().push_back(last);
|
: cuts->min_vals_.HostVector()[fid];
|
||||||
|
// this must be bigger than last value in a scale
|
||||||
|
const bst_float last = cpt + (fabs(cpt) + 1e-5f);
|
||||||
|
cuts->cut_values_.HostVector().push_back(last);
|
||||||
|
}
|
||||||
|
|
||||||
// Ensure that every feature gets at least one quantile point
|
// Ensure that every feature gets at least one quantile point
|
||||||
CHECK_LE(cuts->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max());
|
CHECK_LE(cuts->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max());
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/*!
|
/*!
|
||||||
* Copyright 2014 by Contributors
|
* Copyright 2014-2021 by Contributors
|
||||||
* \file quantile.h
|
* \file quantile.h
|
||||||
* \brief util to compute quantiles
|
* \brief util to compute quantiles
|
||||||
* \author Tianqi Chen
|
* \author Tianqi Chen
|
||||||
@ -15,6 +15,7 @@
|
|||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <set>
|
||||||
|
|
||||||
#include "timer.h"
|
#include "timer.h"
|
||||||
|
|
||||||
@ -707,6 +708,9 @@ class HostSketchContainer {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
std::vector<WQSketch> sketches_;
|
std::vector<WQSketch> sketches_;
|
||||||
|
std::vector<std::set<bst_cat_t>> categories_;
|
||||||
|
std::vector<FeatureType> const feature_types_;
|
||||||
|
|
||||||
std::vector<bst_row_t> columns_size_;
|
std::vector<bst_row_t> columns_size_;
|
||||||
int32_t max_bins_;
|
int32_t max_bins_;
|
||||||
bool use_group_ind_{false};
|
bool use_group_ind_{false};
|
||||||
@ -721,7 +725,8 @@ class HostSketchContainer {
|
|||||||
* \param use_group whether is assigned to group to data instance.
|
* \param use_group whether is assigned to group to data instance.
|
||||||
*/
|
*/
|
||||||
HostSketchContainer(std::vector<bst_row_t> columns_size, int32_t max_bins,
|
HostSketchContainer(std::vector<bst_row_t> columns_size, int32_t max_bins,
|
||||||
bool use_group, int32_t n_threads);
|
common::Span<FeatureType const> feature_types, bool use_group,
|
||||||
|
int32_t n_threads);
|
||||||
|
|
||||||
static bool UseGroup(MetaInfo const &info) {
|
static bool UseGroup(MetaInfo const &info) {
|
||||||
size_t const num_groups =
|
size_t const num_groups =
|
||||||
|
|||||||
@ -1,3 +1,6 @@
|
|||||||
|
/*!
|
||||||
|
* Copyright 2019-2021 by XGBoost Contributors
|
||||||
|
*/
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -388,5 +391,16 @@ TEST(HistUtil, SketchFromWeights) {
|
|||||||
TestSketchFromWeights(true);
|
TestSketchFromWeights(true);
|
||||||
TestSketchFromWeights(false);
|
TestSketchFromWeights(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Runs the shared categorical sketching test against the host sketching
// routine, once without and once with instance weights.
TEST(HistUtil, SketchCategoricalFeatures) {
  // Adapter giving SketchOnDMatrix the (DMatrix*, num_bins) call shape that
  // TestCategoricalSketch expects from its sketch argument.
  auto sketch_on_host = [](DMatrix *p_fmat, int32_t num_bins) {
    return SketchOnDMatrix(p_fmat, num_bins);
  };

  bool const kUnweighted = false;
  bool const kWeighted = true;
  TestCategoricalSketch(1000, 256, 32, kUnweighted, sketch_on_host);
  TestCategoricalSketch(1000, 256, 32, kWeighted, sketch_on_host);
}
|
||||||
} // namespace common
|
} // namespace common
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|||||||
@ -1,3 +1,6 @@
|
|||||||
|
/*!
|
||||||
|
* Copyright 2019-2021 by XGBoost Contributors
|
||||||
|
*/
|
||||||
#include <dmlc/filesystem.h>
|
#include <dmlc/filesystem.h>
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
@ -126,43 +129,15 @@ TEST(HistUtil, DeviceSketchCategoricalAsNumeric) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void TestCategoricalSketch(size_t n, size_t num_categories, int32_t num_bins, bool weighted) {
|
|
||||||
auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
|
|
||||||
auto dmat = GetDMatrixFromData(x, n, 1);
|
|
||||||
dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
|
|
||||||
|
|
||||||
if (weighted) {
|
|
||||||
std::vector<float> weights(n, 0);
|
|
||||||
SimpleLCG lcg;
|
|
||||||
SimpleRealUniformDistribution<float> dist(0, 1);
|
|
||||||
for (auto& v : weights) {
|
|
||||||
v = dist(&lcg);
|
|
||||||
}
|
|
||||||
dmat->Info().weights_.HostVector() = weights;
|
|
||||||
}
|
|
||||||
|
|
||||||
ASSERT_EQ(dmat->Info().feature_types.Size(), 1);
|
|
||||||
auto cuts = DeviceSketch(0, dmat.get(), num_bins);
|
|
||||||
std::sort(x.begin(), x.end());
|
|
||||||
auto n_uniques = std::unique(x.begin(), x.end()) - x.begin();
|
|
||||||
ASSERT_NE(n_uniques, x.size());
|
|
||||||
ASSERT_EQ(cuts.TotalBins(), n_uniques);
|
|
||||||
ASSERT_EQ(n_uniques, num_categories);
|
|
||||||
|
|
||||||
auto& values = cuts.cut_values_.HostVector();
|
|
||||||
ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
|
|
||||||
auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
|
|
||||||
ASSERT_TRUE(is_unique);
|
|
||||||
|
|
||||||
x.resize(n_uniques);
|
|
||||||
for (size_t i = 0; i < n_uniques; ++i) {
|
|
||||||
ASSERT_EQ(x[i], values[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(HistUtil, DeviceSketchCategoricalFeatures) {
|
TEST(HistUtil, DeviceSketchCategoricalFeatures) {
|
||||||
TestCategoricalSketch(1000, 256, 32, false);
|
TestCategoricalSketch(1000, 256, 32, false,
|
||||||
TestCategoricalSketch(1000, 256, 32, true);
|
[](DMatrix *p_fmat, int32_t num_bins) {
|
||||||
|
return DeviceSketch(0, p_fmat, num_bins);
|
||||||
|
});
|
||||||
|
TestCategoricalSketch(1000, 256, 32, true,
|
||||||
|
[](DMatrix *p_fmat, int32_t num_bins) {
|
||||||
|
return DeviceSketch(0, p_fmat, num_bins);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
void TestMixedSketch() {
|
void TestMixedSketch() {
|
||||||
|
|||||||
@ -1,3 +1,6 @@
|
|||||||
|
/*!
|
||||||
|
* Copyright 2019-2021 by XGBoost Contributors
|
||||||
|
*/
|
||||||
#pragma once
|
#pragma once
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
#include <dmlc/filesystem.h>
|
#include <dmlc/filesystem.h>
|
||||||
@ -5,6 +8,8 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
|
||||||
|
#include "../helpers.h"
|
||||||
#include "../../../src/common/hist_util.h"
|
#include "../../../src/common/hist_util.h"
|
||||||
#include "../../../src/data/simple_dmatrix.h"
|
#include "../../../src/data/simple_dmatrix.h"
|
||||||
#include "../../../src/data/adapter.h"
|
#include "../../../src/data/adapter.h"
|
||||||
@ -206,5 +211,45 @@ inline void ValidateCuts(const HistogramCuts& cuts, DMatrix* dmat,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Test for sketching on categorical data.
|
||||||
|
*
|
||||||
|
* \param sketch Sketch function, can be on device or on host.
|
||||||
|
*/
|
||||||
|
template <typename Fn>
|
||||||
|
void TestCategoricalSketch(size_t n, size_t num_categories, int32_t num_bins,
|
||||||
|
bool weighted, Fn sketch) {
|
||||||
|
auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
|
||||||
|
auto dmat = GetDMatrixFromData(x, n, 1);
|
||||||
|
dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
|
||||||
|
|
||||||
|
if (weighted) {
|
||||||
|
std::vector<float> weights(n, 0);
|
||||||
|
SimpleLCG lcg;
|
||||||
|
SimpleRealUniformDistribution<float> dist(0, 1);
|
||||||
|
for (auto& v : weights) {
|
||||||
|
v = dist(&lcg);
|
||||||
|
}
|
||||||
|
dmat->Info().weights_.HostVector() = weights;
|
||||||
|
}
|
||||||
|
|
||||||
|
ASSERT_EQ(dmat->Info().feature_types.Size(), 1);
|
||||||
|
auto cuts = sketch(dmat.get(), num_bins);
|
||||||
|
std::sort(x.begin(), x.end());
|
||||||
|
auto n_uniques = std::unique(x.begin(), x.end()) - x.begin();
|
||||||
|
ASSERT_NE(n_uniques, x.size());
|
||||||
|
ASSERT_EQ(cuts.TotalBins(), n_uniques);
|
||||||
|
ASSERT_EQ(n_uniques, num_categories);
|
||||||
|
|
||||||
|
auto& values = cuts.cut_values_.HostVector();
|
||||||
|
ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
|
||||||
|
auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
|
||||||
|
ASSERT_TRUE(is_unique);
|
||||||
|
|
||||||
|
x.resize(n_uniques);
|
||||||
|
for (size_t i = 0; i < n_uniques; ++i) {
|
||||||
|
ASSERT_EQ(x[i], values[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
} // namespace common
|
} // namespace common
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|||||||
@ -43,12 +43,14 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
|
|||||||
// Generate cuts for distributed environment.
|
// Generate cuts for distributed environment.
|
||||||
auto sparsity = 0.5f;
|
auto sparsity = 0.5f;
|
||||||
auto rank = rabit::GetRank();
|
auto rank = rabit::GetRank();
|
||||||
HostSketchContainer sketch_distributed(column_size, n_bins, false, OmpGetNumThreads(0));
|
|
||||||
auto m = RandomDataGenerator{rows, cols, sparsity}
|
auto m = RandomDataGenerator{rows, cols, sparsity}
|
||||||
.Seed(rank)
|
.Seed(rank)
|
||||||
.Lower(.0f)
|
.Lower(.0f)
|
||||||
.Upper(1.0f)
|
.Upper(1.0f)
|
||||||
.GenerateDMatrix();
|
.GenerateDMatrix();
|
||||||
|
HostSketchContainer sketch_distributed(
|
||||||
|
column_size, n_bins, m->Info().feature_types.ConstHostSpan(), false,
|
||||||
|
OmpGetNumThreads(0));
|
||||||
for (auto const &page : m->GetBatches<SparsePage>()) {
|
for (auto const &page : m->GetBatches<SparsePage>()) {
|
||||||
sketch_distributed.PushRowPage(page, m->Info());
|
sketch_distributed.PushRowPage(page, m->Info());
|
||||||
}
|
}
|
||||||
@ -59,7 +61,9 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
|
|||||||
rabit::Finalize();
|
rabit::Finalize();
|
||||||
CHECK_EQ(rabit::GetWorldSize(), 1);
|
CHECK_EQ(rabit::GetWorldSize(), 1);
|
||||||
std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; });
|
std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; });
|
||||||
HostSketchContainer sketch_on_single_node(column_size, n_bins, false, OmpGetNumThreads(0));
|
HostSketchContainer sketch_on_single_node(
|
||||||
|
column_size, n_bins, m->Info().feature_types.ConstHostSpan(), false,
|
||||||
|
OmpGetNumThreads(0));
|
||||||
for (auto rank = 0; rank < world; ++rank) {
|
for (auto rank = 0; rank < world; ++rank) {
|
||||||
auto m = RandomDataGenerator{rows, cols, sparsity}
|
auto m = RandomDataGenerator{rows, cols, sparsity}
|
||||||
.Seed(rank)
|
.Seed(rank)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user