Support building gradient index with cat data. (#7371)

Jiaming Yuan 2021-11-03 22:37:37 +08:00 committed by GitHub
parent 57a4b4ff64
commit ccdabe4512
10 changed files with 105 additions and 27 deletions

@ -16,13 +16,12 @@
#include <utility>
#include <map>
#include "row_set.h"
#include "categorical.h"
#include "common.h"
#include "quantile.h"
#include "row_set.h"
#include "threading_utils.h"
#include "../tree/param.h"
#include "./quantile.h"
#include "./timer.h"
#include "../include/rabit/rabit.h"
#include "timer.h"
namespace xgboost {
class GHistIndexMatrix;
@ -105,9 +104,29 @@ class HistogramCuts {
return idx;
}
/**
* \brief Search the bin index for numerical feature.
*/
BinIdx SearchBin(Entry const& e) const {
return SearchBin(e.fvalue, e.index);
}
/**
* \brief Search the bin index for categorical feature.
*/
BinIdx SearchCatBin(Entry const &e) const {
auto const &ptrs = this->Ptrs();
auto const &vals = this->Values();
auto end = ptrs.at(e.index + 1) + vals.cbegin();
auto beg = ptrs[e.index] + vals.cbegin();
// Truncates the value in case it's not perfectly rounded.
auto v = static_cast<float>(common::AsCat(e.fvalue));
auto bin_idx = std::lower_bound(beg, end, v) - vals.cbegin();
if (bin_idx == ptrs.at(e.index + 1)) {
bin_idx -= 1;
}
return bin_idx;
}
};
inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins,
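
The new SearchCatBin above maps a categorical value onto its bin by binary-searching the feature's segment of the cut values, which for a categorical feature holds the sorted unique category codes between Ptrs()[fidx] and Ptrs()[fidx + 1]. Below is a minimal standalone sketch of that lookup, assuming AsCat simply truncates the float to an integer code; the names are illustrative stand-ins, not the real xgboost API.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// `ptrs` holds per-feature offsets into `vals`; for a categorical feature,
// `vals` stores its sorted unique category codes.
uint32_t SearchCatBinSketch(std::vector<uint32_t> const &ptrs,
                            std::vector<float> const &vals,
                            uint32_t fidx, float fvalue) {
  auto beg = vals.cbegin() + ptrs[fidx];
  auto end = vals.cbegin() + ptrs[fidx + 1];
  // Truncate toward the integer code in case the value is not perfectly rounded.
  auto v = static_cast<float>(static_cast<int32_t>(fvalue));
  auto bin_idx = static_cast<uint32_t>(std::lower_bound(beg, end, v) - vals.cbegin());
  if (bin_idx == ptrs[fidx + 1]) {
    bin_idx -= 1;  // clamp codes beyond the last recorded category
  }
  return bin_idx;
}

int main() {
  std::vector<uint32_t> ptrs{0, 3};        // a single feature
  std::vector<float> vals{0.f, 2.f, 5.f};  // its category codes
  assert(SearchCatBinSketch(ptrs, vals, 0, 2.f) == 1);
  assert(SearchCatBinSketch(ptrs, vals, 0, 5.0001f) == 2);
  return 0;
}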

@ -3,6 +3,8 @@
*/
#include <limits>
#include <utility>
#include "rabit/rabit.h"
#include "quantile.h"
#include "hist_util.h"
#include "categorical.h"
@ -189,7 +191,7 @@ void HostSketchContainer::PushRowPage(
if (is_dense) {
for (size_t ii = begin; ii < end; ii++) {
if (IsCat(feature_types_, ii)) {
categories_[ii].emplace(p_inst[ii].fvalue);
categories_[ii].emplace(AsCat(p_inst[ii].fvalue));
} else {
sketches_[ii].Push(p_inst[ii].fvalue, w);
}
@ -199,7 +201,7 @@ void HostSketchContainer::PushRowPage(
auto const& entry = p_inst[i];
if (entry.index >= begin && entry.index < end) {
if (IsCat(feature_types_, entry.index)) {
categories_[entry.index].emplace(entry.fvalue);
categories_[entry.index].emplace(AsCat(entry.fvalue));
} else {
sketches_[entry.index].Push(entry.fvalue, w);
}
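
In both hunks above, the value pushed into the per-feature category set is now AsCat(fvalue) rather than the raw float, so the recorded categories match the truncation SearchCatBin applies at lookup time, and values differing only by floating-point noise collapse into one category. A small illustration of the difference, using an integer truncation as a stand-in for common::AsCat:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <set>

// Stand-in for common::AsCat: truncate the stored float to a category code.
int32_t AsCatSketch(float v) { return static_cast<int32_t>(v); }

int main() {
  float a = 3.0f;
  float b = std::nextafterf(3.0f, 4.0f);  // smallest float above 3.0f

  std::set<float> raw;      // emplacing raw fvalues keeps near-duplicates
  std::set<int32_t> coded;  // emplacing the truncated code collapses them
  for (float v : {a, b}) {
    raw.emplace(v);
    coded.emplace(AsCatSketch(v));
  }
  assert(raw.size() == 2);    // two distinct floats
  assert(coded.size() == 1);  // one category
  return 0;
}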

@ -9,8 +9,9 @@
namespace xgboost {
void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin,
size_t prev_sum, uint32_t nbins,
void GHistIndexMatrix::PushBatch(SparsePage const &batch,
common::Span<FeatureType const> ft,
size_t rbegin, size_t prev_sum, uint32_t nbins,
int32_t n_threads) {
// The number of threads is pegged to the batch size. If the OMP
// block is parallelized on anything other than the batch/block size,
@ -86,7 +87,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin,
common::BinTypeSize curent_bin_size = index.GetBinTypeSize();
if (curent_bin_size == common::kUint8BinsTypeSize) {
common::Span<uint8_t> index_data_span = {index.data<uint8_t>(), n_index};
SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins,
SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins,
[offsets](auto idx, auto j) {
return static_cast<uint8_t>(idx - offsets[j]);
});
@ -94,7 +95,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin,
} else if (curent_bin_size == common::kUint16BinsTypeSize) {
common::Span<uint16_t> index_data_span = {index.data<uint16_t>(),
n_index};
SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins,
SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins,
[offsets](auto idx, auto j) {
return static_cast<uint16_t>(idx - offsets[j]);
});
@ -102,7 +103,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin,
CHECK_EQ(curent_bin_size, common::kUint32BinsTypeSize);
common::Span<uint32_t> index_data_span = {index.data<uint32_t>(),
n_index};
SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins,
SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins,
[offsets](auto idx, auto j) {
return static_cast<uint32_t>(idx - offsets[j]);
});
@ -113,7 +114,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin,
not reduced */
} else {
common::Span<uint32_t> index_data_span = {index.data<uint32_t>(), n_index};
SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins,
SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins,
[](auto idx, auto) { return idx; });
}
@ -147,15 +148,17 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins, common::Span<float> h
size_t prev_sum = 0;
const bool isDense = p_fmat->IsDense();
this->isDense_ = isDense;
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
this->PushBatch(batch, rbegin, prev_sum, nbins, nthread);
this->PushBatch(batch, ft, rbegin, prev_sum, nbins, nthread);
prev_sum = row_ptr[rbegin + batch.Size()];
rbegin += batch.Size();
}
}
void GHistIndexMatrix::Init(SparsePage const &batch,
common::Span<FeatureType const> ft,
common::HistogramCuts const &cuts,
int32_t max_bins_per_feat, bool isDense,
int32_t n_threads) {
@ -176,7 +179,7 @@ void GHistIndexMatrix::Init(SparsePage const &batch,
size_t rbegin = 0;
size_t prev_sum = 0;
this->PushBatch(batch, rbegin, prev_sum, nbins, n_threads);
this->PushBatch(batch, ft, rbegin, prev_sum, nbins, n_threads);
}
void GHistIndexMatrix::ResizeIndex(const size_t n_index,

@ -7,6 +7,7 @@
#include <vector>
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "../common/categorical.h"
#include "../common/hist_util.h"
#include "../common/threading_utils.h"
@ -18,8 +19,9 @@ namespace xgboost {
* index for CPU histogram. On GPU ellpack page is used.
*/
class GHistIndexMatrix {
void PushBatch(SparsePage const &batch, size_t rbegin, size_t prev_sum,
uint32_t nbins, int32_t n_threads);
void PushBatch(SparsePage const &batch, common::Span<FeatureType const> ft,
size_t rbegin, size_t prev_sum, uint32_t nbins,
int32_t n_threads);
public:
/*! \brief row pointer to rows by element position */
@ -40,12 +42,14 @@ class GHistIndexMatrix {
}
// Create a global histogram matrix, given cut
void Init(DMatrix* p_fmat, int max_num_bins, common::Span<float> hess);
void Init(SparsePage const &page, common::HistogramCuts const &cuts,
int32_t max_bins_per_feat, bool is_dense, int32_t n_threads);
void Init(SparsePage const &page, common::Span<FeatureType const> ft,
common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
bool is_dense, int32_t n_threads);
// specific method for sparse data as no possibility to reduce allocated memory
template <typename BinIdxType, typename GetOffset>
void SetIndexData(common::Span<BinIdxType> index_data_span,
common::Span<FeatureType const> ft,
size_t batch_threads, const SparsePage &batch,
size_t rbegin, size_t nbins, GetOffset get_offset) {
const xgboost::Entry *data_ptr = batch.data.HostVector().data();
@ -61,10 +65,17 @@ class GHistIndexMatrix {
SparsePage::Inst inst = {data_ptr + offset_vec[i], size};
CHECK_EQ(ibegin + inst.size(), iend);
for (bst_uint j = 0; j < inst.size(); ++j) {
auto e = inst[j];
if (common::IsCat(ft, e.index)) {
auto bin_idx = cut.SearchCatBin(e);
index_data[ibegin + j] = get_offset(bin_idx, j);
++hit_count_tloc_[tid * nbins + bin_idx];
} else {
uint32_t idx = cut.SearchBin(inst[j]);
index_data[ibegin + j] = get_offset(idx, j);
++hit_count_tloc_[tid * nbins + idx];
}
}
});
}
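
With the feature-type span threaded into SetIndexData, each entry in a row is dispatched either to the quantile-based SearchBin or to the exact SearchCatBin, and both paths write into the same index buffer and bump the same per-thread hit counters. A simplified standalone sketch of that dispatch is below; the types and the two search callables are stand-ins for the xgboost ones, and the get_offset() adjustment is omitted for brevity.

#include <cstddef>
#include <cstdint>
#include <vector>

enum class FeatureType : uint8_t { kNumerical, kCategorical };
struct Entry { uint32_t index; float fvalue; };  // feature index + value

// Fill the bin indices of one row, mirroring the loop added above.
template <typename SearchBin, typename SearchCatBin>
void FillRowIndex(std::vector<Entry> const &row, std::vector<FeatureType> const &ft,
                  SearchBin &&search_bin, SearchCatBin &&search_cat_bin,
                  std::vector<uint32_t> *index_data, std::vector<std::size_t> *hit_count) {
  for (std::size_t j = 0; j < row.size(); ++j) {
    auto const &e = row[j];
    bool is_cat = !ft.empty() && ft[e.index] == FeatureType::kCategorical;
    uint32_t bin_idx = is_cat ? search_cat_bin(e) : search_bin(e);
    (*index_data)[j] = bin_idx;  // the real code stores get_offset(bin_idx, j)
    ++(*hit_count)[bin_idx];     // per-thread counters in the real code
  }
}

int main() {
  std::vector<Entry> row{{0, 2.f}, {1, 0.5f}};
  std::vector<FeatureType> ft{FeatureType::kCategorical, FeatureType::kNumerical};
  std::vector<uint32_t> index(row.size(), 0);
  std::vector<std::size_t> hits(4, 0);
  FillRowIndex(row, ft,
               [](Entry const &e) { return static_cast<uint32_t>(e.fvalue > 1.f); },  // fake SearchBin
               [](Entry const &e) { return static_cast<uint32_t>(e.fvalue); },        // fake SearchCatBin
               &index, &hits);
  return index[0] == 2 && index[1] == 0 ? 0 : 1;
}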

@ -10,7 +10,8 @@ void GradientIndexPageSource::Fetch() {
auto const& csr = source_->Page();
this->page_.reset(new GHistIndexMatrix());
CHECK_NE(cuts_.Values().size(), 0);
this->page_->Init(*csr, cuts_, max_bin_per_feat_, is_dense_, nthreads_);
this->page_->Init(*csr, feature_types_, cuts_, max_bin_per_feat_, is_dense_,
nthreads_);
this->WriteCache();
}
}

@ -16,16 +16,18 @@ class GradientIndexPageSource : public PageSourceIncMixIn<GHistIndexMatrix> {
common::HistogramCuts cuts_;
bool is_dense_;
int32_t max_bin_per_feat_;
common::Span<FeatureType const> feature_types_;
public:
GradientIndexPageSource(float missing, int nthreads, bst_feature_t n_features,
size_t n_batches, std::shared_ptr<Cache> cache,
BatchParam param, common::HistogramCuts cuts,
bool is_dense, int32_t max_bin_per_feat,
common::Span<FeatureType const> feature_types,
std::shared_ptr<SparsePageSource> source)
: PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache),
cuts_{std::move(cuts)}, is_dense_{is_dense}, max_bin_per_feat_{
max_bin_per_feat} {
cuts_{std::move(cuts)}, is_dense_{is_dense},
max_bin_per_feat_{max_bin_per_feat}, feature_types_{feature_types} {
this->source_ = source;
this->Fetch();
}
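
Note that feature_types_ is stored as a common::Span, i.e. a non-owning view: the page source keeps only a pointer and a length, and the buffer it refers to (the DMatrix's feature_types host vector in this commit) has to stay alive for as long as the source is used. A minimal sketch of that pattern with a hand-rolled view type, not the real common::Span:

#include <cstddef>
#include <vector>

// Minimal stand-in for common::Span: pointer + length, no ownership.
template <typename T>
struct SpanSketch {
  T *data;
  std::size_t size;
};

enum class FeatureType : unsigned char { kNumerical, kCategorical };

struct PageSourceSketch {
  SpanSketch<FeatureType const> feature_types;  // view only, nothing is copied
};

int main() {
  std::vector<FeatureType> owned(3, FeatureType::kCategorical);  // the owning buffer
  PageSourceSketch source{{owned.data(), owned.size()}};
  // `owned` must outlive `source`; destroying it first would leave a dangling view.
  return source.feature_types.size == owned.size() ? 0 : 1;
}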

@ -184,10 +184,11 @@ BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(const BatchParam&
batch_param_ = param;
ghist_index_source_.reset();
CHECK_NE(cuts.Values().size(), 0);
auto ft = this->info_.feature_types.ConstHostSpan();
ghist_index_source_.reset(new GradientIndexPageSource(
this->missing_, this->ctx_.Threads(), this->Info().num_col_,
this->n_batches_, cache_info_.at(id), param, std::move(cuts),
this->IsDense(), param.max_bin, sparse_page_source_));
this->IsDense(), param.max_bin, ft, sparse_page_source_));
} else {
CHECK(ghist_index_source_);
ghist_index_source_->Reset();

@ -1,5 +1,5 @@
/*!
* Copyright 2019 by XGBoost Contributors
* Copyright 2019-2021 by XGBoost Contributors
*/
#include <thrust/functional.h>
#include <thrust/random.h>
@ -13,6 +13,7 @@
#include "../../common/compressed_iterator.h"
#include "../../common/random.h"
#include "../param.h"
#include "gradient_based_sampler.cuh"
namespace xgboost {

@ -23,5 +23,39 @@ TEST(GradientIndex, ExternalMemory) {
++i;
}
}
TEST(GradientIndex, FromCategoricalBasic) {
size_t constexpr kRows = 1000, kCats = 13, kCols = 1;
size_t max_bins = 8;
auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
auto m = GetDMatrixFromData(x, kRows, 1);
auto &h_ft = m->Info().feature_types.HostVector();
h_ft.resize(kCols, FeatureType::kCategorical);
BatchParam p(0, max_bins);
GHistIndexMatrix gidx;
gidx.Init(m.get(), max_bins, {});
auto x_copy = x;
std::sort(x_copy.begin(), x_copy.end());
auto n_uniques = std::unique(x_copy.begin(), x_copy.end()) - x_copy.begin();
ASSERT_EQ(n_uniques, kCats);
auto const &h_cut_ptr = gidx.cut.Ptrs();
auto const &h_cut_values = gidx.cut.Values();
ASSERT_EQ(h_cut_ptr.size(), 2);
ASSERT_EQ(h_cut_values.size(), kCats);
auto const &index = gidx.index;
for (size_t i = 0; i < x.size(); ++i) {
auto bin = index[i];
auto bin_value = h_cut_values.at(bin);
ASSERT_EQ(common::AsCat(x[i]), common::AsCat(bin_value));
}
}
} // namespace data
} // namespace xgboost
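
The new test checks two properties of the categorical gradient index: the cut values record exactly one entry per observed category (13 values for 13 categories, even though max_bins is 8), and every row's bin index maps back to its original category code. A standalone sketch of that round-trip invariant, illustrative only:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<float> column{4.f, 0.f, 7.f, 4.f, 0.f};  // one categorical column
  // Build the "cuts" as the sorted unique category codes.
  std::vector<float> cut_values = column;
  std::sort(cut_values.begin(), cut_values.end());
  cut_values.erase(std::unique(cut_values.begin(), cut_values.end()), cut_values.end());
  assert(cut_values.size() == 3);  // one cut value per category
  // Round trip: the bin of every row recovers its original category.
  for (float c : column) {
    auto bin = std::lower_bound(cut_values.begin(), cut_values.end(), c) - cut_values.begin();
    assert(cut_values[bin] == c);
  }
  return 0;
}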

@ -1,7 +1,11 @@
/*!
* Copyright 2020-2021 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include "../../../../src/data/ellpack_page.cuh"
#include "../../../../src/tree/gpu_hist/gradient_based_sampler.cuh"
#include "../../../../src/tree/param.h"
#include "../../helpers.h"
#include "dmlc/filesystem.h"