Support building gradient index with cat data. (#7371)
This commit is contained in:
parent
57a4b4ff64
commit
ccdabe4512
@ -16,13 +16,12 @@
|
||||
#include <utility>
|
||||
#include <map>
|
||||
|
||||
#include "row_set.h"
|
||||
#include "categorical.h"
|
||||
#include "common.h"
|
||||
#include "quantile.h"
|
||||
#include "row_set.h"
|
||||
#include "threading_utils.h"
|
||||
#include "../tree/param.h"
|
||||
#include "./quantile.h"
|
||||
#include "./timer.h"
|
||||
#include "../include/rabit/rabit.h"
|
||||
#include "timer.h"
|
||||
|
||||
namespace xgboost {
|
||||
class GHistIndexMatrix;
|
||||
@ -105,9 +104,29 @@ class HistogramCuts {
|
||||
return idx;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Search the bin index for numerical feature.
|
||||
*/
|
||||
// Convenience overload taking a whole Entry: it forwards the entry's value
// (e.fvalue) and feature index (e.index) to the two-argument SearchBin.
BinIdx SearchBin(Entry const& e) const {
|
||||
return SearchBin(e.fvalue, e.index);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Search the bin index for categorical feature.
|
||||
*/
|
||||
// Looks up the bin of entry `e` among the cut values that belong to feature
// `e.index`. The returned index is an offset from the start of Values(), not
// from the start of the feature's own slice.
BinIdx SearchCatBin(Entry const &e) const {
|
||||
auto const &ptrs = this->Ptrs();
|
||||
auto const &vals = this->Values();
|
||||
// [beg, end) is the slice of `vals` for this feature, delimited by
// ptrs[e.index] and ptrs[e.index + 1].
auto end = ptrs.at(e.index + 1) + vals.cbegin();
|
||||
auto beg = ptrs[e.index] + vals.cbegin();
|
||||
// Truncates the value in case it's not perfectly rounded.
// The result is cast back to float for the comparison done by lower_bound.
|
||||
auto v = static_cast<float>(common::AsCat(e.fvalue));
|
||||
// First cut value in the slice that is not less than v, expressed as an
// offset from vals.cbegin().
// NOTE(review): lower_bound requires the slice to be sorted -- presumably
// guaranteed by whoever builds the cuts; confirm. A value that is absent from
// the slice lands in the bin of the next larger cut value.
auto bin_idx = std::lower_bound(beg, end, v) - vals.cbegin();
|
||||
// v is greater than every cut value of this feature, so lower_bound returned
// `end`; step back to the last bin of this feature's slice.
if (bin_idx == ptrs.at(e.index + 1)) {
|
||||
bin_idx -= 1;
|
||||
}
|
||||
return bin_idx;
|
||||
}
|
||||
};
|
||||
|
||||
inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins,
|
||||
|
||||
@ -3,6 +3,8 @@
|
||||
*/
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
|
||||
#include "rabit/rabit.h"
|
||||
#include "quantile.h"
|
||||
#include "hist_util.h"
|
||||
#include "categorical.h"
|
||||
@ -189,7 +191,7 @@ void HostSketchContainer::PushRowPage(
|
||||
if (is_dense) {
|
||||
for (size_t ii = begin; ii < end; ii++) {
|
||||
if (IsCat(feature_types_, ii)) {
|
||||
categories_[ii].emplace(p_inst[ii].fvalue);
|
||||
categories_[ii].emplace(AsCat(p_inst[ii].fvalue));
|
||||
} else {
|
||||
sketches_[ii].Push(p_inst[ii].fvalue, w);
|
||||
}
|
||||
@ -199,7 +201,7 @@ void HostSketchContainer::PushRowPage(
|
||||
auto const& entry = p_inst[i];
|
||||
if (entry.index >= begin && entry.index < end) {
|
||||
if (IsCat(feature_types_, entry.index)) {
|
||||
categories_[entry.index].emplace(entry.fvalue);
|
||||
categories_[entry.index].emplace(AsCat(entry.fvalue));
|
||||
} else {
|
||||
sketches_[entry.index].Push(entry.fvalue, w);
|
||||
}
|
||||
|
||||
@ -9,8 +9,9 @@
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin,
|
||||
size_t prev_sum, uint32_t nbins,
|
||||
void GHistIndexMatrix::PushBatch(SparsePage const &batch,
|
||||
common::Span<FeatureType const> ft,
|
||||
size_t rbegin, size_t prev_sum, uint32_t nbins,
|
||||
int32_t n_threads) {
|
||||
// The number of threads is pegged to the batch size. If the OMP
|
||||
// block is parallelized on anything other than the batch/block size,
|
||||
@ -86,7 +87,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin,
|
||||
common::BinTypeSize curent_bin_size = index.GetBinTypeSize();
|
||||
if (curent_bin_size == common::kUint8BinsTypeSize) {
|
||||
common::Span<uint8_t> index_data_span = {index.data<uint8_t>(), n_index};
|
||||
SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins,
|
||||
SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins,
|
||||
[offsets](auto idx, auto j) {
|
||||
return static_cast<uint8_t>(idx - offsets[j]);
|
||||
});
|
||||
@ -94,7 +95,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin,
|
||||
} else if (curent_bin_size == common::kUint16BinsTypeSize) {
|
||||
common::Span<uint16_t> index_data_span = {index.data<uint16_t>(),
|
||||
n_index};
|
||||
SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins,
|
||||
SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins,
|
||||
[offsets](auto idx, auto j) {
|
||||
return static_cast<uint16_t>(idx - offsets[j]);
|
||||
});
|
||||
@ -102,7 +103,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin,
|
||||
CHECK_EQ(curent_bin_size, common::kUint32BinsTypeSize);
|
||||
common::Span<uint32_t> index_data_span = {index.data<uint32_t>(),
|
||||
n_index};
|
||||
SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins,
|
||||
SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins,
|
||||
[offsets](auto idx, auto j) {
|
||||
return static_cast<uint32_t>(idx - offsets[j]);
|
||||
});
|
||||
@ -113,7 +114,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin,
|
||||
not reduced */
|
||||
} else {
|
||||
common::Span<uint32_t> index_data_span = {index.data<uint32_t>(), n_index};
|
||||
SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins,
|
||||
SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins,
|
||||
[](auto idx, auto) { return idx; });
|
||||
}
|
||||
|
||||
@ -147,15 +148,17 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins, common::Span<float> h
|
||||
size_t prev_sum = 0;
|
||||
const bool isDense = p_fmat->IsDense();
|
||||
this->isDense_ = isDense;
|
||||
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
|
||||
|
||||
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
|
||||
this->PushBatch(batch, rbegin, prev_sum, nbins, nthread);
|
||||
this->PushBatch(batch, ft, rbegin, prev_sum, nbins, nthread);
|
||||
prev_sum = row_ptr[rbegin + batch.Size()];
|
||||
rbegin += batch.Size();
|
||||
}
|
||||
}
|
||||
|
||||
void GHistIndexMatrix::Init(SparsePage const &batch,
|
||||
common::Span<FeatureType const> ft,
|
||||
common::HistogramCuts const &cuts,
|
||||
int32_t max_bins_per_feat, bool isDense,
|
||||
int32_t n_threads) {
|
||||
@ -176,7 +179,7 @@ void GHistIndexMatrix::Init(SparsePage const &batch,
|
||||
size_t rbegin = 0;
|
||||
size_t prev_sum = 0;
|
||||
|
||||
this->PushBatch(batch, rbegin, prev_sum, nbins, n_threads);
|
||||
this->PushBatch(batch, ft, rbegin, prev_sum, nbins, n_threads);
|
||||
}
|
||||
|
||||
void GHistIndexMatrix::ResizeIndex(const size_t n_index,
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
#include <vector>
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/hist_util.h"
|
||||
#include "../common/threading_utils.h"
|
||||
|
||||
@ -18,8 +19,9 @@ namespace xgboost {
|
||||
* index for CPU histogram. On GPU ellpack page is used.
|
||||
*/
|
||||
class GHistIndexMatrix {
|
||||
void PushBatch(SparsePage const &batch, size_t rbegin, size_t prev_sum,
|
||||
uint32_t nbins, int32_t n_threads);
|
||||
void PushBatch(SparsePage const &batch, common::Span<FeatureType const> ft,
|
||||
size_t rbegin, size_t prev_sum, uint32_t nbins,
|
||||
int32_t n_threads);
|
||||
|
||||
public:
|
||||
/*! \brief row pointer to rows by element position */
|
||||
@ -40,12 +42,14 @@ class GHistIndexMatrix {
|
||||
}
|
||||
// Create a global histogram matrix, given cut
|
||||
void Init(DMatrix* p_fmat, int max_num_bins, common::Span<float> hess);
|
||||
void Init(SparsePage const &page, common::HistogramCuts const &cuts,
|
||||
int32_t max_bins_per_feat, bool is_dense, int32_t n_threads);
|
||||
void Init(SparsePage const &page, common::Span<FeatureType const> ft,
|
||||
common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
|
||||
bool is_dense, int32_t n_threads);
|
||||
|
||||
// Specific method for sparse data, as there is no possibility to reduce allocated memory
|
||||
template <typename BinIdxType, typename GetOffset>
|
||||
void SetIndexData(common::Span<BinIdxType> index_data_span,
|
||||
common::Span<FeatureType const> ft,
|
||||
size_t batch_threads, const SparsePage &batch,
|
||||
size_t rbegin, size_t nbins, GetOffset get_offset) {
|
||||
const xgboost::Entry *data_ptr = batch.data.HostVector().data();
|
||||
@ -61,10 +65,17 @@ class GHistIndexMatrix {
|
||||
SparsePage::Inst inst = {data_ptr + offset_vec[i], size};
|
||||
CHECK_EQ(ibegin + inst.size(), iend);
|
||||
for (bst_uint j = 0; j < inst.size(); ++j) {
|
||||
auto e = inst[j];
|
||||
if (common::IsCat(ft, e.index)) {
|
||||
auto bin_idx = cut.SearchCatBin(e);
|
||||
index_data[ibegin + j] = get_offset(bin_idx, j);
|
||||
++hit_count_tloc_[tid * nbins + bin_idx];
|
||||
} else {
|
||||
uint32_t idx = cut.SearchBin(inst[j]);
|
||||
index_data[ibegin + j] = get_offset(idx, j);
|
||||
++hit_count_tloc_[tid * nbins + idx];
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@ -10,7 +10,8 @@ void GradientIndexPageSource::Fetch() {
|
||||
auto const& csr = source_->Page();
|
||||
this->page_.reset(new GHistIndexMatrix());
|
||||
CHECK_NE(cuts_.Values().size(), 0);
|
||||
this->page_->Init(*csr, cuts_, max_bin_per_feat_, is_dense_, nthreads_);
|
||||
this->page_->Init(*csr, feature_types_, cuts_, max_bin_per_feat_, is_dense_,
|
||||
nthreads_);
|
||||
this->WriteCache();
|
||||
}
|
||||
}
|
||||
|
||||
@ -16,16 +16,18 @@ class GradientIndexPageSource : public PageSourceIncMixIn<GHistIndexMatrix> {
|
||||
common::HistogramCuts cuts_;
|
||||
bool is_dense_;
|
||||
int32_t max_bin_per_feat_;
|
||||
common::Span<FeatureType const> feature_types_;
|
||||
|
||||
public:
|
||||
GradientIndexPageSource(float missing, int nthreads, bst_feature_t n_features,
|
||||
size_t n_batches, std::shared_ptr<Cache> cache,
|
||||
BatchParam param, common::HistogramCuts cuts,
|
||||
bool is_dense, int32_t max_bin_per_feat,
|
||||
common::Span<FeatureType const> feature_types,
|
||||
std::shared_ptr<SparsePageSource> source)
|
||||
: PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache),
|
||||
cuts_{std::move(cuts)}, is_dense_{is_dense}, max_bin_per_feat_{
|
||||
max_bin_per_feat} {
|
||||
cuts_{std::move(cuts)}, is_dense_{is_dense},
|
||||
max_bin_per_feat_{max_bin_per_feat}, feature_types_{feature_types} {
|
||||
this->source_ = source;
|
||||
this->Fetch();
|
||||
}
|
||||
|
||||
@ -184,10 +184,11 @@ BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(const BatchParam&
|
||||
batch_param_ = param;
|
||||
ghist_index_source_.reset();
|
||||
CHECK_NE(cuts.Values().size(), 0);
|
||||
auto ft = this->info_.feature_types.ConstHostSpan();
|
||||
ghist_index_source_.reset(new GradientIndexPageSource(
|
||||
this->missing_, this->ctx_.Threads(), this->Info().num_col_,
|
||||
this->n_batches_, cache_info_.at(id), param, std::move(cuts),
|
||||
this->IsDense(), param.max_bin, sparse_page_source_));
|
||||
this->IsDense(), param.max_bin, ft, sparse_page_source_));
|
||||
} else {
|
||||
CHECK(ghist_index_source_);
|
||||
ghist_index_source_->Reset();
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2019 by XGBoost Contributors
|
||||
* Copyright 2019-2021 by XGBoost Contributors
|
||||
*/
|
||||
#include <thrust/functional.h>
|
||||
#include <thrust/random.h>
|
||||
@ -13,6 +13,7 @@
|
||||
|
||||
#include "../../common/compressed_iterator.h"
|
||||
#include "../../common/random.h"
|
||||
#include "../param.h"
|
||||
#include "gradient_based_sampler.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@ -23,5 +23,39 @@ TEST(GradientIndex, ExternalMemory) {
|
||||
++i;
|
||||
}
|
||||
}
|
||||
|
||||
// Builds a GHistIndexMatrix from a single categorical column and checks that
// the bin stored for every row maps back to that row's category.
TEST(GradientIndex, FromCategoricalBasic) {
|
||||
size_t constexpr kRows = 1000, kCats = 13, kCols = 1;
|
||||
// Deliberately smaller than kCats; see the cut-size assertion below.
size_t max_bins = 8;
|
||||
auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
|
||||
auto m = GetDMatrixFromData(x, kRows, 1);
|
||||
|
||||
auto &h_ft = m->Info().feature_types.HostVector();
|
||||
// Marks the column as categorical.
// NOTE(review): resize only fills newly added elements, so this assumes
// feature_types starts out empty -- confirm.
h_ft.resize(kCols, FeatureType::kCategorical);
|
||||
|
||||
// NOTE(review): `p` is not referenced again in this test.
BatchParam p(0, max_bins);
|
||||
GHistIndexMatrix gidx;
|
||||
|
||||
gidx.Init(m.get(), max_bins, {});
|
||||
|
||||
// Sanity check on the generated data: all kCats categories occur in x.
auto x_copy = x;
|
||||
std::sort(x_copy.begin(), x_copy.end());
|
||||
auto n_uniques = std::unique(x_copy.begin(), x_copy.end()) - x_copy.begin();
|
||||
ASSERT_EQ(n_uniques, kCats);
|
||||
|
||||
auto const &h_cut_ptr = gidx.cut.Ptrs();
|
||||
auto const &h_cut_values = gidx.cut.Values();
|
||||
|
||||
// A single feature yields two pointer entries (begin and end of its slice).
ASSERT_EQ(h_cut_ptr.size(), 2);
|
||||
// One cut value per category is expected, even though max_bins < kCats.
ASSERT_EQ(h_cut_values.size(), kCats);
|
||||
|
||||
auto const &index = gidx.index;
|
||||
|
||||
// With one column, entry i of the index belongs to row i: the cut value at
// its bin must be the same category as the input value.
for (size_t i = 0; i < x.size(); ++i) {
|
||||
auto bin = index[i];
|
||||
auto bin_value = h_cut_values.at(bin);
|
||||
ASSERT_EQ(common::AsCat(x[i]), common::AsCat(bin_value));
|
||||
}
|
||||
}
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@ -1,7 +1,11 @@
|
||||
/*!
|
||||
* Copyright 2020-2021 by XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "../../../../src/data/ellpack_page.cuh"
|
||||
#include "../../../../src/tree/gpu_hist/gradient_based_sampler.cuh"
|
||||
#include "../../../../src/tree/param.h"
|
||||
#include "../../helpers.h"
|
||||
#include "dmlc/filesystem.h"
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user