* [backport] Fix CPU bin compression with categorical data. (#8809) * Fix CPU bin compression with categorical data. * The bug causes the maximum category to be less than 256 or the maximum number of bins when the input data is dense. * Avoid test symbol.
This commit is contained in:
parent
f15a6d2b19
commit
c22f6db4bf
@ -46,7 +46,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
|
|||||||
feature_offsets_[fid] = accum_index;
|
feature_offsets_[fid] = accum_index;
|
||||||
}
|
}
|
||||||
|
|
||||||
SetTypeSize(gmat.max_num_bins);
|
SetTypeSize(gmat.MaxNumBinPerFeat());
|
||||||
auto storage_size =
|
auto storage_size =
|
||||||
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
|
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
|
||||||
index_.resize(storage_size, 0);
|
index_.resize(storage_size, 0);
|
||||||
|
|||||||
@ -20,13 +20,13 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
|
|||||||
|
|
||||||
GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
|
GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
|
||||||
double sparse_thresh, bool sorted_sketch, int32_t n_threads,
|
double sparse_thresh, bool sorted_sketch, int32_t n_threads,
|
||||||
common::Span<float> hess) {
|
common::Span<float> hess)
|
||||||
|
: max_numeric_bins_per_feat{max_bins_per_feat} {
|
||||||
CHECK(p_fmat->SingleColBlock());
|
CHECK(p_fmat->SingleColBlock());
|
||||||
// We use sorted sketching for approx tree method since it's more efficient in
|
// We use sorted sketching for approx tree method since it's more efficient in
|
||||||
// computation time (but higher memory usage).
|
// computation time (but higher memory usage).
|
||||||
cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
|
cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
|
||||||
|
|
||||||
max_num_bins = max_bins_per_feat;
|
|
||||||
const uint32_t nbins = cut.Ptrs().back();
|
const uint32_t nbins = cut.Ptrs().back();
|
||||||
hit_count.resize(nbins, 0);
|
hit_count.resize(nbins, 0);
|
||||||
hit_count_tloc_.resize(n_threads * nbins, 0);
|
hit_count_tloc_.resize(n_threads * nbins, 0);
|
||||||
@ -63,7 +63,7 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
|
|||||||
: row_ptr(info.num_row_ + 1, 0),
|
: row_ptr(info.num_row_ + 1, 0),
|
||||||
hit_count(cuts.TotalBins(), 0),
|
hit_count(cuts.TotalBins(), 0),
|
||||||
cut{std::forward<common::HistogramCuts>(cuts)},
|
cut{std::forward<common::HistogramCuts>(cuts)},
|
||||||
max_num_bins(max_bin_per_feat),
|
max_numeric_bins_per_feat(max_bin_per_feat),
|
||||||
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
|
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
|
||||||
|
|
||||||
#if !defined(XGBOOST_USE_CUDA)
|
#if !defined(XGBOOST_USE_CUDA)
|
||||||
@ -86,13 +86,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span<FeatureTy
|
|||||||
}
|
}
|
||||||
|
|
||||||
GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
|
GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
|
||||||
common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
|
common::HistogramCuts cuts, int32_t max_bins_per_feat,
|
||||||
bool isDense, double sparse_thresh, int32_t n_threads) {
|
bool isDense, double sparse_thresh, int32_t n_threads)
|
||||||
|
: cut{std::move(cuts)},
|
||||||
|
max_numeric_bins_per_feat{max_bins_per_feat},
|
||||||
|
base_rowid{batch.base_rowid},
|
||||||
|
isDense_{isDense} {
|
||||||
CHECK_GE(n_threads, 1);
|
CHECK_GE(n_threads, 1);
|
||||||
base_rowid = batch.base_rowid;
|
|
||||||
isDense_ = isDense;
|
|
||||||
cut = cuts;
|
|
||||||
max_num_bins = max_bins_per_feat;
|
|
||||||
CHECK_EQ(row_ptr.size(), 0);
|
CHECK_EQ(row_ptr.size(), 0);
|
||||||
// The number of threads is pegged to the batch size. If the OMP
|
// The number of threads is pegged to the batch size. If the OMP
|
||||||
// block is parallelized on anything other than the batch/block size,
|
// block is parallelized on anything other than the batch/block size,
|
||||||
@ -127,12 +127,13 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
|
|||||||
#undef INSTANTIATION_PUSH
|
#undef INSTANTIATION_PUSH
|
||||||
|
|
||||||
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
|
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
|
||||||
if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
|
if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
|
||||||
|
isDense) {
|
||||||
// compress dense index to uint8
|
// compress dense index to uint8
|
||||||
index.SetBinTypeSize(common::kUint8BinsTypeSize);
|
index.SetBinTypeSize(common::kUint8BinsTypeSize);
|
||||||
index.Resize((sizeof(uint8_t)) * n_index);
|
index.Resize((sizeof(uint8_t)) * n_index);
|
||||||
} else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
|
} else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
|
||||||
max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
|
MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
|
||||||
isDense) {
|
isDense) {
|
||||||
// compress dense index to uint16
|
// compress dense index to uint16
|
||||||
index.SetBinTypeSize(common::kUint16BinsTypeSize);
|
index.SetBinTypeSize(common::kUint16BinsTypeSize);
|
||||||
|
|||||||
@ -65,7 +65,7 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
|
|||||||
|
|
||||||
GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
|
GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
|
||||||
EllpackPage const& in_page, BatchParam const& p)
|
EllpackPage const& in_page, BatchParam const& p)
|
||||||
: max_num_bins{p.max_bin} {
|
: max_numeric_bins_per_feat{p.max_bin} {
|
||||||
auto page = in_page.Impl();
|
auto page = in_page.Impl();
|
||||||
isDense_ = page->is_dense;
|
isDense_ = page->is_dense;
|
||||||
|
|
||||||
|
|||||||
@ -133,11 +133,15 @@ class GHistIndexMatrix {
|
|||||||
std::vector<size_t> hit_count;
|
std::vector<size_t> hit_count;
|
||||||
/*! \brief The corresponding cuts */
|
/*! \brief The corresponding cuts */
|
||||||
common::HistogramCuts cut;
|
common::HistogramCuts cut;
|
||||||
/*! \brief max_bin for each feature. */
|
/** \brief max_bin for each feature. */
|
||||||
bst_bin_t max_num_bins;
|
bst_bin_t max_numeric_bins_per_feat;
|
||||||
/*! \brief base row index for current page (used by external memory) */
|
/*! \brief base row index for current page (used by external memory) */
|
||||||
size_t base_rowid{0};
|
size_t base_rowid{0};
|
||||||
|
|
||||||
|
bst_bin_t MaxNumBinPerFeat() const {
|
||||||
|
return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
|
||||||
|
}
|
||||||
|
|
||||||
~GHistIndexMatrix();
|
~GHistIndexMatrix();
|
||||||
/**
|
/**
|
||||||
* \brief Constructor for SimpleDMatrix.
|
* \brief Constructor for SimpleDMatrix.
|
||||||
@ -160,7 +164,7 @@ class GHistIndexMatrix {
|
|||||||
* \brief Constructor for external memory.
|
* \brief Constructor for external memory.
|
||||||
*/
|
*/
|
||||||
GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
|
GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
|
||||||
common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
|
common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense,
|
||||||
double sparse_thresh, int32_t n_threads);
|
double sparse_thresh, int32_t n_threads);
|
||||||
GHistIndexMatrix(); // also for ext mem, empty ctor so that we can read the cache back.
|
GHistIndexMatrix(); // also for ext mem, empty ctor so that we can read the cache back.
|
||||||
|
|
||||||
|
|||||||
@ -35,7 +35,7 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
|
|||||||
if (!fi->Read(&page->hit_count)) {
|
if (!fi->Read(&page->hit_count)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (!fi->Read(&page->max_num_bins)) {
|
if (!fi->Read(&page->max_numeric_bins_per_feat)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (!fi->Read(&page->base_rowid)) {
|
if (!fi->Read(&page->base_rowid)) {
|
||||||
@ -76,8 +76,8 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
|
|||||||
page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
|
page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
|
||||||
sizeof(uint64_t);
|
sizeof(uint64_t);
|
||||||
// max_bins, base row, is_dense
|
// max_bins, base row, is_dense
|
||||||
fo->Write(page.max_num_bins);
|
fo->Write(page.max_numeric_bins_per_feat);
|
||||||
bytes += sizeof(page.max_num_bins);
|
bytes += sizeof(page.max_numeric_bins_per_feat);
|
||||||
fo->Write(page.base_rowid);
|
fo->Write(page.base_rowid);
|
||||||
bytes += sizeof(page.base_rowid);
|
bytes += sizeof(page.base_rowid);
|
||||||
fo->Write(page.IsDense());
|
fo->Write(page.IsDense());
|
||||||
|
|||||||
@ -68,6 +68,30 @@ TEST(GradientIndex, FromCategoricalBasic) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(GradientIndex, FromCategoricalLarge) {
|
||||||
|
size_t constexpr kRows = 1000, kCats = 512, kCols = 1;
|
||||||
|
bst_bin_t max_bins = 8;
|
||||||
|
auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
|
||||||
|
auto m = GetDMatrixFromData(x, kRows, 1);
|
||||||
|
Context ctx;
|
||||||
|
|
||||||
|
auto &h_ft = m->Info().feature_types.HostVector();
|
||||||
|
h_ft.resize(kCols, FeatureType::kCategorical);
|
||||||
|
|
||||||
|
BatchParam p{max_bins, 0.8};
|
||||||
|
{
|
||||||
|
GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, Context{}.Threads(), {});
|
||||||
|
ASSERT_TRUE(gidx.index.GetBinTypeSize() == common::kUint16BinsTypeSize);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
for (auto const &page : m->GetBatches<GHistIndexMatrix>(p)) {
|
||||||
|
common::HistogramCuts cut = page.cut;
|
||||||
|
GHistIndexMatrix gidx{m->Info(), std::move(cut), max_bins};
|
||||||
|
ASSERT_EQ(gidx.MaxNumBinPerFeat(), kCats);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
TEST(GradientIndex, PushBatch) {
|
TEST(GradientIndex, PushBatch) {
|
||||||
size_t constexpr kRows = 64, kCols = 4;
|
size_t constexpr kRows = 64, kCols = 4;
|
||||||
bst_bin_t max_bins = 64;
|
bst_bin_t max_bins = 64;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user