From c22f6db4bfae5a415213dca624b2f9a882b4e946 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Thu, 16 Feb 2023 06:39:25 +0800
Subject: [PATCH] [backport] Fix CPU bin compression with categorical data.
 (#8809) (#8810)

* [backport] Fix CPU bin compression with categorical data. (#8809)

* Fix CPU bin compression with categorical data.

* The bug causes the maximum category to be capped at less than 256, or at the
  maximum number of bins, when the input data is dense.

* Avoid test symbol.
---
 src/common/column_matrix.cc           |  2 +-
 src/data/gradient_index.cc            | 25 +++++++++++++------------
 src/data/gradient_index.cu            |  2 +-
 src/data/gradient_index.h             | 10 +++++++---
 src/data/gradient_index_format.cc     |  6 +++---
 tests/cpp/data/test_gradient_index.cc | 24 ++++++++++++++++++++++++
 6 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/src/common/column_matrix.cc b/src/common/column_matrix.cc
index 91977b96d..d8acfa7a5 100644
--- a/src/common/column_matrix.cc
+++ b/src/common/column_matrix.cc
@@ -46,7 +46,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
     feature_offsets_[fid] = accum_index;
   }
 
-  SetTypeSize(gmat.max_num_bins);
+  SetTypeSize(gmat.MaxNumBinPerFeat());
   auto storage_size =
       feature_offsets_.back() * static_cast<std::underlying_type_t<common::BinTypeSize>>(bins_type_size_);
   index_.resize(storage_size, 0);
diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc
index 2e9d38a19..a2d140a03 100644
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@@ -20,13 +20,13 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
 
 GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
                                    double sparse_thresh, bool sorted_sketch, int32_t n_threads,
-                                   common::Span<float> hess) {
+                                   common::Span<float> hess)
+    : max_numeric_bins_per_feat{max_bins_per_feat} {
   CHECK(p_fmat->SingleColBlock());
   // We use sorted sketching for approx tree method since it's more efficient in
   // computation time (but higher memory usage).
   cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
 
-  max_num_bins = max_bins_per_feat;
   const uint32_t nbins = cut.Ptrs().back();
   hit_count.resize(nbins, 0);
   hit_count_tloc_.resize(n_threads * nbins, 0);
@@ -63,7 +63,7 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
     : row_ptr(info.num_row_ + 1, 0),
       hit_count(cuts.TotalBins(), 0),
       cut{std::forward<common::HistogramCuts>(cuts)},
-      max_num_bins(max_bin_per_feat),
+      max_numeric_bins_per_feat(max_bin_per_feat),
       isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
 
 #if !defined(XGBOOST_USE_CUDA)
@@ -86,13 +86,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span<FeatureTy
 }
 
 GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
-                                   common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
-                                   bool isDense, double sparse_thresh, int32_t n_threads) {
+                                   common::HistogramCuts cuts, int32_t max_bins_per_feat,
+                                   bool isDense, double sparse_thresh, int32_t n_threads)
+    : cut{std::move(cuts)},
+      max_numeric_bins_per_feat{max_bins_per_feat},
+      base_rowid{batch.base_rowid},
+      isDense_{isDense} {
   CHECK_GE(n_threads, 1);
-  base_rowid = batch.base_rowid;
-  isDense_ = isDense;
-  cut = cuts;
-  max_num_bins = max_bins_per_feat;
   CHECK_EQ(row_ptr.size(), 0);
   // The number of threads is pegged to the batch size. If the OMP
   // block is parallelized on anything other than the batch/block size,
@@ -127,12 +127,13 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
 #undef INSTANTIATION_PUSH
 
 void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
-  if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
+  if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
+      isDense) {
     // compress dense index to uint8
     index.SetBinTypeSize(common::kUint8BinsTypeSize);
     index.Resize((sizeof(uint8_t)) * n_index);
-  } else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
-              max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
+  } else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
+              MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
              isDense) {
     // compress dense index to uint16
     index.SetBinTypeSize(common::kUint16BinsTypeSize);
diff --git a/src/data/gradient_index.cu b/src/data/gradient_index.cu
index 42d935b3c..af5b0f67b 100644
--- a/src/data/gradient_index.cu
+++ b/src/data/gradient_index.cu
@@ -65,7 +65,7 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
 
 GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
                                    EllpackPage const& in_page, BatchParam const& p)
-    : max_num_bins{p.max_bin} {
+    : max_numeric_bins_per_feat{p.max_bin} {
   auto page = in_page.Impl();
   isDense_ = page->is_dense;
 
diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h
index 10d9e770d..0fc0daf9d 100644
--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -133,11 +133,15 @@ class GHistIndexMatrix {
   std::vector<size_t> hit_count;
   /*! \brief The corresponding cuts */
   common::HistogramCuts cut;
-  /*! \brief max_bin for each feature. */
-  bst_bin_t max_num_bins;
+  /** \brief max_bin for each feature. */
+  bst_bin_t max_numeric_bins_per_feat;
   /*! \brief base row index for current page (used by external memory) */
   size_t base_rowid{0};
+  bst_bin_t MaxNumBinPerFeat() const {
+    return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
+  }
+
   ~GHistIndexMatrix();
   /**
    * \brief Constrcutor for SimpleDMatrix.
   */
@@ -160,7 +164,7 @@ class GHistIndexMatrix {
   * \brief Constructor for external memory.
   */
  GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
-                  common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
+                  common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense,
                   double sparse_thresh, int32_t n_threads);
  GHistIndexMatrix();  // also for ext mem, empty ctor so that we can read the cache back.
 
diff --git a/src/data/gradient_index_format.cc b/src/data/gradient_index_format.cc
index 4b3fd0ea0..204157682 100644
--- a/src/data/gradient_index_format.cc
+++ b/src/data/gradient_index_format.cc
@@ -35,7 +35,7 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
     if (!fi->Read(&page->hit_count)) {
       return false;
     }
-    if (!fi->Read(&page->max_num_bins)) {
+    if (!fi->Read(&page->max_numeric_bins_per_feat)) {
       return false;
     }
     if (!fi->Read(&page->base_rowid)) {
@@ -76,8 +76,8 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
             page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
             sizeof(uint64_t);
    // max_bins, base row, is_dense
-    fo->Write(page.max_num_bins);
-    bytes += sizeof(page.max_num_bins);
+    fo->Write(page.max_numeric_bins_per_feat);
+    bytes += sizeof(page.max_numeric_bins_per_feat);
     fo->Write(page.base_rowid);
     bytes += sizeof(page.base_rowid);
     fo->Write(page.IsDense());
diff --git a/tests/cpp/data/test_gradient_index.cc b/tests/cpp/data/test_gradient_index.cc
index 6233f1b25..66e9cfe0c 100644
--- a/tests/cpp/data/test_gradient_index.cc
+++ b/tests/cpp/data/test_gradient_index.cc
@@ -68,6 +68,30 @@ TEST(GradientIndex, FromCategoricalBasic) {
   }
 }
 
+TEST(GradientIndex, FromCategoricalLarge) {
+  size_t constexpr kRows = 1000, kCats = 512, kCols = 1;
+  bst_bin_t max_bins = 8;
+  auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
+  auto m = GetDMatrixFromData(x, kRows, 1);
+  Context ctx;
+
+  auto &h_ft = m->Info().feature_types.HostVector();
+  h_ft.resize(kCols, FeatureType::kCategorical);
+
+  BatchParam p{max_bins, 0.8};
+  {
+    GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, Context{}.Threads(), {});
+    ASSERT_TRUE(gidx.index.GetBinTypeSize() == common::kUint16BinsTypeSize);
+  }
+  {
+    for (auto const &page : m->GetBatches<GHistIndexMatrix>(p)) {
+      common::HistogramCuts cut = page.cut;
+      GHistIndexMatrix gidx{m->Info(), std::move(cut), max_bins};
+      ASSERT_EQ(gidx.MaxNumBinPerFeat(), kCats);
+    }
+  }
+}
+
 TEST(GradientIndex, PushBatch) {
   size_t constexpr kRows = 64, kCols = 4;
   bst_bin_t max_bins = 64;
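
For context, below is a minimal standalone sketch of the width-selection behaviour this patch corrects. It is not part of the patch; PickBinWidth, kMaxBin, and kNumCategories are hypothetical names used only for illustration. It mirrors how GHistIndexMatrix::ResizeIndex picks a compressed storage type for a dense index: before the fix the choice was keyed on the numeric max_bin alone, after the fix it is keyed on MaxNumBinPerFeat(), i.e. the larger of (max category + 1) and max_bin.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>

// Stand-in for the storage widths a dense gradient index can be compressed to.
enum class BinWidth { kUint8, kUint16, kUint32 };

// Pick the narrowest type that can hold every bin value of a dense index.
BinWidth PickBinWidth(std::int32_t max_bin_per_feat) {
  if (max_bin_per_feat - 1 <= static_cast<std::int32_t>(std::numeric_limits<std::uint8_t>::max())) {
    return BinWidth::kUint8;
  }
  if (max_bin_per_feat - 1 <= static_cast<std::int32_t>(std::numeric_limits<std::uint16_t>::max())) {
    return BinWidth::kUint16;
  }
  return BinWidth::kUint32;
}

int main() {
  std::int32_t const kMaxBin = 8;           // numeric max_bin, as in the new test
  std::int32_t const kNumCategories = 512;  // categorical cardinality exceeds max_bin

  // Before the fix: only the numeric max_bin was consulted, so the 512
  // categories were stored in uint8 slots and truncated.
  BinWidth before = PickBinWidth(kMaxBin);

  // After the fix: the decision uses max(max category + 1, max_bin), so the
  // compressed index is wide enough for every category.
  BinWidth after = PickBinWidth(std::max(kNumCategories, kMaxBin));

  std::cout << (before == BinWidth::kUint8) << " " << (after == BinWidth::kUint16) << "\n";
  return 0;
}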