Fix CPU bin compression with categorical data. (#8809)

* Fix CPU bin compression with categorical data.

* The bug causes the maximum category to be lesser than 256 or the maximum number of bins when
the input data is dense.
This commit is contained in:
Jiaming Yuan
2023-02-16 04:20:34 +08:00
committed by GitHub
parent cce4af4acf
commit c0afdb6786
6 changed files with 49 additions and 20 deletions

View File

@@ -134,11 +134,15 @@ class GHistIndexMatrix {
std::vector<size_t> hit_count;
/*! \brief The corresponding cuts */
common::HistogramCuts cut;
/*! \brief max_bin for each feature. */
bst_bin_t max_num_bins;
/** \brief max_bin for each feature. */
bst_bin_t max_numeric_bins_per_feat;
/*! \brief base row index for current page (used by external memory) */
size_t base_rowid{0};
bst_bin_t MaxNumBinPerFeat() const {
return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
}
~GHistIndexMatrix();
/**
* \brief Constrcutor for SimpleDMatrix.
@@ -161,7 +165,7 @@ class GHistIndexMatrix {
* \brief Constructor for external memory.
*/
GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense,
double sparse_thresh, int32_t n_threads);
GHistIndexMatrix(); // also for ext mem, empty ctor so that we can read the cache back.