Fix CPU bin compression with categorical data. (#8809)
* Fix CPU bin compression with categorical data. * The bug forces the maximum category to be less than 256 or the maximum number of bins when the input data is dense.
This commit is contained in:
@@ -21,13 +21,13 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
|
||||
|
||||
GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
|
||||
double sparse_thresh, bool sorted_sketch, int32_t n_threads,
|
||||
common::Span<float> hess) {
|
||||
common::Span<float> hess)
|
||||
: max_numeric_bins_per_feat{max_bins_per_feat} {
|
||||
CHECK(p_fmat->SingleColBlock());
|
||||
// We use sorted sketching for approx tree method since it's more efficient in
|
||||
// computation time (but higher memory usage).
|
||||
cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
|
||||
|
||||
max_num_bins = max_bins_per_feat;
|
||||
const uint32_t nbins = cut.Ptrs().back();
|
||||
hit_count.resize(nbins, 0);
|
||||
hit_count_tloc_.resize(n_threads * nbins, 0);
|
||||
@@ -64,7 +64,7 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
|
||||
: row_ptr(info.num_row_ + 1, 0),
|
||||
hit_count(cuts.TotalBins(), 0),
|
||||
cut{std::forward<common::HistogramCuts>(cuts)},
|
||||
max_num_bins(max_bin_per_feat),
|
||||
max_numeric_bins_per_feat(max_bin_per_feat),
|
||||
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
@@ -87,13 +87,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span<FeatureTy
|
||||
}
|
||||
|
||||
GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
|
||||
common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
|
||||
bool isDense, double sparse_thresh, int32_t n_threads) {
|
||||
common::HistogramCuts cuts, int32_t max_bins_per_feat,
|
||||
bool isDense, double sparse_thresh, int32_t n_threads)
|
||||
: cut{std::move(cuts)},
|
||||
max_numeric_bins_per_feat{max_bins_per_feat},
|
||||
base_rowid{batch.base_rowid},
|
||||
isDense_{isDense} {
|
||||
CHECK_GE(n_threads, 1);
|
||||
base_rowid = batch.base_rowid;
|
||||
isDense_ = isDense;
|
||||
cut = cuts;
|
||||
max_num_bins = max_bins_per_feat;
|
||||
CHECK_EQ(row_ptr.size(), 0);
|
||||
// The number of threads is pegged to the batch size. If the OMP
|
||||
// block is parallelized on anything other than the batch/block size,
|
||||
@@ -128,12 +128,13 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
|
||||
#undef INSTANTIATION_PUSH
|
||||
|
||||
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
|
||||
if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
|
||||
if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
|
||||
isDense) {
|
||||
// compress dense index to uint8
|
||||
index.SetBinTypeSize(common::kUint8BinsTypeSize);
|
||||
index.Resize((sizeof(uint8_t)) * n_index);
|
||||
} else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
|
||||
max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
|
||||
} else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
|
||||
MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
|
||||
isDense) {
|
||||
// compress dense index to uint16
|
||||
index.SetBinTypeSize(common::kUint16BinsTypeSize);
|
||||
|
||||
Reference in New Issue
Block a user