diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 9dc0bd1c5..05d8c2eac 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -16,13 +16,12 @@ #include #include -#include "row_set.h" +#include "categorical.h" #include "common.h" +#include "quantile.h" +#include "row_set.h" #include "threading_utils.h" -#include "../tree/param.h" -#include "./quantile.h" -#include "./timer.h" -#include "../include/rabit/rabit.h" +#include "timer.h" namespace xgboost { class GHistIndexMatrix; @@ -105,9 +104,29 @@ class HistogramCuts { return idx; } + /** + * \brief Search the bin index for numerical feature. + */ BinIdx SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); } + + /** + * \brief Search the bin index for categorical feature. + */ + BinIdx SearchCatBin(Entry const &e) const { + auto const &ptrs = this->Ptrs(); + auto const &vals = this->Values(); + auto end = ptrs.at(e.index + 1) + vals.cbegin(); + auto beg = ptrs[e.index] + vals.cbegin(); + // Truncates the value in case it's not perfectly rounded. + auto v = static_cast(common::AsCat(e.fvalue)); + auto bin_idx = std::lower_bound(beg, end, v) - vals.cbegin(); + if (bin_idx == ptrs.at(e.index + 1)) { + bin_idx -= 1; + } + return bin_idx; + } }; inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, diff --git a/src/common/quantile.cc b/src/common/quantile.cc index 4e84719c0..4cb1d3a5c 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -3,6 +3,8 @@ */ #include #include + +#include "rabit/rabit.h" #include "quantile.h" #include "hist_util.h" #include "categorical.h" @@ -189,7 +191,7 @@ void HostSketchContainer::PushRowPage( if (is_dense) { for (size_t ii = begin; ii < end; ii++) { if (IsCat(feature_types_, ii)) { - categories_[ii].emplace(p_inst[ii].fvalue); + categories_[ii].emplace(AsCat(p_inst[ii].fvalue)); } else { sketches_[ii].Push(p_inst[ii].fvalue, w); } @@ -199,7 +201,7 @@ void HostSketchContainer::PushRowPage( auto const& entry = p_inst[i]; if (entry.index >= begin && entry.index < end) { if (IsCat(feature_types_, entry.index)) { - categories_[entry.index].emplace(entry.fvalue); + categories_[entry.index].emplace(AsCat(entry.fvalue)); } else { sketches_[entry.index].Push(entry.fvalue, w); } diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc index f2e14882e..b0eea203e 100644 --- a/src/data/gradient_index.cc +++ b/src/data/gradient_index.cc @@ -9,8 +9,9 @@ namespace xgboost { -void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin, - size_t prev_sum, uint32_t nbins, +void GHistIndexMatrix::PushBatch(SparsePage const &batch, + common::Span ft, + size_t rbegin, size_t prev_sum, uint32_t nbins, int32_t n_threads) { // The number of threads is pegged to the batch size. If the OMP // block is parallelized on anything other than the batch/block size, @@ -86,7 +87,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin, common::BinTypeSize curent_bin_size = index.GetBinTypeSize(); if (curent_bin_size == common::kUint8BinsTypeSize) { common::Span index_data_span = {index.data(), n_index}; - SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins, + SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins, [offsets](auto idx, auto j) { return static_cast(idx - offsets[j]); }); @@ -94,7 +95,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin, } else if (curent_bin_size == common::kUint16BinsTypeSize) { common::Span index_data_span = {index.data(), n_index}; - SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins, + SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins, [offsets](auto idx, auto j) { return static_cast(idx - offsets[j]); }); @@ -102,7 +103,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin, CHECK_EQ(curent_bin_size, common::kUint32BinsTypeSize); common::Span index_data_span = {index.data(), n_index}; - SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins, + SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins, [offsets](auto idx, auto j) { return static_cast(idx - offsets[j]); }); @@ -113,7 +114,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin, not reduced */ } else { common::Span index_data_span = {index.data(), n_index}; - SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins, + SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins, [](auto idx, auto) { return idx; }); } @@ -147,15 +148,17 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins, common::Span h size_t prev_sum = 0; const bool isDense = p_fmat->IsDense(); this->isDense_ = isDense; + auto ft = p_fmat->Info().feature_types.ConstHostSpan(); for (const auto &batch : p_fmat->GetBatches()) { - this->PushBatch(batch, rbegin, prev_sum, nbins, nthread); + this->PushBatch(batch, ft, rbegin, prev_sum, nbins, nthread); prev_sum = row_ptr[rbegin + batch.Size()]; rbegin += batch.Size(); } } void GHistIndexMatrix::Init(SparsePage const &batch, + common::Span ft, common::HistogramCuts const &cuts, int32_t max_bins_per_feat, bool isDense, int32_t n_threads) { @@ -176,7 +179,7 @@ void GHistIndexMatrix::Init(SparsePage const &batch, size_t rbegin = 0; size_t prev_sum = 0; - this->PushBatch(batch, rbegin, prev_sum, nbins, n_threads); + this->PushBatch(batch, ft, rbegin, prev_sum, nbins, n_threads); } void GHistIndexMatrix::ResizeIndex(const size_t n_index, diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h index 971e82d4f..a12ebfad6 100644 --- a/src/data/gradient_index.h +++ b/src/data/gradient_index.h @@ -7,6 +7,7 @@ #include #include "xgboost/base.h" #include "xgboost/data.h" +#include "../common/categorical.h" #include "../common/hist_util.h" #include "../common/threading_utils.h" @@ -18,8 +19,9 @@ namespace xgboost { * index for CPU histogram. On GPU ellpack page is used. */ class GHistIndexMatrix { - void PushBatch(SparsePage const &batch, size_t rbegin, size_t prev_sum, - uint32_t nbins, int32_t n_threads); + void PushBatch(SparsePage const &batch, common::Span ft, + size_t rbegin, size_t prev_sum, uint32_t nbins, + int32_t n_threads); public: /*! \brief row pointer to rows by element position */ @@ -40,12 +42,14 @@ class GHistIndexMatrix { } // Create a global histogram matrix, given cut void Init(DMatrix* p_fmat, int max_num_bins, common::Span hess); - void Init(SparsePage const &page, common::HistogramCuts const &cuts, - int32_t max_bins_per_feat, bool is_dense, int32_t n_threads); + void Init(SparsePage const &page, common::Span ft, + common::HistogramCuts const &cuts, int32_t max_bins_per_feat, + bool is_dense, int32_t n_threads); // specific method for sparse data as no possibility to reduce allocated memory template void SetIndexData(common::Span index_data_span, + common::Span ft, size_t batch_threads, const SparsePage &batch, size_t rbegin, size_t nbins, GetOffset get_offset) { const xgboost::Entry *data_ptr = batch.data.HostVector().data(); @@ -61,9 +65,16 @@ class GHistIndexMatrix { SparsePage::Inst inst = {data_ptr + offset_vec[i], size}; CHECK_EQ(ibegin + inst.size(), iend); for (bst_uint j = 0; j < inst.size(); ++j) { - uint32_t idx = cut.SearchBin(inst[j]); - index_data[ibegin + j] = get_offset(idx, j); - ++hit_count_tloc_[tid * nbins + idx]; + auto e = inst[j]; + if (common::IsCat(ft, e.index)) { + auto bin_idx = cut.SearchCatBin(e); + index_data[ibegin + j] = get_offset(bin_idx, j); + ++hit_count_tloc_[tid * nbins + bin_idx]; + } else { + uint32_t idx = cut.SearchBin(inst[j]); + index_data[ibegin + j] = get_offset(idx, j); + ++hit_count_tloc_[tid * nbins + idx]; + } } }); } diff --git a/src/data/gradient_index_page_source.cc b/src/data/gradient_index_page_source.cc index e35970bf3..8f592213f 100644 --- a/src/data/gradient_index_page_source.cc +++ b/src/data/gradient_index_page_source.cc @@ -10,7 +10,8 @@ void GradientIndexPageSource::Fetch() { auto const& csr = source_->Page(); this->page_.reset(new GHistIndexMatrix()); CHECK_NE(cuts_.Values().size(), 0); - this->page_->Init(*csr, cuts_, max_bin_per_feat_, is_dense_, nthreads_); + this->page_->Init(*csr, feature_types_, cuts_, max_bin_per_feat_, is_dense_, + nthreads_); this->WriteCache(); } } diff --git a/src/data/gradient_index_page_source.h b/src/data/gradient_index_page_source.h index db66a1cda..a11057d54 100644 --- a/src/data/gradient_index_page_source.h +++ b/src/data/gradient_index_page_source.h @@ -16,16 +16,18 @@ class GradientIndexPageSource : public PageSourceIncMixIn { common::HistogramCuts cuts_; bool is_dense_; int32_t max_bin_per_feat_; + common::Span feature_types_; public: GradientIndexPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches, std::shared_ptr cache, BatchParam param, common::HistogramCuts cuts, bool is_dense, int32_t max_bin_per_feat, + common::Span feature_types, std::shared_ptr source) : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache), - cuts_{std::move(cuts)}, is_dense_{is_dense}, max_bin_per_feat_{ - max_bin_per_feat} { + cuts_{std::move(cuts)}, is_dense_{is_dense}, + max_bin_per_feat_{max_bin_per_feat}, feature_types_{feature_types} { this->source_ = source; this->Fetch(); } diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc index 18c81a654..db2e298df 100644 --- a/src/data/sparse_page_dmatrix.cc +++ b/src/data/sparse_page_dmatrix.cc @@ -184,10 +184,11 @@ BatchSet SparsePageDMatrix::GetGradientIndex(const BatchParam& batch_param_ = param; ghist_index_source_.reset(); CHECK_NE(cuts.Values().size(), 0); + auto ft = this->info_.feature_types.ConstHostSpan(); ghist_index_source_.reset(new GradientIndexPageSource( this->missing_, this->ctx_.Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id), param, std::move(cuts), - this->IsDense(), param.max_bin, sparse_page_source_)); + this->IsDense(), param.max_bin, ft, sparse_page_source_)); } else { CHECK(ghist_index_source_); ghist_index_source_->Reset(); diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 3b20e54a7..42329b492 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -1,5 +1,5 @@ /*! - * Copyright 2019 by XGBoost Contributors + * Copyright 2019-2021 by XGBoost Contributors */ #include #include @@ -13,6 +13,7 @@ #include "../../common/compressed_iterator.h" #include "../../common/random.h" +#include "../param.h" #include "gradient_based_sampler.cuh" namespace xgboost { diff --git a/tests/cpp/data/test_gradient_index.cc b/tests/cpp/data/test_gradient_index.cc index 2c19b9e58..2dcb5ed1a 100644 --- a/tests/cpp/data/test_gradient_index.cc +++ b/tests/cpp/data/test_gradient_index.cc @@ -23,5 +23,39 @@ TEST(GradientIndex, ExternalMemory) { ++i; } } + +TEST(GradientIndex, FromCategoricalBasic) { + size_t constexpr kRows = 1000, kCats = 13, kCols = 1; + size_t max_bins = 8; + auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats); + auto m = GetDMatrixFromData(x, kRows, 1); + + auto &h_ft = m->Info().feature_types.HostVector(); + h_ft.resize(kCols, FeatureType::kCategorical); + + BatchParam p(0, max_bins); + GHistIndexMatrix gidx; + + gidx.Init(m.get(), max_bins, {}); + + auto x_copy = x; + std::sort(x_copy.begin(), x_copy.end()); + auto n_uniques = std::unique(x_copy.begin(), x_copy.end()) - x_copy.begin(); + ASSERT_EQ(n_uniques, kCats); + + auto const &h_cut_ptr = gidx.cut.Ptrs(); + auto const &h_cut_values = gidx.cut.Values(); + + ASSERT_EQ(h_cut_ptr.size(), 2); + ASSERT_EQ(h_cut_values.size(), kCats); + + auto const &index = gidx.index; + + for (size_t i = 0; i < x.size(); ++i) { + auto bin = index[i]; + auto bin_value = h_cut_values.at(bin); + ASSERT_EQ(common::AsCat(x[i]), common::AsCat(bin_value)); + } +} } // namespace data } // namespace xgboost diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu index 5dc2c4982..2ff5f8204 100644 --- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu +++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu @@ -1,7 +1,11 @@ +/*! + * Copyright 2020-2021 by XGBoost Contributors + */ #include #include "../../../../src/data/ellpack_page.cuh" #include "../../../../src/tree/gpu_hist/gradient_based_sampler.cuh" +#include "../../../../src/tree/param.h" #include "../../helpers.h" #include "dmlc/filesystem.h"