/**
 * Copyright 2017-2023 by XGBoost Contributors
 * \file hist_util.cc
 */
#include "hist_util.h"

#include <algorithm>
#include <vector>

#include "../data/adapter.h"         // for SparsePageAdapterBatch
#include "../data/gradient_index.h"  // for GHistIndexMatrix
#include "quantile.h"
#include "xgboost/base.h"
#include "xgboost/context.h"  // for Context
#include "xgboost/data.h"     // for SparsePage, SortedCSCPage

#if defined(XGBOOST_MM_PREFETCH_PRESENT)
#include <xmmintrin.h>
#define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast<const char *>(addr), _MM_HINT_T0)
#elif defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
#define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast<const char *>(addr), 0, 3)
#else  // no SW pre-fetching available; PREFETCH_READ_T0 is no-op
#define PREFETCH_READ_T0(addr) do {} while (0)
#endif  // defined(XGBOOST_MM_PREFETCH_PRESENT)

namespace xgboost::common {
HistogramCuts::HistogramCuts() { cut_ptrs_.HostVector().emplace_back(0); }

HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins, bool use_sorted,
                              Span<float const> hessian) {
  HistogramCuts out;
  auto const &info = m->Info();
  auto n_threads = ctx->Threads();
  std::vector<size_t> reduced(info.num_col_, 0);
  for (auto const &page : m->GetBatches<SparsePage>()) {
    auto const &entries_per_column =
        CalcColumnSize(data::SparsePageAdapterBatch{page.GetView()}, info.num_col_, n_threads,
                       [](auto) { return true; });
    CHECK_EQ(entries_per_column.size(), info.num_col_);
    for (size_t i = 0; i < entries_per_column.size(); ++i) {
      reduced[i] += entries_per_column[i];
    }
  }

  if (!use_sorted) {
    HostSketchContainer container(ctx, max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
                                  HostSketchContainer::UseGroup(info));
    for (auto const &page : m->GetBatches<SparsePage>()) {
      container.PushRowPage(page, info, hessian);
    }
    container.MakeCuts(ctx, m->Info(), &out);
  } else {
    SortedSketchContainer container{ctx, max_bins, m->Info().feature_types.ConstHostSpan(),
                                    reduced, HostSketchContainer::UseGroup(info)};
    for (auto const &page : m->GetBatches<SortedCSCPage>(ctx)) {
      container.PushColPage(page, info, hessian);
    }
    container.MakeCuts(ctx, m->Info(), &out);
  }

  return out;
}

/*!
 * \brief Increment hist as dst += add in range [begin, end)
 */
void IncrementHist(GHistRow dst, ConstGHistRow add, std::size_t begin, std::size_t end) {
  double *pdst = reinterpret_cast<double *>(dst.data());
  const double *padd = reinterpret_cast<const double *>(add.data());

  for (std::size_t i = 2 * begin; i < 2 * end; ++i) {
    pdst[i] += padd[i];
  }
}

/*!
 * \brief Copy hist from src to dst in range [begin, end)
 */
void CopyHist(GHistRow dst, const GHistRow src, size_t begin, size_t end) {
  double *pdst = reinterpret_cast<double *>(dst.data());
  const double *psrc = reinterpret_cast<const double *>(src.data());

  for (size_t i = 2 * begin; i < 2 * end; ++i) {
    pdst[i] = psrc[i];
  }
}
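// Usage sketch (illustration only, not part of the original source): SketchOnDMatrix
// is typically called once per DMatrix to compute the quantile cuts before the
// gradient index is built. A minimal sketch, assuming an existing Context `ctx` and
// a std::shared_ptr<DMatrix> `p_fmat` (hypothetical names):
//
//   auto cuts = SketchOnDMatrix(&ctx, p_fmat.get(), /*max_bins=*/256,
//                               /*use_sorted=*/false, /*hessian=*/{});
//   // cuts.Ptrs()[f] .. cuts.Ptrs()[f + 1] delimit the cut values of feature f,
//   // so cuts.Ptrs().back() is the total number of histogram bins.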
/*!
 * \brief Compute the subtraction: dst = src1 - src2 in range [begin, end)
 */
void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2, size_t begin,
                     size_t end) {
  double *pdst = reinterpret_cast<double *>(dst.data());
  const double *psrc1 = reinterpret_cast<const double *>(src1.data());
  const double *psrc2 = reinterpret_cast<const double *>(src2.data());

  for (size_t i = 2 * begin; i < 2 * end; ++i) {
    pdst[i] = psrc1[i] - psrc2[i];
  }
}

struct Prefetch {
 public:
  static constexpr size_t kCacheLineSize = 64;
  static constexpr size_t kPrefetchOffset = 10;

 private:
  static constexpr size_t kNoPrefetchSize =
      kPrefetchOffset + kCacheLineSize / sizeof(decltype(GHistIndexMatrix::row_ptr)::value_type);

 public:
  static size_t NoPrefetchSize(size_t rows) { return std::min(rows, kNoPrefetchSize); }

  template <typename T>
  static constexpr size_t GetPrefetchStep() {
    return Prefetch::kCacheLineSize / sizeof(T);
  }
};

constexpr size_t Prefetch::kNoPrefetchSize;

struct RuntimeFlags {
  const bool first_page;
  const bool read_by_column;
  const BinTypeSize bin_type_size;
};

template <bool _any_missing, bool _first_page = false, bool _read_by_column = false,
          typename BinIdxTypeName = uint8_t>
class GHistBuildingManager {
 public:
  constexpr static bool kAnyMissing = _any_missing;
  constexpr static bool kFirstPage = _first_page;
  constexpr static bool kReadByColumn = _read_by_column;
  using BinIdxType = BinIdxTypeName;

 private:
  template <bool new_first_page>
  struct SetFirstPage {
    using Type = GHistBuildingManager<kAnyMissing, new_first_page, kReadByColumn, BinIdxType>;
  };

  template <bool new_read_by_column>
  struct SetReadByColumn {
    using Type = GHistBuildingManager<kAnyMissing, kFirstPage, new_read_by_column, BinIdxType>;
  };

  template <typename NewBinIdxType>
  struct SetBinIdxType {
    using Type = GHistBuildingManager<kAnyMissing, kFirstPage, kReadByColumn, NewBinIdxType>;
  };

  using Type = GHistBuildingManager<kAnyMissing, kFirstPage, kReadByColumn, BinIdxType>;

 public:
  /* Entry point to the dispatcher.
   * This function checks the run-time flags against the compile-time flags.
   * If they differ, it creates a manager with the matching template parameters
   * and forwards the call there.
   */
  template <typename Fn>
  static void DispatchAndExecute(const RuntimeFlags &flags, Fn &&fn) {
    if (flags.first_page != kFirstPage) {
      SetFirstPage<true>::Type::DispatchAndExecute(flags, std::forward<Fn>(fn));
    } else if (flags.read_by_column != kReadByColumn) {
      SetReadByColumn<true>::Type::DispatchAndExecute(flags, std::forward<Fn>(fn));
    } else if (flags.bin_type_size != sizeof(BinIdxType)) {
      DispatchBinType(flags.bin_type_size, [&](auto t) {
        using NewBinIdxType = decltype(t);
        SetBinIdxType<NewBinIdxType>::Type::DispatchAndExecute(flags, std::forward<Fn>(fn));
      });
    } else {
      fn(Type());
    }
  }
};
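// Dispatch walk-through (illustration only, not part of the original source): given
// run-time flags {first_page = true, read_by_column = false, bin_type_size = 2},
// GHistBuildingManager<true>::DispatchAndExecute() re-instantiates itself one
// mismatched flag at a time:
//
//   GHistBuildingManager<true, false, false, uint8_t>    // first_page mismatch
//   -> GHistBuildingManager<true, true, false, uint8_t>  // bin_type_size mismatch
//   -> GHistBuildingManager<true, true, false, uint16_t> // everything matches
//
// and only then invokes `fn` with the fully specialized manager, so the hot loops
// in the kernels below compile once per flag combination and carry no run-time
// branches.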
template <bool do_prefetch, class BuildingManager>
void RowsWiseBuildHistKernel(Span<GradientPair const> gpair,
                             const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
                             GHistRow hist) {
  constexpr bool kAnyMissing = BuildingManager::kAnyMissing;
  constexpr bool kFirstPage = BuildingManager::kFirstPage;
  using BinIdxType = typename BuildingManager::BinIdxType;
  const size_t size = row_indices.Size();
  const size_t *rid = row_indices.begin;
  auto const *p_gpair = reinterpret_cast<const float *>(gpair.data());
  const BinIdxType *gradient_index = gmat.index.data<BinIdxType>();

  auto const &row_ptr = gmat.row_ptr.data();
  auto base_rowid = gmat.base_rowid;
  uint32_t const *offsets = gmat.index.Offset();
  // There's no feature-based compression if a missing value is present.
  if (kAnyMissing) {
    CHECK(!offsets);
  } else {
    CHECK(offsets);
  }

  auto get_row_ptr = [&](bst_idx_t ridx) {
    return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
  };
  auto get_rid = [&](bst_idx_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };

  const size_t n_features =
      get_row_ptr(row_indices.begin[0] + 1) - get_row_ptr(row_indices.begin[0]);
  auto hist_data = reinterpret_cast<double *>(hist.data());
  const uint32_t two{2};  // Each element from 'gpair' and 'hist' contains
                          // 2 FP values: gradient and hessian.
                          // So we need to multiply each row-index/bin-index by 2
                          // to work with gradient pairs as a single row FP array.

  for (std::size_t i = 0; i < size; ++i) {
    const size_t icol_start = kAnyMissing ? get_row_ptr(rid[i]) : get_rid(rid[i]) * n_features;
    const size_t icol_end = kAnyMissing ? get_row_ptr(rid[i] + 1) : icol_start + n_features;
    const size_t row_size = icol_end - icol_start;
    const size_t idx_gh = two * rid[i];

    if (do_prefetch) {
      const size_t icol_start_prefetch =
          kAnyMissing ? get_row_ptr(rid[i + Prefetch::kPrefetchOffset])
                      : get_rid(rid[i + Prefetch::kPrefetchOffset]) * n_features;
      const size_t icol_end_prefetch =
          kAnyMissing ? get_row_ptr(rid[i + Prefetch::kPrefetchOffset] + 1)
                      : icol_start_prefetch + n_features;

      PREFETCH_READ_T0(p_gpair + two * rid[i + Prefetch::kPrefetchOffset]);
      for (size_t j = icol_start_prefetch; j < icol_end_prefetch;
           j += Prefetch::GetPrefetchStep<BinIdxType>()) {
        PREFETCH_READ_T0(gradient_index + j);
      }
    }
    const BinIdxType *gr_index_local = gradient_index + icol_start;

    // The trick with the pgh_t buffer helps the compiler generate a faster binary.
    const float pgh_t[] = {p_gpair[idx_gh], p_gpair[idx_gh + 1]};
    for (size_t j = 0; j < row_size; ++j) {
      const uint32_t idx_bin =
          two * (static_cast<uint32_t>(gr_index_local[j]) + (kAnyMissing ? 0 : offsets[j]));
      auto hist_local = hist_data + idx_bin;
      *(hist_local) += pgh_t[0];
      *(hist_local + 1) += pgh_t[1];
    }
  }
}
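// Prefetch arithmetic (illustration only, not part of the original source): with a
// 64-byte cache line and uint8_t bin indices, Prefetch::GetPrefetchStep<uint8_t>()
// is 64, so the prefetch loop above touches each cache line of the gradient index
// exactly once, kPrefetchOffset = 10 rows ahead of the accumulation loop.
// BuildHistDispatch (below) routes the last Prefetch::NoPrefetchSize(nrows) rows to
// the do_prefetch = false instantiation, so rid[i + kPrefetchOffset] is never read
// past the end of the row set.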
template <class BuildingManager>
void ColsWiseBuildHistKernel(Span<GradientPair const> gpair,
                             const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
                             GHistRow hist) {
  constexpr bool kAnyMissing = BuildingManager::kAnyMissing;
  constexpr bool kFirstPage = BuildingManager::kFirstPage;
  using BinIdxType = typename BuildingManager::BinIdxType;
  const size_t size = row_indices.Size();
  const size_t *rid = row_indices.begin;
  auto const *pgh = reinterpret_cast<const float *>(gpair.data());
  const BinIdxType *gradient_index = gmat.index.data<BinIdxType>();

  auto const &row_ptr = gmat.row_ptr.data();
  auto base_rowid = gmat.base_rowid;
  const uint32_t *offsets = gmat.index.Offset();
  auto get_row_ptr = [&](bst_idx_t ridx) {
    return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
  };
  auto get_rid = [&](bst_idx_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };

  const size_t n_features = gmat.cut.Ptrs().size() - 1;
  const size_t n_columns = n_features;
  auto hist_data = reinterpret_cast<double *>(hist.data());
  const uint32_t two{2};  // Each element from 'gpair' and 'hist' contains
                          // 2 FP values: gradient and hessian.
                          // So we need to multiply each row-index/bin-index by 2
                          // to work with gradient pairs as a single row FP array.

  for (size_t cid = 0; cid < n_columns; ++cid) {
    const uint32_t offset = kAnyMissing ? 0 : offsets[cid];
    for (size_t i = 0; i < size; ++i) {
      const size_t row_id = rid[i];
      const size_t icol_start =
          kAnyMissing ? get_row_ptr(row_id) : get_rid(row_id) * n_features;
      const size_t icol_end = kAnyMissing ? get_row_ptr(rid[i] + 1) : icol_start + n_features;

      if (cid < icol_end - icol_start) {
        const BinIdxType *gr_index_local = gradient_index + icol_start;
        const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[cid]) + offset);
        auto hist_local = hist_data + idx_bin;

        const size_t idx_gh = two * row_id;
        // The trick with the pgh_t buffer helps the compiler generate a faster binary.
        const float pgh_t[] = {pgh[idx_gh], pgh[idx_gh + 1]};
        *(hist_local) += pgh_t[0];
        *(hist_local + 1) += pgh_t[1];
      }
    }
  }
}

template <class BuildingManager>
void BuildHistDispatch(Span<GradientPair const> gpair, const RowSetCollection::Elem row_indices,
                       const GHistIndexMatrix &gmat, GHistRow hist) {
  if (BuildingManager::kReadByColumn) {
    ColsWiseBuildHistKernel<BuildingManager>(gpair, row_indices, gmat, hist);
  } else {
    const size_t nrows = row_indices.Size();
    const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows);
    // if we need to work with all rows from the bin-matrix (e.g. the root node)
    const bool contiguousBlock =
        (row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1);

    if (contiguousBlock) {
      // contiguous memory access; built-in HW prefetching is enough
      RowsWiseBuildHistKernel<false, BuildingManager>(gpair, row_indices, gmat, hist);
    } else {
      const RowSetCollection::Elem span1(row_indices.begin, row_indices.end - no_prefetch_size);
      const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, row_indices.end);

      RowsWiseBuildHistKernel<true, BuildingManager>(gpair, span1, gmat, hist);
      // no prefetching to avoid loading extra memory
      RowsWiseBuildHistKernel<false, BuildingManager>(gpair, span2, gmat, hist);
    }
  }
}

template <bool any_missing>
void BuildHist(Span<GradientPair const> gpair, const RowSetCollection::Elem row_indices,
               const GHistIndexMatrix &gmat, GHistRow hist, bool force_read_by_column) {
  /* force_read_by_column is used for testing the column-wise building of histograms.
   * By default force_read_by_column = false.
   */
  constexpr double kAdhocL2Size = 1024 * 1024 * 0.8;
  const bool hist_fit_to_l2 = kAdhocL2Size > 2 * sizeof(float) * gmat.cut.Ptrs().back();
  bool first_page = gmat.base_rowid == 0;
  bool read_by_column = !hist_fit_to_l2 && !any_missing;
  auto bin_type_size = gmat.index.GetBinTypeSize();

  GHistBuildingManager<any_missing>::DispatchAndExecute(
      {first_page, read_by_column || force_read_by_column, bin_type_size}, [&](auto t) {
        using BuildingManager = decltype(t);
        BuildHistDispatch<BuildingManager>(gpair, row_indices, gmat, hist);
      });
}

template void BuildHist<true>(Span<GradientPair const> gpair,
                              const RowSetCollection::Elem row_indices,
                              const GHistIndexMatrix &gmat, GHistRow hist,
                              bool force_read_by_column);
template void BuildHist<false>(Span<GradientPair const> gpair,
                               const RowSetCollection::Elem row_indices,
                               const GHistIndexMatrix &gmat, GHistRow hist,
                               bool force_read_by_column);
}  // namespace xgboost::common
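// Usage sketch (illustration only, not part of the original source), assuming a
// populated GHistIndexMatrix `gmat`, a RowSetCollection::Elem `row_set` covering a
// node's rows, and a zero-initialized GHistRow `node_hist` sized to the total
// number of bins (gmat.cut.Ptrs().back()); `gpair_span` is a hypothetical
// Span<GradientPair const> over the per-row gradients:
//
//   BuildHist<false>(gpair_span, row_set, gmat, node_hist,
//                    /*force_read_by_column=*/false);
//
// Combined with SubtractionHist(), this enables the classic histogram trick:
// build the histogram for the smaller child only and derive the sibling as
// parent - child, roughly halving the histogram work per split.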