[EM] Make page concatenation optional. (#10826)

This PR introduces a new parameter `extmem_concat_pages` to make page concatenation optional for the GPU hist tree method. In addition, the documentation is updated for the new GPU-based external memory support.
Author: Jiaming Yuan
Date: 2024-09-24 06:19:28 +08:00
Committed by: GitHub
Parent: 215da76263
Commit: e228c1a121
31 changed files with 690 additions and 388 deletions
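For context, the new flag is a regular booster parameter, so it can be toggled through the existing C API. A minimal sketch (booster creation and data loading assumed to be in place; error handling omitted):

#include <xgboost/c_api.h>

// Hedged sketch: enable page concatenation for GPU hist on an existing booster.
void EnableConcat(BoosterHandle booster) {
  XGBoosterSetParam(booster, "device", "cuda");
  XGBoosterSetParam(booster, "tree_method", "hist");
  XGBoosterSetParam(booster, "subsample", "0.5");  // concatenation requires sampling
  XGBoosterSetParam(booster, "extmem_concat_pages", "true");
}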


@@ -106,9 +106,10 @@ inline auto NoCategorical(std::string name) {
return name + " doesn't support categorical features.";
}
inline void NoOnHost(bool on_host) {
if (on_host) {
LOG(FATAL) << "Caching on host memory is only available for GPU.";
inline void NoPageConcat(bool concat_pages) {
if (concat_pages) {
LOG(FATAL) << "`extmem_concat_pages` must be false when there's no sampling or when it's "
"running on the CPU.";
}
}
} // namespace xgboost::error
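Call sites simply pass the user-supplied flag through to this guard; the CPU updaters later in this commit use the pattern below.

// Reject `extmem_concat_pages` on code paths that cannot honor it.
error::NoPageConcat(this->hist_param_.extmem_concat_pages);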

src/data/batch_utils.cuh (new file, 24 lines)

@@ -0,0 +1,24 @@
/**
* Copyright 2024, XGBoost Contributors
*/
#pragma once
#include "xgboost/data.h" // for BatchParam
namespace xgboost::data::cuda_impl {
// Use two batches for prefetching. There's always one batch being worked on, while the
// other batch is being transferred.
constexpr auto DftPrefetchBatches() { return 2; }
// Empty parameter to prevent regen, only used to control external memory prefetching.
//
// Both the approx and hist tree methods initialize the DMatrix before creating the actual
// implementation (InitDataOnce). Therefore, the `GPUHistMakerDevice` can use an empty
// parameter to avoid any regen.
inline BatchParam StaticBatch(bool prefetch_copy) {
BatchParam p;
p.prefetch_copy = prefetch_copy;
p.n_prefetch_batches = DftPrefetchBatches();
return p;
}
} // namespace xgboost::data::cuda_impl
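As a usage sketch (mirroring the predictor changes below), a caller that only wants to iterate over already-built ELLPACK pages passes StaticBatch so that no quantile regeneration is triggered:

// Assumed context: a Context* ctx and a DMatrix* p_fmat with cached ELLPACK pages.
for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx, data::cuda_impl::StaticBatch(true))) {
  // One batch is processed here while the next one is prefetched.
}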


@@ -920,7 +920,8 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
data::fileiter::Next,
std::numeric_limits<float>::quiet_NaN(),
1,
cache_file};
cache_file,
false};
}
return dmat;


@@ -10,7 +10,7 @@
#include <utility> // for move
#include <vector> // for vector
#include "../common/cuda_rt_utils.h" // for SupportsPageableMem
#include "../common/cuda_rt_utils.h" // for SupportsPageableMem, SupportsAts
#include "../common/hist_util.h" // for HistogramCuts
#include "ellpack_page.h" // for EllpackPage
#include "ellpack_page_raw_format.h" // for EllpackPageRawFormat
@@ -67,7 +67,20 @@ class EllpackFormatPolicy {
using FormatT = EllpackPageRawFormat;
public:
EllpackFormatPolicy() = default;
EllpackFormatPolicy() {
StringView msg{" The overhead of iterating through external memory might be significant."};
if (!has_hmm_) {
LOG(WARNING) << "CUDA heterogeneous memory management is not available." << msg;
} else if (!common::SupportsAts()) {
LOG(WARNING) << "CUDA address translation service is not available." << msg;
}
#if !defined(XGBOOST_USE_RMM)
LOG(WARNING) << "XGBoost is not built with RMM support." << msg;
#endif
if (!GlobalConfigThreadLocalStore::Get()->use_rmm) {
LOG(WARNING) << "`use_rmm` is set to false." << msg;
}
}
// For testing with the HMM flag.
explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {}
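The last warning can be addressed from user code by flipping the global `use_rmm` flag; a hedged sketch using the public C API (assuming XGBoost was built with RMM support):

#include <xgboost/c_api.h>

// Enable RMM-based allocation in the global config so the `use_rmm` warning is not raised.
XGBSetGlobalConfig("{\"use_rmm\": true}");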
@@ -135,6 +148,9 @@ class EllpackMmapStreamPolicy : public F<S> {
bst_idx_t length) const;
};
/**
* @brief Ellpack source with sparse pages as the underlying source.
*/
template <typename F>
class EllpackPageSourceImpl : public PageSourceIncMixIn<EllpackPage, F> {
using Super = PageSourceIncMixIn<EllpackPage, F>;
@@ -171,6 +187,9 @@ using EllpackPageHostSource =
using EllpackPageSource =
EllpackPageSourceImpl<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
/**
* @brief Ellpack source directly interfaces with user-defined iterators.
*/
template <typename FormatCreatePolicy>
class ExtEllpackPageSourceImpl : public ExtQantileSourceMixin<EllpackPage, FormatCreatePolicy> {
using Super = ExtQantileSourceMixin<EllpackPage, FormatCreatePolicy>;
@@ -201,6 +220,7 @@ class ExtEllpackPageSourceImpl : public ExtQantileSourceMixin<EllpackPage, Forma
info_{info},
ext_info_{std::move(ext_info)},
base_rows_{std::move(base_rows)} {
cuts->SetDevice(ctx->Device());
this->SetCuts(std::move(cuts), ctx->Device());
this->Fetch();
}


@@ -13,6 +13,7 @@
#include "proxy_dmatrix.h" // for DataIterProxy, HostAdapterDispatch
#include "quantile_dmatrix.h" // for GetDataShape, MakeSketches
#include "simple_batch_iterator.h" // for SimpleBatchIteratorImpl
#include "sparse_page_source.h" // for MakeCachePrefix
#if !defined(XGBOOST_USE_CUDA)
#include "../common/common.h" // for AssertGPUSupport
@@ -26,6 +27,7 @@ ExtMemQuantileDMatrix::ExtMemQuantileDMatrix(DataIterHandle iter_handle, DMatrix
std::int32_t n_threads, std::string cache,
bst_bin_t max_bin, bool on_host)
: cache_prefix_{std::move(cache)}, on_host_{on_host} {
cache_prefix_ = MakeCachePrefix(cache_prefix_);
auto iter = std::make_shared<DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>>(
iter_handle, reset, next);
iter->Reset();


@@ -13,9 +13,9 @@
#include <utility> // for move
#include <variant> // for visit
#include "../collective/communicator-inl.h"
#include "batch_utils.h" // for RegenGHist
#include "gradient_index.h"
#include "batch_utils.h" // for RegenGHist
#include "gradient_index.h" // for GHistIndexMatrix
#include "sparse_page_source.h" // for MakeCachePrefix
namespace xgboost::data {
MetaInfo &SparsePageDMatrix::Info() { return info_; }
@@ -34,12 +34,9 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
cache_prefix_{std::move(cache_prefix)},
on_host_{on_host} {
Context ctx;
ctx.nthread = nthreads;
ctx.Init(Args{{"nthread", std::to_string(nthreads)}});
cache_prefix_ = MakeCachePrefix(cache_prefix_);
cache_prefix_ = cache_prefix_.empty() ? "DMatrix" : cache_prefix_;
if (collective::IsDistributed()) {
cache_prefix_ += ("-r" + std::to_string(collective::GetRank()));
}
DMatrixProxy *proxy = MakeProxy(proxy_);
auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{
iter_, reset_, next_};
@@ -107,7 +104,6 @@ BatchSet<SparsePage> SparsePageDMatrix::GetRowBatches() {
BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches(Context const *ctx) {
auto id = MakeCache(this, ".col.page", on_host_, cache_prefix_, &cache_info_);
CHECK_NE(this->Info().num_col_, 0);
error::NoOnHost(on_host_);
this->InitializeSparsePage(ctx);
if (!column_source_) {
column_source_ =
@@ -122,7 +118,6 @@ BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches(Context const *ctx) {
BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches(Context const *ctx) {
auto id = MakeCache(this, ".sorted.col.page", on_host_, cache_prefix_, &cache_info_);
CHECK_NE(this->Info().num_col_, 0);
error::NoOnHost(on_host_);
this->InitializeSparsePage(ctx);
if (!sorted_column_source_) {
sorted_column_source_ = std::make_shared<SortedCSCPageSource>(
@@ -140,7 +135,6 @@ BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(Context const *ct
CHECK_GE(param.max_bin, 2);
}
detail::CheckEmpty(batch_param_, param);
error::NoOnHost(on_host_);
auto id = MakeCache(this, ".gradient_index.page", on_host_, cache_prefix_, &cache_info_);
if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) {
this->InitializeSparsePage(ctx);


@@ -70,10 +70,10 @@ class SparsePageDMatrix : public DMatrix {
DataIterResetCallback *reset_;
XGDMatrixCallbackNext *next_;
float missing_;
float const missing_;
Context fmat_ctx_;
std::string cache_prefix_;
bool on_host_{false};
bool const on_host_;
std::uint32_t n_batches_{0};
// The sparse page is the source for the other page types, so it gets a dedicated member function.
void InitializeSparsePage(Context const *ctx);
@@ -83,7 +83,7 @@ class SparsePageDMatrix : public DMatrix {
public:
explicit SparsePageDMatrix(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing, int32_t nthreads,
std::string cache_prefix, bool on_host = false);
std::string cache_prefix, bool on_host);
~SparsePageDMatrix() override;


@@ -54,22 +54,18 @@ class SparsePageRawFormat : public SparsePageFormat<T> {
private:
};
XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)
.describe("Raw binary data format.")
.set_body([]() {
return new SparsePageRawFormat<SparsePage>();
});
#define SparsePageFmt SparsePageFormat<SparsePage>
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<SparsePage>, SparsePageFmt, raw)
.describe("Raw binary data format.")
.set_body([]() { return new SparsePageRawFormat<SparsePage>(); });
XGBOOST_REGISTER_CSC_PAGE_FORMAT(raw)
.describe("Raw binary data format.")
.set_body([]() {
return new SparsePageRawFormat<CSCPage>();
});
XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(raw)
.describe("Raw binary data format.")
.set_body([]() {
return new SparsePageRawFormat<SortedCSCPage>();
});
#define CSCPageFmt SparsePageFormat<CSCPage>
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<CSCPage>, CSCPageFmt, raw)
.describe("Raw binary data format.")
.set_body([]() { return new SparsePageRawFormat<CSCPage>(); });
#define SortedCSCPageFmt SparsePageFormat<SortedCSCPage>
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<SortedCSCPage>, SortedCSCPageFmt, raw)
.describe("Raw binary data format.")
.set_body([]() { return new SparsePageRawFormat<SortedCSCPage>(); });
} // namespace xgboost::data
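These registrations are consumed through CreatePageFormat (shown in the writer header below), which looks the format name up in the DMLC registry; a usage sketch:

// Assumed context: inside xgboost::data. Instantiate the "raw" format for SparsePage.
std::unique_ptr<SparsePageFormat<SparsePage>> fmt{CreatePageFormat<SparsePage>("raw")};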


@@ -8,6 +8,8 @@
#include <numeric> // for partial_sum
#include <string> // for string
#include "../collective/communicator-inl.h" // for IsDistributed, GetRank
namespace xgboost::data {
void Cache::Commit() {
if (!this->written) {
@@ -28,6 +30,14 @@ void TryDeleteCacheFile(const std::string& file) {
}
}
std::string MakeCachePrefix(std::string cache_prefix) {
cache_prefix = cache_prefix.empty() ? "DMatrix" : cache_prefix;
if (collective::IsDistributed()) {
cache_prefix += ("-r" + std::to_string(collective::GetRank()));
}
return cache_prefix;
}
#if !defined(XGBOOST_USE_CUDA)
void InitNewThread::operator()() const { *GlobalConfigThreadLocalStore::Get() = config; }
#endif
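For illustration, the extracted helper behaves as follows (rank values are examples):

// MakeCachePrefix("")       -> "DMatrix"      (single process)
// MakeCachePrefix("cache")  -> "cache"        (single process)
// MakeCachePrefix("")       -> "DMatrix-r0"   (distributed, rank 0)
// MakeCachePrefix("cache")  -> "cache-r1"     (distributed, rank 1)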


@@ -33,6 +33,8 @@
namespace xgboost::data {
void TryDeleteCacheFile(const std::string& file);
std::string MakeCachePrefix(std::string cache_prefix);
/**
* @brief Information about the cache including path and page offsets.
*/


@@ -1,5 +1,5 @@
/**
* Copyright 2014-2023, XGBoost Contributors
* Copyright 2014-2024, XGBoost Contributors
* \file sparse_page_writer.h
* \author Tianqi Chen
*/
@@ -11,7 +11,6 @@
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "dmlc/registry.h" // for Registry, FunctionRegEntryBase
#include "xgboost/data.h" // for SparsePage,CSCPage,SortedCSCPage,EllpackPage ...
namespace xgboost::data {
template<typename T>
@@ -54,47 +53,13 @@ inline SparsePageFormat<T>* CreatePageFormat(const std::string& name) {
return (e->body)();
}
/*!
* \brief Registry entry for sparse page format.
/**
* @brief Registry entry for sparse page format.
*/
template<typename T>
struct SparsePageFormatReg
: public dmlc::FunctionRegEntryBase<SparsePageFormatReg<T>,
std::function<SparsePageFormat<T>* ()>> {
};
/*!
* \brief Macro to register sparse page format.
*
* \code
* // example of registering a objective
* XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)
* .describe("Raw binary data format.")
* .set_body([]() {
* return new RawFormat();
* });
* \endcode
*/
#define SparsePageFmt SparsePageFormat<SparsePage>
#define XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(Name) \
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<SparsePage>, SparsePageFmt, Name)
#define CSCPageFmt SparsePageFormat<CSCPage>
#define XGBOOST_REGISTER_CSC_PAGE_FORMAT(Name) \
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<CSCPage>, CSCPageFmt, Name)
#define SortedCSCPageFmt SparsePageFormat<SortedCSCPage>
#define XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(Name) \
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<SortedCSCPage>, SortedCSCPageFmt, Name)
#define EllpackPageFmt SparsePageFormat<EllpackPage>
#define XGBOOST_REGISTER_ELLPACK_PAGE_FORMAT(Name) \
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<EllpackPage>, EllpackPageFmt, Name)
#define GHistIndexPageFmt SparsePageFormat<GHistIndexMatrix>
#define XGBOOST_REGISTER_GHIST_INDEX_PAGE_FORMAT(Name) \
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<GHistIndexMatrix>, \
GHistIndexPageFmt, Name)
} // namespace xgboost::data
#endif // XGBOOST_DATA_SPARSE_PAGE_WRITER_H_


@@ -14,9 +14,10 @@
#include "../common/categorical.h"
#include "../common/common.h"
#include "../common/cuda_context.cuh" // for CUDAContext
#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs
#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs, SetDevice
#include "../common/device_helpers.cuh"
#include "../common/error_msg.h" // for InplacePredictProxy
#include "../common/error_msg.h" // for InplacePredictProxy
#include "../data/batch_utils.cuh" // for StaticBatch
#include "../data/device_adapter.cuh"
#include "../data/ellpack_page.cuh"
#include "../data/proxy_dmatrix.h"
@@ -31,6 +32,8 @@
namespace xgboost::predictor {
DMLC_REGISTRY_FILE_TAG(gpu_predictor);
using data::cuda_impl::StaticBatch;
struct TreeView {
RegTree::CategoricalSplitMatrix cats;
common::Span<RegTree::Node const> d_tree;
@@ -475,15 +478,14 @@ struct PathInfo {
};
// Transform model into path element form for GPUTreeShap
void ExtractPaths(
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
void ExtractPaths(Context const* ctx,
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>* paths,
DeviceModel* model, dh::device_vector<uint32_t>* path_categories,
DeviceOrd device) {
common::SetDevice(device.ordinal);
auto& device_model = *model;
dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
dh::XGBCachingDeviceAllocator<PathInfo> alloc;
auto d_nodes = device_model.nodes.ConstDeviceSpan();
auto d_tree_segments = device_model.tree_segments.ConstDeviceSpan();
auto nodes_transform = dh::MakeTransformIterator<PathInfo>(
@@ -502,17 +504,15 @@ void ExtractPaths(
}
return PathInfo{static_cast<int64_t>(idx), path_length, tree_idx};
});
auto end = thrust::copy_if(
thrust::cuda::par(alloc), nodes_transform,
nodes_transform + d_nodes.size(), info.begin(),
[=] __device__(const PathInfo& e) { return e.leaf_position != -1; });
auto end = thrust::copy_if(ctx->CUDACtx()->CTP(), nodes_transform,
nodes_transform + d_nodes.size(), info.begin(),
[=] __device__(const PathInfo& e) { return e.leaf_position != -1; });
info.resize(end - info.begin());
auto length_iterator = dh::MakeTransformIterator<size_t>(
info.begin(),
[=] __device__(const PathInfo& info) { return info.length; });
dh::caching_device_vector<size_t> path_segments(info.size() + 1);
thrust::exclusive_scan(thrust::cuda::par(alloc), length_iterator,
length_iterator + info.size() + 1,
thrust::exclusive_scan(ctx->CUDACtx()->CTP(), length_iterator, length_iterator + info.size() + 1,
path_segments.begin());
paths->resize(path_segments.back());
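These changes all follow the same pattern: Thrust algorithms receive the stream-bound execution policy from CTP(), and kernels receive the stream explicitly, instead of running on the legacy default stream. A standalone sketch of the policy-on-stream idiom (plain CUDA/Thrust, not XGBoost internals):

#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <thrust/system/cuda/execution_policy.h>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  thrust::device_vector<int> lengths(4, 2);   // e.g. per-path lengths
  thrust::device_vector<int> segments(5, 0);  // exclusive-scan output

  // The algorithm is ordered on `stream`, analogous to ctx->CUDACtx()->CTP() above.
  thrust::exclusive_scan(thrust::cuda::par.on(stream), lengths.begin(), lengths.end(),
                         segments.begin());

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  return 0;
}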
@@ -528,19 +528,17 @@ void ExtractPaths(
auto d_cat_node_segments = device_model.categories_node_segments.ConstDeviceSpan();
size_t max_cat = 0;
if (thrust::any_of(dh::tbegin(d_split_types), dh::tend(d_split_types),
if (thrust::any_of(ctx->CUDACtx()->CTP(), dh::tbegin(d_split_types), dh::tend(d_split_types),
common::IsCatOp{})) {
dh::PinnedMemory pinned;
auto h_max_cat = pinned.GetSpan<RegTree::CategoricalSplitMatrix::Segment>(1);
auto max_elem_it = dh::MakeTransformIterator<size_t>(
dh::tbegin(d_cat_node_segments),
[] __device__(RegTree::CategoricalSplitMatrix::Segment seg) { return seg.size; });
size_t max_cat_it =
thrust::max_element(thrust::device, max_elem_it,
max_elem_it + d_cat_node_segments.size()) -
max_elem_it;
dh::safe_cuda(cudaMemcpy(h_max_cat.data(),
d_cat_node_segments.data() + max_cat_it,
size_t max_cat_it = thrust::max_element(ctx->CUDACtx()->CTP(), max_elem_it,
max_elem_it + d_cat_node_segments.size()) -
max_elem_it;
dh::safe_cuda(cudaMemcpy(h_max_cat.data(), d_cat_node_segments.data() + max_cat_it,
h_max_cat.size_bytes(), cudaMemcpyDeviceToHost));
max_cat = h_max_cat[0].size;
CHECK_GE(max_cat, 1);
@@ -550,7 +548,7 @@ void ExtractPaths(
auto d_model_categories = device_model.categories.DeviceSpan();
common::Span<uint32_t> d_path_categories = dh::ToSpan(*path_categories);
dh::LaunchN(info.size(), [=] __device__(size_t idx) {
dh::LaunchN(info.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t idx) {
auto path_info = d_info[idx];
size_t tree_offset = d_tree_segments[path_info.tree_idx];
TreeView tree{0, path_info.tree_idx, d_nodes,
@@ -864,7 +862,7 @@ class GPUPredictor : public xgboost::Predictor {
SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
num_features);
auto const kernel = [&](auto predict_fn) {
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes}(
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes, ctx_->CUDACtx()->Stream()}(
predict_fn, data, model.nodes.ConstDeviceSpan(),
predictions->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
@@ -888,7 +886,7 @@ class GPUPredictor : public xgboost::Predictor {
DeviceModel d_model;
bool use_shared = false;
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS}(
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, 0, ctx_->CUDACtx()->Stream()}(
PredictKernel<EllpackLoader, EllpackDeviceAccessor>, batch, model.nodes.ConstDeviceSpan(),
out_preds->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
@@ -924,7 +922,7 @@ class GPUPredictor : public xgboost::Predictor {
}
} else {
bst_idx_t batch_offset = 0;
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
dmat->Info().feature_types.SetDevice(ctx_->Device());
auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
this->PredictInternal(page.Impl()->GetDeviceAccessor(ctx_, feature_types), d_model,
@@ -989,7 +987,7 @@ class GPUPredictor : public xgboost::Predictor {
bool use_shared = shared_memory_bytes != 0;
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes}(
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes, ctx_->CUDACtx()->Stream()}(
PredictKernel<Loader, typename Loader::BatchT>, m->Value(), d_model.nodes.ConstDeviceSpan(),
out_preds->predictions.DeviceSpan(), d_model.tree_segments.ConstDeviceSpan(),
d_model.tree_group.ConstDeviceSpan(), d_model.split_types.ConstDeviceSpan(),
@@ -1055,7 +1053,7 @@ class GPUPredictor : public xgboost::Predictor {
DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
ExtractPaths(ctx_, &device_paths, &d_model, &categories, ctx_->Device());
if (p_fmat->PageExists<SparsePage>()) {
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->Device());
@@ -1067,7 +1065,7 @@ class GPUPredictor : public xgboost::Predictor {
X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
}
} else {
for (auto& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
for (auto& batch : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_)};
auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
std::numeric_limits<float>::quiet_NaN()};
@@ -1083,7 +1081,7 @@ class GPUPredictor : public xgboost::Predictor {
auto base_score = model.learner_model_param->BaseScore(ctx_);
dh::LaunchN(p_fmat->Info().num_row_ * model.learner_model_param->num_output_group,
[=] __device__(size_t idx) {
ctx_->CUDACtx()->Stream(), [=] __device__(size_t idx) {
phis[(idx + 1) * contributions_columns - 1] +=
margin.empty() ? base_score(0) : margin[idx];
});
@@ -1125,7 +1123,7 @@ class GPUPredictor : public xgboost::Predictor {
DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
ExtractPaths(ctx_, &device_paths, &d_model, &categories, ctx_->Device());
if (p_fmat->PageExists<SparsePage>()) {
for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->Device());
@@ -1137,7 +1135,7 @@ class GPUPredictor : public xgboost::Predictor {
X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
}
} else {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
auto impl = batch.Impl();
auto acc = impl->GetDeviceAccessor(ctx_, p_fmat->Info().feature_types.ConstDeviceSpan());
auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
@@ -1155,7 +1153,7 @@ class GPUPredictor : public xgboost::Predictor {
auto base_score = model.learner_model_param->BaseScore(ctx_);
size_t n_features = model.learner_model_param->num_feature;
dh::LaunchN(p_fmat->Info().num_row_ * model.learner_model_param->num_output_group,
[=] __device__(size_t idx) {
ctx_->CUDACtx()->Stream(), [=] __device__(size_t idx) {
size_t group = idx % ngroup;
size_t row_idx = idx / ngroup;
phis[gpu_treeshap::IndexPhiInteractions(row_idx, ngroup, group, n_features,
@@ -1199,7 +1197,7 @@ class GPUPredictor : public xgboost::Predictor {
bst_feature_t num_features = info.num_col_;
auto launch = [&](auto fn, std::uint32_t grid, auto data, bst_idx_t batch_offset) {
dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes}(
dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes, ctx_->CUDACtx()->Stream()}(
fn, data, d_model.nodes.ConstDeviceSpan(),
predictions->DeviceSpan().subspan(batch_offset), d_model.tree_segments.ConstDeviceSpan(),
@@ -1223,7 +1221,7 @@ class GPUPredictor : public xgboost::Predictor {
}
} else {
bst_idx_t batch_offset = 0;
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_)};
auto grid = static_cast<std::uint32_t>(common::DivRoundUp(batch.Size(), kBlockThreads));
launch(PredictLeafKernel<EllpackLoader, EllpackDeviceAccessor>, grid, data, batch_offset);


@@ -148,19 +148,8 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
CombineGradientPair combine_;
};
NoSampling::NoSampling(BatchParam batch_param) : batch_param_(std::move(batch_param)) {}
GradientBasedSample NoSampling::Sample(Context const*, common::Span<GradientPair> gpair,
DMatrix* dmat) {
return {dmat, gpair};
}
ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param)
: batch_param_{std::move(batch_param)} {}
GradientBasedSample ExternalMemoryNoSampling::Sample(Context const*,
common::Span<GradientPair> gpair,
DMatrix* p_fmat) {
DMatrix* p_fmat) {
return {p_fmat, gpair};
}
@@ -246,9 +235,10 @@ GradientBasedSampling::GradientBasedSampling(std::size_t n_rows, BatchParam batc
grad_sum_(n_rows, 0.0f) {}
GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
common::Span<GradientPair> gpair, DMatrix* dmat) {
common::Span<GradientPair> gpair,
DMatrix* p_fmat) {
auto cuctx = ctx->CUDACtx();
size_t n_rows = dmat->Info().num_row_;
size_t n_rows = p_fmat->Info().num_row_;
size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
ctx, gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
@@ -257,7 +247,7 @@ GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
PoissonSampling(dh::ToSpan(threshold_), threshold_index,
RandomWeight(common::GlobalRandom()())));
return {dmat, gpair};
return {p_fmat, gpair};
}
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t n_rows,
@@ -323,46 +313,46 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows,
const BatchParam& batch_param, float subsample,
int sampling_method, bool is_external_memory) {
int sampling_method, bool concat_pages) {
// The ctx is kept here for future development of stream-based operations.
monitor_.Init("gradient_based_sampler");
monitor_.Init(__func__);
bool is_sampling = subsample < 1.0;
if (is_sampling) {
switch (sampling_method) {
case TrainParam::kUniform:
if (is_external_memory) {
strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
} else {
strategy_.reset(new UniformSampling(batch_param, subsample));
}
break;
case TrainParam::kGradientBased:
if (is_external_memory) {
strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
} else {
strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
}
break;
default:
LOG(FATAL) << "unknown sampling method";
if (!is_sampling) {
strategy_.reset(new NoSampling{});
error::NoPageConcat(concat_pages);
return;
}
switch (sampling_method) {
case TrainParam::kUniform: {
if (concat_pages) {
strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
} else {
strategy_.reset(new UniformSampling(batch_param, subsample));
}
break;
}
} else {
if (is_external_memory) {
strategy_.reset(new ExternalMemoryNoSampling(batch_param));
} else {
strategy_.reset(new NoSampling(batch_param));
case TrainParam::kGradientBased: {
if (concat_pages) {
strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
} else {
strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
}
break;
}
default:
LOG(FATAL) << "unknown sampling method";
}
}
// Sample a DMatrix based on the given gradient pairs.
GradientBasedSample GradientBasedSampler::Sample(Context const* ctx,
common::Span<GradientPair> gpair, DMatrix* dmat) {
monitor_.Start("Sample");
monitor_.Start(__func__);
GradientBasedSample sample = strategy_->Sample(ctx, gpair, dmat);
monitor_.Stop("Sample");
monitor_.Stop(__func__);
return sample;
}
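The net effect of the refactor is the following strategy selection; no sampling now always takes the in-memory path, and page concatenation is an explicit opt-in:

// subsample == 1.0                 -> NoSampling (NoPageConcat enforces concat_pages == false)
// kUniform       &&  concat_pages  -> ExternalMemoryUniformSampling
// kUniform       && !concat_pages  -> UniformSampling
// kGradientBased &&  concat_pages  -> ExternalMemoryGradientBasedSampling
// kGradientBased && !concat_pages  -> GradientBasedSampling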


@@ -24,31 +24,29 @@ class SamplingStrategy {
virtual GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) = 0;
virtual ~SamplingStrategy() = default;
/**
* @brief Whether pages are concatenated after sampling.
*/
[[nodiscard]] virtual bool ConcatPages() const { return false; }
};
/*! \brief No sampling in in-memory mode. */
class ExtMemSamplingStrategy : public SamplingStrategy {
public:
[[nodiscard]] bool ConcatPages() const final { return true; }
};
/**
* @brief No-op.
*/
class NoSampling : public SamplingStrategy {
public:
explicit NoSampling(BatchParam batch_param);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
BatchParam batch_param_;
};
/*! \brief No sampling in external memory mode. */
class ExternalMemoryNoSampling : public SamplingStrategy {
public:
explicit ExternalMemoryNoSampling(BatchParam batch_param);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
BatchParam batch_param_;
};
/*! \brief Uniform sampling in in-memory mode. */
/**
* @brief Uniform sampling in in-memory mode.
*/
class UniformSampling : public SamplingStrategy {
public:
UniformSampling(BatchParam batch_param, float subsample);
@@ -61,7 +59,7 @@ class UniformSampling : public SamplingStrategy {
};
/*! \brief Uniform sampling in external memory mode. */
class ExternalMemoryUniformSampling : public SamplingStrategy {
class ExternalMemoryUniformSampling : public ExtMemSamplingStrategy {
public:
ExternalMemoryUniformSampling(size_t n_rows, BatchParam batch_param, float subsample);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
@@ -91,7 +89,7 @@ class GradientBasedSampling : public SamplingStrategy {
};
/*! \brief Gradient-based sampling in external memory mode. */
class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
class ExternalMemoryGradientBasedSampling : public ExtMemSamplingStrategy {
public:
ExternalMemoryGradientBasedSampling(size_t n_rows, BatchParam batch_param, float subsample);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
@@ -120,7 +118,7 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
class GradientBasedSampler {
public:
GradientBasedSampler(Context const* ctx, size_t n_rows, const BatchParam& batch_param,
float subsample, int sampling_method, bool is_external_memory);
float subsample, int sampling_method, bool concat_pages);
/*! \brief Sample from a DMatrix based on the given gradient pairs. */
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair, DMatrix* dmat);
@@ -130,6 +128,8 @@ class GradientBasedSampler {
common::Span<float> threshold, common::Span<float> grad_sum,
size_t sample_rows);
[[nodiscard]] bool ConcatPages() const { return this->strategy_->ConcatPages(); }
private:
common::Monitor monitor_;
std::unique_ptr<SamplingStrategy> strategy_;


@@ -23,6 +23,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
constexpr static std::size_t CudaDefaultNodes() { return static_cast<std::size_t>(1) << 12; }
bool debug_synchronize{false};
bool extmem_concat_pages{false};
void CheckTreesSynchronized(Context const* ctx, RegTree const* local_tree) const;
@@ -42,6 +43,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
.set_default(NotSet())
.set_lower_bound(1)
.describe("Maximum number of nodes in histogram cache.");
DMLC_DECLARE_FIELD(extmem_concat_pages).set_default(false);
}
};
} // namespace xgboost::tree
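Since this is a DMLC parameter, the updaters pick it up through the usual Args-based configuration path; a hedged sketch of that flow:

// Sketch: the flag arrives with the other updater arguments during Configure().
HistMakerTrainParam hist_param;
hist_param.UpdateAllowUnknown(Args{{"extmem_concat_pages", "true"}});
CHECK(hist_param.extmem_concat_pages);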


@@ -278,7 +278,7 @@ class GlobalApproxUpdater : public TreeUpdater {
*sampled = linalg::Empty<GradientPair>(ctx_, gpair->Size(), 1);
auto in = gpair->HostView().Values();
std::copy(in.data(), in.data() + in.size(), sampled->HostView().Values().data());
error::NoPageConcat(this->hist_param_.extmem_concat_pages);
SampleGradient(ctx_, param, sampled->HostView());
}


@@ -5,10 +5,11 @@
#include <limits> // for numeric_limits
#include <ostream> // for ostream
#include "gpu_hist/quantiser.cuh" // for GradientQuantiser
#include "param.h" // for TrainParam
#include "xgboost/base.h" // for bst_bin_t
#include "xgboost/task.h" // for ObjInfo
#include "../data/batch_utils.cuh" // for DftPrefetchBatches, StaticBatch
#include "gpu_hist/quantiser.cuh" // for GradientQuantiser
#include "param.h" // for TrainParam
#include "xgboost/base.h" // for bst_bin_t
#include "xgboost/task.h" // for ObjInfo
namespace xgboost::tree {
struct GPUTrainingParam {
@@ -119,26 +120,19 @@ struct DeviceSplitCandidate {
};
namespace cuda_impl {
constexpr auto DftPrefetchBatches() { return 2; }
inline BatchParam HistBatch(TrainParam const& param) {
auto p = BatchParam{param.max_bin, TrainParam::DftSparseThreshold()};
p.prefetch_copy = true;
p.n_prefetch_batches = DftPrefetchBatches();
p.n_prefetch_batches = data::cuda_impl::DftPrefetchBatches();
return p;
}
inline BatchParam ApproxBatch(TrainParam const& p, common::Span<float const> hess,
ObjInfo const& task) {
return BatchParam{p.max_bin, hess, !task.const_hess};
}
// Empty parameter to prevent regen, only used to control external memory prefetching.
inline BatchParam StaticBatch(bool prefetch_copy) {
BatchParam p;
p.prefetch_copy = prefetch_copy;
p.n_prefetch_batches = DftPrefetchBatches();
return p;
auto batch = BatchParam{p.max_bin, hess, !task.const_hess};
batch.prefetch_copy = true;
batch.n_prefetch_batches = data::cuda_impl::DftPrefetchBatches();
return batch;
}
} // namespace cuda_impl
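For contrast with StaticBatch above: HistBatch and ApproxBatch carry binning information, so passing them can trigger a rebuild of the quantized pages, while StaticBatch never does. An illustration (assuming a TrainParam `param`):

auto hist   = cuda_impl::HistBatch(param);         // carries max_bin -> may regenerate pages
auto fixed  = data::cuda_impl::StaticBatch(true);  // empty parameter -> reuse cached pages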


@@ -21,6 +21,7 @@
#include "../common/hist_util.h" // for HistogramCuts
#include "../common/random.h" // for ColumnSampler, GlobalRandom
#include "../common/timer.h"
#include "../data/batch_utils.cuh" // for StaticBatch
#include "../data/ellpack_page.cuh"
#include "../data/ellpack_page.h"
#include "constraints.cuh"
@@ -50,11 +51,7 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
using cuda_impl::ApproxBatch;
using cuda_impl::HistBatch;
// Both the approx and hist initializes the DMatrix before creating the actual
// implementation (InitDataOnce). Therefore, the `GPUHistMakerDevice` can use an empty
// parameter to avoid any regen.
using cuda_impl::StaticBatch;
using data::cuda_impl::StaticBatch;
// Extra data for each node that is passed to the update position function
struct NodeSplitData {
@@ -102,11 +99,11 @@ struct GPUHistMakerDevice {
std::vector<std::unique_ptr<RowPartitioner>> partitioners_;
DeviceHistogramBuilder histogram_;
std::vector<bst_idx_t> batch_ptr_;
std::vector<bst_idx_t> const batch_ptr_;
// node idx for each sample
dh::device_vector<bst_node_t> positions_;
HistMakerTrainParam const* hist_param_;
std::shared_ptr<common::HistogramCuts const> cuts_{nullptr};
std::shared_ptr<common::HistogramCuts const> const cuts_;
auto CreatePartitionNodes(RegTree const* p_tree, std::vector<GPUExpandEntry> const& candidates) {
std::vector<bst_node_t> nidx(candidates.size());
@@ -135,35 +132,35 @@ struct GPUHistMakerDevice {
dh::device_vector<int> monotone_constraints;
TrainParam param;
TrainParam const param;
std::unique_ptr<GradientQuantiser> quantiser;
dh::PinnedMemory pinned;
dh::PinnedMemory pinned2;
common::Monitor monitor;
FeatureInteractionConstraintDevice interaction_constraints;
std::unique_ptr<GradientBasedSampler> sampler;
std::unique_ptr<FeatureGroups> feature_groups;
common::Monitor monitor;
GPUHistMakerDevice(Context const* ctx, TrainParam _param, HistMakerTrainParam const* hist_param,
std::shared_ptr<common::ColumnSampler> column_sampler, BatchParam batch_param,
MetaInfo const& info, std::vector<bst_idx_t> batch_ptr,
std::shared_ptr<common::HistogramCuts const> cuts)
: evaluator_{_param, static_cast<bst_feature_t>(info.num_col_), ctx->Device()},
ctx_(ctx),
param(std::move(_param)),
column_sampler_(std::move(column_sampler)),
interaction_constraints(param, static_cast<bst_feature_t>(info.num_col_)),
ctx_{ctx},
column_sampler_{std::move(column_sampler)},
batch_ptr_{std::move(batch_ptr)},
hist_param_{hist_param},
cuts_{std::move(cuts)} {
this->sampler =
std::make_unique<GradientBasedSampler>(ctx, info.num_row_, batch_param, param.subsample,
param.sampling_method, batch_ptr_.size() > 2);
cuts_{std::move(cuts)},
param{std::move(_param)},
interaction_constraints(param, static_cast<bst_feature_t>(info.num_col_)),
sampler{std::make_unique<GradientBasedSampler>(
ctx, info.num_row_, batch_param, param.subsample, param.sampling_method,
batch_ptr_.size() > 2 && this->hist_param_->extmem_concat_pages)} {
if (!param.monotone_constraints.empty()) {
// Copy assigning an empty vector causes an exception in MSVC debug builds
monotone_constraints = param.monotone_constraints;
@@ -185,33 +182,31 @@ struct GPUHistMakerDevice {
}
// Reset values for each update iteration
[[nodiscard]] DMatrix* Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* p_fmat) {
[[nodiscard]] DMatrix* Reset(HostDeviceVector<GradientPair> const* dh_gpair, DMatrix* p_fmat) {
this->monitor.Start(__func__);
common::SetDevice(ctx_->Ordinal());
auto const& info = p_fmat->Info();
// backup the gradient
dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair, ctx_->CUDACtx()->Stream());
this->column_sampler_->Init(ctx_, p_fmat->Info().num_col_, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
this->interaction_constraints.Reset(ctx_);
this->evaluator_.Reset(this->ctx_, *cuts_, p_fmat->Info().feature_types.ConstDeviceSpan(),
p_fmat->Info().num_col_, this->param, p_fmat->Info().IsColumnSplit());
// Sampling
/**
* Sampling
*/
dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair, ctx_->CUDACtx()->Stream());
auto sample = this->sampler->Sample(ctx_, dh::ToSpan(d_gpair), p_fmat);
this->gpair = sample.gpair;
p_fmat = sample.p_fmat; // Update p_fmat before allocating partitioners
p_fmat = sample.p_fmat;
p_fmat->Info().feature_types.SetDevice(ctx_->Device());
std::size_t n_batches = p_fmat->NumBatches();
bool is_concat = (n_batches + 1) != this->batch_ptr_.size();
std::vector<bst_idx_t> batch_ptr{batch_ptr_};
/**
* Initialize the partitioners
*/
bool is_concat = sampler->ConcatPages();
std::size_t n_batches = is_concat ? 1 : p_fmat->NumBatches();
std::vector<bst_idx_t> batch_ptr{this->batch_ptr_};
if (is_concat) {
// Concatenate the batch ptrs as well.
batch_ptr = {static_cast<bst_idx_t>(0), p_fmat->Info().num_row_};
}
// Initialize partitions
if (!partitioners_.empty()) {
CHECK_EQ(partitioners_.size(), n_batches);
}
@@ -230,8 +225,20 @@ struct GPUHistMakerDevice {
CHECK_EQ(partitioners_.front()->Size(), p_fmat->Info().num_row_);
}
// Other initializations
quantiser = std::make_unique<GradientQuantiser>(ctx_, this->gpair, p_fmat->Info());
/**
* Initialize the evaluator
*/
this->column_sampler_->Init(ctx_, info.num_col_, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
this->interaction_constraints.Reset(ctx_);
this->evaluator_.Reset(this->ctx_, *cuts_, info.feature_types.ConstDeviceSpan(), info.num_col_,
this->param, info.IsColumnSplit());
/**
* Other initializations
*/
this->quantiser = std::make_unique<GradientQuantiser>(ctx_, this->gpair, p_fmat->Info());
this->InitFeatureGroupsOnce(info);
@@ -327,8 +334,8 @@ struct GPUHistMakerDevice {
auto d_ridx = partitioners_.at(k)->GetRows(nidx);
this->histogram_.BuildHistogram(ctx_->CUDACtx(), acc,
feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx,
d_node_hist, *quantiser);
feature_groups->DeviceAccessor(ctx_->Device()), this->gpair,
d_ridx, d_node_hist, *quantiser);
monitor.Stop(__func__);
}
@@ -678,11 +685,11 @@ struct GPUHistMakerDevice {
constexpr bst_node_t kRootNIdx = RegTree::kRoot;
auto quantiser = *this->quantiser;
auto gpair_it = dh::MakeTransformIterator<GradientPairInt64>(
dh::tbegin(gpair),
dh::tbegin(this->gpair),
[=] __device__(auto const& gpair) { return quantiser.ToFixedPoint(gpair); });
GradientPairInt64 root_sum_quantised =
dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(), GradientPairInt64{},
thrust::plus<GradientPairInt64>{});
dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + this->gpair.size(),
GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
using ReduceT = typename decltype(root_sum_quantised)::ValueT;
auto rc = collective::GlobalSum(
ctx_, p_fmat->Info(), linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));


@@ -539,6 +539,7 @@ class QuantileHistMaker : public TreeUpdater {
// Copy gradient into buffer for sampling. This converts C-order to F-order.
std::copy(linalg::cbegin(h_gpair), linalg::cend(h_gpair), linalg::begin(h_sample_out));
}
error::NoPageConcat(this->hist_param_.extmem_concat_pages);
SampleGradient(ctx_, *param, h_sample_out);
auto *h_out_position = &out_position[tree_it - trees.begin()];
if ((*tree_it)->IsMultiTarget()) {