[EM] Make page concatenation optional. (#10826)
This PR introduces a new parameter `extmem_concat_pages` to make page concatenation optional for the GPU hist tree method. In addition, the documentation is updated for the new GPU-based external memory support.
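For context, a minimal sketch of how the new flag could be enabled through the C API (not part of this diff; the external-memory `DMatrixHandle` is assumed to be created elsewhere, and the parameter name is the one declared in `HistMakerTrainParam` below):

```cpp
#include <xgboost/c_api.h>

// Sketch: turn on page concatenation for GPU hist. The guards added in this PR
// (error::NoPageConcat) make this a fatal error when sampling is disabled or
// when training runs on the CPU.
void ConfigureBooster(DMatrixHandle dmat, BoosterHandle* out) {
  XGBoosterCreate(&dmat, 1, out);
  XGBoosterSetParam(*out, "device", "cuda");
  XGBoosterSetParam(*out, "tree_method", "hist");
  XGBoosterSetParam(*out, "subsample", "0.8");                  // sampling must be active
  XGBoosterSetParam(*out, "sampling_method", "gradient_based");
  XGBoosterSetParam(*out, "extmem_concat_pages", "true");       // new in this PR
}
```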
@@ -106,9 +106,10 @@ inline auto NoCategorical(std::string name) {
   return name + " doesn't support categorical features.";
 }
 
-inline void NoOnHost(bool on_host) {
-  if (on_host) {
-    LOG(FATAL) << "Caching on host memory is only available for GPU.";
+inline void NoPageConcat(bool concat_pages) {
+  if (concat_pages) {
+    LOG(FATAL) << "`extmem_concat_pages` must be false when there's no sampling or when it's "
+                  "running on the CPU.";
   }
 }
 }  // namespace xgboost::error
src/data/batch_utils.cuh (new file, 24 lines)
@@ -0,0 +1,24 @@
+/**
+ * Copyright 2024, XGBoost Contributors
+ */
+#pragma once
+
+#include "xgboost/data.h"  // for BatchParam
+
+namespace xgboost::data::cuda_impl {
+// Use two batches for prefetching. There's always one batch being worked on, while the other
+// batch is being transferred.
+constexpr auto DftPrefetchBatches() { return 2; }
+
+// Empty parameter to prevent regen, only used to control external memory prefetching.
+//
+// Both the approx and hist initialize the DMatrix before creating the actual
+// implementation (InitDataOnce). Therefore, the `GPUHistMakerDevice` can use an empty
+// parameter to avoid any regen.
+inline BatchParam StaticBatch(bool prefetch_copy) {
+  BatchParam p;
+  p.prefetch_copy = prefetch_copy;
+  p.n_prefetch_batches = DftPrefetchBatches();
+  return p;
+}
+}  // namespace xgboost::data::cuda_impl
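As a rough illustration of how this new helper is consumed elsewhere in the PR (a sketch only; the `Context` and external-memory `DMatrix` are assumed to come from the caller):

```cpp
#include "xgboost/data.h"  // for DMatrix, EllpackPage (include paths assumed)

namespace xgboost {
void WalkPages(Context const* ctx, DMatrix* p_fmat) {
  // An empty parameter avoids regenerating the cache; prefetch_copy=true asks
  // the source to copy each page to device while the previous one is in use.
  auto param = data::cuda_impl::StaticBatch(/*prefetch_copy=*/true);
  for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx, param)) {
    (void)page;  // process one ELLPACK page at a time
  }
}
}  // namespace xgboost
```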
@@ -920,7 +920,8 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
                                  data::fileiter::Next,
                                  std::numeric_limits<float>::quiet_NaN(),
                                  1,
-                                 cache_file};
+                                 cache_file,
+                                 false};
   }
 
   return dmat;
@@ -10,7 +10,7 @@
 #include <utility>  // for move
 #include <vector>   // for vector
 
-#include "../common/cuda_rt_utils.h"  // for SupportsPageableMem
+#include "../common/cuda_rt_utils.h"  // for SupportsPageableMem, SupportsAts
 #include "../common/hist_util.h"      // for HistogramCuts
 #include "ellpack_page.h"             // for EllpackPage
 #include "ellpack_page_raw_format.h"  // for EllpackPageRawFormat
@@ -67,7 +67,20 @@ class EllpackFormatPolicy {
   using FormatT = EllpackPageRawFormat;
 
  public:
-  EllpackFormatPolicy() = default;
+  EllpackFormatPolicy() {
+    StringView msg{" The overhead of iterating through external memory might be significant."};
+    if (!has_hmm_) {
+      LOG(WARNING) << "CUDA heterogeneous memory management is not available." << msg;
+    } else if (!common::SupportsAts()) {
+      LOG(WARNING) << "CUDA address translation service is not available." << msg;
+    }
+#if !defined(XGBOOST_USE_RMM)
+    LOG(WARNING) << "XGBoost is not built with RMM support." << msg;
+#endif
+    if (!GlobalConfigThreadLocalStore::Get()->use_rmm) {
+      LOG(WARNING) << "`use_rmm` is set to false." << msg;
+    }
+  }
   // For testing with the HMM flag.
   explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {}
 
@@ -135,6 +148,9 @@ class EllpackMmapStreamPolicy : public F<S> {
                                 bst_idx_t length) const;
 };
 
+/**
+ * @brief Ellpack source with sparse pages as the underlying source.
+ */
 template <typename F>
 class EllpackPageSourceImpl : public PageSourceIncMixIn<EllpackPage, F> {
   using Super = PageSourceIncMixIn<EllpackPage, F>;
@@ -171,6 +187,9 @@ using EllpackPageHostSource =
 using EllpackPageSource =
     EllpackPageSourceImpl<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
 
+/**
+ * @brief Ellpack source directly interfaces with user-defined iterators.
+ */
 template <typename FormatCreatePolicy>
 class ExtEllpackPageSourceImpl : public ExtQantileSourceMixin<EllpackPage, FormatCreatePolicy> {
   using Super = ExtQantileSourceMixin<EllpackPage, FormatCreatePolicy>;
@@ -201,6 +220,7 @@ class ExtEllpackPageSourceImpl : public ExtQantileSourceMixin<EllpackPage, Forma
         info_{info},
         ext_info_{std::move(ext_info)},
         base_rows_{std::move(base_rows)} {
+    cuts->SetDevice(ctx->Device());
     this->SetCuts(std::move(cuts), ctx->Device());
     this->Fetch();
   }
@@ -13,6 +13,7 @@
 #include "proxy_dmatrix.h"          // for DataIterProxy, HostAdapterDispatch
 #include "quantile_dmatrix.h"       // for GetDataShape, MakeSketches
 #include "simple_batch_iterator.h"  // for SimpleBatchIteratorImpl
+#include "sparse_page_source.h"     // for MakeCachePrefix
 
 #if !defined(XGBOOST_USE_CUDA)
 #include "../common/common.h"  // for AssertGPUSupport
@@ -26,6 +27,7 @@ ExtMemQuantileDMatrix::ExtMemQuantileDMatrix(DataIterHandle iter_handle, DMatrix
                                              std::int32_t n_threads, std::string cache,
                                              bst_bin_t max_bin, bool on_host)
     : cache_prefix_{std::move(cache)}, on_host_{on_host} {
+  cache_prefix_ = MakeCachePrefix(cache_prefix_);
   auto iter = std::make_shared<DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>>(
       iter_handle, reset, next);
   iter->Reset();
@@ -13,9 +13,9 @@
 #include <utility>  // for move
 #include <variant>  // for visit
 
-#include "../collective/communicator-inl.h"
-#include "batch_utils.h"  // for RegenGHist
-#include "gradient_index.h"
+#include "batch_utils.h"         // for RegenGHist
+#include "gradient_index.h"      // for GHistIndexMatrix
+#include "sparse_page_source.h"  // for MakeCachePrefix
 
 namespace xgboost::data {
 MetaInfo &SparsePageDMatrix::Info() { return info_; }
@@ -34,12 +34,9 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
       cache_prefix_{std::move(cache_prefix)},
       on_host_{on_host} {
   Context ctx;
-  ctx.nthread = nthreads;
+  ctx.Init(Args{{"nthread", std::to_string(nthreads)}});
+  cache_prefix_ = MakeCachePrefix(cache_prefix_);
 
-  cache_prefix_ = cache_prefix_.empty() ? "DMatrix" : cache_prefix_;
-  if (collective::IsDistributed()) {
-    cache_prefix_ += ("-r" + std::to_string(collective::GetRank()));
-  }
   DMatrixProxy *proxy = MakeProxy(proxy_);
   auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{
       iter_, reset_, next_};
@@ -107,7 +104,6 @@ BatchSet<SparsePage> SparsePageDMatrix::GetRowBatches() {
 BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches(Context const *ctx) {
   auto id = MakeCache(this, ".col.page", on_host_, cache_prefix_, &cache_info_);
   CHECK_NE(this->Info().num_col_, 0);
-  error::NoOnHost(on_host_);
   this->InitializeSparsePage(ctx);
   if (!column_source_) {
     column_source_ =
@@ -122,7 +118,6 @@ BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches(Context const *ctx) {
 BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches(Context const *ctx) {
   auto id = MakeCache(this, ".sorted.col.page", on_host_, cache_prefix_, &cache_info_);
   CHECK_NE(this->Info().num_col_, 0);
-  error::NoOnHost(on_host_);
   this->InitializeSparsePage(ctx);
   if (!sorted_column_source_) {
     sorted_column_source_ = std::make_shared<SortedCSCPageSource>(
@@ -140,7 +135,6 @@ BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(Context const *ct
     CHECK_GE(param.max_bin, 2);
   }
   detail::CheckEmpty(batch_param_, param);
-  error::NoOnHost(on_host_);
   auto id = MakeCache(this, ".gradient_index.page", on_host_, cache_prefix_, &cache_info_);
   if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) {
     this->InitializeSparsePage(ctx);
@@ -70,10 +70,10 @@ class SparsePageDMatrix : public DMatrix {
   DataIterResetCallback *reset_;
   XGDMatrixCallbackNext *next_;
 
-  float missing_;
+  float const missing_;
   Context fmat_ctx_;
   std::string cache_prefix_;
-  bool on_host_{false};
+  bool const on_host_;
   std::uint32_t n_batches_{0};
   // sparse page is the source to other page types, we make a special member function.
   void InitializeSparsePage(Context const *ctx);
@@ -83,7 +83,7 @@ class SparsePageDMatrix : public DMatrix {
  public:
  explicit SparsePageDMatrix(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
                             XGDMatrixCallbackNext *next, float missing, int32_t nthreads,
-                            std::string cache_prefix, bool on_host = false);
+                            std::string cache_prefix, bool on_host);
 
  ~SparsePageDMatrix() override;
 
@@ -54,22 +54,18 @@ class SparsePageRawFormat : public SparsePageFormat<T> {
  private:
 };
 
-XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)
-    .describe("Raw binary data format.")
-    .set_body([]() {
-      return new SparsePageRawFormat<SparsePage>();
-    });
+#define SparsePageFmt SparsePageFormat<SparsePage>
+DMLC_REGISTRY_REGISTER(SparsePageFormatReg<SparsePage>, SparsePageFmt, raw)
+    .describe("Raw binary data format.")
+    .set_body([]() { return new SparsePageRawFormat<SparsePage>(); });
 
-XGBOOST_REGISTER_CSC_PAGE_FORMAT(raw)
-    .describe("Raw binary data format.")
-    .set_body([]() {
-      return new SparsePageRawFormat<CSCPage>();
-    });
-
-XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(raw)
-    .describe("Raw binary data format.")
-    .set_body([]() {
-      return new SparsePageRawFormat<SortedCSCPage>();
-    });
+#define CSCPageFmt SparsePageFormat<CSCPage>
+DMLC_REGISTRY_REGISTER(SparsePageFormatReg<CSCPage>, CSCPageFmt, raw)
+    .describe("Raw binary data format.")
+    .set_body([]() { return new SparsePageRawFormat<CSCPage>(); });
 
+#define SortedCSCPageFmt SparsePageFormat<SortedCSCPage>
+DMLC_REGISTRY_REGISTER(SparsePageFormatReg<SortedCSCPage>, SortedCSCPageFmt, raw)
+    .describe("Raw binary data format.")
+    .set_body([]() { return new SparsePageRawFormat<SortedCSCPage>(); });
 }  // namespace xgboost::data
@@ -8,6 +8,8 @@
 #include <numeric>  // for partial_sum
 #include <string>   // for string
 
+#include "../collective/communicator-inl.h"  // for IsDistributed, GetRank
+
 namespace xgboost::data {
 void Cache::Commit() {
   if (!this->written) {
@@ -28,6 +30,14 @@ void TryDeleteCacheFile(const std::string& file) {
   }
 }
 
+std::string MakeCachePrefix(std::string cache_prefix) {
+  cache_prefix = cache_prefix.empty() ? "DMatrix" : cache_prefix;
+  if (collective::IsDistributed()) {
+    cache_prefix += ("-r" + std::to_string(collective::GetRank()));
+  }
+  return cache_prefix;
+}
+
 #if !defined(XGBOOST_USE_CUDA)
 void InitNewThread::operator()() const { *GlobalConfigThreadLocalStore::Get() = config; }
 #endif
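A quick illustration of the helper's behaviour (a local re-statement for the single-process branch; in a distributed run the real function also appends the rank suffix via `collective::GetRank()`):

```cpp
#include <cassert>
#include <string>

// Mirrors MakeCachePrefix for the non-distributed case only.
std::string MakeCachePrefixLocal(std::string cache_prefix) {
  return cache_prefix.empty() ? "DMatrix" : cache_prefix;
}

int main() {
  assert(MakeCachePrefixLocal("") == "DMatrix");     // default prefix
  assert(MakeCachePrefixLocal("cache") == "cache");  // user-supplied prefix
  // A distributed run of rank 3 would yield "cache-r3" instead.
  return 0;
}
```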
@@ -33,6 +33,8 @@
 namespace xgboost::data {
 void TryDeleteCacheFile(const std::string& file);
 
+std::string MakeCachePrefix(std::string cache_prefix);
+
 /**
  * @brief Information about the cache including path and page offsets.
  */
@@ -1,5 +1,5 @@
 /**
- * Copyright 2014-2023, XGBoost Contributors
+ * Copyright 2014-2024, XGBoost Contributors
  * \file sparse_page_writer.h
  * \author Tianqi Chen
  */
@@ -11,7 +11,6 @@
 
 #include "../common/io.h"   // for AlignedResourceReadStream, AlignedFileWriteStream
 #include "dmlc/registry.h"  // for Registry, FunctionRegEntryBase
 #include "xgboost/data.h"   // for SparsePage,CSCPage,SortedCSCPage,EllpackPage ...
-
 namespace xgboost::data {
 template<typename T>
@@ -54,47 +53,13 @@ inline SparsePageFormat<T>* CreatePageFormat(const std::string& name) {
   return (e->body)();
 }
 
-/*!
- * \brief Registry entry for sparse page format.
+/**
+ * @brief Registry entry for sparse page format.
  */
 template<typename T>
 struct SparsePageFormatReg
     : public dmlc::FunctionRegEntryBase<SparsePageFormatReg<T>,
                                         std::function<SparsePageFormat<T>* ()>> {
 };
 
-/*!
- * \brief Macro to register sparse page format.
- *
- * \code
- * // example of registering a objective
- * XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)
- * .describe("Raw binary data format.")
- * .set_body([]() {
- *     return new RawFormat();
- *   });
- * \endcode
- */
-#define SparsePageFmt SparsePageFormat<SparsePage>
-#define XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(Name) \
-  DMLC_REGISTRY_REGISTER(SparsePageFormatReg<SparsePage>, SparsePageFmt, Name)
-
-#define CSCPageFmt SparsePageFormat<CSCPage>
-#define XGBOOST_REGISTER_CSC_PAGE_FORMAT(Name) \
-  DMLC_REGISTRY_REGISTER(SparsePageFormatReg<CSCPage>, CSCPageFmt, Name)
-
-#define SortedCSCPageFmt SparsePageFormat<SortedCSCPage>
-#define XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(Name) \
-  DMLC_REGISTRY_REGISTER(SparsePageFormatReg<SortedCSCPage>, SortedCSCPageFmt, Name)
-
-#define EllpackPageFmt SparsePageFormat<EllpackPage>
-#define XGBOOST_REGISTER_ELLPACK_PAGE_FORMAT(Name) \
-  DMLC_REGISTRY_REGISTER(SparsePageFormatReg<EllpackPage>, EllpackPageFmt, Name)
-
-#define GHistIndexPageFmt SparsePageFormat<GHistIndexMatrix>
-#define XGBOOST_REGISTER_GHIST_INDEX_PAGE_FORMAT(Name) \
-  DMLC_REGISTRY_REGISTER(SparsePageFormatReg<GHistIndexMatrix>, \
-                         GHistIndexPageFmt, Name)
-
 }  // namespace xgboost::data
 #endif  // XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
@@ -14,9 +14,10 @@
 #include "../common/categorical.h"
 #include "../common/common.h"
 #include "../common/cuda_context.cuh"  // for CUDAContext
-#include "../common/cuda_rt_utils.h"   // for AllVisibleGPUs
+#include "../common/cuda_rt_utils.h"   // for AllVisibleGPUs, SetDevice
 #include "../common/device_helpers.cuh"
-#include "../common/error_msg.h"  // for InplacePredictProxy
+#include "../common/error_msg.h"    // for InplacePredictProxy
+#include "../data/batch_utils.cuh"  // for StaticBatch
 #include "../data/device_adapter.cuh"
 #include "../data/ellpack_page.cuh"
 #include "../data/proxy_dmatrix.h"
@@ -31,6 +32,8 @@
 namespace xgboost::predictor {
 DMLC_REGISTRY_FILE_TAG(gpu_predictor);
 
+using data::cuda_impl::StaticBatch;
+
 struct TreeView {
   RegTree::CategoricalSplitMatrix cats;
   common::Span<RegTree::Node const> d_tree;
@@ -475,15 +478,14 @@ struct PathInfo {
 };
 
 // Transform model into path element form for GPUTreeShap
-void ExtractPaths(
-    dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
-    DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
-    DeviceOrd device) {
-  dh::safe_cuda(cudaSetDevice(device.ordinal));
+void ExtractPaths(Context const* ctx,
+                  dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>* paths,
+                  DeviceModel* model, dh::device_vector<uint32_t>* path_categories,
+                  DeviceOrd device) {
+  common::SetDevice(device.ordinal);
   auto& device_model = *model;
 
   dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
-  dh::XGBCachingDeviceAllocator<PathInfo> alloc;
   auto d_nodes = device_model.nodes.ConstDeviceSpan();
   auto d_tree_segments = device_model.tree_segments.ConstDeviceSpan();
   auto nodes_transform = dh::MakeTransformIterator<PathInfo>(
@@ -502,17 +504,15 @@ void ExtractPaths(
     }
     return PathInfo{static_cast<int64_t>(idx), path_length, tree_idx};
   });
-  auto end = thrust::copy_if(
-      thrust::cuda::par(alloc), nodes_transform,
-      nodes_transform + d_nodes.size(), info.begin(),
-      [=] __device__(const PathInfo& e) { return e.leaf_position != -1; });
+  auto end = thrust::copy_if(ctx->CUDACtx()->CTP(), nodes_transform,
+                             nodes_transform + d_nodes.size(), info.begin(),
+                             [=] __device__(const PathInfo& e) { return e.leaf_position != -1; });
   info.resize(end - info.begin());
   auto length_iterator = dh::MakeTransformIterator<size_t>(
       info.begin(),
       [=] __device__(const PathInfo& info) { return info.length; });
   dh::caching_device_vector<size_t> path_segments(info.size() + 1);
-  thrust::exclusive_scan(thrust::cuda::par(alloc), length_iterator,
-                         length_iterator + info.size() + 1,
+  thrust::exclusive_scan(ctx->CUDACtx()->CTP(), length_iterator, length_iterator + info.size() + 1,
                          path_segments.begin());
 
   paths->resize(path_segments.back());
@@ -528,19 +528,17 @@ void ExtractPaths(
   auto d_cat_node_segments = device_model.categories_node_segments.ConstDeviceSpan();
 
   size_t max_cat = 0;
-  if (thrust::any_of(dh::tbegin(d_split_types), dh::tend(d_split_types),
+  if (thrust::any_of(ctx->CUDACtx()->CTP(), dh::tbegin(d_split_types), dh::tend(d_split_types),
                      common::IsCatOp{})) {
     dh::PinnedMemory pinned;
     auto h_max_cat = pinned.GetSpan<RegTree::CategoricalSplitMatrix::Segment>(1);
     auto max_elem_it = dh::MakeTransformIterator<size_t>(
         dh::tbegin(d_cat_node_segments),
         [] __device__(RegTree::CategoricalSplitMatrix::Segment seg) { return seg.size; });
-    size_t max_cat_it =
-        thrust::max_element(thrust::device, max_elem_it,
-                            max_elem_it + d_cat_node_segments.size()) -
-        max_elem_it;
-    dh::safe_cuda(cudaMemcpy(h_max_cat.data(),
-                             d_cat_node_segments.data() + max_cat_it,
+    size_t max_cat_it = thrust::max_element(ctx->CUDACtx()->CTP(), max_elem_it,
+                                            max_elem_it + d_cat_node_segments.size()) -
+                        max_elem_it;
+    dh::safe_cuda(cudaMemcpy(h_max_cat.data(), d_cat_node_segments.data() + max_cat_it,
                              h_max_cat.size_bytes(), cudaMemcpyDeviceToHost));
     max_cat = h_max_cat[0].size;
     CHECK_GE(max_cat, 1);
@@ -550,7 +548,7 @@ void ExtractPaths(
   auto d_model_categories = device_model.categories.DeviceSpan();
   common::Span<uint32_t> d_path_categories = dh::ToSpan(*path_categories);
 
-  dh::LaunchN(info.size(), [=] __device__(size_t idx) {
+  dh::LaunchN(info.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t idx) {
     auto path_info = d_info[idx];
     size_t tree_offset = d_tree_segments[path_info.tree_idx];
     TreeView tree{0, path_info.tree_idx, d_nodes,
@@ -864,7 +862,7 @@ class GPUPredictor : public xgboost::Predictor {
       SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
                           num_features);
       auto const kernel = [&](auto predict_fn) {
-        dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes}(
+        dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes, ctx_->CUDACtx()->Stream()}(
            predict_fn, data, model.nodes.ConstDeviceSpan(),
            predictions->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
            model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
@@ -888,7 +886,7 @@ class GPUPredictor : public xgboost::Predictor {
     DeviceModel d_model;
 
     bool use_shared = false;
-    dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS}(
+    dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, 0, ctx_->CUDACtx()->Stream()}(
        PredictKernel<EllpackLoader, EllpackDeviceAccessor>, batch, model.nodes.ConstDeviceSpan(),
        out_preds->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
        model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
@@ -924,7 +922,7 @@ class GPUPredictor : public xgboost::Predictor {
       }
     } else {
       bst_idx_t batch_offset = 0;
-      for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
+      for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
        dmat->Info().feature_types.SetDevice(ctx_->Device());
        auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
        this->PredictInternal(page.Impl()->GetDeviceAccessor(ctx_, feature_types), d_model,
@@ -989,7 +987,7 @@ class GPUPredictor : public xgboost::Predictor {
 
     bool use_shared = shared_memory_bytes != 0;
 
-    dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes}(
+    dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes, ctx_->CUDACtx()->Stream()}(
        PredictKernel<Loader, typename Loader::BatchT>, m->Value(), d_model.nodes.ConstDeviceSpan(),
        out_preds->predictions.DeviceSpan(), d_model.tree_segments.ConstDeviceSpan(),
        d_model.tree_group.ConstDeviceSpan(), d_model.split_types.ConstDeviceSpan(),
@@ -1055,7 +1053,7 @@ class GPUPredictor : public xgboost::Predictor {
     DeviceModel d_model;
     d_model.Init(model, 0, tree_end, ctx_->Device());
     dh::device_vector<uint32_t> categories;
-    ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
+    ExtractPaths(ctx_, &device_paths, &d_model, &categories, ctx_->Device());
     if (p_fmat->PageExists<SparsePage>()) {
       for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
        batch.data.SetDevice(ctx_->Device());
@@ -1067,7 +1065,7 @@ class GPUPredictor : public xgboost::Predictor {
             X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
       }
     } else {
-      for (auto& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
+      for (auto& batch : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
        EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_)};
        auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
                               std::numeric_limits<float>::quiet_NaN()};
@@ -1083,7 +1081,7 @@ class GPUPredictor : public xgboost::Predictor {
 
     auto base_score = model.learner_model_param->BaseScore(ctx_);
     dh::LaunchN(p_fmat->Info().num_row_ * model.learner_model_param->num_output_group,
-                [=] __device__(size_t idx) {
+                ctx_->CUDACtx()->Stream(), [=] __device__(size_t idx) {
                  phis[(idx + 1) * contributions_columns - 1] +=
                      margin.empty() ? base_score(0) : margin[idx];
                });
@@ -1125,7 +1123,7 @@ class GPUPredictor : public xgboost::Predictor {
     DeviceModel d_model;
     d_model.Init(model, 0, tree_end, ctx_->Device());
     dh::device_vector<uint32_t> categories;
-    ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
+    ExtractPaths(ctx_, &device_paths, &d_model, &categories, ctx_->Device());
     if (p_fmat->PageExists<SparsePage>()) {
       for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
        batch.data.SetDevice(ctx_->Device());
@@ -1137,7 +1135,7 @@ class GPUPredictor : public xgboost::Predictor {
             X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
       }
     } else {
-      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
+      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
        auto impl = batch.Impl();
        auto acc = impl->GetDeviceAccessor(ctx_, p_fmat->Info().feature_types.ConstDeviceSpan());
        auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
@@ -1155,7 +1153,7 @@ class GPUPredictor : public xgboost::Predictor {
     auto base_score = model.learner_model_param->BaseScore(ctx_);
     size_t n_features = model.learner_model_param->num_feature;
     dh::LaunchN(p_fmat->Info().num_row_ * model.learner_model_param->num_output_group,
-                [=] __device__(size_t idx) {
+                ctx_->CUDACtx()->Stream(), [=] __device__(size_t idx) {
                  size_t group = idx % ngroup;
                  size_t row_idx = idx / ngroup;
                  phis[gpu_treeshap::IndexPhiInteractions(row_idx, ngroup, group, n_features,
@@ -1199,7 +1197,7 @@ class GPUPredictor : public xgboost::Predictor {
     bst_feature_t num_features = info.num_col_;
 
     auto launch = [&](auto fn, std::uint32_t grid, auto data, bst_idx_t batch_offset) {
-      dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes}(
+      dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes, ctx_->CUDACtx()->Stream()}(
          fn, data, d_model.nodes.ConstDeviceSpan(),
          predictions->DeviceSpan().subspan(batch_offset), d_model.tree_segments.ConstDeviceSpan(),
 
@@ -1223,7 +1221,7 @@ class GPUPredictor : public xgboost::Predictor {
       }
     } else {
       bst_idx_t batch_offset = 0;
-      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
+      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
        EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_)};
        auto grid = static_cast<std::uint32_t>(common::DivRoundUp(batch.Size(), kBlockThreads));
        launch(PredictLeafKernel<EllpackLoader, EllpackDeviceAccessor>, grid, data, batch_offset);
@@ -148,19 +148,8 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
   CombineGradientPair combine_;
 };
 
-NoSampling::NoSampling(BatchParam batch_param) : batch_param_(std::move(batch_param)) {}
-
 GradientBasedSample NoSampling::Sample(Context const*, common::Span<GradientPair> gpair,
-                                       DMatrix* dmat) {
-  return {dmat, gpair};
-}
-
-ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param)
-    : batch_param_{std::move(batch_param)} {}
-
-GradientBasedSample ExternalMemoryNoSampling::Sample(Context const*,
-                                                     common::Span<GradientPair> gpair,
-                                                     DMatrix* p_fmat) {
+                                       DMatrix* p_fmat) {
   return {p_fmat, gpair};
 }
@@ -246,9 +235,10 @@ GradientBasedSampling::GradientBasedSampling(std::size_t n_rows, BatchParam batc
       grad_sum_(n_rows, 0.0f) {}
 
 GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
-                                                  common::Span<GradientPair> gpair, DMatrix* dmat) {
+                                                  common::Span<GradientPair> gpair,
+                                                  DMatrix* p_fmat) {
   auto cuctx = ctx->CUDACtx();
-  size_t n_rows = dmat->Info().num_row_;
+  size_t n_rows = p_fmat->Info().num_row_;
   size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
       ctx, gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
 
@@ -257,7 +247,7 @@ GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
       thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
       PoissonSampling(dh::ToSpan(threshold_), threshold_index,
                       RandomWeight(common::GlobalRandom()())));
-  return {dmat, gpair};
+  return {p_fmat, gpair};
 }
 
 ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t n_rows,
@@ -323,46 +313,46 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
 
 GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows,
                                            const BatchParam& batch_param, float subsample,
-                                           int sampling_method, bool is_external_memory) {
+                                           int sampling_method, bool concat_pages) {
   // The ctx is kept here for future development of stream-based operations.
-  monitor_.Init("gradient_based_sampler");
+  monitor_.Init(__func__);
 
   bool is_sampling = subsample < 1.0;
 
-  if (is_sampling) {
-    switch (sampling_method) {
-      case TrainParam::kUniform:
-        if (is_external_memory) {
-          strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
-        } else {
-          strategy_.reset(new UniformSampling(batch_param, subsample));
-        }
-        break;
-      case TrainParam::kGradientBased:
-        if (is_external_memory) {
-          strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
-        } else {
-          strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
-        }
-        break;
-      default:
-        LOG(FATAL) << "unknown sampling method";
-    }
-  } else {
-    if (is_external_memory) {
-      strategy_.reset(new ExternalMemoryNoSampling(batch_param));
-    } else {
-      strategy_.reset(new NoSampling(batch_param));
+  if (!is_sampling) {
+    strategy_.reset(new NoSampling{});
+    error::NoPageConcat(concat_pages);
+    return;
+  }
+
+  switch (sampling_method) {
+    case TrainParam::kUniform: {
+      if (concat_pages) {
+        strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
+      } else {
+        strategy_.reset(new UniformSampling(batch_param, subsample));
+      }
+      break;
+    }
+    case TrainParam::kGradientBased: {
+      if (concat_pages) {
+        strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
+      } else {
+        strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
+      }
+      break;
+    }
+    default:
+      LOG(FATAL) << "unknown sampling method";
   }
 }
 
 // Sample a DMatrix based on the given gradient pairs.
 GradientBasedSample GradientBasedSampler::Sample(Context const* ctx,
                                                  common::Span<GradientPair> gpair, DMatrix* dmat) {
-  monitor_.Start("Sample");
+  monitor_.Start(__func__);
   GradientBasedSample sample = strategy_->Sample(ctx, gpair, dmat);
-  monitor_.Stop("Sample");
+  monitor_.Stop(__func__);
   return sample;
 }
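To summarize the dispatch above, the strategy is now selected by the (sampling method, `concat_pages`) pair instead of (sampling method, `is_external_memory`); a condensed view (illustrative comment, not part of the patch):

```cpp
// subsample == 1.0                               -> NoSampling; NoPageConcat(concat_pages) must pass
// subsample < 1.0, kUniform,        concat_pages -> ExternalMemoryUniformSampling
// subsample < 1.0, kUniform,       !concat_pages -> UniformSampling
// subsample < 1.0, kGradientBased,  concat_pages -> ExternalMemoryGradientBasedSampling
// subsample < 1.0, kGradientBased, !concat_pages -> GradientBasedSampling
```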
@@ -24,31 +24,29 @@ class SamplingStrategy {
   virtual GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                                      DMatrix* dmat) = 0;
   virtual ~SamplingStrategy() = default;
+  /**
+   * @brief Whether pages are concatenated after sampling.
+   */
+  [[nodiscard]] virtual bool ConcatPages() const { return false; }
 };
 
-/*! \brief No sampling in in-memory mode. */
+class ExtMemSamplingStrategy : public SamplingStrategy {
+ public:
+  [[nodiscard]] bool ConcatPages() const final { return true; }
+};
+
+/**
+ * @brief No-op.
+ */
 class NoSampling : public SamplingStrategy {
  public:
-  explicit NoSampling(BatchParam batch_param);
   GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                              DMatrix* dmat) override;
-
- private:
-  BatchParam batch_param_;
 };
 
-/*! \brief No sampling in external memory mode. */
-class ExternalMemoryNoSampling : public SamplingStrategy {
- public:
-  explicit ExternalMemoryNoSampling(BatchParam batch_param);
-  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
-                             DMatrix* dmat) override;
-
- private:
-  BatchParam batch_param_;
-};
-
-/*! \brief Uniform sampling in in-memory mode. */
+/**
+ * @brief Uniform sampling in in-memory mode.
+ */
 class UniformSampling : public SamplingStrategy {
  public:
   UniformSampling(BatchParam batch_param, float subsample);
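The `ConcatPages` hook replaces the ad-hoc external-memory bookkeeping: strategies deriving from `ExtMemSamplingStrategy` report `true`, everything else inherits the default `false`. A self-contained sketch of the same pattern (stand-in types, not the real classes):

```cpp
#include <memory>

struct Strategy {
  virtual ~Strategy() = default;
  // Default: pages are left as-is after sampling.
  virtual bool ConcatPages() const { return false; }
};

struct ExtMemStrategy : Strategy {
  // External-memory strategies write sampled rows into one concatenated page.
  bool ConcatPages() const final { return true; }
};

int main() {
  std::unique_ptr<Strategy> s = std::make_unique<ExtMemStrategy>();
  return s->ConcatPages() ? 0 : 1;  // the updater branches on this, cf. Reset() below
}
```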
@@ -61,7 +59,7 @@ class UniformSampling : public SamplingStrategy {
 };
 
 /*! \brief No sampling in external memory mode. */
-class ExternalMemoryUniformSampling : public SamplingStrategy {
+class ExternalMemoryUniformSampling : public ExtMemSamplingStrategy {
  public:
   ExternalMemoryUniformSampling(size_t n_rows, BatchParam batch_param, float subsample);
   GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
@@ -91,7 +89,7 @@ class GradientBasedSampling : public SamplingStrategy {
 };
 
 /*! \brief Gradient-based sampling in external memory mode. */
-class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
+class ExternalMemoryGradientBasedSampling : public ExtMemSamplingStrategy {
  public:
   ExternalMemoryGradientBasedSampling(size_t n_rows, BatchParam batch_param, float subsample);
   GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
@@ -120,7 +118,7 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
 class GradientBasedSampler {
  public:
   GradientBasedSampler(Context const* ctx, size_t n_rows, const BatchParam& batch_param,
-                       float subsample, int sampling_method, bool is_external_memory);
+                       float subsample, int sampling_method, bool concat_pages);
 
   /*! \brief Sample from a DMatrix based on the given gradient pairs. */
   GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair, DMatrix* dmat);
@@ -130,6 +128,8 @@ class GradientBasedSampler {
                                            common::Span<float> threshold, common::Span<float> grad_sum,
                                            size_t sample_rows);
 
+  [[nodiscard]] bool ConcatPages() const { return this->strategy_->ConcatPages(); }
+
  private:
   common::Monitor monitor_;
   std::unique_ptr<SamplingStrategy> strategy_;
@@ -23,6 +23,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
   constexpr static std::size_t CudaDefaultNodes() { return static_cast<std::size_t>(1) << 12; }
 
   bool debug_synchronize{false};
+  bool extmem_concat_pages{false};
 
   void CheckTreesSynchronized(Context const* ctx, RegTree const* local_tree) const;
@@ -42,6 +43,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
         .set_default(NotSet())
         .set_lower_bound(1)
         .describe("Maximum number of nodes in histogram cache.");
+    DMLC_DECLARE_FIELD(extmem_concat_pages).set_default(false);
   }
 };
 }  // namespace xgboost::tree
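Like the other fields of `HistMakerTrainParam`, the new flag is parsed from the regular key/value arguments; a test-style sketch of the round-trip (the header path is assumed):

```cpp
#include "tree/hist/param.h"  // path assumed; declares HistMakerTrainParam

void CheckRoundTrip() {
  xgboost::tree::HistMakerTrainParam param;
  // Parsed like any other DMLC parameter; the default is false.
  param.UpdateAllowUnknown(xgboost::Args{{"extmem_concat_pages", "true"}});
  CHECK(param.extmem_concat_pages);
}
```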
@@ -278,7 +278,7 @@ class GlobalApproxUpdater : public TreeUpdater {
     *sampled = linalg::Empty<GradientPair>(ctx_, gpair->Size(), 1);
     auto in = gpair->HostView().Values();
     std::copy(in.data(), in.data() + in.size(), sampled->HostView().Values().data());
-
+    error::NoPageConcat(this->hist_param_.extmem_concat_pages);
     SampleGradient(ctx_, param, sampled->HostView());
   }
 
@@ -5,10 +5,11 @@
 #include <limits>   // for numeric_limits
 #include <ostream>  // for ostream
 
-#include "gpu_hist/quantiser.cuh"  // for GradientQuantiser
-#include "param.h"                 // for TrainParam
-#include "xgboost/base.h"          // for bst_bin_t
-#include "xgboost/task.h"          // for ObjInfo
+#include "../data/batch_utils.cuh"  // for DftPrefetchBatches, StaticBatch
+#include "gpu_hist/quantiser.cuh"   // for GradientQuantiser
+#include "param.h"                  // for TrainParam
+#include "xgboost/base.h"           // for bst_bin_t
+#include "xgboost/task.h"           // for ObjInfo
 
 namespace xgboost::tree {
 struct GPUTrainingParam {
@@ -119,26 +120,19 @@ struct DeviceSplitCandidate {
 };
 
 namespace cuda_impl {
-constexpr auto DftPrefetchBatches() { return 2; }
-
 inline BatchParam HistBatch(TrainParam const& param) {
   auto p = BatchParam{param.max_bin, TrainParam::DftSparseThreshold()};
   p.prefetch_copy = true;
-  p.n_prefetch_batches = DftPrefetchBatches();
+  p.n_prefetch_batches = data::cuda_impl::DftPrefetchBatches();
   return p;
 }
 
 inline BatchParam ApproxBatch(TrainParam const& p, common::Span<float const> hess,
                               ObjInfo const& task) {
-  return BatchParam{p.max_bin, hess, !task.const_hess};
-}
-
-// Empty parameter to prevent regen, only used to control external memory prefetching.
-inline BatchParam StaticBatch(bool prefetch_copy) {
-  BatchParam p;
-  p.prefetch_copy = prefetch_copy;
-  p.n_prefetch_batches = DftPrefetchBatches();
-  return p;
+  auto batch = BatchParam{p.max_bin, hess, !task.const_hess};
+  batch.prefetch_copy = true;
+  batch.n_prefetch_batches = data::cuda_impl::DftPrefetchBatches();
+  return batch;
 }
 }  // namespace cuda_impl
 
@@ -21,6 +21,7 @@
 #include "../common/hist_util.h"    // for HistogramCuts
 #include "../common/random.h"       // for ColumnSampler, GlobalRandom
 #include "../common/timer.h"
+#include "../data/batch_utils.cuh"  // for StaticBatch
 #include "../data/ellpack_page.cuh"
 #include "../data/ellpack_page.h"
 #include "constraints.cuh"
@@ -50,11 +51,7 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
 
 using cuda_impl::ApproxBatch;
 using cuda_impl::HistBatch;
-
-// Both the approx and hist initializes the DMatrix before creating the actual
-// implementation (InitDataOnce). Therefore, the `GPUHistMakerDevice` can use an empty
-// parameter to avoid any regen.
-using cuda_impl::StaticBatch;
+using data::cuda_impl::StaticBatch;
 
 // Extra data for each node that is passed to the update position function
 struct NodeSplitData {
@@ -102,11 +99,11 @@ struct GPUHistMakerDevice {
   std::vector<std::unique_ptr<RowPartitioner>> partitioners_;
 
   DeviceHistogramBuilder histogram_;
-  std::vector<bst_idx_t> batch_ptr_;
+  std::vector<bst_idx_t> const batch_ptr_;
   // node idx for each sample
   dh::device_vector<bst_node_t> positions_;
   HistMakerTrainParam const* hist_param_;
-  std::shared_ptr<common::HistogramCuts const> cuts_{nullptr};
+  std::shared_ptr<common::HistogramCuts const> const cuts_;
 
   auto CreatePartitionNodes(RegTree const* p_tree, std::vector<GPUExpandEntry> const& candidates) {
     std::vector<bst_node_t> nidx(candidates.size());
@@ -135,35 +132,35 @@ struct GPUHistMakerDevice {
 
   dh::device_vector<int> monotone_constraints;
 
-  TrainParam param;
+  TrainParam const param;
 
   std::unique_ptr<GradientQuantiser> quantiser;
 
   dh::PinnedMemory pinned;
   dh::PinnedMemory pinned2;
 
-  common::Monitor monitor;
   FeatureInteractionConstraintDevice interaction_constraints;
 
   std::unique_ptr<GradientBasedSampler> sampler;
 
   std::unique_ptr<FeatureGroups> feature_groups;
+  common::Monitor monitor;
 
   GPUHistMakerDevice(Context const* ctx, TrainParam _param, HistMakerTrainParam const* hist_param,
                      std::shared_ptr<common::ColumnSampler> column_sampler, BatchParam batch_param,
                      MetaInfo const& info, std::vector<bst_idx_t> batch_ptr,
                      std::shared_ptr<common::HistogramCuts const> cuts)
       : evaluator_{_param, static_cast<bst_feature_t>(info.num_col_), ctx->Device()},
-        ctx_(ctx),
-        param(std::move(_param)),
-        column_sampler_(std::move(column_sampler)),
-        interaction_constraints(param, static_cast<bst_feature_t>(info.num_col_)),
+        ctx_{ctx},
+        column_sampler_{std::move(column_sampler)},
         batch_ptr_{std::move(batch_ptr)},
         hist_param_{hist_param},
-        cuts_{std::move(cuts)} {
-    this->sampler =
-        std::make_unique<GradientBasedSampler>(ctx, info.num_row_, batch_param, param.subsample,
-                                               param.sampling_method, batch_ptr_.size() > 2);
+        cuts_{std::move(cuts)},
+        param{std::move(_param)},
+        interaction_constraints(param, static_cast<bst_feature_t>(info.num_col_)),
+        sampler{std::make_unique<GradientBasedSampler>(
+            ctx, info.num_row_, batch_param, param.subsample, param.sampling_method,
+            batch_ptr_.size() > 2 && this->hist_param_->extmem_concat_pages)} {
     if (!param.monotone_constraints.empty()) {
       // Copy assigning an empty vector causes an exception in MSVC debug builds
       monotone_constraints = param.monotone_constraints;
@@ -185,33 +182,31 @@ struct GPUHistMakerDevice {
   }
 
   // Reset values for each update iteration
-  [[nodiscard]] DMatrix* Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* p_fmat) {
+  [[nodiscard]] DMatrix* Reset(HostDeviceVector<GradientPair> const* dh_gpair, DMatrix* p_fmat) {
     this->monitor.Start(__func__);
     common::SetDevice(ctx_->Ordinal());
 
     auto const& info = p_fmat->Info();
-    this->column_sampler_->Init(ctx_, p_fmat->Info().num_col_, info.feature_weights.HostVector(),
-                                param.colsample_bynode, param.colsample_bylevel,
-                                param.colsample_bytree);
-    this->interaction_constraints.Reset(ctx_);
-    this->evaluator_.Reset(this->ctx_, *cuts_, p_fmat->Info().feature_types.ConstDeviceSpan(),
-                           p_fmat->Info().num_col_, this->param, p_fmat->Info().IsColumnSplit());
+    // backup the gradient
+    dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair, ctx_->CUDACtx()->Stream());
 
-    // Sampling
-    dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair, ctx_->CUDACtx()->Stream());
+    /**
+     * Sampling
+     */
     auto sample = this->sampler->Sample(ctx_, dh::ToSpan(d_gpair), p_fmat);
     this->gpair = sample.gpair;
-    p_fmat = sample.p_fmat;  // Update p_fmat before allocating partitioners
+    p_fmat = sample.p_fmat;
     p_fmat->Info().feature_types.SetDevice(ctx_->Device());
-    std::size_t n_batches = p_fmat->NumBatches();
-    bool is_concat = (n_batches + 1) != this->batch_ptr_.size();
-    std::vector<bst_idx_t> batch_ptr{batch_ptr_};
 
+    /**
+     * Initialize the partitioners
+     */
+    bool is_concat = sampler->ConcatPages();
+    std::size_t n_batches = is_concat ? 1 : p_fmat->NumBatches();
+    std::vector<bst_idx_t> batch_ptr{this->batch_ptr_};
     if (is_concat) {
       // Concatenate the batch ptrs as well.
       batch_ptr = {static_cast<bst_idx_t>(0), p_fmat->Info().num_row_};
     }
-    // Initialize partitions
     if (!partitioners_.empty()) {
       CHECK_EQ(partitioners_.size(), n_batches);
     }
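Concretely (numbers assumed for illustration): with three external-memory batches of 100 rows each, `batch_ptr_` is `{0, 100, 200, 300}`; once the sampler reports `ConcatPages()`, the partitioner must see a single batch covering every row:

```cpp
// Illustration of the batch_ptr collapse above, not part of the patch.
std::vector<bst_idx_t> batch_ptr{0, 100, 200, 300};  // three source batches
bool is_concat = true;                               // sampler->ConcatPages()
if (is_concat) {
  batch_ptr = {0, 300};  // one concatenated batch spanning all rows
}
```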
@@ -230,8 +225,20 @@ struct GPUHistMakerDevice {
       CHECK_EQ(partitioners_.front()->Size(), p_fmat->Info().num_row_);
     }
 
-    // Other initializations
-    quantiser = std::make_unique<GradientQuantiser>(ctx_, this->gpair, p_fmat->Info());
+    /**
+     * Initialize the evaluator
+     */
+    this->column_sampler_->Init(ctx_, info.num_col_, info.feature_weights.HostVector(),
+                                param.colsample_bynode, param.colsample_bylevel,
+                                param.colsample_bytree);
+    this->interaction_constraints.Reset(ctx_);
+    this->evaluator_.Reset(this->ctx_, *cuts_, info.feature_types.ConstDeviceSpan(), info.num_col_,
+                           this->param, info.IsColumnSplit());
+
+    /**
+     * Other initializations
+     */
+    this->quantiser = std::make_unique<GradientQuantiser>(ctx_, this->gpair, p_fmat->Info());
 
     this->InitFeatureGroupsOnce(info);
 
@@ -327,8 +334,8 @@ struct GPUHistMakerDevice {
 
     auto d_ridx = partitioners_.at(k)->GetRows(nidx);
     this->histogram_.BuildHistogram(ctx_->CUDACtx(), acc,
-                                    feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx,
-                                    d_node_hist, *quantiser);
+                                    feature_groups->DeviceAccessor(ctx_->Device()), this->gpair,
+                                    d_ridx, d_node_hist, *quantiser);
     monitor.Stop(__func__);
   }
@@ -678,11 +685,11 @@ struct GPUHistMakerDevice {
     constexpr bst_node_t kRootNIdx = RegTree::kRoot;
     auto quantiser = *this->quantiser;
     auto gpair_it = dh::MakeTransformIterator<GradientPairInt64>(
-        dh::tbegin(gpair),
+        dh::tbegin(this->gpair),
         [=] __device__(auto const& gpair) { return quantiser.ToFixedPoint(gpair); });
     GradientPairInt64 root_sum_quantised =
-        dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(), GradientPairInt64{},
-                   thrust::plus<GradientPairInt64>{});
+        dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + this->gpair.size(),
+                   GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
     using ReduceT = typename decltype(root_sum_quantised)::ValueT;
     auto rc = collective::GlobalSum(
         ctx_, p_fmat->Info(), linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));
@@ -539,6 +539,7 @@ class QuantileHistMaker : public TreeUpdater {
       // Copy gradient into buffer for sampling. This converts C-order to F-order.
       std::copy(linalg::cbegin(h_gpair), linalg::cend(h_gpair), linalg::begin(h_sample_out));
     }
+    error::NoPageConcat(this->hist_param_.extmem_concat_pages);
     SampleGradient(ctx_, *param, h_sample_out);
     auto *h_out_position = &out_position[tree_it - trees.begin()];
     if ((*tree_it)->IsMultiTarget()) {