Initial GPU support for the approx tree method. (#9414)
This commit is contained in:
@@ -89,5 +89,10 @@ void WarnDeprecatedGPUId();
|
||||
void WarnEmptyDataset();
|
||||
|
||||
std::string DeprecatedFunc(StringView old, StringView since, StringView replacement);
|
||||
|
||||
// Returns the fixed error message reported when a CUDA device is required but the
// context is not configured for one. The text is a compile-time constant.
constexpr StringView InvalidCUDAOrdinal() {
|
||||
return "Invalid device. `device` is required to be CUDA and there must be at least one GPU "
|
||||
"available for using GPU.";
|
||||
}
|
||||
} // namespace xgboost::error
|
||||
#endif // XGBOOST_COMMON_ERROR_MSG_H_
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "dmlc/parameter.h" // for FieldEntry, DMLC_DECLARE_FIELD
|
||||
#include "error_msg.h" // for GroupWeight, GroupSize
|
||||
#include "error_msg.h" // for GroupWeight, GroupSize, InvalidCUDAOrdinal
|
||||
#include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/data.h" // for MetaInfo
|
||||
@@ -240,7 +240,7 @@ class RankingCache {
|
||||
// The function simply returns an uninitialized buffer as this is only used by the
|
||||
// objective for creating pairs.
|
||||
common::Span<std::size_t> SortedIdxY(Context const* ctx, std::size_t n_samples) {
|
||||
CHECK(ctx->IsCUDA());
|
||||
CHECK(ctx->IsCUDA()) << error::InvalidCUDAOrdinal();
|
||||
if (y_sorted_idx_cache_.Empty()) {
|
||||
y_sorted_idx_cache_.SetDevice(ctx->gpu_id);
|
||||
y_sorted_idx_cache_.Resize(n_samples);
|
||||
@@ -248,7 +248,7 @@ class RankingCache {
|
||||
return y_sorted_idx_cache_.DeviceSpan();
|
||||
}
|
||||
common::Span<float> RankedY(Context const* ctx, std::size_t n_samples) {
|
||||
CHECK(ctx->IsCUDA());
|
||||
CHECK(ctx->IsCUDA()) << error::InvalidCUDAOrdinal();
|
||||
if (y_ranked_by_model_.Empty()) {
|
||||
y_ranked_by_model_.SetDevice(ctx->gpu_id);
|
||||
y_ranked_by_model_.Resize(n_samples);
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/cuda_context.cuh"
|
||||
#include "../common/hist_util.cuh"
|
||||
#include "../common/random.h"
|
||||
#include "../common/transform_iterator.h" // MakeIndexTransformIter
|
||||
#include "./ellpack_page.cuh"
|
||||
#include "device_adapter.cuh" // for HasInfInData
|
||||
@@ -131,7 +130,11 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
|
||||
monitor_.Start("Quantiles");
|
||||
// Create the quantile sketches for the dmatrix and initialize HistogramCuts.
|
||||
row_stride = GetRowStride(dmat);
|
||||
cuts_ = common::DeviceSketch(ctx, dmat, param.max_bin);
|
||||
if (!param.hess.empty()) {
|
||||
cuts_ = common::DeviceSketchWithHessian(ctx, dmat, param.max_bin, param.hess);
|
||||
} else {
|
||||
cuts_ = common::DeviceSketch(ctx, dmat, param.max_bin);
|
||||
}
|
||||
monitor_.Stop("Quantiles");
|
||||
|
||||
monitor_.Start("InitCompressedData");
|
||||
|
||||
@@ -7,13 +7,12 @@
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility> // std::forward
|
||||
#include <utility> // for forward
|
||||
|
||||
#include "../common/column_matrix.h"
|
||||
#include "../common/hist_util.h"
|
||||
#include "../common/numeric.h"
|
||||
#include "../common/threading_utils.h"
|
||||
#include "../common/transform_iterator.h" // MakeIndexTransformIter
|
||||
#include "../common/transform_iterator.h" // for MakeIndexTransformIter
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
|
||||
@@ -8,12 +8,12 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <numeric> // for accumulate
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#include "../common/error_msg.h" // for InconsistentMaxBin
|
||||
#include "../common/random.h"
|
||||
#include "../common/threading_utils.h"
|
||||
#include "../collective/communicator-inl.h" // for GetWorldSize, GetRank, Allgather
|
||||
#include "../common/error_msg.h" // for InconsistentMaxBin
|
||||
#include "./simple_batch_iterator.h"
|
||||
#include "adapter.h"
|
||||
#include "batch_utils.h" // for CheckEmpty, RegenGHist
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
#include "./sparse_page_dmatrix.h"
|
||||
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "./simple_batch_iterator.h"
|
||||
#include "batch_utils.h" // for RegenGHist
|
||||
#include "gradient_index.h"
|
||||
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
/**
|
||||
* Copyright 2021-2023 by XGBoost contributors
|
||||
*/
|
||||
#include <memory>
|
||||
#include <memory> // for unique_ptr
|
||||
|
||||
#include "../common/hist_util.cuh"
|
||||
#include "batch_utils.h" // for CheckEmpty, RegenGHist
|
||||
#include "../common/hist_util.h" // for HistogramCuts
|
||||
#include "batch_utils.h" // for CheckEmpty, RegenGHist
|
||||
#include "ellpack_page.cuh"
|
||||
#include "sparse_page_dmatrix.h"
|
||||
#include "sparse_page_source.h"
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/data.h" // for BatchParam
|
||||
|
||||
namespace xgboost::data {
|
||||
BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
|
||||
@@ -25,8 +27,13 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
|
||||
cache_info_.erase(id);
|
||||
MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
|
||||
std::unique_ptr<common::HistogramCuts> cuts;
|
||||
cuts =
|
||||
std::make_unique<common::HistogramCuts>(common::DeviceSketch(ctx, this, param.max_bin, 0));
|
||||
if (!param.hess.empty()) {
|
||||
cuts = std::make_unique<common::HistogramCuts>(
|
||||
common::DeviceSketchWithHessian(ctx, this, param.max_bin, param.hess));
|
||||
} else {
|
||||
cuts =
|
||||
std::make_unique<common::HistogramCuts>(common::DeviceSketch(ctx, this, param.max_bin));
|
||||
}
|
||||
this->InitializeSparsePage(ctx); // reset after use.
|
||||
|
||||
row_stride = GetRowStride(this);
|
||||
@@ -35,10 +42,10 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
|
||||
batch_param_ = param;
|
||||
|
||||
auto ft = this->info_.feature_types.ConstDeviceSpan();
|
||||
ellpack_page_source_.reset(); // release resources.
|
||||
ellpack_page_source_.reset(new EllpackPageSource(
|
||||
ellpack_page_source_.reset(); // make sure resource is released before making new ones.
|
||||
ellpack_page_source_ = std::make_shared<EllpackPageSource>(
|
||||
this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
|
||||
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id));
|
||||
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id);
|
||||
} else {
|
||||
CHECK(sparse_page_source_);
|
||||
ellpack_page_source_->Reset();
|
||||
|
||||
@@ -47,15 +47,16 @@ std::string MapTreeMethodToUpdaters(Context const* ctx, TreeMethod tree_method)
|
||||
if (ctx->IsCUDA()) {
|
||||
common::AssertGPUSupport();
|
||||
}
|
||||
|
||||
switch (tree_method) {
|
||||
case TreeMethod::kAuto: // Use hist as default in 2.0
|
||||
case TreeMethod::kHist: {
|
||||
return ctx->DispatchDevice([] { return "grow_quantile_histmaker"; },
|
||||
[] { return "grow_gpu_hist"; });
|
||||
}
|
||||
case TreeMethod::kApprox:
|
||||
CHECK(ctx->IsCPU()) << "The `approx` tree method is not supported on GPU.";
|
||||
return "grow_histmaker";
|
||||
case TreeMethod::kApprox: {
|
||||
return ctx->DispatchDevice([] { return "grow_histmaker"; }, [] { return "grow_gpu_approx"; });
|
||||
}
|
||||
case TreeMethod::kExact:
|
||||
CHECK(ctx->IsCPU()) << "The `exact` tree method is not supported on GPU.";
|
||||
return "grow_colmaker,prune";
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2018-2019 by Contributors
|
||||
/**
|
||||
* Copyright 2018-2023 by Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_TREE_CONSTRAINTS_H_
|
||||
#define XGBOOST_TREE_CONSTRAINTS_H_
|
||||
@@ -8,10 +8,8 @@
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include "xgboost/span.h"
|
||||
#include "xgboost/base.h"
|
||||
|
||||
#include "param.h"
|
||||
#include "xgboost/base.h"
|
||||
|
||||
namespace xgboost {
|
||||
/*!
|
||||
|
||||
@@ -8,10 +8,10 @@
|
||||
#include <xgboost/logging.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef> // for size_t
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
|
||||
#include "../../common/compressed_iterator.h"
|
||||
#include "../../common/cuda_context.cuh" // for CUDAContext
|
||||
#include "../../common/random.h"
|
||||
#include "../param.h"
|
||||
@@ -202,27 +202,27 @@ ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows,
|
||||
GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
|
||||
common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) {
|
||||
auto cuctx = ctx->CUDACtx();
|
||||
// Set gradient pair to 0 with p = 1 - subsample
|
||||
thrust::replace_if(dh::tbegin(gpair), dh::tend(gpair),
|
||||
thrust::counting_iterator<size_t>(0),
|
||||
BernoulliTrial(common::GlobalRandom()(), subsample_),
|
||||
GradientPair());
|
||||
thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
|
||||
thrust::counting_iterator<std::size_t>(0),
|
||||
BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair{});
|
||||
|
||||
// Count the sampled rows.
|
||||
size_t sample_rows = thrust::count_if(dh::tbegin(gpair), dh::tend(gpair), IsNonZero());
|
||||
size_t sample_rows =
|
||||
thrust::count_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), IsNonZero{});
|
||||
|
||||
// Compact gradient pairs.
|
||||
gpair_.resize(sample_rows);
|
||||
thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
|
||||
thrust::copy_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero{});
|
||||
|
||||
// Index the sample rows.
|
||||
thrust::transform(dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), IsNonZero());
|
||||
thrust::exclusive_scan(sample_row_index_.begin(), sample_row_index_.end(),
|
||||
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
|
||||
IsNonZero());
|
||||
thrust::exclusive_scan(cuctx->CTP(), sample_row_index_.begin(), sample_row_index_.end(),
|
||||
sample_row_index_.begin());
|
||||
thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
|
||||
sample_row_index_.begin(),
|
||||
sample_row_index_.begin(),
|
||||
ClearEmptyRows());
|
||||
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
|
||||
sample_row_index_.begin(), ClearEmptyRows());
|
||||
|
||||
auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
|
||||
auto first_page = (*batch_iterator.begin()).Impl();
|
||||
@@ -232,7 +232,7 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
|
||||
first_page->row_stride, sample_rows));
|
||||
|
||||
// Compact the ELLPACK pages into the single sample page.
|
||||
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
||||
thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
||||
for (auto& batch : batch_iterator) {
|
||||
page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
|
||||
}
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
#include "../common/random.h"
|
||||
#include "../data/gradient_index.h"
|
||||
#include "common_row_partitioner.h"
|
||||
#include "constraints.h"
|
||||
#include "driver.h"
|
||||
#include "hist/evaluate_splits.h"
|
||||
#include "hist/histogram.h"
|
||||
|
||||
@@ -31,7 +31,6 @@
|
||||
#include "gpu_hist/histogram.cuh"
|
||||
#include "gpu_hist/row_partitioner.cuh"
|
||||
#include "param.h"
|
||||
#include "split_evaluator.h"
|
||||
#include "updater_gpu_common.cuh"
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/context.h"
|
||||
@@ -49,13 +48,30 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
|
||||
#endif // !defined(GTEST_TEST)
|
||||
|
||||
// training parameters specific to this algorithm
|
||||
// Parameter set holding the single `debug_synchronize` switch, plus the consistency
// check that the switch enables (CheckTreesSynchronized).
struct GPUHistMakerTrainParam
|
||||
: public XGBoostParameter<GPUHistMakerTrainParam> {
|
||||
struct GPUHistMakerTrainParam : public XGBoostParameter<GPUHistMakerTrainParam> {
|
||||
// When true, CheckTreesSynchronized compares the local tree against rank 0's tree.
bool debug_synchronize;
|
||||
// declare parameters
|
||||
DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) {
|
||||
DMLC_DECLARE_FIELD(debug_synchronize).set_default(false).describe(
|
||||
"Check if all distributed tree are identical after tree construction.");
|
||||
DMLC_DECLARE_FIELD(debug_synchronize)
|
||||
.set_default(false)
|
||||
.describe("Check if all distributed tree are identical after tree construction.");
|
||||
}
|
||||
|
||||
// Only call this method for testing
|
||||
// Does nothing unless `debug_synchronize` is set. Otherwise rank 0 serialises its tree
// into `s_model`, the buffer is broadcast, and every rank loads it back as
// `reference_tree` and CHECKs that it equals `local_tree`.
void CheckTreesSynchronized(RegTree const* local_tree) const {
|
||||
if (this->debug_synchronize) {
|
||||
std::string s_model;
|
||||
// Stream that reads from / writes into s_model.
common::MemoryBufferStream fs(&s_model);
|
||||
int rank = collective::GetRank();
|
||||
if (rank == 0) {
|
||||
local_tree->Save(&fs);
|
||||
}
|
||||
// Rewind so the subsequent Load starts at the beginning of the buffer.
fs.Seek(0);
|
||||
// NOTE(review): the second argument is presumably the root rank (0) — confirm.
collective::Broadcast(&s_model, 0);
|
||||
RegTree reference_tree{}; // rank 0 tree
|
||||
reference_tree.Load(&fs);
|
||||
CHECK(*local_tree == reference_tree);
|
||||
}
|
||||
}
|
||||
};
|
||||
#if !defined(GTEST_TEST)
|
||||
@@ -170,16 +186,15 @@ class DeviceHistogramStorage {
|
||||
};
|
||||
|
||||
// Manage memory for a single GPU
|
||||
template <typename GradientSumT>
|
||||
struct GPUHistMakerDevice {
|
||||
private:
|
||||
GPUHistEvaluator evaluator_;
|
||||
Context const* ctx_;
|
||||
std::shared_ptr<common::ColumnSampler> column_sampler_;
|
||||
|
||||
public:
|
||||
EllpackPageImpl const* page{nullptr};
|
||||
common::Span<FeatureType const> feature_types;
|
||||
BatchParam batch_param;
|
||||
|
||||
std::unique_ptr<RowPartitioner> row_partitioner;
|
||||
DeviceHistogramStorage<> hist{};
|
||||
@@ -199,7 +214,6 @@ struct GPUHistMakerDevice {
|
||||
dh::PinnedMemory pinned2;
|
||||
|
||||
common::Monitor monitor;
|
||||
common::ColumnSampler column_sampler;
|
||||
FeatureInteractionConstraintDevice interaction_constraints;
|
||||
|
||||
std::unique_ptr<GradientBasedSampler> sampler;
|
||||
@@ -208,22 +222,22 @@ struct GPUHistMakerDevice {
|
||||
|
||||
GPUHistMakerDevice(Context const* ctx, bool is_external_memory,
|
||||
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
|
||||
TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features,
|
||||
BatchParam _batch_param)
|
||||
TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler,
|
||||
uint32_t n_features, BatchParam batch_param)
|
||||
: evaluator_{_param, n_features, ctx->gpu_id},
|
||||
ctx_(ctx),
|
||||
feature_types{_feature_types},
|
||||
param(std::move(_param)),
|
||||
column_sampler(column_sampler_seed),
|
||||
interaction_constraints(param, n_features),
|
||||
batch_param(std::move(_batch_param)) {
|
||||
sampler.reset(new GradientBasedSampler(ctx, _n_rows, batch_param, param.subsample,
|
||||
param.sampling_method, is_external_memory));
|
||||
column_sampler_(std::move(column_sampler)),
|
||||
interaction_constraints(param, n_features) {
|
||||
sampler = std::make_unique<GradientBasedSampler>(ctx, _n_rows, batch_param, param.subsample,
|
||||
param.sampling_method, is_external_memory);
|
||||
if (!param.monotone_constraints.empty()) {
|
||||
// Copy assigning an empty vector causes an exception in MSVC debug builds
|
||||
monotone_constraints = param.monotone_constraints;
|
||||
}
|
||||
|
||||
CHECK(column_sampler_);
|
||||
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
|
||||
}
|
||||
|
||||
@@ -234,16 +248,16 @@ struct GPUHistMakerDevice {
|
||||
CHECK(page);
|
||||
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
|
||||
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
|
||||
sizeof(GradientSumT)));
|
||||
sizeof(GradientPairPrecise)));
|
||||
}
|
||||
}
|
||||
|
||||
// Reset values for each update iteration
|
||||
void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
|
||||
auto const& info = dmat->Info();
|
||||
this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(),
|
||||
param.colsample_bynode, param.colsample_bylevel,
|
||||
param.colsample_bytree);
|
||||
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
|
||||
param.colsample_bynode, param.colsample_bylevel,
|
||||
param.colsample_bytree);
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
|
||||
this->interaction_constraints.Reset();
|
||||
@@ -275,8 +289,8 @@ struct GPUHistMakerDevice {
|
||||
GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) {
|
||||
int nidx = RegTree::kRoot;
|
||||
GPUTrainingParam gpu_param(param);
|
||||
auto sampled_features = column_sampler.GetFeatureSet(0);
|
||||
sampled_features->SetDevice(ctx_->gpu_id);
|
||||
auto sampled_features = column_sampler_->GetFeatureSet(0);
|
||||
sampled_features->SetDevice(ctx_->Device());
|
||||
common::Span<bst_feature_t> feature_set =
|
||||
interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
|
||||
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
|
||||
@@ -316,13 +330,13 @@ struct GPUHistMakerDevice {
|
||||
int right_nidx = tree[candidate.nid].RightChild();
|
||||
nidx[i * 2] = left_nidx;
|
||||
nidx[i * 2 + 1] = right_nidx;
|
||||
auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
|
||||
left_sampled_features->SetDevice(ctx_->gpu_id);
|
||||
auto left_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(left_nidx));
|
||||
left_sampled_features->SetDevice(ctx_->Device());
|
||||
feature_sets.emplace_back(left_sampled_features);
|
||||
common::Span<bst_feature_t> left_feature_set =
|
||||
interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
|
||||
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
|
||||
right_sampled_features->SetDevice(ctx_->gpu_id);
|
||||
auto right_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(right_nidx));
|
||||
right_sampled_features->SetDevice(ctx_->Device());
|
||||
feature_sets.emplace_back(right_sampled_features);
|
||||
common::Span<bst_feature_t> right_feature_set =
|
||||
interaction_constraints.Query(right_sampled_features->DeviceSpan(),
|
||||
@@ -657,7 +671,6 @@ struct GPUHistMakerDevice {
|
||||
evaluator_.ApplyTreeSplit(candidate, p_tree);
|
||||
|
||||
const auto& parent = tree[candidate.nid];
|
||||
std::size_t max_nidx = std::max(parent.LeftChild(), parent.RightChild());
|
||||
interaction_constraints.Split(candidate.nid, parent.SplitIndex(), parent.LeftChild(),
|
||||
parent.RightChild());
|
||||
}
|
||||
@@ -693,9 +706,8 @@ struct GPUHistMakerDevice {
|
||||
return root_entry;
|
||||
}
|
||||
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat,
|
||||
ObjInfo const* task, RegTree* p_tree,
|
||||
HostDeviceVector<bst_node_t>* p_out_position) {
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat, ObjInfo const* task,
|
||||
RegTree* p_tree, HostDeviceVector<bst_node_t>* p_out_position) {
|
||||
auto& tree = *p_tree;
|
||||
// Process maximum 32 nodes at a time
|
||||
Driver<GPUExpandEntry> driver(param, 32);
|
||||
@@ -720,7 +732,6 @@ struct GPUHistMakerDevice {
|
||||
std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set),
|
||||
[&](const auto& e) { return driver.IsChildValid(e); });
|
||||
|
||||
|
||||
auto new_candidates =
|
||||
pinned.GetSpan<GPUExpandEntry>(filtered_expand_set.size() * 2, GPUExpandEntry());
|
||||
|
||||
@@ -753,8 +764,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
using GradientSumT = GradientPairPrecise;
|
||||
|
||||
public:
|
||||
explicit GPUHistMaker(Context const* ctx, ObjInfo const* task)
|
||||
: TreeUpdater(ctx), task_{task} {};
|
||||
explicit GPUHistMaker(Context const* ctx, ObjInfo const* task) : TreeUpdater(ctx), task_{task} {};
|
||||
void Configure(const Args& args) override {
|
||||
// Used in test to count how many configurations are performed
|
||||
LOG(DEBUG) << "[GPU Hist]: Configure";
|
||||
@@ -786,13 +796,10 @@ class GPUHistMaker : public TreeUpdater {
|
||||
|
||||
// build tree
|
||||
try {
|
||||
size_t t_idx{0};
|
||||
std::size_t t_idx{0};
|
||||
for (xgboost::RegTree* tree : trees) {
|
||||
this->UpdateTree(param, gpair, dmat, tree, &out_position[t_idx]);
|
||||
|
||||
if (hist_maker_param_.debug_synchronize) {
|
||||
this->CheckTreesSynchronized(tree);
|
||||
}
|
||||
this->hist_maker_param_.CheckTreesSynchronized(tree);
|
||||
++t_idx;
|
||||
}
|
||||
dh::safe_cuda(cudaGetLastError());
|
||||
@@ -809,13 +816,14 @@ class GPUHistMaker : public TreeUpdater {
|
||||
// Synchronise the column sampling seed
|
||||
uint32_t column_sampling_seed = common::GlobalRandom()();
|
||||
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
|
||||
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
|
||||
|
||||
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
info_->feature_types.SetDevice(ctx_->gpu_id);
|
||||
maker.reset(new GPUHistMakerDevice<GradientSumT>(
|
||||
maker = std::make_unique<GPUHistMakerDevice>(
|
||||
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
|
||||
*param, column_sampling_seed, info_->num_col_, batch_param));
|
||||
*param, column_sampler_, info_->num_col_, batch_param);
|
||||
|
||||
p_last_fmat_ = dmat;
|
||||
initialised_ = true;
|
||||
@@ -830,21 +838,6 @@ class GPUHistMaker : public TreeUpdater {
|
||||
p_last_tree_ = p_tree;
|
||||
}
|
||||
|
||||
// Only call this method for testing
|
||||
void CheckTreesSynchronized(RegTree* local_tree) const {
|
||||
std::string s_model;
|
||||
common::MemoryBufferStream fs(&s_model);
|
||||
int rank = collective::GetRank();
|
||||
if (rank == 0) {
|
||||
local_tree->Save(&fs);
|
||||
}
|
||||
fs.Seek(0);
|
||||
collective::Broadcast(&s_model, 0);
|
||||
RegTree reference_tree{}; // rank 0 tree
|
||||
reference_tree.Load(&fs);
|
||||
CHECK(*local_tree == reference_tree);
|
||||
}
|
||||
|
||||
void UpdateTree(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
|
||||
RegTree* p_tree, HostDeviceVector<bst_node_t>* p_out_position) {
|
||||
monitor_.Start("InitData");
|
||||
@@ -868,7 +861,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
|
||||
MetaInfo* info_{}; // NOLINT
|
||||
|
||||
std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker; // NOLINT
|
||||
std::unique_ptr<GPUHistMakerDevice> maker; // NOLINT
|
||||
|
||||
[[nodiscard]] char const* Name() const override { return "grow_gpu_hist"; }
|
||||
[[nodiscard]] bool HasNodePosition() const override { return true; }
|
||||
@@ -883,6 +876,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
ObjInfo const* task_{nullptr};
|
||||
|
||||
common::Monitor monitor_;
|
||||
std::shared_ptr<common::ColumnSampler> column_sampler_;
|
||||
};
|
||||
|
||||
#if !defined(GTEST_TEST)
|
||||
@@ -892,4 +886,131 @@ XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
|
||||
return new GPUHistMaker(ctx, task);
|
||||
});
|
||||
#endif // !defined(GTEST_TEST)
|
||||
|
||||
// Tree updater whose Name() is "grow_gpu_approx".
//
// Each Update() call copies the hessian of every gradient pair into `hess_` on the
// device, passes that span through a BatchParam into a newly constructed
// GPUHistMakerDevice, and then grows every tree in `trees` with that device maker.
class GPUGlobalApproxMaker : public TreeUpdater {
|
||||
public:
|
||||
// Stores the context in the TreeUpdater base and keeps the task pointer in `task_`.
explicit GPUGlobalApproxMaker(Context const* ctx, ObjInfo const* task)
|
||||
: TreeUpdater(ctx), task_{task} {};
|
||||
// Updates `hist_maker_param_` from `args`, checks the device compute capability,
// clears `initialised_` so InitDataOnce runs again, and names the monitor after Name().
void Configure(Args const& args) override {
|
||||
// Used in test to count how many configurations are performed
|
||||
LOG(DEBUG) << "[GPU Approx]: Configure";
|
||||
hist_maker_param_.UpdateAllowUnknown(args);
|
||||
dh::CheckComputeCapability();
|
||||
initialised_ = false;
|
||||
|
||||
monitor_.Init(this->Name());
|
||||
}
|
||||
|
||||
// Restores `hist_maker_param_` from the "approx_train_param" entry of `in` and
// clears `initialised_`.
void LoadConfig(Json const& in) override {
|
||||
auto const& config = get<Object const>(in);
|
||||
FromJson(config.at("approx_train_param"), &this->hist_maker_param_);
|
||||
initialised_ = false;
|
||||
}
|
||||
// Writes `hist_maker_param_` into `*p_out` under the "approx_train_param" key.
void SaveConfig(Json* p_out) const override {
|
||||
auto& out = *p_out;
|
||||
out["approx_train_param"] = ToJson(hist_maker_param_);
|
||||
}
|
||||
// Calls Log() on the global memory logger when the updater is destroyed.
~GPUGlobalApproxMaker() override { dh::GlobalMemoryLogger().Log(); }
|
||||
|
||||
// Grows every tree in `trees` from `gpair` and `p_fmat`.
//
// param:        training parameters; `max_bin` is read here and `*param` is passed to the maker.
// gpair:        gradient pairs; moved to the context's device before use.
// p_fmat:       training matrix.
// out_position: indexed in step with `trees`; element `t_idx` is passed to UpdateTree.
// trees:        trees to grow, processed in order.
void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
|
||||
common::Span<HostDeviceVector<bst_node_t>> out_position,
|
||||
const std::vector<RegTree*>& trees) override {
|
||||
monitor_.Start("Update");
|
||||
|
||||
this->InitDataOnce(p_fmat);
|
||||
// build tree
|
||||
// One hessian slot per gradient pair.
hess_.resize(gpair->Size());
|
||||
auto hess = dh::ToSpan(hess_);
|
||||
|
||||
gpair->SetDevice(ctx_->Device());
|
||||
auto d_gpair = gpair->ConstDeviceSpan();
|
||||
auto cuctx = ctx_->CUDACtx();
|
||||
// Write GetHess() of each gradient pair into the matching element of `hess`.
thrust::transform(cuctx->CTP(), dh::tcbegin(d_gpair), dh::tcend(d_gpair), dh::tbegin(hess),
|
||||
[=] XGBOOST_DEVICE(GradientPair const& g) { return g.GetHess(); });
|
||||
|
||||
auto const& info = p_fmat->Info();
|
||||
info.feature_types.SetDevice(ctx_->Device());
|
||||
// NOTE(review): the third BatchParam argument presumably requests regenerating the
// sketch when the task's hessian is not constant — confirm against BatchParam.
auto batch = BatchParam{param->max_bin, hess, !task_->const_hess};
|
||||
// The device maker is rebuilt on every Update() call, with the batch parameter above.
maker_ = std::make_unique<GPUHistMakerDevice>(
|
||||
ctx_, !p_fmat->SingleColBlock(), info.feature_types.ConstDeviceSpan(), info.num_row_,
|
||||
*param, column_sampler_, info.num_col_, batch);
|
||||
|
||||
std::size_t t_idx{0};
|
||||
for (xgboost::RegTree* tree : trees) {
|
||||
this->UpdateTree(gpair, p_fmat, tree, &out_position[t_idx]);
|
||||
// Only performs a check when `debug_synchronize` is enabled.
this->hist_maker_param_.CheckTreesSynchronized(tree);
|
||||
++t_idx;
|
||||
}
|
||||
|
||||
monitor_.Stop("Update");
|
||||
}
|
||||
|
||||
// Runs its body only while `initialised_` is false: verifies the context is CUDA,
// creates the column sampler from a broadcast seed, and records `p_fmat` as the last
// matrix seen.
void InitDataOnce(DMatrix* p_fmat) {
|
||||
if (this->initialised_) {
|
||||
return;
|
||||
}
|
||||
|
||||
monitor_.Start(__func__);
|
||||
CHECK(ctx_->IsCUDA()) << error::InvalidCUDAOrdinal();
|
||||
// Synchronise the column sampling seed
|
||||
uint32_t column_sampling_seed = common::GlobalRandom()();
|
||||
// NOTE(review): the last argument is presumably the root rank (0) — confirm.
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
|
||||
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
|
||||
|
||||
p_last_fmat_ = p_fmat;
|
||||
initialised_ = true;
|
||||
monitor_.Stop(__func__);
|
||||
}
|
||||
|
||||
// Ensures one-time initialisation has happened and records `p_tree` as the last tree.
void InitData(DMatrix* p_fmat, RegTree const* p_tree) {
|
||||
this->InitDataOnce(p_fmat);
|
||||
p_last_tree_ = p_tree;
|
||||
}
|
||||
|
||||
// Grows a single tree by delegating to `maker_`; `maker_` is assigned in Update().
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree,
|
||||
HostDeviceVector<bst_node_t>* p_out_position) {
|
||||
monitor_.Start("InitData");
|
||||
this->InitData(p_fmat, p_tree);
|
||||
monitor_.Stop("InitData");
|
||||
|
||||
gpair->SetDevice(ctx_->gpu_id);
|
||||
maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
|
||||
}
|
||||
|
||||
// Returns false when there is no device maker yet or `data` is not the matrix recorded
// in `p_last_fmat_`; otherwise returns the result of the device maker's
// UpdatePredictionCache for the last tree.
bool UpdatePredictionCache(const DMatrix* data,
|
||||
linalg::MatrixView<bst_float> p_out_preds) override {
|
||||
if (maker_ == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
|
||||
return false;
|
||||
}
|
||||
monitor_.Start("UpdatePredictionCache");
|
||||
bool result = maker_->UpdatePredictionCache(p_out_preds, p_last_tree_);
|
||||
monitor_.Stop("UpdatePredictionCache");
|
||||
return result;
|
||||
}
|
||||
|
||||
[[nodiscard]] char const* Name() const override { return "grow_gpu_approx"; }
|
||||
[[nodiscard]] bool HasNodePosition() const override { return true; }
|
||||
|
||||
private:
|
||||
// Set by InitDataOnce; cleared by Configure and LoadConfig.
bool initialised_{false};
|
||||
|
||||
GPUHistMakerTrainParam hist_maker_param_;
|
||||
// Device buffer of per-sample hessians, refilled in every Update() call.
dh::device_vector<float> hess_;
|
||||
// Created in InitDataOnce and passed to each GPUHistMakerDevice built in Update().
std::shared_ptr<common::ColumnSampler> column_sampler_;
|
||||
std::unique_ptr<GPUHistMakerDevice> maker_;
|
||||
|
||||
// Matrix recorded by InitDataOnce and tree recorded by InitData; both are read by
// UpdatePredictionCache.
DMatrix* p_last_fmat_{nullptr};
|
||||
RegTree const* p_last_tree_{nullptr};
|
||||
ObjInfo const* task_{nullptr};
|
||||
|
||||
common::Monitor monitor_;
|
||||
};
|
||||
|
||||
#if !defined(GTEST_TEST)
|
||||
// Registers the "grow_gpu_approx" updater; its factory body returns a newly allocated
// GPUGlobalApproxMaker built from the given context and task.
XGBOOST_REGISTER_TREE_UPDATER(GPUApproxMaker, "grow_gpu_approx")
|
||||
.describe("Grow tree with GPU.")
|
||||
.set_body([](Context const* ctx, ObjInfo const* task) {
|
||||
return new GPUGlobalApproxMaker(ctx, task);
|
||||
});
|
||||
#endif // !defined(GTEST_TEST)
|
||||
} // namespace xgboost::tree
|
||||
|
||||
Reference in New Issue
Block a user