Remove internal use of gpu_id. (#9568)
This commit is contained in:
@@ -19,8 +19,7 @@
|
||||
#include "xgboost/linalg.h" // TensorView, Tensor, Constant
|
||||
#include "xgboost/logging.h" // CHECK_EQ
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace xgboost::tree {
|
||||
namespace cpu_impl {
|
||||
void FitStump(Context const* ctx, MetaInfo const& info,
|
||||
linalg::TensorView<GradientPair const, 2> gpair,
|
||||
@@ -68,7 +67,7 @@ inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView<Gradien
|
||||
|
||||
void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
|
||||
bst_target_t n_targets, linalg::Vector<float>* out) {
|
||||
out->SetDevice(ctx->gpu_id);
|
||||
out->SetDevice(ctx->Device());
|
||||
out->Reshape(n_targets);
|
||||
|
||||
gpair.SetDevice(ctx->Device());
|
||||
@@ -76,5 +75,4 @@ void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientP
|
||||
ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
|
||||
: cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
|
||||
}
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::tree
|
||||
|
||||
@@ -21,9 +21,7 @@
|
||||
#include "xgboost/logging.h" // CHECK_EQ
|
||||
#include "xgboost/span.h" // span
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace cuda_impl {
|
||||
namespace xgboost::tree::cuda_impl {
|
||||
void FitStump(Context const* ctx, MetaInfo const& info,
|
||||
linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) {
|
||||
auto n_targets = out.Size();
|
||||
@@ -50,7 +48,7 @@ void FitStump(Context const* ctx, MetaInfo const& info,
|
||||
thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
|
||||
thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
|
||||
|
||||
collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()),
|
||||
collective::GlobalSum(info, ctx->Device(), reinterpret_cast<double*>(d_sum.Values().data()),
|
||||
d_sum.Size() * 2);
|
||||
|
||||
thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
|
||||
@@ -59,6 +57,4 @@ void FitStump(Context const* ctx, MetaInfo const& info,
|
||||
CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess()));
|
||||
});
|
||||
}
|
||||
} // namespace cuda_impl
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::tree::cuda_impl
|
||||
|
||||
@@ -413,7 +413,7 @@ void GPUHistEvaluator::EvaluateSplits(
|
||||
auto const world_size = collective::GetWorldSize();
|
||||
dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size);
|
||||
auto all_candidates = dh::ToSpan(all_candidate_storage);
|
||||
collective::AllGather(device_, out_splits.data(), all_candidates.data(),
|
||||
collective::AllGather(device_.ordinal, out_splits.data(), all_candidates.data(),
|
||||
out_splits.size() * sizeof(DeviceSplitCandidate));
|
||||
|
||||
// Reduce to get the best candidate from all workers.
|
||||
|
||||
@@ -85,7 +85,7 @@ class GPUHistEvaluator {
|
||||
std::size_t node_categorical_storage_size_ = 0;
|
||||
// Is the data split column-wise?
|
||||
bool is_column_split_ = false;
|
||||
int32_t device_;
|
||||
DeviceOrd device_;
|
||||
|
||||
// Copy the categories from device to host asynchronously.
|
||||
void CopyToHost( const std::vector<bst_node_t>& nidx);
|
||||
@@ -133,14 +133,14 @@ class GPUHistEvaluator {
|
||||
}
|
||||
|
||||
public:
|
||||
GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, int32_t device)
|
||||
GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, DeviceOrd device)
|
||||
: tree_evaluator_{param, n_features, device}, param_{param} {}
|
||||
/**
|
||||
* \brief Reset the evaluator, should be called before any use.
|
||||
*/
|
||||
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
|
||||
bst_feature_t n_features, TrainParam const &param, bool is_column_split,
|
||||
int32_t device);
|
||||
DeviceOrd device);
|
||||
|
||||
/**
|
||||
* \brief Get host category storage for nidx. Different from the internal version, this
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2022 by XGBoost Contributors
|
||||
* Copyright 2022-2023 by XGBoost Contributors
|
||||
*
|
||||
* \brief Some components of GPU Hist evaluator, this file only exist to reduce nvcc
|
||||
* compilation time.
|
||||
@@ -12,11 +12,10 @@
|
||||
#include "evaluate_splits.cuh"
|
||||
#include "xgboost/data.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace xgboost::tree {
|
||||
void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
|
||||
bst_feature_t n_features, TrainParam const &param,
|
||||
bool is_column_split, int32_t device) {
|
||||
bool is_column_split, DeviceOrd device) {
|
||||
param_ = param;
|
||||
tree_evaluator_ = TreeEvaluator{param, n_features, device};
|
||||
has_categoricals_ = cuts.HasCategorical();
|
||||
@@ -127,6 +126,4 @@ common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
|
||||
});
|
||||
return dh::ToSpan(cat_sorted_idx_);
|
||||
}
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::tree
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2020 by XGBoost Contributors
|
||||
/**
|
||||
* Copyright 2020-2023 by XGBoost Contributors
|
||||
*/
|
||||
#ifndef FEATURE_GROUPS_CUH_
|
||||
#define FEATURE_GROUPS_CUH_
|
||||
@@ -102,11 +102,10 @@ struct FeatureGroups {
|
||||
InitSingle(cuts);
|
||||
}
|
||||
|
||||
FeatureGroupsAccessor DeviceAccessor(int device) const {
|
||||
[[nodiscard]] FeatureGroupsAccessor DeviceAccessor(DeviceOrd device) const {
|
||||
feature_segments.SetDevice(device);
|
||||
bin_segments.SetDevice(device);
|
||||
return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(),
|
||||
max_group_bins};
|
||||
return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), max_group_bins};
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
@@ -167,10 +167,10 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
|
||||
for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
|
||||
auto page = batch.Impl();
|
||||
if (!page_) {
|
||||
page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
|
||||
page_ = std::make_unique<EllpackPageImpl>(ctx->Device(), page->Cuts(), page->is_dense,
|
||||
page->row_stride, dmat->Info().num_row_);
|
||||
}
|
||||
size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
|
||||
size_t num_elements = page_->Copy(ctx->Device(), page, offset);
|
||||
offset += num_elements;
|
||||
}
|
||||
page_concatenated_ = true;
|
||||
@@ -228,13 +228,13 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
|
||||
auto first_page = (*batch_iterator.begin()).Impl();
|
||||
// Create a new ELLPACK page with empty rows.
|
||||
page_.reset(); // Release the device memory first before reallocating
|
||||
page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
|
||||
page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
|
||||
first_page->row_stride, sample_rows));
|
||||
|
||||
// Compact the ELLPACK pages into the single sample page.
|
||||
thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
||||
for (auto& batch : batch_iterator) {
|
||||
page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
|
||||
page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
|
||||
}
|
||||
|
||||
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
|
||||
@@ -306,13 +306,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
|
||||
auto first_page = (*batch_iterator.begin()).Impl();
|
||||
// Create a new ELLPACK page with empty rows.
|
||||
page_.reset(); // Release the device memory first before reallocating
|
||||
page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
|
||||
page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
|
||||
first_page->row_stride, sample_rows));
|
||||
|
||||
// Compact the ELLPACK pages into the single sample page.
|
||||
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
||||
for (auto& batch : batch_iterator) {
|
||||
page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
|
||||
page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
|
||||
}
|
||||
|
||||
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
|
||||
|
||||
@@ -13,15 +13,15 @@
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
|
||||
RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
|
||||
RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows)
|
||||
: device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
|
||||
dh::safe_cuda(cudaSetDevice(device_idx_));
|
||||
dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
|
||||
ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
|
||||
thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
|
||||
}
|
||||
|
||||
RowPartitioner::~RowPartitioner() {
|
||||
dh::safe_cuda(cudaSetDevice(device_idx_));
|
||||
dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
|
||||
}
|
||||
|
||||
common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {
|
||||
|
||||
@@ -199,7 +199,7 @@ class RowPartitioner {
|
||||
static constexpr bst_node_t kIgnoredTreePosition = -1;
|
||||
|
||||
private:
|
||||
int device_idx_;
|
||||
DeviceOrd device_idx_;
|
||||
/*! \brief In here if you want to find the rows belong to a node nid, first you need to
|
||||
* get the indices segment from ridx_segments[nid], then get the row index that
|
||||
* represents position of row in input data X. `RowPartitioner::GetRows` would be a
|
||||
@@ -223,7 +223,7 @@ class RowPartitioner {
|
||||
dh::PinnedMemory pinned2_;
|
||||
|
||||
public:
|
||||
RowPartitioner(int device_idx, size_t num_rows);
|
||||
RowPartitioner(DeviceOrd device_idx, size_t num_rows);
|
||||
~RowPartitioner();
|
||||
RowPartitioner(const RowPartitioner&) = delete;
|
||||
RowPartitioner& operator=(const RowPartitioner&) = delete;
|
||||
|
||||
@@ -477,7 +477,7 @@ class HistEvaluator {
|
||||
: ctx_{ctx},
|
||||
param_{param},
|
||||
column_sampler_{std::move(sampler)},
|
||||
tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId},
|
||||
tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), DeviceOrd::CPU()},
|
||||
is_col_split_{info.IsColumnSplit()} {
|
||||
interaction_constraints_.Configure(*param, info.num_col_);
|
||||
column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
|
||||
@@ -696,7 +696,7 @@ class HistMultiEvaluator {
|
||||
stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets);
|
||||
gain_.resize(1);
|
||||
|
||||
linalg::Vector<float> weight({n_targets}, ctx_->gpu_id);
|
||||
linalg::Vector<float> weight({n_targets}, ctx_->Device());
|
||||
CalcWeight(*param_, root_sum, weight.HostView());
|
||||
auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView());
|
||||
gain_.front() = root_gain;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2018-2020 by Contributors
|
||||
/**
|
||||
* Copyright 2018-2023 by Contributors
|
||||
* \file split_evaluator.h
|
||||
* \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation.
|
||||
* \author Henry Gouk
|
||||
@@ -23,8 +23,7 @@
|
||||
#include "xgboost/host_device_vector.h"
|
||||
#include "xgboost/tree_model.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace xgboost::tree {
|
||||
class TreeEvaluator {
|
||||
// hist and exact use parent id to calculate constraints.
|
||||
static constexpr bst_node_t kRootParentId =
|
||||
@@ -33,13 +32,13 @@ class TreeEvaluator {
|
||||
HostDeviceVector<float> lower_bounds_;
|
||||
HostDeviceVector<float> upper_bounds_;
|
||||
HostDeviceVector<int32_t> monotone_;
|
||||
int32_t device_;
|
||||
DeviceOrd device_;
|
||||
bool has_constraint_;
|
||||
|
||||
public:
|
||||
TreeEvaluator(TrainParam const& p, bst_feature_t n_features, int32_t device) {
|
||||
TreeEvaluator(TrainParam const& p, bst_feature_t n_features, DeviceOrd device) {
|
||||
device_ = device;
|
||||
if (device != Context::kCpuId) {
|
||||
if (device.IsCUDA()) {
|
||||
lower_bounds_.SetDevice(device);
|
||||
upper_bounds_.SetDevice(device);
|
||||
monotone_.SetDevice(device);
|
||||
@@ -59,7 +58,7 @@ class TreeEvaluator {
|
||||
has_constraint_ = true;
|
||||
}
|
||||
|
||||
if (device_ != Context::kCpuId) {
|
||||
if (device_.IsCUDA()) {
|
||||
// Pull to device early.
|
||||
lower_bounds_.ConstDeviceSpan();
|
||||
upper_bounds_.ConstDeviceSpan();
|
||||
@@ -122,7 +121,7 @@ class TreeEvaluator {
|
||||
}
|
||||
|
||||
// Fast floating point division instruction on device
|
||||
XGBOOST_DEVICE float Divide(float a, float b) const {
|
||||
[[nodiscard]] XGBOOST_DEVICE float Divide(float a, float b) const {
|
||||
#ifdef __CUDA_ARCH__
|
||||
return __fdividef(a, b);
|
||||
#else
|
||||
@@ -154,7 +153,7 @@ class TreeEvaluator {
|
||||
public:
|
||||
/* Get a view to the evaluator that can be passed down to device. */
|
||||
template <typename ParamT = TrainParam> auto GetEvaluator() const {
|
||||
if (device_ != Context::kCpuId) {
|
||||
if (device_.IsCUDA()) {
|
||||
auto constraints = monotone_.ConstDevicePointer();
|
||||
return SplitEvaluator<ParamT>{constraints, lower_bounds_.ConstDevicePointer(),
|
||||
upper_bounds_.ConstDevicePointer(), has_constraint_};
|
||||
@@ -215,7 +214,6 @@ enum SplitType {
|
||||
// partition-based categorical split
|
||||
kPart = 2
|
||||
};
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::tree
|
||||
|
||||
#endif // XGBOOST_TREE_SPLIT_EVALUATOR_H_
|
||||
|
||||
@@ -154,7 +154,7 @@ class ColMaker: public TreeUpdater {
|
||||
: param_(param),
|
||||
colmaker_train_param_{colmaker_train_param},
|
||||
ctx_{ctx},
|
||||
tree_evaluator_(param_, column_densities.size(), Context::kCpuId),
|
||||
tree_evaluator_(param_, column_densities.size(), DeviceOrd::CPU()),
|
||||
interaction_constraints_{std::move(_interaction_constraints)},
|
||||
column_densities_(column_densities) {}
|
||||
// update one tree, growing
|
||||
|
||||
@@ -74,7 +74,7 @@ class DeviceHistogramStorage {
|
||||
dh::device_vector<typename GradientSumT::ValueT> overflow_;
|
||||
std::map<int, size_t> overflow_nidx_map_;
|
||||
int n_bins_;
|
||||
int device_id_;
|
||||
DeviceOrd device_id_;
|
||||
static constexpr size_t kNumItemsInGradientSum =
|
||||
sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
|
||||
static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
|
||||
@@ -82,7 +82,7 @@ class DeviceHistogramStorage {
|
||||
public:
|
||||
// Start with about 16mb
|
||||
DeviceHistogramStorage() { data_.reserve(1 << 22); }
|
||||
void Init(int device_id, int n_bins) {
|
||||
void Init(DeviceOrd device_id, int n_bins) {
|
||||
this->n_bins_ = n_bins;
|
||||
this->device_id_ = device_id;
|
||||
}
|
||||
@@ -196,7 +196,7 @@ struct GPUHistMakerDevice {
|
||||
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
|
||||
TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler,
|
||||
uint32_t n_features, BatchParam batch_param, MetaInfo const& info)
|
||||
: evaluator_{_param, n_features, ctx->gpu_id},
|
||||
: evaluator_{_param, n_features, ctx->Device()},
|
||||
ctx_(ctx),
|
||||
feature_types{_feature_types},
|
||||
param(std::move(_param)),
|
||||
@@ -211,7 +211,7 @@ struct GPUHistMakerDevice {
|
||||
}
|
||||
|
||||
CHECK(column_sampler_);
|
||||
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
|
||||
monitor.Init(std::string("GPUHistMakerDevice") + ctx_->Device().Name());
|
||||
}
|
||||
|
||||
~GPUHistMakerDevice() = default;
|
||||
@@ -220,7 +220,7 @@ struct GPUHistMakerDevice {
|
||||
if (!feature_groups) {
|
||||
CHECK(page);
|
||||
feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
|
||||
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
|
||||
dh::MaxSharedMemoryOptin(ctx_->Ordinal()),
|
||||
sizeof(GradientPairPrecise));
|
||||
}
|
||||
}
|
||||
@@ -231,7 +231,7 @@ struct GPUHistMakerDevice {
|
||||
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
|
||||
param.colsample_bynode, param.colsample_bylevel,
|
||||
param.colsample_bytree);
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||
|
||||
this->interaction_constraints.Reset();
|
||||
|
||||
@@ -246,15 +246,15 @@ struct GPUHistMakerDevice {
|
||||
gpair = sample.gpair;
|
||||
|
||||
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
|
||||
dmat->Info().IsColumnSplit(), ctx_->gpu_id);
|
||||
dmat->Info().IsColumnSplit(), ctx_->Device());
|
||||
|
||||
quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info());
|
||||
|
||||
row_partitioner.reset(); // Release the device memory first before reallocating
|
||||
row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows);
|
||||
row_partitioner = std::make_unique<RowPartitioner>(ctx_->Device(), sample.sample_rows);
|
||||
|
||||
// Init histogram
|
||||
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
|
||||
hist.Init(ctx_->Device(), page->Cuts().TotalBins());
|
||||
hist.Reset();
|
||||
|
||||
this->InitFeatureGroupsOnce();
|
||||
@@ -267,7 +267,7 @@ struct GPUHistMakerDevice {
|
||||
sampled_features->SetDevice(ctx_->Device());
|
||||
common::Span<bst_feature_t> feature_set =
|
||||
interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
|
||||
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
|
||||
auto matrix = page->GetDeviceAccessor(ctx_->Device());
|
||||
EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)};
|
||||
EvaluateSplitSharedInputs shared_inputs{
|
||||
gpu_param,
|
||||
@@ -289,7 +289,7 @@ struct GPUHistMakerDevice {
|
||||
dh::TemporaryArray<DeviceSplitCandidate> splits_out(2 * candidates.size());
|
||||
std::vector<bst_node_t> nidx(2 * candidates.size());
|
||||
auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size());
|
||||
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
|
||||
auto matrix = page->GetDeviceAccessor(ctx_->Device());
|
||||
EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types,
|
||||
matrix.feature_segments, matrix.gidx_fvalue_map,
|
||||
matrix.min_fvalue,
|
||||
@@ -342,9 +342,9 @@ struct GPUHistMakerDevice {
|
||||
void BuildHist(int nidx) {
|
||||
auto d_node_hist = hist.GetNodeHistogram(nidx);
|
||||
auto d_ridx = row_partitioner->GetRows(nidx);
|
||||
BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->gpu_id),
|
||||
feature_groups->DeviceAccessor(ctx_->gpu_id), gpair, d_ridx, d_node_hist,
|
||||
*quantiser);
|
||||
BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()),
|
||||
feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx,
|
||||
d_node_hist, *quantiser);
|
||||
}
|
||||
|
||||
// Attempt to do subtraction trick
|
||||
@@ -413,10 +413,10 @@ struct GPUHistMakerDevice {
|
||||
});
|
||||
|
||||
collective::AllReduce<collective::Operation::kBitwiseOR>(
|
||||
ctx_->gpu_id, decision_storage.data().get(), decision_storage.size());
|
||||
ctx_->Ordinal(), decision_storage.data().get(), decision_storage.size());
|
||||
collective::AllReduce<collective::Operation::kBitwiseAND>(
|
||||
ctx_->gpu_id, missing_storage.data().get(), missing_storage.size());
|
||||
collective::Synchronize(ctx_->gpu_id);
|
||||
ctx_->Ordinal(), missing_storage.data().get(), missing_storage.size());
|
||||
collective::Synchronize(ctx_->Ordinal());
|
||||
|
||||
row_partitioner->UpdatePositionBatch(
|
||||
nidx, left_nidx, right_nidx, split_data,
|
||||
@@ -454,7 +454,7 @@ struct GPUHistMakerDevice {
|
||||
CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
|
||||
}
|
||||
|
||||
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
|
||||
auto d_matrix = page->GetDeviceAccessor(ctx_->Device());
|
||||
|
||||
if (info_.IsColumnSplit()) {
|
||||
UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx);
|
||||
@@ -524,9 +524,9 @@ struct GPUHistMakerDevice {
|
||||
common::Span<FeatureType const> d_feature_types, common::Span<uint32_t const> categories,
|
||||
common::Span<RegTree::CategoricalSplitMatrix::Segment> categories_segments,
|
||||
HostDeviceVector<bst_node_t>* p_out_position) {
|
||||
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
|
||||
auto d_matrix = page->GetDeviceAccessor(ctx_->Device());
|
||||
auto d_gpair = this->gpair;
|
||||
p_out_position->SetDevice(ctx_->gpu_id);
|
||||
p_out_position->SetDevice(ctx_->Device());
|
||||
p_out_position->Resize(row_partitioner->GetRows().size());
|
||||
|
||||
auto new_position_op = [=] __device__(size_t row_id, int position) {
|
||||
@@ -613,7 +613,7 @@ struct GPUHistMakerDevice {
|
||||
monitor.Start("AllReduce");
|
||||
auto d_node_hist = hist.GetNodeHistogram(nidx).data();
|
||||
using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
|
||||
collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist),
|
||||
collective::GlobalSum(info_, ctx_->Device(), reinterpret_cast<ReduceT*>(d_node_hist),
|
||||
page->Cuts().TotalBins() * 2 * num_histograms);
|
||||
|
||||
monitor.Stop("AllReduce");
|
||||
@@ -855,7 +855,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
}
|
||||
|
||||
void InitDataOnce(TrainParam const* param, DMatrix* dmat) {
|
||||
CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device";
|
||||
CHECK_GE(ctx_->Ordinal(), 0) << "Must have at least one device";
|
||||
info_ = &dmat->Info();
|
||||
|
||||
// Synchronise the column sampling seed
|
||||
@@ -864,8 +864,8 @@ class GPUHistMaker : public TreeUpdater {
|
||||
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
|
||||
|
||||
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
info_->feature_types.SetDevice(ctx_->gpu_id);
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||
info_->feature_types.SetDevice(ctx_->Device());
|
||||
maker = std::make_unique<GPUHistMakerDevice>(
|
||||
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
|
||||
*param, column_sampler_, info_->num_col_, batch_param, dmat->Info());
|
||||
@@ -890,7 +890,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
this->InitData(param, p_fmat, p_tree);
|
||||
monitor_.Stop("InitData");
|
||||
|
||||
gpair->SetDevice(ctx_->gpu_id);
|
||||
gpair->SetDevice(ctx_->Device());
|
||||
maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
|
||||
}
|
||||
|
||||
@@ -1023,7 +1023,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
|
||||
this->InitData(p_fmat, p_tree);
|
||||
monitor_.Stop("InitData");
|
||||
|
||||
gpair->SetDevice(ctx_->gpu_id);
|
||||
gpair->SetDevice(ctx_->Device());
|
||||
maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
|
||||
}
|
||||
|
||||
|
||||
@@ -518,7 +518,7 @@ class QuantileHistMaker : public TreeUpdater {
|
||||
auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; };
|
||||
if (need_copy()) {
|
||||
// allocate buffer
|
||||
sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->gpu_id, linalg::Order::kF};
|
||||
sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->Device(), linalg::Order::kF};
|
||||
h_sample_out = sample_out.HostView();
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user