Remove internal use of gpu_id. (#9568)

This commit is contained in:
Jiaming Yuan
2023-09-20 23:29:51 +08:00
committed by GitHub
parent 38ac52dd87
commit 8c676c889d
121 changed files with 1012 additions and 1044 deletions

View File

@@ -123,7 +123,7 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
[=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
}
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan) {
@@ -240,13 +240,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
    return {0, e.index, e.fvalue}; // row_idx is not needed for scanning column size.
});
detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature,
detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature,
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) {
auto p_weight = entry_weight.empty() ? nullptr : &entry_weight;
detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight,
detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight,
&column_sizes_scan);
}
@@ -347,7 +347,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b
HistogramCuts cuts;
SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_,
ctx->Ordinal());
ctx->Device());
CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty());
for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
std::size_t page_nnz = page.data.Size();

View File

@@ -82,9 +82,9 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
}
template <std::uint32_t kBlockThreads, typename Kernel>
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) {
int n_mps = 0;
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal));
int n_blocks_per_mp = 0;
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, shared_mem));
@@ -106,11 +106,11 @@ std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t s
* \param out_column_size Output buffer for the size of each column.
*/
template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter,
void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);
std::size_t max_shared_memory = dh::MaxSharedMemory(device);
std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal);
// Not strictly correct as we should use number of samples to determine the type of
// counter. However, the sample size is not known due to sliding window on number of
// elements.
@@ -154,7 +154,7 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter
}
template <typename BatchIt>
void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature,
void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature,
IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
dh::caching_device_vector<size_t>* column_sizes_scan) {
@@ -215,7 +215,8 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
// Count the valid entries in each column and copy them out.
template <typename AdapterBatch, typename BatchIter>
void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
float missing, size_t columns, size_t cuts_per_feature, int device,
float missing, size_t columns, size_t cuts_per_feature,
DeviceOrd device,
HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
dh::caching_device_vector<size_t>* column_sizes_scan,
dh::device_vector<Entry>* sorted_entries) {
@@ -239,7 +240,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
void SortByWeight(dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries);
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan);
@@ -277,7 +278,7 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t
template <typename AdapterBatch>
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
int device, size_t columns, size_t begin, size_t end,
DeviceOrd device, size_t columns, size_t begin, size_t end,
float missing, SketchContainer *sketch_container,
int num_cuts) {
// Copy current subset of valid elements into temporary storage and sort
@@ -316,11 +317,11 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
template <typename Batch>
void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
int num_cuts_per_feature,
bool is_ranking, float missing, int device,
bool is_ranking, float missing, DeviceOrd device,
size_t columns, size_t begin, size_t end,
SketchContainer *sketch_container) {
dh::XGBCachingDeviceAllocator<char> alloc;
dh::safe_cuda(cudaSetDevice(device));
dh::safe_cuda(cudaSetDevice(device.ordinal));
info.weights_.SetDevice(device);
auto weights = info.weights_.ConstDeviceSpan();
@@ -412,14 +413,14 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
size_t num_rows = batch.NumRows();
size_t num_cols = batch.NumCols();
size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows);
int32_t device = sketch_container->DeviceIdx();
auto device = sketch_container->DeviceIdx();
bool weighted = !info.weights_.Empty();
if (weighted) {
sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, true);
device.ordinal, num_cuts_per_feature, true);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
@@ -432,7 +433,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, false);
device.ordinal, num_cuts_per_feature, false);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));

View File

@@ -33,19 +33,19 @@ struct HostDeviceVectorImpl {
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int)
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int)
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int)
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
@@ -81,7 +81,7 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
template <typename T>
int HostDeviceVector<T>::DeviceIdx() const { return -1; }
DeviceOrd HostDeviceVector<T>::Device() const { return DeviceOrd::CPU(); }
template <typename T>
T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
@@ -165,9 +165,6 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
return false;
}
template <typename T>
void HostDeviceVector<T>::SetDevice(int) const {}
template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}

View File

@@ -25,8 +25,8 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) {
template <typename T>
class HostDeviceVectorImpl {
public:
HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) {
if (device >= 0) {
HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) {
if (device.IsCUDA()) {
gpu_access_ = GPUAccess::kWrite;
SetDevice();
data_d_->resize(size, v);
@@ -37,8 +37,8 @@ class HostDeviceVectorImpl {
// Initializer can be std::vector<T> or std::initializer_list<T>
template <class Initializer>
HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) {
if (device >= 0) {
HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) {
if (device.IsCUDA()) {
gpu_access_ = GPUAccess::kWrite;
LazyResizeDevice(init.size());
Copy(init);
@@ -54,16 +54,16 @@ class HostDeviceVectorImpl {
gpu_access_{that.gpu_access_} {}
~HostDeviceVectorImpl() {
if (device_ >= 0) {
if (device_.IsCUDA()) {
SetDevice();
}
}
size_t Size() const {
[[nodiscard]] size_t Size() const {
return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->size() : 0;
}
int DeviceIdx() const { return device_; }
[[nodiscard]] DeviceOrd Device() const { return device_; }
T* DevicePointer() {
LazySyncDevice(GPUAccess::kWrite);
@@ -138,7 +138,7 @@ class HostDeviceVectorImpl {
} else {
auto ptr = other->ConstDevicePointer();
SetDevice();
CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
CHECK_EQ(this->Device(), other->Device());
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
ptr,
other->Size() * sizeof(T),
@@ -156,24 +156,25 @@ class HostDeviceVectorImpl {
return data_h_;
}
void SetDevice(int device) {
void SetDevice(DeviceOrd device) {
if (device_ == device) { return; }
if (device_ >= 0) {
if (device_.IsCUDA()) {
LazySyncHost(GPUAccess::kNone);
}
if (device_ >= 0 && device >= 0) {
CHECK_EQ(device_, device) << "New device ordinal is different from previous one.";
if (device_.IsCUDA() && device.IsCUDA()) {
CHECK_EQ(device_.ordinal, device.ordinal)
<< "New device ordinal is different from previous one.";
}
device_ = device;
if (device_ >= 0) {
if (device_.IsCUDA()) {
LazyResizeDevice(data_h_.size());
}
}
void Resize(size_t new_size, T v) {
if (new_size == Size()) { return; }
if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) {
if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
// fast on-device resize
gpu_access_ = GPUAccess::kWrite;
SetDevice();
@@ -218,16 +219,16 @@ class HostDeviceVectorImpl {
gpu_access_ = access;
}
bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
GPUAccess Access() const { return gpu_access_; }
[[nodiscard]] bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
[[nodiscard]] bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
[[nodiscard]] bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
[[nodiscard]] bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
[[nodiscard]] bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
[[nodiscard]] bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
[[nodiscard]] GPUAccess Access() const { return gpu_access_; }
private:
int device_{-1};
DeviceOrd device_{DeviceOrd::CPU()};
std::vector<T> data_h_{};
std::unique_ptr<dh::device_vector<T>> data_d_{};
GPUAccess gpu_access_{GPUAccess::kNone};
@@ -259,11 +260,11 @@ class HostDeviceVectorImpl {
}
void SetDevice() {
CHECK_GE(device_, 0);
CHECK_GE(device_.ordinal, 0);
if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
} else {
(*cudaSetDeviceHandler)(device_);
(*cudaSetDeviceHandler)(device_.ordinal);
}
if (!data_d_) {
@@ -273,15 +274,15 @@ class HostDeviceVectorImpl {
};
template<typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T>
@@ -309,7 +310,9 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
template <typename T>
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
DeviceOrd HostDeviceVector<T>::Device() const {
return impl_->Device();
}
template <typename T>
T* HostDeviceVector<T>::DevicePointer() {
@@ -389,14 +392,9 @@ GPUAccess HostDeviceVector<T>::DeviceAccess() const {
return impl_->Access();
}
template <typename T>
void HostDeviceVector<T>::SetDevice(int device) const {
impl_->SetDevice(device);
}
template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
impl_->SetDevice(device.ordinal);
impl_->SetDevice(device);
}
template <typename T>

View File

@@ -8,16 +8,12 @@
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // HostDeviceVector
namespace xgboost {
namespace common {
namespace cuda_impl {
namespace xgboost::common::cuda_impl {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
values.SetDevice(ctx->gpu_id);
values.SetDevice(ctx->Device());
auto const d_values = values.ConstDeviceSpan();
dh::XGBCachingDeviceAllocator<char> alloc;
return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
thrust::plus<float>{});
}
} // namespace cuda_impl
} // namespace common
} // namespace xgboost
} // namespace xgboost::common::cuda_impl

View File

@@ -24,7 +24,7 @@ struct OptionalWeights {
inline OptionalWeights MakeOptionalWeights(Context const* ctx,
HostDeviceVector<float> const& weights) {
if (ctx->IsCUDA()) {
weights.SetDevice(ctx->gpu_id);
weights.SetDevice(ctx->Device());
}
return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
}

View File

@@ -207,10 +207,10 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
// summary does the output element come from) result by definition of merged rank. So we
// run it in 2 passes to obtain the merge path and then customize the standard merge
// algorithm.
void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
void MergeImpl(DeviceOrd device, Span<SketchEntry const> const &d_x,
Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
dh::safe_cuda(cudaSetDevice(device));
dh::safe_cuda(cudaSetDevice(device.ordinal));
CHECK_EQ(d_x.size() + d_y.size(), out.size());
CHECK_EQ(x_ptr.size(), out_ptr.size());
CHECK_EQ(y_ptr.size(), out_ptr.size());
@@ -308,7 +308,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr,
size_t total_cuts, Span<float> weights) {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
Span<SketchEntry> out;
dh::device_vector<SketchEntry> cuts;
bool first_window = this->Current().empty();
@@ -367,7 +367,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
* pruning or merging. We preserve the first type and remove the second type.
*/
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
dh::XGBCachingDeviceAllocator<char> alloc;
@@ -407,7 +407,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
void SketchContainer::Prune(size_t to) {
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
OffsetT to_total = 0;
auto& h_columns_ptr = columns_ptr_b_.HostVector();
@@ -442,7 +442,7 @@ void SketchContainer::Prune(size_t to) {
void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
Span<SketchEntry const> that) {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
timer_.Start(__func__);
if (this->Current().size() == 0) {
CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
@@ -477,7 +477,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
}
void SketchContainer::FixError() {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
auto in = dh::ToSpan(this->Current());
dh::LaunchN(in.size(), [=] __device__(size_t idx) {
@@ -502,7 +502,7 @@ void SketchContainer::FixError() {
}
void SketchContainer::AllReduce(bool is_column_split) {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
auto world = collective::GetWorldSize();
if (world == 1 || is_column_split) {
return;
@@ -529,15 +529,15 @@ void SketchContainer::AllReduce(bool is_column_split) {
auto offset = rank * d_columns_ptr.size();
thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(),
gathered_ptrs.begin() + offset);
collective::AllReduce<collective::Operation::kSum>(device_, gathered_ptrs.data().get(),
collective::AllReduce<collective::Operation::kSum>(device_.ordinal, gathered_ptrs.data().get(),
gathered_ptrs.size());
// Get the data from all workers.
std::vector<size_t> recv_lengths;
dh::caching_device_vector<char> recvbuf;
collective::AllGatherV(device_, this->Current().data().get(),
collective::AllGatherV(device_.ordinal, this->Current().data().get(),
dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf);
collective::Synchronize(device_);
collective::Synchronize(device_.ordinal);
// Segment the received data.
auto s_recvbuf = dh::ToSpan(recvbuf);
@@ -584,7 +584,7 @@ struct InvalidCatOp {
void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
p_cuts->min_vals_.Resize(num_columns_);
// Sync between workers.

View File

@@ -41,7 +41,7 @@ class SketchContainer {
bst_row_t num_rows_;
bst_feature_t num_columns_;
int32_t num_bins_;
int32_t device_;
DeviceOrd device_;
// Double buffer as neither prune nor merge can be performed inplace.
dh::device_vector<SketchEntry> entries_a_;
@@ -93,35 +93,32 @@ class SketchContainer {
* \param num_rows Total number of rows in known dataset (typically the rows in current worker).
* \param device GPU ID.
*/
SketchContainer(HostDeviceVector<FeatureType> const &feature_types,
int32_t max_bin, bst_feature_t num_columns,
bst_row_t num_rows, int32_t device)
: num_rows_{num_rows},
num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
CHECK_GE(device, 0);
// Initialize Sketches for this dmatrix
this->columns_ptr_.SetDevice(device_);
this->columns_ptr_.Resize(num_columns + 1);
this->columns_ptr_b_.SetDevice(device_);
this->columns_ptr_b_.Resize(num_columns + 1);
SketchContainer(HostDeviceVector<FeatureType> const& feature_types, int32_t max_bin,
bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device)
: num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
CHECK(device.IsCUDA());
// Initialize Sketches for this dmatrix
this->columns_ptr_.SetDevice(device_);
this->columns_ptr_.Resize(num_columns + 1);
this->columns_ptr_b_.SetDevice(device_);
this->columns_ptr_b_.Resize(num_columns + 1);
this->feature_types_.Resize(feature_types.Size());
this->feature_types_.Copy(feature_types);
// Pull to device.
this->feature_types_.SetDevice(device);
this->feature_types_.ConstDeviceSpan();
this->feature_types_.ConstHostSpan();
this->feature_types_.Resize(feature_types.Size());
this->feature_types_.Copy(feature_types);
// Pull to device.
this->feature_types_.SetDevice(device);
this->feature_types_.ConstDeviceSpan();
this->feature_types_.ConstHostSpan();
auto d_feature_types = feature_types_.ConstDeviceSpan();
has_categorical_ =
!d_feature_types.empty() &&
thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types),
common::IsCatOp{});
auto d_feature_types = feature_types_.ConstDeviceSpan();
has_categorical_ =
!d_feature_types.empty() &&
thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{});
timer_.Init(__func__);
}
timer_.Init(__func__);
}
/* \brief Return GPU ID for this container. */
int32_t DeviceIdx() const { return device_; }
[[nodiscard]] DeviceOrd DeviceIdx() const { return device_; }
/* \brief Whether the predictor matrix contains categorical features. */
bool HasCategorical() const { return has_categorical_; }
/* \brief Accumulate weights of duplicated entries in input. */
@@ -175,7 +172,7 @@ class SketchContainer {
template <typename KeyComp = thrust::equal_to<size_t>>
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
this->columns_ptr_.SetDevice(device_);
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
CHECK_EQ(d_column_scan.size(), num_columns_ + 1);

View File

@@ -15,8 +15,7 @@
#include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply
#include "xgboost/logging.h" // CHECK_EQ
namespace xgboost {
namespace common {
namespace xgboost::common {
void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) {
if (!ctx->IsCPU()) {
@@ -46,8 +45,8 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
}
void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) {
v.SetDevice(ctx->gpu_id);
out->SetDevice(ctx->gpu_id);
v.SetDevice(ctx->Device());
out->SetDevice(ctx->Device());
out->Reshape(1);
if (ctx->IsCPU()) {
@@ -62,5 +61,4 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
}
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -15,14 +15,12 @@
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // linalg::TensorView, UnravelIndex, Apply
namespace xgboost {
namespace common {
namespace cuda_impl {
namespace xgboost::common::cuda_impl {
void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
CHECK_GE(t.Shape(1), 1);
HostDeviceVector<std::size_t> segments(t.Shape(1) + 1, 0);
segments.SetDevice(ctx->gpu_id);
segments.SetDevice(ctx->Device());
auto d_segments = segments.DeviceSpan();
dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(),
[=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; });
@@ -31,7 +29,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
});
out->SetDevice(ctx->gpu_id);
out->SetDevice(ctx->Device());
out->Reshape(t.Shape(1));
if (weights.Empty()) {
common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it,
@@ -60,6 +58,4 @@ void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorV
dh::TemporaryArray<char> temp{bytes};
cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s);
}
} // namespace cuda_impl
} // namespace common
} // namespace xgboost
} // namespace xgboost::common::cuda_impl

View File

@@ -160,7 +160,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se
auto d_sorted_idx = dh::ToSpan(sorted_idx);
auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));
quantiles->SetDevice(ctx->gpu_id);
quantiles->SetDevice(ctx->Device());
quantiles->Resize(n_segments);
auto d_results = quantiles->DeviceSpan();
@@ -220,7 +220,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
scan_val, weights_cdf.begin());
auto n_segments = std::distance(seg_beg, seg_end) - 1;
quantiles->SetDevice(ctx->gpu_id);
quantiles->SetDevice(ctx->Device());
quantiles->Resize(n_segments);
auto d_results = quantiles->DeviceSpan();
auto d_weight_cdf = dh::ToSpan(weights_cdf);

View File

@@ -60,8 +60,8 @@ class Transform {
template <typename Functor>
struct Evaluator {
public:
Evaluator(Functor func, Range range, int32_t n_threads, int32_t device_idx)
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device_idx} {}
Evaluator(Functor func, Range range, int32_t n_threads, DeviceOrd device)
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device} {}
/*!
* \brief Evaluate the functor with input pointers to HostDeviceVector.
@@ -71,7 +71,7 @@ class Transform {
*/
template <typename... HDV>
void Eval(HDV... vectors) const {
bool on_device = device_ >= 0;
bool on_device = device_.IsCUDA();
if (on_device) {
LaunchCUDA(func_, vectors...);
@@ -116,11 +116,11 @@ class Transform {
}
// Recursive unpack for Shard.
template <typename T>
void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
void UnpackShard(DeviceOrd device, const HostDeviceVector<T> *vector) const {
vector->SetDevice(device);
}
template <typename Head, typename... Rest>
void UnpackShard(int device,
void UnpackShard(DeviceOrd device,
const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const {
_vector->SetDevice(device);
@@ -140,7 +140,7 @@ class Transform {
// granularity is used in data vector.
size_t shard_size = range_size;
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
const int kGrids =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
if (kGrids == 0) {
@@ -174,7 +174,7 @@ class Transform {
/*! \brief Range object specifying parallel threads index range. */
Range range_;
int32_t n_threads_;
int32_t device_;
DeviceOrd device_;
};
public:
@@ -192,8 +192,8 @@ class Transform {
*/
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range, int32_t n_threads,
int32_t device_idx) {
return Evaluator<Functor>{func, std::move(range), n_threads, device_idx};
DeviceOrd device) {
return Evaluator<Functor>{func, std::move(range), n_threads, device};
}
};