Add CUDA-to-HIP wrapper header

Your Name 2023-10-17 12:42:37 -07:00
parent ea19555474
commit ffbbc9c968
35 changed files with 60 additions and 509 deletions
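This change adds src/common/cuda_to_hip.h, which maps the CUDA runtime symbols used across the code base (cudaSetDevice, cudaMemcpyAsync, cudaStream_t, and so on) to their HIP counterparts via preprocessor defines whenever XGBOOST_USE_HIP is set. Call sites can then keep a single cuda* spelling, and the repeated #if defined(XGBOOST_USE_CUDA) / #elif defined(XGBOOST_USE_HIP) branches shown in the hunks below are removed, which accounts for most of the 509 deleted lines.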

View File

@ -59,21 +59,13 @@ void XGBBuildInfoDevice(Json *p_info) {
void XGBoostAPIGuard::SetGPUAttribute() {
// Not calling `safe_cuda` to avoid unnecessary exception handling overhead.
// If errors, do nothing, assuming running on CPU only machine.
#if defined(XGBOOST_USE_CUDA)
cudaGetDevice(&device_id_);
#elif defined(XGBOOST_USE_HIP)
hipGetDevice(&device_id_);
#endif
}
void XGBoostAPIGuard::RestoreGPUAttribute() {
// Not calling `safe_cuda` to avoid unnecessary exception handling overhead.
// If errors, do nothing, assuming running on CPU only machine.
#if defined(XGBOOST_USE_CUDA)
cudaSetDevice(device_id_);
#elif defined(XGBOOST_USE_HIP)
hipSetDevice(device_id_);
#endif
}
void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> const &grad,

View File

@ -26,22 +26,12 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
return;
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_ordinal_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_ordinal_));
#endif
auto size = count * GetTypeSize(data_type);
host_buffer_.resize(size);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
Allreduce(host_buffer_.data(), count, data_type, op);
dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(host_buffer_.data(), send_receive_buffer, size, hipMemcpyDefault));
AllReduce(host_buffer_.data(), count, data_type, op);
dh::safe_cuda(hipMemcpy(send_receive_buffer, host_buffer_.data(), size, hipMemcpyDefault));
#endif
}
void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override {
@ -49,7 +39,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
return;
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_ordinal_));
host_buffer_.resize(send_size * world_size_);
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
@ -57,15 +46,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
Allgather(host_buffer_.data(), host_buffer_.size());
dh::safe_cuda(
cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_ordinal_));
host_buffer_.resize(send_size * world_size_);
dh::safe_cuda(hipMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
hipMemcpyDefault));
Allgather(host_buffer_.data(), host_buffer_.size());
dh::safe_cuda(
hipMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), hipMemcpyDefault));
#endif
}
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
@ -74,11 +54,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
return;
}
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_ordinal_));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_ordinal_));
#endif
segments->clear();
segments->resize(world_size_, 0);
@ -92,25 +68,15 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
for (int32_t i = 0; i < world_size_; ++i) {
size_t as_bytes = segments->at(i);
if (i == rank_) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
hipMemcpyDefault));
#endif
}
Broadcast(host_buffer_.data() + offset, as_bytes, i);
offset += as_bytes;
}
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
hipMemcpyDefault));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
cudaMemcpyDefault));
#endif
}
void Synchronize() override {

View File

@ -185,13 +185,8 @@ void SegmentedArgSort(Context const *ctx, Span<U> values, Span<V> group_ptr,
sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(),
group_ptr.data() + 1, ctx->CUDACtx()->Stream());
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), hipMemcpyDeviceToDevice));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
#endif
}
/**

View File

@ -2,17 +2,14 @@
* Copyright 2018-2022 XGBoost contributors
*/
#include "common.h"
#include "cuda_to_hip.h"
namespace xgboost {
namespace common {
void SetDevice(std::int32_t device) {
if (device >= 0) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
}
}
@ -21,17 +18,9 @@ int AllVisibleGPUs() {
try {
// When compiled with CUDA but running on CPU only device,
// cudaGetDeviceCount will fail.
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDeviceCount(&n_visgpus));
#endif
} catch (const dmlc::Error &) {
#if defined(XGBOOST_USE_CUDA)
cudaGetLastError(); // reset error.
#elif defined(XGBOOST_USE_HIP)
hipGetLastError(); // reset error.
#endif
return 0;
}
return n_visgpus;

src/common/cuda_to_hip.h (new file, 57 lines)
View File

@ -0,0 +1,57 @@
/**
* Copyright 2017-2023 XGBoost contributors
*/
#pragma once
#if defined(XGBOOST_USE_HIP)
#define cudaSuccess hipSuccess
#define cudaGetLastError hipGetLastError
#define cudaStream_t hipStream_t
#define cudaStreamCreate hipStreamCreate
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamWaitEvent hipStreamWaitEvent
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamLegacy hipStreamLegacy
#define cudaEvent_t hipEvent_t
#define cudaEventCreate hipEventCreate
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDestroy hipEventDestroy
#define cudaGetDevice hipGetDevice
#define cudaSetDevice hipSetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaDeviceGetAttribute hipDeviceGetAttribute
#define cudaMallocHost hipMallocHost
#define cudaFreeHost hipFreeHost
#define cudaMalloc hipMalloc
#define cudaFree hipFree
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDefault hipMemcpyDefault
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyHostToHost hipMemcpyHostToHost
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemset hipMemset
#define cudaPointerAttributes hipPointerAttribute_t
#define cudaPointerGetAttributes hipPointerGetAttributes
#define cudaMemGetInfo hipMemGetInfo
#define cudaFuncSetAttribute hipFuncSetAttribute
#define cudaDevAttrMultiProcessorCount hipDeviceAttributeMultiprocessorCount
#define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor
#endif
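The snippet below is a minimal, self-contained sketch of the same name-mapping technique, with stub functions standing in for the real CUDA and HIP runtimes; none of these definitions come from the commit itself. It shows why a call site can keep the cuda* spelling and still reach the HIP backend when XGBOOST_USE_HIP is defined.

#include <cstdio>

// Stub backends standing in for the real runtimes (illustrative only).
inline int cudaSetDevice(int device) { std::printf("CUDA: device %d\n", device); return 0; }
inline int hipSetDevice(int device)  { std::printf("HIP:  device %d\n", device); return 0; }

// Compile with -DXGBOOST_USE_HIP to retarget the call sites, mirroring cuda_to_hip.h.
#if defined(XGBOOST_USE_HIP)
#define cudaSetDevice hipSetDevice
#endif

int main() {
  // A single cuda* spelling at the call site; the preprocessor picks the backend.
  cudaSetDevice(0);
  return 0;
}

Building this file once with and once without -DXGBOOST_USE_HIP prints the CUDA and HIP lines respectively, which is the same effect the wrapper header has on the guarded call sites deleted above.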

View File

@ -31,6 +31,8 @@
#include <tuple>
#include <vector>
#include "cuda_to_hip.h"
#include "../collective/communicator-inl.h"
#include "common.h"
#include "xgboost/global_config.h"

View File

@ -330,13 +330,8 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
} else {
// copy hessian as weight
CHECK_EQ(d_weight_out.size(), hessian.size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
hipMemcpyDefault));
#endif
}
return d_weight_out;
}

View File

@ -88,19 +88,10 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
template <std::uint32_t kBlockThreads, typename Kernel>
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
int n_mps = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
#endif
int n_blocks_per_mp = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, shared_mem));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, shared_mem));
#endif
std::uint32_t grid_size = n_blocks_per_mp * n_mps;
return grid_size;
}
@ -348,13 +339,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
size_t columns, size_t begin, size_t end,
SketchContainer *sketch_container) {
dh::XGBCachingDeviceAllocator<char> alloc;
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#endif
info.weights_.SetDevice(device);
auto weights = info.weights_.ConstDeviceSpan();

View File

@ -140,17 +140,10 @@ class HostDeviceVectorImpl {
SetDevice();
CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
ptr,
other->Size() * sizeof(T),
cudaMemcpyDeviceToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(this->DevicePointer() + ori_size,
ptr,
other->Size() * sizeof(T),
hipMemcpyDeviceToDevice));
#endif
}
}
@ -204,17 +197,10 @@ class HostDeviceVectorImpl {
if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
SetDevice();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(data_h_.data(),
data_d_->data().get(),
data_d_->size() * sizeof(T),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(data_h_.data(),
data_d_->data().get(),
data_d_->size() * sizeof(T),
hipMemcpyDeviceToHost));
#endif
}
void LazySyncDevice(GPUAccess access) {
@ -228,17 +214,10 @@ class HostDeviceVectorImpl {
LazyResizeDevice(data_h_.size());
SetDevice();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
data_h_.data(),
data_d_->size() * sizeof(T),
cudaMemcpyHostToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(),
data_h_.data(),
data_d_->size() * sizeof(T),
hipMemcpyHostToDevice));
#endif
gpu_access_ = access;
}
@ -264,13 +243,8 @@ class HostDeviceVectorImpl {
gpu_access_ = GPUAccess::kWrite;
SetDevice();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
data_d_->size() * sizeof(T), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
data_d_->size() * sizeof(T), hipMemcpyDefault));
#endif
}
}
@ -279,13 +253,8 @@ class HostDeviceVectorImpl {
gpu_access_ = GPUAccess::kWrite;
SetDevice();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
data_d_->size() * sizeof(T), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(), begin,
data_d_->size() * sizeof(T), hipMemcpyDefault));
#endif
}
void LazyResizeDevice(size_t new_size) {
@ -297,11 +266,7 @@ class HostDeviceVectorImpl {
void SetDevice() {
CHECK_GE(device_, 0);
if (cudaSetDeviceHandler == nullptr) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
} else {
(*cudaSetDeviceHandler)(device_);
}

View File

@ -12,17 +12,9 @@
namespace xgboost {
namespace linalg {
template <typename T, int32_t D, typename Fn>
#if defined(XGBOOST_USE_CUDA)
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
#elif defined(XGBOOST_USE_HIP)
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s = nullptr)
#endif
{
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(t.Device().ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(t.Device().ordinal));
#endif
static_assert(std::is_void<std::result_of_t<Fn(size_t, T&)>>::value,
"For function with return, use transform instead.");
if (t.Contiguous()) {
@ -37,11 +29,7 @@ void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s
}
template <typename T, int32_t D, typename Fn>
#if defined(XGBOOST_USE_HIP)
void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s = nullptr)
#elif defined(XGBOOST_USE_CUDA)
void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
#endif
{
if (t.Contiguous()) {
auto ptr = t.Values().data();

View File

@ -110,15 +110,9 @@ void CopyTo(Span<T> out, Span<U> src) {
CHECK_EQ(out.size(), src.size());
static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<U>>::value);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
out.size_bytes(),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(out.data(), src.data(),
out.size_bytes(),
hipMemcpyDefault));
#endif
}
// Compute the merge path.
@ -251,11 +245,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
CHECK_EQ(d_x.size() + d_y.size(), out.size());
CHECK_EQ(x_ptr.size(), out_ptr.size());
@ -354,11 +344,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr,
size_t total_cuts, Span<float> weights) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
Span<SketchEntry> out;
dh::device_vector<SketchEntry> cuts;
@ -418,11 +404,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
* pruning or merging. We preserve the first type and remove the second type.
*/
timer_.Start(__func__);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
dh::XGBCachingDeviceAllocator<char> alloc;
@ -479,11 +461,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
void SketchContainer::Prune(size_t to) {
timer_.Start(__func__);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
OffsetT to_total = 0;
auto& h_columns_ptr = columns_ptr_b_.HostVector();
@ -518,11 +496,7 @@ void SketchContainer::Prune(size_t to) {
void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
Span<SketchEntry const> that) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
timer_.Start(__func__);
if (this->Current().size() == 0) {
@ -558,11 +532,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
}
void SketchContainer::FixError() {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
auto in = dh::ToSpan(this->Current());
@ -588,11 +558,7 @@ void SketchContainer::FixError() {
}
void SketchContainer::AllReduce(bool is_column_split) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
auto world = collective::GetWorldSize();
if (world == 1 || is_column_split) {
return;
@ -674,11 +640,7 @@ struct InvalidCatOp {
void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
timer_.Start(__func__);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
p_cuts->min_vals_.Resize(num_columns_);
// Sync between workers.

View File

@ -176,11 +176,7 @@ class SketchContainer {
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
timer_.Start(__func__);
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#else
dh::safe_cuda(cudaSetDevice(device_));
#endif
this->columns_ptr_.SetDevice(device_);
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();

View File

@ -147,13 +147,8 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
auto const& h_group_ptr = info.group_ptr_;
group_ptr_.Resize(h_group_ptr.size());
auto d_group_ptr = group_ptr_.DeviceSpan();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
cudaMemcpyHostToDevice, cuctx->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
hipMemcpyHostToDevice, cuctx->Stream()));
#endif
}
auto d_group_ptr = DataGroupPtr(ctx);

View File

@ -61,13 +61,8 @@ std::size_t SegmentedTrapezoidThreads(xgboost::common::Span<U> group_ptr,
out_group_threads_ptr.size());
size_t total = 0;
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
sizeof(total), hipMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
sizeof(total), cudaMemcpyDeviceToHost));
#endif
return total;
}

View File

@ -28,11 +28,7 @@ void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
// default per-thread stream
default: {
dh::CUDAEvent e;
#if defined(XGBOOST_USE_CUDA)
e.Record(dh::CUDAStreamView{reinterpret_cast<cudaStream_t>(stream)});
#elif defined(XGBOOST_USE_HIP)
e.Record(dh::CUDAStreamView{reinterpret_cast<hipStream_t>(stream)});
#endif
dh::DefaultStream().Wait(e);
}
}

View File

@ -22,19 +22,11 @@ namespace cub = hipcub;
namespace xgboost {
namespace {
auto SetDeviceToPtr(void const* ptr) {
#if defined(XGBOOST_USE_CUDA)
cudaPointerAttributes attr;
dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
int32_t ptr_device = attr.device;
dh::safe_cuda(cudaSetDevice(ptr_device));
return ptr_device;
#elif defined(XGBOOST_USE_HIP) /* this is wrong, need to figure out */
hipPointerAttribute_t attr;
dh::safe_cuda(hipPointerGetAttributes(&attr, ptr));
int32_t ptr_device = attr.device;
dh::safe_cuda(hipSetDevice(ptr_device));
return ptr_device;
#endif
}
template <typename T, int32_t D>
@ -57,13 +49,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
// set data
data->Resize(array.n);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
cudaMemcpyDefault, ctx->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
hipMemcpyDefault, ctx->Stream()));
#endif
});
return;
}
@ -114,13 +101,8 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
});
bool non_dec = true;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(&non_dec, flag.data().get(), sizeof(bool),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(&non_dec, flag.data().get(), sizeof(bool),
hipMemcpyDeviceToHost));
#endif
CHECK(non_dec) << "`qid` must be sorted in increasing order along with data.";
size_t bytes = 0;

View File

@ -123,11 +123,7 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
device_idx_ = dh::CudaGetPointerDevice(first_column.data);
CHECK_NE(device_idx_, Context::kCpuId);
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx_));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx_));
#endif
for (auto& json_col : json_columns) {
auto column = ArrayInterface<1>(get<Object const>(json_col));
@ -216,18 +212,10 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
template <typename AdapterBatchT>
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
float missing) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#endif
IsValidFunctor is_valid(missing);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#endif
auto n_samples = batch.NumRows();
bst_feature_t n_features = batch.NumCols();

View File

@ -107,11 +107,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
n_rows(n_rows) {
monitor_.Init("ellpack_page");
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
monitor_.Start("InitCompressedData");
InitCompressedData(device);
@ -132,11 +128,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
: is_dense(dmat->IsDense()) {
monitor_.Init("ellpack_page");
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->gpu_id));
#endif
n_rows = dmat->Info().num_row_;
@ -330,11 +322,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
*this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
CopyDataToEllpack(batch, feature_types, this, device, missing);
@ -409,13 +397,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
dh::device_vector<size_t> row_ptr(page.row_ptr.size());
auto d_row_ptr = dh::ToSpan(row_ptr);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
hipMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
#endif
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
auto null = accessor.NullValue();
@ -570,27 +553,15 @@ void EllpackPageImpl::CreateHistIndices(int device,
if (row_batch.data.DeviceCanRead()) {
auto const& d_data = row_batch.data.ConstDeviceSpan();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
entries_d.data().get(), d_data.data() + ent_cnt_begin,
n_entries * sizeof(Entry), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
entries_d.data().get(), d_data.data() + ent_cnt_begin,
n_entries * sizeof(Entry), hipMemcpyDefault));
#endif
} else {
const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
entries_d.data().get(), data_vec.data() + ent_cnt_begin,
n_entries * sizeof(Entry), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
entries_d.data().get(), data_vec.data() + ent_cnt_begin,
n_entries * sizeof(Entry), hipMemcpyDefault));
#endif
}
const dim3 block3(32, 8, 1); // 256 threads

View File

@ -10,11 +10,7 @@
namespace xgboost::data {
void EllpackPageSource::Fetch() {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
if (!this->ReadCache()) {
if (count_ != 0 && !sync_) {
// source is initialized to be the 0th page during construction, so when count_ is 0

View File

@ -47,11 +47,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
int32_t current_device;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&current_device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&current_device));
#endif
auto get_device = [&]() -> int32_t {
std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
@ -68,11 +64,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
// ctx_.gpu_id = proxy->DeviceIdx();
CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(get_device()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(get_device()));
#endif
if (cols == 0) {
cols = num_cols();
@ -111,11 +103,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
auto n_features = cols;
CHECK_GE(n_features, 1) << "Data must have at least 1 column.";
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(get_device()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(get_device()));
#endif
if (!ref) {
HostDeviceVector<FeatureType> ft;
@ -156,11 +144,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
while (iter.Next()) {
init_page();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(get_device()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(get_device()));
#endif
auto rows = num_rows();
dh::device_vector<size_t> row_counts(rows + 1, 0);

View File

@ -25,11 +25,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
: adapter->DeviceIdx();
CHECK_GE(device, 0);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});

View File

@ -57,11 +57,7 @@ template <typename AdapterBatchT>
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
int device_idx, float missing) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx));
#endif
IsValidFunctor is_valid(missing);
// Count elements per row

View File

@ -60,11 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
return;
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
// The begin and end indices for the section of each column associated with
// this device
@ -92,17 +88,10 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
auto col = page[fidx];
auto seg = column_segments[fidx];
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(
data_.data().get() + row_ptr_[fidx],
col.data() + seg.first,
sizeof(Entry) * (seg.second - seg.first), cudaMemcpyHostToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(
data_.data().get() + row_ptr_[fidx],
col.data() + seg.first,
sizeof(Entry) * (seg.second - seg.first), hipMemcpyHostToDevice));
#endif
}
}
@ -182,11 +171,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
// This needs to be public because of the __device__ lambda.
GradientPair GetBiasGradient(int group_idx, int num_group) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
auto counting = thrust::make_counting_iterator(0ull);
auto f = [=] __device__(size_t idx) {
@ -211,11 +196,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
// This needs to be public because of the __device__ lambda.
GradientPair GetGradient(int group_idx, int num_group, int fidx) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
@ -249,17 +230,10 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
}
void UpdateGpair(const std::vector<GradientPair> &host_gpair) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
gpair_.data().get(),
host_gpair.data(),
gpair_.size() * sizeof(GradientPair), cudaMemcpyHostToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
gpair_.data().get(),
host_gpair.data(),
gpair_.size() * sizeof(GradientPair), hipMemcpyHostToDevice));
#endif
}
// training parameter

View File

@ -95,11 +95,7 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
Fn area_fn, std::shared_ptr<DeviceAUCCache> cache) {
auto labels = info.labels.View(device);
auto weights = info.weights_.ConstDeviceSpan();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif
CHECK_NE(labels.Size(), 0);
CHECK_EQ(labels.Size(), predts.size());
@ -352,11 +348,7 @@ template <bool scale, typename Fn>
double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
common::Span<uint32_t> d_class_ptr, size_t n_classes,
std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif
/**
* Sorted idx
*/
@ -934,11 +926,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
common::Span<float const> predts,
MetaInfo const &info,
std::shared_ptr<DeviceAUCCache> *p_cache) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->gpu_id));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
#endif
if (predts.empty()) {
return std::make_pair(0.0, static_cast<uint32_t>(0));

View File

@ -166,12 +166,7 @@ class MultiClassMetricsReduction {
labels.SetDevice(device_);
weights.SetDevice(device_);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
result = DeviceReduceMetrics(weights, labels, preds, n_class);
}
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)

View File

@ -159,11 +159,7 @@ class ElementWiseSurvivalMetricsReduction {
labels_upper_bound.SetDevice(ctx.gpu_id);
weights.SetDevice(ctx.gpu_id);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx.gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx.gpu_id));
#endif
result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds);
}

View File

@ -30,22 +30,13 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
dh::device_vector<size_t>* p_ridx, HostDeviceVector<size_t>* p_nptr,
HostDeviceVector<bst_node_t>* p_nidx, RegTree const& tree) {
// copy position to buffer
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
#endif
auto cuctx = ctx->CUDACtx();
size_t n_samples = position.size();
dh::device_vector<bst_node_t> sorted_position(position.size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(sorted_position.data().get(), position.data(),
position.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(sorted_position.data().get(), position.data(),
position.size_bytes(), hipMemcpyDeviceToDevice, cuctx->Stream()));
#endif
p_ridx->resize(position.size());
dh::Iota(dh::ToSpan(*p_ridx));
@ -98,17 +89,10 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
bst_node_t* h_first_unique =
reinterpret_cast<bst_node_t*>(pinned.subspan(sizeof(size_t), sizeof(bst_node_t)).data());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
cudaMemcpyDeviceToHost, copy_stream.View()));
dh::safe_cuda(cudaMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
cudaMemcpyDeviceToHost, copy_stream.View()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
hipMemcpyDeviceToHost, copy_stream.View()));
dh::safe_cuda(hipMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
hipMemcpyDeviceToHost, copy_stream.View()));
#endif
/**
* copy node index (leaf index)
@ -171,11 +155,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
#endif
dh::device_vector<size_t> ridx;
HostDeviceVector<size_t> nptr;
HostDeviceVector<bst_node_t> nidx;

View File

@ -297,11 +297,7 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const
linalg::Matrix<GradientPair>* out_gpair) {
// boilerplate
std::int32_t device_id = ctx->gpu_id;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_id));
#endif
auto n_groups = p_cache->Groups();
info.labels.SetDevice(device_id);
@ -385,11 +381,7 @@ void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
linalg::Matrix<GradientPair>* out_gpair) {
// boilerplate
auto device = ctx->Device();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif
auto const d_inv_IDCG = p_cache->InvIDCG(ctx);
auto const discount = p_cache->Discount(ctx);
@ -457,11 +449,7 @@ void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter,
linalg::VectorView<double> li, linalg::VectorView<double> lj,
linalg::Matrix<GradientPair>* out_gpair) {
auto device = ctx->Device();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif
info.labels.SetDevice(device);
predt.SetDevice(device);
@ -500,11 +488,7 @@ void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
linalg::VectorView<double> li, linalg::VectorView<double> lj,
linalg::Matrix<GradientPair>* out_gpair) {
auto device = ctx->Device();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif
info.labels.SetDevice(device);
predt.SetDevice(device);

View File

@ -341,11 +341,7 @@ class DeviceModel {
int num_group;
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(gpu_id));
#endif
// Copy decision trees to device
tree_segments = HostDeviceVector<size_t>({}, gpu_id);
@ -366,21 +362,12 @@ class DeviceModel {
auto& src_nodes = model.trees.at(tree_idx)->GetNodes();
auto& src_stats = model.trees.at(tree_idx)->GetStats();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
sizeof(RegTree::Node) * src_nodes.size(), cudaMemcpyDefault));
dh::safe_cuda(cudaMemcpyAsync(
d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
sizeof(RegTree::Node) * src_nodes.size(), hipMemcpyDefault));
dh::safe_cuda(hipMemcpyAsync(
d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
sizeof(RTreeNodeStat) * src_stats.size(), hipMemcpyDefault));
#endif
}
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id);
@ -504,11 +491,7 @@ void ExtractPaths(
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
int gpu_id) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(gpu_id));
#endif
auto& device_model = *model;
dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@ -584,15 +567,9 @@ void ExtractPaths(
thrust::max_element(thrust::device, max_elem_it,
max_elem_it + d_cat_node_segments.size()) -
max_elem_it;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(h_max_cat.data(),
d_cat_node_segments.data() + max_cat_it,
h_max_cat.size_bytes(), cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(h_max_cat.data(),
d_cat_node_segments.data() + max_cat_it,
h_max_cat.size_bytes(), hipMemcpyDeviceToHost));
#endif
max_cat = h_max_cat[0].size;
CHECK_GE(max_cat, 1);
path_categories->resize(max_cat * paths->size());
@ -786,11 +763,7 @@ class ColumnSplitHelper {
void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
bst_feature_t num_features, std::uint32_t num_group) const {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
dh::caching_device_vector<BitType> decision_storage{};
dh::caching_device_vector<BitType> missing_storage{};
@ -970,11 +943,7 @@ class GPUPredictor : public xgboost::Predictor {
~GPUPredictor() override {
if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
}
}
@ -1071,11 +1040,7 @@ class GPUPredictor : public xgboost::Predictor {
LOG(FATAL) << "Dart booster feature " << not_implemented;
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
out_contribs->SetDevice(ctx_->gpu_id);
if (tree_end == 0 || tree_end > model.trees.size()) {
@ -1135,11 +1100,7 @@ class GPUPredictor : public xgboost::Predictor {
LOG(FATAL) << "Dart booster feature " << not_implemented;
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
out_contribs->SetDevice(ctx_->gpu_id);
if (tree_end == 0 || tree_end > model.trees.size()) {
@ -1199,11 +1160,7 @@ class GPUPredictor : public xgboost::Predictor {
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
const gbm::GBTreeModel &model,
unsigned tree_end) const override {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
const MetaInfo& info = p_fmat->Info();

View File

@ -427,15 +427,9 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
for (auto idx : nidx) {
copy_stream_.View().Wait(event);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
#endif
}
}
@ -516,13 +510,8 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
dh::ToSpan(out_entries));
GPUExpandEntry root_entry;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
hipMemcpyDeviceToHost));
#endif
return root_entry;
}
} // namespace xgboost::tree

View File

@ -59,13 +59,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<Fea
split_cats_.resize(node_categorical_storage_size_);
h_split_cats_.resize(node_categorical_storage_size_);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(
cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(
hipMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));
#endif
cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time.
sort_input_.resize(cat_sorted_idx_.size());

View File

@ -266,11 +266,7 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
// decide whether to use shared memory
int device = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&device));
#endif
// opt into maximum shared memory for the kernel if necessary
#if defined(XGBOOST_USE_CUDA)
@ -303,17 +299,10 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
int num_groups = feature_groups.NumGroups();
int n_mps = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
int n_blocks_per_mp = 0;
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, smem_size));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
int n_blocks_per_mp = 0;
dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, smem_size));
#endif
// This gives the number of blocks to keep the device occupied
// Use this as the maximum number of blocks
@ -347,11 +336,7 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
runit(SharedMemHistKernel<false, kBlockThreads, kItemsPerThread>);
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetLastError());
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetLastError());
#endif
}
} // namespace tree

View File

@ -16,22 +16,14 @@ namespace tree {
RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
: device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx_));
#endif
ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
}
RowPartitioner::~RowPartitioner() {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx_));
#endif
}
common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {

View File

@ -287,15 +287,9 @@ class RowPartitioner {
total_rows += ridx_segments_.at(nidx.at(i)).segment.Size();
}
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
hipMemcpyDefault));
#else
dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
cudaMemcpyDefault));
#endif
// Temporary arrays
auto h_counts = pinned_.GetSpan<bst_uint>(nidx.size(), 0);
@ -305,13 +299,8 @@ class RowPartitioner {
SortPositionBatch<RowIndexT, UpdatePositionOpT, OpDataT>(
dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts),
total_rows, op, &tmp_);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
hipMemcpyDefault));
#endif
// TODO(Rory): this synchronisation hurts performance a lot
// Future optimisation should find a way to skip this
dh::DefaultStream().Sync();
@ -348,15 +337,9 @@ class RowPartitioner {
void FinalisePosition(common::Span<bst_node_t> d_out_position, FinalisePositionOpT op) {
dh::TemporaryArray<NodePositionInfo> d_node_info_storage(ridx_segments_.size());
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
sizeof(NodePositionInfo) * ridx_segments_.size(),
hipMemcpyDefault));
#else
dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
sizeof(NodePositionInfo) * ridx_segments_.size(),
cudaMemcpyDefault));
#endif
constexpr int kBlockSize = 512;
const int kItemsThread = 8;

View File

@ -232,26 +232,16 @@ struct GPUHistMakerDevice {
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
this->interaction_constraints.Reset();
if (d_gpair.size() != dh_gpair->Size()) {
d_gpair.resize(dh_gpair->Size());
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair),
cudaMemcpyDeviceToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair),
hipMemcpyDeviceToDevice));
#endif
auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
page = sample.page;
gpair = sample.gpair;
@ -338,28 +328,15 @@ struct GPUHistMakerDevice {
max_active_features =
std::max(max_active_features, static_cast<bst_feature_t>(input.feature_set.size()));
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs), hipMemcpyDefault));
#endif
this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
shared_inputs, dh::ToSpan(entries));
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
hipMemcpyDeviceToHost));
#endif
dh::DefaultStream().Sync();
}
@ -412,13 +389,8 @@ struct GPUHistMakerDevice {
BitVector missing_bits{dh::ToSpan(missing_storage)};
dh::TemporaryArray<NodeSplitData> split_data_storage(num_candidates);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(split_data_storage.data().get(), split_data.data(),
num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(split_data_storage.data().get(), split_data.data(),
num_candidates * sizeof(NodeSplitData), hipMemcpyDefault));
#endif
auto d_split_data = dh::ToSpan(split_data_storage);
dh::LaunchN(d_matrix.n_rows, [=] __device__(std::size_t ridx) mutable {
@ -527,15 +499,9 @@ struct GPUHistMakerDevice {
dh::TemporaryArray<RegTree::Node> d_nodes(p_tree->GetNodes().size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
d_nodes.size() * sizeof(RegTree::Node),
cudaMemcpyHostToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
d_nodes.size() * sizeof(RegTree::Node),
hipMemcpyHostToDevice));
#endif
auto const& h_split_types = p_tree->GetSplitTypes();
auto const& categories = p_tree->GetSplitCategories();
@ -606,15 +572,9 @@ struct GPUHistMakerDevice {
auto s_position = p_out_position->ConstDeviceSpan();
positions.resize(s_position.size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(),
s_position.size_bytes(), cudaMemcpyDeviceToDevice,
ctx_->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(positions.data().get(), s_position.data(),
s_position.size_bytes(), hipMemcpyDeviceToDevice,
ctx_->CUDACtx()->Stream()));
#endif
dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
bst_node_t position = d_out_position[idx];
@ -632,26 +592,16 @@ struct GPUHistMakerDevice {
CHECK(out_preds_d.Device().IsCUDA());
CHECK_EQ(out_preds_d.Device().ordinal, ctx_->Ordinal());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->Ordinal()));
#endif
auto d_position = dh::ToSpan(positions);
CHECK_EQ(out_preds_d.Size(), d_position.size());
auto const& h_nodes = p_tree->GetNodes();
dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice,
ctx_->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(nodes.data().get(), h_nodes.data(),
h_nodes.size() * sizeof(RegTree::Node), hipMemcpyHostToDevice,
ctx_->CUDACtx()->Stream()));
#endif
auto d_nodes = dh::ToSpan(nodes);
CHECK_EQ(out_preds_d.Shape(1), 1);
@ -904,11 +854,7 @@ class GPUHistMaker : public TreeUpdater {
++t_idx;
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetLastError());
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetLastError());
#endif
} catch (const std::exception& e) {
LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
}
@ -925,11 +871,7 @@ class GPUHistMaker : public TreeUpdater {
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
info_->feature_types.SetDevice(ctx_->gpu_id);
maker = std::make_unique<GPUHistMakerDevice>(