add cuda to hip wrapper

Your Name 2023-10-17 12:42:37 -07:00
parent ea19555474
commit ffbbc9c968
35 changed files with 60 additions and 509 deletions
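The change applies one pattern throughout: per-call-site #if defined(XGBOOST_USE_CUDA) / #elif defined(XGBOOST_USE_HIP) branches are dropped in favour of the CUDA spelling alone, and the new src/common/cuda_to_hip.h header remaps those CUDA names to their HIP counterparts when XGBOOST_USE_HIP is defined. A minimal before/after sketch of the pattern, drawn from the hunks below (not a new API):

Before, every call site carried both branches:

#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
  dh::safe_cuda(hipSetDevice(device));
#endif

After, a single spelling compiles for both backends, because cuda_to_hip.h defines cudaSetDevice as hipSetDevice under XGBOOST_USE_HIP:

  dh::safe_cuda(cudaSetDevice(device));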

View File

@@ -59,21 +59,13 @@ void XGBBuildInfoDevice(Json *p_info) {
void XGBoostAPIGuard::SetGPUAttribute() {
  // Not calling `safe_cuda` to avoid unnecessary exception handling overhead.
  // If errors, do nothing, assuming running on CPU only machine.
-#if defined(XGBOOST_USE_CUDA)
  cudaGetDevice(&device_id_);
-#elif defined(XGBOOST_USE_HIP)
-  hipGetDevice(&device_id_);
-#endif
}

void XGBoostAPIGuard::RestoreGPUAttribute() {
  // Not calling `safe_cuda` to avoid unnecessary exception handling overhead.
  // If errors, do nothing, assuming running on CPU only machine.
-#if defined(XGBOOST_USE_CUDA)
  cudaSetDevice(device_id_);
-#elif defined(XGBOOST_USE_HIP)
-  hipSetDevice(device_id_);
-#endif
}

void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> const &grad,

View File

@@ -26,22 +26,12 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
      return;
    }
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_ordinal_));
-#endif
    auto size = count * GetTypeSize(data_type);
    host_buffer_.resize(size);
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
    Allreduce(host_buffer_.data(), count, data_type, op);
    dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(host_buffer_.data(), send_receive_buffer, size, hipMemcpyDefault));
-    AllReduce(host_buffer_.data(), count, data_type, op);
-    dh::safe_cuda(hipMemcpy(send_receive_buffer, host_buffer_.data(), size, hipMemcpyDefault));
-#endif
  }

  void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override {
@@ -49,7 +39,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
      return;
    }
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
    host_buffer_.resize(send_size * world_size_);
    dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
@@ -57,15 +46,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
    Allgather(host_buffer_.data(), host_buffer_.size());
    dh::safe_cuda(
        cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_ordinal_));
-    host_buffer_.resize(send_size * world_size_);
-    dh::safe_cuda(hipMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
-                            hipMemcpyDefault));
-    Allgather(host_buffer_.data(), host_buffer_.size());
-    dh::safe_cuda(
-        hipMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), hipMemcpyDefault));
-#endif
  }

  void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
@@ -74,11 +54,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
      return;
    }
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_ordinal_));
-#elif defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
-#endif
    segments->clear();
    segments->resize(world_size_, 0);
@@ -92,25 +68,15 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
    for (int32_t i = 0; i < world_size_; ++i) {
      size_t as_bytes = segments->at(i);
      if (i == rank_) {
-#if defined(XGBOOST_USE_CUDA)
        dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
                                 cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-        dh::safe_cuda(hipMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
-                                hipMemcpyDefault));
-#endif
      }
      Broadcast(host_buffer_.data() + offset, as_bytes, i);
      offset += as_bytes;
    }
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
-                            hipMemcpyDefault));
-#elif defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
                             cudaMemcpyDefault));
-#endif
  }

  void Synchronize() override {

View File

@@ -185,13 +185,8 @@ void SegmentedArgSort(Context const *ctx, Span<U> values, Span<V> group_ptr,
      sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(),
      group_ptr.data() + 1, ctx->CUDACtx()->Stream());
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
-                               sorted_idx.size_bytes(), hipMemcpyDeviceToDevice));
-#elif defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
                                sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
-#endif
}

/**

View File

@@ -2,17 +2,14 @@
 * Copyright 2018-2022 XGBoost contributors
 */
#include "common.h"
+#include "cuda_to_hip.h"

namespace xgboost {
namespace common {

void SetDevice(std::int32_t device) {
  if (device >= 0) {
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device));
-#endif
  }
}
@@ -21,17 +18,9 @@ int AllVisibleGPUs() {
  try {
    // When compiled with CUDA but running on CPU only device,
    // cudaGetDeviceCount will fail.
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipGetDeviceCount(&n_visgpus));
-#endif
  } catch (const dmlc::Error &) {
-#if defined(XGBOOST_USE_CUDA)
    cudaGetLastError();  // reset error.
-#elif defined(XGBOOST_USE_HIP)
-    hipGetLastError();  // reset error.
-#endif
    return 0;
  }
  return n_visgpus;

src/common/cuda_to_hip.h (new file, 57 lines)
View File

@@ -0,0 +1,57 @@
/**
* Copyright 2017-2023 XGBoost contributors
*/
#pragma once
#if defined(XGBOOST_USE_HIP)
#define cudaSuccess hipSuccess
#define cudaGetLastError hipGetLastError
#define cudaStream_t hipStream_t
#define cudaStreamCreate hipStreamCreate
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamWaitEvent hipStreamWaitEvent
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamLegacy hipStreamLegacy
#define cudaEvent_t hipEvent_t
#define cudaEventCreate hipEventCreate
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDestroy hipEventDestroy
#define cudaGetDevice hipGetDevice
#define cudaSetDevice hipSetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaDeviceGetAttribute hipDeviceGetAttribute
#define cudaMallocHost hipMallocHost
#define cudaFreeHost hipFreeHost
#define cudaMalloc hipMalloc
#define cudaFree hipFree
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDefault hipMemcpyDefault
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyHostToHost hipMemcpyHostToHost
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemset hipMemset
#define cudaPointerAttributes hipPointerAttribute_t
#define cudaPointerGetAttributes hipPointerGetAttributes
#define cudaMemGetInfo hipMemGetInfo
#define cudaFuncSetAttribute hipFuncSetAttribute
#define cudaDevAttrMultiProcessorCount hipDeviceAttributeMultiprocessorCount
#define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor
#endif
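
As a rough standalone illustration of how the wrapper is meant to be consumed (a hypothetical sketch, not code from this commit): once XGBOOST_USE_HIP is defined, including cuda_to_hip.h turns CUDA-spelled runtime calls into the corresponding HIP runtime calls at preprocessing time, so a translation unit built with hipcc only needs the HIP runtime header.

// Hypothetical sketch, not part of the commit; compile with hipcc.
#define XGBOOST_USE_HIP 1        // normally passed by the build system
#include <hip/hip_runtime.h>     // declares hipSetDevice, hipDeviceSynchronize, ...
#include "cuda_to_hip.h"         // remaps the cuda* names below to their hip* equivalents

int main() {
  int n_devices = 0;
  // cudaGetDeviceCount / cudaSuccess preprocess to hipGetDeviceCount / hipSuccess.
  if (cudaGetDeviceCount(&n_devices) != cudaSuccess || n_devices == 0) {
    return 0;  // no visible GPU, nothing to do
  }
  cudaSetDevice(0);          // becomes hipSetDevice(0)
  cudaDeviceSynchronize();   // becomes hipDeviceSynchronize()
  return 0;
}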

View File

@@ -31,6 +31,8 @@
#include <tuple>
#include <vector>

+#include "cuda_to_hip.h"
#include "../collective/communicator-inl.h"
#include "common.h"
#include "xgboost/global_config.h"

View File

@@ -330,13 +330,8 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
  } else {
    // copy hessian as weight
    CHECK_EQ(d_weight_out.size(), hessian.size());
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
                                  cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
-                                 hipMemcpyDefault));
-#endif
  }
  return d_weight_out;
}

View File

@@ -88,19 +88,10 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
template <std::uint32_t kBlockThreads, typename Kernel>
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
  int n_mps = 0;
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
-#endif
  int n_blocks_per_mp = 0;
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
                                                              kBlockThreads, shared_mem));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
-                                                             kBlockThreads, shared_mem));
-#endif
  std::uint32_t grid_size = n_blocks_per_mp * n_mps;
  return grid_size;
}
@@ -348,13 +339,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
                                  size_t columns, size_t begin, size_t end,
                                  SketchContainer *sketch_container) {
  dh::XGBCachingDeviceAllocator<char> alloc;
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#elif defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device));
-#endif
  info.weights_.SetDevice(device);
  auto weights = info.weights_.ConstDeviceSpan();

View File

@@ -140,17 +140,10 @@ class HostDeviceVectorImpl {
      SetDevice();
      CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
-#if defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
                                    ptr,
                                    other->Size() * sizeof(T),
                                    cudaMemcpyDeviceToDevice));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(this->DevicePointer() + ori_size,
-                                   ptr,
-                                   other->Size() * sizeof(T),
-                                   hipMemcpyDeviceToDevice));
-#endif
    }
  }
@@ -204,17 +197,10 @@ class HostDeviceVectorImpl {
    if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
    SetDevice();
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpy(data_h_.data(),
                             data_d_->data().get(),
                             data_d_->size() * sizeof(T),
                             cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(data_h_.data(),
-                            data_d_->data().get(),
-                            data_d_->size() * sizeof(T),
-                            hipMemcpyDeviceToHost));
-#endif
  }

  void LazySyncDevice(GPUAccess access) {
@@ -228,17 +214,10 @@ class HostDeviceVectorImpl {
    LazyResizeDevice(data_h_.size());
    SetDevice();
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
                                  data_h_.data(),
                                  data_d_->size() * sizeof(T),
                                  cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(),
-                                 data_h_.data(),
-                                 data_d_->size() * sizeof(T),
-                                 hipMemcpyHostToDevice));
-#endif
    gpu_access_ = access;
  }
@@ -264,13 +243,8 @@ class HostDeviceVectorImpl {
    gpu_access_ = GPUAccess::kWrite;
    SetDevice();
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
                                  data_d_->size() * sizeof(T), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
-                                 data_d_->size() * sizeof(T), hipMemcpyDefault));
-#endif
  }
}
@@ -279,13 +253,8 @@ class HostDeviceVectorImpl {
    gpu_access_ = GPUAccess::kWrite;
    SetDevice();
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
                                  data_d_->size() * sizeof(T), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(), begin,
-                                 data_d_->size() * sizeof(T), hipMemcpyDefault));
-#endif
  }

  void LazyResizeDevice(size_t new_size) {
@@ -297,11 +266,7 @@ class HostDeviceVectorImpl {
  void SetDevice() {
    CHECK_GE(device_, 0);
    if (cudaSetDeviceHandler == nullptr) {
-#if defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(device_));
-#endif
    } else {
      (*cudaSetDeviceHandler)(device_);
    }

View File

@@ -12,17 +12,9 @@
namespace xgboost {
namespace linalg {

template <typename T, int32_t D, typename Fn>
-#if defined(XGBOOST_USE_CUDA)
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
-#elif defined(XGBOOST_USE_HIP)
-void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s = nullptr)
-#endif
{
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(t.Device().ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(t.Device().ordinal));
-#endif
  static_assert(std::is_void<std::result_of_t<Fn(size_t, T&)>>::value,
                "For function with return, use transform instead.");
  if (t.Contiguous()) {
@@ -37,11 +29,7 @@ void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s
}

template <typename T, int32_t D, typename Fn>
-#if defined(XGBOOST_USE_HIP)
-void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s = nullptr)
-#elif defined(XGBOOST_USE_CUDA)
void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
-#endif
{
  if (t.Contiguous()) {
    auto ptr = t.Values().data();
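
A note on the hunk above: because cuda_to_hip.h also renames types such as cudaStream_t and cudaEvent_t, declarations that previously needed two #if branches for the signature alone can now be written once. A hypothetical sketch of that idea (not code from this commit; the appropriate CUDA or HIP runtime header is assumed to be included):

#include "cuda_to_hip.h"  // under XGBOOST_USE_HIP, cudaStream_t is hipStream_t

// One signature serves both backends; nullptr means the default (null) stream.
void SyncOnStreamSketch(cudaStream_t s = nullptr) {
  cudaStreamSynchronize(s);  // hipStreamSynchronize(s) under HIP
}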

View File

@@ -110,15 +110,9 @@ void CopyTo(Span<T> out, Span<U> src) {
  CHECK_EQ(out.size(), src.size());
  static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
                                out.size_bytes(),
                                cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(out.data(), src.data(),
-                               out.size_bytes(),
-                               hipMemcpyDefault));
-#endif
}

// Compute the merge path.
@@ -251,11 +245,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
               Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
               Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif
  CHECK_EQ(d_x.size() + d_y.size(), out.size());
  CHECK_EQ(x_ptr.size(), out_ptr.size());
@@ -354,11 +344,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
                           common::Span<OffsetT> cuts_ptr,
                           size_t total_cuts, Span<float> weights) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  Span<SketchEntry> out;
  dh::device_vector<SketchEntry> cuts;
@@ -418,11 +404,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
   * pruning or merging. We preserve the first type and remove the second type.
   */
  timer_.Start(__func__);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
  dh::XGBCachingDeviceAllocator<char> alloc;
@@ -479,11 +461,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
void SketchContainer::Prune(size_t to) {
  timer_.Start(__func__);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  OffsetT to_total = 0;
  auto& h_columns_ptr = columns_ptr_b_.HostVector();
@@ -518,11 +496,7 @@ void SketchContainer::Prune(size_t to) {
void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
                            Span<SketchEntry const> that) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  timer_.Start(__func__);
  if (this->Current().size() == 0) {
@@ -558,11 +532,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
}

void SketchContainer::FixError() {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
  auto in = dh::ToSpan(this->Current());
@@ -588,11 +558,7 @@ void SketchContainer::FixError() {
}

void SketchContainer::AllReduce(bool is_column_split) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  auto world = collective::GetWorldSize();
  if (world == 1 || is_column_split) {
    return;
@@ -674,11 +640,7 @@ struct InvalidCatOp {
void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
  timer_.Start(__func__);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  p_cuts->min_vals_.Resize(num_columns_);
  // Sync between workers.

View File

@@ -176,11 +176,7 @@ class SketchContainer {
  size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
    timer_.Start(__func__);
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_));
-#else
    dh::safe_cuda(cudaSetDevice(device_));
-#endif
    this->columns_ptr_.SetDevice(device_);
    Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();

View File

@@ -147,13 +147,8 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
    auto const& h_group_ptr = info.group_ptr_;
    group_ptr_.Resize(h_group_ptr.size());
    auto d_group_ptr = group_ptr_.DeviceSpan();
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
                                  cudaMemcpyHostToDevice, cuctx->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
-                                 hipMemcpyHostToDevice, cuctx->Stream()));
-#endif
  }

  auto d_group_ptr = DataGroupPtr(ctx);

View File

@@ -61,13 +61,8 @@ std::size_t SegmentedTrapezoidThreads(xgboost::common::Span<U> group_ptr,
                       out_group_threads_ptr.size());

  size_t total = 0;
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
-                          sizeof(total), hipMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
                           sizeof(total), cudaMemcpyDeviceToHost));
-#endif
  return total;
}

View File

@@ -28,11 +28,7 @@ void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
    // default per-thread stream
    default: {
      dh::CUDAEvent e;
-#if defined(XGBOOST_USE_CUDA)
      e.Record(dh::CUDAStreamView{reinterpret_cast<cudaStream_t>(stream)});
-#elif defined(XGBOOST_USE_HIP)
-      e.Record(dh::CUDAStreamView{reinterpret_cast<hipStream_t>(stream)});
-#endif
      dh::DefaultStream().Wait(e);
    }
  }

View File

@@ -22,19 +22,11 @@ namespace cub = hipcub;
namespace xgboost {
namespace {
auto SetDeviceToPtr(void const* ptr) {
-#if defined(XGBOOST_USE_CUDA)
  cudaPointerAttributes attr;
  dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
  int32_t ptr_device = attr.device;
  dh::safe_cuda(cudaSetDevice(ptr_device));
  return ptr_device;
-#elif defined(XGBOOST_USE_HIP) /* this is wrong, need to figure out */
-  hipPointerAttribute_t attr;
-  dh::safe_cuda(hipPointerGetAttributes(&attr, ptr));
-  int32_t ptr_device = attr.device;
-  dh::safe_cuda(hipSetDevice(ptr_device));
-  return ptr_device;
-#endif
}

template <typename T, int32_t D>
@@ -57,13 +49,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
    // set data
    data->Resize(array.n);
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
                                  cudaMemcpyDefault, ctx->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
-                                 hipMemcpyDefault, ctx->Stream()));
-#endif
  });
  return;
}
@@ -114,13 +101,8 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
  });
  bool non_dec = true;
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpy(&non_dec, flag.data().get(), sizeof(bool),
                           cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpy(&non_dec, flag.data().get(), sizeof(bool),
-                          hipMemcpyDeviceToHost));
-#endif
  CHECK(non_dec) << "`qid` must be sorted in increasing order along with data.";

  size_t bytes = 0;

View File

@@ -123,11 +123,7 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
    device_idx_ = dh::CudaGetPointerDevice(first_column.data);
    CHECK_NE(device_idx_, Context::kCpuId);
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_idx_));
-#elif defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(device_idx_));
-#endif
    for (auto& json_col : json_columns) {
      auto column = ArrayInterface<1>(get<Object const>(json_col));
@@ -216,18 +212,10 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
template <typename AdapterBatchT>
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
                         float missing) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_idx));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_idx));
-#endif
  IsValidFunctor is_valid(missing);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemsetAsync(offset.data(), '\0', offset.size_bytes()));
-#endif
  auto n_samples = batch.NumRows();
  bst_feature_t n_features = batch.NumCols();

View File

@@ -107,11 +107,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
      n_rows(n_rows) {
  monitor_.Init("ellpack_page");
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif

  monitor_.Start("InitCompressedData");
  InitCompressedData(device);
@@ -132,11 +128,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
    : is_dense(dmat->IsDense()) {
  monitor_.Init("ellpack_page");
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->gpu_id));
-#endif

  n_rows = dmat->Info().num_row_;
@@ -330,11 +322,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
                                 common::Span<size_t> row_counts_span,
                                 common::Span<FeatureType const> feature_types, size_t row_stride,
                                 size_t n_rows, common::HistogramCuts const& cuts) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif
  *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
  CopyDataToEllpack(batch, feature_types, this, device, missing);
@@ -409,13 +397,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
  common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();

  dh::device_vector<size_t> row_ptr(page.row_ptr.size());
  auto d_row_ptr = dh::ToSpan(row_ptr);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
                                cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
-                               hipMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
-#endif

  auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
  auto null = accessor.NullValue();
@@ -570,27 +553,15 @@ void EllpackPageImpl::CreateHistIndices(int device,
    if (row_batch.data.DeviceCanRead()) {
      auto const& d_data = row_batch.data.ConstDeviceSpan();
-#if defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaMemcpyAsync(
          entries_d.data().get(), d_data.data() + ent_cnt_begin,
          n_entries * sizeof(Entry), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(
-          entries_d.data().get(), d_data.data() + ent_cnt_begin,
-          n_entries * sizeof(Entry), hipMemcpyDefault));
-#endif
    } else {
      const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
-#if defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaMemcpyAsync(
          entries_d.data().get(), data_vec.data() + ent_cnt_begin,
          n_entries * sizeof(Entry), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(
-          entries_d.data().get(), data_vec.data() + ent_cnt_begin,
-          n_entries * sizeof(Entry), hipMemcpyDefault));
-#endif
    }
    const dim3 block3(32, 8, 1);  // 256 threads

View File

@@ -10,11 +10,7 @@
namespace xgboost::data {
void EllpackPageSource::Fetch() {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  if (!this->ReadCache()) {
    if (count_ != 0 && !sync_) {
      // source is initialized to be the 0th page during construction, so when count_ is 0

View File

@@ -47,11 +47,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  int32_t current_device;
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaGetDevice(&current_device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipGetDevice(&current_device));
-#endif
  auto get_device = [&]() -> int32_t {
    std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
@@ -68,11 +64,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  // ctx_.gpu_id = proxy->DeviceIdx();
  CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(get_device()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(get_device()));
-#endif
  if (cols == 0) {
    cols = num_cols();
@@ -111,11 +103,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  auto n_features = cols;
  CHECK_GE(n_features, 1) << "Data must has at least 1 column.";
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(get_device()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(get_device()));
-#endif
  if (!ref) {
    HostDeviceVector<FeatureType> ft;
@@ -156,11 +144,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  while (iter.Next()) {
    init_page();
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(get_device()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(get_device()));
-#endif
    auto rows = num_rows();
    dh::device_vector<size_t> row_counts(rows + 1, 0);

View File

@@ -25,11 +25,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
                    : adapter->DeviceIdx();
  CHECK_GE(device, 0);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif

  Context ctx;
  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});

View File

@@ -57,11 +57,7 @@ template <typename AdapterBatchT>
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
                     int device_idx, float missing) {
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_idx));
-#elif defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_idx));
-#endif
  IsValidFunctor is_valid(missing);
  // Count elements per row

View File

@@ -60,11 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
      return;
    }
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif

    // The begin and end indices for the section of each column associated with
    // this device
@@ -92,17 +88,10 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
        auto col = page[fidx];
        auto seg = column_segments[fidx];
-#if defined(XGBOOST_USE_CUDA)
        dh::safe_cuda(cudaMemcpy(
            data_.data().get() + row_ptr_[fidx],
            col.data() + seg.first,
            sizeof(Entry) * (seg.second - seg.first), cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-        dh::safe_cuda(hipMemcpy(
-            data_.data().get() + row_ptr_[fidx],
-            col.data() + seg.first,
-            sizeof(Entry) * (seg.second - seg.first), hipMemcpyHostToDevice));
-#endif
      }
    }
@@ -182,11 +171,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
  // This needs to be public because of the __device__ lambda.
  GradientPair GetBiasGradient(int group_idx, int num_group) {
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
    auto counting = thrust::make_counting_iterator(0ull);
    auto f = [=] __device__(size_t idx) {
@@ -211,11 +196,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
  // This needs to be public because of the __device__ lambda.
  GradientPair GetGradient(int group_idx, int num_group, int fidx) {
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
    common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
    size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
@@ -249,17 +230,10 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
  }

  void UpdateGpair(const std::vector<GradientPair> &host_gpair) {
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(
        gpair_.data().get(),
        host_gpair.data(),
        gpair_.size() * sizeof(GradientPair), cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(
-        gpair_.data().get(),
-        host_gpair.data(),
-        gpair_.size() * sizeof(GradientPair), hipMemcpyHostToDevice));
-#endif
  }

  // training parameter

View File

@@ -95,11 +95,7 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
             Fn area_fn, std::shared_ptr<DeviceAUCCache> cache) {
  auto labels = info.labels.View(device);
  auto weights = info.weights_.ConstDeviceSpan();
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif

  CHECK_NE(labels.Size(), 0);
  CHECK_EQ(labels.Size(), predts.size());
@@ -352,11 +348,7 @@ template <bool scale, typename Fn>
double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
                           common::Span<uint32_t> d_class_ptr, size_t n_classes,
                           std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif
  /**
   * Sorted idx
   */
@@ -934,11 +926,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
                                                 common::Span<float const> predts,
                                                 MetaInfo const &info,
                                                 std::shared_ptr<DeviceAUCCache> *p_cache) {
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->gpu_id));
-#elif defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
-#endif
  if (predts.empty()) {
    return std::make_pair(0.0, static_cast<uint32_t>(0));

View File

@@ -166,12 +166,7 @@ class MultiClassMetricsReduction {
      labels.SetDevice(device_);
      weights.SetDevice(device_);

-#if defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(device_));
-#endif
      result = DeviceReduceMetrics(weights, labels, preds, n_class);
    }
#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)

View File

@@ -159,11 +159,7 @@ class ElementWiseSurvivalMetricsReduction {
      labels_upper_bound.SetDevice(ctx.gpu_id);
      weights.SetDevice(ctx.gpu_id);

-#if defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaSetDevice(ctx.gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(ctx.gpu_id));
-#endif
      result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds);
    }

View File

@@ -30,22 +30,13 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
                          dh::device_vector<size_t>* p_ridx, HostDeviceVector<size_t>* p_nptr,
                          HostDeviceVector<bst_node_t>* p_nidx, RegTree const& tree) {
  // copy position to buffer
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
-#endif
  auto cuctx = ctx->CUDACtx();
  size_t n_samples = position.size();
  dh::device_vector<bst_node_t> sorted_position(position.size());
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpyAsync(sorted_position.data().get(), position.data(),
                                position.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(sorted_position.data().get(), position.data(),
-                               position.size_bytes(), hipMemcpyDeviceToDevice, cuctx->Stream()));
-#endif

  p_ridx->resize(position.size());
  dh::Iota(dh::ToSpan(*p_ridx));
@@ -98,17 +89,10 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
  bst_node_t* h_first_unique =
      reinterpret_cast<bst_node_t*>(pinned.subspan(sizeof(size_t), sizeof(bst_node_t)).data());
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
                                cudaMemcpyDeviceToHost, copy_stream.View()));
  dh::safe_cuda(cudaMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
                                cudaMemcpyDeviceToHost, copy_stream.View()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
-                               hipMemcpyDeviceToHost, copy_stream.View()));
-  dh::safe_cuda(hipMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
-                               hipMemcpyDeviceToHost, copy_stream.View()));
-#endif

  /**
   * copy node index (leaf index)
@@ -171,11 +155,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
                          std::int32_t group_idx, MetaInfo const& info, float learning_rate,
                          HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
-#endif
  dh::device_vector<size_t> ridx;
  HostDeviceVector<size_t> nptr;
  HostDeviceVector<bst_node_t> nidx;

View File

@@ -297,11 +297,7 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const
            linalg::Matrix<GradientPair>* out_gpair) {
  // boilerplate
  std::int32_t device_id = ctx->gpu_id;
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_id));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_id));
-#endif

  auto n_groups = p_cache->Groups();
  info.labels.SetDevice(device_id);
@@ -385,11 +381,7 @@ void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
                               linalg::Matrix<GradientPair>* out_gpair) {
  // boilerplate
  auto device = ctx->Device();
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif

  auto const d_inv_IDCG = p_cache->InvIDCG(ctx);
  auto const discount = p_cache->Discount(ctx);
@@ -457,11 +449,7 @@ void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter,
                              linalg::VectorView<double> li, linalg::VectorView<double> lj,
                              linalg::Matrix<GradientPair>* out_gpair) {
  auto device = ctx->Device();
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif

  info.labels.SetDevice(device);
  predt.SetDevice(device);
@@ -500,11 +488,7 @@ void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
                                   linalg::VectorView<double> li, linalg::VectorView<double> lj,
                                   linalg::Matrix<GradientPair>* out_gpair) {
  auto device = ctx->Device();
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif

  info.labels.SetDevice(device);
  predt.SetDevice(device);

View File

@ -341,11 +341,7 @@ class DeviceModel {
int num_group; int num_group;
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) { void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(gpu_id)); dh::safe_cuda(cudaSetDevice(gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(gpu_id));
#endif
// Copy decision trees to device // Copy decision trees to device
tree_segments = HostDeviceVector<size_t>({}, gpu_id); tree_segments = HostDeviceVector<size_t>({}, gpu_id);
@ -366,21 +362,12 @@ class DeviceModel {
auto& src_nodes = model.trees.at(tree_idx)->GetNodes(); auto& src_nodes = model.trees.at(tree_idx)->GetNodes();
auto& src_stats = model.trees.at(tree_idx)->GetStats(); auto& src_stats = model.trees.at(tree_idx)->GetStats();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync( dh::safe_cuda(cudaMemcpyAsync(
d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(), d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
sizeof(RegTree::Node) * src_nodes.size(), cudaMemcpyDefault)); sizeof(RegTree::Node) * src_nodes.size(), cudaMemcpyDefault));
dh::safe_cuda(cudaMemcpyAsync( dh::safe_cuda(cudaMemcpyAsync(
d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(), d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault)); sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
sizeof(RegTree::Node) * src_nodes.size(), hipMemcpyDefault));
dh::safe_cuda(hipMemcpyAsync(
d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
sizeof(RTreeNodeStat) * src_stats.size(), hipMemcpyDefault));
#endif
} }
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id); tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id);
@ -504,11 +491,7 @@ void ExtractPaths(
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths, dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
DeviceModel *model, dh::device_vector<uint32_t> *path_categories, DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
int gpu_id) { int gpu_id) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(gpu_id)); dh::safe_cuda(cudaSetDevice(gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(gpu_id));
#endif
auto& device_model = *model; auto& device_model = *model;
dh::caching_device_vector<PathInfo> info(device_model.nodes.Size()); dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@ -584,15 +567,9 @@ void ExtractPaths(
thrust::max_element(thrust::device, max_elem_it, thrust::max_element(thrust::device, max_elem_it,
max_elem_it + d_cat_node_segments.size()) - max_elem_it + d_cat_node_segments.size()) -
max_elem_it; max_elem_it;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(h_max_cat.data(), dh::safe_cuda(cudaMemcpy(h_max_cat.data(),
d_cat_node_segments.data() + max_cat_it, d_cat_node_segments.data() + max_cat_it,
h_max_cat.size_bytes(), cudaMemcpyDeviceToHost)); h_max_cat.size_bytes(), cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(h_max_cat.data(),
d_cat_node_segments.data() + max_cat_it,
h_max_cat.size_bytes(), hipMemcpyDeviceToHost));
#endif
max_cat = h_max_cat[0].size; max_cat = h_max_cat[0].size;
CHECK_GE(max_cat, 1); CHECK_GE(max_cat, 1);
path_categories->resize(max_cat * paths->size()); path_categories->resize(max_cat * paths->size());
@ -786,11 +763,7 @@ class ColumnSplitHelper {
void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
bst_feature_t num_features, std::uint32_t num_group) const {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
dh::caching_device_vector<BitType> decision_storage{};
dh::caching_device_vector<BitType> missing_storage{};
@ -970,11 +943,7 @@ class GPUPredictor : public xgboost::Predictor {
~GPUPredictor() override {
if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
}
}
@ -1071,11 +1040,7 @@ class GPUPredictor : public xgboost::Predictor {
LOG(FATAL) << "Dart booster feature " << not_implemented; LOG(FATAL) << "Dart booster feature " << not_implemented;
} }
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
out_contribs->SetDevice(ctx_->gpu_id);
if (tree_end == 0 || tree_end > model.trees.size()) {
@ -1135,11 +1100,7 @@ class GPUPredictor : public xgboost::Predictor {
LOG(FATAL) << "Dart booster feature " << not_implemented; LOG(FATAL) << "Dart booster feature " << not_implemented;
} }
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
out_contribs->SetDevice(ctx_->gpu_id);
if (tree_end == 0 || tree_end > model.trees.size()) {
@ -1199,11 +1160,7 @@ class GPUPredictor : public xgboost::Predictor {
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
const gbm::GBTreeModel &model,
unsigned tree_end) const override {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
const MetaInfo& info = p_fmat->Info();


@ -427,15 +427,9 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
for (auto idx : nidx) {
copy_stream_.View().Wait(event);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
#endif
}
}
@ -516,13 +510,8 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
dh::ToSpan(out_entries));
GPUExpandEntry root_entry;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
hipMemcpyDeviceToHost));
#endif
return root_entry;
}
} // namespace xgboost::tree


@ -59,13 +59,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<Fea
split_cats_.resize(node_categorical_storage_size_);
h_split_cats_.resize(node_categorical_storage_size_);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(
cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(
hipMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));
#endif
cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time.
sort_input_.resize(cat_sorted_idx_.size());


@ -266,11 +266,7 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
// decide whether to use shared memory
int device = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&device));
#endif
// opt into maximum shared memory for the kernel if necessary
#if defined(XGBOOST_USE_CUDA)
@ -303,17 +299,10 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
int num_groups = feature_groups.NumGroups();
int n_mps = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
int n_blocks_per_mp = 0;
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, smem_size));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
int n_blocks_per_mp = 0;
dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, smem_size));
#endif
// This gives the number of blocks to keep the device occupied
// Use this as the maximum number of blocks
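BuildGradientHistogram also queries the multiprocessor count and kernel occupancy, so the wrapper has to cover those names as well; note the attribute enum is spelled differently on the HIP side, and the shared-memory opt-in above stays behind an explicit XGBOOST_USE_CUDA guard because it has no direct HIP counterpart. A sketch of the extra aliases, again assumed rather than taken from this commit:

#if defined(XGBOOST_USE_HIP)
// Device-attribute and occupancy queries used when sizing the histogram kernel.
#define cudaDeviceGetAttribute         hipDeviceGetAttribute
#define cudaDevAttrMultiProcessorCount hipDeviceAttributeMultiprocessorCount
#define cudaOccupancyMaxActiveBlocksPerMultiprocessor \
  hipOccupancyMaxActiveBlocksPerMultiprocessor
#endif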
@ -347,11 +336,7 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
runit(SharedMemHistKernel<false, kBlockThreads, kItemsPerThread>);
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetLastError());
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetLastError());
#endif
}
} // namespace tree


@ -16,22 +16,14 @@ namespace tree {
RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
: device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx_));
#endif
ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
}
RowPartitioner::~RowPartitioner() {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx_));
#endif
}
common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {
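For dh::safe_cuda(cudaSetDevice(device_idx_)) to compile unchanged in both builds, the error type and its query functions must be aliased as well. The helper below is a hypothetical stand-in for the project's safe_cuda macro, shown only to illustrate that once cudaError_t, cudaSuccess, and cudaGetErrorString resolve to their HIP equivalents, the same checking code serves both back ends.

#include <cuda_runtime.h>  // or the cuda-to-hip wrapper header in a ROCm build
#include <stdexcept>
#include <string>

// Hypothetical error-check helper; the real dh::safe_cuda may differ.
inline void CheckDeviceCall(cudaError_t status) {
  if (status != cudaSuccess) {
    // cudaGetErrorString maps to hipGetErrorString in a HIP build.
    throw std::runtime_error(std::string{"device call failed: "} + cudaGetErrorString(status));
  }
}

// Usage mirrors the call sites in this diff:
//   CheckDeviceCall(cudaSetDevice(device_idx_));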


@ -287,15 +287,9 @@ class RowPartitioner {
total_rows += ridx_segments_.at(nidx.at(i)).segment.Size();
}
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
hipMemcpyDefault));
#else
dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
cudaMemcpyDefault));
#endif
// Temporary arrays
auto h_counts = pinned_.GetSpan<bst_uint>(nidx.size(), 0);
@ -305,13 +299,8 @@ class RowPartitioner {
SortPositionBatch<RowIndexT, UpdatePositionOpT, OpDataT>(
dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts),
total_rows, op, &tmp_);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
hipMemcpyDefault));
#endif
// TODO(Rory): this synchronisation hurts performance a lot
// Future optimisation should find a way to skip this
dh::DefaultStream().Sync();
@ -348,15 +337,9 @@ class RowPartitioner {
void FinalisePosition(common::Span<bst_node_t> d_out_position, FinalisePositionOpT op) {
dh::TemporaryArray<NodePositionInfo> d_node_info_storage(ridx_segments_.size());
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
sizeof(NodePositionInfo) * ridx_segments_.size(),
hipMemcpyDefault));
#else
dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
sizeof(NodePositionInfo) * ridx_segments_.size(),
cudaMemcpyDefault));
#endif
constexpr int kBlockSize = 512;
const int kItemsThread = 8;


@ -232,26 +232,16 @@ struct GPUHistMakerDevice {
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
this->interaction_constraints.Reset();
if (d_gpair.size() != dh_gpair->Size()) {
d_gpair.resize(dh_gpair->Size());
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair),
cudaMemcpyDeviceToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair),
hipMemcpyDeviceToDevice));
#endif
auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
page = sample.page;
gpair = sample.gpair;
@ -338,28 +328,15 @@ struct GPUHistMakerDevice {
max_active_features =
std::max(max_active_features, static_cast<bst_feature_t>(input.feature_set.size()));
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs), hipMemcpyDefault));
#endif
this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
shared_inputs, dh::ToSpan(entries));
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
hipMemcpyDeviceToHost));
#endif
dh::DefaultStream().Sync();
}
@ -412,13 +389,8 @@ struct GPUHistMakerDevice {
BitVector missing_bits{dh::ToSpan(missing_storage)};
dh::TemporaryArray<NodeSplitData> split_data_storage(num_candidates);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(split_data_storage.data().get(), split_data.data(),
num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(split_data_storage.data().get(), split_data.data(),
num_candidates * sizeof(NodeSplitData), hipMemcpyDefault));
#endif
auto d_split_data = dh::ToSpan(split_data_storage);
dh::LaunchN(d_matrix.n_rows, [=] __device__(std::size_t ridx) mutable {
@ -527,15 +499,9 @@ struct GPUHistMakerDevice {
dh::TemporaryArray<RegTree::Node> d_nodes(p_tree->GetNodes().size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
d_nodes.size() * sizeof(RegTree::Node),
cudaMemcpyHostToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
d_nodes.size() * sizeof(RegTree::Node),
hipMemcpyHostToDevice));
#endif
auto const& h_split_types = p_tree->GetSplitTypes();
auto const& categories = p_tree->GetSplitCategories();
@ -606,15 +572,9 @@ struct GPUHistMakerDevice {
auto s_position = p_out_position->ConstDeviceSpan();
positions.resize(s_position.size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(),
s_position.size_bytes(), cudaMemcpyDeviceToDevice,
ctx_->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(positions.data().get(), s_position.data(),
s_position.size_bytes(), hipMemcpyDeviceToDevice,
ctx_->CUDACtx()->Stream()));
#endif
dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
bst_node_t position = d_out_position[idx];
@ -632,26 +592,16 @@ struct GPUHistMakerDevice {
CHECK(out_preds_d.Device().IsCUDA());
CHECK_EQ(out_preds_d.Device().ordinal, ctx_->Ordinal());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->Ordinal()));
#endif
auto d_position = dh::ToSpan(positions);
CHECK_EQ(out_preds_d.Size(), d_position.size());
auto const& h_nodes = p_tree->GetNodes();
dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice,
ctx_->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(nodes.data().get(), h_nodes.data(),
h_nodes.size() * sizeof(RegTree::Node), hipMemcpyHostToDevice,
ctx_->CUDACtx()->Stream()));
#endif
auto d_nodes = dh::ToSpan(nodes);
CHECK_EQ(out_preds_d.Shape(1), 1);
@ -904,11 +854,7 @@ class GPUHistMaker : public TreeUpdater {
++t_idx;
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetLastError());
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetLastError());
#endif
} catch (const std::exception& e) {
LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
}
@ -925,11 +871,7 @@ class GPUHistMaker : public TreeUpdater {
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
info_->feature_types.SetDevice(ctx_->gpu_id);
maker = std::make_unique<GPUHistMakerDevice>(