From ffbbc9c9689343719e62bfb1c197f521d50bcb9e Mon Sep 17 00:00:00 2001
From: Your Name <96135754+amdsc21@users.noreply.github.com>
Date: Tue, 17 Oct 2023 12:42:37 -0700
Subject: [PATCH] Add CUDA-to-HIP wrapper header

---
 src/c_api/c_api.cu                     |  8 ---
 .../device_communicator_adapter.cuh    | 34 -----------
 src/common/algorithm.cuh               |  5 --
 src/common/common.cu                   | 13 +----
 src/common/cuda_to_hip.h               | 57 ++++++++++++++++++
 src/common/device_helpers.hip.h        |  2 +
 src/common/hist_util.cu                |  5 --
 src/common/hist_util.cuh               | 15 -----
 src/common/host_device_vector.cu       | 35 -----------
 src/common/linalg_op.cuh               | 12 ----
 src/common/quantile.cu                 | 38 ------------
 src/common/quantile.cuh                |  4 --
 src/common/ranking_utils.cu            |  5 --
 src/common/threading_utils.cuh         |  5 --
 src/data/array_interface.cu            |  4 --
 src/data/data.cu                       | 18 ------
 src/data/device_adapter.cuh            | 12 ----
 src/data/ellpack_page.cu               | 29 ----------
 src/data/ellpack_page_source.cu        |  4 --
 src/data/iterative_dmatrix.cu          | 16 -----
 src/data/simple_dmatrix.cu             |  4 --
 src/data/simple_dmatrix.cuh            |  4 --
 src/linear/updater_gpu_coordinate.cu   | 26 ---------
 src/metric/auc.cu                      | 12 ----
 src/metric/multiclass_metric.cu        |  5 --
 src/metric/survival_metric.cu          |  4 --
 src/objective/adaptive.cu              | 20 -------
 src/objective/lambdarank_obj.cu        | 16 -----
 src/predictor/gpu_predictor.cu         | 43 --------------
 src/tree/gpu_hist/evaluate_splits.cu   | 11 ----
 src/tree/gpu_hist/evaluator.cu         |  5 --
 src/tree/gpu_hist/histogram.cu         | 15 -----
 src/tree/gpu_hist/row_partitioner.cu   |  8 ---
 src/tree/gpu_hist/row_partitioner.cuh  | 17 ------
 src/tree/updater_gpu_hist.cu           | 58 -------------------
 35 files changed, 60 insertions(+), 509 deletions(-)
 create mode 100644 src/common/cuda_to_hip.h
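Note: the patch collapses the per-call-site `#if defined(XGBOOST_USE_CUDA)` / `#elif defined(XGBOOST_USE_HIP)` blocks scattered across the GPU sources into one mapping header, src/common/cuda_to_hip.h, which #defines each CUDA runtime symbol to its HIP counterpart when XGBOOST_USE_HIP is set. A minimal sketch of the technique; the shim lines mirror the new header, while the call-site function is invented for illustration:

    #include <cstddef>

    // cuda_to_hip-style shim (excerpt; guarded so CUDA builds are untouched)
    #if defined(XGBOOST_USE_HIP)
    #include <hip/hip_runtime.h>
    #define cudaSetDevice     hipSetDevice
    #define cudaMemcpyAsync   hipMemcpyAsync
    #define cudaMemcpyDefault hipMemcpyDefault
    #else
    #include <cuda_runtime.h>
    #endif

    // Call sites are written once against the CUDA spelling; under HIP the
    // preprocessor rewrites them before the compiler ever sees them.
    inline void CopyToDevice(void* dst, void const* src, std::size_t n) {
      cudaSetDevice(0);
      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault);
    }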
diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu
index 506be723b..de21e9749 100644
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@@ -59,21 +59,13 @@ void XGBBuildInfoDevice(Json *p_info) {
 void XGBoostAPIGuard::SetGPUAttribute() {
   // Not calling `safe_cuda` to avoid unnecessary exception handling overhead.
   // If errors, do nothing, assuming running on CPU only machine.
-#if defined(XGBOOST_USE_CUDA)
   cudaGetDevice(&device_id_);
-#elif defined(XGBOOST_USE_HIP)
-  hipGetDevice(&device_id_);
-#endif
 }
 
 void XGBoostAPIGuard::RestoreGPUAttribute() {
   // Not calling `safe_cuda` to avoid unnecessary exception handling overhead.
   // If errors, do nothing, assuming running on CPU only machine.
-#if defined(XGBOOST_USE_CUDA)
   cudaSetDevice(device_id_);
-#elif defined(XGBOOST_USE_HIP)
-  hipSetDevice(device_id_);
-#endif
 }
 
 void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> const &grad,
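The guard above saves the caller's device ordinal on entry and restores it on exit, deliberately skipping safe_cuda so a CPU-only machine simply no-ops. A standalone sketch of that save/restore idiom (the class name here is invented, not XGBoost API):

    #include <cuda_runtime.h>

    // RAII version of the save/restore pattern behind XGBoostAPIGuard.
    // Errors are intentionally ignored, mirroring the comments above.
    class DeviceOrdinalGuard {
      int device_id_{-1};

     public:
      DeviceOrdinalGuard() { cudaGetDevice(&device_id_); }
      ~DeviceOrdinalGuard() {
        if (device_id_ >= 0) {
          cudaSetDevice(device_id_);
          cudaGetLastError();  // clear any sticky error from the calls above
        }
      }
    };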
diff --git a/src/collective/device_communicator_adapter.cuh b/src/collective/device_communicator_adapter.cuh
index 49c0405cb..0ffa28770 100644
--- a/src/collective/device_communicator_adapter.cuh
+++ b/src/collective/device_communicator_adapter.cuh
@@ -26,22 +26,12 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
       return;
     }
 
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(device_ordinal_));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_ordinal_));
-#endif
 
     auto size = count * GetTypeSize(data_type);
     host_buffer_.resize(size);
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
     Allreduce(host_buffer_.data(), count, data_type, op);
     dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(host_buffer_.data(), send_receive_buffer, size, hipMemcpyDefault));
-    AllReduce(host_buffer_.data(), count, data_type, op);
-    dh::safe_cuda(hipMemcpy(send_receive_buffer, host_buffer_.data(), size, hipMemcpyDefault));
-#endif
   }
 
   void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override {
@@ -49,7 +39,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
       return;
     }
 
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(device_ordinal_));
     host_buffer_.resize(send_size * world_size_);
     dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
@@ -57,15 +46,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
                              cudaMemcpyDefault));
     Allgather(host_buffer_.data(), host_buffer_.size());
     dh::safe_cuda(
         cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_ordinal_));
-    host_buffer_.resize(send_size * world_size_);
-    dh::safe_cuda(hipMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
-                            hipMemcpyDefault));
-    Allgather(host_buffer_.data(), host_buffer_.size());
-    dh::safe_cuda(
-        hipMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), hipMemcpyDefault));
-#endif
   }
 
   void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector *segments,
@@ -74,11 +54,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
       return;
     }
 
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_ordinal_));
-#elif defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(device_ordinal_));
-#endif
 
     segments->clear();
     segments->resize(world_size_, 0);
@@ -92,25 +68,15 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
     for (int32_t i = 0; i < world_size_; ++i) {
       size_t as_bytes = segments->at(i);
       if (i == rank_) {
-#if defined(XGBOOST_USE_CUDA)
         dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
                                  cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-        dh::safe_cuda(hipMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
-                                hipMemcpyDefault));
-#endif
       }
       Broadcast(host_buffer_.data() + offset, as_bytes, i);
       offset += as_bytes;
     }
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
-                            hipMemcpyDefault));
-#elif defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
                              cudaMemcpyDefault));
-#endif
   }
 
   void Synchronize() override {
diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh
index 20192a7f2..8bf6bb808 100644
--- a/src/common/algorithm.cuh
+++ b/src/common/algorithm.cuh
@@ -185,13 +185,8 @@ void SegmentedArgSort(Context const *ctx, Span values, Span group_ptr,
                                sorted_idx_out.data().get(), sorted_idx.size(), n_groups,
                                group_ptr.data(), group_ptr.data() + 1, ctx->CUDACtx()->Stream());
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
-                               sorted_idx.size_bytes(), hipMemcpyDeviceToDevice));
-#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
                                 sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
-#endif
 }
 
 /**
diff --git a/src/common/common.cu b/src/common/common.cu
index 0997b7c83..b57890906 100644
--- a/src/common/common.cu
+++ b/src/common/common.cu
@@ -2,17 +2,14 @@
  * Copyright 2018-2022 XGBoost contributors
  */
 #include "common.h"
+#include "cuda_to_hip.h"
 
 namespace xgboost {
 namespace common {
 
 void SetDevice(std::int32_t device) {
   if (device >= 0) {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device));
-#endif
   }
 }
 
@@ -21,17 +18,9 @@ int AllVisibleGPUs() {
   try {
     // When compiled with CUDA but running on CPU only device,
     // cudaGetDeviceCount will fail.
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipGetDeviceCount(&n_visgpus));
-#endif
   } catch (const dmlc::Error &) {
-#if defined(XGBOOST_USE_CUDA)
     cudaGetLastError();  // reset error.
-#elif defined(XGBOOST_USE_HIP)
-    hipGetLastError();  // reset error.
-#endif
     return 0;
   }
   return n_visgpus;
 }
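AllVisibleGPUs() above shows the probe-and-reset idiom: attempt cudaGetDeviceCount, and on failure clear the sticky runtime error with cudaGetLastError so later calls start clean. A standalone sketch of the same idea without dh::safe_cuda or dmlc::Error:

    #include <cuda_runtime.h>

    // Probe the device count; a CPU-only machine (or missing driver) reports 0.
    int VisibleGpuCount() {
      int n = 0;
      if (cudaGetDeviceCount(&n) != cudaSuccess) {
        cudaGetLastError();  // clear the sticky error so later calls start clean
        return 0;
      }
      return n;
    }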
diff --git a/src/common/cuda_to_hip.h b/src/common/cuda_to_hip.h
new file mode 100644
index 000000000..6033a80b2
--- /dev/null
+++ b/src/common/cuda_to_hip.h
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2017-2023 XGBoost contributors
+ */
+#pragma once
+
+#if defined(XGBOOST_USE_HIP)
+
+#define cudaSuccess hipSuccess
+#define cudaGetLastError hipGetLastError
+
+#define cudaStream_t hipStream_t
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamWaitEvent hipStreamWaitEvent
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamPerThread hipStreamPerThread
+#define cudaStreamLegacy hipStreamLegacy
+
+#define cudaEvent_t hipEvent_t
+#define cudaEventCreate hipEventCreate
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDestroy hipEventDestroy
+
+#define cudaGetDevice hipGetDevice
+#define cudaSetDevice hipSetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaDeviceSynchronize hipDeviceSynchronize
+
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaDeviceGetAttribute hipDeviceGetAttribute
+
+#define cudaMallocHost hipMallocHost
+#define cudaFreeHost hipFreeHost
+#define cudaMalloc hipMalloc
+#define cudaFree hipFree
+
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDefault hipMemcpyDefault
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyHostToHost hipMemcpyHostToHost
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaMemset hipMemset
+
+#define cudaPointerAttributes hipPointerAttribute_t
+#define cudaPointerGetAttributes hipPointerGetAttributes
+
+#define cudaMemGetInfo hipMemGetInfo
+#define cudaFuncSetAttribute hipFuncSetAttribute
+
+#define cudaDevAttrMultiProcessorCount hipDeviceAttributeMultiprocessorCount
+#define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor
+
+#endif
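Most entries in the new header are pure renames, but a few map a CUDA name onto a differently named HIP entity (cudaPointerAttributes -> hipPointerAttribute_t, cudaDevAttrMultiProcessorCount -> hipDeviceAttributeMultiprocessorCount). That still works because shared code only ever spells the CUDA name; both attribute structs expose a .device member, which data.cu below relies on. An illustrative helper, not part of the patch:

    #include <cuda_runtime.h>  // plus the shim above when XGBOOST_USE_HIP

    // With the shim in scope, this single spelling resolves to
    // hipPointerAttribute_t / hipPointerGetAttributes under HIP.
    int DeviceOfPointer(void const* ptr) {
      cudaPointerAttributes attr;
      if (cudaPointerGetAttributes(&attr, ptr) != cudaSuccess) {
        cudaGetLastError();
        return -1;  // plain host pointer, or query failed
      }
      return attr.device;
    }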
diff --git a/src/common/device_helpers.hip.h b/src/common/device_helpers.hip.h
index e7ee49b5a..2852155d4 100644
--- a/src/common/device_helpers.hip.h
+++ b/src/common/device_helpers.hip.h
@@ -31,6 +31,8 @@
 #include
 #include
 
+#include "cuda_to_hip.h"
+
 #include "../collective/communicator-inl.h"
 #include "common.h"
 #include "xgboost/global_config.h"
diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
index 14b60df33..f727384de 100644
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -330,13 +330,8 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
   } else {
     // copy hessian as weight
     CHECK_EQ(d_weight_out.size(), hessian.size());
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
                                   cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
-                                 hipMemcpyDefault));
-#endif
   }
   return d_weight_out;
 }
diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh
index bc99e6fc4..f86685eda 100644
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -88,19 +88,10 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan batch_iter,
 template
 std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
   int n_mps = 0;
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
-#endif
   int n_blocks_per_mp = 0;
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
                                                               kBlockThreads, shared_mem));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
-                                                             kBlockThreads, shared_mem));
-#endif
   std::uint32_t grid_size = n_blocks_per_mp * n_mps;
   return grid_size;
 }
@@ -348,13 +339,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
                                   size_t columns, size_t begin, size_t end,
                                   SketchContainer *sketch_container) {
   dh::XGBCachingDeviceAllocator alloc;
-
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
-#endif
-
   info.weights_.SetDevice(device);
   auto weights = info.weights_.ConstDeviceSpan();
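EstimateGridSize above sizes the launch grid as SM count times per-SM occupancy. A standalone sketch of the same two queries, with an invented kernel and block size:

    #include <cstddef>
    #include <cuda_runtime.h>

    __global__ void DummyKernel() {}  // stand-in for the histogram kernel

    unsigned GridSizeFor(int device, std::size_t shared_mem) {
      constexpr int kBlockThreads = 256;
      int n_mps = 0;
      cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device);
      int n_blocks_per_mp = 0;
      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, DummyKernel,
                                                    kBlockThreads, shared_mem);
      // Enough blocks to keep every SM at its occupancy limit.
      return static_cast<unsigned>(n_blocks_per_mp) * n_mps;
    }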
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index 70e5c448a..a9102f668 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -140,17 +140,10 @@ class HostDeviceVectorImpl {
       SetDevice();
 
       CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
                                     ptr,
                                     other->Size() * sizeof(T),
                                     cudaMemcpyDeviceToDevice));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(this->DevicePointer() + ori_size,
-                                   ptr,
-                                   other->Size() * sizeof(T),
-                                   hipMemcpyDeviceToDevice));
-#endif
     }
   }
 
@@ -204,17 +197,10 @@ class HostDeviceVectorImpl {
     if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
     SetDevice();
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpy(data_h_.data(),
                              data_d_->data().get(),
                              data_d_->size() * sizeof(T),
                              cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(data_h_.data(),
-                            data_d_->data().get(),
-                            data_d_->size() * sizeof(T),
-                            hipMemcpyDeviceToHost));
-#endif
   }
 
   void LazySyncDevice(GPUAccess access) {
@@ -228,17 +214,10 @@ class HostDeviceVectorImpl {
     LazyResizeDevice(data_h_.size());
     SetDevice();
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
                                   data_h_.data(),
                                   data_d_->size() * sizeof(T),
                                   cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(),
-                                 data_h_.data(),
-                                 data_d_->size() * sizeof(T),
-                                 hipMemcpyHostToDevice));
-#endif
     gpu_access_ = access;
   }
 
@@ -264,13 +243,8 @@ class HostDeviceVectorImpl {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
                                     data_d_->size() * sizeof(T), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
-                                   data_d_->size() * sizeof(T), hipMemcpyDefault));
-#endif
     }
   }
 
@@ -279,13 +253,8 @@ class HostDeviceVectorImpl {
     gpu_access_ = GPUAccess::kWrite;
     SetDevice();
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
                                   data_d_->size() * sizeof(T), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(), begin,
-                                 data_d_->size() * sizeof(T), hipMemcpyDefault));
-#endif
   }
 
   void LazyResizeDevice(size_t new_size) {
@@ -297,11 +266,7 @@ class HostDeviceVectorImpl {
   void SetDevice() {
     CHECK_GE(device_, 0);
     if (cudaSetDeviceHandler == nullptr) {
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(device_));
-#endif
     } else {
       (*cudaSetDeviceHandler)(device_);
     }
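HostDeviceVectorImpl above keeps mirrored host and device buffers and copies only when the access mode changes. A toy sketch of that lazy mirroring; the real class tracks read/write access levels, this sketch only a dirty flag:

    #include <cstddef>
    #include <cuda_runtime.h>
    #include <vector>

    template <typename T>
    class MirroredVector {
      std::vector<T> host_;
      T* device_{nullptr};
      bool device_dirty_{false};

     public:
      explicit MirroredVector(std::size_t n) : host_(n) {
        cudaMalloc(&device_, n * sizeof(T));
      }
      ~MirroredVector() { cudaFree(device_); }

      T* Device() {  // upload before handing out a writable device view
        cudaMemcpyAsync(device_, host_.data(), host_.size() * sizeof(T),
                        cudaMemcpyHostToDevice);
        device_dirty_ = true;
        return device_;
      }
      std::vector<T>& Host() {  // download only if the device side was written
        if (device_dirty_) {
          cudaMemcpy(host_.data(), device_, host_.size() * sizeof(T),
                     cudaMemcpyDeviceToHost);
          device_dirty_ = false;
        }
        return host_;
      }
    };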
diff --git a/src/common/linalg_op.cuh b/src/common/linalg_op.cuh
index 1d97f9b21..1f68c6ce7 100644
--- a/src/common/linalg_op.cuh
+++ b/src/common/linalg_op.cuh
@@ -12,17 +12,9 @@ namespace xgboost {
 namespace linalg {
 template
-#if defined(XGBOOST_USE_CUDA)
 void ElementWiseKernelDevice(linalg::TensorView t, Fn&& fn, cudaStream_t s = nullptr)
-#elif defined(XGBOOST_USE_HIP)
-void ElementWiseKernelDevice(linalg::TensorView t, Fn&& fn, hipStream_t s = nullptr)
-#endif
 {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(t.Device().ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(t.Device().ordinal));
-#endif
   static_assert(std::is_void>::value,
                 "For function with return, use transform instead.");
   if (t.Contiguous()) {
@@ -37,11 +29,7 @@ void ElementWiseKernelDevice(linalg::TensorView t, Fn&& fn, hipStream_t s
 }
 
 template
-#if defined(XGBOOST_USE_HIP)
-void ElementWiseTransformDevice(linalg::TensorView t, Fn&& fn, hipStream_t s = nullptr)
-#elif defined(XGBOOST_USE_CUDA)
 void ElementWiseTransformDevice(linalg::TensorView t, Fn&& fn, cudaStream_t s = nullptr)
-#endif
 {
   if (t.Contiguous()) {
     auto ptr = t.Values().data();
diff --git a/src/common/quantile.cu b/src/common/quantile.cu
index 881275298..9896165ad 100644
--- a/src/common/quantile.cu
+++ b/src/common/quantile.cu
@@ -110,15 +110,9 @@ void CopyTo(Span out, Span src) {
   CHECK_EQ(out.size(), src.size());
   static_assert(std::is_same, std::remove_cv_t>::value);
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
                                 out.size_bytes(),
                                 cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(out.data(), src.data(),
-                               out.size_bytes(),
-                               hipMemcpyDefault));
-#endif
 }
 
 // Compute the merge path.
@@ -251,11 +245,7 @@ common::Span> MergePath(
 void MergeImpl(int32_t device, Span const &d_x, Span const &x_ptr,
                Span const &d_y, Span const &y_ptr,
                Span out, Span out_ptr) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif
   CHECK_EQ(d_x.size() + d_y.size(), out.size());
   CHECK_EQ(x_ptr.size(), out_ptr.size());
 
@@ -354,11 +344,7 @@ void MergeImpl(int32_t device, Span const &d_x,
 void SketchContainer::Push(Span entries, Span columns_ptr,
                            common::Span cuts_ptr, size_t total_cuts, Span weights) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   Span out;
   dh::device_vector cuts;
 
@@ -418,11 +404,7 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col
    * pruning or merging. We preserve the first type and remove the second type.
    */
   timer_.Start(__func__);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
 
   dh::XGBCachingDeviceAllocator alloc;
@@ -479,11 +461,7 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col
 
 void SketchContainer::Prune(size_t to) {
   timer_.Start(__func__);
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
 
   OffsetT to_total = 0;
   auto& h_columns_ptr = columns_ptr_b_.HostVector();
@@ -518,11 +496,7 @@ void SketchContainer::Prune(size_t to) {
 
 void SketchContainer::Merge(Span d_that_columns_ptr,
                             Span that) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   timer_.Start(__func__);
 
   if (this->Current().size() == 0) {
@@ -558,11 +532,7 @@ void SketchContainer::Merge(Span d_that_columns_ptr,
 }
 
 void SketchContainer::FixError() {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
   auto in = dh::ToSpan(this->Current());
@@ -588,11 +558,7 @@ void SketchContainer::FixError() {
 }
 
 void SketchContainer::AllReduce(bool is_column_split) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   auto world = collective::GetWorldSize();
   if (world == 1 || is_column_split) {
     return;
@@ -674,11 +640,7 @@ struct InvalidCatOp {
 
 void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
   timer_.Start(__func__);
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   p_cuts->min_vals_.Resize(num_columns_);
 
   // Sync between workers.
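quantile.cu funnels span-to-span copies through a small CopyTo helper so that cudaMemcpyDefault (unified addressing) picks the copy direction; that is what lets a single spelling serve host and device operands. A generic re-sketch with raw pointers in place of xgboost's Span:

    #include <cassert>
    #include <cstddef>
    #include <cuda_runtime.h>

    // Re-sketch of the CopyTo pattern; the direction is inferred from the
    // pointers themselves via cudaMemcpyDefault.
    template <typename T>
    void CopyTo(T* out, T const* src, std::size_t n) {
      assert(out != nullptr || n == 0);
      cudaMemcpyAsync(out, src, n * sizeof(T), cudaMemcpyDefault);
    }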
diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh
index 79db5d857..221706274 100644
--- a/src/common/quantile.cuh
+++ b/src/common/quantile.cuh
@@ -176,11 +176,7 @@ class SketchContainer {
   size_t Unique(KeyComp key_comp = thrust::equal_to{}) {
     timer_.Start(__func__);
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_));
-#else
     dh::safe_cuda(cudaSetDevice(device_));
-#endif
     this->columns_ptr_.SetDevice(device_);
     Span d_column_scan = this->columns_ptr_.DeviceSpan();
 
diff --git a/src/common/ranking_utils.cu b/src/common/ranking_utils.cu
index 39aee4073..e9347aa82 100644
--- a/src/common/ranking_utils.cu
+++ b/src/common/ranking_utils.cu
@@ -147,13 +147,8 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
     auto const& h_group_ptr = info.group_ptr_;
     group_ptr_.Resize(h_group_ptr.size());
     auto d_group_ptr = group_ptr_.DeviceSpan();
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
                                   cudaMemcpyHostToDevice, cuctx->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
-                                 hipMemcpyHostToDevice, cuctx->Stream()));
-#endif
   }
 
   auto d_group_ptr = DataGroupPtr(ctx);
diff --git a/src/common/threading_utils.cuh b/src/common/threading_utils.cuh
index 23fda9256..77cf709d3 100644
--- a/src/common/threading_utils.cuh
+++ b/src/common/threading_utils.cuh
@@ -61,13 +61,8 @@ std::size_t SegmentedTrapezoidThreads(xgboost::common::Span group_ptr,
                        out_group_threads_ptr.size());
 
   size_t total = 0;
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
-                          sizeof(total), hipMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
                            sizeof(total), cudaMemcpyDeviceToHost));
-#endif
   return total;
 }
 
diff --git a/src/data/array_interface.cu b/src/data/array_interface.cu
index cf4117656..492c24200 100644
--- a/src/data/array_interface.cu
+++ b/src/data/array_interface.cu
@@ -28,11 +28,7 @@ void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
     // default per-thread stream
     default: {
      dh::CUDAEvent e;
-#if defined(XGBOOST_USE_CUDA)
      e.Record(dh::CUDAStreamView{reinterpret_cast(stream)});
-#elif defined(XGBOOST_USE_HIP)
-      e.Record(dh::CUDAStreamView{reinterpret_cast(stream)});
-#endif
      dh::DefaultStream().Wait(e);
    }
  }
diff --git a/src/data/data.cu b/src/data/data.cu
index 3fe44ee12..b1b75f5e6 100644
--- a/src/data/data.cu
+++ b/src/data/data.cu
@@ -22,19 +22,11 @@ namespace cub = hipcub;
 namespace xgboost {
 namespace {
 auto SetDeviceToPtr(void const* ptr) {
-#if defined(XGBOOST_USE_CUDA)
   cudaPointerAttributes attr;
   dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
   int32_t ptr_device = attr.device;
   dh::safe_cuda(cudaSetDevice(ptr_device));
   return ptr_device;
-#elif defined(XGBOOST_USE_HIP) /* this is wrong, need to figure out */
-  hipPointerAttribute_t attr;
-  dh::safe_cuda(hipPointerGetAttributes(&attr, ptr));
-  int32_t ptr_device = attr.device;
-  dh::safe_cuda(hipSetDevice(ptr_device));
-  return ptr_device;
-#endif
 }
 
 template
@@ -57,13 +49,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
     // set data
     data->Resize(array.n);
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
                                   cudaMemcpyDefault, ctx->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
-                                 hipMemcpyDefault, ctx->Stream()));
-#endif
   });
   return;
 }
@@ -114,13 +101,8 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector* p_
   });
   bool non_dec = true;
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpy(&non_dec, flag.data().get(), sizeof(bool),
                            cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpy(&non_dec, flag.data().get(), sizeof(bool),
-                          hipMemcpyDeviceToHost));
-#endif
   CHECK(non_dec) << "`qid` must be sorted in increasing order along with data.";
 
   size_t bytes = 0;
diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh
index 361d808ad..7b907f7e2 100644
--- a/src/data/device_adapter.cuh
+++ b/src/data/device_adapter.cuh
@@ -123,11 +123,7 @@ class CudfAdapter : public detail::SingleBatchDataIter {
     device_idx_ = dh::CudaGetPointerDevice(first_column.data);
     CHECK_NE(device_idx_, Context::kCpuId);
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_idx_));
-#elif defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(device_idx_));
-#endif
 
     for (auto& json_col : json_columns) {
       auto column = ArrayInterface<1>(get(json_col));
@@ -216,18 +212,10 @@ class CupyAdapter : public detail::SingleBatchDataIter {
 template
 std::size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, int device_idx,
                          float missing) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_idx));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_idx));
-#endif
   IsValidFunctor is_valid(missing);
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemsetAsync(offset.data(), '\0', offset.size_bytes()));
-#endif
 
   auto n_samples = batch.NumRows();
   bst_feature_t n_features = batch.NumCols();
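GetRowCounts above zeroes the per-row offset buffer and then counts entries passing the missing-value predicate. A hedged thrust sketch of the counting idea; the flat value layout is invented, since the real adapter batch carries per-row indexing:

    #include <cstddef>
    #include <thrust/count.h>
    #include <thrust/device_vector.h>

    // An entry is valid when it is neither NaN nor the user's missing marker.
    struct IsValid {
      float missing;
      __host__ __device__ bool operator()(float v) const {
        return v == v && v != missing;  // v == v filters NaN
      }
    };

    std::size_t CountValid(thrust::device_vector<float> const& values, float missing) {
      return thrust::count_if(values.begin(), values.end(), IsValid{missing});
    }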
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index da6b52cc4..58b96b665 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -107,11 +107,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
       n_rows(n_rows) {
   monitor_.Init("ellpack_page");
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif
 
   monitor_.Start("InitCompressedData");
   InitCompressedData(device);
@@ -132,11 +128,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
 EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
     : is_dense(dmat->IsDense()) {
   monitor_.Init("ellpack_page");
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->gpu_id));
-#endif
 
   n_rows = dmat->Info().num_row_;
 
@@ -330,11 +322,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
                                  common::Span row_counts_span,
                                  common::Span feature_types, size_t row_stride,
                                  size_t n_rows, common::HistogramCuts const& cuts) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif
 
   *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
   CopyDataToEllpack(batch, feature_types, this, device, missing);
@@ -409,13 +397,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
   common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
 
   dh::device_vector row_ptr(page.row_ptr.size());
   auto d_row_ptr = dh::ToSpan(row_ptr);
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
                                 cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
-                               hipMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
-#endif
 
   auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
   auto null = accessor.NullValue();
@@ -570,27 +553,15 @@ void EllpackPageImpl::CreateHistIndices(int device,
 
     if (row_batch.data.DeviceCanRead()) {
       auto const& d_data = row_batch.data.ConstDeviceSpan();
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaMemcpyAsync(
           entries_d.data().get(), d_data.data() + ent_cnt_begin,
           n_entries * sizeof(Entry), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(
-          entries_d.data().get(), d_data.data() + ent_cnt_begin,
-          n_entries * sizeof(Entry), hipMemcpyDefault));
-#endif
     } else {
       const std::vector& data_vec = row_batch.data.ConstHostVector();
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaMemcpyAsync(
          entries_d.data().get(), data_vec.data() + ent_cnt_begin,
          n_entries * sizeof(Entry), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(
-          entries_d.data().get(), data_vec.data() + ent_cnt_begin,
-          n_entries * sizeof(Entry), hipMemcpyDefault));
-#endif
     }
 
     const dim3 block3(32, 8, 1);  // 256 threads
diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index 2247d281e..abfc400c1 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -10,11 +10,7 @@ namespace xgboost::data {
 void EllpackPageSource::Fetch() {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   if (!this->ReadCache()) {
     if (count_ != 0 && !sync_) {
       // source is initialized to be the 0th page during construction, so when count_ is 0
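InitFromCUDA below records the thread's current device and falls back to it whenever the context reports CPU. A sketch of that resolution rule; kCpuId and ctx_ordinal are stand-ins for Context::kCpuId and ctx->gpu_id:

    #include <cuda_runtime.h>

    // Resolve the ordinal to use: honour an explicit GPU context, otherwise
    // reuse whatever device the caller's thread already has current.
    int ResolveOrdinal(int ctx_ordinal) {
      constexpr int kCpuId = -1;
      int current_device = 0;
      cudaGetDevice(&current_device);
      return ctx_ordinal == kCpuId ? current_device : ctx_ordinal;
    }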
diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu
index a878ff115..4825b58e7 100644
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -47,11 +47,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
   int32_t current_device;
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaGetDevice(&current_device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipGetDevice(&current_device));
-#endif
   auto get_device = [&]() -> int32_t {
     std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
@@ -68,11 +64,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
   // ctx_.gpu_id = proxy->DeviceIdx();
   CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(get_device()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(get_device()));
-#endif
 
   if (cols == 0) {
     cols = num_cols();
@@ -111,11 +103,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
   auto n_features = cols;
   CHECK_GE(n_features, 1) << "Data must has at least 1 column.";
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(get_device()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(get_device()));
-#endif
 
   if (!ref) {
     HostDeviceVector ft;
@@ -156,11 +144,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
   while (iter.Next()) {
     init_page();
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(get_device()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(get_device()));
-#endif
     auto rows = num_rows();
     dh::device_vector row_counts(rows + 1, 0);
diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu
index fe81a0f4d..39d701b43 100644
--- a/src/data/simple_dmatrix.cu
+++ b/src/data/simple_dmatrix.cu
@@ -25,11 +25,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
                    : adapter->DeviceIdx();
   CHECK_GE(device, 0);
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif
 
   Context ctx;
   ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
diff --git a/src/data/simple_dmatrix.cuh b/src/data/simple_dmatrix.cuh
index 6b25afd45..a26899ff1 100644
--- a/src/data/simple_dmatrix.cuh
+++ b/src/data/simple_dmatrix.cuh
@@ -57,11 +57,7 @@ template
 void CountRowOffsets(const AdapterBatchT& batch, common::Span offset,
                      int device_idx, float missing) {
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_idx));
-#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_idx));
-#endif
   IsValidFunctor is_valid(missing);
 
   // Count elements per row
diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu
index 51c144f11..1c1ae1ba4 100644
--- a/src/linear/updater_gpu_coordinate.cu
+++ b/src/linear/updater_gpu_coordinate.cu
@@ -60,11 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
       return;
     }
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
 
     // The begin and end indices for the section of each column associated with
     // this device
@@ -92,17 +88,10 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
         auto col = page[fidx];
         auto seg = column_segments[fidx];
-#if defined(XGBOOST_USE_CUDA)
         dh::safe_cuda(cudaMemcpy(
             data_.data().get() + row_ptr_[fidx],
             col.data() + seg.first,
             sizeof(Entry) * (seg.second - seg.first), cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-        dh::safe_cuda(hipMemcpy(
-            data_.data().get() + row_ptr_[fidx],
-            col.data() + seg.first,
-            sizeof(Entry) * (seg.second - seg.first), hipMemcpyHostToDevice));
-#endif
       }
   }
 
@@ -182,11 +171,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
   // This needs to be public because of the __device__ lambda.
   GradientPair GetBiasGradient(int group_idx, int num_group) {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     auto counting = thrust::make_counting_iterator(0ull);
     auto f = [=] __device__(size_t idx) {
@@ -211,11 +196,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
   // This needs to be public because of the __device__ lambda.
   GradientPair GetGradient(int group_idx, int num_group, int fidx) {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     common::Span d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
     size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
@@ -249,17 +230,10 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
   }
 
   void UpdateGpair(const std::vector &host_gpair) {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(
         gpair_.data().get(),
         host_gpair.data(),
         gpair_.size() * sizeof(GradientPair), cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(
-        gpair_.data().get(),
-        host_gpair.data(),
-        gpair_.size() * sizeof(GradientPair), hipMemcpyHostToDevice));
-#endif
   }
 
   // training parameter
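UpdateGpair above streams host gradient pairs to the device. One caveat worth noting: cudaMemcpyAsync from pageable host memory silently degrades to a synchronous copy, so pinned memory would be needed for real overlap. A hedged sketch, with GradPair standing in for xgboost's GradientPair:

    #include <cuda_runtime.h>
    #include <vector>

    struct GradPair { float grad, hess; };

    void Upload(GradPair* d_gpair, std::vector<GradPair> const& h_gpair) {
      cudaMemcpyAsync(d_gpair, h_gpair.data(), h_gpair.size() * sizeof(GradPair),
                      cudaMemcpyHostToDevice);
      cudaDeviceSynchronize();  // make the host buffer safe to reuse
    }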
diff --git a/src/metric/auc.cu b/src/metric/auc.cu
index 0586f1a03..7f8fa38be 100644
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -95,11 +95,7 @@ GPUBinaryAUC(common::Span predts, MetaInfo const &info,
              Fn area_fn, std::shared_ptr cache) {
   auto labels = info.labels.View(device);
   auto weights = info.weights_.ConstDeviceSpan();
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif
 
   CHECK_NE(labels.Size(), 0);
   CHECK_EQ(labels.Size(), predts.size());
@@ -352,11 +348,7 @@ template
 double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
                            common::Span d_class_ptr, size_t n_classes,
                            std::shared_ptr cache, Fn area_fn) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif
   /**
    * Sorted idx
    */
@@ -934,11 +926,7 @@ std::pair GPURankingPRAUC(Context const *ctx,
                           common::Span predts, MetaInfo const &info,
                           std::shared_ptr *p_cache) {
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->gpu_id));
-#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
-#endif
 
   if (predts.empty()) {
     return std::make_pair(0.0, static_cast(0));
diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu
index 6c27f4100..ba236a0be 100644
--- a/src/metric/multiclass_metric.cu
+++ b/src/metric/multiclass_metric.cu
@@ -166,12 +166,7 @@ class MultiClassMetricsReduction {
       labels.SetDevice(device_);
       weights.SetDevice(device_);
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(device_));
-#endif
-
       result = DeviceReduceMetrics(weights, labels, preds, n_class);
     }
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu
index e4accc436..ef49687f9 100644
--- a/src/metric/survival_metric.cu
+++ b/src/metric/survival_metric.cu
@@ -159,11 +159,7 @@ class ElementWiseSurvivalMetricsReduction {
       labels_upper_bound.SetDevice(ctx.gpu_id);
       weights.SetDevice(ctx.gpu_id);
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaSetDevice(ctx.gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(ctx.gpu_id));
-#endif
 
       result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds);
     }
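adaptive.cu below stages tiny scalars (run counts) back to pinned host memory on a dedicated copy stream, so the compute stream keeps working while the readback completes. A sketch of that pattern with invented names; pinned memory is what makes the device-to-host cudaMemcpyAsync truly asynchronous:

    #include <cstddef>
    #include <cuda_runtime.h>

    void ReadbackCount(std::size_t const* d_num_runs) {
      std::size_t* h_num_runs = nullptr;
      cudaMallocHost(reinterpret_cast<void**>(&h_num_runs), sizeof(std::size_t));

      cudaStream_t copy_stream;
      cudaStreamCreate(&copy_stream);
      cudaMemcpyAsync(h_num_runs, d_num_runs, sizeof(std::size_t),
                      cudaMemcpyDeviceToHost, copy_stream);
      cudaStreamSynchronize(copy_stream);  // wait on the copy, not the compute

      // ... use *h_num_runs ...
      cudaStreamDestroy(copy_stream);
      cudaFreeHost(h_num_runs);
    }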
diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu
index 4bbabbf28..4835373ad 100644
--- a/src/objective/adaptive.cu
+++ b/src/objective/adaptive.cu
@@ -30,22 +30,13 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span pos
                           dh::device_vector* p_ridx, HostDeviceVector* p_nptr,
                           HostDeviceVector* p_nidx, RegTree const& tree) {
   // copy position to buffer
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
-#endif
   auto cuctx = ctx->CUDACtx();
   size_t n_samples = position.size();
   dh::device_vector sorted_position(position.size());
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(sorted_position.data().get(), position.data(),
                                 position.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(sorted_position.data().get(), position.data(),
-                               position.size_bytes(), hipMemcpyDeviceToDevice, cuctx->Stream()));
-#endif
 
   p_ridx->resize(position.size());
   dh::Iota(dh::ToSpan(*p_ridx));
@@ -98,17 +89,10 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span pos
   bst_node_t* h_first_unique =
       reinterpret_cast(pinned.subspan(sizeof(size_t), sizeof(bst_node_t)).data());
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
                                 cudaMemcpyDeviceToHost, copy_stream.View()));
   dh::safe_cuda(cudaMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
                                 cudaMemcpyDeviceToHost, copy_stream.View()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
-                               hipMemcpyDeviceToHost, copy_stream.View()));
-  dh::safe_cuda(hipMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
-                               hipMemcpyDeviceToHost, copy_stream.View()));
-#endif
 
   /**
    * copy node index (leaf index)
@@ -171,11 +155,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span pos
 void UpdateTreeLeafDevice(Context const* ctx, common::Span position,
                           std::int32_t group_idx, MetaInfo const& info, float learning_rate,
                           HostDeviceVector const& predt, float alpha, RegTree* p_tree) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
-#endif
   dh::device_vector ridx;
   HostDeviceVector nptr;
   HostDeviceVector nidx;
diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu
index 14bd31063..f0a7f1d5e 100644
--- a/src/objective/lambdarank_obj.cu
+++ b/src/objective/lambdarank_obj.cu
@@ -297,11 +297,7 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector const
             linalg::Matrix* out_gpair) {
   // boilerplate
   std::int32_t device_id = ctx->gpu_id;
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_id));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_id));
-#endif
 
   auto n_groups = p_cache->Groups();
   info.labels.SetDevice(device_id);
@@ -385,11 +381,7 @@ void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
                                linalg::Matrix* out_gpair) {
   // boilerplate
   auto device = ctx->Device();
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif
 
   auto const d_inv_IDCG = p_cache->InvIDCG(ctx);
   auto const discount = p_cache->Discount(ctx);
@@ -457,11 +449,7 @@ void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter,
                               linalg::VectorView li, linalg::VectorView lj,
                               linalg::Matrix* out_gpair) {
   auto device = ctx->Device();
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif
 
   info.labels.SetDevice(device);
   predt.SetDevice(device);
@@ -500,11 +488,7 @@ void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
                                    linalg::VectorView li, linalg::VectorView lj,
                                    linalg::Matrix* out_gpair) {
   auto device = ctx->Device();
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif
 
   info.labels.SetDevice(device);
   predt.SetDevice(device);
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index d5c08c22f..b1ab57b98 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -341,11 +341,7 @@ class DeviceModel {
   int num_group;
 
   void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(gpu_id));
-#endif
 
     // Copy decision trees to device
     tree_segments = HostDeviceVector({}, gpu_id);
@@ -366,21 +362,12 @@ class DeviceModel {
       auto& src_nodes = model.trees.at(tree_idx)->GetNodes();
       auto& src_stats = model.trees.at(tree_idx)->GetStats();
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaMemcpyAsync(
           d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
           sizeof(RegTree::Node) * src_nodes.size(), cudaMemcpyDefault));
       dh::safe_cuda(cudaMemcpyAsync(
           d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
           sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(
-          d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
-          sizeof(RegTree::Node) * src_nodes.size(), hipMemcpyDefault));
-      dh::safe_cuda(hipMemcpyAsync(
-          d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
-          sizeof(RTreeNodeStat) * src_stats.size(), hipMemcpyDefault));
-#endif
     }
 
     tree_group = HostDeviceVector(model.tree_info.size(), 0, gpu_id);
@@ -504,11 +491,7 @@ void ExtractPaths(
     dh::device_vector> *paths,
     DeviceModel *model, dh::device_vector *path_categories,
     int gpu_id) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(gpu_id));
-#endif
   auto& device_model = *model;
 
   dh::caching_device_vector info(device_model.nodes.Size());
@@ -584,15 +567,9 @@ void ExtractPaths(
         thrust::max_element(thrust::device, max_elem_it, max_elem_it + d_cat_node_segments.size()) -
         max_elem_it;
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpy(h_max_cat.data(),
                              d_cat_node_segments.data() + max_cat_it,
                              h_max_cat.size_bytes(), cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(h_max_cat.data(),
-                            d_cat_node_segments.data() + max_cat_it,
-                            h_max_cat.size_bytes(), hipMemcpyDeviceToHost));
-#endif
     max_cat = h_max_cat[0].size;
     CHECK_GE(max_cat, 1);
     path_categories->resize(max_cat * paths->size());
@@ -786,11 +763,7 @@ class ColumnSplitHelper {
   void PredictDMatrix(DMatrix* dmat, HostDeviceVector* out_preds, DeviceModel const& model,
                       bst_feature_t num_features, std::uint32_t num_group) const {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     dh::caching_device_vector decision_storage{};
     dh::caching_device_vector missing_storage{};
@@ -970,11 +943,7 @@ class GPUPredictor : public xgboost::Predictor {
 
   ~GPUPredictor() override {
     if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     }
   }
 
@@ -1071,11 +1040,7 @@ class GPUPredictor : public xgboost::Predictor {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     out_contribs->SetDevice(ctx_->gpu_id);
 
     if (tree_end == 0 || tree_end > model.trees.size()) {
@@ -1135,11 +1100,7 @@ class GPUPredictor : public xgboost::Predictor {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     out_contribs->SetDevice(ctx_->gpu_id);
 
     if (tree_end == 0 || tree_end > model.trees.size()) {
@@ -1199,11 +1160,7 @@ class GPUPredictor : public xgboost::Predictor {
   void PredictLeaf(DMatrix *p_fmat, HostDeviceVector *predictions,
                    const gbm::GBTreeModel &model, unsigned tree_end) const override {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
 
     const MetaInfo& info = p_fmat->Info();
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index b6f21004f..ad5992602 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -427,15 +427,9 @@ void GPUHistEvaluator::CopyToHost(const std::vector &nidx) {
   for (auto idx : nidx) {
     copy_stream_.View().Wait(event);
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(
         h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
         d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(
-        h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
-        d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
-#endif
   }
 }
 
@@ -516,13 +510,8 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
       dh::ToSpan(out_entries));
   GPUExpandEntry root_entry;
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
                                 cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
-                               hipMemcpyDeviceToHost));
-#endif
   return root_entry;
 }
 }  // namespace xgboost::tree
diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu
index 2cbe13a22..b23cb670b 100644
--- a/src/tree/gpu_hist/evaluator.cu
+++ b/src/tree/gpu_hist/evaluator.cu
@@ -59,13 +59,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts,
                   common::Span);
   }
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaGetLastError());
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipGetLastError());
-#endif
 }
 }  // namespace tree
diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu
index b1c738142..b1ded6cda 100644
--- a/src/tree/gpu_hist/row_partitioner.cu
+++ b/src/tree/gpu_hist/row_partitioner.cu
@@ -16,22 +16,14 @@ namespace tree {
 
 RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
     : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_idx_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_idx_));
-#endif
   ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
   thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
 }
 
 RowPartitioner::~RowPartitioner() {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_idx_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_idx_));
-#endif
 }
 
 common::Span RowPartitioner::GetRows(bst_node_t nidx) {
diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh
index 38938e848..74f0dee2b 100644
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -287,15 +287,9 @@ class RowPartitioner {
       total_rows += ridx_segments_.at(nidx.at(i)).segment.Size();
     }
 
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
-                                 h_batch_info.size() * sizeof(PerNodeData),
-                                 hipMemcpyDefault));
-#else
     dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
                                   h_batch_info.size() * sizeof(PerNodeData),
                                   cudaMemcpyDefault));
-#endif
 
     // Temporary arrays
     auto h_counts = pinned_.GetSpan(nidx.size(), 0);
@@ -305,13 +299,8 @@ class RowPartitioner {
     SortPositionBatch(
         dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts),
         total_rows, op, &tmp_);
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
                                   cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
-                                 hipMemcpyDefault));
-#endif
     // TODO(Rory): this synchronisation hurts performance a lot
     // Future optimisation should find a way to skip this
     dh::DefaultStream().Sync();
@@ -348,15 +337,9 @@ class RowPartitioner {
   void FinalisePosition(common::Span d_out_position, FinalisePositionOpT op) {
     dh::TemporaryArray d_node_info_storage(ridx_segments_.size());
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
-                                 sizeof(NodePositionInfo) * ridx_segments_.size(),
-                                 hipMemcpyDefault));
-#else
     dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
                                   sizeof(NodePositionInfo) * ridx_segments_.size(),
                                   cudaMemcpyDefault));
-#endif
 
     constexpr int kBlockSize = 512;
     const int kItemsThread = 8;
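UpdatePositionBatch and FinalisePosition above pack per-node segment metadata into one host array and ship it with a single cudaMemcpyAsync instead of one copy per node, amortising launch and copy latency. A sketch of the pattern, with NodeInfo as a stand-in for the real per-node structs:

    #include <cuda_runtime.h>
    #include <vector>

    struct NodeInfo { int begin, end; };

    void UploadNodeInfo(NodeInfo* d_info, std::vector<NodeInfo> const& h_info,
                        cudaStream_t stream) {
      // One batched copy covers every node being processed this round.
      cudaMemcpyAsync(d_info, h_info.data(), h_info.size() * sizeof(NodeInfo),
                      cudaMemcpyDefault, stream);
    }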
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 6e06450fb..58074a79e 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -232,26 +232,16 @@ struct GPUHistMakerDevice {
     this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
                                 param.colsample_bynode, param.colsample_bylevel,
                                 param.colsample_bytree);
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
 
     this->interaction_constraints.Reset();
 
     if (d_gpair.size() != dh_gpair->Size()) {
       d_gpair.resize(dh_gpair->Size());
     }
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
                                   dh_gpair->Size() * sizeof(GradientPair),
                                   cudaMemcpyDeviceToDevice));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
-                                 dh_gpair->Size() * sizeof(GradientPair),
-                                 hipMemcpyDeviceToDevice));
-#endif
     auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
     page = sample.page;
     gpair = sample.gpair;
@@ -338,28 +328,15 @@ struct GPUHistMakerDevice {
       max_active_features =
           std::max(max_active_features, static_cast(input.feature_set.size()));
     }
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(
         d_node_inputs.data().get(), h_node_inputs.data(),
         h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(
-        d_node_inputs.data().get(), h_node_inputs.data(),
-        h_node_inputs.size() * sizeof(EvaluateSplitInputs), hipMemcpyDefault));
-#endif
 
     this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
                                     shared_inputs, dh::ToSpan(entries));
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
                                   entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
                                   cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(pinned_candidates_out.data(),
-                                 entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
-                                 hipMemcpyDeviceToHost));
-#endif
-
     dh::DefaultStream().Sync();
   }
 
@@ -412,13 +389,8 @@ struct GPUHistMakerDevice {
     BitVector missing_bits{dh::ToSpan(missing_storage)};
 
     dh::TemporaryArray split_data_storage(num_candidates);
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(split_data_storage.data().get(), split_data.data(),
                                   num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(split_data_storage.data().get(), split_data.data(),
-                                 num_candidates * sizeof(NodeSplitData), hipMemcpyDefault));
-#endif
     auto d_split_data = dh::ToSpan(split_data_storage);
 
     dh::LaunchN(d_matrix.n_rows, [=] __device__(std::size_t ridx) mutable {
@@ -527,15 +499,9 @@ struct GPUHistMakerDevice {
 
     dh::TemporaryArray d_nodes(p_tree->GetNodes().size());
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
                                   d_nodes.size() * sizeof(RegTree::Node),
                                   cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
-                                 d_nodes.size() * sizeof(RegTree::Node),
-                                 hipMemcpyHostToDevice));
-#endif
 
     auto const& h_split_types = p_tree->GetSplitTypes();
     auto const& categories = p_tree->GetSplitCategories();
@@ -606,15 +572,9 @@ struct GPUHistMakerDevice {
     auto s_position = p_out_position->ConstDeviceSpan();
     positions.resize(s_position.size());
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(),
                                   s_position.size_bytes(), cudaMemcpyDeviceToDevice,
                                   ctx_->CUDACtx()->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(positions.data().get(), s_position.data(),
-                                 s_position.size_bytes(), hipMemcpyDeviceToDevice,
-                                 ctx_->CUDACtx()->Stream()));
-#endif
 
     dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
       bst_node_t position = d_out_position[idx];
@@ -632,26 +592,16 @@ struct GPUHistMakerDevice {
     CHECK(out_preds_d.Device().IsCUDA());
     CHECK_EQ(out_preds_d.Device().ordinal, ctx_->Ordinal());
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->Ordinal()));
-#endif
 
     auto d_position = dh::ToSpan(positions);
     CHECK_EQ(out_preds_d.Size(), d_position.size());
 
     auto const& h_nodes = p_tree->GetNodes();
     dh::caching_device_vector nodes(h_nodes.size());
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
                                   h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice,
                                   ctx_->CUDACtx()->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(nodes.data().get(), h_nodes.data(),
-                                 h_nodes.size() * sizeof(RegTree::Node), hipMemcpyHostToDevice,
-                                 ctx_->CUDACtx()->Stream()));
-#endif
     auto d_nodes = dh::ToSpan(nodes);
 
     CHECK_EQ(out_preds_d.Shape(1), 1);
@@ -904,11 +854,7 @@ class GPUHistMaker : public TreeUpdater {
         ++t_idx;
       }
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaGetLastError());
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipGetLastError());
-#endif
     } catch (const std::exception& e) {
       LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
     }
@@ -925,11 +871,7 @@ class GPUHistMaker : public TreeUpdater {
     this->column_sampler_ = std::make_shared(column_sampling_seed);
     auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     info_->feature_types.SetDevice(ctx_->gpu_id);
     maker = std::make_unique(