add cuda to hip wrapper
This commit is contained in:
parent ea19555474 · commit ffbbc9c968
@@ -59,21 +59,13 @@ void XGBBuildInfoDevice(Json *p_info) {
void XGBoostAPIGuard::SetGPUAttribute() {
// Not calling `safe_cuda` to avoid unnecessary exception handling overhead.
// If errors, do nothing, assuming running on CPU only machine.
#if defined(XGBOOST_USE_CUDA)
cudaGetDevice(&device_id_);
#elif defined(XGBOOST_USE_HIP)
hipGetDevice(&device_id_);
#endif
}

void XGBoostAPIGuard::RestoreGPUAttribute() {
// Not calling `safe_cuda` to avoid unnecessary exception handling overhead.
// If errors, do nothing, assuming running on CPU only machine.
#if defined(XGBOOST_USE_CUDA)
cudaSetDevice(device_id_);
#elif defined(XGBOOST_USE_HIP)
hipSetDevice(device_id_);
#endif
}

void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> const &grad,
@@ -26,22 +26,12 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
return;
}

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_ordinal_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_ordinal_));
#endif
auto size = count * GetTypeSize(data_type);
host_buffer_.resize(size);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
Allreduce(host_buffer_.data(), count, data_type, op);
dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(host_buffer_.data(), send_receive_buffer, size, hipMemcpyDefault));
AllReduce(host_buffer_.data(), count, data_type, op);
dh::safe_cuda(hipMemcpy(send_receive_buffer, host_buffer_.data(), size, hipMemcpyDefault));
#endif
}

void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override {

@@ -49,7 +39,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
return;
}

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_ordinal_));
host_buffer_.resize(send_size * world_size_);
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,

@@ -57,15 +46,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
Allgather(host_buffer_.data(), host_buffer_.size());
dh::safe_cuda(
cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_ordinal_));
host_buffer_.resize(send_size * world_size_);
dh::safe_cuda(hipMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
hipMemcpyDefault));
Allgather(host_buffer_.data(), host_buffer_.size());
dh::safe_cuda(
hipMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), hipMemcpyDefault));
#endif
}

void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,

@@ -74,11 +54,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
return;
}

#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_ordinal_));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_ordinal_));
#endif

segments->clear();
segments->resize(world_size_, 0);
@@ -92,25 +68,15 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
for (int32_t i = 0; i < world_size_; ++i) {
size_t as_bytes = segments->at(i);
if (i == rank_) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
hipMemcpyDefault));
#endif
}
Broadcast(host_buffer_.data() + offset, as_bytes, i);
offset += as_bytes;
}

#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
hipMemcpyDefault));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
cudaMemcpyDefault));
#endif
}

void Synchronize() override {
@@ -185,13 +185,8 @@ void SegmentedArgSort(Context const *ctx, Span<U> values, Span<V> group_ptr,
sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(),
group_ptr.data() + 1, ctx->CUDACtx()->Stream());

#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), hipMemcpyDeviceToDevice));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
#endif
}

/**
@@ -2,17 +2,14 @@
* Copyright 2018-2022 XGBoost contributors
*/
#include "common.h"
#include "cuda_to_hip.h"

namespace xgboost {
namespace common {

void SetDevice(std::int32_t device) {
if (device >= 0) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
}
}

@@ -21,17 +18,9 @@ int AllVisibleGPUs() {
try {
// When compiled with CUDA but running on CPU only device,
// cudaGetDeviceCount will fail.
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDeviceCount(&n_visgpus));
#endif
} catch (const dmlc::Error &) {
#if defined(XGBOOST_USE_CUDA)
cudaGetLastError();  // reset error.
#elif defined(XGBOOST_USE_HIP)
hipGetLastError();  // reset error.
#endif
return 0;
}
return n_visgpus;
src/common/cuda_to_hip.h (new file, 57 lines)

@@ -0,0 +1,57 @@
/**
* Copyright 2017-2023 XGBoost contributors
*/
#pragma once

#if defined(XGBOOST_USE_HIP)

#define cudaSuccess hipSuccess
#define cudaGetLastError hipGetLastError

#define cudaStream_t hipStream_t
#define cudaStreamCreate hipStreamCreate
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamWaitEvent hipStreamWaitEvent
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamLegacy hipStreamLegacy

#define cudaEvent_t hipEvent_t
#define cudaEventCreate hipEventCreate
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDestroy hipEventDestroy

#define cudaGetDevice hipGetDevice
#define cudaSetDevice hipSetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaDeviceSynchronize hipDeviceSynchronize

#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaDeviceGetAttribute hipDeviceGetAttribute

#define cudaMallocHost hipMallocHost
#define cudaFreeHost hipFreeHost
#define cudaMalloc hipMalloc
#define cudaFree hipFree

#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDefault hipMemcpyDefault
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyHostToHost hipMemcpyHostToHost
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemset hipMemset

#define cudaPointerAttributes hipPointerAttribute_t
#define cudaPointerGetAttributes hipPointerGetAttributes

#define cudaMemGetInfo hipMemGetInfo
#define cudaFuncSetAttribute hipFuncSetAttribute

#define cudaDevAttrMultiProcessorCount hipDeviceAttributeMultiprocessorCount
#define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor

#endif
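For illustration, a minimal usage sketch of the wrapper header (not part of this diff; the helper name and includes are assumptions): call sites keep the CUDA spelling, and when XGBOOST_USE_HIP is defined the macros above rewrite them to their HIP equivalents, so the same source builds against either runtime.

// Hypothetical example, not from this commit: with cuda_to_hip.h in scope, the
// cuda* identifiers below expand to their hip* counterparts under XGBOOST_USE_HIP.
#include <cstddef>
#if defined(XGBOOST_USE_CUDA)
#include <cuda_runtime_api.h>
#elif defined(XGBOOST_USE_HIP)
#include <hip/hip_runtime_api.h>
#endif
#include "cuda_to_hip.h"

inline void CopyToDeviceExample(float const *host, float *device, std::size_t n) {
  cudaSetDevice(0);                            // hipSetDevice(0) under XGBOOST_USE_HIP
  cudaMemcpy(device, host, n * sizeof(float),
             cudaMemcpyHostToDevice);          // hipMemcpy(..., hipMemcpyHostToDevice)
  cudaDeviceSynchronize();                     // hipDeviceSynchronize()
}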
@@ -31,6 +31,8 @@
#include <tuple>
#include <vector>

#include "cuda_to_hip.h"

#include "../collective/communicator-inl.h"
#include "common.h"
#include "xgboost/global_config.h"

@@ -330,13 +330,8 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
} else {
// copy hessian as weight
CHECK_EQ(d_weight_out.size(), hessian.size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
hipMemcpyDefault));
#endif
}
return d_weight_out;
}

@@ -88,19 +88,10 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
template <std::uint32_t kBlockThreads, typename Kernel>
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
int n_mps = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
#endif
int n_blocks_per_mp = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, shared_mem));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, shared_mem));
#endif
std::uint32_t grid_size = n_blocks_per_mp * n_mps;
return grid_size;
}

@@ -348,13 +339,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
size_t columns, size_t begin, size_t end,
SketchContainer *sketch_container) {
dh::XGBCachingDeviceAllocator<char> alloc;

#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#endif

info.weights_.SetDevice(device);
auto weights = info.weights_.ConstDeviceSpan();
@@ -140,17 +140,10 @@ class HostDeviceVectorImpl {
SetDevice();
CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
ptr,
other->Size() * sizeof(T),
cudaMemcpyDeviceToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(this->DevicePointer() + ori_size,
ptr,
other->Size() * sizeof(T),
hipMemcpyDeviceToDevice));
#endif
}
}

@@ -204,17 +197,10 @@ class HostDeviceVectorImpl {
if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
SetDevice();

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(data_h_.data(),
data_d_->data().get(),
data_d_->size() * sizeof(T),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(data_h_.data(),
data_d_->data().get(),
data_d_->size() * sizeof(T),
hipMemcpyDeviceToHost));
#endif
}

void LazySyncDevice(GPUAccess access) {

@@ -228,17 +214,10 @@ class HostDeviceVectorImpl {
LazyResizeDevice(data_h_.size());
SetDevice();

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
data_h_.data(),
data_d_->size() * sizeof(T),
cudaMemcpyHostToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(),
data_h_.data(),
data_d_->size() * sizeof(T),
hipMemcpyHostToDevice));
#endif
gpu_access_ = access;
}
@@ -264,13 +243,8 @@ class HostDeviceVectorImpl {
gpu_access_ = GPUAccess::kWrite;
SetDevice();

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
data_d_->size() * sizeof(T), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
data_d_->size() * sizeof(T), hipMemcpyDefault));
#endif
}
}

@@ -279,13 +253,8 @@ class HostDeviceVectorImpl {
gpu_access_ = GPUAccess::kWrite;
SetDevice();

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
data_d_->size() * sizeof(T), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(), begin,
data_d_->size() * sizeof(T), hipMemcpyDefault));
#endif
}

void LazyResizeDevice(size_t new_size) {

@@ -297,11 +266,7 @@ class HostDeviceVectorImpl {
void SetDevice() {
CHECK_GE(device_, 0);
if (cudaSetDeviceHandler == nullptr) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
} else {
(*cudaSetDeviceHandler)(device_);
}
@@ -12,17 +12,9 @@
namespace xgboost {
namespace linalg {
template <typename T, int32_t D, typename Fn>
#if defined(XGBOOST_USE_CUDA)
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
#elif defined(XGBOOST_USE_HIP)
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s = nullptr)
#endif
{
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(t.Device().ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(t.Device().ordinal));
#endif
static_assert(std::is_void<std::result_of_t<Fn(size_t, T&)>>::value,
"For function with return, use transform instead.");
if (t.Contiguous()) {

@@ -37,11 +29,7 @@ void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s
}

template <typename T, int32_t D, typename Fn>
#if defined(XGBOOST_USE_HIP)
void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s = nullptr)
#elif defined(XGBOOST_USE_CUDA)
void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
#endif
{
if (t.Contiguous()) {
auto ptr = t.Values().data();
@@ -110,15 +110,9 @@ void CopyTo(Span<T> out, Span<U> src) {
CHECK_EQ(out.size(), src.size());
static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value);

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
out.size_bytes(),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(out.data(), src.data(),
out.size_bytes(),
hipMemcpyDefault));
#endif
}

// Compute the merge path.

@@ -251,11 +245,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif

CHECK_EQ(d_x.size() + d_y.size(), out.size());
CHECK_EQ(x_ptr.size(), out_ptr.size());

@@ -354,11 +344,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr,
size_t total_cuts, Span<float> weights) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif

Span<SketchEntry> out;
dh::device_vector<SketchEntry> cuts;

@@ -418,11 +404,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
* pruning or merging. We preserve the first type and remove the second type.
*/
timer_.Start(__func__);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
dh::XGBCachingDeviceAllocator<char> alloc;
@@ -479,11 +461,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col

void SketchContainer::Prune(size_t to) {
timer_.Start(__func__);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif

OffsetT to_total = 0;
auto& h_columns_ptr = columns_ptr_b_.HostVector();

@@ -518,11 +496,7 @@ void SketchContainer::Prune(size_t to) {

void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
Span<SketchEntry const> that) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif

timer_.Start(__func__);
if (this->Current().size() == 0) {

@@ -558,11 +532,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
}

void SketchContainer::FixError() {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif

auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
auto in = dh::ToSpan(this->Current());

@@ -588,11 +558,7 @@ void SketchContainer::FixError() {
}

void SketchContainer::AllReduce(bool is_column_split) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
auto world = collective::GetWorldSize();
if (world == 1 || is_column_split) {
return;

@@ -674,11 +640,7 @@ struct InvalidCatOp {

void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
timer_.Start(__func__);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
p_cuts->min_vals_.Resize(num_columns_);

// Sync between workers.
@@ -176,11 +176,7 @@ class SketchContainer {
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
timer_.Start(__func__);

#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#else
dh::safe_cuda(cudaSetDevice(device_));
#endif

this->columns_ptr_.SetDevice(device_);
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
@@ -147,13 +147,8 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
auto const& h_group_ptr = info.group_ptr_;
group_ptr_.Resize(h_group_ptr.size());
auto d_group_ptr = group_ptr_.DeviceSpan();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
cudaMemcpyHostToDevice, cuctx->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
hipMemcpyHostToDevice, cuctx->Stream()));
#endif
}

auto d_group_ptr = DataGroupPtr(ctx);
@@ -61,13 +61,8 @@ std::size_t SegmentedTrapezoidThreads(xgboost::common::Span<U> group_ptr,
out_group_threads_ptr.size());
size_t total = 0;

#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
sizeof(total), hipMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
sizeof(total), cudaMemcpyDeviceToHost));
#endif

return total;
}
@@ -28,11 +28,7 @@ void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
// default per-thread stream
default: {
dh::CUDAEvent e;
#if defined(XGBOOST_USE_CUDA)
e.Record(dh::CUDAStreamView{reinterpret_cast<cudaStream_t>(stream)});
#elif defined(XGBOOST_USE_HIP)
e.Record(dh::CUDAStreamView{reinterpret_cast<hipStream_t>(stream)});
#endif
dh::DefaultStream().Wait(e);
}
}
@@ -22,19 +22,11 @@ namespace cub = hipcub;
namespace xgboost {
namespace {
auto SetDeviceToPtr(void const* ptr) {
#if defined(XGBOOST_USE_CUDA)
cudaPointerAttributes attr;
dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
int32_t ptr_device = attr.device;
dh::safe_cuda(cudaSetDevice(ptr_device));
return ptr_device;
#elif defined(XGBOOST_USE_HIP) /* this is wrong, need to figure out */
hipPointerAttribute_t attr;
dh::safe_cuda(hipPointerGetAttributes(&attr, ptr));
int32_t ptr_device = attr.device;
dh::safe_cuda(hipSetDevice(ptr_device));
return ptr_device;
#endif
}

template <typename T, int32_t D>

@@ -57,13 +49,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
// set data
data->Resize(array.n);

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
cudaMemcpyDefault, ctx->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
hipMemcpyDefault, ctx->Stream()));
#endif
});
return;
}

@@ -114,13 +101,8 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
});
bool non_dec = true;

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(&non_dec, flag.data().get(), sizeof(bool),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(&non_dec, flag.data().get(), sizeof(bool),
hipMemcpyDeviceToHost));
#endif

CHECK(non_dec) << "`qid` must be sorted in increasing order along with data.";
size_t bytes = 0;
@@ -123,11 +123,7 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
device_idx_ = dh::CudaGetPointerDevice(first_column.data);
CHECK_NE(device_idx_, Context::kCpuId);

#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx_));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx_));
#endif

for (auto& json_col : json_columns) {
auto column = ArrayInterface<1>(get<Object const>(json_col));

@@ -216,18 +212,10 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
template <typename AdapterBatchT>
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
float missing) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#endif

IsValidFunctor is_valid(missing);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#endif

auto n_samples = batch.NumRows();
bst_feature_t n_features = batch.NumCols();
@@ -107,11 +107,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
n_rows(n_rows) {
monitor_.Init("ellpack_page");

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif

monitor_.Start("InitCompressedData");
InitCompressedData(device);

@@ -132,11 +128,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
: is_dense(dmat->IsDense()) {
monitor_.Init("ellpack_page");
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->gpu_id));
#endif

n_rows = dmat->Info().num_row_;

@@ -330,11 +322,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif

*this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
CopyDataToEllpack(batch, feature_types, this, device, missing);

@@ -409,13 +397,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
dh::device_vector<size_t> row_ptr(page.row_ptr.size());
auto d_row_ptr = dh::ToSpan(row_ptr);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
hipMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
#endif

auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
auto null = accessor.NullValue();

@@ -570,27 +553,15 @@ void EllpackPageImpl::CreateHistIndices(int device,
if (row_batch.data.DeviceCanRead()) {
auto const& d_data = row_batch.data.ConstDeviceSpan();

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
entries_d.data().get(), d_data.data() + ent_cnt_begin,
n_entries * sizeof(Entry), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
entries_d.data().get(), d_data.data() + ent_cnt_begin,
n_entries * sizeof(Entry), hipMemcpyDefault));
#endif
} else {
const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
entries_d.data().get(), data_vec.data() + ent_cnt_begin,
n_entries * sizeof(Entry), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
entries_d.data().get(), data_vec.data() + ent_cnt_begin,
n_entries * sizeof(Entry), hipMemcpyDefault));
#endif
}

const dim3 block3(32, 8, 1);  // 256 threads
@@ -10,11 +10,7 @@

namespace xgboost::data {
void EllpackPageSource::Fetch() {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
if (!this->ReadCache()) {
if (count_ != 0 && !sync_) {
// source is initialized to be the 0th page during construction, so when count_ is 0
@@ -47,11 +47,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,

int32_t current_device;

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&current_device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&current_device));
#endif

auto get_device = [&]() -> int32_t {
std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;

@@ -68,11 +64,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
// ctx_.gpu_id = proxy->DeviceIdx();
CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(get_device()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(get_device()));
#endif

if (cols == 0) {
cols = num_cols();

@@ -111,11 +103,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
auto n_features = cols;
CHECK_GE(n_features, 1) << "Data must has at least 1 column.";

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(get_device()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(get_device()));
#endif

if (!ref) {
HostDeviceVector<FeatureType> ft;

@@ -156,11 +144,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
while (iter.Next()) {
init_page();

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(get_device()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(get_device()));
#endif

auto rows = num_rows();
dh::device_vector<size_t> row_counts(rows + 1, 0);
@@ -25,11 +25,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
: adapter->DeviceIdx();
CHECK_GE(device, 0);

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif

Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
@@ -57,11 +57,7 @@ template <typename AdapterBatchT>
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
int device_idx, float missing) {

#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx));
#endif

IsValidFunctor is_valid(missing);
// Count elements per row
@@ -60,11 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
return;
}

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif

// The begin and end indices for the section of each column associated with
// this device

@@ -92,17 +88,10 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
auto col = page[fidx];
auto seg = column_segments[fidx];

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(
data_.data().get() + row_ptr_[fidx],
col.data() + seg.first,
sizeof(Entry) * (seg.second - seg.first), cudaMemcpyHostToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(
data_.data().get() + row_ptr_[fidx],
col.data() + seg.first,
sizeof(Entry) * (seg.second - seg.first), hipMemcpyHostToDevice));
#endif
}
}

@@ -182,11 +171,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT

// This needs to be public because of the __device__ lambda.
GradientPair GetBiasGradient(int group_idx, int num_group) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif

auto counting = thrust::make_counting_iterator(0ull);
auto f = [=] __device__(size_t idx) {

@@ -211,11 +196,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT

// This needs to be public because of the __device__ lambda.
GradientPair GetGradient(int group_idx, int num_group, int fidx) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif

common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];

@@ -249,17 +230,10 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
}

void UpdateGpair(const std::vector<GradientPair> &host_gpair) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
gpair_.data().get(),
host_gpair.data(),
gpair_.size() * sizeof(GradientPair), cudaMemcpyHostToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
gpair_.data().get(),
host_gpair.data(),
gpair_.size() * sizeof(GradientPair), hipMemcpyHostToDevice));
#endif
}

// training parameter
@@ -95,11 +95,7 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
Fn area_fn, std::shared_ptr<DeviceAUCCache> cache) {
auto labels = info.labels.View(device);
auto weights = info.weights_.ConstDeviceSpan();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif

CHECK_NE(labels.Size(), 0);
CHECK_EQ(labels.Size(), predts.size());

@@ -352,11 +348,7 @@ template <bool scale, typename Fn>
double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
common::Span<uint32_t> d_class_ptr, size_t n_classes,
std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif
/**
* Sorted idx
*/

@@ -934,11 +926,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
common::Span<float const> predts,
MetaInfo const &info,
std::shared_ptr<DeviceAUCCache> *p_cache) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->gpu_id));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
#endif

if (predts.empty()) {
return std::make_pair(0.0, static_cast<uint32_t>(0));
@@ -166,12 +166,7 @@ class MultiClassMetricsReduction {
labels.SetDevice(device_);
weights.SetDevice(device_);

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif

result = DeviceReduceMetrics(weights, labels, preds, n_class);
}
#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
@@ -159,11 +159,7 @@ class ElementWiseSurvivalMetricsReduction {
labels_upper_bound.SetDevice(ctx.gpu_id);
weights.SetDevice(ctx.gpu_id);

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx.gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx.gpu_id));
#endif

result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds);
}
@@ -30,22 +30,13 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
dh::device_vector<size_t>* p_ridx, HostDeviceVector<size_t>* p_nptr,
HostDeviceVector<bst_node_t>* p_nidx, RegTree const& tree) {
// copy position to buffer
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
#endif
auto cuctx = ctx->CUDACtx();
size_t n_samples = position.size();
dh::device_vector<bst_node_t> sorted_position(position.size());

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(sorted_position.data().get(), position.data(),
position.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(sorted_position.data().get(), position.data(),
position.size_bytes(), hipMemcpyDeviceToDevice, cuctx->Stream()));
#endif

p_ridx->resize(position.size());
dh::Iota(dh::ToSpan(*p_ridx));

@@ -98,17 +89,10 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
bst_node_t* h_first_unique =
reinterpret_cast<bst_node_t*>(pinned.subspan(sizeof(size_t), sizeof(bst_node_t)).data());

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
cudaMemcpyDeviceToHost, copy_stream.View()));
dh::safe_cuda(cudaMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
cudaMemcpyDeviceToHost, copy_stream.View()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
hipMemcpyDeviceToHost, copy_stream.View()));
dh::safe_cuda(hipMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
hipMemcpyDeviceToHost, copy_stream.View()));
#endif

/**
* copy node index (leaf index)

@@ -171,11 +155,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
#endif
dh::device_vector<size_t> ridx;
HostDeviceVector<size_t> nptr;
HostDeviceVector<bst_node_t> nidx;
@@ -297,11 +297,7 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const
linalg::Matrix<GradientPair>* out_gpair) {
// boilerplate
std::int32_t device_id = ctx->gpu_id;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_id));
#endif
auto n_groups = p_cache->Groups();

info.labels.SetDevice(device_id);

@@ -385,11 +381,7 @@ void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
linalg::Matrix<GradientPair>* out_gpair) {
// boilerplate
auto device = ctx->Device();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif
auto const d_inv_IDCG = p_cache->InvIDCG(ctx);
auto const discount = p_cache->Discount(ctx);

@@ -457,11 +449,7 @@ void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter,
linalg::VectorView<double> li, linalg::VectorView<double> lj,
linalg::Matrix<GradientPair>* out_gpair) {
auto device = ctx->Device();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif

info.labels.SetDevice(device);
predt.SetDevice(device);

@@ -500,11 +488,7 @@ void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
linalg::VectorView<double> li, linalg::VectorView<double> lj,
linalg::Matrix<GradientPair>* out_gpair) {
auto device = ctx->Device();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif

info.labels.SetDevice(device);
predt.SetDevice(device);
@@ -341,11 +341,7 @@ class DeviceModel {
int num_group;

void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(gpu_id));
#endif

// Copy decision trees to device
tree_segments = HostDeviceVector<size_t>({}, gpu_id);

@@ -366,21 +362,12 @@ class DeviceModel {
auto& src_nodes = model.trees.at(tree_idx)->GetNodes();
auto& src_stats = model.trees.at(tree_idx)->GetStats();

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
sizeof(RegTree::Node) * src_nodes.size(), cudaMemcpyDefault));
dh::safe_cuda(cudaMemcpyAsync(
d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
sizeof(RegTree::Node) * src_nodes.size(), hipMemcpyDefault));
dh::safe_cuda(hipMemcpyAsync(
d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
sizeof(RTreeNodeStat) * src_stats.size(), hipMemcpyDefault));
#endif
}

tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id);

@@ -504,11 +491,7 @@ void ExtractPaths(
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
int gpu_id) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(gpu_id));
#endif
auto& device_model = *model;

dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());

@@ -584,15 +567,9 @@ void ExtractPaths(
thrust::max_element(thrust::device, max_elem_it,
max_elem_it + d_cat_node_segments.size()) -
max_elem_it;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(h_max_cat.data(),
d_cat_node_segments.data() + max_cat_it,
h_max_cat.size_bytes(), cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(h_max_cat.data(),
d_cat_node_segments.data() + max_cat_it,
h_max_cat.size_bytes(), hipMemcpyDeviceToHost));
#endif
max_cat = h_max_cat[0].size;
CHECK_GE(max_cat, 1);
path_categories->resize(max_cat * paths->size());
@@ -786,11 +763,7 @@ class ColumnSplitHelper {

void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
bst_feature_t num_features, std::uint32_t num_group) const {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
dh::caching_device_vector<BitType> decision_storage{};
dh::caching_device_vector<BitType> missing_storage{};

@@ -970,11 +943,7 @@ class GPUPredictor : public xgboost::Predictor {

~GPUPredictor() override {
if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
}
}

@@ -1071,11 +1040,7 @@ class GPUPredictor : public xgboost::Predictor {
LOG(FATAL) << "Dart booster feature " << not_implemented;
}

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif

out_contribs->SetDevice(ctx_->gpu_id);
if (tree_end == 0 || tree_end > model.trees.size()) {

@@ -1135,11 +1100,7 @@ class GPUPredictor : public xgboost::Predictor {
LOG(FATAL) << "Dart booster feature " << not_implemented;
}

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif

out_contribs->SetDevice(ctx_->gpu_id);
if (tree_end == 0 || tree_end > model.trees.size()) {

@@ -1199,11 +1160,7 @@ class GPUPredictor : public xgboost::Predictor {
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
const gbm::GBTreeModel &model,
unsigned tree_end) const override {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);

const MetaInfo& info = p_fmat->Info();
@@ -427,15 +427,9 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
for (auto idx : nidx) {
copy_stream_.View().Wait(event);

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
#endif
}
}

@@ -516,13 +510,8 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
dh::ToSpan(out_entries));
GPUExpandEntry root_entry;

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
hipMemcpyDeviceToHost));
#endif
return root_entry;
}
}  // namespace xgboost::tree

@@ -59,13 +59,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<Fea
split_cats_.resize(node_categorical_storage_size_);
h_split_cats_.resize(node_categorical_storage_size_);

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(
cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(
hipMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));
#endif

cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2);  // evaluate 2 nodes at a time.
sort_input_.resize(cat_sorted_idx_.size());
@@ -266,11 +266,7 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
// decide whether to use shared memory
int device = 0;

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&device));
#endif

// opt into maximum shared memory for the kernel if necessary
#if defined(XGBOOST_USE_CUDA)

@@ -303,17 +299,10 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
int num_groups = feature_groups.NumGroups();
int n_mps = 0;

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
int n_blocks_per_mp = 0;
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, smem_size));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
int n_blocks_per_mp = 0;
dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, smem_size));
#endif

// This gives the number of blocks to keep the device occupied
// Use this as the maximum number of blocks

@@ -347,11 +336,7 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
runit(SharedMemHistKernel<false, kBlockThreads, kItemsPerThread>);
}

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetLastError());
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetLastError());
#endif
}

}  // namespace tree
@@ -16,22 +16,14 @@ namespace tree {
RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
: device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx_));
#endif

ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
}

RowPartitioner::~RowPartitioner() {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx_));
#endif
}

common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {

@@ -287,15 +287,9 @@ class RowPartitioner {
total_rows += ridx_segments_.at(nidx.at(i)).segment.Size();
}

#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
hipMemcpyDefault));
#else
dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
cudaMemcpyDefault));
#endif

// Temporary arrays
auto h_counts = pinned_.GetSpan<bst_uint>(nidx.size(), 0);

@@ -305,13 +299,8 @@ class RowPartitioner {
SortPositionBatch<RowIndexT, UpdatePositionOpT, OpDataT>(
dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts),
total_rows, op, &tmp_);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
hipMemcpyDefault));
#endif
// TODO(Rory): this synchronisation hurts performance a lot
// Future optimisation should find a way to skip this
dh::DefaultStream().Sync();

@@ -348,15 +337,9 @@ class RowPartitioner {
void FinalisePosition(common::Span<bst_node_t> d_out_position, FinalisePositionOpT op) {
dh::TemporaryArray<NodePositionInfo> d_node_info_storage(ridx_segments_.size());

#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
sizeof(NodePositionInfo) * ridx_segments_.size(),
hipMemcpyDefault));
#else
dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
sizeof(NodePositionInfo) * ridx_segments_.size(),
cudaMemcpyDefault));
#endif

constexpr int kBlockSize = 512;
const int kItemsThread = 8;
@@ -232,26 +232,16 @@ struct GPUHistMakerDevice {
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif

this->interaction_constraints.Reset();

if (d_gpair.size() != dh_gpair->Size()) {
d_gpair.resize(dh_gpair->Size());
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair),
cudaMemcpyDeviceToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair),
hipMemcpyDeviceToDevice));
#endif
auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
page = sample.page;
gpair = sample.gpair;

@@ -338,28 +328,15 @@ struct GPUHistMakerDevice {
max_active_features =
std::max(max_active_features, static_cast<bst_feature_t>(input.feature_set.size()));
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs), hipMemcpyDefault));
#endif

this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
shared_inputs, dh::ToSpan(entries));
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
hipMemcpyDeviceToHost));
#endif

dh::DefaultStream().Sync();
}

@@ -412,13 +389,8 @@ struct GPUHistMakerDevice {
BitVector missing_bits{dh::ToSpan(missing_storage)};

dh::TemporaryArray<NodeSplitData> split_data_storage(num_candidates);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(split_data_storage.data().get(), split_data.data(),
num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(split_data_storage.data().get(), split_data.data(),
num_candidates * sizeof(NodeSplitData), hipMemcpyDefault));
#endif
auto d_split_data = dh::ToSpan(split_data_storage);

dh::LaunchN(d_matrix.n_rows, [=] __device__(std::size_t ridx) mutable {
@@ -527,15 +499,9 @@ struct GPUHistMakerDevice {

dh::TemporaryArray<RegTree::Node> d_nodes(p_tree->GetNodes().size());

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
d_nodes.size() * sizeof(RegTree::Node),
cudaMemcpyHostToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
d_nodes.size() * sizeof(RegTree::Node),
hipMemcpyHostToDevice));
#endif

auto const& h_split_types = p_tree->GetSplitTypes();
auto const& categories = p_tree->GetSplitCategories();

@@ -606,15 +572,9 @@ struct GPUHistMakerDevice {
auto s_position = p_out_position->ConstDeviceSpan();
positions.resize(s_position.size());

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(),
s_position.size_bytes(), cudaMemcpyDeviceToDevice,
ctx_->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(positions.data().get(), s_position.data(),
s_position.size_bytes(), hipMemcpyDeviceToDevice,
ctx_->CUDACtx()->Stream()));
#endif

dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
bst_node_t position = d_out_position[idx];

@@ -632,26 +592,16 @@ struct GPUHistMakerDevice {
CHECK(out_preds_d.Device().IsCUDA());
CHECK_EQ(out_preds_d.Device().ordinal, ctx_->Ordinal());

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->Ordinal()));
#endif
auto d_position = dh::ToSpan(positions);
CHECK_EQ(out_preds_d.Size(), d_position.size());

auto const& h_nodes = p_tree->GetNodes();
dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice,
ctx_->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(nodes.data().get(), h_nodes.data(),
h_nodes.size() * sizeof(RegTree::Node), hipMemcpyHostToDevice,
ctx_->CUDACtx()->Stream()));
#endif

auto d_nodes = dh::ToSpan(nodes);
CHECK_EQ(out_preds_d.Shape(1), 1);

@@ -904,11 +854,7 @@ class GPUHistMaker : public TreeUpdater {
++t_idx;
}

#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetLastError());
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetLastError());
#endif
} catch (const std::exception& e) {
LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
}

@@ -925,11 +871,7 @@ class GPUHistMaker : public TreeUpdater {
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);

auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif

info_->feature_types.SetDevice(ctx_->gpu_id);
maker = std::make_unique<GPUHistMakerDevice>(