Add CUDA-to-HIP wrapper, removing duplicated `#if XGBOOST_USE_CUDA / #elif XGBOOST_USE_HIP` branches

This commit is contained in:
Your Name
2023-10-17 12:42:37 -07:00
parent ea19555474
commit ffbbc9c968
35 changed files with 60 additions and 509 deletions

View File

@@ -28,11 +28,7 @@ void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
// default per-thread stream
default: {
dh::CUDAEvent e;
#if defined(XGBOOST_USE_CUDA)
e.Record(dh::CUDAStreamView{reinterpret_cast<cudaStream_t>(stream)});
#elif defined(XGBOOST_USE_HIP)
e.Record(dh::CUDAStreamView{reinterpret_cast<hipStream_t>(stream)});
#endif
dh::DefaultStream().Wait(e);
}
}

View File

@@ -22,19 +22,11 @@ namespace cub = hipcub;
namespace xgboost {
namespace {
auto SetDeviceToPtr(void const* ptr) {
#if defined(XGBOOST_USE_CUDA)
cudaPointerAttributes attr;
dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
int32_t ptr_device = attr.device;
dh::safe_cuda(cudaSetDevice(ptr_device));
return ptr_device;
#elif defined(XGBOOST_USE_HIP) /* this is wrong, need to figure out */
hipPointerAttribute_t attr;
dh::safe_cuda(hipPointerGetAttributes(&attr, ptr));
int32_t ptr_device = attr.device;
dh::safe_cuda(hipSetDevice(ptr_device));
return ptr_device;
#endif
}
template <typename T, int32_t D>
@@ -57,13 +49,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
// set data
data->Resize(array.n);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
cudaMemcpyDefault, ctx->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
hipMemcpyDefault, ctx->Stream()));
#endif
});
return;
}
@@ -114,13 +101,8 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
});
bool non_dec = true;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(&non_dec, flag.data().get(), sizeof(bool),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(&non_dec, flag.data().get(), sizeof(bool),
hipMemcpyDeviceToHost));
#endif
CHECK(non_dec) << "`qid` must be sorted in increasing order along with data.";
size_t bytes = 0;

View File

@@ -123,11 +123,7 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
device_idx_ = dh::CudaGetPointerDevice(first_column.data);
CHECK_NE(device_idx_, Context::kCpuId);
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx_));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx_));
#endif
for (auto& json_col : json_columns) {
auto column = ArrayInterface<1>(get<Object const>(json_col));
@@ -216,18 +212,10 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
template <typename AdapterBatchT>
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
float missing) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#endif
IsValidFunctor is_valid(missing);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#endif
auto n_samples = batch.NumRows();
bst_feature_t n_features = batch.NumCols();

View File

@@ -107,11 +107,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
n_rows(n_rows) {
monitor_.Init("ellpack_page");
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
monitor_.Start("InitCompressedData");
InitCompressedData(device);
@@ -132,11 +128,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
: is_dense(dmat->IsDense()) {
monitor_.Init("ellpack_page");
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->gpu_id));
#endif
n_rows = dmat->Info().num_row_;
@@ -330,11 +322,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
*this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
CopyDataToEllpack(batch, feature_types, this, device, missing);
@@ -409,13 +397,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
dh::device_vector<size_t> row_ptr(page.row_ptr.size());
auto d_row_ptr = dh::ToSpan(row_ptr);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
hipMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
#endif
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
auto null = accessor.NullValue();
@@ -570,27 +553,15 @@ void EllpackPageImpl::CreateHistIndices(int device,
if (row_batch.data.DeviceCanRead()) {
auto const& d_data = row_batch.data.ConstDeviceSpan();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
entries_d.data().get(), d_data.data() + ent_cnt_begin,
n_entries * sizeof(Entry), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
entries_d.data().get(), d_data.data() + ent_cnt_begin,
n_entries * sizeof(Entry), hipMemcpyDefault));
#endif
} else {
const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
entries_d.data().get(), data_vec.data() + ent_cnt_begin,
n_entries * sizeof(Entry), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
entries_d.data().get(), data_vec.data() + ent_cnt_begin,
n_entries * sizeof(Entry), hipMemcpyDefault));
#endif
}
const dim3 block3(32, 8, 1); // 256 threads

View File

@@ -10,11 +10,7 @@
namespace xgboost::data {
void EllpackPageSource::Fetch() {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
if (!this->ReadCache()) {
if (count_ != 0 && !sync_) {
// source is initialized to be the 0th page during construction, so when count_ is 0

View File

@@ -47,11 +47,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
int32_t current_device;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&current_device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&current_device));
#endif
auto get_device = [&]() -> int32_t {
std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
@@ -68,11 +64,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
// ctx_.gpu_id = proxy->DeviceIdx();
CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(get_device()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(get_device()));
#endif
if (cols == 0) {
cols = num_cols();
@@ -111,11 +103,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
auto n_features = cols;
CHECK_GE(n_features, 1) << "Data must has at least 1 column.";
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(get_device()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(get_device()));
#endif
if (!ref) {
HostDeviceVector<FeatureType> ft;
@@ -156,11 +144,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
while (iter.Next()) {
init_page();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(get_device()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(get_device()));
#endif
auto rows = num_rows();
dh::device_vector<size_t> row_counts(rows + 1, 0);

View File

@@ -25,11 +25,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
: adapter->DeviceIdx();
CHECK_GE(device, 0);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});

View File

@@ -57,11 +57,7 @@ template <typename AdapterBatchT>
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
int device_idx, float missing) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx));
#endif
IsValidFunctor is_valid(missing);
// Count elements per row