add cuda to hip wrapper
This commit is contained in:
@@ -28,11 +28,7 @@ void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
|
||||
// default per-thread stream
|
||||
default: {
|
||||
dh::CUDAEvent e;
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
e.Record(dh::CUDAStreamView{reinterpret_cast<cudaStream_t>(stream)});
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
e.Record(dh::CUDAStreamView{reinterpret_cast<hipStream_t>(stream)});
|
||||
#endif
|
||||
dh::DefaultStream().Wait(e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,19 +22,11 @@ namespace cub = hipcub;
|
||||
namespace xgboost {
|
||||
namespace {
|
||||
auto SetDeviceToPtr(void const* ptr) {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
cudaPointerAttributes attr;
|
||||
dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
|
||||
int32_t ptr_device = attr.device;
|
||||
dh::safe_cuda(cudaSetDevice(ptr_device));
|
||||
return ptr_device;
|
||||
#elif defined(XGBOOST_USE_HIP) /* this is wrong, need to figure out */
|
||||
hipPointerAttribute_t attr;
|
||||
dh::safe_cuda(hipPointerGetAttributes(&attr, ptr));
|
||||
int32_t ptr_device = attr.device;
|
||||
dh::safe_cuda(hipSetDevice(ptr_device));
|
||||
return ptr_device;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, int32_t D>
|
||||
@@ -57,13 +49,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
|
||||
// set data
|
||||
data->Resize(array.n);
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
|
||||
cudaMemcpyDefault, ctx->Stream()));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
|
||||
hipMemcpyDefault, ctx->Stream()));
|
||||
#endif
|
||||
});
|
||||
return;
|
||||
}
|
||||
@@ -114,13 +101,8 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
|
||||
});
|
||||
bool non_dec = true;
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpy(&non_dec, flag.data().get(), sizeof(bool),
|
||||
cudaMemcpyDeviceToHost));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpy(&non_dec, flag.data().get(), sizeof(bool),
|
||||
hipMemcpyDeviceToHost));
|
||||
#endif
|
||||
|
||||
CHECK(non_dec) << "`qid` must be sorted in increasing order along with data.";
|
||||
size_t bytes = 0;
|
||||
|
||||
@@ -123,11 +123,7 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
|
||||
device_idx_ = dh::CudaGetPointerDevice(first_column.data);
|
||||
CHECK_NE(device_idx_, Context::kCpuId);
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(device_idx_));
|
||||
#elif defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(device_idx_));
|
||||
#endif
|
||||
|
||||
for (auto& json_col : json_columns) {
|
||||
auto column = ArrayInterface<1>(get<Object const>(json_col));
|
||||
@@ -216,18 +212,10 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
|
||||
template <typename AdapterBatchT>
|
||||
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
|
||||
float missing) {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(device_idx));
|
||||
#endif
|
||||
|
||||
IsValidFunctor is_valid(missing);
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemsetAsync(offset.data(), '\0', offset.size_bytes()));
|
||||
#endif
|
||||
|
||||
auto n_samples = batch.NumRows();
|
||||
bst_feature_t n_features = batch.NumCols();
|
||||
|
||||
@@ -107,11 +107,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
|
||||
n_rows(n_rows) {
|
||||
monitor_.Init("ellpack_page");
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(device));
|
||||
#endif
|
||||
|
||||
monitor_.Start("InitCompressedData");
|
||||
InitCompressedData(device);
|
||||
@@ -132,11 +128,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
|
||||
EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
|
||||
: is_dense(dmat->IsDense()) {
|
||||
monitor_.Init("ellpack_page");
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(ctx->gpu_id));
|
||||
#endif
|
||||
|
||||
n_rows = dmat->Info().num_row_;
|
||||
|
||||
@@ -330,11 +322,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
|
||||
common::Span<size_t> row_counts_span,
|
||||
common::Span<FeatureType const> feature_types, size_t row_stride,
|
||||
size_t n_rows, common::HistogramCuts const& cuts) {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(device));
|
||||
#endif
|
||||
|
||||
*this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
|
||||
CopyDataToEllpack(batch, feature_types, this, device, missing);
|
||||
@@ -409,13 +397,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
|
||||
common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
|
||||
dh::device_vector<size_t> row_ptr(page.row_ptr.size());
|
||||
auto d_row_ptr = dh::ToSpan(row_ptr);
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
|
||||
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
|
||||
hipMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
|
||||
#endif
|
||||
|
||||
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
|
||||
auto null = accessor.NullValue();
|
||||
@@ -570,27 +553,15 @@ void EllpackPageImpl::CreateHistIndices(int device,
|
||||
if (row_batch.data.DeviceCanRead()) {
|
||||
auto const& d_data = row_batch.data.ConstDeviceSpan();
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(
|
||||
entries_d.data().get(), d_data.data() + ent_cnt_begin,
|
||||
n_entries * sizeof(Entry), cudaMemcpyDefault));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(
|
||||
entries_d.data().get(), d_data.data() + ent_cnt_begin,
|
||||
n_entries * sizeof(Entry), hipMemcpyDefault));
|
||||
#endif
|
||||
} else {
|
||||
const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(
|
||||
entries_d.data().get(), data_vec.data() + ent_cnt_begin,
|
||||
n_entries * sizeof(Entry), cudaMemcpyDefault));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(
|
||||
entries_d.data().get(), data_vec.data() + ent_cnt_begin,
|
||||
n_entries * sizeof(Entry), hipMemcpyDefault));
|
||||
#endif
|
||||
}
|
||||
|
||||
const dim3 block3(32, 8, 1); // 256 threads
|
||||
|
||||
@@ -10,11 +10,7 @@
|
||||
|
||||
namespace xgboost::data {
|
||||
void EllpackPageSource::Fetch() {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(device_));
|
||||
#endif
|
||||
if (!this->ReadCache()) {
|
||||
if (count_ != 0 && !sync_) {
|
||||
// source is initialized to be the 0th page during construction, so when count_ is 0
|
||||
|
||||
@@ -47,11 +47,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
|
||||
int32_t current_device;
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaGetDevice(¤t_device));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipGetDevice(¤t_device));
|
||||
#endif
|
||||
|
||||
auto get_device = [&]() -> int32_t {
|
||||
std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
|
||||
@@ -68,11 +64,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
// ctx_.gpu_id = proxy->DeviceIdx();
|
||||
CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(get_device()));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(get_device()));
|
||||
#endif
|
||||
|
||||
if (cols == 0) {
|
||||
cols = num_cols();
|
||||
@@ -111,11 +103,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
auto n_features = cols;
|
||||
CHECK_GE(n_features, 1) << "Data must has at least 1 column.";
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(get_device()));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(get_device()));
|
||||
#endif
|
||||
|
||||
if (!ref) {
|
||||
HostDeviceVector<FeatureType> ft;
|
||||
@@ -156,11 +144,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
while (iter.Next()) {
|
||||
init_page();
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(get_device()));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(get_device()));
|
||||
#endif
|
||||
|
||||
auto rows = num_rows();
|
||||
dh::device_vector<size_t> row_counts(rows + 1, 0);
|
||||
|
||||
@@ -25,11 +25,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
|
||||
: adapter->DeviceIdx();
|
||||
CHECK_GE(device, 0);
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(device));
|
||||
#endif
|
||||
|
||||
Context ctx;
|
||||
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
|
||||
|
||||
@@ -57,11 +57,7 @@ template <typename AdapterBatchT>
|
||||
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
|
||||
int device_idx, float missing) {
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(device_idx));
|
||||
#elif defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
#endif
|
||||
|
||||
IsValidFunctor is_valid(missing);
|
||||
// Count elements per row
|
||||
|
||||
Reference in New Issue
Block a user