add cuda to hip wrapper
This commit is contained in:
@@ -427,15 +427,9 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
|
||||
for (auto idx : nidx) {
|
||||
copy_stream_.View().Wait(event);
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(
|
||||
h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
|
||||
d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(
|
||||
h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
|
||||
d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -516,13 +510,8 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
|
||||
dh::ToSpan(out_entries));
|
||||
GPUExpandEntry root_entry;
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
|
||||
cudaMemcpyDeviceToHost));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
|
||||
hipMemcpyDeviceToHost));
|
||||
#endif
|
||||
return root_entry;
|
||||
}
|
||||
} // namespace xgboost::tree
|
||||
|
||||
@@ -59,13 +59,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<Fea
|
||||
split_cats_.resize(node_categorical_storage_size_);
|
||||
h_split_cats_.resize(node_categorical_storage_size_);
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(
|
||||
cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(
|
||||
hipMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));
|
||||
#endif
|
||||
|
||||
cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time.
|
||||
sort_input_.resize(cat_sorted_idx_.size());
|
||||
|
||||
@@ -266,11 +266,7 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
|
||||
// decide whether to use shared memory
|
||||
int device = 0;
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaGetDevice(&device));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipGetDevice(&device));
|
||||
#endif
|
||||
|
||||
// opt into maximum shared memory for the kernel if necessary
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
@@ -303,17 +299,10 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
|
||||
int num_groups = feature_groups.NumGroups();
|
||||
int n_mps = 0;
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
|
||||
int n_blocks_per_mp = 0;
|
||||
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
|
||||
kBlockThreads, smem_size));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
|
||||
int n_blocks_per_mp = 0;
|
||||
dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
|
||||
kBlockThreads, smem_size));
|
||||
#endif
|
||||
|
||||
// This gives the number of blocks to keep the device occupied
|
||||
// Use this as the maximum number of blocks
|
||||
@@ -347,11 +336,7 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
|
||||
runit(SharedMemHistKernel<false, kBlockThreads, kItemsPerThread>);
|
||||
}
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaGetLastError());
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipGetLastError());
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace tree
|
||||
|
||||
@@ -16,22 +16,14 @@ namespace tree {
|
||||
RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
|
||||
: device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(device_idx_));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(device_idx_));
|
||||
#endif
|
||||
|
||||
ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
|
||||
thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
|
||||
}
|
||||
|
||||
RowPartitioner::~RowPartitioner() {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(device_idx_));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(device_idx_));
|
||||
#endif
|
||||
}
|
||||
|
||||
common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {
|
||||
|
||||
@@ -287,15 +287,9 @@ class RowPartitioner {
|
||||
total_rows += ridx_segments_.at(nidx.at(i)).segment.Size();
|
||||
}
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
|
||||
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
|
||||
hipMemcpyDefault));
|
||||
#else
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
|
||||
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
|
||||
cudaMemcpyDefault));
|
||||
#endif
|
||||
|
||||
// Temporary arrays
|
||||
auto h_counts = pinned_.GetSpan<bst_uint>(nidx.size(), 0);
|
||||
@@ -305,13 +299,8 @@ class RowPartitioner {
|
||||
SortPositionBatch<RowIndexT, UpdatePositionOpT, OpDataT>(
|
||||
dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts),
|
||||
total_rows, op, &tmp_);
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
|
||||
cudaMemcpyDefault));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
|
||||
hipMemcpyDefault));
|
||||
#endif
|
||||
// TODO(Rory): this synchronisation hurts performance a lot
|
||||
// Future optimisation should find a way to skip this
|
||||
dh::DefaultStream().Sync();
|
||||
@@ -348,15 +337,9 @@ class RowPartitioner {
|
||||
void FinalisePosition(common::Span<bst_node_t> d_out_position, FinalisePositionOpT op) {
|
||||
dh::TemporaryArray<NodePositionInfo> d_node_info_storage(ridx_segments_.size());
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
|
||||
sizeof(NodePositionInfo) * ridx_segments_.size(),
|
||||
hipMemcpyDefault));
|
||||
#else
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
|
||||
sizeof(NodePositionInfo) * ridx_segments_.size(),
|
||||
cudaMemcpyDefault));
|
||||
#endif
|
||||
|
||||
constexpr int kBlockSize = 512;
|
||||
const int kItemsThread = 8;
|
||||
|
||||
@@ -232,26 +232,16 @@ struct GPUHistMakerDevice {
|
||||
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
|
||||
param.colsample_bynode, param.colsample_bylevel,
|
||||
param.colsample_bytree);
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
|
||||
#endif
|
||||
|
||||
this->interaction_constraints.Reset();
|
||||
|
||||
if (d_gpair.size() != dh_gpair->Size()) {
|
||||
d_gpair.resize(dh_gpair->Size());
|
||||
}
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
|
||||
dh_gpair->Size() * sizeof(GradientPair),
|
||||
cudaMemcpyDeviceToDevice));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
|
||||
dh_gpair->Size() * sizeof(GradientPair),
|
||||
hipMemcpyDeviceToDevice));
|
||||
#endif
|
||||
auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
|
||||
page = sample.page;
|
||||
gpair = sample.gpair;
|
||||
@@ -338,28 +328,15 @@ struct GPUHistMakerDevice {
|
||||
max_active_features =
|
||||
std::max(max_active_features, static_cast<bst_feature_t>(input.feature_set.size()));
|
||||
}
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(
|
||||
d_node_inputs.data().get(), h_node_inputs.data(),
|
||||
h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(
|
||||
d_node_inputs.data().get(), h_node_inputs.data(),
|
||||
h_node_inputs.size() * sizeof(EvaluateSplitInputs), hipMemcpyDefault));
|
||||
#endif
|
||||
|
||||
this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
|
||||
shared_inputs, dh::ToSpan(entries));
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
|
||||
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
|
||||
cudaMemcpyDeviceToHost));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(pinned_candidates_out.data(),
|
||||
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
|
||||
hipMemcpyDeviceToHost));
|
||||
#endif
|
||||
|
||||
dh::DefaultStream().Sync();
|
||||
}
|
||||
|
||||
@@ -412,13 +389,8 @@ struct GPUHistMakerDevice {
|
||||
BitVector missing_bits{dh::ToSpan(missing_storage)};
|
||||
|
||||
dh::TemporaryArray<NodeSplitData> split_data_storage(num_candidates);
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(split_data_storage.data().get(), split_data.data(),
|
||||
num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(split_data_storage.data().get(), split_data.data(),
|
||||
num_candidates * sizeof(NodeSplitData), hipMemcpyDefault));
|
||||
#endif
|
||||
auto d_split_data = dh::ToSpan(split_data_storage);
|
||||
|
||||
dh::LaunchN(d_matrix.n_rows, [=] __device__(std::size_t ridx) mutable {
|
||||
@@ -527,15 +499,9 @@ struct GPUHistMakerDevice {
|
||||
|
||||
dh::TemporaryArray<RegTree::Node> d_nodes(p_tree->GetNodes().size());
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
|
||||
d_nodes.size() * sizeof(RegTree::Node),
|
||||
cudaMemcpyHostToDevice));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
|
||||
d_nodes.size() * sizeof(RegTree::Node),
|
||||
hipMemcpyHostToDevice));
|
||||
#endif
|
||||
|
||||
auto const& h_split_types = p_tree->GetSplitTypes();
|
||||
auto const& categories = p_tree->GetSplitCategories();
|
||||
@@ -606,15 +572,9 @@ struct GPUHistMakerDevice {
|
||||
auto s_position = p_out_position->ConstDeviceSpan();
|
||||
positions.resize(s_position.size());
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(),
|
||||
s_position.size_bytes(), cudaMemcpyDeviceToDevice,
|
||||
ctx_->CUDACtx()->Stream()));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(positions.data().get(), s_position.data(),
|
||||
s_position.size_bytes(), hipMemcpyDeviceToDevice,
|
||||
ctx_->CUDACtx()->Stream()));
|
||||
#endif
|
||||
|
||||
dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
|
||||
bst_node_t position = d_out_position[idx];
|
||||
@@ -632,26 +592,16 @@ struct GPUHistMakerDevice {
|
||||
CHECK(out_preds_d.Device().IsCUDA());
|
||||
CHECK_EQ(out_preds_d.Device().ordinal, ctx_->Ordinal());
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(ctx_->Ordinal()));
|
||||
#endif
|
||||
auto d_position = dh::ToSpan(positions);
|
||||
CHECK_EQ(out_preds_d.Size(), d_position.size());
|
||||
|
||||
auto const& h_nodes = p_tree->GetNodes();
|
||||
dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
|
||||
h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice,
|
||||
ctx_->CUDACtx()->Stream()));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(nodes.data().get(), h_nodes.data(),
|
||||
h_nodes.size() * sizeof(RegTree::Node), hipMemcpyHostToDevice,
|
||||
ctx_->CUDACtx()->Stream()));
|
||||
#endif
|
||||
|
||||
auto d_nodes = dh::ToSpan(nodes);
|
||||
CHECK_EQ(out_preds_d.Shape(1), 1);
|
||||
@@ -904,11 +854,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
++t_idx;
|
||||
}
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaGetLastError());
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipGetLastError());
|
||||
#endif
|
||||
} catch (const std::exception& e) {
|
||||
LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
|
||||
}
|
||||
@@ -925,11 +871,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
|
||||
|
||||
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
|
||||
#endif
|
||||
|
||||
info_->feature_types.SetDevice(ctx_->gpu_id);
|
||||
maker = std::make_unique<GPUHistMakerDevice>(
|
||||
|
||||
Reference in New Issue
Block a user