Use NCCL group calls to prevent deadlock. (#4113)
* Launch all-reduce calls sequentially. * Fix memory leak in the gpu_exact test.
This commit is contained in:
parent
05243642bb
commit
f8ca2960fc
@ -852,7 +852,7 @@ class AllReducer {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
std::vector<ncclComm_t> comms;
|
||||
std::vector<cudaStream_t> streams;
|
||||
std::vector<int> device_ordinals;
|
||||
std::vector<int> device_ordinals; // device id from CUDA
|
||||
#endif
|
||||
|
||||
public:
|
||||
|
||||
@ -459,9 +459,8 @@ HostDeviceVector<T>& HostDeviceVector<T>::operator=
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::~HostDeviceVector() {
|
||||
HostDeviceVectorImpl<T>* tmp = impl_;
|
||||
delete impl_;
|
||||
impl_ = nullptr;
|
||||
delete tmp;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
||||
@ -1050,19 +1050,20 @@ class GPUHistMakerSpecialised{
|
||||
|
||||
void AllReduceHist(int nidx) {
|
||||
if (shards_.size() == 1) return;
|
||||
|
||||
monitor_.Start("AllReduce");
|
||||
dh::ExecuteIndexShards(
|
||||
&shards_,
|
||||
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
|
||||
auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
|
||||
reducer_.AllReduceSum(
|
||||
dist_.Devices().Index(shard->device_id_),
|
||||
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
||||
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
||||
n_bins_ * (sizeof(GradientSumT) /
|
||||
sizeof(typename GradientSumT::ValueT)));
|
||||
});
|
||||
|
||||
reducer_.GroupStart();
|
||||
for (auto& shard : shards_) {
|
||||
auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
|
||||
reducer_.AllReduceSum(
|
||||
dist_.Devices().Index(shard->device_id_),
|
||||
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
||||
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
||||
n_bins_ * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
|
||||
}
|
||||
reducer_.GroupEnd();
|
||||
reducer_.Synchronize();
|
||||
|
||||
monitor_.Stop("AllReduce");
|
||||
}
|
||||
|
||||
|
||||
@ -42,6 +42,9 @@ TEST(GPUExact, Update) {
|
||||
ASSERT_NEAR(tree.Stat(2).sum_hess, 2, kRtEps);
|
||||
|
||||
ASSERT_NEAR(tree.Stat(0).loss_chg, 0.8f, kRtEps);
|
||||
|
||||
delete dmat;
|
||||
delete p_gpuexact_maker;
|
||||
}
|
||||
|
||||
} // namespace tree
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user