Use NCCL group calls to prevent deadlock. (#4113)
* Launch all-reduce sequentially.
* Fix gpu_exact test memory leak.
This commit is contained in:
parent 05243642bb
commit f8ca2960fc
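Background on the deadlock in the title, as a minimal sketch rather than the actual xgboost AllReducer: when one host thread drives one NCCL communicator per GPU, the per-device collectives need to be wrapped in ncclGroupStart()/ncclGroupEnd(). The function and variable names below are illustrative assumptions, not names from this repository.

// Minimal sketch (assumed names): one host thread issues an all-reduce on every
// device.  Without group semantics, the first ncclAllReduce may be unable to
// complete until the collectives for the other communicators have also been
// launched -- by this same thread -- so the loop can deadlock.  Grouping defers
// all launches until ncclGroupEnd().
#include <nccl.h>
#include <cuda_runtime.h>
#include <vector>

void AllReduceSumAcrossDevices(const std::vector<ncclComm_t>& comms,
                               const std::vector<cudaStream_t>& streams,
                               const std::vector<float*>& bufs, size_t count) {
  ncclGroupStart();                        // defer all launches
  for (size_t i = 0; i < comms.size(); ++i) {
    ncclAllReduce(bufs[i], bufs[i], count, ncclFloat, ncclSum,
                  comms[i], streams[i]);   // in-place sum on each device
  }
  ncclGroupEnd();                          // launch everything together
  for (auto s : streams) {
    cudaStreamSynchronize(s);              // wait for the results
  }
}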
@@ -852,7 +852,7 @@ class AllReducer {
 #ifdef XGBOOST_USE_NCCL
   std::vector<ncclComm_t> comms;
   std::vector<cudaStream_t> streams;
-  std::vector<int> device_ordinals;
+  std::vector<int> device_ordinals;  // device id from CUDA
 #endif
 
  public:
@@ -459,9 +459,8 @@ HostDeviceVector<T>& HostDeviceVector<T>::operator=
 
 template <typename T>
 HostDeviceVector<T>::~HostDeviceVector() {
-  HostDeviceVectorImpl<T>* tmp = impl_;
+  delete impl_;
   impl_ = nullptr;
-  delete tmp;
 }
 
 template <typename T>
@@ -1050,19 +1050,20 @@ class GPUHistMakerSpecialised{
 
   void AllReduceHist(int nidx) {
     if (shards_.size() == 1) return;
 
     monitor_.Start("AllReduce");
-    dh::ExecuteIndexShards(
-        &shards_,
-        [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
-          auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
-          reducer_.AllReduceSum(
-              dist_.Devices().Index(shard->device_id_),
-              reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
-              reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
-              n_bins_ * (sizeof(GradientSumT) /
-                         sizeof(typename GradientSumT::ValueT)));
-        });
+    reducer_.GroupStart();
+    for (auto& shard : shards_) {
+      auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
+      reducer_.AllReduceSum(
+          dist_.Devices().Index(shard->device_id_),
+          reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
+          reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
+          n_bins_ * (sizeof(GradientSumT) /
+                     sizeof(typename GradientSumT::ValueT)));
+    }
+    reducer_.GroupEnd();
+    reducer_.Synchronize();
 
     monitor_.Stop("AllReduce");
   }
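For contrast with the hunk above, a sketch of the hazard the old per-shard dispatch could run into, reusing the illustrative names from the sketch near the top (again, not the repository's own code):

// Counter-example sketch: the same loop WITHOUT ncclGroupStart()/ncclGroupEnd().
// NCCL requires group semantics when one thread manages several communicators;
// without it, the call on comms[0] may not be able to proceed until the
// collectives for the remaining communicators are launched, yet this thread is
// the one that has to launch them -- the deadlock the commit title refers to.
for (size_t i = 0; i < comms.size(); ++i) {
  ncclAllReduce(bufs[i], bufs[i], count, ncclFloat, ncclSum,
                comms[i], streams[i]);
}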
@@ -42,6 +42,9 @@ TEST(GPUExact, Update) {
   ASSERT_NEAR(tree.Stat(2).sum_hess, 2, kRtEps);
 
   ASSERT_NEAR(tree.Stat(0).loss_chg, 0.8f, kRtEps);
+
+  delete dmat;
+  delete p_gpuexact_maker;
 }
 
 }  // namespace tree