GPU performance logging/improvements (#3945)
- Improved GPU performance logging
- Only use one execute shards function
- Revert performance regression on multi-GPU
- Use threads to launch NCCL AllReduce
This commit is contained in:
@@ -337,10 +337,10 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
std::vector<size_t> device_offsets;
|
||||
DeviceOffsets(batch.offset, &device_offsets);
|
||||
batch.data.Reshard(GPUDistribution::Explicit(devices_, device_offsets));
|
||||
dh::ExecuteShards(&shards, [&](DeviceShard& shard){
|
||||
shard.PredictInternal(batch, dmat->Info(), out_preds, model, h_tree_segments,
|
||||
h_nodes, tree_begin, tree_end);
|
||||
});
|
||||
dh::ExecuteIndexShards(&shards, [&](int idx, DeviceShard& shard) {
|
||||
shard.PredictInternal(batch, dmat->Info(), out_preds, model,
|
||||
h_tree_segments, h_nodes, tree_begin, tree_end);
|
||||
});
|
||||
i_batch++;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user