GPU performance logging/improvements (#3945)
- Improved GPU performance logging
- Only use one execute shards function
- Revert performance regression on multi-GPU
- Use threads to launch NCCL AllReduce
This commit is contained in:
@@ -358,7 +358,7 @@ struct GPUSketcher {
|
||||
});
|
||||
|
||||
// compute sketches for each shard
|
||||
- dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
|
||||
+ dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr<DeviceShard>& shard) {
|
||||
shard->Init(batch, info);
|
||||
shard->Sketch(batch, info);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user