GPU performance logging/improvements (#3945)

- Improve GPU performance logging

- Use a single execute-shards function (`dh::ExecuteIndexShards`) instead of separate variants

- Revert performance regression on multi-GPU

- Use threads to launch NCCL AllReduce
This commit is contained in:
Rory Mitchell
2018-11-29 14:36:51 +13:00
committed by GitHub
parent c5f92df475
commit a9d684db18
8 changed files with 127 additions and 102 deletions

View File

@@ -337,10 +337,10 @@ class GPUPredictor : public xgboost::Predictor {
std::vector<size_t> device_offsets;
DeviceOffsets(batch.offset, &device_offsets);
batch.data.Reshard(GPUDistribution::Explicit(devices_, device_offsets));
dh::ExecuteShards(&shards, [&](DeviceShard& shard){
shard.PredictInternal(batch, dmat->Info(), out_preds, model, h_tree_segments,
h_nodes, tree_begin, tree_end);
});
dh::ExecuteIndexShards(&shards, [&](int idx, DeviceShard& shard) {
shard.PredictInternal(batch, dmat->Info(), out_preds, model,
h_tree_segments, h_nodes, tree_begin, tree_end);
});
i_batch++;
}
}