GPU performance logging/improvements (#3945)

- Improved GPU performance logging

- Only use one execute shards function

- Revert performance regression on multi-GPU

- Use threads to launch NCCL AllReduce
This commit is contained in:
Rory Mitchell
2018-11-29 14:36:51 +13:00
committed by GitHub
parent c5f92df475
commit a9d684db18
8 changed files with 127 additions and 102 deletions

View File

@@ -258,7 +258,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
monitor.Start("UpdateGpair");
// Update gpair
-dh::ExecuteShards(&shards, [&](std::unique_ptr<DeviceShard> &shard) {
+dh::ExecuteIndexShards(&shards, [&](int idx, std::unique_ptr<DeviceShard>& shard) {
shard->UpdateGpair(in_gpair->ConstHostVector(), model->param);
});
monitor.Stop("UpdateGpair");
@@ -300,7 +300,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
model->bias()[group_idx] += dbias;
// Update residual
-dh::ExecuteShards(&shards, [&](std::unique_ptr<DeviceShard> &shard) {
+dh::ExecuteIndexShards(&shards, [&](int idx, std::unique_ptr<DeviceShard>& shard) {
shard->UpdateBiasResidual(dbias, group_idx,
model->param.num_output_group);
});
@@ -324,7 +324,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
param.reg_lambda_denorm));
w += dw;
-dh::ExecuteShards(&shards, [&](std::unique_ptr<DeviceShard> &shard) {
+dh::ExecuteIndexShards(&shards, [&](int idx, std::unique_ptr<DeviceShard>& shard) {
shard->UpdateResidual(dw, group_idx, model->param.num_output_group, fidx);
});
}