Fix specifying gpu_id, add tests. (#3851)

* Rewrite gpu_id related code.

* Remove normalised/unnormalised operations.
* Address the difference between `Index' and `Device ID' (see the sketch below, just before the diff).
* Modify doc for `gpu_id'.
* Better LOG for GPUSet.
* Check specified n_gpus.
* Remove inappropriate `device_idx' term.
* Clarify GpuIdType and size_t.
Jiaming Yuan
2018-11-06 18:17:53 +13:00
committed by GitHub
parent 1698fe64bb
commit f1275f52c1
20 changed files with 341 additions and 203 deletions
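
The split between an `Index' and a `Device ID' is the crux of the diff below: an index is a position inside the selected device set, while a device id is the CUDA ordinal that position maps to. The following is a minimal, self-contained sketch of that idea, not xgboost's actual GPUSet implementation; only the method names All, Size, DeviceId and Index (and the GpuIdType name from the commit message) are taken from this commit, everything else is illustrative.

#include <cassert>
#include <cstdint>

// Illustrative stand-in for a GPUSet-like class: a contiguous range of CUDA
// device ordinals [first_, first_ + size_). "Index" is a position inside the
// set (0..Size()-1); "Device ID" is the CUDA ordinal it maps to.
class DeviceRange {
 public:
  using GpuIdType = int32_t;

  // Mirrors the new GPUSet::All(gpu_id, n_gpus) call sites in the diff:
  // the set starts at `gpu_id` and spans `n_gpus` devices.
  static DeviceRange All(GpuIdType gpu_id, GpuIdType n_gpus) {
    return DeviceRange{gpu_id, n_gpus};
  }

  GpuIdType Size() const { return size_; }

  // Shard index -> CUDA device ordinal (the role of devices_.DeviceId(shard)).
  GpuIdType DeviceId(GpuIdType index) const {
    assert(index >= 0 && index < size_);
    return first_ + index;
  }

  // CUDA device ordinal -> position in the set (the role of
  // Devices().Index(device_) when computing per-shard offsets).
  GpuIdType Index(GpuIdType device_id) const {
    assert(device_id >= first_ && device_id < first_ + size_);
    return device_id - first_;
  }

 private:
  DeviceRange(GpuIdType first, GpuIdType size) : first_{first}, size_{size} {}
  GpuIdType first_;
  GpuIdType size_;
};

int main() {
  // gpu_id = 2, n_gpus = 3 selects CUDA devices {2, 3, 4}.
  auto devices = DeviceRange::All(/*gpu_id=*/2, /*n_gpus=*/3);
  assert(devices.Size() == 3);
  assert(devices.DeviceId(0) == 2);  // shard 0 runs on device 2
  assert(devices.Index(4) == 2);     // device 4 is the third shard
  return 0;
}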


@@ -1,5 +1,5 @@
 /*!
- * Copyright by Contributors 2017
+ * Copyright 2017-2018 by Contributors
  */
 #include <dmlc/parameter.h>
 #include <thrust/copy.h>
@@ -230,7 +230,7 @@ class GPUPredictor : public xgboost::Predictor {
   offsets[0] = 0;
 #pragma omp parallel for schedule(static, 1) if (devices_.Size() > 1)
   for (int shard = 0; shard < devices_.Size(); ++shard) {
-    int device = devices_[shard];
+    int device = devices_.DeviceId(shard);
     auto data_span = data.DeviceSpan(device);
     dh::safe_cuda(cudaSetDevice(device));
     // copy the last element from every shard
@@ -271,6 +271,7 @@ class GPUPredictor : public xgboost::Predictor {
   const int BLOCK_THREADS = 128;
   size_t num_rows = batch.offset.DeviceSize(device_) - 1;
+  if (num_rows < 1) { return; }
   const int GRID_SIZE = static_cast<int>(dh::DivRoundUp(num_rows, BLOCK_THREADS));
@@ -282,8 +283,8 @@ class GPUPredictor : public xgboost::Predictor {
     use_shared = false;
   }
   const auto& data_distr = batch.data.Distribution();
-  int index = data_distr.Devices().Index(device_);
-  size_t entry_start = data_distr.ShardStart(batch.data.Size(), index);
+  size_t entry_start = data_distr.ShardStart(batch.data.Size(),
+                                             data_distr.Devices().Index(device_));
   PredictKernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS, shared_memory_bytes>>>
       (dh::ToSpan(nodes), predictions->DeviceSpan(device_), dh::ToSpan(tree_segments),
@@ -291,6 +292,7 @@ class GPUPredictor : public xgboost::Predictor {
      batch.data.DeviceSpan(device_), tree_begin, tree_end, info.num_col_,
      num_rows, entry_start, use_shared, model.param.num_output_group);
+  dh::safe_cuda(cudaGetLastError());
   dh::safe_cuda(cudaDeviceSynchronize());
 }
@@ -350,7 +352,7 @@ class GPUPredictor : public xgboost::Predictor {
                     const gbm::GBTreeModel& model, int tree_begin,
                     unsigned ntree_limit = 0) override {
     GPUSet devices = GPUSet::All(
-        param.n_gpus, dmat->Info().num_row_).Normalised(param.gpu_id);
+        param.gpu_id, param.n_gpus, dmat->Info().num_row_);
     ConfigureShards(devices);
     if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) {
@@ -464,7 +466,7 @@ class GPUPredictor : public xgboost::Predictor {
     cpu_predictor->Init(cfg, cache);
     param.InitAllowUnknown(cfg);
-    GPUSet devices = GPUSet::All(param.n_gpus).Normalised(param.gpu_id);
+    GPUSet devices = GPUSet::All(param.gpu_id, param.n_gpus);
     ConfigureShards(devices);
   }
@@ -477,7 +479,7 @@ class GPUPredictor : public xgboost::Predictor {
     shards.clear();
     shards.resize(devices_.Size());
     dh::ExecuteIndexShards(&shards, [=](size_t i, DeviceShard& shard){
-        shard.Init(devices_[i]);
+        shard.Init(devices_.DeviceId(i));
       });
   }
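
To show how the pieces above fit together, here is a hedged sketch of the sharding arithmetic: a shard's data offset is computed from the device's index within the set, not from its CUDA device id, which is why the predictor asks Devices().Index(device_) before launching PredictKernel. The names ShardStart and DivRoundUp mimic the helpers visible in the diff, but the signatures and implementation below are assumptions for illustration only, not xgboost's distribution code.

#include <cassert>
#include <cstddef>
#include <vector>

// Integer ceiling division, in the spirit of dh::DivRoundUp in the diff.
std::size_t DivRoundUp(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

// Block-wise split of `total` entries over `n_shards` shards; the offset is a
// function of the shard *index*, never of the CUDA device ordinal.
std::size_t ShardStart(std::size_t total, int index, int n_shards) {
  std::size_t per_shard = DivRoundUp(total, static_cast<std::size_t>(n_shards));
  std::size_t start = per_shard * static_cast<std::size_t>(index);
  return start < total ? start : total;
}

int main() {
  // 10 entries over the 3-device set {2, 3, 4}: offsets depend on the
  // indices 0, 1, 2 -- not on the CUDA ordinals 2, 3, 4.
  const std::size_t total = 10;
  const int n_shards = 3;
  std::vector<std::size_t> starts;
  for (int index = 0; index < n_shards; ++index) {
    starts.push_back(ShardStart(total, index, n_shards));
  }
  assert(starts[0] == 0 && starts[1] == 4 && starts[2] == 8);
  // A shard may end up with zero rows (e.g. 2 rows over 3 devices leaves the
  // last shard empty), which is why the kernel body returns early when
  // num_rows < 1.
  assert(2 - ShardStart(2, 2, 3) == 0);
  return 0;
}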