Multi-GPU HostDeviceVector. (#3287)
* Multi-GPU HostDeviceVector.
  - HostDeviceVector instances can now span multiple devices, defined by the GPUSet struct
  - the HostDeviceVector interface has been modified accordingly
  - GPU objective functions are now multi-GPU
  - GPU prediction from the cache is now multi-GPU
  - omp_set_num_threads() calls are avoided
  - other minor changes
Committed by: Rory Mitchell
Parent: 90a5c4db9d
Commit: b8a0d66fe6
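The hunks below exercise the new multi-GPU interface through GPUSet::Range, Reshard(), per-device DevicePointer(), and tbegin()/tend() taking a device index. As a reading aid, here is a CPU-only sketch of the partitioning idea: a contiguous range of device ordinals, and a vector split into one contiguous row segment per device. GPUSetSketch and ShardedVectorSketch are illustrative stand-ins, not the actual xgboost classes.

#include <cstddef>
#include <vector>

struct GPUSetSketch {
  int begin = 0, end = 0;  // device ordinals in [begin, end)
  static GPUSetSketch Range(int first, int n) { return {first, first + n}; }
  int Size() const { return end - begin; }
};

template <typename T>
class ShardedVectorSketch {
 public:
  // Re-partition n_rows values across the given device range; each device
  // owns one contiguous segment (here backed by a single host buffer).
  void Reshard(const GPUSetSketch& devices, std::size_t n_rows) {
    devices_ = devices;
    data_.assign(n_rows, T());
    offsets_.clear();
    for (int i = 0; i <= devices.Size(); ++i) {
      offsets_.push_back(i * n_rows / devices.Size());
    }
  }
  // Pointer to the segment owned by `device`; the real HostDeviceVector would
  // hand back a pointer into that device's memory instead.
  T* DevicePointer(int device) {
    return data_.data() + offsets_[device - devices_.begin];
  }
  std::size_t DeviceSize(int device) const {
    int i = device - devices_.begin;
    return offsets_[i + 1] - offsets_[i];
  }

 private:
  GPUSetSketch devices_;
  std::vector<T> data_;                // stand-in for per-device buffers
  std::vector<std::size_t> offsets_;   // segment boundaries, one per device
};

// Usage: ShardedVectorSketch<float> preds; preds.Reshard(GPUSetSketch::Range(0, 2), 1000);
// preds.DevicePointer(1) then addresses the segment covering rows [500, 1000).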
@@ -369,8 +369,7 @@ struct DeviceShard {
   }

   // Reset values for each update iteration
-  void Reset(HostDeviceVector<GradientPair>* dh_gpair, int device) {
-    auto begin = dh_gpair->tbegin(device);
+  void Reset(HostDeviceVector<GradientPair>* dh_gpair) {
     dh::safe_cuda(cudaSetDevice(device_idx));
     position.CurrentDVec().Fill(0);
     std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),
@@ -380,7 +379,7 @@ struct DeviceShard {

     std::fill(ridx_segments.begin(), ridx_segments.end(), Segment(0, 0));
     ridx_segments.front() = Segment(0, ridx.Size());
-    this->gpair.copy(begin + row_begin_idx, begin + row_end_idx);
+    this->gpair.copy(dh_gpair->tbegin(device_idx), dh_gpair->tend(device_idx));
     SubsampleGradientPair(&gpair, param.subsample, row_begin_idx);
     hist.Reset();
   }
@@ -505,7 +504,7 @@ struct DeviceShard {
     dh::safe_cuda(cudaSetDevice(device_idx));
     if (!prediction_cache_initialised) {
       dh::safe_cuda(cudaMemcpy(
-          prediction_cache.Data(), &out_preds_d[row_begin_idx],
+          prediction_cache.Data(), out_preds_d,
          prediction_cache.Size() * sizeof(bst_float), cudaMemcpyDefault));
     }
     prediction_cache_initialised = true;
@@ -528,7 +527,7 @@ struct DeviceShard {
     });

     dh::safe_cuda(cudaMemcpy(
-        &out_preds_d[row_begin_idx], prediction_cache.Data(),
+        out_preds_d, prediction_cache.Data(),
        prediction_cache.Size() * sizeof(bst_float), cudaMemcpyDefault));
   }
 };
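The two cudaMemcpy hunks above drop the &out_preds_d[row_begin_idx] offset: once the prediction vector is sharded per device, the pointer a shard receives already addresses its own row segment rather than the start of the full vector. A tiny CPU-only illustration of that addressing change, with hypothetical helper names:

#include <cstddef>
#include <cstring>

// Before: every shard received the base pointer of the whole prediction
// vector and had to offset it by its first row.
void CopyCacheOld(float* preds_base, const float* cache,
                  std::size_t row_begin, std::size_t n_rows) {
  std::memcpy(preds_base + row_begin, cache, n_rows * sizeof(float));
}

// After: the pointer handed to a shard already refers to that shard's own
// row segment, so no offset is needed.
void CopyCacheNew(float* preds_for_this_shard, const float* cache,
                  std::size_t n_rows) {
  std::memcpy(preds_for_this_shard, cache, n_rows * sizeof(float));
}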
@@ -543,6 +542,7 @@ class GPUHistMaker : public TreeUpdater {
     param_.InitAllowUnknown(args);
     CHECK(param_.n_gpus != 0) << "Must have at least one device";
     n_devices_ = param_.n_gpus;
+    devices_ = GPUSet::Range(param_.gpu_id, dh::NDevicesAll(param_.n_gpus));

     dh::CheckComputeCapability();

@@ -610,15 +610,11 @@ class GPUHistMaker : public TreeUpdater {
     }

     // Create device shards
-    omp_set_num_threads(shards_.size());
-#pragma omp parallel
-    {
-      auto cpu_thread_id = omp_get_thread_num();
-      shards_[cpu_thread_id] = std::unique_ptr<DeviceShard>(
-          new DeviceShard(device_list_[cpu_thread_id], cpu_thread_id, gmat_,
-                          row_segments[cpu_thread_id],
-                          row_segments[cpu_thread_id + 1], n_bins_, param_));
-    }
+    dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
+        shard = std::unique_ptr<DeviceShard>(
+            new DeviceShard(device_list_[i], i, gmat_,
+                            row_segments[i], row_segments[i + 1], n_bins_, param_));
+      });

     p_last_fmat_ = dmat;
     initialised_ = true;
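The hunk above replaces a hand-written omp_set_num_threads() + #pragma omp parallel block with dh::ExecuteIndexShards, whose implementation is not part of this diff. Below is a minimal sketch of what such an index-aware per-shard dispatcher could look like; this is an assumption about its shape, not the actual dh:: helper, which may instead use OpenMP, pin threads, or set the CUDA device itself.

#include <cstddef>
#include <thread>
#include <vector>

// Index-aware per-shard dispatcher: one worker per shard, each invoked with
// its shard index and a reference to its shard. Illustrative only.
template <typename Shard, typename Functor>
void ExecuteIndexShardsSketch(std::vector<Shard>* shards, Functor&& f) {
  std::vector<std::thread> workers;
  workers.reserve(shards->size());
  for (std::size_t i = 0; i < shards->size(); ++i) {
    workers.emplace_back([&f, shards, i] { f(static_cast<int>(i), (*shards)[i]); });
  }
  for (auto& w : workers) {
    w.join();
  }
}

Centralising the dispatch like this is what lets the updater drop the global omp_set_num_threads() calls mentioned in the commit message.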
@@ -636,12 +632,9 @@ class GPUHistMaker : public TreeUpdater {

     // Copy gpair & reset memory
     monitor_.Start("InitDataReset", device_list_);
-    omp_set_num_threads(shards_.size());
-
-    // TODO(canonizer): make it parallel again once HostDeviceVector is
-    // thread-safe
-    for (int shard = 0; shard < shards_.size(); ++shard)
-      shards_[shard]->Reset(gpair, param_.gpu_id);
+    gpair->Reshard(devices_);
+    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) { shard->Reset(gpair); });
     monitor_.Stop("InitDataReset", device_list_);
   }

@@ -676,16 +669,16 @@ class GPUHistMaker : public TreeUpdater {
       subtraction_trick_nidx = nidx_left;
     }

-    for (auto& shard : shards_) {
-      shard->BuildHist(build_hist_nidx);
-    }
+    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
+        shard->BuildHist(build_hist_nidx);
+      });

     this->AllReduceHist(build_hist_nidx);

-    for (auto& shard : shards_) {
-      shard->SubtractionTrick(nidx_parent, build_hist_nidx,
-                              subtraction_trick_nidx);
-    }
+    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
+        shard->SubtractionTrick(nidx_parent, build_hist_nidx,
+                                subtraction_trick_nidx);
+      });
   }

   // Returns best loss
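For context on SubtractionTrick in the hunk above: histogram-based tree growers only build one child's histogram from data; the sibling's histogram is then recovered bin by bin as parent minus built child, which the shards do after the all-reduce. A minimal sketch with an illustrative gradient-pair type (GradPairSketch is not xgboost's GradientPair):

#include <cstddef>
#include <vector>

// Stand-in for xgboost's gradient pair: per-bin gradient and hessian sums.
struct GradPairSketch {
  double grad = 0.0;
  double hess = 0.0;
};

// The sibling's histogram equals the parent's histogram minus the histogram
// that was actually built from data, bin by bin.
void SubtractionTrickSketch(const std::vector<GradPairSketch>& parent_hist,
                            const std::vector<GradPairSketch>& built_child_hist,
                            std::vector<GradPairSketch>* sibling_hist) {
  sibling_hist->resize(parent_hist.size());
  for (std::size_t i = 0; i < parent_hist.size(); ++i) {
    (*sibling_hist)[i].grad = parent_hist[i].grad - built_child_hist[i].grad;
    (*sibling_hist)[i].hess = parent_hist[i].hess - built_child_hist[i].hess;
  }
}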
@@ -743,22 +736,20 @@ class GPUHistMaker : public TreeUpdater {
     auto root_nidx = 0;
     // Sum gradients
     std::vector<GradientPair> tmp_sums(shards_.size());
-    omp_set_num_threads(shards_.size());
-#pragma omp parallel
-    {
-      auto cpu_thread_id = omp_get_thread_num();
-      auto& shard = shards_[cpu_thread_id];
-      dh::safe_cuda(cudaSetDevice(shard->device_idx));
-      tmp_sums[cpu_thread_id] = dh::SumReduction(
-          shard->temp_memory, shard->gpair.Data(), shard->gpair.Size());
-    }
+
+    dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
+        dh::safe_cuda(cudaSetDevice(shard->device_idx));
+        tmp_sums[i] =
+            dh::SumReduction(shard->temp_memory, shard->gpair.Data(),
+                             shard->gpair.Size());
+      });
     auto sum_gradient =
         std::accumulate(tmp_sums.begin(), tmp_sums.end(), GradientPair());

     // Generate root histogram
-    for (auto& shard : shards_) {
-      shard->BuildHist(root_nidx);
-    }
+    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
+        shard->BuildHist(root_nidx);
+      });

     this->AllReduceHist(root_nidx);

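In the hunk above, each shard reduces its own slice of gradient pairs on its device (dh::SumReduction) and the per-shard partial sums are then combined on the host with std::accumulate, so the root gradient sum does not depend on how many GPUs the rows were split over. A host-side sketch of that combine step (PairSketch is illustrative, not xgboost's GradientPair):

#include <numeric>
#include <vector>

// Illustrative pair type with the operator+ that std::accumulate needs.
struct PairSketch {
  double grad = 0.0;
  double hess = 0.0;
  PairSketch operator+(const PairSketch& other) const {
    return {grad + other.grad, hess + other.hess};
  }
};

// Combine one partial sum per device shard into the global gradient sum.
PairSketch CombineShardSums(const std::vector<PairSketch>& tmp_sums) {
  return std::accumulate(tmp_sums.begin(), tmp_sums.end(), PairSketch());
}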
@@ -802,14 +793,11 @@ class GPUHistMaker : public TreeUpdater {

     auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;

-    omp_set_num_threads(shards_.size());
-#pragma omp parallel
-    {
-      auto cpu_thread_id = omp_get_thread_num();
-      shards_[cpu_thread_id]->UpdatePosition(nidx, left_nidx, right_nidx, fidx,
-                                             split_gidx, default_dir_left,
-                                             is_dense, fidx_begin, fidx_end);
-    }
+    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
+        shard->UpdatePosition(nidx, left_nidx, right_nidx, fidx,
+                              split_gidx, default_dir_left,
+                              is_dense, fidx_begin, fidx_end);
+      });
   }

   void ApplySplit(const ExpandEntry& candidate, RegTree* p_tree) {
@@ -903,8 +891,6 @@ class GPUHistMaker : public TreeUpdater {
         monitor_.Stop("EvaluateSplits", device_list_);
       }
     }
-    // Reset omp num threads
-    omp_set_num_threads(nthread);
   }

   bool UpdatePredictionCache(
@@ -912,13 +898,10 @@ class GPUHistMaker : public TreeUpdater {
     monitor_.Start("UpdatePredictionCache", device_list_);
     if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data)
       return false;
-
-    bst_float* out_preds_d = p_out_preds->DevicePointer(param_.gpu_id);
-
-#pragma omp parallel for schedule(static, 1)
-    for (int shard = 0; shard < shards_.size(); ++shard) {
-      shards_[shard]->UpdatePredictionCache(out_preds_d);
-    }
+    p_out_preds->Reshard(devices_);
+    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
+      shard->UpdatePredictionCache(p_out_preds->DevicePointer(shard->device_idx));
+    });
     monitor_.Stop("UpdatePredictionCache", device_list_);
     return true;
   }
@@ -992,6 +975,7 @@ class GPUHistMaker : public TreeUpdater {
   std::vector<int> device_list_;

   DMatrix* p_last_fmat_;
+  GPUSet devices_;
 };

 XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")