remove device shards (#4867)

2019-09-24 22:15:46 -07:00
parent 0b89cd1dfa
commit 562bb0ae31
8 changed files with 572 additions and 635 deletions
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -435,7 +435,7 @@ __global__ void SharedMemHistKernel(xgboost::ELLPackMatrix matrix,

 // Manage memory for a single GPU
 template <typename GradientSumT>
-struct DeviceShard {
+struct GPUHistMakerDevice {
  int device_id;
  EllpackPageImpl* page;

@@ -474,12 +474,12 @@ struct DeviceShard {
                          std::function<bool(ExpandEntry, ExpandEntry)>>;
  std::unique_ptr<ExpandQueue> qexpand;

-  DeviceShard(int _device_id,
-              EllpackPageImpl* _page,
-              bst_uint _n_rows,
-              TrainParam _param,
-              uint32_t column_sampler_seed,
-              uint32_t n_features)
+  GPUHistMakerDevice(int _device_id,
+                     EllpackPageImpl* _page,
+                     bst_uint _n_rows,
+                     TrainParam _param,
+                     uint32_t column_sampler_seed,
+                     uint32_t n_features)
      : device_id(_device_id),
        page(_page),
        n_rows(_n_rows),
@@ -487,12 +487,12 @@ struct DeviceShard {
        prediction_cache_initialised(false),
        column_sampler(column_sampler_seed),
        interaction_constraints(param, n_features) {
-    monitor.Init(std::string("DeviceShard") + std::to_string(device_id));
+    monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(device_id));
  }

  void InitHistogram();

-  ~DeviceShard() {  // NOLINT
+  ~GPUHistMakerDevice() {  // NOLINT
    dh::safe_cuda(cudaSetDevice(device_id));
    for (auto& stream : streams) {
      dh::safe_cuda(cudaStreamDestroy(stream));
@@ -781,7 +781,7 @@ struct DeviceShard {
    auto left_node_rows = row_partitioner->GetRows(nidx_left).size();
    auto right_node_rows = row_partitioner->GetRows(nidx_right).size();
    // Decide whether to build the left histogram or right histogram
-    // Find the largest number of training instances on any given Shard
+    // Find the largest number of training instances on any given device
    // Assume this will be the bottleneck and avoid building this node if
    // possible
    std::vector<size_t> max_reduce;
@@ -939,7 +939,7 @@ struct DeviceShard {
 };

 template <typename GradientSumT>
-inline void DeviceShard<GradientSumT>::InitHistogram() {
+inline void GPUHistMakerDevice<GradientSumT>::InitHistogram() {
  CHECK(!(param.max_leaves == 0 && param.max_depth == 0))
      << "Max leaves and max depth cannot both be unconstrained for "
      "gpu_hist.";
@@ -1026,19 +1026,17 @@ class GPUHistMakerSpecialised {
      page->Init(device_, param_.max_bin, hist_maker_param_.gpu_batch_nrows);
    }

-    // Create device shard
    dh::safe_cuda(cudaSetDevice(device_));
-    shard_.reset(new DeviceShard<GradientSumT>(device_,
-                                               page,
-                                               info_->num_row_,
-                                               param_,
-                                               column_sampling_seed,
-                                               info_->num_col_));
+    maker_.reset(new GPUHistMakerDevice<GradientSumT>(device_,
+                                                      page,
+                                                      info_->num_row_,
+                                                      param_,
+                                                      column_sampling_seed,
+                                                      info_->num_col_));

-    // Init global data for each shard
    monitor_.StartCuda("InitHistogram");
    dh::safe_cuda(cudaSetDevice(device_));
-    shard_->InitHistogram();
+    maker_->InitHistogram();
    monitor_.StopCuda("InitHistogram");

    p_last_fmat_ = dmat;
@@ -1077,18 +1075,17 @@ class GPUHistMakerSpecialised {
    monitor_.StopCuda("InitData");

    gpair->SetDevice(device_);
-    shard_->UpdateTree(gpair, p_fmat, p_tree, &reducer_);
+    maker_->UpdateTree(gpair, p_fmat, p_tree, &reducer_);
  }

  bool UpdatePredictionCache(
      const DMatrix* data, HostDeviceVector<bst_float>* p_out_preds) {
-    if (shard_ == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
+    if (maker_ == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
      return false;
    }
    monitor_.StartCuda("UpdatePredictionCache");
    p_out_preds->SetDevice(device_);
-    dh::safe_cuda(cudaSetDevice(shard_->device_id));
-    shard_->UpdatePredictionCache(p_out_preds->DevicePointer());
+    maker_->UpdatePredictionCache(p_out_preds->DevicePointer());
    monitor_.StopCuda("UpdatePredictionCache");
    return true;
  }
@@ -1096,7 +1093,7 @@ class GPUHistMakerSpecialised {
  TrainParam param_;           // NOLINT
  MetaInfo* info_{};             // NOLINT

-  std::unique_ptr<DeviceShard<GradientSumT>> shard_;  // NOLINT
+  std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker_;  // NOLINT

 private:
  bool initialised_;