/*!
* Copyright 2017 XGBoost contributors
*/
#include <xgboost/tree_updater.h>
#include <memory>
#include <utility>
#include <vector>
#include "../common/compressed_iterator.h"
#include "../common/device_helpers.cuh"
#include "../common/hist_util.h"
#include "param.h"
#include "updater_gpu_common.cuh"
namespace xgboost {
namespace tree {
DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
typedef bst_gpair_integer gpair_sum_t;
static const ncclDataType_t nccl_sum_t = ncclInt64;
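// Gradient pairs are accumulated as fixed-point integers
// (bst_gpair_integer): integer atomicAdd is associative, so histogram
// sums are reproducible regardless of thread scheduling, and the pair
// can be reduced across GPUs as plain 64-bit integers (ncclInt64).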
// Helper for explicit template specialisation
template <int N>
struct Int {};
struct DeviceGMat {
dh::dvec<common::compressed_byte_t> gidx_buffer;
common::CompressedIterator<uint32_t> gidx;
dh::dvec<size_t> row_ptr;
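// Init builds the bit-packed bin-index buffer for this device's row
// shard on the host and copies it across; CompressedBufferWriter packs
// the entries and CompressedIterator decodes them on the fly.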
void Init(int device_idx, const common::GHistIndexMatrix& gmat,
bst_ulong element_begin, bst_ulong element_end, bst_ulong row_begin,
bst_ulong row_end, int n_bins) {
dh::safe_cuda(cudaSetDevice(device_idx));
CHECK(gidx_buffer.size()) << "gidx_buffer must be externally allocated";
CHECK_EQ(row_ptr.size(), (row_end - row_begin) + 1)
<< "row_ptr must be externally allocated";
common::CompressedBufferWriter cbw(n_bins);
std::vector<common::compressed_byte_t> host_buffer(gidx_buffer.size());
cbw.Write(host_buffer.data(), gmat.index.begin() + element_begin,
gmat.index.begin() + element_end);
gidx_buffer = host_buffer;
gidx = common::CompressedIterator<uint32_t>(gidx_buffer.data(), n_bins);
// row_ptr
dh::safe_cuda(cudaMemcpy(row_ptr.data(), gmat.row_ptr.data() + row_begin,
row_ptr.size() * sizeof(size_t),
cudaMemcpyHostToDevice));
// normalise row_ptr so this shard's offsets start at zero
size_t start = gmat.row_ptr[row_begin];
auto d_row_ptr = row_ptr.data();
dh::launch_n(row_ptr.device_idx(), row_ptr.size(),
[=] __device__(size_t idx) { d_row_ptr[idx] -= start; });
}
};
struct HistHelper {
gpair_sum_t* d_hist;
int n_bins;
__host__ __device__ HistHelper(gpair_sum_t* ptr, int n_bins)
: d_hist(ptr), n_bins(n_bins) {}
__device__ void Add(bst_gpair gpair, int gidx, int nidx) const {
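// Convert the gradient pair to fixed point, then add the gradient and
// hessian words with two separate 64-bit atomics; no single CUDA
// atomic is wide enough for the whole pair, but each word stays exact
// because it is an integer.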
int hist_idx = nidx * n_bins + gidx;
auto dst_ptr =
reinterpret_cast<unsigned long long int*>(&d_hist[hist_idx]); // NOLINT
gpair_sum_t tmp(gpair.GetGrad(), gpair.GetHess());
auto src_ptr = reinterpret_cast<gpair_sum_t::value_t*>(&tmp);
atomicAdd(dst_ptr,
static_cast<unsigned long long int>(*src_ptr)); // NOLINT
atomicAdd(dst_ptr + 1,
static_cast<unsigned long long int>(*(src_ptr + 1))); // NOLINT
}
__device__ gpair_sum_t Get(int gidx, int nidx) const {
return d_hist[nidx * n_bins + gidx];
}
};
struct DeviceHist {
int n_bins;
dh::dvec<gpair_sum_t> data;
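// Node histograms are laid out contiguously in breadth-first node
// order: node n occupies bins [n * n_bins, (n + 1) * n_bins).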
void Init(int n_bins_in) {
this->n_bins = n_bins_in;
CHECK(!data.empty()) << "DeviceHist must be externally allocated";
}
void Reset(int device_idx) {
dh::safe_cuda(cudaSetDevice(device_idx));
data.fill(gpair_sum_t());
}
HistHelper GetBuilder() { return HistHelper(data.data(), n_bins); }
gpair_sum_t* GetLevelPtr(int depth) {
return data.data() + n_nodes(depth - 1) * n_bins;
}
int LevelSize(int depth) { return n_bins * n_nodes_level(depth); }
};
template <int BLOCK_THREADS>
__global__ void find_split_kernel(
const gpair_sum_t* d_level_hist, int* d_feature_segments, int depth,
uint64_t n_features, int n_bins, DeviceNodeStats* d_nodes,
int nodes_offset_device, float* d_fidx_min_map, float* d_gidx_fvalue_map,
GPUTrainingParam gpu_param, bool* d_left_child_smallest_temp,
bool colsample, int* d_feature_flags) {
typedef cub::KeyValuePair<int, float> ArgMaxT;
typedef cub::BlockScan<gpair_sum_t, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>
BlockScanT;
typedef cub::BlockReduce<ArgMaxT, BLOCK_THREADS> MaxReduceT;
typedef cub::BlockReduce<gpair_sum_t, BLOCK_THREADS> SumReduceT;
union TempStorage {
typename BlockScanT::TempStorage scan;
typename MaxReduceT::TempStorage max_reduce;
typename SumReduceT::TempStorage sum_reduce;
};
__shared__ cub::Uninitialized<DeviceSplitCandidate> uninitialized_split;
DeviceSplitCandidate& split = uninitialized_split.Alias();
__shared__ cub::Uninitialized<gpair_sum_t> uninitialized_sum;
gpair_sum_t& shared_sum = uninitialized_sum.Alias();
__shared__ ArgMaxT block_max;
__shared__ TempStorage temp_storage;
if (threadIdx.x == 0) {
split = DeviceSplitCandidate();
}
__syncthreads();
// The two indices below address the full node list stored on each
// device; always one block per node, BLOCK_THREADS threads per block.
int level_node_idx = blockIdx.x + nodes_offset_device;
int node_idx = n_nodes(depth - 1) + level_node_idx;
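// n_nodes(depth - 1) nodes precede this level in the breadth-first
// node array, so node_idx is this block's global node index.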
for (int fidx = 0; fidx < n_features; fidx++) {
if (colsample && d_feature_flags[fidx] == 0) continue;
int begin = d_feature_segments[level_node_idx * n_features + fidx];
int end = d_feature_segments[level_node_idx * n_features + fidx + 1];
gpair_sum_t feature_sum = gpair_sum_t();
for (int reduce_begin = begin; reduce_begin < end;
reduce_begin += BLOCK_THREADS) {
bool thread_active = reduce_begin + threadIdx.x < end;
// Scan histogram
gpair_sum_t bin = thread_active ? d_level_hist[reduce_begin + threadIdx.x]
: gpair_sum_t();
feature_sum +=
SumReduceT(temp_storage.sum_reduce).Reduce(bin, cub::Sum());
}
if (threadIdx.x == 0) {
shared_sum = feature_sum;
}
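// shared_sum now holds this feature's total gradient; below,
// parent_sum - shared_sum gives the gradient of rows missing this
// feature, which decides the default direction.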
// No __syncthreads() needed here: the BlockScan below synchronizes
// before shared_sum is read.
auto prefix_op = SumCallbackOp<gpair_sum_t>();
for (int scan_begin = begin; scan_begin < end;
scan_begin += BLOCK_THREADS) {
bool thread_active = scan_begin + threadIdx.x < end;
gpair_sum_t bin = thread_active ? d_level_hist[scan_begin + threadIdx.x]
: gpair_sum_t();
BlockScanT(temp_storage.scan)
.ExclusiveScan(bin, bin, cub::Sum(), prefix_op);
// Calculate gain
gpair_sum_t parent_sum = gpair_sum_t(d_nodes[node_idx].sum_gradients);
float parent_gain = d_nodes[node_idx].root_gain;
gpair_sum_t missing = parent_sum - shared_sum;
bool missing_left;
float gain = thread_active
? loss_chg_missing(bin, missing, parent_sum, parent_gain,
gpu_param, missing_left)
: -FLT_MAX;
__syncthreads();
// Find thread with best gain
ArgMaxT tuple(threadIdx.x, gain);
ArgMaxT best =
MaxReduceT(temp_storage.max_reduce).Reduce(tuple, cub::ArgMax());
if (threadIdx.x == 0) {
block_max = best;
}
__syncthreads();
// Best thread updates split
if (threadIdx.x == block_max.key) {
float fvalue;
int gidx = (scan_begin - (level_node_idx * n_bins)) + threadIdx.x;
if (threadIdx.x == 0 &&
begin == scan_begin) { // check at start of first tile
fvalue = d_fidx_min_map[fidx];
} else {
fvalue = d_gidx_fvalue_map[gidx - 1];
}
gpair_sum_t left = missing_left ? bin + missing : bin;
gpair_sum_t right = parent_sum - left;
split.Update(gain, missing_left ? LeftDir : RightDir, fvalue, fidx,
left, right, gpu_param);
}
__syncthreads();
} // end scan
} // end over features
// Create node
if (threadIdx.x == 0 && split.IsValid()) {
d_nodes[node_idx].SetSplit(split);
DeviceNodeStats& left_child = d_nodes[left_child_nidx(node_idx)];
DeviceNodeStats& right_child = d_nodes[right_child_nidx(node_idx)];
bool& left_child_smallest = d_left_child_smallest_temp[node_idx];
left_child =
DeviceNodeStats(split.left_sum, left_child_nidx(node_idx), gpu_param);
right_child =
DeviceNodeStats(split.right_sum, right_child_nidx(node_idx), gpu_param);
// Record the smaller child: only its histogram is built directly,
// the sibling's comes from the subtraction trick
left_child_smallest =
split.left_sum.GetHess() <= split.right_sum.GetHess();
}
}
class GPUHistMaker : public TreeUpdater {
public:
GPUHistMaker()
: initialised(false),
is_dense(false),
p_last_fmat_(nullptr),
prediction_cache_initialised(false) {}
~GPUHistMaker() {
if (initialised) {
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
ncclCommDestroy(comms[d_idx]);
dh::safe_cuda(cudaSetDevice(dList[d_idx]));
dh::safe_cuda(cudaStreamDestroy(*(streams[d_idx])));
free(streams[d_idx]); // release the malloc'd stream holder
}
for (int num_d = 1; num_d <= n_devices;
++num_d) { // loop over number of devices used
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
ncclCommDestroy(find_split_comms[num_d - 1][d_idx]);
}
}
}
}
void Init(
const std::vector<std::pair<std::string, std::string>>& args) override {
param.InitAllowUnknown(args);
CHECK(param.max_depth < 16) << "Tree depth too large.";
CHECK(param.max_depth != 0) << "Tree depth cannot be 0.";
CHECK(param.grow_policy != TrainParam::kLossGuide)
<< "Loss guided growth policy not supported. Use CPU algorithm.";
CHECK(param.n_gpus != 0) << "Must have at least one device";
}
void Update(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
const std::vector<RegTree*>& trees) override {
GradStats::CheckInfo(dmat->info());
// rescale the learning rate according to the number of trees
float lr = param.learning_rate;
param.learning_rate = lr / trees.size();
// build tree
try {
for (size_t i = 0; i < trees.size(); ++i) {
this->UpdateTree(gpair, dmat, trees[i]);
}
} catch (const std::exception& e) {
LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl;
}
param.learning_rate = lr;
}
void InitData(const std::vector<bst_gpair>& gpair, DMatrix& fmat, // NOLINT
const RegTree& tree) {
dh::Timer time1;
// set num_rows and n_devices for the rest of the initialisation
info = &fmat.info();
CHECK(info->num_row < std::numeric_limits<bst_uint>::max());
num_rows = static_cast<bst_uint>(info->num_row);
n_devices = dh::n_devices(param.n_gpus, num_rows);
if (!initialised) {
// reset static timers used across iterations
cpu_init_time = 0;
gpu_init_time = 0;
cpu_time.Reset();
gpu_time = 0;
// set dList member
dList.resize(n_devices);
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
int device_idx = (param.gpu_id + d_idx) % dh::n_visible_devices();
dList[d_idx] = device_idx;
}
// initialize nccl
comms.resize(n_devices);
streams.resize(n_devices);
dh::safe_nccl(ncclCommInitAll(comms.data(), n_devices,
dList.data())); // initialize one communicator per device
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
streams[d_idx] =
reinterpret_cast<cudaStream_t*>(malloc(sizeof(cudaStream_t)));
dh::safe_cuda(cudaSetDevice(dList[d_idx]));
dh::safe_cuda(cudaStreamCreate(streams[d_idx]));
int cudaDev;
int rank;
cudaDeviceProp prop;
dh::safe_nccl(ncclCommCuDevice(comms[d_idx], &cudaDev));
dh::safe_nccl(ncclCommUserRank(comms[d_idx], &rank));
dh::safe_cuda(cudaGetDeviceProperties(&prop, cudaDev));
// printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
// prop.pciBusID, prop.name);
// cudaDriverGetVersion(&driverVersion);
// cudaRuntimeGetVersion(&runtimeVersion);
std::ostringstream oss;
oss << "CUDA Capability Major/Minor version number: " << prop.major
<< "." << prop.minor << " is insufficient. Need >=3.5.";
bool failed = prop.major < 3 || (prop.major == 3 && prop.minor < 5);
CHECK(!failed) << oss.str();
}
// local find_split communicator group for each possible reduced
// number of GPUs
find_split_comms.resize(
n_devices,
std::vector<ncclComm_t>(n_devices)); // TODO(JCM): excessive, but
// fine, and better done once
// here than repeatedly
for (int num_d = 1; num_d <= n_devices;
++num_d) { // loop over the number of devices used
dh::safe_nccl(
ncclCommInitAll(find_split_comms[num_d - 1].data(), num_d,
dList.data())); // one communicator per device group
}
is_dense = info->num_nonzero == info->num_col * info->num_row;
dh::Timer time0;
hmat_.Init(&fmat, param.max_bin);
cpu_init_time += time0.ElapsedSeconds();
if (param.debug_verbose) { // Only done once for each training session
LOG(CONSOLE) << "[GPU Plug-in] CPU Time for hmat_.Init "
<< time0.ElapsedSeconds() << " sec";
fflush(stdout);
}
time0.Reset();
gmat_.cut = &hmat_;
cpu_init_time += time0.ElapsedSeconds();
if (param.debug_verbose) { // Only done once for each training session
LOG(CONSOLE) << "[GPU Plug-in] CPU Time for gmat_.cut "
<< time0.ElapsedSeconds() << " sec";
fflush(stdout);
}
time0.Reset();
gmat_.Init(&fmat);
cpu_init_time += time0.ElapsedSeconds();
if (param.debug_verbose) { // Only done once for each training session
LOG(CONSOLE) << "[GPU Plug-in] CPU Time for gmat_.Init() "
<< time0.ElapsedSeconds() << " sec";
fflush(stdout);
}
time0.Reset();
if (param.debug_verbose) { // Only done once for each training session
LOG(CONSOLE)
<< "[GPU Plug-in] CPU Time for hmat_.Init, gmat_.cut, gmat_.Init "
<< cpu_init_time << " sec";
fflush(stdout);
}
int n_bins = static_cast<int>(hmat_.row_ptr.back());
int n_features = static_cast<int>(hmat_.row_ptr.size() - 1);
// delineate data across multiple GPUs
device_row_segments.push_back(0);
device_element_segments.push_back(0);
bst_uint offset = 0;
bst_uint shard_size = static_cast<bst_uint>(
std::ceil(static_cast<double>(num_rows) / n_devices));
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
offset += shard_size;
offset = std::min(offset, num_rows);
device_row_segments.push_back(offset);
device_element_segments.push_back(gmat_.row_ptr[offset]);
}
// Build feature segments
std::vector<int> h_feature_segments;
for (int node = 0; node < n_nodes_level(param.max_depth - 1); node++) {
for (int fidx = 0; fidx < n_features; fidx++) {
h_feature_segments.push_back(hmat_.row_ptr[fidx] + node * n_bins);
}
}
h_feature_segments.push_back(n_nodes_level(param.max_depth - 1) * n_bins);
// Construct feature map
std::vector<int> h_gidx_feature_map(n_bins);
for (int fidx = 0; fidx < n_features; fidx++) {
for (auto i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) {
h_gidx_feature_map[i] = fidx;
}
}
// allocate unique common data residing on the master device
// (NOTE: none currently)
// allocate vectors across all devices
temp_memory.resize(n_devices);
hist_vec.resize(n_devices);
nodes.resize(n_devices);
nodes_temp.resize(n_devices);
nodes_child_temp.resize(n_devices);
left_child_smallest.resize(n_devices);
left_child_smallest_temp.resize(n_devices);
feature_flags.resize(n_devices);
fidx_min_map.resize(n_devices);
feature_segments.resize(n_devices);
prediction_cache.resize(n_devices);
position.resize(n_devices);
position_tmp.resize(n_devices);
device_matrix.resize(n_devices);
device_gpair.resize(n_devices);
gidx_feature_map.resize(n_devices);
gidx_fvalue_map.resize(n_devices);
int find_split_n_devices =
static_cast<int>(std::pow(2, std::floor(std::log2(n_devices))));
find_split_n_devices =
std::min(n_nodes_level(param.max_depth), find_split_n_devices);
int max_num_nodes_device =
n_nodes_level(param.max_depth) / find_split_n_devices;
// num_rows_segment: for sharding rows onto GPUs when splitting data
// num_elements_segment: for sharding rows (of elements) onto GPUs
// when splitting data
// max_num_nodes_device: for sharding nodes onto GPUs for split finding
// All other variables hold a full copy on each GPU, either identical
// everywhere or just the current portion (as for the histogram)
// before the AllReduce
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
bst_uint num_rows_segment =
device_row_segments[d_idx + 1] - device_row_segments[d_idx];
bst_ulong num_elements_segment =
device_element_segments[d_idx + 1] - device_element_segments[d_idx];
ba.allocate(
device_idx, param.silent, &(hist_vec[d_idx].data),
n_nodes(param.max_depth - 1) * n_bins, &nodes[d_idx],
n_nodes(param.max_depth), &nodes_temp[d_idx], max_num_nodes_device,
&nodes_child_temp[d_idx], max_num_nodes_device,
&left_child_smallest[d_idx], n_nodes(param.max_depth),
&left_child_smallest_temp[d_idx], max_num_nodes_device,
&feature_flags[d_idx],
n_features, // may change but same on all devices
&fidx_min_map[d_idx],
hmat_.min_val.size(), // constant and same on all devices
&feature_segments[d_idx],
h_feature_segments.size(), // constant and same on all devices
&prediction_cache[d_idx], num_rows_segment, &position[d_idx],
num_rows_segment, &position_tmp[d_idx], num_rows_segment,
&device_gpair[d_idx], num_rows_segment,
&device_matrix[d_idx].gidx_buffer,
common::CompressedBufferWriter::CalculateBufferSize(
num_elements_segment,
n_bins), // constant and same on all devices
&device_matrix[d_idx].row_ptr, num_rows_segment + 1,
&gidx_feature_map[d_idx],
n_bins, // constant and same on all devices
&gidx_fvalue_map[d_idx],
hmat_.cut.size()); // constant and same on all devices
// Copy host to device (assumes this runs after ba.allocate, which has
// set the device)
device_matrix[d_idx].Init(
device_idx, gmat_, device_element_segments[d_idx],
device_element_segments[d_idx + 1], device_row_segments[d_idx],
device_row_segments[d_idx + 1], n_bins);
gidx_feature_map[d_idx] = h_gidx_feature_map;
gidx_fvalue_map[d_idx] = hmat_.cut;
feature_segments[d_idx] = h_feature_segments;
fidx_min_map[d_idx] = hmat_.min_val;
// Initialize, no copy
hist_vec[d_idx].Init(n_bins); // init host object
// The fills below assume they run after ba.allocate, which has set
// the device
prediction_cache[d_idx].fill(0);
feature_flags[d_idx].fill(1);
}
}
// copies and initialisation that must happen every iteration
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
nodes[d_idx].fill(DeviceNodeStats());
nodes_temp[d_idx].fill(DeviceNodeStats());
nodes_child_temp[d_idx].fill(DeviceNodeStats());
position[d_idx].fill(0);
device_gpair[d_idx].copy(gpair.begin() + device_row_segments[d_idx],
gpair.begin() + device_row_segments[d_idx + 1]);
subsample_gpair(&device_gpair[d_idx], param.subsample,
device_row_segments[d_idx]);
hist_vec[d_idx].Reset(device_idx);
// left_child_smallest and left_child_smallest_temp don't need to be
// initialized
}
dh::synchronize_n_devices(n_devices, dList);
if (!initialised) {
gpu_init_time = time1.ElapsedSeconds() - cpu_init_time;
gpu_time = -cpu_init_time;
if (param.debug_verbose) { // Only done once for each training session
LOG(CONSOLE) << "[GPU Plug-in] Time for GPU operations during First "
"Call to InitData() "
<< gpu_init_time << " sec";
fflush(stdout);
}
}
p_last_fmat_ = &fmat;
initialised = true;
}
void BuildHist(int depth) {
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
size_t begin = device_element_segments[d_idx];
size_t end = device_element_segments[d_idx + 1];
size_t row_begin = device_row_segments[d_idx];
size_t row_end = device_row_segments[d_idx + 1];
auto d_gidx = device_matrix[d_idx].gidx;
auto d_row_ptr = device_matrix[d_idx].row_ptr.tbegin();
auto d_position = position[d_idx].data();
auto d_gpair = device_gpair[d_idx].data();
auto d_left_child_smallest = left_child_smallest[d_idx].data();
auto hist_builder = hist_vec[d_idx].GetBuilder();
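// TransformLbs is a load-balanced segmented iteration: the functor
// receives each element index together with its row index, keeping
// work evenly distributed even for skewed row lengths.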
dh::TransformLbs(
device_idx, &temp_memory[d_idx], end - begin, d_row_ptr,
row_end - row_begin, is_dense,
[=] __device__(size_t local_idx, int local_ridx) {
int nidx = d_position[local_ridx]; // OPTMARK: latency
if (!is_active(nidx, depth)) return;
// Only increment smallest node
bool is_smallest = (d_left_child_smallest[parent_nidx(nidx)] &&
is_left_child(nidx)) ||
(!d_left_child_smallest[parent_nidx(nidx)] &&
!is_left_child(nidx));
if (!is_smallest && depth > 0) return;
int gidx = d_gidx[local_idx];
bst_gpair gpair = d_gpair[local_ridx];
// OPTMARK: this atomic add to global memory is slow; shared
// memory or caching results instead could help.
hist_builder.Add(gpair, gidx, nidx);
});
}
dh::synchronize_n_devices(n_devices, dList);
// time.printElapsed("Add Time");
// (in-place) reduce each element of the histogram (for the current
// level only) across multiple GPUs
// TODO(JCM): use an out-of-place reduction with a pre-allocated
// buffer, but that requires copying back on the device
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
dh::safe_nccl(ncclAllReduce(
reinterpret_cast<const void*>(hist_vec[d_idx].GetLevelPtr(depth)),
reinterpret_cast<void*>(hist_vec[d_idx].GetLevelPtr(depth)),
hist_vec[d_idx].LevelSize(depth) * sizeof(gpair_sum_t) /
sizeof(gpair_sum_t::value_t),
nccl_sum_t, ncclSum, comms[d_idx], *(streams[d_idx])));
}
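// The allreduce above sums the histograms as raw 64-bit integer words
// (two per bin); integer summation keeps the multi-GPU result exact
// and reproducible.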
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
}
// without NCCL we presume a single GPU, whose histogram is already
// correct
// time.printElapsed("Reduce-Add Time");
// Subtraction trick (applied identically on all devices, to avoid
// computing on the master and then broadcasting)
if (depth > 0) {
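// BuildHist only accumulated the smaller child of each parent; the
// larger sibling's histogram is recovered here as parent - sibling,
// halving the per-level histogram cost.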
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
auto hist_builder = hist_vec[d_idx].GetBuilder();
auto d_left_child_smallest = left_child_smallest[d_idx].data();
int n_sub_bins = (n_nodes_level(depth) / 2) * hist_builder.n_bins;
dh::launch_n(device_idx, n_sub_bins, [=] __device__(int idx) {
int nidx = n_nodes(depth - 1) + ((idx / hist_builder.n_bins) * 2);
bool left_smallest = d_left_child_smallest[parent_nidx(nidx)];
if (left_smallest) {
nidx++; // If left is smallest switch to right child
}
int gidx = idx % hist_builder.n_bins;
gpair_sum_t parent = hist_builder.Get(gidx, parent_nidx(nidx));
int other_nidx = left_smallest ? nidx - 1 : nidx + 1;
gpair_sum_t other = hist_builder.Get(gidx, other_nidx);
gpair_sum_t sub = parent - other;
// OPTMARK: this atomic add to global memory is slow; shared
// memory or caching results instead could help.
hist_builder.Add(bst_gpair(sub.GetGrad(), sub.GetHess()), gidx, nidx);
});
}
dh::synchronize_n_devices(n_devices, dList);
}
}
#define MIN_BLOCK_THREADS 128
#define CHUNK_BLOCK_THREADS 128
// CUDA compute capability 3.5 and above allows at most 1024 threads
// per block; the specialisation ladder is capped at 512 here.
#define MAX_BLOCK_THREADS 512
void FindSplit(int depth) {
// Specialised based on param.max_bin
this->FindSplitSpecialize(depth, Int<MIN_BLOCK_THREADS>());
}
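// Recursively instantiate block sizes from MIN_BLOCK_THREADS upward in
// CHUNK_BLOCK_THREADS steps and dispatch to the smallest one with
// BLOCK_THREADS >= param.max_bin, so one tile of threads covers every
// bin of a feature.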
template <int BLOCK_THREADS>
void FindSplitSpecialize(int depth, Int<BLOCK_THREADS>) {
if (param.max_bin <= BLOCK_THREADS) {
LaunchFindSplit<BLOCK_THREADS>(depth);
} else {
this->FindSplitSpecialize(depth,
Int<BLOCK_THREADS + CHUNK_BLOCK_THREADS>());
}
}
void FindSplitSpecialize(int depth, Int<MAX_BLOCK_THREADS>) {
this->LaunchFindSplit<MAX_BLOCK_THREADS>(depth);
}
template <int BLOCK_THREADS>
void LaunchFindSplit(int depth) {
bool colsample =
param.colsample_bylevel < 1.0 || param.colsample_bytree < 1.0;
int num_nodes_device = n_nodes_level(depth);
const int GRID_SIZE = num_nodes_device;
// all GPUs do the same work
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
int nodes_offset_device = 0;
find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
hist_vec[d_idx].GetLevelPtr(depth), feature_segments[d_idx].data(),
depth, info->num_col, static_cast<int>(hmat_.row_ptr.back()),
nodes[d_idx].data(),
nodes_offset_device, fidx_min_map[d_idx].data(),
gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
left_child_smallest[d_idx].data(), colsample,
feature_flags[d_idx].data());
}
// NOTE: no need to synchronize with the host, as all of the above are
// pure peer-to-peer or on-device ops
}
void InitFirstNode(const std::vector<bst_gpair>& gpair) {
// Perform asynchronous reduction on each gpu
std::vector<bst_gpair> device_sums(n_devices);
#pragma omp parallel for num_threads(n_devices)
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
auto begin = device_gpair[d_idx].tbegin();
auto end = device_gpair[d_idx].tend();
bst_gpair init = bst_gpair();
auto binary_op = thrust::plus<bst_gpair>();
device_sums[d_idx] = thrust::reduce(begin, end, init, binary_op);
}
bst_gpair sum = bst_gpair();
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
sum += device_sums[d_idx];
}
// Set up the first node identically on every device (alternatively,
// compute it on one device and broadcast if exact cross-device
// precision were a concern)
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
auto d_nodes = nodes[d_idx].data();
auto gpu_param = GPUTrainingParam(param);
dh::launch_n(device_idx, 1, [=] __device__(int idx) {
bst_gpair sum_gradients = sum;
d_nodes[idx] = DeviceNodeStats(sum_gradients, 0, gpu_param);
});
}
// No host synchronization needed before moving on: BuildHist launches
// its kernels in the default stream, which serializes after these.
// dh::synchronize_n_devices(n_devices, dList);
}
void UpdatePosition(int depth) {
if (is_dense) {
this->UpdatePositionDense(depth);
} else {
this->UpdatePositionSparse(depth);
}
}
void UpdatePositionDense(int depth) {
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
auto d_position = position[d_idx].data();
DeviceNodeStats* d_nodes = nodes[d_idx].data();
auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
auto d_gidx = device_matrix[d_idx].gidx;
auto n_columns = info->num_col;
size_t begin = device_row_segments[d_idx];
size_t end = device_row_segments[d_idx + 1];
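// Dense data: every row stores exactly num_col entries, so the bin
// for (row, feature) is addressed directly at row * num_col + fidx.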
dh::launch_n(device_idx, end - begin, [=] __device__(size_t local_idx) {
int pos = d_position[local_idx];
if (!is_active(pos, depth)) {
return;
}
DeviceNodeStats node = d_nodes[pos];
if (node.IsLeaf()) {
return;
}
int gidx = d_gidx[local_idx * static_cast<size_t>(n_columns) +
static_cast<size_t>(node.fidx)];
float fvalue = d_gidx_fvalue_map[gidx];
if (fvalue <= node.fvalue) {
d_position[local_idx] = left_child_nidx(pos);
} else {
d_position[local_idx] = right_child_nidx(pos);
}
});
}
dh::synchronize_n_devices(n_devices, dList);
// dh::safe_cuda(cudaDeviceSynchronize());
}
void UpdatePositionSparse(int depth) {
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
auto d_position = position[d_idx].data();
auto d_position_tmp = position_tmp[d_idx].data();
DeviceNodeStats* d_nodes = nodes[d_idx].data();
auto d_gidx_feature_map = gidx_feature_map[d_idx].data();
auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
auto d_gidx = device_matrix[d_idx].gidx;
auto d_row_ptr = device_matrix[d_idx].row_ptr.tbegin();
size_t row_begin = device_row_segments[d_idx];
size_t row_end = device_row_segments[d_idx + 1];
size_t element_begin = device_element_segments[d_idx];
size_t element_end = device_element_segments[d_idx + 1];
// Update missing direction
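// Pass 1: provisionally send each row in its node's default (missing
// value) direction; pass 2 below corrects rows that actually have a
// value for the split feature.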
dh::launch_n(device_idx, row_end - row_begin,
[=] __device__(int local_idx) {
int pos = d_position[local_idx];
if (!is_active(pos, depth)) {
d_position_tmp[local_idx] = pos;
return;
}
DeviceNodeStats node = d_nodes[pos];
if (node.IsLeaf()) {
d_position_tmp[local_idx] = pos;
return;
} else if (node.dir == LeftDir) {
d_position_tmp[local_idx] = left_child_nidx(pos);
} else {
d_position_tmp[local_idx] = right_child_nidx(pos);
}
});
// Update node based on fvalue where exists
// OPTMARK: This kernel is very inefficient for both compute and memory,
// dominated by memory dependency / access patterns
dh::TransformLbs(
device_idx, &temp_memory[d_idx], element_end - element_begin,
d_row_ptr, row_end - row_begin, is_dense,
[=] __device__(size_t local_idx, int local_ridx) {
int pos = d_position[local_ridx];
if (!is_active(pos, depth)) {
return;
}
DeviceNodeStats node = d_nodes[pos];
if (node.IsLeaf()) {
return;
}
int gidx = d_gidx[local_idx];
// OPTMARK: slowest global memory access; perhaps position, gidx,
// etc. could be combined into one structure
int findex = d_gidx_feature_map[gidx];
if (findex == node.fidx) {
float fvalue = d_gidx_fvalue_map[gidx];
if (fvalue <= node.fvalue) {
d_position_tmp[local_ridx] = left_child_nidx(pos);
} else {
d_position_tmp[local_ridx] = right_child_nidx(pos);
}
}
});
position[d_idx] = position_tmp[d_idx];
}
dh::synchronize_n_devices(n_devices, dList);
}
void ColSampleTree() {
if (param.colsample_bylevel == 1.0 && param.colsample_bytree == 1.0) return;
feature_set_tree.resize(info->num_col);
std::iota(feature_set_tree.begin(), feature_set_tree.end(), 0);
feature_set_tree = col_sample(feature_set_tree, param.colsample_bytree);
}
void ColSampleLevel() {
if (param.colsample_bylevel == 1.0 && param.colsample_bytree == 1.0) return;
feature_set_level.resize(feature_set_tree.size());
feature_set_level = col_sample(feature_set_tree, param.colsample_bylevel);
std::vector<int> h_feature_flags(info->num_col, 0);
for (auto fidx : feature_set_level) {
h_feature_flags[fidx] = 1;
}
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
feature_flags[d_idx] = h_feature_flags;
}
dh::synchronize_n_devices(n_devices, dList);
}
bool UpdatePredictionCache(const DMatrix* data,
std::vector<bst_float>* p_out_preds) override {
std::vector<bst_float>& out_preds = *p_out_preds;
if (nodes.empty() || !p_last_fmat_ || data != p_last_fmat_) {
return false;
}
if (!prediction_cache_initialised) {
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
size_t row_begin = device_row_segments[d_idx];
size_t row_end = device_row_segments[d_idx + 1];
prediction_cache[d_idx].copy(out_preds.begin() + row_begin,
out_preds.begin() + row_end);
}
prediction_cache_initialised = true;
}
dh::synchronize_n_devices(n_devices, dList);
float eps = param.learning_rate;
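// Each row's cached prediction accumulates the weight of the leaf it
// reached, scaled by the learning rate.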
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
size_t row_begin = device_row_segments[d_idx];
size_t row_end = device_row_segments[d_idx + 1];
auto d_nodes = nodes[d_idx].data();
auto d_position = position[d_idx].data();
auto d_prediction_cache = prediction_cache[d_idx].data();
dh::launch_n(device_idx, prediction_cache[d_idx].size(),
[=] __device__(int local_idx) {
int pos = d_position[local_idx];
d_prediction_cache[local_idx] += d_nodes[pos].weight * eps;
});
dh::safe_cuda(
cudaMemcpy(&out_preds[row_begin], prediction_cache[d_idx].data(),
prediction_cache[d_idx].size() * sizeof(bst_float),
cudaMemcpyDeviceToHost));
}
dh::synchronize_n_devices(n_devices, dList);
return true;
}
void UpdateTree(const std::vector<bst_gpair>& gpair, DMatrix* p_fmat,
RegTree* p_tree) {
dh::Timer time0;
this->InitData(gpair, *p_fmat, *p_tree);
this->InitFirstNode(gpair);
this->ColSampleTree();
for (int depth = 0; depth < param.max_depth; depth++) {
this->ColSampleLevel();
this->BuildHist(depth);
this->FindSplit(depth);
this->UpdatePosition(depth);
}
// multi-GPU work is done; copy the result from the master device into
// the host tree
int master_device = dList[0];
dh::safe_cuda(cudaSetDevice(master_device));
dense2sparse_tree(p_tree, nodes[0], param);
gpu_time += time0.ElapsedSeconds();
if (param.debug_verbose) {
LOG(CONSOLE)
<< "[GPU Plug-in] Cumulative GPU Time excluding initial time "
<< (gpu_time - gpu_init_time) << " sec";
fflush(stdout);
}
if (param.debug_verbose) {
LOG(CONSOLE) << "[GPU Plug-in] Cumulative CPU Time "
<< cpu_time.ElapsedSeconds() << " sec";
LOG(CONSOLE)
<< "[GPU Plug-in] Cumulative CPU Time excluding initial time "
<< (cpu_time.ElapsedSeconds() - cpu_init_time - gpu_time) << " sec";
fflush(stdout);
}
}
protected:
TrainParam param;
common::HistCutMatrix hmat_;
common::GHistIndexMatrix gmat_;
MetaInfo* info;
bool initialised;
bool is_dense;
const DMatrix* p_last_fmat_;
bool prediction_cache_initialised;
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
std::vector<int> feature_set_tree;
std::vector<int> feature_set_level;
bst_uint num_rows;
int n_devices;
// the vectors below hold one entry per device used
std::vector<int> dList;
std::vector<int> device_row_segments;
std::vector<size_t> device_element_segments;
std::vector<dh::CubMemory> temp_memory;
std::vector<DeviceHist> hist_vec;
std::vector<dh::dvec<DeviceNodeStats>> nodes;
std::vector<dh::dvec<DeviceNodeStats>> nodes_temp;
std::vector<dh::dvec<DeviceNodeStats>> nodes_child_temp;
std::vector<dh::dvec<bool>> left_child_smallest;
std::vector<dh::dvec<bool>> left_child_smallest_temp;
std::vector<dh::dvec<int>> feature_flags;
std::vector<dh::dvec<float>> fidx_min_map;
std::vector<dh::dvec<int>> feature_segments;
std::vector<dh::dvec<bst_float>> prediction_cache;
std::vector<dh::dvec<int>> position;
std::vector<dh::dvec<int>> position_tmp;
std::vector<DeviceGMat> device_matrix;
std::vector<dh::dvec<bst_gpair>> device_gpair;
std::vector<dh::dvec<int>> gidx_feature_map;
std::vector<dh::dvec<float>> gidx_fvalue_map;
std::vector<cudaStream_t*> streams;
std::vector<ncclComm_t> comms;
std::vector<std::vector<ncclComm_t>> find_split_comms;
double cpu_init_time;
double gpu_init_time;
dh::Timer cpu_time;
double gpu_time;
};
XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
.describe("Grow tree with GPU.")
.set_body([]() { return new GPUHistMaker(); });
} // namespace tree
} // namespace xgboost