Merge generic device helper functions into gpu set. (#3626)

* Remove the use of old NDevices* functions. * Use GPUSet in timer.h.
2018-08-26 14:14:23 +08:00
parent 3261002099
commit 60787ecebc
12 changed files with 299 additions and 199 deletions
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -6,6 +6,8 @@
 #ifndef XGBOOST_COMMON_COMMON_H_
 #define XGBOOST_COMMON_COMMON_H_

+#include <xgboost/base.h>
+
 #include <vector>
 #include <string>
 #include <sstream>
@@ -35,6 +37,71 @@ inline std::string ToString(const T& data) {
  return os.str();
 }

+/*
+ * Range iterator
+ */
+class Range {
+ public:
+  class Iterator {
+    friend class Range;
+
+   public:
+    using DifferenceType = int64_t;
+
+    XGBOOST_DEVICE int64_t operator*() const { return i_; }
+    XGBOOST_DEVICE const Iterator &operator++() {
+      i_ += step_;
+      return *this;
+    }
+    XGBOOST_DEVICE Iterator operator++(int) {
+      Iterator res {*this};
+      i_ += step_;
+      return res;
+    }
+
+    XGBOOST_DEVICE bool operator==(const Iterator &other) const {
+      return i_ >= other.i_;
+    }
+    XGBOOST_DEVICE bool operator!=(const Iterator &other) const {
+      return i_ < other.i_;
+    }
+
+    XGBOOST_DEVICE void Step(DifferenceType s) { step_ = s; }
+
+   protected:
+    XGBOOST_DEVICE explicit Iterator(int64_t start) : i_(start) {}
+    XGBOOST_DEVICE explicit Iterator(int64_t start, int step) :
+        i_{start}, step_{step} {}
+
+   public:
+    int64_t i_;
+    DifferenceType step_ = 1;
+  };
+
+  XGBOOST_DEVICE Iterator begin() const { return begin_; }  // NOLINT
+  XGBOOST_DEVICE Iterator end() const { return end_; }      // NOLINT
+
+  XGBOOST_DEVICE Range(int64_t begin, int64_t end)
+      : begin_(begin), end_(end) {}
+  XGBOOST_DEVICE Range(int64_t begin, int64_t end, Iterator::DifferenceType step)
+      : begin_(begin, step), end_(end) {}
+
+  XGBOOST_DEVICE bool operator==(const Range& other) const {
+    return *begin_ == *other.begin_ && *end_ == *other.end_;
+  }
+  XGBOOST_DEVICE bool operator!=(const Range& other) const {
+    return !(*this == other);
+  }
+
+  XGBOOST_DEVICE void Step(Iterator::DifferenceType s) { begin_.Step(s); }
+
+  XGBOOST_DEVICE Iterator::DifferenceType GetStep() const { return begin_.step_; }
+
+ private:
+  Iterator begin_;
+  Iterator end_;
+};
+
 }  // namespace common
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_COMMON_H_
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -7,6 +7,10 @@
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
 #include <xgboost/logging.h>
+
+#include "common.h"
+#include "gpu_set.h"
+
 #include <algorithm>
 #include <chrono>
 #include <ctime>
@@ -28,25 +32,6 @@ namespace dh {
 #define HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
 #define DEV_INLINE __device__ __forceinline__

-/*
- * Error handling  functions
- */
-
-#define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)
-
-inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
-                                       int line) {
-  if (code != cudaSuccess) {
-    std::stringstream ss;
-    ss << file << "(" << line << ")";
-    std::string file_and_line;
-    ss >> file_and_line;
-    throw thrust::system_error(code, thrust::cuda_category(), file_and_line);
-  }
-
-  return code;
-}
-
 #ifdef XGBOOST_USE_NCCL
 #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)

@@ -73,45 +58,20 @@ const T *Raw(const thrust::device_vector<T> &v) {  //  NOLINT
  return raw_pointer_cast(v.data());
 }

-inline int NVisibleDevices() {
-  int n_visgpus = 0;
-
-  dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
-
-  return n_visgpus;
-}
-
-inline int NDevicesAll(int n_gpus) {
-  int n_devices_visible = dh::NVisibleDevices();
-  int n_devices = n_gpus < 0 ? n_devices_visible : n_gpus;
-  return (n_devices);
-}
-inline int NDevices(int n_gpus, int num_rows) {
-  int n_devices = dh::NDevicesAll(n_gpus);
-  // fix-up device number to be limited by number of rows
-  n_devices = n_devices > num_rows ? num_rows : n_devices;
-  return (n_devices);
-}
-
 // if n_devices=-1, then use all visible devices
-inline void SynchronizeNDevices(int n_devices, std::vector<int> dList) {
-  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
-    int device_idx = dList[d_idx];
-    safe_cuda(cudaSetDevice(device_idx));
-    safe_cuda(cudaDeviceSynchronize());
-  }
-}
-inline void SynchronizeAll() {
-  for (int device_idx = 0; device_idx < NVisibleDevices(); device_idx++) {
-    safe_cuda(cudaSetDevice(device_idx));
+inline void SynchronizeNDevices(xgboost::GPUSet devices) {
+  devices = devices.IsEmpty() ? xgboost::GPUSet::AllVisible() : devices;
+  for (auto const d : devices.Unnormalised()) {
+    safe_cuda(cudaSetDevice(d));
    safe_cuda(cudaDeviceSynchronize());
  }
 }

-inline std::string DeviceName(int device_idx) {
-  cudaDeviceProp prop;
-  dh::safe_cuda(cudaGetDeviceProperties(&prop, device_idx));
-  return std::string(prop.name);
+inline void SynchronizeAll() {
+  for (int device_idx : xgboost::GPUSet::AllVisible()) {
+    safe_cuda(cudaSetDevice(device_idx));
+    safe_cuda(cudaDeviceSynchronize());
+  }
 }

 inline size_t AvailableMemory(int device_idx) {
@@ -144,15 +104,8 @@ inline size_t MaxSharedMemory(int device_idx) {
  return prop.sharedMemPerBlock;
 }

-// ensure gpu_id is correct, so not dependent upon user knowing details
-inline int GetDeviceIdx(int gpu_id) {
-  // protect against overrun for gpu_id
-  return (std::abs(gpu_id) + 0) % dh::NVisibleDevices();
-}
-
 inline void CheckComputeCapability() {
-  int n_devices = NVisibleDevices();
-  for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
+  for (int d_idx : xgboost::GPUSet::AllVisible()) {
    cudaDeviceProp prop;
    safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
    std::ostringstream oss;
@@ -163,12 +116,11 @@ inline void CheckComputeCapability() {
  }
 }

-  
 DEV_INLINE void AtomicOrByte(unsigned int* __restrict__ buffer, size_t ibyte, unsigned char b) {
  atomicOr(&buffer[ibyte / sizeof(unsigned int)], (unsigned int)b << (ibyte % (sizeof(unsigned int)) * 8));
 }

-/*! 
+/*!
 * \brief Find the strict upper bound for an element in a sorted array
 *  using binary search.
 * \param cuts pointer to the first element of the sorted array
@@ -199,67 +151,18 @@ DEV_INLINE int UpperBound(const float* __restrict__ cuts, int n, float v) {
  return right;
 }

-/*
- * Range iterator
- */
-
-class Range {
- public:
-  class Iterator {
-    friend class Range;
-
-   public:
-    XGBOOST_DEVICE int64_t operator*() const { return i_; }
-    XGBOOST_DEVICE const Iterator &operator++() {
-      i_ += step_;
-      return *this;
-    }
-    XGBOOST_DEVICE Iterator operator++(int) {
-      Iterator copy(*this);
-      i_ += step_;
-      return copy;
-    }
-
-    XGBOOST_DEVICE bool operator==(const Iterator &other) const {
-      return i_ >= other.i_;
-    }
-    XGBOOST_DEVICE bool operator!=(const Iterator &other) const {
-      return i_ < other.i_;
-    }
-
-    XGBOOST_DEVICE void Step(int s) { step_ = s; }
-
-   protected:
-    XGBOOST_DEVICE explicit Iterator(int64_t start) : i_(start) {}
-
-   public:
-    uint64_t i_;
-    int step_ = 1;
-  };
-
-  XGBOOST_DEVICE Iterator begin() const { return begin_; }  // NOLINT
-  XGBOOST_DEVICE Iterator end() const { return end_; }      // NOLINT
-  XGBOOST_DEVICE Range(int64_t begin, int64_t end)
-      : begin_(begin), end_(end) {}
-  XGBOOST_DEVICE void Step(int s) { begin_.Step(s); }
-
- private:
-  Iterator begin_;
-  Iterator end_;
-};
-
 template <typename T>
-__device__ Range GridStrideRange(T begin, T end) {
+__device__ xgboost::common::Range GridStrideRange(T begin, T end) {
  begin += blockDim.x * blockIdx.x + threadIdx.x;
-  Range r(begin, end);
+  xgboost::common::Range r(begin, end);
  r.Step(gridDim.x * blockDim.x);
  return r;
 }

 template <typename T>
-__device__ Range BlockStrideRange(T begin, T end) {
+__device__ xgboost::common::Range BlockStrideRange(T begin, T end) {
  begin += threadIdx.x;
-  Range r(begin, end);
+  xgboost::common::Range r(begin, end);
  r.Step(blockDim.x);
  return r;
 }
@@ -557,7 +460,7 @@ class BulkAllocator {
  BulkAllocator(BulkAllocator<MemoryT>&&) = delete;
  void operator=(const BulkAllocator<MemoryT>&) = delete;
  void operator=(BulkAllocator<MemoryT>&&) = delete;
-  
+
  ~BulkAllocator() {
    for (size_t i = 0; i < d_ptr_.size(); i++) {
      if (!(d_ptr_[i] == nullptr)) {
--- a/src/common/gpu_set.h
+++ b/src/common/gpu_set.h
@@ -0,0 +1,122 @@
+/*!
+ * Copyright 2018 XGBoost contributors
+ */
+#ifndef XGBOOST_COMMON_GPU_SET_H_
+#define XGBOOST_COMMON_GPU_SET_H_
+
+#include <xgboost/base.h>
+#include <xgboost/logging.h>
+
+#include <limits>
+#include <string>
+
+#include "common.h"
+#include "span.h"
+
+#if defined(__CUDACC__)
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+#endif
+
+namespace dh {
+#if defined(__CUDACC__)
+/*
+ * Error handling  functions
+ */
+#define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)
+
+inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
+                                       int line) {
+  if (code != cudaSuccess) {
+    throw thrust::system_error(code, thrust::cuda_category(),
+                               std::string{file} + "(" +  // NOLINT
+                               std::to_string(line) + ")");
+  }
+  return code;
+}
+#endif
+}  // namespace dh
+
+namespace xgboost {
+
+/* \brief set of devices across which HostDeviceVector can be distributed.
+ *
+ * Currently implemented as a range, but can be changed later to something else,
+ *   e.g. a bitset
+ */
+class GPUSet {
+ public:
+  explicit GPUSet(int start = 0, int ndevices = 0)
+      : devices_(start, start + ndevices) {}
+
+  static GPUSet Empty() { return GPUSet(); }
+
+  static GPUSet Range(int start, int ndevices) {
+    return ndevices <= 0 ? Empty() : GPUSet{start, ndevices};
+  }
+  /* \brief ndevices and num_rows both are upper bounds. */
+  static GPUSet All(int ndevices, int num_rows = std::numeric_limits<int>::max()) {
+    int n_devices_visible = AllVisible().Size();
+    ndevices = ndevices < 0 ? n_devices_visible : ndevices;
+    // fix-up device number to be limited by number of rows
+    ndevices = ndevices > num_rows ? num_rows : ndevices;
+    return Range(0, ndevices);
+  }
+
+  static GPUSet AllVisible() {
+    int n_visgpus = 0;
+#if defined(__CUDACC__)
+    dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
+#endif
+    return Range(0, n_visgpus);
+  }
+  /* \brief Ensure gpu_id is correct, so not dependent upon user knowing details */
+  static int GetDeviceIdx(int gpu_id) {
+    return (std::abs(gpu_id) + 0) % AllVisible().Size();
+  }
+  /* \brief Counting from gpu_id */
+  GPUSet Normalised(int gpu_id) const {
+    return Range(gpu_id, *devices_.end() + gpu_id);
+  }
+  /* \brief Counting from 0 */
+  GPUSet Unnormalised() const {
+    return Range(0, *devices_.end() - *devices_.begin());
+  }
+
+  int Size() const {
+    int res = *devices_.end() - *devices_.begin();
+    return res < 0 ? 0 : res;
+  }
+
+  int operator[](int index) const {
+    CHECK(index >= 0 && index < *(devices_.end()));
+    return *devices_.begin() + index;
+  }
+
+  bool IsEmpty() const { return Size() == 0; }  // NOLINT
+
+  int Index(int device) const {
+    CHECK(Contains(device));
+    return device - *devices_.begin();
+  }
+
+  bool Contains(int device) const {
+    return *devices_.begin() <= device && device < *devices_.end();
+  }
+
+  common::Range::Iterator begin() const { return devices_.begin(); }  // NOLINT
+  common::Range::Iterator end() const { return devices_.end(); }      // NOLINT
+
+  friend bool operator==(const GPUSet& lhs, const GPUSet& rhs) {
+    return lhs.devices_ == rhs.devices_;
+  }
+  friend bool operator!=(const GPUSet& lhs, const GPUSet& rhs) {
+    return !(lhs == rhs);
+  }
+
+ private:
+  common::Range devices_;
+};
+}  // namespace xgboost
+
+#endif  // XGBOOST_COMMON_GPU_SET_H_
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -271,7 +271,7 @@ struct GPUSketcher {
        find_cuts_k<<<dh::DivRoundUp(n_cuts_cur_[icol], block), block>>>
          (cuts_d_.data().get() + icol * n_cuts_, fvalues_cur_.data().get(),
           weights2_.data().get(), n_unique, n_cuts_cur_[icol]);
-        dh::safe_cuda(cudaGetLastError());
+        dh::safe_cuda(cudaGetLastError());  // NOLINT
      }
    }

@@ -311,14 +311,14 @@ struct GPUSketcher {
         has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
         gpu_batch_nrows_, num_cols_,
         row_batch.offset[row_begin_ + batch_row_begin], batch_nrows);
-      dh::safe_cuda(cudaGetLastError());
-      dh::safe_cuda(cudaDeviceSynchronize());
+      dh::safe_cuda(cudaGetLastError());       // NOLINT
+      dh::safe_cuda(cudaDeviceSynchronize());  // NOLINT

      for (int icol = 0; icol < num_cols_; ++icol) {
        FindColumnCuts(batch_nrows, icol);
      }

-      dh::safe_cuda(cudaDeviceSynchronize());
+      dh::safe_cuda(cudaDeviceSynchronize());  // NOLINT

      // add cuts into sketches
      thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
@@ -379,7 +379,7 @@ struct GPUSketcher {
  }

  GPUSketcher(tree::TrainParam param, size_t n_rows) : param_(std::move(param)) {
-    devices_ = GPUSet::Range(param_.gpu_id, dh::NDevices(param_.n_gpus, n_rows));
+    devices_ = GPUSet::All(param_.n_gpus, n_rows).Normalised(param_.gpu_id);
  }

  std::vector<std::unique_ptr<DeviceShard>> shards_;
--- a/src/common/host_device_vector.h
+++ b/src/common/host_device_vector.h
@@ -11,6 +11,7 @@
 #include <initializer_list>
 #include <vector>

+#include "gpu_set.h"
 #include "span.h"

 // only include thrust-related files if host_device_vector.h
@@ -23,40 +24,6 @@ namespace xgboost {

 template <typename T> struct HostDeviceVectorImpl;

-// set of devices across which HostDeviceVector can be distributed;
-// currently implemented as a range, but can be changed later to something else,
-// e.g. a bitset
-class GPUSet {
- public:
-  explicit GPUSet(int start = 0, int ndevices = 0)
-    : start_(start), ndevices_(ndevices) {}
-  static GPUSet Empty() { return GPUSet(); }
-  static GPUSet Range(int start, int ndevices) { return GPUSet(start, ndevices); }
-  int Size() const { return ndevices_; }
-  int operator[](int index) const {
-    CHECK(index >= 0 && index < ndevices_);
-    return start_ + index;
-  }
-  bool IsEmpty() const { return ndevices_ <= 0; }
-  int Index(int device) const {
-    CHECK(device >= start_ && device < start_ + ndevices_);
-    return device - start_;
-  }
-  bool Contains(int device) const {
-    return start_ <= device && device < start_ + ndevices_;
-  }
-  friend bool operator==(GPUSet a, GPUSet b) {
-    return a.start_ == b.start_ && a.ndevices_ == b.ndevices_;
-  }
-  friend bool operator!=(GPUSet a, GPUSet b) {
-    return a.start_ != b.start_ || a.ndevices_ != b.ndevices_;
-  }
-
- private:
-  int start_, ndevices_;
-};
-
-
 /**
 * @file host_device_vector.h
 * @brief A device-and-host vector abstraction layer.
--- a/src/common/timer.h
+++ b/src/common/timer.h
@@ -7,7 +7,8 @@
 #include <iostream>
 #include <map>
 #include <string>
-#include <vector>
+
+#include "gpu_set.h"

 namespace xgboost {
 namespace common {
@@ -66,21 +67,21 @@ struct Monitor {
    this->label = label;
  }
  void Start(const std::string &name) { timer_map[name].Start(); }
-  void Start(const std::string &name, std::vector<int> dList) {
+  void Start(const std::string &name, GPUSet devices) {
    if (debug_verbose) {
 #ifdef __CUDACC__
 #include "device_helpers.cuh"
-      dh::SynchronizeNDevices(dList.size(), dList);
+      dh::SynchronizeNDevices(devices);
 #endif
    }
    timer_map[name].Start();
  }
  void Stop(const std::string &name) { timer_map[name].Stop(); }
-  void Stop(const std::string &name, std::vector<int> dList) {
+  void Stop(const std::string &name, GPUSet devices) {
    if (debug_verbose) {
 #ifdef __CUDACC__
 #include "device_helpers.cuh"
-      dh::SynchronizeNDevices(dList.size(), dList);
+      dh::SynchronizeNDevices(devices);
 #endif
    }
    timer_map[name].Stop();