Run training with empty DMatrix. (#4990)

This makes GPU Hist robust in a distributed environment, as some workers might not be associated with any data in either training or evaluation. * Disable rabit mock test for now: See #5012. * Disable dask-cudf test at prediction for now: See #5003. * Launch dask job for all workers even though they might not have any data. * Check 0 rows in elementwise evaluation metrics. Using AUC and AUC-PR still throws an error. See #4663 for a robust fix. * Add tests for edge cases. * Add `LaunchKernel` wrapper handling zero-sized grid. * Move some parts of allreducer into a cu file. * Don't validate feature names when the booster is empty. * Sync number of columns in DMatrix, as num_feature is required to be the same across all workers in data split mode. * Filtering in dask interface now by default syncs all boosters that are not empty, instead of using rank 0. * Fix Jenkins' GPU tests. * Install dask-cuda from source in Jenkins' test. Now all tests are actually running. * Restore GPU Hist tree synchronization test. * Check UUID of running devices. The check is only performed on CUDA version >= 10.x, as 9.x doesn't have the UUID field. * Fix CMake policy and project variables. Use xgboost_SOURCE_DIR uniformly, add policy for CMake >= 3.13. * Fix copying data to CPU. * Fix race condition in cpu predictor. * Fix duplicated DMatrix construction. * Don't download extra nccl in CI script.
2019-11-06 16:13:13 +08:00
parent 807a244517
commit 7663de956c
44 changed files with 603 additions and 272 deletions
--- a/src/common/device_helpers.cu
+++ b/src/common/device_helpers.cu
@@ -0,0 +1,91 @@
+/*!
+ * Copyright 2017-2019 XGBoost contributors
+ *
+ * \brief Utilities for CUDA.
+ */
+#ifdef XGBOOST_USE_NCCL
+#include <nccl.h>
+#endif  // #ifdef XGBOOST_USE_NCCL
+#include <sstream>
+
+#include "device_helpers.cuh"
+
+namespace dh {
+
+#if __CUDACC_VER_MAJOR__ > 9
+constexpr std::size_t kUuidLength =
+    sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t);
+
+/*!
+ * \brief Copy the UUID of a CUDA device into a caller-provided buffer.
+ *
+ * \param world_size Not used by this function.
+ * \param rank       Not used by this function.
+ * \param device_ord Ordinal of the device whose properties are queried.
+ * \param uuid       Destination; receives sizeof(cudaDeviceProp::uuid) bytes.
+ */
+void GetCudaUUID(int world_size, int rank, int device_ord,
+                 xgboost::common::Span<uint64_t, kUuidLength> uuid) {
+  cudaDeviceProp device_prop;
+  safe_cuda(cudaGetDeviceProperties(&device_prop, device_ord));
+  // The UUID is copied byte-wise into the 64-bit words of the output span.
+  void const* raw_uuid = static_cast<void const*>(&(device_prop.uuid));
+  std::memcpy(uuid.data(), raw_uuid, sizeof(device_prop.uuid));
+}
+
+/*!
+ * \brief Format a device UUID as a fixed-width, lowercase hexadecimal string.
+ *
+ * Every 64-bit word is written as exactly 16 hex digits.  Without the padding a
+ * word with leading zero digits would be printed shorter, so the concatenation
+ * of the words would be ambiguous and two different UUIDs could produce the
+ * same text.
+ *
+ * \param uuid The 64-bit words making up the UUID.
+ * \return The hex digits of every word, most significant digit first.
+ */
+std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> uuid) {
+  static char const kHexDigits[] = "0123456789abcdef";
+  constexpr int kBitsPerWord = 64;
+  constexpr int kBitsPerDigit = 4;
+
+  std::stringstream ss;
+  for (auto v : uuid) {
+    // Walk the nibbles from the most significant one so leading zeros are kept.
+    for (int shift = kBitsPerWord - kBitsPerDigit; shift >= 0; shift -= kBitsPerDigit) {
+      ss << kHexDigits[(v >> shift) & 0xFu];
+    }
+  }
+  return ss.str();
+}
+
+#endif  // __CUDACC_VER_MAJOR__ > 9
+
+/*!
+ * \brief Set up the NCCL communicator and CUDA stream used by this worker.
+ *
+ * Does nothing when XGBoost is compiled without NCCL.  When compiled with
+ * CUDA 10 or later, the UUIDs of the devices used by all workers are collected
+ * first and initialisation fails if two workers run on the same device.
+ *
+ * \param _device_ordinal The device ordinal.
+ */
+void AllReducer::Init(int _device_ordinal) {
+#ifdef XGBOOST_USE_NCCL
+  LOG(DEBUG) << "Running nccl init on: " << __CUDACC_VER_MAJOR__ << "." << __CUDACC_VER_MINOR__;
+
+  device_ordinal = _device_ordinal;
+  int32_t const rank = rabit::GetRank();
+
+#if __CUDACC_VER_MAJOR__ > 9
+  int32_t const world = rabit::GetWorldSize();
+
+  // One slot of kUuidLength words per worker; each worker fills only its own slot.
+  std::vector<uint64_t> uuids(world * kUuidLength, 0);
+  auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
+  auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
+  GetCudaUUID(world, rank, device_ordinal, s_this_uuid);
+
+  // No allgather yet.  All foreign slots are zero, so a sum reduction yields the
+  // concatenation of every worker's UUID.
+  rabit::Allreduce<rabit::op::Sum, uint64_t>(uuids.data(), uuids.size());
+
+  using UuidSpan = xgboost::common::Span<uint64_t, kUuidLength>;
+  std::vector<UuidSpan> converted(world);
+  for (size_t i = 0; i < converted.size(); ++i) {
+    converted[i] = UuidSpan{uuids.data() + i * kUuidLength, kUuidLength};
+  }
+
+  // std::unique only collapses *adjacent* equal elements, so the UUIDs have to be
+  // ordered first; otherwise two non-neighbouring ranks sharing a device would go
+  // unnoticed.
+  std::sort(converted.begin(), converted.end(), [](UuidSpan l, UuidSpan r) {
+    return std::lexicographical_compare(l.data(), l.data() + kUuidLength,
+                                        r.data(), r.data() + kUuidLength);
+  });
+  auto iter = std::unique(converted.begin(), converted.end());
+  auto n_uniques = std::distance(converted.begin(), iter);
+  CHECK_EQ(n_uniques, world)
+      << "Multiple processes within communication group running on same CUDA "
+      << "device is not supported";
+#endif  // __CUDACC_VER_MAJOR__ > 9
+
+  id = GetUniqueId();
+  dh::safe_cuda(cudaSetDevice(device_ordinal));
+  dh::safe_nccl(ncclCommInitRank(&comm, rabit::GetWorldSize(), id, rank));
+  safe_cuda(cudaStreamCreate(&stream));
+  initialised_ = true;
+#endif  // XGBOOST_USE_NCCL
+}
+
+/*!
+ * \brief Release the NCCL resources and, at debug verbosity, report statistics.
+ *
+ * Compiles to an empty body when XGBoost is built without NCCL.
+ */
+AllReducer::~AllReducer() {
+#ifdef XGBOOST_USE_NCCL
+  // The stream and communicator exist only after a successful Init().
+  if (initialised_) {
+    dh::safe_cuda(cudaStreamDestroy(stream));
+    ncclCommDestroy(comm);
+  }
+
+  bool const debug_enabled =
+      xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug);
+  if (!debug_enabled) {
+    return;
+  }
+
+  constexpr size_t kBytesPerMiB = 1048576;
+  LOG(CONSOLE) << "======== NCCL Statistics========";
+  LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
+  LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_ / kBytesPerMiB;
+#endif  // XGBOOST_USE_NCCL
+}
+
+}  // namespace dh
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -7,24 +7,25 @@
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
-#include <xgboost/logging.h>
+
+#include <omp.h>
 #include <rabit/rabit.h>
+#include <cub/cub.cuh>
 #include <cub/util_allocator.cuh>

-#include "xgboost/host_device_vector.h"
-#include "xgboost/span.h"
-
-#include "common.h"
-
 #include <algorithm>
-#include <omp.h>
 #include <chrono>
 #include <ctime>
-#include <cub/cub.cuh>
 #include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>
+
+#include "xgboost/logging.h"
+#include "xgboost/host_device_vector.h"
+#include "xgboost/span.h"
+
+#include "common.h"
 #include "timer.h"

 #ifdef XGBOOST_USE_NCCL
@@ -205,24 +206,53 @@ __global__ void LaunchNKernel(size_t begin, size_t end, L lambda) {
 }
 template <typename L>
 __global__ void LaunchNKernel(int device_idx, size_t begin, size_t end,
-                                L lambda) {
+                              L lambda) {
  for (auto i : GridStrideRange(begin, end)) {
    lambda(i, device_idx);
  }
 }

+/* \brief A wrapper around kernel launching syntax, used to guard against empty input.
+ *
+ * - nvcc fails to deduce template argument when kernel is a template accepting __device__
+ *   function as argument.  Hence functions like `LaunchN` cannot use this wrapper.
+ *
+ * - With c++ initialization list `{}` syntax, you are forced to comply with the CUDA type
+ *   specification.
+ */
+class LaunchKernel {
+  size_t shmem_size_;
+  cudaStream_t stream_;
+
+  dim3 grids_;
+  dim3 blocks_;
+
+ public:
+  /* \brief Configure a one dimensional launch.
+   *
+   * \param _grids Number of blocks along x.
+   * \param _blk   Number of threads per block along x.
+   * \param _shmem Dynamic shared memory size in bytes.
+   * \param _s     Stream the kernel is launched on.
+   */
+  LaunchKernel(uint32_t _grids, uint32_t _blk, size_t _shmem=0, cudaStream_t _s=0) :
+      // Initializers are listed in declaration order, which is the order they run in.
+      shmem_size_{_shmem}, stream_{_s}, grids_{_grids, 1, 1}, blocks_{_blk, 1, 1} {}
+  /* \brief Configure a launch with explicit grid and block dimensions. */
+  LaunchKernel(dim3 _grids, dim3 _blk, size_t _shmem=0, cudaStream_t _s=0) :
+      shmem_size_{_shmem}, stream_{_s}, grids_{_grids}, blocks_{_blk} {}
+
+  /* \brief Launch `kernel` with `args`, or do nothing when the grid is empty.
+   *
+   * \param kernel The __global__ function to launch.
+   * \param args   Arguments forwarded by value to the kernel.
+   */
+  template <typename K, typename... Args>
+  void operator()(K kernel, Args... args) {
+    // Test every dimension on its own.  Multiplying them is done in 32-bit unsigned
+    // arithmetic and can wrap around to 0 for a valid, non-empty grid (for example
+    // 65536 x 32768 x 2), which would silently skip the launch.
+    if (XGBOOST_EXPECT(grids_.x == 0 || grids_.y == 0 || grids_.z == 0, false)) {
+      LOG(DEBUG) << "Skipping empty CUDA kernel.";
+      return;
+    }
+    kernel<<<grids_, blocks_, shmem_size_, stream_>>>(args...);  // NOLINT
+  }
+};
+
 template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
 inline void LaunchN(int device_idx, size_t n, cudaStream_t stream, L lambda) {
  if (n == 0) {
    return;
  }
-
  safe_cuda(cudaSetDevice(device_idx));
-
  const int GRID_SIZE =
      static_cast<int>(xgboost::common::DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
-  LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS, 0, stream>>>(static_cast<size_t>(0),
-                                                         n, lambda);
+  LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS, 0, stream>>>(  // NOLINT
+      static_cast<size_t>(0), n, lambda);
 }

 // Default stream version
@@ -301,6 +331,16 @@ inline detail::MemoryLogger &GlobalMemoryLogger() {
  return memory_logger;
 }

+// dh::DebugSyncDevice(__FILE__, __LINE__);
+/*!
+ * \brief Block until the device is idle and surface any pending CUDA error.
+ *
+ * When both a file name and a line number are supplied, the call site is
+ * written to the debug log together with the rabit rank before synchronising.
+ *
+ * \param file Source file of the call site; empty to skip logging.
+ * \param line Line number of the call site; -1 to skip logging.
+ */
+inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
+  bool const has_call_site = !file.empty() && line != -1;
+  if (has_call_site) {
+    auto worker_rank = rabit::GetRank();
+    LOG(DEBUG) << "R:" << worker_rank << ": " << file << ":" << line;
+  }
+  safe_cuda(cudaDeviceSynchronize());
+  safe_cuda(cudaGetLastError());
+}
+
 namespace detail{
 /**
 * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
@@ -763,7 +803,7 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
                      BLOCK_THREADS, segments, num_segments, count);

  LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, OffsetT>
-      <<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates,
+      <<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates,  // NOLINT
                                               segments + 1, f, num_segments);
 }

@@ -963,7 +1003,6 @@ class SaveCudaContext {
 * streams. Must be initialised before use. If XGBoost is compiled without NCCL
 * this is a dummy class that will error if used with more than one GPU.
 */
-
 class AllReducer {
  bool initialised_;
  size_t allreduce_bytes_;  // Keep statistics of the number of bytes communicated
@@ -986,31 +1025,9 @@ class AllReducer {
   *
   * \param device_ordinal The device ordinal.
   */
+  void Init(int _device_ordinal);

-  void Init(int _device_ordinal) {
-#ifdef XGBOOST_USE_NCCL
-    /** \brief this >monitor . init. */
-    device_ordinal = _device_ordinal;
-    id = GetUniqueId();
-    dh::safe_cuda(cudaSetDevice(device_ordinal));
-    dh::safe_nccl(ncclCommInitRank(&comm, rabit::GetWorldSize(), id, rabit::GetRank()));
-    safe_cuda(cudaStreamCreate(&stream));
-    initialised_ = true;
-#endif
-  }
-  ~AllReducer() {
-#ifdef XGBOOST_USE_NCCL
-    if (initialised_) {
-      dh::safe_cuda(cudaStreamDestroy(stream));
-      ncclCommDestroy(comm);
-    }
-    if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
-      LOG(CONSOLE) << "======== NCCL Statistics========";
-      LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
-      LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_/1048576;
-    }
-#endif
-  }
+  ~AllReducer();

  /**
   * \brief Allreduce. Use in exactly the same way as NCCL but without needing
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -293,6 +293,7 @@ void DenseCuts::Build(DMatrix* p_fmat, uint32_t max_num_bins) {

 void DenseCuts::Init
 (std::vector<WXQSketch>* in_sketchs, uint32_t max_num_bins) {
+  monitor_.Start(__func__);
  std::vector<WXQSketch>& sketchs = *in_sketchs;
  constexpr int kFactor = 8;
  // gather the histogram data
@@ -332,6 +333,7 @@ void DenseCuts::Init
    CHECK_GT(cut_size, p_cuts_->cut_ptrs_.back());
    p_cuts_->cut_ptrs_.push_back(cut_size);
  }
+  monitor_.Stop(__func__);
 }

 void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -252,8 +252,10 @@ class GPUSketcher {
      });
    } else if (n_cuts_cur_[icol] > 0) {
      // if more elements than cuts: use binary search on cumulative weights
-      int block = 256;
-      FindCutsK<<<common::DivRoundUp(n_cuts_cur_[icol], block), block>>>(
+      uint32_t constexpr kBlockThreads = 256;
+      uint32_t const kGrids = common::DivRoundUp(n_cuts_cur_[icol], kBlockThreads);
+      dh::LaunchKernel {kGrids, kBlockThreads} (
+          FindCutsK,
          cuts_d_.data().get() + icol * n_cuts_,
          fvalues_cur_.data().get(),
          weights2_.data().get(),
@@ -403,7 +405,8 @@ class GPUSketcher {
    // NOTE: This will typically support ~ 4M features - 64K*64
    dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
               common::DivRoundUp(num_cols_, block3.y), 1);
-    UnpackFeaturesK<<<grid3, block3>>>(
+    dh::LaunchKernel {grid3, block3} (
+        UnpackFeaturesK,
        fvalues_.data().get(),
        has_weights_ ? feature_weights_.data().get() : nullptr,
        row_ptrs_.data().get() + batch_row_begin,
--- a/src/common/timer.cc
+++ b/src/common/timer.cc
@@ -13,6 +13,20 @@
 namespace xgboost {
 namespace common {

+// Start the timer registered under `name`.  Does nothing unless debug-level
+// logging is enabled.
+void Monitor::Start(std::string const &name) {
+  if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
+    return;
+  }
+  auto &entry = statistics_map[name];
+  entry.timer.Start();
+}
+
+// Stop the timer registered under `name` and count one more invocation.  Does
+// nothing unless debug-level logging is enabled.
+void Monitor::Stop(const std::string &name) {
+  if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
+    return;
+  }
+  auto &entry = statistics_map[name];
+  entry.timer.Stop();
+  ++entry.count;
+}
+
 std::vector<Monitor::StatMap> Monitor::CollectFromOtherRanks() const {
  // Since other nodes might have started timers that this one haven't, so
  // we can't simply call all reduce.
--- a/src/common/timer.cu
+++ b/src/common/timer.cu
@@ -0,0 +1,38 @@
+/*!
+ * Copyright by Contributors 2019
+ */
+#if defined(XGBOOST_USE_NVTX)
+#include <nvToolsExt.h>
+#endif  // defined(XGBOOST_USE_NVTX)
+
+#include <string>
+
+#include "xgboost/logging.h"
+#include "device_helpers.cuh"
+#include "timer.h"
+
+namespace xgboost {
+namespace common {
+
+// Start the timer registered under `name` and, when built with NVTX, open an
+// NVTX range carrying the same name.  Does nothing unless debug-level logging
+// is enabled.
+void Monitor::StartCuda(const std::string& name) {
+  if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
+    return;
+  }
+  auto &entry = statistics_map[name];
+  entry.timer.Start();
+#if defined(XGBOOST_USE_NVTX)
+  entry.nvtx_id = nvtxRangeStartA(name.c_str());
+#endif  // defined(XGBOOST_USE_NVTX)
+}
+
+// Stop the timer registered under `name`, count one more invocation and, when
+// built with NVTX, close the NVTX range stored for that entry.  Does nothing
+// unless debug-level logging is enabled.
+void Monitor::StopCuda(const std::string& name) {
+  if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
+    return;
+  }
+  auto &entry = statistics_map[name];
+  entry.timer.Stop();
+  ++entry.count;
+#if defined(XGBOOST_USE_NVTX)
+  nvtxRangeEnd(entry.nvtx_id);
+#endif  // defined(XGBOOST_USE_NVTX)
+}
+}  // namespace common
+}  // namespace xgboost
--- a/src/common/timer.h
+++ b/src/common/timer.h
@@ -10,10 +10,6 @@
 #include <utility>
 #include <vector>

-#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
-#include <nvToolsExt.h>
-#endif  // defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
-
 namespace xgboost {
 namespace common {

@@ -84,37 +80,10 @@ struct Monitor {
  void Print() const;

  void Init(std::string label) { this->label = label; }
-  void Start(const std::string &name) {
-    if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
-      statistics_map[name].timer.Start();
-    }
-  }
-  void Stop(const std::string &name) {
-    if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
-      auto &stats = statistics_map[name];
-      stats.timer.Stop();
-      stats.count++;
-    }
-  }
-  void StartCuda(const std::string &name) {
-    if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
-      auto &stats = statistics_map[name];
-      stats.timer.Start();
-#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
-      stats.nvtx_id = nvtxRangeStartA(name.c_str());
-#endif  // defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
-    }
-  }
-  void StopCuda(const std::string &name) {
-    if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
-      auto &stats = statistics_map[name];
-      stats.timer.Stop();
-      stats.count++;
-#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
-      nvtxRangeEnd(stats.nvtx_id);
-#endif  // defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
-    }
-  }
+  void Start(const std::string &name);
+  void Stop(const std::string &name);
+  void StartCuda(const std::string &name);
+  void StopCuda(const std::string &name);
 };
 }  // namespace common
 }  // namespace xgboost
--- a/src/common/transform.h
+++ b/src/common/transform.h
@@ -133,9 +133,12 @@ class Transform {
      size_t shard_size = range_size;
      Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
      dh::safe_cuda(cudaSetDevice(device_));
-      const int GRID_SIZE =
+      const int kGrids =
          static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
-      detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
+      if (kGrids == 0) {
+        return;
+      }
+      detail::LaunchCUDAKernel<<<kGrids, kBlockThreads>>>(  // NOLINT
          _func, shard_range, UnpackHDVOnDevice(_vectors)...);
    }
 #else