RMM integration plugin (#5873)

* [CI] Add RMM as an optional dependency * Replace caching allocator with pool allocator from RMM * Revert "Replace caching allocator with pool allocator from RMM" This reverts commit e15845d4e72e890c2babe31a988b26503a7d9038. * Use rmm::mr::get_default_resource() * Try setting default resource (doesn't work yet) * Allocate pool_mr in the heap * Prevent leaking pool_mr handle * Separate EXPECT_DEATH() in separate test suite suffixed DeathTest * Turn off death tests for RMM * Address reviewer's feedback * Prevent leaking of cuda_mr * Fix Jenkinsfile syntax * Remove unnecessary function in Jenkinsfile * [CI] Install NCCL into RMM container * Run Python tests * Try building with RMM, CUDA 10.0 * Do not use RMM for CUDA 10.0 target * Actually test for test_rmm flag * Fix TestPythonGPU * Use CNMeM allocator, since pool allocator doesn't yet support multiGPU * Use 10.0 container to build RMM-enabled XGBoost * Revert "Use 10.0 container to build RMM-enabled XGBoost" This reverts commit 789021fa31112e25b683aef39fff375403060141. * Fix Jenkinsfile * [CI] Assign larger /dev/shm to NCCL * Use 10.2 artifact to run multi-GPU Python tests * Add CUDA 10.0 -> 11.0 cross-version test; remove CUDA 10.0 target * Rename Conda env rmm_test -> gpu_test * Use env var to opt into CNMeM pool for C++ tests * Use identical CUDA version for RMM builds and tests * Use Pytest fixtures to enable RMM pool in Python tests * Move RMM to plugin/CMakeLists.txt; use PLUGIN_RMM * Use per-device MR; use command arg in gtest * Set CMake prefix path to use Conda env * Use 0.15 nightly version of RMM * Remove unnecessary header * Fix a unit test when cudf is missing * Add RMM demos * Remove print() * Use HostDeviceVector in GPU predictor * Simplify pytest setup; use LocalCUDACluster fixture * Address reviewers' commments Co-authored-by: Hyunsu Cho <chohyu01@cs.wasshington.edu>
2020-08-12 01:26:02 -07:00
parent c3ea3b7e37
commit 9adb812a0a
26 changed files with 508 additions and 140 deletions
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -36,7 +36,12 @@

 #ifdef XGBOOST_USE_NCCL
 #include "nccl.h"
-#endif
+#endif  // XGBOOST_USE_NCCL
+
+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+#include "rmm/mr/device/per_device_resource.hpp"
+#include "rmm/mr/device/thrust_allocator_adaptor.hpp"
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1

 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)

@@ -370,12 +375,21 @@ inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
 }

 namespace detail {
+
+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+template <typename T>
+using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
+#else  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+template <typename T>
+using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+
 /**
 * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
 */
 template <class T>
-struct XGBDefaultDeviceAllocatorImpl : thrust::device_malloc_allocator<T> {
-  using SuperT = thrust::device_malloc_allocator<T>;
+struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
+  using SuperT = XGBBaseDeviceAllocator<T>;
  using pointer = thrust::device_ptr<T>;  // NOLINT
  template<typename U>
  struct rebind  // NOLINT
@@ -391,10 +405,15 @@ struct XGBDefaultDeviceAllocatorImpl : thrust::device_malloc_allocator<T> {
    GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
    return SuperT::deallocate(ptr, n);
  }
+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+  XGBDefaultDeviceAllocatorImpl()
+    : SuperT(rmm::mr::get_current_device_resource(), cudaStream_t{0}) {}
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
 };

 /**
- * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end and logs allocations if verbose. Does not initialise memory on construction.
+ * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end and logs
+ * allocations if verbose. Does not initialise memory on construction.
 */
 template <class T>
 struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator<T> {
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -11,6 +11,7 @@

 #include "xgboost/data.h"
 #include "xgboost/host_device_vector.h"
+#include "xgboost/tree_model.h"
 #include "device_helpers.cuh"

 namespace xgboost {
@@ -402,6 +403,7 @@ template class HostDeviceVector<FeatureType>;
 template class HostDeviceVector<Entry>;
 template class HostDeviceVector<uint64_t>;  // bst_row_t
 template class HostDeviceVector<uint32_t>;  // bst_feature_t
+template class HostDeviceVector<RegTree::Node>;

 #if defined(__APPLE__)
 /*
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -213,39 +213,21 @@ __global__ void PredictKernel(Data data,

 class DeviceModel {
 public:
-  dh::device_vector<RegTree::Node> nodes;
-  dh::device_vector<size_t> tree_segments;
-  dh::device_vector<int> tree_group;
+  // Need to lazily construct the vectors because GPU id is only known at runtime
+  HostDeviceVector<RegTree::Node> nodes;
+  HostDeviceVector<size_t> tree_segments;
+  HostDeviceVector<int> tree_group;
  size_t tree_beg_;  // NOLINT
  size_t tree_end_;  // NOLINT
  int num_group;

-  void CopyModel(const gbm::GBTreeModel& model,
-                 const thrust::host_vector<size_t>& h_tree_segments,
-                 const thrust::host_vector<RegTree::Node>& h_nodes,
-                 size_t tree_begin, size_t tree_end) {
-    nodes.resize(h_nodes.size());
-    dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
-                                  sizeof(RegTree::Node) * h_nodes.size(),
-                                  cudaMemcpyHostToDevice));
-    tree_segments.resize(h_tree_segments.size());
-    dh::safe_cuda(cudaMemcpyAsync(tree_segments.data().get(), h_tree_segments.data(),
-                                  sizeof(size_t) * h_tree_segments.size(),
-                                  cudaMemcpyHostToDevice));
-    tree_group.resize(model.tree_info.size());
-    dh::safe_cuda(cudaMemcpyAsync(tree_group.data().get(), model.tree_info.data(),
-                                  sizeof(int) * model.tree_info.size(),
-                                  cudaMemcpyHostToDevice));
-    this->tree_beg_ = tree_begin;
-    this->tree_end_ = tree_end;
-    this->num_group = model.learner_model_param->num_output_group;
-  }
-
  void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
    dh::safe_cuda(cudaSetDevice(gpu_id));
+
    CHECK_EQ(model.param.size_leaf_vector, 0);
    // Copy decision trees to device
-    thrust::host_vector<size_t> h_tree_segments{};
+    tree_segments = std::move(HostDeviceVector<size_t>({}, gpu_id));
+    auto& h_tree_segments = tree_segments.HostVector();
    h_tree_segments.reserve((tree_end - tree_begin) + 1);
    size_t sum = 0;
    h_tree_segments.push_back(sum);
@@ -254,13 +236,21 @@ class DeviceModel {
      h_tree_segments.push_back(sum);
    }

-    thrust::host_vector<RegTree::Node> h_nodes(h_tree_segments.back());
+    nodes = std::move(HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(),
+                                                      gpu_id));
+    auto& h_nodes = nodes.HostVector();
    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
      auto& src_nodes = model.trees.at(tree_idx)->GetNodes();
      std::copy(src_nodes.begin(), src_nodes.end(),
                h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]);
    }
-    CopyModel(model, h_tree_segments, h_nodes, tree_begin, tree_end);
+
+    tree_group = std::move(HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id));
+    auto& h_tree_group = tree_group.HostVector();
+    std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size());
+    this->tree_beg_ = tree_begin;
+    this->tree_end_ = tree_end;
+    this->num_group = model.learner_model_param->num_output_group;
  }
 };

@@ -287,8 +277,8 @@ class GPUPredictor : public xgboost::Predictor {
    dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} (
        PredictKernel<SparsePageLoader, SparsePageView>,
        data,
-        dh::ToSpan(model_.nodes), predictions->DeviceSpan().subspan(batch_offset),
-        dh::ToSpan(model_.tree_segments), dh::ToSpan(model_.tree_group),
+        model_.nodes.DeviceSpan(), predictions->DeviceSpan().subspan(batch_offset),
+        model_.tree_segments.DeviceSpan(), model_.tree_group.DeviceSpan(),
        model_.tree_beg_, model_.tree_end_, num_features, num_rows,
        entry_start, use_shared, model_.num_group);
  }
@@ -303,8 +293,8 @@ class GPUPredictor : public xgboost::Predictor {
    dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS} (
        PredictKernel<EllpackLoader, EllpackDeviceAccessor>,
        batch,
-        dh::ToSpan(model_.nodes), out_preds->DeviceSpan().subspan(batch_offset),
-        dh::ToSpan(model_.tree_segments), dh::ToSpan(model_.tree_group),
+        model_.nodes.DeviceSpan(), out_preds->DeviceSpan().subspan(batch_offset),
+        model_.tree_segments.DeviceSpan(), model_.tree_group.DeviceSpan(),
        model_.tree_beg_, model_.tree_end_, batch.NumFeatures(), num_rows,
        entry_start, use_shared, model_.num_group);
  }
@@ -435,8 +425,8 @@ class GPUPredictor : public xgboost::Predictor {
    dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} (
        PredictKernel<Loader, typename Loader::BatchT>,
        m->Value(),
-        dh::ToSpan(d_model.nodes), out_preds->predictions.DeviceSpan(),
-        dh::ToSpan(d_model.tree_segments), dh::ToSpan(d_model.tree_group),
+        d_model.nodes.DeviceSpan(), out_preds->predictions.DeviceSpan(),
+        d_model.tree_segments.DeviceSpan(), d_model.tree_group.DeviceSpan(),
        tree_begin, tree_end, m->NumColumns(), info.num_row_,
        entry_start, use_shared, output_groups);
  }