RMM integration plugin (#5873)

* [CI] Add RMM as an optional dependency

* Replace caching allocator with pool allocator from RMM

* Revert "Replace caching allocator with pool allocator from RMM"

This reverts commit e15845d4e72e890c2babe31a988b26503a7d9038.

* Use rmm::mr::get_default_resource()

* Try setting default resource (doesn't work yet)

* Allocate pool_mr in the heap

* Prevent leaking pool_mr handle

* Separate EXPECT_DEATH() in separate test suite suffixed DeathTest

* Turn off death tests for RMM

* Address reviewer's feedback

* Prevent leaking of cuda_mr

* Fix Jenkinsfile syntax

* Remove unnecessary function in Jenkinsfile

* [CI] Install NCCL into RMM container

* Run Python tests

* Try building with RMM, CUDA 10.0

* Do not use RMM for CUDA 10.0 target

* Actually test for test_rmm flag

* Fix TestPythonGPU

* Use CNMeM allocator, since pool allocator doesn't yet support multiGPU

* Use 10.0 container to build RMM-enabled XGBoost

* Revert "Use 10.0 container to build RMM-enabled XGBoost"

This reverts commit 789021fa31112e25b683aef39fff375403060141.

* Fix Jenkinsfile

* [CI] Assign larger /dev/shm to NCCL

* Use 10.2 artifact to run multi-GPU Python tests

* Add CUDA 10.0 -> 11.0 cross-version test; remove CUDA 10.0 target

* Rename Conda env rmm_test -> gpu_test

* Use env var to opt into CNMeM pool for C++ tests

* Use identical CUDA version for RMM builds and tests

* Use Pytest fixtures to enable RMM pool in Python tests

* Move RMM to plugin/CMakeLists.txt; use PLUGIN_RMM

* Use per-device MR; use command arg in gtest

* Set CMake prefix path to use Conda env

* Use 0.15 nightly version of RMM

* Remove unnecessary header

* Fix a unit test when cudf is missing

* Add RMM demos

* Remove print()

* Use HostDeviceVector in GPU predictor

* Simplify pytest setup; use LocalCUDACluster fixture

* Address reviewers' commments

Co-authored-by: Hyunsu Cho <chohyu01@cs.wasshington.edu>
This commit is contained in:
Philip Hyunsu Cho
2020-08-12 01:26:02 -07:00
committed by GitHub
parent c3ea3b7e37
commit 9adb812a0a
26 changed files with 508 additions and 140 deletions

View File

@@ -36,7 +36,12 @@
#ifdef XGBOOST_USE_NCCL
#include "nccl.h"
#endif
#endif // XGBOOST_USE_NCCL
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#include "rmm/mr/device/per_device_resource.hpp"
#include "rmm/mr/device/thrust_allocator_adaptor.hpp"
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
@@ -370,12 +375,21 @@ inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
}
namespace detail {
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
template <typename T>
using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
template <typename T>
using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
/**
* \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
*/
template <class T>
struct XGBDefaultDeviceAllocatorImpl : thrust::device_malloc_allocator<T> {
using SuperT = thrust::device_malloc_allocator<T>;
struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
using SuperT = XGBBaseDeviceAllocator<T>;
using pointer = thrust::device_ptr<T>; // NOLINT
template<typename U>
struct rebind // NOLINT
@@ -391,10 +405,15 @@ struct XGBDefaultDeviceAllocatorImpl : thrust::device_malloc_allocator<T> {
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
return SuperT::deallocate(ptr, n);
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBDefaultDeviceAllocatorImpl()
: SuperT(rmm::mr::get_current_device_resource(), cudaStream_t{0}) {}
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
};
/**
* \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end and logs allocations if verbose. Does not initialise memory on construction.
* \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end and logs
* allocations if verbose. Does not initialise memory on construction.
*/
template <class T>
struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator<T> {

View File

@@ -11,6 +11,7 @@
#include "xgboost/data.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/tree_model.h"
#include "device_helpers.cuh"
namespace xgboost {
@@ -402,6 +403,7 @@ template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>; // bst_row_t
template class HostDeviceVector<uint32_t>; // bst_feature_t
template class HostDeviceVector<RegTree::Node>;
#if defined(__APPLE__)
/*

View File

@@ -213,39 +213,21 @@ __global__ void PredictKernel(Data data,
class DeviceModel {
public:
dh::device_vector<RegTree::Node> nodes;
dh::device_vector<size_t> tree_segments;
dh::device_vector<int> tree_group;
// Need to lazily construct the vectors because GPU id is only known at runtime
HostDeviceVector<RegTree::Node> nodes;
HostDeviceVector<size_t> tree_segments;
HostDeviceVector<int> tree_group;
size_t tree_beg_; // NOLINT
size_t tree_end_; // NOLINT
int num_group;
void CopyModel(const gbm::GBTreeModel& model,
const thrust::host_vector<size_t>& h_tree_segments,
const thrust::host_vector<RegTree::Node>& h_nodes,
size_t tree_begin, size_t tree_end) {
nodes.resize(h_nodes.size());
dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
sizeof(RegTree::Node) * h_nodes.size(),
cudaMemcpyHostToDevice));
tree_segments.resize(h_tree_segments.size());
dh::safe_cuda(cudaMemcpyAsync(tree_segments.data().get(), h_tree_segments.data(),
sizeof(size_t) * h_tree_segments.size(),
cudaMemcpyHostToDevice));
tree_group.resize(model.tree_info.size());
dh::safe_cuda(cudaMemcpyAsync(tree_group.data().get(), model.tree_info.data(),
sizeof(int) * model.tree_info.size(),
cudaMemcpyHostToDevice));
this->tree_beg_ = tree_begin;
this->tree_end_ = tree_end;
this->num_group = model.learner_model_param->num_output_group;
}
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
dh::safe_cuda(cudaSetDevice(gpu_id));
CHECK_EQ(model.param.size_leaf_vector, 0);
// Copy decision trees to device
thrust::host_vector<size_t> h_tree_segments{};
tree_segments = std::move(HostDeviceVector<size_t>({}, gpu_id));
auto& h_tree_segments = tree_segments.HostVector();
h_tree_segments.reserve((tree_end - tree_begin) + 1);
size_t sum = 0;
h_tree_segments.push_back(sum);
@@ -254,13 +236,21 @@ class DeviceModel {
h_tree_segments.push_back(sum);
}
thrust::host_vector<RegTree::Node> h_nodes(h_tree_segments.back());
nodes = std::move(HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(),
gpu_id));
auto& h_nodes = nodes.HostVector();
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
auto& src_nodes = model.trees.at(tree_idx)->GetNodes();
std::copy(src_nodes.begin(), src_nodes.end(),
h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]);
}
CopyModel(model, h_tree_segments, h_nodes, tree_begin, tree_end);
tree_group = std::move(HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id));
auto& h_tree_group = tree_group.HostVector();
std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size());
this->tree_beg_ = tree_begin;
this->tree_end_ = tree_end;
this->num_group = model.learner_model_param->num_output_group;
}
};
@@ -287,8 +277,8 @@ class GPUPredictor : public xgboost::Predictor {
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} (
PredictKernel<SparsePageLoader, SparsePageView>,
data,
dh::ToSpan(model_.nodes), predictions->DeviceSpan().subspan(batch_offset),
dh::ToSpan(model_.tree_segments), dh::ToSpan(model_.tree_group),
model_.nodes.DeviceSpan(), predictions->DeviceSpan().subspan(batch_offset),
model_.tree_segments.DeviceSpan(), model_.tree_group.DeviceSpan(),
model_.tree_beg_, model_.tree_end_, num_features, num_rows,
entry_start, use_shared, model_.num_group);
}
@@ -303,8 +293,8 @@ class GPUPredictor : public xgboost::Predictor {
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS} (
PredictKernel<EllpackLoader, EllpackDeviceAccessor>,
batch,
dh::ToSpan(model_.nodes), out_preds->DeviceSpan().subspan(batch_offset),
dh::ToSpan(model_.tree_segments), dh::ToSpan(model_.tree_group),
model_.nodes.DeviceSpan(), out_preds->DeviceSpan().subspan(batch_offset),
model_.tree_segments.DeviceSpan(), model_.tree_group.DeviceSpan(),
model_.tree_beg_, model_.tree_end_, batch.NumFeatures(), num_rows,
entry_start, use_shared, model_.num_group);
}
@@ -435,8 +425,8 @@ class GPUPredictor : public xgboost::Predictor {
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} (
PredictKernel<Loader, typename Loader::BatchT>,
m->Value(),
dh::ToSpan(d_model.nodes), out_preds->predictions.DeviceSpan(),
dh::ToSpan(d_model.tree_segments), dh::ToSpan(d_model.tree_group),
d_model.nodes.DeviceSpan(), out_preds->predictions.DeviceSpan(),
d_model.tree_segments.DeviceSpan(), d_model.tree_group.DeviceSpan(),
tree_begin, tree_end, m->NumColumns(), info.num_row_,
entry_start, use_shared, output_groups);
}