RMM integration plugin (#5873)

* [CI] Add RMM as an optional dependency

* Replace caching allocator with pool allocator from RMM

* Revert "Replace caching allocator with pool allocator from RMM"

This reverts commit e15845d4e72e890c2babe31a988b26503a7d9038.

* Use rmm::mr::get_default_resource()

* Try setting default resource (doesn't work yet)

* Allocate pool_mr in the heap

* Prevent leaking pool_mr handle

* Separate EXPECT_DEATH() in separate test suite suffixed DeathTest

* Turn off death tests for RMM

* Address reviewer's feedback

* Prevent leaking of cuda_mr

* Fix Jenkinsfile syntax

* Remove unnecessary function in Jenkinsfile

* [CI] Install NCCL into RMM container

* Run Python tests

* Try building with RMM, CUDA 10.0

* Do not use RMM for CUDA 10.0 target

* Actually test for test_rmm flag

* Fix TestPythonGPU

* Use CNMeM allocator, since pool allocator doesn't yet support multiGPU

* Use 10.0 container to build RMM-enabled XGBoost

* Revert "Use 10.0 container to build RMM-enabled XGBoost"

This reverts commit 789021fa31112e25b683aef39fff375403060141.

* Fix Jenkinsfile

* [CI] Assign larger /dev/shm to NCCL

* Use 10.2 artifact to run multi-GPU Python tests

* Add CUDA 10.0 -> 11.0 cross-version test; remove CUDA 10.0 target

* Rename Conda env rmm_test -> gpu_test

* Use env var to opt into CNMeM pool for C++ tests

* Use identical CUDA version for RMM builds and tests

* Use Pytest fixtures to enable RMM pool in Python tests

* Move RMM to plugin/CMakeLists.txt; use PLUGIN_RMM

* Use per-device MR; use command arg in gtest

* Set CMake prefix path to use Conda env

* Use 0.15 nightly version of RMM

* Remove unnecessary header

* Fix a unit test when cudf is missing

* Add RMM demos

* Remove print()

* Use HostDeviceVector in GPU predictor

* Simplify pytest setup; use LocalCUDACluster fixture

* Address reviewers' commments

Co-authored-by: Hyunsu Cho <chohyu01@cs.wasshington.edu>
This commit is contained in:
Philip Hyunsu Cho
2020-08-12 01:26:02 -07:00
committed by GitHub
parent c3ea3b7e37
commit 9adb812a0a
26 changed files with 508 additions and 140 deletions

View File

@@ -20,6 +20,15 @@
#include "../../src/gbm/gbtree_model.h"
#include "xgboost/predictor.h"
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#include <memory>
#include <numeric>
#include <vector>
#include "rmm/mr/device/per_device_resource.hpp"
#include "rmm/mr/device/cuda_memory_resource.hpp"
#include "rmm/mr/device/pool_memory_resource.hpp"
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
bool FileExists(const std::string& filename) {
struct stat st;
return stat(filename.c_str(), &st) == 0;
@@ -478,4 +487,57 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(
return gbm;
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
using CUDAMemoryResource = rmm::mr::cuda_memory_resource;
using PoolMemoryResource = rmm::mr::pool_memory_resource<CUDAMemoryResource>;
class RMMAllocator {
public:
std::vector<std::unique_ptr<CUDAMemoryResource>> cuda_mr;
std::vector<std::unique_ptr<PoolMemoryResource>> pool_mr;
int n_gpu;
RMMAllocator() : n_gpu(common::AllVisibleGPUs()) {
int current_device;
CHECK_EQ(cudaGetDevice(&current_device), cudaSuccess);
for (int i = 0; i < n_gpu; ++i) {
CHECK_EQ(cudaSetDevice(i), cudaSuccess);
cuda_mr.push_back(std::make_unique<CUDAMemoryResource>());
pool_mr.push_back(std::make_unique<PoolMemoryResource>(cuda_mr[i].get()));
}
CHECK_EQ(cudaSetDevice(current_device), cudaSuccess);
}
~RMMAllocator() = default;
};
void DeleteRMMResource(RMMAllocator* r) {
delete r;
}
RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv) {
bool use_rmm_pool = false;
for (int i = 1; i < argc; ++i) {
if (argv[i] == std::string("--use-rmm-pool")) {
use_rmm_pool = true;
}
}
if (!use_rmm_pool) {
return RMMAllocatorPtr(nullptr, DeleteRMMResource);
}
LOG(INFO) << "Using RMM memory pool";
auto ptr = RMMAllocatorPtr(new RMMAllocator(), DeleteRMMResource);
for (int i = 0; i < ptr->n_gpu; ++i) {
rmm::mr::set_per_device_resource(rmm::cuda_device_id(i), ptr->pool_mr[i].get());
}
return ptr;
}
#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
class RMMAllocator {};
void DeleteRMMResource(RMMAllocator* r) {}
RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv) {
return RMMAllocatorPtr(nullptr, DeleteRMMResource);
}
#endif // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1
} // namespace xgboost