Add use_rmm flag to global configuration (#6656)

* Ensure RMM is 0.18 or later

* Add use_rmm flag to global configuration

* Modify XGBCachingDeviceAllocatorImpl to skip CUB when use_rmm=True

* Update the demo

* [CI] Pin NumPy to 1.19.4, since NumPy 1.19.5 doesn't work with latest Shap
This commit is contained in:
Philip Hyunsu Cho
2021-03-09 14:53:05 -08:00
committed by GitHub
parent e4894111ba
commit 366f3cb9d8
12 changed files with 117 additions and 20 deletions

View File

@@ -32,6 +32,7 @@
#include "xgboost/logging.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/span.h"
#include "xgboost/global_config.h"
#include "common.h"
@@ -42,6 +43,14 @@
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#include "rmm/mr/device/per_device_resource.hpp"
#include "rmm/mr/device/thrust_allocator_adaptor.hpp"
#include "rmm/version_config.hpp"
#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
#error "Please use RMM version 0.18 or later"
#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
#error "Please use RMM version 0.18 or later"
#endif // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
@@ -453,21 +462,42 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
return *allocator;
}
pointer allocate(size_t n) { // NOLINT
T* ptr;
auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&ptr),
n * sizeof(T));
if (errc != cudaSuccess) {
ThrowOOMError("Caching allocator", n * sizeof(T));
pointer thrust_ptr;
if (use_cub_allocator_) {
T* raw_ptr{nullptr};
auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
n * sizeof(T));
if (errc != cudaSuccess) {
ThrowOOMError("Caching allocator", n * sizeof(T));
}
thrust_ptr = pointer(raw_ptr);
} else {
try {
thrust_ptr = SuperT::allocate(n);
dh::safe_cuda(cudaGetLastError());
} catch (const std::exception &e) {
ThrowOOMError(e.what(), n * sizeof(T));
}
}
pointer thrust_ptr{ ptr };
GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
return thrust_ptr;
}
void deallocate(pointer ptr, size_t n) { // NOLINT
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
GetGlobalCachingAllocator().DeviceFree(ptr.get());
if (use_cub_allocator_) {
GetGlobalCachingAllocator().DeviceFree(ptr.get());
} else {
SuperT::deallocate(ptr, n);
}
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBCachingDeviceAllocatorImpl()
: SuperT(rmm::cuda_stream_default, rmm::mr::get_current_device_resource()),
use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBOOST_DEVICE void construct(T *) {} // NOLINT
private:
bool use_cub_allocator_{true};
};
} // namespace detail