Use CUDA virtual memory for pinned memory allocation. (#10850)
- Add a grow-only virtual memory allocator. - Define a driver API wrapper. Split up the runtime API wrapper.
This commit is contained in:
@@ -3,6 +3,11 @@
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <numeric> // for iota
|
||||
#include <thrust/detail/sequence.inl> // for sequence
|
||||
|
||||
#include "../../../src/common/cuda_rt_utils.h" // for DrVersion
|
||||
#include "../../../src/common/device_helpers.cuh" // for CachingThrustPolicy, PinnedMemory
|
||||
#include "../../../src/common/device_vector.cuh"
|
||||
#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore
|
||||
|
||||
@@ -18,4 +23,96 @@ TEST(DeviceUVector, Basic) {
|
||||
ASSERT_EQ(peak, n_bytes);
|
||||
std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
|
||||
}
|
||||
|
||||
#if defined(__linux__)
|
||||
namespace {
// Parametrized fixture exercising the grow-only virtual-memory vector for a
// given memory location (device or host NUMA). It grows the vector in steps
// and checks, after each growth, that previously written data survives and
// that the reserved capacity tracks the allocation granularity.
class TestVirtualMem : public ::testing::TestWithParam<CUmemLocationType> {
 public:
  void Run() {
    auto type = this->GetParam();
    detail::GrowOnlyVirtualMemVec vec{type};
    auto prop = xgboost::cudr::MakeAllocProp(type);
    auto gran = xgboost::cudr::GetAllocGranularity(&prop);
    ASSERT_GE(gran, 2);

    // First request is deliberately smaller than the allocation granularity.
    auto data = vec.GetSpan<std::int32_t>(32);
    ASSERT_EQ(data.size(), 32);
    static_assert(std::is_same_v<typename decltype(data)::value_type, std::int32_t>);

    std::vector<std::int32_t> h_data(data.size());
    // The host mirror must hold the running sequence 0, 1, 2, ...
    auto verify = [&] {
      for (std::size_t i = 0; i < h_data.size(); ++i) {
        ASSERT_EQ(h_data[i], i);
      }
    };
    // Continue the sequence from `n_orig` in the newly grown tail of `s`,
    // then mirror the whole span back into `h_data` for verification.
    auto write_seq = [&](std::int32_t n_orig, xgboost::common::Span<std::int32_t> s) {
      if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
        thrust::sequence(dh::CachingThrustPolicy(), s.data() + n_orig, s.data() + s.size(),
                         n_orig);
        dh::safe_cuda(cudaMemcpy(h_data.data(), s.data(), s.size_bytes(), cudaMemcpyDefault));
      } else {
        std::iota(s.data() + n_orig, s.data() + s.size(), n_orig);
        std::copy_n(s.data(), s.size(), h_data.data());
      }
    };

    write_seq(0, data);
    verify();

    auto n_orig = data.size();
    // Still smaller than the granularity: the already reserved region is reused.
    data = vec.GetSpan<std::int32_t>(128);
    h_data.resize(data.size());
    write_seq(n_orig, data);
    verify();
    if (128 < gran) {
      ASSERT_EQ(vec.Capacity(), gran);
    }

    // Grow past the first reservation; capacity doubles relative to gran.
    n_orig = data.size();
    data = vec.GetSpan<std::int32_t>(gran / 2);
    h_data.resize(data.size());
    write_seq(n_orig, data);
    verify();
    ASSERT_EQ(vec.Capacity(), gran * 2);

    // Grow once more; capacity doubles again.
    n_orig = data.size();
    data = vec.GetSpan<std::int32_t>(gran);
    h_data.resize(data.size());
    write_seq(n_orig, data);
    verify();
    ASSERT_EQ(vec.Capacity(), gran * 4);
  }
};
}  // anonymous namespace
|
||||
|
||||
// Run the growth/round-trip scenario for each parametrized memory location type.
TEST_P(TestVirtualMem, Alloc) { this->Run(); }
|
||||
|
||||
// Instantiate the suite once per supported memory location, with a
// human-readable suffix for each parameter in the generated test names.
INSTANTIATE_TEST_SUITE_P(
    Basic, TestVirtualMem,
    ::testing::Values(CU_MEM_LOCATION_TYPE_DEVICE, CU_MEM_LOCATION_TYPE_HOST_NUMA),
    [](::testing::TestParamInfo<TestVirtualMem::ParamType> const& info) -> char const* {
      if (info.param == CU_MEM_LOCATION_TYPE_DEVICE) {
        return "Device";
      }
      if (info.param == CU_MEM_LOCATION_TYPE_HOST_NUMA) {
        return "HostNuma";
      }
      LOG(FATAL) << "unreachable";
      return nullptr;
    });
|
||||
#endif // defined(__linux__)
|
||||
|
||||
// Pinned memory should be backed by the VM allocator iff the driver supports
// CUDA >= 12.5.
TEST(TestVirtualMem, Version) {
  std::int32_t major, minor;
  xgboost::curt::DrVersion(&major, &minor);
  LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor;
  PinnedMemory pinned;
  // Lexicographic (major, minor) comparison against 12.5. Note that the
  // previous form `major >= 12 && minor >= 5` is wrong: it rejects newer
  // majors with a small minor (e.g. 13.0, where minor 0 < 5).
  if (major > 12 || (major == 12 && minor >= 5)) {
    ASSERT_TRUE(pinned.IsVm());
  } else {
    ASSERT_FALSE(pinned.IsVm());
  }
}
|
||||
} // namespace dh
|
||||
|
||||
@@ -578,7 +578,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
|
||||
|
||||
namespace {
|
||||
auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
|
||||
common::SetDevice(ctx->Ordinal());
|
||||
curt::SetDevice(ctx->Ordinal());
|
||||
auto n = n_samples * n_features;
|
||||
std::vector<float> x;
|
||||
x.resize(n);
|
||||
|
||||
@@ -100,7 +100,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
|
||||
}
|
||||
|
||||
void TestHostDeviceVector(size_t n, DeviceOrd device) {
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
|
||||
HostDeviceVector<int> v;
|
||||
InitHostDeviceVector(n, device, &v);
|
||||
CheckDevice(&v, n, 0, GPUAccess::kRead);
|
||||
@@ -119,7 +119,7 @@ TEST(HostDeviceVector, Basic) {
|
||||
TEST(HostDeviceVector, Copy) {
|
||||
size_t n = 1001;
|
||||
auto device = DeviceOrd::CUDA(0);
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
|
||||
|
||||
HostDeviceVector<int> v;
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user