RMM integration plugin (#5873)

* [CI] Add RMM as an optional dependency * Replace caching allocator with pool allocator from RMM * Revert "Replace caching allocator with pool allocator from RMM" This reverts commit e15845d4e72e890c2babe31a988b26503a7d9038. * Use rmm::mr::get_default_resource() * Try setting default resource (doesn't work yet) * Allocate pool_mr in the heap * Prevent leaking pool_mr handle * Separate EXPECT_DEATH() in separate test suite suffixed DeathTest * Turn off death tests for RMM * Address reviewer's feedback * Prevent leaking of cuda_mr * Fix Jenkinsfile syntax * Remove unnecessary function in Jenkinsfile * [CI] Install NCCL into RMM container * Run Python tests * Try building with RMM, CUDA 10.0 * Do not use RMM for CUDA 10.0 target * Actually test for test_rmm flag * Fix TestPythonGPU * Use CNMeM allocator, since pool allocator doesn't yet support multiGPU * Use 10.0 container to build RMM-enabled XGBoost * Revert "Use 10.0 container to build RMM-enabled XGBoost" This reverts commit 789021fa31112e25b683aef39fff375403060141. * Fix Jenkinsfile * [CI] Assign larger /dev/shm to NCCL * Use 10.2 artifact to run multi-GPU Python tests * Add CUDA 10.0 -> 11.0 cross-version test; remove CUDA 10.0 target * Rename Conda env rmm_test -> gpu_test * Use env var to opt into CNMeM pool for C++ tests * Use identical CUDA version for RMM builds and tests * Use Pytest fixtures to enable RMM pool in Python tests * Move RMM to plugin/CMakeLists.txt; use PLUGIN_RMM * Use per-device MR; use command arg in gtest * Set CMake prefix path to use Conda env * Use 0.15 nightly version of RMM * Remove unnecessary header * Fix a unit test when cudf is missing * Add RMM demos * Remove print() * Use HostDeviceVector in GPU predictor * Simplify pytest setup; use LocalCUDACluster fixture * Address reviewers' comments Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
2020-08-12 01:26:02 -07:00
parent c3ea3b7e37
commit 9adb812a0a
26 changed files with 508 additions and 140 deletions
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@@ -17,8 +17,8 @@ ENV PATH=/opt/python/bin:$PATH

 # Create new Conda environment with cuDF, Dask, and cuPy
 RUN \
-    conda create -n gpu_test -c rapidsai -c nvidia -c conda-forge -c defaults \
-        python=3.7 cudf=0.14 cudatoolkit=$CUDA_VERSION dask dask-cuda dask-cudf cupy \
+    conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
+        python=3.7 cudf=0.15* rmm=0.15* cudatoolkit=$CUDA_VERSION dask dask-cuda dask-cudf cupy \
        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis

 ENV GOSU_VERSION 1.10
--- a/tests/ci_build/Dockerfile.rmm
+++ b/tests/ci_build/Dockerfile.rmm
@@ -0,0 +1,47 @@
+ARG CUDA_VERSION
+FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu16.04
+ARG CUDA_VERSION
+
+# Environment; Bash is the shell for all RUN instructions below
+ENV DEBIAN_FRONTEND=noninteractive
+SHELL ["/bin/bash", "-c"]
+
+# Install all basic requirements; each installer script is deleted once it has run
+RUN \
+    apt-get update && \
+    apt-get install -y wget unzip bzip2 libgomp1 build-essential ninja-build git && \
+    # Python
+    wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash Miniconda3.sh -b -p /opt/python && rm -f Miniconda3.sh && \
+    # CMake
+    wget -nv -nc https://cmake.org/files/v3.13/cmake-3.13.0-Linux-x86_64.sh --no-check-certificate && \
+    bash cmake-3.13.0-Linux-x86_64.sh --skip-license --prefix=/usr && rm -f cmake-3.13.0-Linux-x86_64.sh
+
+# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
+RUN \
+    export CUDA_SHORT=$(echo "$CUDA_VERSION" | grep -Eo '[0-9]+\.[0-9]') && \
+    export NCCL_VERSION=2.7.5-1 && \
+    apt-get update && \
+    apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}
+
+ENV PATH=/opt/python/bin:$PATH
+
+# Create new Conda environment with RMM (version globs quoted so Bash never expands them)
+RUN \
+    conda create -y -n gpu_test -c nvidia -c rapidsai-nightly -c rapidsai -c conda-forge -c defaults \
+        python=3.7 "rmm=0.15*" "cudatoolkit=$CUDA_VERSION"
+
+ENV GOSU_VERSION=1.10
+
+# Install lightweight sudo (not bound to TTY)
+RUN set -ex; \
+    wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
+    chmod +x /usr/local/bin/gosu && \
+    gosu nobody true
+
+# Default entry-point to use if running locally
+# It will preserve attributes of created files
+COPY entrypoint.sh /scripts/
+
+WORKDIR /workspace
+ENTRYPOINT ["/scripts/entrypoint.sh"]
--- a/tests/ci_build/build_via_cmake.sh
+++ b/tests/ci_build/build_via_cmake.sh
@@ -1,10 +1,23 @@
 #!/usr/bin/env bash
 set -e

+# Usage: build_via_cmake.sh [--conda-env=NAME] [arguments forwarded to CMake ...]
+cmake_prefix_flag=()
+if [[ "$1" == --conda-env=* ]]; then
+  conda_env="${1#--conda-env=}"
+  echo "Activating Conda environment ${conda_env}"
+  shift 1
+  source activate "${conda_env}"
+  # Let CMake find packages installed in the activated environment
+  cmake_prefix_flag=("-DCMAKE_PREFIX_PATH=${CONDA_PREFIX}")
+fi
+# Arrays, not flat strings, so that an argument containing spaces stays one argument
+cmake_args=("$@")

 rm -rf build
 mkdir build
 cd build
-cmake .. "$@" -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -GNinja
+cmake .. "${cmake_args[@]}" -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -GNinja "${cmake_prefix_flag[@]}"
 ninja clean
 time ninja -v
 cd ..
--- a/tests/ci_build/test_python.sh
+++ b/tests/ci_build/test_python.sh
@@ -2,7 +2,15 @@
 set -e
 set -x

-suite=$1
+# First argument selects the suite; the rest is joined into one string for pytest
+suite=''
+args=''
+if [ "$#" -ge 1 ]
+then
+  suite=$1
+  shift 1
+  args="$@"
+fi

 # Install XGBoost Python package
 function install_xgboost {
@@ -26,34 +34,40 @@ function install_xgboost {
  fi
 }

+function uninstall_xgboost {
+  pip uninstall -y xgboost
+}
+
 # Run specified test suite
 case "$suite" in
  gpu)
    source activate gpu_test
    install_xgboost
-    pytest -v -s -rxXs --fulltrace -m "not mgpu" tests/python-gpu
+    pytest -v -s -rxXs --fulltrace -m "not mgpu" ${args} tests/python-gpu
+    uninstall_xgboost
    ;;

  mgpu)
    source activate gpu_test
    install_xgboost
-    pytest -v -s -rxXs --fulltrace -m "mgpu" tests/python-gpu
+    pytest -v -s -rxXs --fulltrace -m "mgpu" ${args} tests/python-gpu

    cd tests/distributed
    ./runtests-gpu.sh
-    cd -
+    uninstall_xgboost
    ;;

  cpu)
    source activate cpu_test
    install_xgboost
-    pytest -v -s --fulltrace tests/python
+    pytest -v -s -rxXs --fulltrace ${args} tests/python
    cd tests/distributed
    ./runtests.sh
+    uninstall_xgboost
    ;;

  *)
-    echo "Usage: $0 {gpu|mgpu|cpu}"
+    echo "Usage: $0 {gpu|mgpu|cpu} [extra args to pass to pytest]"
    exit 1
    ;;
 esac
--- a/tests/cpp/CMakeLists.txt
+++ b/tests/cpp/CMakeLists.txt
@@ -37,6 +37,8 @@ if (USE_CUDA)
    $<$<COMPILE_LANGUAGE:CUDA>:${GEN_CODE}>)
  target_compile_definitions(testxgboost
    PRIVATE -DXGBOOST_USE_CUDA=1)
+  find_package(CUDA)
+  target_include_directories(testxgboost PRIVATE ${CUDA_INCLUDE_DIRS})
  set_target_properties(testxgboost PROPERTIES
    CUDA_SEPARABLE_COMPILATION OFF)

--- a/tests/cpp/common/test_span.cc
+++ b/tests/cpp/common/test_span.cc
@@ -97,11 +97,6 @@ TEST(Span, FromPtrLen) {
    }
  }

-  {
-    auto lazy = [=]() {Span<float const, 16> tmp (arr, 5);};
-    EXPECT_DEATH(lazy(), "\\[xgboost\\] Condition .* failed.\n");
-  }
-
  // dynamic extent
  {
    Span<float, 16> s (arr, 16);
@@ -122,6 +117,15 @@ TEST(Span, FromPtrLen) {
  }
 }

+TEST(SpanDeathTest, FromPtrLen) {
+  float arr[16];
+  InitializeRange(arr, arr+16);
+  {
+    auto lazy = [=]() {Span<float const, 16> tmp (arr, 5);};  // static extent 16, but a count of 5
+    EXPECT_DEATH(lazy(), "\\[xgboost\\] Condition .* failed.\n");  // the mismatch must abort
+  }
+}
+
 TEST(Span, FromFirstLast) {
  float arr[16];
  InitializeRange(arr, arr+16);
@@ -285,7 +289,13 @@ TEST(Span, ElementAccess) {
    ASSERT_EQ(i, arr[j]);
    ++j;
  }
+}

+TEST(SpanDeathTest, ElementAccess) {
+  float arr[16];
+  InitializeRange(arr, arr + 16);
+
+  Span<float> s (arr);
  EXPECT_DEATH(s[16], "\\[xgboost\\] Condition .* failed.\n");
  EXPECT_DEATH(s[-1], "\\[xgboost\\] Condition .* failed.\n");

@@ -312,7 +322,9 @@ TEST(Span, FrontBack) {
    ASSERT_EQ(s.front(), 0);
    ASSERT_EQ(s.back(), 3);
  }
+}

+TEST(SpanDeathTest, FrontBack) {
  {
    Span<float, 0> s;
    EXPECT_DEATH(s.front(), "\\[xgboost\\] Condition .* failed.\n");
@@ -340,10 +352,6 @@ TEST(Span, FirstLast) {
    for (size_t i = 0; i < first.size(); ++i) {
      ASSERT_EQ(first[i], arr[i]);
    }
-    auto constexpr kOne = static_cast<Span<float, 4>::index_type>(-1);
-    EXPECT_DEATH(s.first<kOne>(), "\\[xgboost\\] Condition .* failed.\n");
-    EXPECT_DEATH(s.first<17>(), "\\[xgboost\\] Condition .* failed.\n");
-    EXPECT_DEATH(s.first<32>(), "\\[xgboost\\] Condition .* failed.\n");
  }

  {
@@ -359,10 +367,6 @@ TEST(Span, FirstLast) {
    for (size_t i = 0; i < last.size(); ++i) {
      ASSERT_EQ(last[i], arr[i+12]);
    }
-    auto constexpr kOne = static_cast<Span<float, 4>::index_type>(-1);
-    EXPECT_DEATH(s.last<kOne>(), "\\[xgboost\\] Condition .* failed.\n");
-    EXPECT_DEATH(s.last<17>(), "\\[xgboost\\] Condition .* failed.\n");
-    EXPECT_DEATH(s.last<32>(), "\\[xgboost\\] Condition .* failed.\n");
  }

  // dynamic extent
@@ -379,10 +383,6 @@ TEST(Span, FirstLast) {
      ASSERT_EQ(first[i], s[i]);
    }

-    EXPECT_DEATH(s.first(-1), "\\[xgboost\\] Condition .* failed.\n");
-    EXPECT_DEATH(s.first(17), "\\[xgboost\\] Condition .* failed.\n");
-    EXPECT_DEATH(s.first(32), "\\[xgboost\\] Condition .* failed.\n");
-
    delete [] arr;
  }

@@ -399,6 +399,50 @@ TEST(Span, FirstLast) {
      ASSERT_EQ(s[12 + i], last[i]);
    }

+    delete [] arr;
+  }
+}
+
+TEST(SpanDeathTest, FirstLast) {
+  // static extent
+  {
+    float arr[16];
+    InitializeRange(arr, arr + 16);
+
+    Span<float> s (arr);
+    auto constexpr kOne = static_cast<Span<float, 4>::index_type>(-1);
+    EXPECT_DEATH(s.first<kOne>(), "\\[xgboost\\] Condition .* failed.\n");
+    EXPECT_DEATH(s.first<17>(), "\\[xgboost\\] Condition .* failed.\n");
+    EXPECT_DEATH(s.first<32>(), "\\[xgboost\\] Condition .* failed.\n");
+  }
+
+  {
+    float arr[16];
+    InitializeRange(arr, arr + 16);
+
+    Span<float> s (arr);
+    auto constexpr kOne = static_cast<Span<float, 4>::index_type>(-1);
+    EXPECT_DEATH(s.last<kOne>(), "\\[xgboost\\] Condition .* failed.\n");
+    EXPECT_DEATH(s.last<17>(), "\\[xgboost\\] Condition .* failed.\n");
+    EXPECT_DEATH(s.last<32>(), "\\[xgboost\\] Condition .* failed.\n");
+  }
+
+  // dynamic extent
+  {
+    float *arr = new float[16];
+    InitializeRange(arr, arr + 16);
+    Span<float> s (arr, 16);
+    EXPECT_DEATH(s.first(-1), "\\[xgboost\\] Condition .* failed.\n");
+    EXPECT_DEATH(s.first(17), "\\[xgboost\\] Condition .* failed.\n");
+    EXPECT_DEATH(s.first(32), "\\[xgboost\\] Condition .* failed.\n");
+
+    delete [] arr;
+  }
+
+  {
+    float *arr = new float[16];
+    InitializeRange(arr, arr + 16);
+    Span<float> s (arr, 16);
    EXPECT_DEATH(s.last(-1), "\\[xgboost\\] Condition .* failed.\n");
    EXPECT_DEATH(s.last(17), "\\[xgboost\\] Condition .* failed.\n");
    EXPECT_DEATH(s.last(32), "\\[xgboost\\] Condition .* failed.\n");
@@ -420,7 +464,11 @@ TEST(Span, Subspan) {
  auto s4 = s1.subspan(2, dynamic_extent);
  ASSERT_EQ(s1.data() + 2, s4.data());
  ASSERT_EQ(s4.size(), s1.size() - 2);
+}

+TEST(SpanDeathTest, Subspan) {
+  int arr[16] {0};
+  Span<int> s1 (arr);
  EXPECT_DEATH(s1.subspan(-1, 0), "\\[xgboost\\] Condition .* failed.\n");
  EXPECT_DEATH(s1.subspan(17, 0), "\\[xgboost\\] Condition .* failed.\n");

--- a/tests/cpp/common/test_span.cu
+++ b/tests/cpp/common/test_span.cu
@@ -221,7 +221,7 @@ struct TestElementAccess {
  }
 };

-TEST(GPUSpan, ElementAccess) {
+TEST(GPUSpanDeathTest, ElementAccess) {
  dh::safe_cuda(cudaSetDevice(0));
  auto test_element_access = []() {
    thrust::host_vector<float> h_vec (16);
--- a/tests/cpp/common/test_transform_range.cc
+++ b/tests/cpp/common/test_transform_range.cc
@@ -59,7 +59,7 @@ TEST(Transform, DeclareUnifiedTest(Basic)) {
 }

 #if !defined(__CUDACC__)
-TEST(Transform, Exception) {
+TEST(TransformDeathTest, Exception) {
  size_t const kSize {16};
  std::vector<bst_float> h_in(kSize);
  const HostDeviceVector<bst_float> in_vec{h_in, -1};
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -20,6 +20,15 @@
 #include "../../src/gbm/gbtree_model.h"
 #include "xgboost/predictor.h"

+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+#include <memory>
+#include <numeric>
+#include <vector>
+#include "rmm/mr/device/per_device_resource.hpp"
+#include "rmm/mr/device/cuda_memory_resource.hpp"
+#include "rmm/mr/device/pool_memory_resource.hpp"
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+
 bool FileExists(const std::string& filename) {
  struct stat st;
  return stat(filename.c_str(), &st) == 0;
@@ -478,4 +487,57 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(
  return gbm;
 }

+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+
+using CUDAMemoryResource = rmm::mr::cuda_memory_resource;
+using PoolMemoryResource = rmm::mr::pool_memory_resource<CUDAMemoryResource>;
+class RMMAllocator {
+ public:
+  std::vector<std::unique_ptr<CUDAMemoryResource>> cuda_mr;  // one plain CUDA resource per device
+  std::vector<std::unique_ptr<PoolMemoryResource>> pool_mr;  // pool_mr[i] is built on cuda_mr[i]
+  int n_gpu;
+  RMMAllocator() : n_gpu(common::AllVisibleGPUs()) {
+    int saved_device;
+    CHECK_EQ(cudaGetDevice(&saved_device), cudaSuccess);
+    for (int dev = 0; dev < n_gpu; ++dev) {
+      CHECK_EQ(cudaSetDevice(dev), cudaSuccess);  // build each pool with its own device current
+      cuda_mr.emplace_back(std::make_unique<CUDAMemoryResource>());
+      pool_mr.emplace_back(std::make_unique<PoolMemoryResource>(cuda_mr.back().get()));
+    }
+    CHECK_EQ(cudaSetDevice(saved_device), cudaSuccess);  // hand the caller's device back
+  }
+  ~RMMAllocator() = default;  // members die in reverse order: pools before the resources they use
+};
+
+void DeleteRMMResource(RMMAllocator* r) {  // deleter handed to the RMMAllocatorPtr objects created below
+  delete r;
+}
+
+RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv) {
+  // The pool is opt-in: look for --use-rmm-pool among the arguments (argv[0] excluded).
+  const std::string kFlag{"--use-rmm-pool"};
+  bool requested = false;
+  for (int k = 1; k < argc && !requested; ++k) {
+    requested = (kFlag == argv[k]);
+  }
+  if (!requested) {
+    return RMMAllocatorPtr(nullptr, DeleteRMMResource);
+  }
+  LOG(INFO) << "Using RMM memory pool";
+  RMMAllocatorPtr handle(new RMMAllocator(), DeleteRMMResource);
+  for (int dev = 0; dev < handle->n_gpu; ++dev) {
+    rmm::mr::set_per_device_resource(rmm::cuda_device_id(dev), handle->pool_mr[dev].get());
+  }
+  return handle;
+}
+#else  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+class RMMAllocator {};
+
+void DeleteRMMResource(RMMAllocator* r) {}
+
+RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv) {
+  return RMMAllocatorPtr(nullptr, DeleteRMMResource);
+}
+#endif  // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1
+
 }  // namespace xgboost
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -8,6 +8,7 @@
 #include <fstream>
 #include <cstdio>
 #include <string>
+#include <memory>
 #include <vector>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -352,5 +353,9 @@ inline int Next(DataIterHandle self) {
  return static_cast<CudaArrayIterForTest*>(self)->Next();
 }

+class RMMAllocator;
+using RMMAllocatorPtr = std::unique_ptr<RMMAllocator, void(*)(RMMAllocator*)>;
+RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv);
+
 }  // namespace xgboost
 #endif
--- a/tests/cpp/test_main.cc
+++ b/tests/cpp/test_main.cc
@@ -3,13 +3,17 @@
 #include <xgboost/base.h>
 #include <xgboost/logging.h>
 #include <string>
+#include <memory>
 #include <vector>

+#include "helpers.h"
+
 int main(int argc, char ** argv) {
  xgboost::Args args {{"verbosity", "2"}};
  xgboost::ConsoleLogger::Configure(args);

  testing::InitGoogleTest(&argc, argv);
  testing::FLAGS_gtest_death_test_style = "threadsafe";
+  auto rmm_alloc = xgboost::SetUpRMMResourceForCppTests(argc, argv);
  return RUN_ALL_TESTS();
 }
--- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
+++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
@@ -119,7 +119,7 @@ void TestIncorrectRow() {
  });
 }

-TEST(RowPartitioner, IncorrectRow) {
+TEST(RowPartitionerDeathTest, IncorrectRow) {
  ASSERT_DEATH({ TestIncorrectRow(); },".*");
 }
 }  // namespace tree
--- a/tests/pytest.ini
+++ b/tests/pytest.ini
@@ -2,4 +2,4 @@
 markers =
    mgpu: Mark a test that requires multiple GPUs to run.
    ci: Mark a test that runs only on CI.
-    gtest: Mark a test that requires C++ Google Test executable.
+    gtest: Mark a test that requires C++ Google Test executable.
--- a/tests/python-gpu/conftest.py
+++ b/tests/python-gpu/conftest.py
@@ -0,0 +1,45 @@
+import sys
+import pytest
+import logging
+
+sys.path.append("tests/python")
+import testing as tm                          # noqa
+
+def has_rmm():
+    try:
+        import rmm  # noqa: F401 -- imported only to probe availability
+    except ImportError:  # RMM is an optional dependency
+        return False
+    return True
+
+@pytest.fixture(scope='session', autouse=True)
+def setup_rmm_pool(request, pytestconfig):
+    if pytestconfig.getoption('--use-rmm-pool'):  # the pool is strictly opt-in
+        if not has_rmm():
+            raise ImportError('The --use-rmm-pool option requires the RMM package')
+        import rmm
+        from dask_cuda.utils import get_n_gpus
+        all_gpus = list(range(get_n_gpus()))
+        rmm.reinitialize(pool_allocator=True, initial_pool_size=1 << 30, devices=all_gpus)
+
+@pytest.fixture(scope='function')
+def local_cuda_cluster(request, pytestconfig):
+    """Yield a LocalCUDACluster built from the indirect test parameters, if any."""
+    # Keyword arguments for LocalCUDACluster; indirect parametrization arrives as request.param
+    kwargs = {}
+    if hasattr(request, 'param'):
+        kwargs.update(request.param)
+    if pytestconfig.getoption('--use-rmm-pool'):
+        if not has_rmm():
+            raise ImportError('The --use-rmm-pool option requires the RMM package')
+        import rmm
+        rmm.reinitialize()  # NOTE(review): presumably drops this process's pool so workers can create theirs
+        kwargs['rmm_pool_size'] = '2GB'
+    if tm.no_dask_cuda()['condition']:
+        raise ImportError('The local_cuda_cluster fixture requires dask_cuda package')
+    from dask_cuda import LocalCUDACluster
+    with LocalCUDACluster(**kwargs) as cluster:  # closed on teardown, also if an error propagates
+        yield cluster
+
+def pytest_addoption(parser):  # pytest hook: registers the custom command-line option read by the fixtures
+    parser.addoption('--use-rmm-pool', action='store_true', default=False, help='Use RMM pool')
--- a/tests/python-gpu/test_gpu_demos.py
+++ b/tests/python-gpu/test_gpu_demos.py
@@ -6,7 +6,6 @@ sys.path.append("tests/python")
 import testing as tm
 import test_demos as td         # noqa

-
@pytest.mark.skipif(**tm.no_cupy())
 def test_data_iterator():
    script = os.path.join(td.PYTHON_DEMO_DIR, 'data_iterator.py')
--- a/tests/python-gpu/test_gpu_with_dask.py
+++ b/tests/python-gpu/test_gpu_with_dask.py
@@ -3,7 +3,6 @@ import os
 import pytest
 import numpy as np
 import asyncio
-import unittest
 import xgboost
 import subprocess
 from hypothesis import given, strategies, settings, note
@@ -23,7 +22,6 @@ import testing as tm                          # noqa
 try:
    import dask.dataframe as dd
    from xgboost import dask as dxgb
-    from dask_cuda import LocalCUDACluster
    from dask.distributed import Client
    from dask import array as da
    import cudf
@@ -151,50 +149,51 @@ def run_gpu_hist(params, num_rounds, dataset, DMatrixT, client):
    assert tm.non_increasing(history['train'][dataset.metric])


-class TestDistributedGPU(unittest.TestCase):
+class TestDistributedGPU:
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_cudf())
    @pytest.mark.skipif(**tm.no_dask_cudf())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
-    def test_dask_dataframe(self):
-        with LocalCUDACluster() as cluster:
-            with Client(cluster) as client:
-                run_with_dask_dataframe(dxgb.DaskDMatrix, client)
-                run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client)
+    def test_dask_dataframe(self, local_cuda_cluster):
+        with Client(local_cuda_cluster) as client:
+            run_with_dask_dataframe(dxgb.DaskDMatrix, client)
+            run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client)

-    @given(parameter_strategy, strategies.integers(1, 20),
-           tm.dataset_strategy)
+    @given(params=parameter_strategy, num_rounds=strategies.integers(1, 20),
+           dataset=tm.dataset_strategy)
    @settings(deadline=duration(seconds=120))
+    @pytest.mark.skipif(**tm.no_dask())
+    @pytest.mark.skipif(**tm.no_dask_cuda())
+    @pytest.mark.parametrize('local_cuda_cluster', [{'n_workers': 2}], indirect=['local_cuda_cluster'])
    @pytest.mark.mgpu
-    def test_gpu_hist(self, params, num_rounds, dataset):
-        with LocalCUDACluster(n_workers=2) as cluster:
-            with Client(cluster) as client:
-                run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix,
-                             client)
-                run_gpu_hist(params, num_rounds, dataset,
-                             dxgb.DaskDeviceQuantileDMatrix, client)
+    def test_gpu_hist(self, params, num_rounds, dataset, local_cuda_cluster):
+        with Client(local_cuda_cluster) as client:
+            run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix,
+                         client)
+            run_gpu_hist(params, num_rounds, dataset,
+                         dxgb.DaskDeviceQuantileDMatrix, client)

    @pytest.mark.skipif(**tm.no_cupy())
+    @pytest.mark.skipif(**tm.no_dask())
+    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
-    def test_dask_array(self):
-        with LocalCUDACluster() as cluster:
-            with Client(cluster) as client:
-                run_with_dask_array(dxgb.DaskDMatrix, client)
-                run_with_dask_array(dxgb.DaskDeviceQuantileDMatrix, client)
+    def test_dask_array(self, local_cuda_cluster):
+        with Client(local_cuda_cluster) as client:
+            run_with_dask_array(dxgb.DaskDMatrix, client)
+            run_with_dask_array(dxgb.DaskDeviceQuantileDMatrix, client)

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
-    def test_empty_dmatrix(self):
-        with LocalCUDACluster() as cluster:
-            with Client(cluster) as client:
-                parameters = {'tree_method': 'gpu_hist',
-                              'debug_synchronize': True}
-                run_empty_dmatrix_reg(client, parameters)
-                run_empty_dmatrix_cls(client, parameters)
+    def test_empty_dmatrix(self, local_cuda_cluster):
+        with Client(local_cuda_cluster) as client:
+            parameters = {'tree_method': 'gpu_hist',
+                          'debug_synchronize': True}
+            run_empty_dmatrix_reg(client, parameters)
+            run_empty_dmatrix_cls(client, parameters)

-    def run_quantile(self, name):
+    def run_quantile(self, name, local_cuda_cluster):
        if sys.platform.startswith("win"):
            pytest.skip("Skipping dask tests on Windows")

@@ -217,34 +216,33 @@ class TestDistributedGPU(unittest.TestCase):
            env[port[0]] = port[1]
            return subprocess.run([exe, test], env=env, stdout=subprocess.PIPE)

-        with LocalCUDACluster() as cluster:
-            with Client(cluster) as client:
-                workers = list(dxgb._get_client_workers(client).keys())
-                rabit_args = client.sync(dxgb._get_rabit_args, workers, client)
-                futures = client.map(runit,
-                                     workers,
-                                     pure=False,
-                                     workers=workers,
-                                     rabit_args=rabit_args)
-                results = client.gather(futures)
-                for ret in results:
-                    msg = ret.stdout.decode('utf-8')
-                    assert msg.find('1 test from GPUQuantile') != -1, msg
-                    assert ret.returncode == 0, msg
+        with Client(local_cuda_cluster) as client:
+            workers = list(dxgb._get_client_workers(client).keys())
+            rabit_args = client.sync(dxgb._get_rabit_args, workers, client)
+            futures = client.map(runit,
+                                 workers,
+                                 pure=False,
+                                 workers=workers,
+                                 rabit_args=rabit_args)
+            results = client.gather(futures)
+            for ret in results:
+                msg = ret.stdout.decode('utf-8')
+                assert msg.find('1 test from GPUQuantile') != -1, msg
+                assert ret.returncode == 0, msg

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    @pytest.mark.gtest
-    def test_quantile_basic(self):
-        self.run_quantile('AllReduceBasic')
+    def test_quantile_basic(self, local_cuda_cluster):
+        self.run_quantile('AllReduceBasic', local_cuda_cluster)

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_dask_cuda())
    @pytest.mark.mgpu
    @pytest.mark.gtest
-    def test_quantile_same_on_all_workers(self):
-        self.run_quantile('SameOnAllWorkers')
+    def test_quantile_same_on_all_workers(self, local_cuda_cluster):
+        self.run_quantile('SameOnAllWorkers', local_cuda_cluster)


 async def run_from_dask_array_asyncio(scheduler_address):
@@ -275,11 +273,11 @@ async def run_from_dask_array_asyncio(scheduler_address):


@pytest.mark.skipif(**tm.no_dask())
+@pytest.mark.skipif(**tm.no_dask_cuda())
@pytest.mark.mgpu
-def test_with_asyncio():
-    with LocalCUDACluster() as cluster:
-        with Client(cluster) as client:
-            address = client.scheduler.address
-            output = asyncio.run(run_from_dask_array_asyncio(address))
-            assert isinstance(output['booster'], xgboost.Booster)
-            assert isinstance(output['history'], dict)
+def test_with_asyncio(local_cuda_cluster):
+    with Client(local_cuda_cluster) as client:
+        address = client.scheduler.address
+        output = asyncio.run(run_from_dask_array_asyncio(address))
+        assert isinstance(output['booster'], xgboost.Booster)
+        assert isinstance(output['history'], dict)