Use CUDA virtual memory for pinned memory allocation. (#10850)

- Add a grow-only virtual memory allocator.
- Define a driver API wrapper. Split up the runtime API wrapper.
2024-09-28 04:26:44 +08:00
parent 13b9874fd6
commit 271f4a80e7
43 changed files with 702 additions and 103 deletions
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -299,7 +299,7 @@ TEST(GPUPredictor, IterationRange) {
 }

 TEST_F(MGPUPredictorTest, IterationRangeColumnSplit) {
-  TestIterationRangeColumnSplit(common::AllVisibleGPUs(), true);
+  TestIterationRangeColumnSplit(curt::AllVisibleGPUs(), true);
 }

 TEST(GPUPredictor, CategoricalPrediction) {
@@ -312,7 +312,7 @@ TEST_F(MGPUPredictorTest, CategoricalPredictionColumnSplit) {
 }

 TEST(GPUPredictor, CategoricalPredictLeaf) {
-  auto ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+  auto ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
  TestCategoricalPredictLeaf(&ctx, false);
 }

@@ -358,7 +358,7 @@ TEST(GPUPredictor, Sparse) {
 }

 TEST_F(MGPUPredictorTest, SparseColumnSplit) {
-  TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.2);
-  TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.8);
+  TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.2);
+  TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.8);
 }
 }  // namespace xgboost::predictor
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -320,7 +320,7 @@ void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu) {
  auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).Seed(rank).GenerateDMatrix(true);
  Context ctx;
  if (use_gpu) {
-    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
+    ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
  }
  auto learner = LearnerForTest(&ctx, m_train, kIters);
  auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
@@ -354,7 +354,7 @@ void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
 void TestCategoricalPrediction(bool use_gpu, bool is_column_split) {
  Context ctx;
  if (use_gpu) {
-    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+    ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
  }
  size_t constexpr kCols = 10;
  PredictionCacheEntry out_predictions;
@@ -507,7 +507,7 @@ void VerifyIterationRangeColumnSplit(bool use_gpu, Json const &ranged_model,
  auto const rank = collective::GetRank();
  Context ctx;
  if (use_gpu) {
-    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
+    ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
  }
  auto n_threads = collective::GetWorkerLocalThreads(world_size);
  ctx.UpdateAllowUnknown(
@@ -679,7 +679,7 @@ void VerifySparsePredictionColumnSplit(bool use_gpu, Json const &model, std::siz
                                       std::vector<float> const &expected_predt) {
  Context ctx;
  if (use_gpu) {
-    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+    ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
  }
  auto Xy = RandomDataGenerator(rows, cols, sparsity).GenerateDMatrix(true);
  std::shared_ptr<DMatrix> sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};