Use CUDA virtual memory for pinned memory allocation. (#10850)

- Add a grow-only virtual memory allocator.
- Define a driver API wrapper. Split up the runtime API wrapper.
This commit is contained in:
Jiaming Yuan
2024-09-28 04:26:44 +08:00
committed by GitHub
parent 13b9874fd6
commit 271f4a80e7
43 changed files with 702 additions and 103 deletions

View File

@@ -299,7 +299,7 @@ TEST(GPUPredictor, IterationRange) {
}
// Test on the MGPUPredictorTest fixture: calls TestIterationRangeColumnSplit
// with the value of AllVisibleGPUs() and `true`.
// NOTE(review): this text is a diff hunk whose +/- markers were lost. The hunk
// header reports the same line count before and after, so the `common::` call
// is presumably the removed line and the `curt::` call its replacement --
// confirm against the actual file before treating both calls as live code.
TEST_F(MGPUPredictorTest, IterationRangeColumnSplit) {
TestIterationRangeColumnSplit(common::AllVisibleGPUs(), true);
TestIterationRangeColumnSplit(curt::AllVisibleGPUs(), true);
}
TEST(GPUPredictor, CategoricalPrediction) {
@@ -312,7 +312,7 @@ TEST_F(MGPUPredictorTest, CategoricalPredictionColumnSplit) {
}
// Builds a CUDA context and runs TestCategoricalPredictLeaf(&ctx, false).
// The argument to MakeCUDACtx is 0 when AllVisibleGPUs() == 1, otherwise
// collective::GetRank().
// NOTE(review): this text is a diff hunk whose +/- markers were lost. Two
// declarations of `ctx` in one scope would not compile as written, so the
// `common::` line is presumably the removed line and the `curt::` line its
// replacement -- confirm against the actual file.
TEST(GPUPredictor, CategoricalPredictLeaf) {
auto ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
auto ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
TestCategoricalPredictLeaf(&ctx, false);
}
@@ -358,7 +358,7 @@ TEST(GPUPredictor, Sparse) {
}
// Test on the MGPUPredictorTest fixture: calls TestSparsePredictionColumnSplit
// with the value of AllVisibleGPUs(), `true`, and a third argument of 0.2 and
// then 0.8 (presumably the sparsity of the generated data -- TODO confirm
// against the helper's signature).
// NOTE(review): this text is a diff hunk whose +/- markers were lost. The hunk
// header reports the same line count before and after, so the two `common::`
// calls are presumably the removed lines and the two `curt::` calls their
// replacements -- confirm against the actual file before treating all four
// calls as live code.
TEST_F(MGPUPredictorTest, SparseColumnSplit) {
TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.2);
TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.8);
TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.2);
TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.8);
}
} // namespace xgboost::predictor

View File

@@ -320,7 +320,7 @@ void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu) {
auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).Seed(rank).GenerateDMatrix(true);
Context ctx;
if (use_gpu) {
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
}
auto learner = LearnerForTest(&ctx, m_train, kIters);
auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
@@ -354,7 +354,7 @@ void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
void TestCategoricalPrediction(bool use_gpu, bool is_column_split) {
Context ctx;
if (use_gpu) {
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
}
size_t constexpr kCols = 10;
PredictionCacheEntry out_predictions;
@@ -507,7 +507,7 @@ void VerifyIterationRangeColumnSplit(bool use_gpu, Json const &ranged_model,
auto const rank = collective::GetRank();
Context ctx;
if (use_gpu) {
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
}
auto n_threads = collective::GetWorkerLocalThreads(world_size);
ctx.UpdateAllowUnknown(
@@ -679,7 +679,7 @@ void VerifySparsePredictionColumnSplit(bool use_gpu, Json const &model, std::siz
std::vector<float> const &expected_predt) {
Context ctx;
if (use_gpu) {
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
}
auto Xy = RandomDataGenerator(rows, cols, sparsity).GenerateDMatrix(true);
std::shared_ptr<DMatrix> sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};