xgboost/tests/cpp/tree/test_gpu_hist.cu
Jiaming Yuan e228c1a121
[EM] Make page concatenation optional. (#10826)
This PR introduces a new parameter `extmem_concat_pages` to make the page concatenation optional for GPU hist. In addition, the document is updated for the new GPU-based external memory.
2024-09-24 06:19:28 +08:00

373 lines
13 KiB
Plaintext

/**
* Copyright 2017-2024, XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/base.h> // for Args
#include <xgboost/context.h> // for Context
#include <xgboost/host_device_vector.h> // for HostDeviceVector
#include <xgboost/json.h> // for Json
#include <xgboost/task.h> // for ObjInfo
#include <xgboost/tree_model.h> // for RegTree
#include <xgboost/tree_updater.h> // for TreeUpdater
#include <memory> // for unique_ptr
#include <string> // for string
#include <vector> // for vector
#include "../../../src/common/random.h" // for GlobalRandom
#include "../../../src/tree/param.h" // for TrainParam
#include "../collective/test_worker.h" // for BaseMGPUTest
#include "../helpers.h"
namespace xgboost::tree {
namespace {
void UpdateTree(Context const* ctx, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
RegTree* tree, HostDeviceVector<bst_float>* preds, float subsample,
const std::string& sampling_method, bst_bin_t max_bin, bool concat_pages) {
Args args{
{"max_depth", "2"},
{"max_bin", std::to_string(max_bin)},
{"min_child_weight", "0.0"},
{"reg_alpha", "0"},
{"reg_lambda", "0"},
{"subsample", std::to_string(subsample)},
{"sampling_method", sampling_method},
};
TrainParam param;
param.UpdateAllowUnknown(args);
ObjInfo task{ObjInfo::kRegression};
std::unique_ptr<TreeUpdater> hist_maker{TreeUpdater::Create("grow_gpu_hist", ctx, &task)};
if (subsample < 1.0) {
hist_maker->Configure(Args{{"extmem_concat_pages", std::to_string(concat_pages)}});
} else {
hist_maker->Configure(Args{});
}
std::vector<HostDeviceVector<bst_node_t>> position(1);
hist_maker->Update(&param, gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
{tree});
auto cache = linalg::MakeTensorView(ctx, preds->DeviceSpan(), preds->Size(), 1);
if (subsample < 1.0 && !dmat->SingleColBlock() && concat_pages) {
ASSERT_FALSE(hist_maker->UpdatePredictionCache(dmat, cache));
} else {
ASSERT_TRUE(hist_maker->UpdatePredictionCache(dmat, cache));
}
}
} // anonymous namespace
TEST(GpuHist, UniformSampling) {
constexpr size_t kRows = 4096;
constexpr size_t kCols = 2;
constexpr float kSubsample = 0.9999;
common::GlobalRandom().seed(1994);
auto ctx = MakeCUDACtx(0);
// Create an in-memory DMatrix.
auto p_fmat = RandomDataGenerator{kRows, kCols, 0.0f}.GenerateDMatrix(true);
ASSERT_TRUE(p_fmat->SingleColBlock());
linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
gpair.Data()->Copy(GenerateRandomGradients(kRows));
// Build a tree using the in-memory DMatrix.
RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows, false);
// Build another tree using sampling.
RegTree tree_sampling;
HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat.get(), &tree_sampling, &preds_sampling, kSubsample, "uniform",
kRows, false);
// Make sure the predictions are the same.
auto preds_h = preds.ConstHostVector();
auto preds_sampling_h = preds_sampling.ConstHostVector();
for (size_t i = 0; i < kRows; i++) {
EXPECT_NEAR(preds_h[i], preds_sampling_h[i], 1e-8);
}
}
TEST(GpuHist, GradientBasedSampling) {
constexpr size_t kRows = 4096;
constexpr size_t kCols = 2;
constexpr float kSubsample = 0.9999;
common::GlobalRandom().seed(1994);
auto ctx = MakeCUDACtx(0);
// Create an in-memory DMatrix.
auto p_fmat = RandomDataGenerator{kRows, kCols, 0.0f}.GenerateDMatrix(true);
linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
gpair.Data()->Copy(GenerateRandomGradients(kRows));
// Build a tree using the in-memory DMatrix.
RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows, false);
// Build another tree using sampling.
RegTree tree_sampling;
HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat.get(), &tree_sampling, &preds_sampling, kSubsample,
"gradient_based", kRows, false);
// Make sure the predictions are the same.
auto preds_h = preds.ConstHostVector();
auto preds_sampling_h = preds_sampling.ConstHostVector();
for (size_t i = 0; i < kRows; i++) {
EXPECT_NEAR(preds_h[i], preds_sampling_h[i], 1e-3);
}
}
TEST(GpuHist, ExternalMemory) {
constexpr size_t kRows = 4096;
constexpr size_t kCols = 2;
// Create a DMatrix with multiple batches.
auto p_fmat_ext =
RandomDataGenerator{kRows, kCols, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true);
ASSERT_FALSE(p_fmat_ext->SingleColBlock());
// Create a single batch DMatrix.
auto p_fmat =
RandomDataGenerator{kRows, kCols, 0.0f}.Batches(1).GenerateSparsePageDMatrix("temp", true);
ASSERT_TRUE(p_fmat->SingleColBlock());
auto ctx = MakeCUDACtx(0);
linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
gpair.Data()->Copy(GenerateRandomGradients(kRows));
// Build a tree using the in-memory DMatrix.
RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows, true);
// Build another tree using multiple ELLPACK pages.
RegTree tree_ext;
HostDeviceVector<bst_float> preds_ext(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat_ext.get(), &tree_ext, &preds_ext, 1.0, "uniform", kRows, true);
// Make sure the predictions are the same.
auto preds_h = preds.ConstHostVector();
auto preds_ext_h = preds_ext.ConstHostVector();
for (size_t i = 0; i < kRows; i++) {
EXPECT_NEAR(preds_h[i], preds_ext_h[i], 1e-6);
}
}
TEST(GpuHist, ExternalMemoryWithSampling) {
constexpr size_t kRows = 4096, kCols = 2;
constexpr float kSubsample = 0.5;
const std::string kSamplingMethod = "gradient_based";
common::GlobalRandom().seed(0);
auto ctx = MakeCUDACtx(0);
// Create a single batch DMatrix.
auto p_fmat = RandomDataGenerator{kRows, kCols, 0.0f}
.Device(ctx.Device())
.Batches(1)
.GenerateSparsePageDMatrix("temp", true);
ASSERT_TRUE(p_fmat->SingleColBlock());
// Create a DMatrix with multiple batches.
auto p_fmat_ext = RandomDataGenerator{kRows, kCols, 0.0f}
.Device(ctx.Device())
.Batches(4)
.GenerateSparsePageDMatrix("temp", true);
ASSERT_FALSE(p_fmat_ext->SingleColBlock());
linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
gpair.Data()->Copy(GenerateRandomGradients(kRows));
// Build a tree using the in-memory DMatrix.
auto rng = common::GlobalRandom();
RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, kSubsample, kSamplingMethod, kRows, true);
// Build another tree using multiple ELLPACK pages.
common::GlobalRandom() = rng;
RegTree tree_ext;
HostDeviceVector<bst_float> preds_ext(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat_ext.get(), &tree_ext, &preds_ext, kSubsample, kSamplingMethod,
kRows, true);
Json jtree{Object{}};
Json jtree_ext{Object{}};
tree.SaveModel(&jtree);
tree_ext.SaveModel(&jtree_ext);
ASSERT_EQ(jtree, jtree_ext);
}
TEST(GpuHist, ConfigIO) {
auto ctx = MakeCUDACtx(0);
ObjInfo task{ObjInfo::kRegression};
std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_gpu_hist", &ctx, &task)};
updater->Configure(Args{});
Json j_updater{Object{}};
updater->SaveConfig(&j_updater);
ASSERT_TRUE(IsA<Object>(j_updater["hist_train_param"]));
updater->LoadConfig(j_updater);
Json j_updater_roundtrip{Object{}};
updater->SaveConfig(&j_updater_roundtrip);
ASSERT_TRUE(IsA<Object>(j_updater_roundtrip["hist_train_param"]));
ASSERT_EQ(j_updater, j_updater_roundtrip);
}
TEST(GpuHist, MaxDepth) {
auto ctx = MakeCUDACtx(0);
size_t constexpr kRows = 16;
size_t constexpr kCols = 4;
auto p_mat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
auto learner = std::unique_ptr<Learner>(Learner::Create({p_mat}));
learner->SetParam("max_depth", "32");
learner->Configure();
ASSERT_THROW({learner->UpdateOneIter(0, p_mat);}, dmlc::Error);
}
TEST(GpuHist, PageConcatConfig) {
auto ctx = MakeCUDACtx(0);
bst_idx_t n_samples = 64, n_features = 32;
auto p_fmat = RandomDataGenerator{n_samples, n_features, 0}.Batches(2).GenerateSparsePageDMatrix(
"temp", true);
auto learner = std::unique_ptr<Learner>(Learner::Create({p_fmat}));
learner->SetParam("device", ctx.DeviceName());
learner->SetParam("extmem_concat_pages", "true");
learner->SetParam("subsample", "0.8");
learner->Configure();
learner->UpdateOneIter(0, p_fmat);
learner->SetParam("extmem_concat_pages", "false");
learner->Configure();
// GPU Hist rebuilds the updater after configuration. Training continues
learner->UpdateOneIter(1, p_fmat);
learner->SetParam("extmem_concat_pages", "true");
learner->SetParam("subsample", "1.0");
ASSERT_THAT([&] { learner->UpdateOneIter(2, p_fmat); }, GMockThrow("extmem_concat_pages"));
// Throws error on CPU.
{
auto learner = std::unique_ptr<Learner>(Learner::Create({p_fmat}));
learner->SetParam("extmem_concat_pages", "true");
ASSERT_THAT([&] { learner->UpdateOneIter(0, p_fmat); }, GMockThrow("extmem_concat_pages"));
}
{
auto learner = std::unique_ptr<Learner>(Learner::Create({p_fmat}));
learner->SetParam("extmem_concat_pages", "true");
learner->SetParam("tree_method", "approx");
ASSERT_THAT([&] { learner->UpdateOneIter(0, p_fmat); }, GMockThrow("extmem_concat_pages"));
}
}
namespace {
RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
ObjInfo task{ObjInfo::kRegression};
std::unique_ptr<TreeUpdater> hist_maker {TreeUpdater::Create("grow_gpu_hist", ctx, &task)};
hist_maker->Configure(Args{});
TrainParam param;
param.UpdateAllowUnknown(Args{});
linalg::Matrix<GradientPair> gpair({dmat->Info().num_row_}, ctx->Device());
gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_));
std::vector<HostDeviceVector<bst_node_t>> position(1);
RegTree tree;
hist_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
{&tree});
return tree;
}
void VerifyHistColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& expected_tree) {
Context ctx(MakeCUDACtx(GPUIDX));
auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
auto const world_size = collective::GetWorldSize();
auto const rank = collective::GetRank();
std::unique_ptr<DMatrix> sliced{Xy->SliceCol(world_size, rank)};
RegTree tree = GetHistTree(&ctx, sliced.get());
Json json{Object{}};
tree.SaveModel(&json);
Json expected_json{Object{}};
expected_tree.SaveModel(&expected_json);
ASSERT_EQ(json, expected_json);
}
} // anonymous namespace
class MGPUHistTest : public collective::BaseMGPUTest {};
TEST_F(MGPUHistTest, HistColumnSplit) {
auto constexpr kRows = 32;
auto constexpr kCols = 16;
Context ctx(MakeCUDACtx(0));
auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
RegTree expected_tree = GetHistTree(&ctx, dmat.get());
this->DoTest([&] { VerifyHistColumnSplit(kRows, kCols, expected_tree); }, true);
this->DoTest([&] { VerifyHistColumnSplit(kRows, kCols, expected_tree); }, false);
}
namespace {
RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
ObjInfo task{ObjInfo::kRegression};
std::unique_ptr<TreeUpdater> approx_maker{TreeUpdater::Create("grow_gpu_approx", ctx, &task)};
approx_maker->Configure(Args{});
TrainParam param;
param.UpdateAllowUnknown(Args{});
linalg::Matrix<GradientPair> gpair({dmat->Info().num_row_}, ctx->Device());
gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_));
std::vector<HostDeviceVector<bst_node_t>> position(1);
RegTree tree;
approx_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
{&tree});
return tree;
}
void VerifyApproxColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& expected_tree) {
auto ctx = MakeCUDACtx(DistGpuIdx());
auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
auto const world_size = collective::GetWorldSize();
auto const rank = collective::GetRank();
std::unique_ptr<DMatrix> sliced{Xy->SliceCol(world_size, rank)};
RegTree tree = GetApproxTree(&ctx, sliced.get());
Json json{Object{}};
tree.SaveModel(&json);
Json expected_json{Object{}};
expected_tree.SaveModel(&expected_json);
ASSERT_EQ(json, expected_json);
}
} // anonymous namespace
class MGPUApproxTest : public collective::BaseMGPUTest {};
TEST_F(MGPUApproxTest, GPUApproxColumnSplit) {
auto constexpr kRows = 32;
auto constexpr kCols = 16;
Context ctx(MakeCUDACtx(0));
auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
RegTree expected_tree = GetApproxTree(&ctx, dmat.get());
this->DoTest([&] { VerifyApproxColumnSplit(kRows, kCols, expected_tree); }, true);
this->DoTest([&] { VerifyApproxColumnSplit(kRows, kCols, expected_tree); }, false);
}
} // namespace xgboost::tree