[EM] Enable access to the number of batches. (#10691)

- Expose `NumBatches` in `DMatrix`.
- Small cleanup for removing legacy CUDA stream and ~force CUDA context initialization~.
- Purge old external memory data generation code.
This commit is contained in:
Jiaming Yuan
2024-08-17 02:59:45 +08:00
committed by GitHub
parent 033a666900
commit 8d7fe262d9
26 changed files with 169 additions and 352 deletions

View File

@@ -7,22 +7,18 @@
#include "../../../../src/tree/gpu_hist/gradient_based_sampler.cuh"
#include "../../../../src/tree/param.h"
#include "../../../../src/tree/param.h" // TrainParam
#include "../../filesystem.h" // dmlc::TemporaryDirectory
#include "../../helpers.h"
namespace xgboost::tree {
void VerifySampling(size_t page_size,
float subsample,
int sampling_method,
bool fixed_size_sampling = true,
bool check_sum = true) {
void VerifySampling(size_t page_size, float subsample, int sampling_method,
bool fixed_size_sampling = true, bool check_sum = true) {
constexpr size_t kRows = 4096;
constexpr size_t kCols = 1;
size_t sample_rows = kRows * subsample;
bst_idx_t sample_rows = kRows * subsample;
bst_idx_t n_batches = fixed_size_sampling ? 1 : 4;
dmlc::TemporaryDirectory tmpdir;
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrix(
kRows, kCols, kRows / (page_size == 0 ? kRows : page_size), tmpdir.path + "/cache"));
auto dmat = RandomDataGenerator{kRows, kCols, 0.0f}.Batches(n_batches).GenerateSparsePageDMatrix(
"temp", true);
auto gpair = GenerateRandomGradients(kRows);
GradientPair sum_gpair{};
for (const auto& gp : gpair.ConstHostVector()) {
@@ -78,14 +74,12 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
constexpr size_t kRows = 2048;
constexpr size_t kCols = 1;
constexpr float kSubsample = 1.0f;
constexpr size_t kPageSize = 1024;
// Create a DMatrix with multiple batches.
dmlc::TemporaryDirectory tmpdir;
std::unique_ptr<DMatrix> dmat(
CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache"));
auto dmat =
RandomDataGenerator{kRows, kCols, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true);
auto gpair = GenerateRandomGradients(kRows);
Context ctx{MakeCUDACtx(0)};
auto ctx = MakeCUDACtx(0);
gpair.SetDevice(ctx.Device());
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};

View File

@@ -406,7 +406,8 @@ namespace {
void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, bool is_approx,
bool force_read_by_column) {
size_t constexpr kEntries = 1 << 16;
auto m = CreateSparsePageDMatrix(kEntries, "cache");
auto m =
RandomDataGenerator{kEntries / 8, 8, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true);
std::vector<float> hess(m->Info().num_row_, 1.0);
if (is_approx) {

View File

@@ -17,12 +17,11 @@
#include "../../../src/common/random.h" // for GlobalRandom
#include "../../../src/tree/param.h" // for TrainParam
#include "../collective/test_worker.h" // for BaseMGPUTest
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
namespace xgboost::tree {
namespace {
void UpdateTree(Context const* ctx, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat, bool is_ext,
void UpdateTree(Context const* ctx, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
RegTree* tree, HostDeviceVector<bst_float>* preds, float subsample,
const std::string& sampling_method, bst_bin_t max_bin) {
Args args{
@@ -45,7 +44,7 @@ void UpdateTree(Context const* ctx, linalg::Matrix<GradientPair>* gpair, DMatrix
hist_maker->Update(&param, gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
{tree});
auto cache = linalg::MakeTensorView(ctx, preds->DeviceSpan(), preds->Size(), 1);
if (subsample < 1.0 && is_ext) {
if (subsample < 1.0 && !dmat->SingleColBlock()) {
ASSERT_FALSE(hist_maker->UpdatePredictionCache(dmat, cache));
} else {
ASSERT_TRUE(hist_maker->UpdatePredictionCache(dmat, cache));
@@ -58,22 +57,23 @@ TEST(GpuHist, UniformSampling) {
constexpr size_t kCols = 2;
constexpr float kSubsample = 0.9999;
common::GlobalRandom().seed(1994);
auto ctx = MakeCUDACtx(0);
// Create an in-memory DMatrix.
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
auto p_fmat = RandomDataGenerator{kRows, kCols, 0.0f}.GenerateDMatrix(true);
ASSERT_TRUE(p_fmat->SingleColBlock());
linalg::Matrix<GradientPair> gpair({kRows}, Context{}.MakeCUDA().Device());
linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
gpair.Data()->Copy(GenerateRandomGradients(kRows));
// Build a tree using the in-memory DMatrix.
RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, DeviceOrd::CUDA(0));
Context ctx(MakeCUDACtx(0));
UpdateTree(&ctx, &gpair, dmat.get(), false, &tree, &preds, 1.0, "uniform", kRows);
HostDeviceVector<bst_float> preds(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows);
// Build another tree using sampling.
RegTree tree_sampling;
HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0));
UpdateTree(&ctx, &gpair, dmat.get(), false, &tree_sampling, &preds_sampling, kSubsample, "uniform",
HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat.get(), &tree_sampling, &preds_sampling, kSubsample, "uniform",
kRows);
// Make sure the predictions are the same.
@@ -89,23 +89,23 @@ TEST(GpuHist, GradientBasedSampling) {
constexpr size_t kCols = 2;
constexpr float kSubsample = 0.9999;
common::GlobalRandom().seed(1994);
auto ctx = MakeCUDACtx(0);
// Create an in-memory DMatrix.
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
auto p_fmat = RandomDataGenerator{kRows, kCols, 0.0f}.GenerateDMatrix(true);
linalg::Matrix<GradientPair> gpair({kRows}, MakeCUDACtx(0).Device());
linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
gpair.Data()->Copy(GenerateRandomGradients(kRows));
// Build a tree using the in-memory DMatrix.
RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, DeviceOrd::CUDA(0));
Context ctx(MakeCUDACtx(0));
UpdateTree(&ctx, &gpair, dmat.get(), false, &tree, &preds, 1.0, "uniform", kRows);
HostDeviceVector<bst_float> preds(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows);
// Build another tree using sampling.
RegTree tree_sampling;
HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0));
UpdateTree(&ctx, &gpair, dmat.get(), false, &tree_sampling, &preds_sampling, kSubsample,
HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat.get(), &tree_sampling, &preds_sampling, kSubsample,
"gradient_based", kRows);
// Make sure the predictions are the same.
@@ -119,29 +119,29 @@ TEST(GpuHist, GradientBasedSampling) {
TEST(GpuHist, ExternalMemory) {
constexpr size_t kRows = 4096;
constexpr size_t kCols = 2;
constexpr size_t kPageSize = 1024;
dmlc::TemporaryDirectory tmpdir;
// Create a DMatrix with multiple batches.
std::unique_ptr<DMatrix> dmat_ext(
CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache"));
auto p_fmat_ext =
RandomDataGenerator{kRows, kCols, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true);
ASSERT_FALSE(p_fmat_ext->SingleColBlock());
// Create a single batch DMatrix.
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrix(kRows, kCols, 1, tmpdir.path + "/cache"));
auto p_fmat =
RandomDataGenerator{kRows, kCols, 0.0f}.Batches(1).GenerateSparsePageDMatrix("temp", true);
ASSERT_TRUE(p_fmat->SingleColBlock());
Context ctx(MakeCUDACtx(0));
auto ctx = MakeCUDACtx(0);
linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
gpair.Data()->Copy(GenerateRandomGradients(kRows));
// Build a tree using the in-memory DMatrix.
RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, DeviceOrd::CUDA(0));
UpdateTree(&ctx, &gpair, dmat.get(), false, &tree, &preds, 1.0, "uniform", kRows);
HostDeviceVector<bst_float> preds(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows);
// Build another tree using multiple ELLPACK pages.
RegTree tree_ext;
HostDeviceVector<bst_float> preds_ext(kRows, 0.0, DeviceOrd::CUDA(0));
UpdateTree(&ctx, &gpair, dmat_ext.get(), true, &tree_ext, &preds_ext, 1.0, "uniform", kRows);
HostDeviceVector<bst_float> preds_ext(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat_ext.get(), &tree_ext, &preds_ext, 1.0, "uniform", kRows);
// Make sure the predictions are the same.
auto preds_h = preds.ConstHostVector();
@@ -157,20 +157,21 @@ TEST(GpuHist, ExternalMemoryWithSampling) {
const std::string kSamplingMethod = "gradient_based";
common::GlobalRandom().seed(0);
dmlc::TemporaryDirectory tmpdir;
Context ctx(MakeCUDACtx(0));
auto ctx = MakeCUDACtx(0);
// Create a single batch DMatrix.
auto p_fmat = RandomDataGenerator{kRows, kCols, 0.0f}
.Device(ctx.Device())
.Batches(1)
.GenerateSparsePageDMatrix("temp", true);
ASSERT_TRUE(p_fmat->SingleColBlock());
// Create a DMatrix with multiple batches.
auto p_fmat_ext = RandomDataGenerator{kRows, kCols, 0.0f}
.Device(ctx.Device())
.Batches(4)
.GenerateSparsePageDMatrix("temp", true);
ASSERT_FALSE(p_fmat_ext->SingleColBlock());
linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
gpair.Data()->Copy(GenerateRandomGradients(kRows));
@@ -179,26 +180,25 @@ TEST(GpuHist, ExternalMemoryWithSampling) {
auto rng = common::GlobalRandom();
RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, DeviceOrd::CUDA(0));
UpdateTree(&ctx, &gpair, p_fmat.get(), true, &tree, &preds, kSubsample, kSamplingMethod, kRows);
HostDeviceVector<bst_float> preds(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, kSubsample, kSamplingMethod, kRows);
// Build another tree using multiple ELLPACK pages.
common::GlobalRandom() = rng;
RegTree tree_ext;
HostDeviceVector<bst_float> preds_ext(kRows, 0.0, DeviceOrd::CUDA(0));
UpdateTree(&ctx, &gpair, p_fmat_ext.get(), true, &tree_ext, &preds_ext, kSubsample,
kSamplingMethod, kRows);
HostDeviceVector<bst_float> preds_ext(kRows, 0.0, ctx.Device());
UpdateTree(&ctx, &gpair, p_fmat_ext.get(), &tree_ext, &preds_ext, kSubsample, kSamplingMethod,
kRows);
// Make sure the predictions are the same.
auto preds_h = preds.ConstHostVector();
auto preds_ext_h = preds_ext.ConstHostVector();
for (size_t i = 0; i < kRows; i++) {
ASSERT_NEAR(preds_h[i], preds_ext_h[i], 1e-3);
}
Json jtree{Object{}};
Json jtree_ext{Object{}};
tree.SaveModel(&jtree);
tree_ext.SaveModel(&jtree_ext);
ASSERT_EQ(jtree, jtree_ext);
}
TEST(GpuHist, ConfigIO) {
Context ctx(MakeCUDACtx(0));
auto ctx = MakeCUDACtx(0);
ObjInfo task{ObjInfo::kRegression};
std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_gpu_hist", &ctx, &task)};
updater->Configure(Args{});