[EM] Support CPU quantile objective for external memory. (#10751)

This commit is contained in:
Jiaming Yuan
2024-08-27 04:16:57 +08:00
committed by GitHub
parent 12c6b7ceea
commit d6ebcfb032
13 changed files with 163 additions and 36 deletions

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2022-2023 by XGBoost contributors.
* Copyright 2022-2024, XGBoost contributors.
*/
#include <gtest/gtest.h>
#include <xgboost/base.h> // for bst_node_t
@@ -43,14 +43,15 @@ void TestLeafPartition(size_t n_samples) {
std::vector<size_t> h_nptr;
float split_value{0};
bst_feature_t const split_ind = 0;
for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{64, 0.2})) {
bst_feature_t const split_ind = 0;
auto ptr = page.cut.Ptrs()[split_ind + 1];
split_value = page.cut.Values().at(ptr / 2);
GetSplit(&tree, split_value, &candidates);
partitioner.UpdatePosition(&ctx, page, candidates, &tree);
std::vector<bst_node_t> position;
partitioner.LeafPartition(&ctx, tree, hess, &position);
std::vector<bst_node_t> position(page.Size());
partitioner.LeafPartition(&ctx, tree, hess, position);
std::sort(position.begin(), position.end());
size_t beg = std::distance(
position.begin(),
@@ -76,13 +77,59 @@ void TestLeafPartition(size_t n_samples) {
auto batch = page.GetView();
size_t left{0};
for (size_t i = 0; i < batch.Size(); ++i) {
if (not_sampled(i) && batch[i].front().fvalue < split_value) {
if (not_sampled(i) && batch[i][split_ind].fvalue < split_value) {
left++;
}
}
ASSERT_EQ(left, h_nptr[1] - h_nptr[0]); // equal to number of sampled assigned to left
}
}
void TestExternalMemory() {
Context ctx;
bst_bin_t max_bin = 32;
auto p_fmat =
RandomDataGenerator{256, 16, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true);
std::vector<CommonRowPartitioner> partitioners;
RegTree tree;
std::vector<CPUExpandEntry> candidates{{0, 0}};
auto gpair = GenerateRandomGradients(p_fmat->Info().num_row_);
auto t_gpair = linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), p_fmat->Info().num_row_, 1);
std::vector<bst_node_t> position(p_fmat->Info().num_row_);
auto param = BatchParam{max_bin, TrainParam::DftSparseThreshold()};
float split_value{0.0f};
bst_feature_t const split_ind = 0;
for (auto const& page : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, param)) {
if (partitioners.empty()) {
auto ptr = page.cut.Ptrs()[split_ind + 1];
split_value = page.cut.Values().at(ptr / 2);
GetSplit(&tree, split_value, &candidates);
}
partitioners.emplace_back(&ctx, page.Size(), page.base_rowid, false);
partitioners.back().UpdatePosition(&ctx, page, candidates, &tree);
partitioners.back().LeafPartition(&ctx, tree, t_gpair, position);
}
bst_idx_t n_left{0};
for (auto const& page : p_fmat->GetBatches<SparsePage>()) {
auto batch = page.GetView();
for (size_t i = 0; i < batch.Size(); ++i) {
if (batch[i][split_ind].fvalue < split_value) {
n_left++;
}
}
}
auto n_left_pos = std::count_if(position.cbegin(), position.cend(),
[&](auto v) { return v == tree[RegTree::kRoot].LeftChild(); });
ASSERT_EQ(n_left, n_left_pos);
std::sort(position.begin(), position.end());
auto end_it = std::unique(position.begin(), position.end());
ASSERT_EQ(std::distance(position.begin(), end_it), 2);
}
} // anonymous namespace
TEST(CommonRowPartitioner, LeafPartition) {
@@ -90,4 +137,6 @@ TEST(CommonRowPartitioner, LeafPartition) {
TestLeafPartition(n_samples);
}
}
// Leaf partitioning with the data split across multiple pages.
TEST(CommonRowPartitioner, LeafPartitionExternalMemory) {
  TestExternalMemory();
}
} // namespace xgboost::tree

View File

@@ -4,6 +4,7 @@ import pytest
from hypothesis import given, settings, strategies
from xgboost.testing import no_cupy
from xgboost.testing.updater import check_quantile_loss_extmem
sys.path.append("tests/python")
from test_data_iterator import run_data_iterator
@@ -56,3 +57,8 @@ def test_cpu_data_iterator() -> None:
use_cupy=True,
on_host=False,
)
def test_quantile_objective() -> None:
    """The quantile-loss external-memory check on "cuda" must raise a ValueError.

    The error message is expected to match "external memory".
    """
    # Argument order mirrors the CPU test: samples per batch, features, batches.
    n_samples_per_batch, n_features, n_batches = 2, 2, 2
    expect_error = pytest.raises(ValueError, match="external memory")
    with expect_error:
        check_quantile_loss_extmem(
            n_samples_per_batch, n_features, n_batches, "hist", "cuda"
        )

View File

@@ -12,6 +12,7 @@ import xgboost as xgb
from xgboost import testing as tm
from xgboost.data import SingleBatchInternalIter as SingleBatch
from xgboost.testing import IteratorForTest, make_batches, non_increasing
from xgboost.testing.updater import check_quantile_loss_extmem
pytestmark = tm.timeout(30)
@@ -276,3 +277,28 @@ def test_cat_check() -> None:
Xy = xgb.DMatrix(it, enable_categorical=True)
with pytest.raises(ValueError, match="categorical features"):
xgb.train({"booster": "gblinear"}, Xy)
@given(
    strategies.integers(1, 64),
    strategies.integers(1, 8),
    strategies.integers(1, 4),
)
@settings(deadline=None, max_examples=10, print_blob=True)
def test_quantile_objective(
    n_samples_per_batch: int, n_features: int, n_batches: int
) -> None:
    """Run the quantile-loss external-memory check on CPU for each tree method.

    The data shape is drawn by hypothesis; "hist" is checked first, then "approx".
    """
    for tree_method in ("hist", "approx"):
        check_quantile_loss_extmem(
            n_samples_per_batch,
            n_features,
            n_batches,
            tree_method,
            "cpu",
        )