Revamp the rabit implementation. (#10112)
This PR replaces the original RABIT implementation with a new one, which has already been partially merged into XGBoost. The new one features: - Federated learning for both CPU and GPU. - NCCL. - More data types. - A unified interface for all the underlying implementations. - Improved timeout handling for both tracker and workers. - Exhaustive tests with metrics (fixed a couple of bugs along the way). - A reusable tracker for Python and JVM packages.
This commit is contained in:
@@ -1,12 +1,12 @@
|
||||
/**
|
||||
* Copyright 2020-2023, XGBoost contributors
|
||||
* Copyright 2020-2024, XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <thrust/host_vector.h>
|
||||
|
||||
#include "../../../../src/tree/gpu_hist/evaluate_splits.cuh"
|
||||
#include "../../collective/test_worker.h" // for BaseMGPUTest
|
||||
#include "../../helpers.h"
|
||||
#include "../../histogram_helpers.h"
|
||||
#include "../test_evaluate_splits.h" // TestPartitionBasedSplit
|
||||
|
||||
namespace xgboost::tree {
|
||||
@@ -17,13 +17,13 @@ auto ZeroParam() {
|
||||
tparam.UpdateAllowUnknown(args);
|
||||
return tparam;
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
inline GradientQuantiser DummyRoundingFactor(Context const* ctx) {
|
||||
GradientQuantiser DummyRoundingFactor(Context const* ctx) {
|
||||
thrust::device_vector<GradientPair> gpair(1);
|
||||
gpair[0] = {1000.f, 1000.f}; // Tests should not exceed sum of 1000
|
||||
return {ctx, dh::ToSpan(gpair), MetaInfo()};
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
thrust::device_vector<GradientPairInt64> ConvertToInteger(Context const* ctx,
|
||||
std::vector<GradientPairPrecise> x) {
|
||||
@@ -546,7 +546,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
|
||||
ASSERT_NEAR(split.loss_chg, best_score_, 1e-2);
|
||||
}
|
||||
|
||||
class MGPUHistTest : public BaseMGPUTest {};
|
||||
class MGPUHistTest : public collective::BaseMGPUTest {};
|
||||
|
||||
namespace {
|
||||
void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
|
||||
@@ -589,21 +589,29 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
|
||||
evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, ctx.Device());
|
||||
DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
|
||||
|
||||
EXPECT_EQ(result.findex, 1) << "rank: " << rank;
|
||||
EXPECT_EQ(result.findex, 1);
|
||||
if (is_categorical) {
|
||||
ASSERT_TRUE(std::isnan(result.fvalue));
|
||||
} else {
|
||||
EXPECT_EQ(result.fvalue, 11.0) << "rank: " << rank;
|
||||
EXPECT_EQ(result.fvalue, 11.0);
|
||||
}
|
||||
EXPECT_EQ(result.left_sum + result.right_sum, parent_sum) << "rank: " << rank;
|
||||
EXPECT_EQ(result.left_sum + result.right_sum, parent_sum);
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
|
||||
DoTest(VerifyColumnSplitEvaluateSingleSplit, false);
|
||||
if (common::AllVisibleGPUs() > 1) {
|
||||
// We can't emulate multiple GPUs with NCCL.
|
||||
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(false); }, false, true);
|
||||
}
|
||||
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(false); }, true, true);
|
||||
}
|
||||
|
||||
TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) {
|
||||
DoTest(VerifyColumnSplitEvaluateSingleSplit, true);
|
||||
if (common::AllVisibleGPUs() > 1) {
|
||||
// We can't emulate multiple GPUs with NCCL.
|
||||
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(true); }, false, true);
|
||||
}
|
||||
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(true); }, true, true);
|
||||
}
|
||||
} // namespace xgboost::tree
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
#include "../../../../src/tree/hist/histogram.h" // for HistogramBuilder
|
||||
#include "../../../../src/tree/hist/param.h" // for HistMakerTrainParam
|
||||
#include "../../categorical_helpers.h" // for OneHotEncodeFeature
|
||||
#include "../../collective/test_worker.h" // for TestDistributedGlobal
|
||||
#include "../../helpers.h" // for RandomDataGenerator, GenerateRa...
|
||||
|
||||
namespace xgboost::tree {
|
||||
@@ -300,8 +301,8 @@ TEST(CPUHistogram, BuildHist) {
|
||||
|
||||
TEST(CPUHistogram, BuildHistColSplit) {
|
||||
auto constexpr kWorkers = 4;
|
||||
RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, true);
|
||||
RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, true);
|
||||
collective::TestDistributedGlobal(kWorkers, [] { TestBuildHistogram(true, true, true); });
|
||||
collective::TestDistributedGlobal(kWorkers, [] { TestBuildHistogram(true, false, true); });
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
/**
|
||||
* Copyright 2021-2023 by XGBoost contributors.
|
||||
* Copyright 2021-2024, XGBoost contributors.
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "../../../src/common/numeric.h"
|
||||
#include "../../../src/tree/common_row_partitioner.h"
|
||||
#include "../collective/test_worker.h" // for TestDistributedGlobal
|
||||
#include "../helpers.h"
|
||||
#include "test_partitioner.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace xgboost::tree {
|
||||
namespace {
|
||||
std::vector<float> GenerateHess(size_t n_samples) {
|
||||
auto grad = GenerateRandomGradients(n_samples);
|
||||
@@ -145,8 +145,9 @@ TEST(Approx, PartitionerColSplit) {
|
||||
}
|
||||
|
||||
auto constexpr kWorkers = 4;
|
||||
RunWithInMemoryCommunicator(kWorkers, TestColumnSplitPartitioner, n_samples, base_rowid, Xy,
|
||||
&hess, min_value, mid_value, mid_partitioner);
|
||||
collective::TestDistributedGlobal(kWorkers, [&] {
|
||||
TestColumnSplitPartitioner(n_samples, base_rowid, Xy, &hess, min_value, mid_value,
|
||||
mid_partitioner);
|
||||
});
|
||||
}
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::tree
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2022-2023 by XGBoost Contributors
|
||||
* Copyright 2022-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/base.h> // for GradientPairInternal, GradientPairPrecise
|
||||
@@ -14,7 +14,6 @@
|
||||
#include <limits> // for numeric_limits
|
||||
#include <numeric> // for iota
|
||||
#include <tuple> // for make_tuple, tie, tuple
|
||||
#include <utility> // for pair
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../../../src/common/hist_util.h" // for HistogramCuts, HistCollection, GHistRow
|
||||
@@ -23,7 +22,6 @@
|
||||
#include "../../../src/tree/param.h" // for TrainParam, GradStats
|
||||
#include "../../../src/tree/split_evaluator.h" // for TreeEvaluator
|
||||
#include "../helpers.h" // for SimpleLCG, SimpleRealUniformDistribution
|
||||
#include "gtest/gtest_pred_impl.h" // for AssertionResult, ASSERT_EQ, ASSERT_TRUE
|
||||
|
||||
namespace xgboost::tree {
|
||||
/**
|
||||
@@ -96,13 +94,11 @@ class TestPartitionBasedSplit : public ::testing::Test {
|
||||
|
||||
// enumerate all possible partitions to find the optimal split
|
||||
do {
|
||||
int32_t thresh;
|
||||
float score;
|
||||
std::vector<GradientPairPrecise> sorted_hist(node_hist.size());
|
||||
for (size_t i = 0; i < sorted_hist.size(); ++i) {
|
||||
sorted_hist[i] = node_hist[sorted_idx_[i]];
|
||||
}
|
||||
std::tie(thresh, score) = enumerate({sorted_hist}, total_gpair_);
|
||||
auto [thresh, score] = enumerate({sorted_hist}, total_gpair_);
|
||||
if (score > best_score_) {
|
||||
best_score_ = score;
|
||||
}
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
/**
|
||||
* Copyright 2022-2023, XGBoost Contributors
|
||||
* Copyright 2022-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/linalg.h>
|
||||
|
||||
#include "../../src/common/linalg_op.h"
|
||||
#include "../../src/tree/fit_stump.h"
|
||||
#include "../collective/test_worker.h" // for TestDistributedGlobal
|
||||
#include "../helpers.h"
|
||||
|
||||
namespace xgboost::tree {
|
||||
@@ -43,7 +44,7 @@ TEST(InitEstimation, FitStump) {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
TEST(InitEstimation, GPUFitStump) {
|
||||
Context ctx;
|
||||
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
|
||||
ctx.UpdateAllowUnknown(Args{{"device", "cuda"}});
|
||||
TestFitStump(&ctx);
|
||||
}
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
@@ -51,6 +52,6 @@ TEST(InitEstimation, GPUFitStump) {
|
||||
TEST(InitEstimation, FitStumpColumnSplit) {
|
||||
Context ctx;
|
||||
auto constexpr kWorldSize{3};
|
||||
RunWithInMemoryCommunicator(kWorldSize, &TestFitStump, &ctx, DataSplitMode::kCol);
|
||||
collective::TestDistributedGlobal(kWorldSize, [&] { TestFitStump(&ctx, DataSplitMode::kCol); });
|
||||
}
|
||||
} // namespace xgboost::tree
|
||||
|
||||
@@ -13,14 +13,19 @@
|
||||
#include "../../../src/common/common.h"
|
||||
#include "../../../src/data/ellpack_page.cuh" // for EllpackPageImpl
|
||||
#include "../../../src/data/ellpack_page.h" // for EllpackPage
|
||||
#include "../../../src/tree/param.h" // for TrainParam
|
||||
#include "../../../src/tree/param.h" // for TrainParam
|
||||
#include "../../../src/tree/updater_gpu_hist.cu"
|
||||
#include "../filesystem.h" // dmlc::TemporaryDirectory
|
||||
#include "../collective/test_worker.h" // for BaseMGPUTest
|
||||
#include "../filesystem.h" // dmlc::TemporaryDirectory
|
||||
#include "../helpers.h"
|
||||
#include "../histogram_helpers.h"
|
||||
#include "xgboost/context.h"
|
||||
#include "xgboost/json.h"
|
||||
|
||||
#if defined(XGBOOST_USE_FEDERATED)
|
||||
#include "../plugin/federated/test_worker.h" // for TestFederatedGlobal
|
||||
#endif // defined(XGBOOST_USE_FEDERATED)
|
||||
|
||||
namespace xgboost::tree {
|
||||
TEST(GpuHist, DeviceHistogram) {
|
||||
// Ensures that node allocates correctly after reaching `kStopGrowingSize`.
|
||||
@@ -458,9 +463,9 @@ void VerifyHistColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& ex
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
class MGPUHistTest : public BaseMGPUTest {};
|
||||
class MGPUHistTest : public collective::BaseMGPUTest {};
|
||||
|
||||
TEST_F(MGPUHistTest, GPUHistColumnSplit) {
|
||||
TEST_F(MGPUHistTest, HistColumnSplit) {
|
||||
auto constexpr kRows = 32;
|
||||
auto constexpr kCols = 16;
|
||||
|
||||
@@ -468,7 +473,8 @@ TEST_F(MGPUHistTest, GPUHistColumnSplit) {
|
||||
auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
|
||||
RegTree expected_tree = GetHistTree(&ctx, dmat.get());
|
||||
|
||||
DoTest(VerifyHistColumnSplit, kRows, kCols, expected_tree);
|
||||
this->DoTest([&] { VerifyHistColumnSplit(kRows, kCols, expected_tree); }, true);
|
||||
this->DoTest([&] { VerifyHistColumnSplit(kRows, kCols, expected_tree); }, false);
|
||||
}
|
||||
|
||||
namespace {
|
||||
@@ -508,7 +514,7 @@ void VerifyApproxColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const&
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
class MGPUApproxTest : public BaseMGPUTest {};
|
||||
class MGPUApproxTest : public collective::BaseMGPUTest {};
|
||||
|
||||
TEST_F(MGPUApproxTest, GPUApproxColumnSplit) {
|
||||
auto constexpr kRows = 32;
|
||||
@@ -518,6 +524,7 @@ TEST_F(MGPUApproxTest, GPUApproxColumnSplit) {
|
||||
auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
|
||||
RegTree expected_tree = GetApproxTree(&ctx, dmat.get());
|
||||
|
||||
DoTest(VerifyApproxColumnSplit, kRows, kCols, expected_tree);
|
||||
this->DoTest([&] { VerifyApproxColumnSplit(kRows, kCols, expected_tree); }, true);
|
||||
this->DoTest([&] { VerifyApproxColumnSplit(kRows, kCols, expected_tree); }, false);
|
||||
}
|
||||
} // namespace xgboost::tree
|
||||
|
||||
@@ -5,7 +5,8 @@
|
||||
#include <xgboost/tree_model.h>
|
||||
#include <xgboost/tree_updater.h>
|
||||
|
||||
#include "../../../src/tree/param.h" // for TrainParam
|
||||
#include "../../../src/tree/param.h" // for TrainParam
|
||||
#include "../collective/test_worker.h" // for TestDistributedGlobal
|
||||
#include "../helpers.h"
|
||||
|
||||
namespace xgboost::tree {
|
||||
@@ -118,8 +119,8 @@ void TestColumnSplit(bool categorical) {
|
||||
}
|
||||
|
||||
auto constexpr kWorldSize = 2;
|
||||
RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit, kRows, kCols, categorical,
|
||||
std::cref(expected_tree));
|
||||
collective::TestDistributedGlobal(
|
||||
kWorldSize, [&] { VerifyColumnSplit(kRows, kCols, categorical, expected_tree); });
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
|
||||
@@ -11,26 +11,26 @@ namespace {
|
||||
auto MakeTreeForTest() {
|
||||
bst_target_t n_targets{3};
|
||||
bst_feature_t n_features{4};
|
||||
RegTree tree{n_targets, n_features};
|
||||
CHECK(tree.IsMultiTarget());
|
||||
std::unique_ptr<RegTree> tree{std::make_unique<RegTree>(n_targets, n_features)};
|
||||
CHECK(tree->IsMultiTarget());
|
||||
linalg::Vector<float> base_weight{{1.0f, 2.0f, 3.0f}, {3ul}, DeviceOrd::CPU()};
|
||||
linalg::Vector<float> left_weight{{2.0f, 3.0f, 4.0f}, {3ul}, DeviceOrd::CPU()};
|
||||
linalg::Vector<float> right_weight{{3.0f, 4.0f, 5.0f}, {3ul}, DeviceOrd::CPU()};
|
||||
tree.ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(),
|
||||
left_weight.HostView(), right_weight.HostView());
|
||||
tree->ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(),
|
||||
left_weight.HostView(), right_weight.HostView());
|
||||
return tree;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
TEST(MultiTargetTree, JsonIO) {
|
||||
auto tree = MakeTreeForTest();
|
||||
ASSERT_EQ(tree.NumNodes(), 3);
|
||||
ASSERT_EQ(tree.NumTargets(), 3);
|
||||
ASSERT_EQ(tree.GetMultiTargetTree()->Size(), 3);
|
||||
ASSERT_EQ(tree.Size(), 3);
|
||||
ASSERT_EQ(tree->NumNodes(), 3);
|
||||
ASSERT_EQ(tree->NumTargets(), 3);
|
||||
ASSERT_EQ(tree->GetMultiTargetTree()->Size(), 3);
|
||||
ASSERT_EQ(tree->Size(), 3);
|
||||
|
||||
Json jtree{Object{}};
|
||||
tree.SaveModel(&jtree);
|
||||
tree->SaveModel(&jtree);
|
||||
|
||||
auto check_jtree = [](Json jtree, RegTree const& tree) {
|
||||
ASSERT_EQ(get<String const>(jtree["tree_param"]["num_nodes"]), std::to_string(tree.NumNodes()));
|
||||
@@ -40,7 +40,7 @@ TEST(MultiTargetTree, JsonIO) {
|
||||
ASSERT_EQ(get<I32Array const>(jtree["left_children"]).size(), tree.NumNodes());
|
||||
ASSERT_EQ(get<I32Array const>(jtree["right_children"]).size(), tree.NumNodes());
|
||||
};
|
||||
check_jtree(jtree, tree);
|
||||
check_jtree(jtree, *tree);
|
||||
|
||||
RegTree loaded;
|
||||
loaded.LoadModel(jtree);
|
||||
@@ -49,18 +49,18 @@ TEST(MultiTargetTree, JsonIO) {
|
||||
|
||||
Json jtree1{Object{}};
|
||||
loaded.SaveModel(&jtree1);
|
||||
check_jtree(jtree1, tree);
|
||||
check_jtree(jtree1, *tree);
|
||||
}
|
||||
|
||||
TEST(MultiTargetTree, DumpDot) {
|
||||
auto tree = MakeTreeForTest();
|
||||
auto n_features = tree.NumFeatures();
|
||||
auto n_features = tree->NumFeatures();
|
||||
FeatureMap fmap;
|
||||
for (bst_feature_t f = 0; f < n_features; ++f) {
|
||||
auto name = "feat_" + std::to_string(f);
|
||||
fmap.PushBack(f, name.c_str(), "q");
|
||||
}
|
||||
auto str = tree.DumpModel(fmap, true, "dot");
|
||||
auto str = tree->DumpModel(fmap, true, "dot");
|
||||
ASSERT_NE(str.find("leaf=[2, 3, 4]"), std::string::npos);
|
||||
ASSERT_NE(str.find("leaf=[3, 4, 5]"), std::string::npos);
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "../../../src/tree/common_row_partitioner.h"
|
||||
#include "../../../src/tree/hist/expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
|
||||
#include "../../../src/tree/param.h"
|
||||
#include "../collective/test_worker.h" // for TestDistributedGlobal
|
||||
#include "../helpers.h"
|
||||
#include "test_partitioner.h"
|
||||
#include "xgboost/data.h"
|
||||
@@ -190,9 +191,10 @@ void TestColumnSplitPartitioner(bst_target_t n_targets) {
|
||||
}
|
||||
|
||||
auto constexpr kWorkers = 4;
|
||||
RunWithInMemoryCommunicator(kWorkers, VerifyColumnSplitPartitioner<ExpandEntry>, n_targets,
|
||||
n_samples, n_features, base_rowid, Xy, min_value, mid_value,
|
||||
mid_partitioner);
|
||||
collective::TestDistributedGlobal(kWorkers, [&] {
|
||||
VerifyColumnSplitPartitioner<ExpandEntry>(n_targets, n_samples, n_features, base_rowid, Xy,
|
||||
min_value, mid_value, mid_partitioner);
|
||||
});
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
@@ -245,8 +247,9 @@ void TestColumnSplit(bst_target_t n_targets) {
|
||||
}
|
||||
|
||||
auto constexpr kWorldSize = 2;
|
||||
RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit, &ctx, kRows, kCols, n_targets,
|
||||
std::cref(expected_tree));
|
||||
collective::TestDistributedGlobal(kWorldSize, [&] {
|
||||
VerifyColumnSplit(&ctx, kRows, kCols, n_targets, std::cref(expected_tree));
|
||||
});
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
|
||||
Reference in New Issue
Block a user