From 7a02facc9d5bb25b8cc7862a1d11d5071240c7c7 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 24 Oct 2023 14:33:28 +0800 Subject: [PATCH] Serialize expand entry for allgather. (#9702) --- include/xgboost/json.h | 16 +-- src/collective/allgather.h | 9 +- src/collective/allreduce.h | 9 +- src/collective/comm.h | 16 --- src/collective/communicator-inl.h | 1 - src/common/bitfield.h | 2 +- src/common/type.h | 24 ++++ src/tree/gpu_hist/expand_entry.cuh | 115 ++++++++++++----- src/tree/hist/expand_entry.h | 122 +++++++++++++++++-- src/tree/param.h | 2 +- src/tree/updater_gpu_common.cuh | 7 +- tests/cpp/collective/test_comm.cc | 4 +- tests/cpp/tree/gpu_hist/test_expand_entry.cu | 28 +++++ tests/cpp/tree/hist/test_expand_entry.cc | 57 +++++++++ 14 files changed, 336 insertions(+), 76 deletions(-) create mode 100644 src/common/type.h create mode 100644 tests/cpp/tree/gpu_hist/test_expand_entry.cu create mode 100644 tests/cpp/tree/hist/test_expand_entry.cc diff --git a/include/xgboost/json.h b/include/xgboost/json.h index c2c16ef8f..a5872ec3a 100644 --- a/include/xgboost/json.h +++ b/include/xgboost/json.h @@ -153,7 +153,7 @@ class JsonTypedArray : public Value { using Type = T; JsonTypedArray() : Value(kind) {} - explicit JsonTypedArray(size_t n) : Value(kind) { vec_.resize(n); } + explicit JsonTypedArray(std::size_t n) : Value(kind) { vec_.resize(n); } JsonTypedArray(JsonTypedArray&& that) noexcept : Value{kind}, vec_{std::move(that.vec_)} {} bool operator==(Value const& rhs) const override; @@ -171,21 +171,21 @@ class JsonTypedArray : public Value { }; /** - * \brief Typed UBJSON array for 32-bit floating point. + * @brief Typed UBJSON array for 32-bit floating point. */ using F32Array = JsonTypedArray; /** - * \brief Typed UBJSON array for uint8_t. + * @brief Typed UBJSON array for uint8_t. */ -using U8Array = JsonTypedArray; +using U8Array = JsonTypedArray; /** - * \brief Typed UBJSON array for int32_t. + * @brief Typed UBJSON array for int32_t. */ -using I32Array = JsonTypedArray; +using I32Array = JsonTypedArray; /** - * \brief Typed UBJSON array for int64_t. + * @brief Typed UBJSON array for int64_t. */ -using I64Array = JsonTypedArray; +using I64Array = JsonTypedArray; class JsonObject : public Value { public: diff --git a/src/collective/allgather.h b/src/collective/allgather.h index 967187ceb..a566da78d 100644 --- a/src/collective/allgather.h +++ b/src/collective/allgather.h @@ -9,7 +9,8 @@ #include // for remove_cv_t #include // for vector -#include "comm.h" // for Comm, Channel, EraseType +#include "../common/type.h" // for EraseType +#include "comm.h" // for Comm, Channel #include "xgboost/collective/result.h" // for Result #include "xgboost/span.h" // for Span @@ -33,7 +34,7 @@ namespace cpu_impl { template [[nodiscard]] Result RingAllgather(Comm const& comm, common::Span data, std::size_t size) { auto n_bytes = sizeof(T) * size; - auto erased = EraseType(data); + auto erased = common::EraseType(data); auto rank = comm.Rank(); auto prev = BootstrapPrev(rank, comm.World()); @@ -65,8 +66,8 @@ template auto n_total_bytes = std::accumulate(sizes.cbegin(), sizes.cend(), 0); result.resize(n_total_bytes / sizeof(T)); auto h_result = common::Span{result.data(), result.size()}; - auto erased_result = EraseType(h_result); - auto erased_data = EraseType(data); + auto erased_result = common::EraseType(h_result); + auto erased_data = common::EraseType(data); std::vector offset(world + 1); return cpu_impl::RingAllgatherV(comm, sizes, erased_data, diff --git a/src/collective/allreduce.h b/src/collective/allreduce.h index e3f8ab5b8..0c94d11cc 100644 --- a/src/collective/allreduce.h +++ b/src/collective/allreduce.h @@ -4,8 +4,9 @@ #pragma once #include // for int8_t #include // for function -#include // for is_invocable_v +#include // for is_invocable_v, enable_if_t +#include "../common/type.h" // for EraseType, RestoreType #include "../data/array_interface.h" // for ArrayInterfaceHandler #include "comm.h" // for Comm, RestoreType #include "xgboost/collective/result.h" // for Result @@ -23,14 +24,14 @@ Result RingAllreduce(Comm const& comm, common::Span data, Func cons template std::enable_if_t, common::Span>, Result> Allreduce( Comm const& comm, common::Span data, Fn redop) { - auto erased = EraseType(data); + auto erased = common::EraseType(data); auto type = ToDType::kType; auto erased_fn = [type, redop](common::Span lhs, common::Span out) { CHECK_EQ(lhs.size(), out.size()) << "Invalid input for reduction."; - auto lhs_t = RestoreType(lhs); - auto rhs_t = RestoreType(out); + auto lhs_t = common::RestoreType(lhs); + auto rhs_t = common::RestoreType(out); redop(lhs_t, rhs_t); }; diff --git a/src/collective/comm.h b/src/collective/comm.h index b501fcddd..adf23b9e4 100644 --- a/src/collective/comm.h +++ b/src/collective/comm.h @@ -137,20 +137,4 @@ class Channel { }; enum class Op { kMax = 0, kMin = 1, kSum = 2, kBitwiseAND = 3, kBitwiseOR = 4, kBitwiseXOR = 5 }; - -template , - std::add_const_t, std::int8_t>> -common::Span EraseType(common::Span data) { - auto n_total_bytes = data.size_bytes(); - auto erased = common::Span{reinterpret_cast>(data.data()), n_total_bytes}; - return erased; -} - -template -common::Span RestoreType(common::Span data) { - static_assert(std::is_same_v, std::int8_t>); - auto n_total_bytes = data.size_bytes(); - auto restored = common::Span{reinterpret_cast(data.data()), n_total_bytes / sizeof(T)}; - return restored; -} } // namespace xgboost::collective diff --git a/src/collective/communicator-inl.h b/src/collective/communicator-inl.h index c58a9f3bc..34212def2 100644 --- a/src/collective/communicator-inl.h +++ b/src/collective/communicator-inl.h @@ -327,6 +327,5 @@ inline SpecialAllgatherVResult SpecialAllgatherV(std::vector const &inputs return {offsets, all_sizes, all_inputs}; } - } // namespace collective } // namespace xgboost diff --git a/src/common/bitfield.h b/src/common/bitfield.h index efabaa834..621078764 100644 --- a/src/common/bitfield.h +++ b/src/common/bitfield.h @@ -9,7 +9,7 @@ #include // for bitset #include // for uint32_t, uint64_t, uint8_t #include // for ostream -#include // for conditional_t, is_signed_v +#include // for conditional_t, is_signed_v, add_const_t #if defined(__CUDACC__) #include diff --git a/src/common/type.h b/src/common/type.h new file mode 100644 index 000000000..661a52ec1 --- /dev/null +++ b/src/common/type.h @@ -0,0 +1,24 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#pragma once +#include // for int8_t +#include // for is_const_v, add_const_t, conditional_t, add_pointer_t + +#include "xgboost/span.h" // for Span +namespace xgboost::common { +template , + std::add_const_t, std::int8_t>> +common::Span EraseType(common::Span data) { + auto n_total_bytes = data.size_bytes(); + auto erased = common::Span{reinterpret_cast>(data.data()), n_total_bytes}; + return erased; +} + +template +common::Span RestoreType(common::Span data) { + auto n_total_bytes = data.size_bytes(); + auto restored = common::Span{reinterpret_cast(data.data()), n_total_bytes / sizeof(T)}; + return restored; +} +} // namespace xgboost::common diff --git a/src/tree/gpu_hist/expand_entry.cuh b/src/tree/gpu_hist/expand_entry.cuh index f21563a15..42dc7f49a 100644 --- a/src/tree/gpu_hist/expand_entry.cuh +++ b/src/tree/gpu_hist/expand_entry.cuh @@ -1,31 +1,36 @@ -/*! - * Copyright 2020 by XGBoost Contributors +/** + * Copyright 2020-2023, XGBoost Contributors */ #ifndef EXPAND_ENTRY_CUH_ #define EXPAND_ENTRY_CUH_ -#include + +#include // for numeric_limits +#include // for move #include "../param.h" #include "../updater_gpu_common.cuh" +#include "xgboost/base.h" // for bst_node_t -namespace xgboost { -namespace tree { - +namespace xgboost::tree { struct GPUExpandEntry { - int nid; - int depth; + bst_node_t nid; + bst_node_t depth; DeviceSplitCandidate split; - float base_weight { std::numeric_limits::quiet_NaN() }; - float left_weight { std::numeric_limits::quiet_NaN() }; - float right_weight { std::numeric_limits::quiet_NaN() }; + float base_weight{std::numeric_limits::quiet_NaN()}; + float left_weight{std::numeric_limits::quiet_NaN()}; + float right_weight{std::numeric_limits::quiet_NaN()}; GPUExpandEntry() = default; - XGBOOST_DEVICE GPUExpandEntry(int nid, int depth, DeviceSplitCandidate split, - float base, float left, float right) - : nid(nid), depth(depth), split(std::move(split)), base_weight{base}, - left_weight{left}, right_weight{right} {} - bool IsValid(const TrainParam& param, int num_leaves) const { + XGBOOST_DEVICE GPUExpandEntry(bst_node_t nid, bst_node_t depth, DeviceSplitCandidate split, + float base, float left, float right) + : nid(nid), + depth(depth), + split(std::move(split)), + base_weight{base}, + left_weight{left}, + right_weight{right} {} + [[nodiscard]] bool IsValid(TrainParam const& param, bst_node_t num_leaves) const { if (split.loss_chg <= kRtEps) return false; if (split.left_sum.GetQuantisedHess() == 0 || split.right_sum.GetQuantisedHess() == 0) { return false; @@ -42,17 +47,11 @@ struct GPUExpandEntry { return true; } - bst_float GetLossChange() const { - return split.loss_chg; - } + [[nodiscard]] float GetLossChange() const { return split.loss_chg; } - int GetNodeId() const { - return nid; - } + [[nodiscard]] bst_node_t GetNodeId() const { return nid; } - int GetDepth() const { - return depth; - } + [[nodiscard]] bst_node_t GetDepth() const { return depth; } friend std::ostream& operator<<(std::ostream& os, const GPUExpandEntry& e) { os << "GPUExpandEntry: \n"; @@ -63,9 +62,69 @@ struct GPUExpandEntry { os << "right_sum: " << e.split.right_sum << "\n"; return os; } -}; -} // namespace tree -} // namespace xgboost + void Save(Json* p_out) const { + auto& out = *p_out; + + out["nid"] = Integer{this->nid}; + out["depth"] = Integer{this->depth}; + // GPU specific + out["base_weight"] = this->base_weight; + out["left_weight"] = this->left_weight; + out["right_weight"] = this->right_weight; + + /** + * Handle split + */ + out["split"] = Object{}; + auto& split = out["split"]; + split["loss_chg"] = this->split.loss_chg; + split["sindex"] = Integer{this->split.findex}; + split["split_value"] = this->split.fvalue; + + // cat + split["thresh"] = Integer{this->split.thresh}; + split["is_cat"] = Boolean{this->split.is_cat}; + /** + * Gradients + */ + auto save = [&](std::string const& name, GradientPairInt64 const& sum) { + out[name] = I64Array{2}; + auto& array = get(out[name]); + array[0] = sum.GetQuantisedGrad(); + array[1] = sum.GetQuantisedHess(); + }; + save("left_sum", this->split.left_sum); + save("right_sum", this->split.right_sum); + } + + void Load(Json const& in) { + this->nid = get(in["nid"]); + this->depth = get(in["depth"]); + // GPU specific + this->base_weight = get(in["base_weight"]); + this->left_weight = get(in["left_weight"]); + this->right_weight = get(in["right_weight"]); + + /** + * Handle split + */ + auto const& split = in["split"]; + this->split.loss_chg = get(split["loss_chg"]); + this->split.findex = get(split["sindex"]); + this->split.fvalue = get(split["split_value"]); + // cat + this->split.thresh = get(split["thresh"]); + this->split.is_cat = get(split["is_cat"]); + /** + * Gradients + */ + auto const& left_sum = get(in["left_sum"]); + this->split.left_sum = GradientPairInt64{left_sum[0], left_sum[1]}; + auto const& right_sum = get(in["right_sum"]); + this->split.right_sum = GradientPairInt64{right_sum[0], right_sum[1]}; + } +}; +} // namespace xgboost::tree #endif // EXPAND_ENTRY_CUH_ diff --git a/src/tree/hist/expand_entry.h b/src/tree/hist/expand_entry.h index 0225a5110..d6315877d 100644 --- a/src/tree/hist/expand_entry.h +++ b/src/tree/hist/expand_entry.h @@ -1,16 +1,20 @@ /** - * Copyright 2021-2023 XGBoost contributors + * Copyright 2021-2023, XGBoost Contributors */ #ifndef XGBOOST_TREE_HIST_EXPAND_ENTRY_H_ #define XGBOOST_TREE_HIST_EXPAND_ENTRY_H_ -#include // for all_of -#include // for ostream -#include // for move -#include // for vector +#include // for all_of +#include // for ostream +#include // for string +#include // for add_const_t +#include // for move +#include // for vector -#include "../param.h" // for SplitEntry, SplitEntryContainer, TrainParam -#include "xgboost/base.h" // for GradientPairPrecise, bst_node_t +#include "../../common/type.h" // for EraseType +#include "../param.h" // for SplitEntry, SplitEntryContainer, TrainParam +#include "xgboost/base.h" // for GradientPairPrecise, bst_node_t +#include "xgboost/json.h" // for Json namespace xgboost::tree { /** @@ -29,6 +33,66 @@ struct ExpandEntryImpl { [[nodiscard]] bool IsValid(TrainParam const& param, bst_node_t num_leaves) const { return static_cast(this)->IsValidImpl(param, num_leaves); } + + void Save(Json* p_out) const { + auto& out = *p_out; + auto self = static_cast(this); + + out["nid"] = Integer{this->nid}; + out["depth"] = Integer{this->depth}; + + /** + * Handle split + */ + out["split"] = Object{}; + auto& split = out["split"]; + split["loss_chg"] = self->split.loss_chg; + split["sindex"] = Integer{self->split.sindex}; + split["split_value"] = self->split.split_value; + + auto const& cat_bits = self->split.cat_bits; + auto s_cat_bits = common::Span{cat_bits.data(), cat_bits.size()}; + split["cat_bits"] = U8Array{s_cat_bits.size_bytes()}; + auto& j_cat_bits = get(split["cat_bits"]); + using T = typename decltype(self->split.cat_bits)::value_type; + auto erased = + common::EraseType, std::add_const_t>(s_cat_bits); + for (std::size_t i = 0; i < erased.size(); ++i) { + j_cat_bits[i] = erased[i]; + } + + split["is_cat"] = Boolean{self->split.is_cat}; + + self->SaveGrad(&split); + } + + void Load(Json const& in) { + auto self = static_cast(this); + + this->nid = get(in["nid"]); + this->depth = get(in["depth"]); + + /** + * Handle split + */ + auto const& split = in["split"]; + self->split.loss_chg = get(split["loss_chg"]); + self->split.sindex = get(split["sindex"]); + self->split.split_value = get(split["split_value"]); + + auto const& j_cat_bits = get(split["cat_bits"]); + using T = typename decltype(self->split.cat_bits)::value_type; + auto restored = common::RestoreType>( + common::Span{j_cat_bits.data(), j_cat_bits.size()}); + self->split.cat_bits.resize(restored.size()); + for (std::size_t i = 0; i < restored.size(); ++i) { + self->split.cat_bits[i] = restored[i]; + } + + self->split.is_cat = get(split["is_cat"]); + + self->LoadGrad(split); + } }; struct CPUExpandEntry : public ExpandEntryImpl { @@ -39,6 +103,24 @@ struct CPUExpandEntry : public ExpandEntryImpl { : ExpandEntryImpl{nidx, depth}, split(std::move(split)) {} CPUExpandEntry(bst_node_t nidx, bst_node_t depth) : ExpandEntryImpl{nidx, depth} {} + void SaveGrad(Json* p_out) const { + auto& out = *p_out; + auto save = [&](std::string const& name, GradStats const& sum) { + out[name] = F32Array{2}; + auto& array = get(out[name]); + array[0] = sum.GetGrad(); + array[1] = sum.GetHess(); + }; + save("left_sum", this->split.left_sum); + save("right_sum", this->split.right_sum); + } + void LoadGrad(Json const& in) { + auto const& left_sum = get(in["left_sum"]); + this->split.left_sum = GradStats{left_sum[0], left_sum[1]}; + auto const& right_sum = get(in["right_sum"]); + this->split.right_sum = GradStats{right_sum[0], right_sum[1]}; + } + [[nodiscard]] bool IsValidImpl(TrainParam const& param, bst_node_t num_leaves) const { if (split.loss_chg <= kRtEps) return false; if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) { @@ -88,6 +170,32 @@ struct MultiExpandEntry : public ExpandEntryImpl { MultiExpandEntry() = default; MultiExpandEntry(bst_node_t nidx, bst_node_t depth) : ExpandEntryImpl{nidx, depth} {} + void SaveGrad(Json* p_out) const { + auto& out = *p_out; + auto save = [&](std::string const& name, std::vector const& sum) { + out[name] = F32Array{sum.size() * 2}; + auto& array = get(out[name]); + for (std::size_t i = 0, j = 0; i < sum.size(); i++, j += 2) { + array[j] = sum[i].GetGrad(); + array[j + 1] = sum[i].GetHess(); + } + }; + save("left_sum", this->split.left_sum); + save("right_sum", this->split.right_sum); + } + void LoadGrad(Json const& in) { + auto load = [&](std::string const& name, std::vector* p_sum) { + auto const& array = get(in[name]); + auto& sum = *p_sum; + sum.resize(array.size() / 2); + for (std::size_t i = 0, j = 0; i < sum.size(); ++i, j += 2) { + sum[i] = GradientPairPrecise{array[j], array[j + 1]}; + } + }; + load("left_sum", &this->split.left_sum); + load("right_sum", &this->split.right_sum); + } + [[nodiscard]] bool IsValidImpl(TrainParam const& param, bst_node_t num_leaves) const { if (split.loss_chg <= kRtEps) return false; auto is_zero = [](auto const& sum) { diff --git a/src/tree/param.h b/src/tree/param.h index 5e2a36dfe..5f32a786b 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -401,7 +401,7 @@ struct SplitEntryContainer { /*! \brief split index */ bst_feature_t sindex{0}; bst_float split_value{0.0f}; - std::vector cat_bits; + std::vector cat_bits; bool is_cat{false}; GradientT left_sum; diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index 8f5b27ac6..1c3e6a552 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -14,9 +14,7 @@ #include "gpu_hist/histogram.cuh" #include "param.h" -namespace xgboost { -namespace tree { - +namespace xgboost::tree { struct GPUTrainingParam { // minimum amount of hessian(weight) allowed in a child float min_child_weight; @@ -136,5 +134,4 @@ struct SumCallbackOp { return old_prefix; } }; -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/tests/cpp/collective/test_comm.cc b/tests/cpp/collective/test_comm.cc index 7792c4c25..52fec7b5d 100644 --- a/tests/cpp/collective/test_comm.cc +++ b/tests/cpp/collective/test_comm.cc @@ -4,7 +4,9 @@ #include #include "../../../src/collective/comm.h" -#include "test_worker.h" +#include "../../../src/common/type.h" // for EraseType +#include "test_worker.h" // for TrackerTest + namespace xgboost::collective { namespace { class CommTest : public TrackerTest {}; diff --git a/tests/cpp/tree/gpu_hist/test_expand_entry.cu b/tests/cpp/tree/gpu_hist/test_expand_entry.cu new file mode 100644 index 000000000..68780f643 --- /dev/null +++ b/tests/cpp/tree/gpu_hist/test_expand_entry.cu @@ -0,0 +1,28 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include +#include +#include // for RegTree + +#include "../../../../src/tree/gpu_hist/expand_entry.cuh" + +namespace xgboost::tree { +TEST(ExpandEntry, IOGPU) { + DeviceSplitCandidate split; + GPUExpandEntry entry{RegTree::kRoot, 0, split, 3.0, 1.0, 2.0}; + + Json je{Object{}}; + entry.Save(&je); + + GPUExpandEntry loaded; + loaded.Load(je); + + ASSERT_EQ(entry.base_weight, loaded.base_weight); + ASSERT_EQ(entry.left_weight, loaded.left_weight); + ASSERT_EQ(entry.right_weight, loaded.right_weight); + + ASSERT_EQ(entry.GetDepth(), loaded.GetDepth()); + ASSERT_EQ(entry.GetLossChange(), loaded.GetLossChange()); +} +} // namespace xgboost::tree diff --git a/tests/cpp/tree/hist/test_expand_entry.cc b/tests/cpp/tree/hist/test_expand_entry.cc new file mode 100644 index 000000000..c47615688 --- /dev/null +++ b/tests/cpp/tree/hist/test_expand_entry.cc @@ -0,0 +1,57 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include +#include // for Json +#include // for RegTree + +#include "../../../../src/tree/hist/expand_entry.h" + +namespace xgboost::tree { +TEST(ExpandEntry, IO) { + CPUExpandEntry entry{RegTree::kRoot, 0}; + entry.split.Update(1.0, 1, /*new_split_value=*/0.3, true, true, GradStats{1.0, 1.0}, + GradStats{2.0, 2.0}); + bst_bin_t n_bins_feature = 256; + auto n = common::CatBitField::ComputeStorageSize(n_bins_feature); + entry.split.cat_bits = decltype(entry.split.cat_bits)(n, 0); + common::CatBitField cat_bits{entry.split.cat_bits}; + cat_bits.Set(n_bins_feature / 2); + + Json je{Object{}}; + entry.Save(&je); + + CPUExpandEntry loaded; + loaded.Load(je); + + ASSERT_EQ(loaded.split.is_cat, entry.split.is_cat); + ASSERT_EQ(loaded.split.cat_bits, entry.split.cat_bits); + ASSERT_EQ(loaded.split.left_sum.GetGrad(), entry.split.left_sum.GetGrad()); + ASSERT_EQ(loaded.split.right_sum.GetHess(), entry.split.right_sum.GetHess()); +} + +TEST(ExpandEntry, IOMulti) { + MultiExpandEntry entry{RegTree::kRoot, 0}; + auto left_sum = std::vector{{1.0, 1.0}, {1.0, 1.0}}; + auto right_sum = std::vector{{2.0, 2.0}, {2.0, 2.0}}; + entry.split.Update(1.0, 1, /*new_split_value=*/0.3, true, true, + linalg::MakeVec(left_sum.data(), left_sum.size()), + linalg::MakeVec(right_sum.data(), right_sum.size())); + bst_bin_t n_bins_feature = 256; + auto n = common::CatBitField::ComputeStorageSize(n_bins_feature); + entry.split.cat_bits = decltype(entry.split.cat_bits)(n, 0); + common::CatBitField cat_bits{entry.split.cat_bits}; + cat_bits.Set(n_bins_feature / 2); + + Json je{Object{}}; + entry.Save(&je); + + MultiExpandEntry loaded; + loaded.Load(je); + + ASSERT_EQ(loaded.split.is_cat, entry.split.is_cat); + ASSERT_EQ(loaded.split.cat_bits, entry.split.cat_bits); + ASSERT_EQ(loaded.split.left_sum, entry.split.left_sum); + ASSERT_EQ(loaded.split.right_sum, entry.split.right_sum); +} +} // namespace xgboost::tree