[EM] Support quantile objectives for GPU-based external memory. (#10820)

- Improved error message for memory usage.
- Support quantile-based objectives for GPU external memory.
Author: Jiaming Yuan
Date: 2024-09-17 13:27:02 +08:00
Committed by: GitHub
Parent: de00e07087
Commit: 96bbf80457
10 changed files with 177 additions and 37 deletions

src/common/common.cc

@@ -5,9 +5,11 @@
 #include <dmlc/thread_local.h>  // for ThreadLocalStore
+#include <cmath>                // for pow
 #include <cstdint>              // for uint8_t
 #include <cstdio>               // for snprintf, size_t
 #include <string>               // for string
+#include <utility>              // for pair
 #include "./random.h"           // for GlobalRandomEngine, GlobalRandom
@@ -54,4 +56,20 @@ void EscapeU8(std::string const &string, std::string *p_buffer) {
 }
 }
 }
+
+std::string HumanMemUnit(std::size_t n_bytes) {
+  auto n_bytes_f64 = static_cast<double>(n_bytes);
+  double constexpr k1024 = 1024.0;
+  using P = std::pair<std::int32_t, StringView>;
+  std::stringstream ss;
+  for (auto pu : {P{3, "GB"}, P{2, "MB"}, P{1, "KB"}}) {
+    auto const [power, unit] = pu;  // NOLINT
+    if (n_bytes_f64 >= (std::pow(k1024, power))) {
+      ss << (n_bytes_f64 / std::pow(k1024, power)) << unit;
+      return ss.str();
+    }
+  }
+  ss << n_bytes_f64 << "B";
+  return ss.str();
+}
 }  // namespace xgboost::common
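For context, a minimal usage sketch of the new helper (not part of the diff; it assumes you are inside the XGBoost source tree and link against the internals that define xgboost::common::HumanMemUnit):

#include <iostream>
#include "common.h"  // xgboost/src/common/common.h

int main() {
  using xgboost::common::HumanMemUnit;
  std::cout << HumanMemUnit(512) << "\n";              // prints 512B
  std::cout << HumanMemUnit(3 * 1024 * 1024) << "\n";  // prints 3MB
  std::cout << HumanMemUnit(1ULL << 31) << "\n";       // prints 2GB
  return 0;
}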

src/common/common.h

@@ -188,5 +188,8 @@ template <typename Indexable>
 XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
   return indptr[group + 1] - 1;
 }
+
+// Convert the number of bytes to a human readable unit.
+std::string HumanMemUnit(std::size_t n_bytes);
 }  // namespace xgboost::common
 #endif  // XGBOOST_COMMON_COMMON_H_

src/common/device_helpers.cuh

@@ -15,8 +15,7 @@
 #include <algorithm>
 #include <cstddef>  // for size_t
 #include <cub/cub.cuh>
-#include <cub/util_type.cuh>  // for UnitWord
-#include <tuple>
+#include <cub/util_type.cuh>  // for UnitWord, DoubleBuffer
 #include <vector>

 #include "common.h"
@@ -635,7 +634,7 @@ size_t SegmentedUnique(const thrust::detail::execution_policy_base<DerivedPolicy
     return thrust::make_pair(seg, *(val_first + i));
   });
   size_t segments_len = key_segments_last - key_segments_first;
-  thrust::fill(thrust::device, key_segments_out, key_segments_out + segments_len, 0);
+  thrust::fill(exec, key_segments_out, key_segments_out + segments_len, 0);
   size_t n_inputs = std::distance(val_first, val_last);
   // Reduce the number of unique elements per segment, avoid creating an intermediate
   // array for `reduce_by_key`. It's limited by the types that atomicAdd supports. For
@@ -736,22 +735,32 @@ auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce
 class CUDAStreamView;

 class CUDAEvent {
-  cudaEvent_t event_{nullptr};
+  std::unique_ptr<cudaEvent_t, void (*)(cudaEvent_t *)> event_;

  public:
-  CUDAEvent() { dh::safe_cuda(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); }
-  ~CUDAEvent() {
-    if (event_) {
-      dh::safe_cuda(cudaEventDestroy(event_));
-    }
-  }
+  CUDAEvent()
+      : event_{[] {
+                 auto e = new cudaEvent_t;
+                 dh::safe_cuda(cudaEventCreateWithFlags(e, cudaEventDisableTiming));
+                 return e;
+               }(),
+               [](cudaEvent_t *e) {
+                 if (e) {
+                   dh::safe_cuda(cudaEventDestroy(*e));
+                   delete e;
+                 }
+               }} {}
+
+  // Define swap-based ctor to make sure an event is always valid.
+  CUDAEvent(CUDAEvent &&e) : CUDAEvent() { std::swap(this->event_, e.event_); }
+  CUDAEvent &operator=(CUDAEvent &&e) {
+    std::swap(this->event_, e.event_);
+    return *this;
+  }

   CUDAEvent(CUDAEvent const &that) = delete;
   CUDAEvent &operator=(CUDAEvent const &that) = delete;

   inline void Record(CUDAStreamView stream);  // NOLINT

-  operator cudaEvent_t() const { return event_; }  // NOLINT
+  operator cudaEvent_t() const { return *event_; }  // NOLINT
+  cudaEvent_t const *data() const { return this->event_.get(); }  // NOLINT
 };
class CUDAStreamView {
@@ -785,7 +794,7 @@ class CUDAStreamView {
};
 inline void CUDAEvent::Record(CUDAStreamView stream) {  // NOLINT
-  dh::safe_cuda(cudaEventRecord(event_, cudaStream_t{stream}));
+  dh::safe_cuda(cudaEventRecord(*event_, cudaStream_t{stream}));
 }

 // Changing this has effect on prediction return, where we need to pass the pointer to
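The rework replaces the manually managed cudaEvent_t with a std::unique_ptr owning a heap-allocated handle, so a moved-from event still holds a valid (freshly created) event and the deleter runs exactly once. A minimal standalone sketch of the same pattern, with hypothetical names (MakeEvent, CHECK_CUDA are not part of the diff):

#include <cstdio>
#include <cstdlib>
#include <memory>
#include <utility>
#include <cuda_runtime.h>

#define CHECK_CUDA(call)                                      \
  do {                                                        \
    cudaError_t st = (call);                                  \
    if (st != cudaSuccess) {                                  \
      std::fprintf(stderr, "%s\n", cudaGetErrorString(st));   \
      std::exit(1);                                           \
    }                                                         \
  } while (0)

using EventPtr = std::unique_ptr<cudaEvent_t, void (*)(cudaEvent_t *)>;

EventPtr MakeEvent() {
  auto *e = new cudaEvent_t;
  CHECK_CUDA(cudaEventCreateWithFlags(e, cudaEventDisableTiming));
  return EventPtr{e, [](cudaEvent_t *p) {
                    if (p) {
                      cudaEventDestroy(*p);  // release the CUDA handle first,
                      delete p;              // then the heap cell that held it
                    }
                  }};
}

int main() {
  EventPtr a = MakeEvent();
  EventPtr b = std::move(a);                 // ownership transfers; no double destroy
  CHECK_CUDA(cudaEventRecord(*b, nullptr));  // record on the default stream
  CHECK_CUDA(cudaEventSynchronize(*b));
  return 0;
}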

src/common/device_vector.cu

@@ -2,18 +2,20 @@
  * Copyright 2017-2024, XGBoost contributors
  */
 #include "../collective/communicator-inl.h"  // for GetRank
+#include "common.h"                          // for HumanMemUnit
 #include "device_helpers.cuh"                // for CurrentDevice
 #include "device_vector.cuh"

 namespace dh {
 namespace detail {
-void ThrowOOMError(std::string const &err, size_t bytes) {
+void ThrowOOMError(std::string const &err, std::size_t bytes) {
   auto device = CurrentDevice();
   auto rank = xgboost::collective::GetRank();
+  using xgboost::common::HumanMemUnit;
   std::stringstream ss;
   ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
-     << "- Free memory: " << dh::AvailableMemory(device) << "\n"
-     << "- Requested memory: " << bytes << std::endl;
+     << "- Free memory: " << HumanMemUnit(dh::AvailableMemory(device)) << "\n"
+     << "- Requested memory: " << HumanMemUnit(bytes) << std::endl;
   LOG(FATAL) << ss.str();
 }
 }  // namespace detail
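With the helper applied, an out-of-memory failure now reports sizes in readable units rather than raw byte counts. An illustrative rendering (hypothetical numbers, following the format string above):

Memory allocation error on worker 0: std::bad_alloc
- Free memory: 1.5GB
- Requested memory: 2GB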

src/common/device_vector.cuh

@@ -31,7 +31,7 @@
 #include <map>     // for map
 #include <memory>  // for unique_ptr

-#include "common.h"  // for safe_cuda
+#include "common.h"  // for safe_cuda, HumanMemUnit
 #include "xgboost/logging.h"
namespace dh {
@@ -97,12 +97,13 @@ class MemoryLogger {
     dh::safe_cuda(cudaGetDevice(&current_device));
     LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
                  << " ========";
-    LOG(CONSOLE) << "Peak memory usage: " << stats_.peak_allocated_bytes / 1048576 << "MiB";
+    LOG(CONSOLE) << "Peak memory usage: "
+                 << xgboost::common::HumanMemUnit(stats_.peak_allocated_bytes);
     LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
   }
 };

-void ThrowOOMError(std::string const &err, size_t bytes);
+void ThrowOOMError(std::string const &err, std::size_t bytes);
 }  // namespace detail

 inline detail::MemoryLogger &GlobalMemoryLogger() {
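For illustration (hypothetical numbers, following the LOG(CONSOLE) statements above), the per-device memory summary now reads:

======== Device 0 Memory Allocations:  ========
Peak memory usage: 1.5GB
Number of allocations: 42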

src/gbm/gbtree.cc

@@ -218,10 +218,6 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
                             model_.learner_model_param->OutputLength());
   CHECK_NE(n_groups, 0);

-  if (!p_fmat->SingleColBlock() && obj->Task().UpdateTreeLeaf() && this->ctx_->IsCUDA()) {
-    LOG(FATAL) << "Current objective doesn't support external memory.";
-  }
-
   // The node position for each row, 1 HDV for each tree in the forest. Note that the
   // position is negated if the row is sampled out.
   std::vector<HostDeviceVector<bst_node_t>> node_position;

src/tree/common_row_partitioner.h

@@ -148,9 +148,10 @@ class CommonRowPartitioner {
   template <typename ExpandEntry, typename GHistIndexMatrixT>
   static void FindSplitConditions(const std::vector<ExpandEntry>& nodes, const RegTree& tree,
                                   GHistIndexMatrixT const& gmat,
-                                  std::vector<int32_t>* split_conditions) {
+                                  std::vector<int32_t>* p_split_conditions) {
     auto const& ptrs = gmat.cut.Ptrs();
     auto const& vals = gmat.cut.Values();
+    auto& split_conditions = *p_split_conditions;
     for (std::size_t i = 0; i < nodes.size(); ++i) {
       bst_node_t const nidx = nodes[i].nid;
@@ -167,7 +168,7 @@ class CommonRowPartitioner {
         split_cond = static_cast<bst_bin_t>(bound);
       }
     }
-    (*split_conditions)[i] = split_cond;
+    split_conditions[i] = split_cond;
   }
 }
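This change is cosmetic: the output pointer gets a p_ prefix and is dereferenced once into a local reference instead of at every assignment. A minimal sketch of the same out-parameter pattern (illustrative names, not from the diff):

#include <vector>

void FillSquares(int n, std::vector<int>* p_out) {
  auto& out = *p_out;  // bind once; avoids writing (*p_out)[i] in the loop
  out.resize(n);
  for (int i = 0; i < n; ++i) {
    out[i] = i * i;
  }
}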

src/tree/updater_gpu_hist.cu

@@ -520,12 +520,11 @@ struct GPUHistMakerDevice {
   // prediction cache
   void FinalisePosition(DMatrix* p_fmat, RegTree const* p_tree, ObjInfo task,
                         HostDeviceVector<bst_node_t>* p_out_position) {
-    if (!p_fmat->SingleColBlock() && task.UpdateTreeLeaf()) {
-      LOG(FATAL) << "Current objective function can not be used with external memory.";
-    }
-
     monitor.Start(__func__);
     if (static_cast<std::size_t>(p_fmat->NumBatches() + 1) != this->batch_ptr_.size()) {
+      if (task.UpdateTreeLeaf()) {
+        LOG(FATAL) << "Current objective function can not be used with concatenated pages.";
+      }
       // External memory with concatenation. Not supported.
       p_out_position->Resize(0);
       positions_.clear();
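The narrowed check relies on a CSR-style invariant: for n pages streamed without concatenation, batch_ptr_ holds n + 1 row offsets, so a size mismatch signals that pages were concatenated on device, which is the only case the objective restriction still applies to. A minimal sketch of that invariant with hypothetical standalone names:

#include <cstddef>
#include <vector>

// True when the number of recorded page offsets no longer matches the
// number of batches, i.e. pages were concatenated into fewer blocks.
bool IsConcatenated(std::size_t n_batches, std::vector<std::size_t> const& batch_ptr) {
  return n_batches + 1 != batch_ptr.size();
}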