[EM] Prevent init with CUDA malloc resource. (#10606)

This commit is contained in:
Jiaming Yuan 2024-07-21 05:08:29 +08:00 committed by GitHub
parent 0846ad860c
commit cb62f9e73b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 105 additions and 41 deletions

View File

@ -3,6 +3,11 @@
*/ */
#pragma once #pragma once
#include <cstdint> // for int32_t #include <cstdint> // for int32_t
#if defined(XGBOOST_USE_NVTX)
#include <nvtx3/nvtx3.hpp>
#endif // defined(XGBOOST_USE_NVTX)
namespace xgboost::common { namespace xgboost::common {
std::int32_t AllVisibleGPUs(); std::int32_t AllVisibleGPUs();
@ -18,4 +23,36 @@ bool SupportsAts();
void CheckComputeCapability(); void CheckComputeCapability();
void SetDevice(std::int32_t device); void SetDevice(std::int32_t device);
struct NvtxDomain {
static constexpr char const *name{"libxgboost"}; // NOLINT
};
#if defined(XGBOOST_USE_NVTX)
using NvtxScopedRange = ::nvtx3::scoped_range_in<NvtxDomain>;
using NvtxEventAttr = ::nvtx3::event_attributes;
using NvtxRgb = ::nvtx3::rgb;
#else
class NvtxScopedRange {
public:
template <typename... Args>
explicit NvtxScopedRange(Args &&...) {}
};
class NvtxEventAttr {
public:
template <typename... Args>
explicit NvtxEventAttr(Args &&...) {}
};
class NvtxRgb {
public:
template <typename... Args>
explicit NvtxRgb(Args &&...) {}
};
#endif // defined(XGBOOST_USE_NVTX)
} // namespace xgboost::common } // namespace xgboost::common
#if defined(XGBOOST_USE_NVTX)
#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::common::NvtxDomain)
#else
#define xgboost_NVTX_FN_RANGE()
#endif // defined(XGBOOST_USE_NVTX)

View File

@ -16,10 +16,17 @@ namespace xgboost::common {
* @brief Make a fixed size `RefResourceView` with cudaMalloc resource. * @brief Make a fixed size `RefResourceView` with cudaMalloc resource.
*/ */
template <typename T> template <typename T>
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx, [[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const*,
std::size_t n_elements, T const& init) { std::size_t n_elements) {
auto resource = std::make_shared<common::CudaMallocResource>(n_elements * sizeof(T)); auto resource = std::make_shared<common::CudaMallocResource>(n_elements * sizeof(T));
auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource}; auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
return ref;
}
template <typename T>
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
std::size_t n_elements, T const& init) {
auto ref = MakeFixedVecWithCudaMalloc<T>(ctx, n_elements);
thrust::fill_n(ctx->CUDACtx()->CTP(), ref.data(), ref.size(), init); thrust::fill_n(ctx->CUDACtx()->CTP(), ref.data(), ref.size(), init);
return ref; return ref;
} }

View File

@ -24,11 +24,9 @@ class CudaMallocResource : public ResourceHandler {
} }
~CudaMallocResource() noexcept(true) override { this->Clear(); } ~CudaMallocResource() noexcept(true) override { this->Clear(); }
void* Data() override { return storage_.data(); } [[nodiscard]] void* Data() override { return storage_.data(); }
[[nodiscard]] std::size_t Size() const override { return storage_.size(); } [[nodiscard]] std::size_t Size() const override { return storage_.size(); }
void Resize(std::size_t n_bytes, std::byte init = std::byte{0}) { void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); }
this->storage_.resize(n_bytes, init);
}
}; };
class CudaMmapResource : public ResourceHandler { class CudaMmapResource : public ResourceHandler {

View File

@ -6,9 +6,10 @@
#include <utility> #include <utility>
#include "../collective/communicator-inl.h" #include "../collective/communicator-inl.h"
#include "cuda_rt_utils.h"
#if defined(XGBOOST_USE_NVTX) #if defined(XGBOOST_USE_NVTX)
#include <nvtx3/nvToolsExt.h> #include <nvtx3/nvtx3.hpp>
#endif // defined(XGBOOST_USE_NVTX) #endif // defined(XGBOOST_USE_NVTX)
namespace xgboost::common { namespace xgboost::common {
@ -17,8 +18,8 @@ void Monitor::Start(std::string const &name) {
auto &stats = statistics_map_[name]; auto &stats = statistics_map_[name];
stats.timer.Start(); stats.timer.Start();
#if defined(XGBOOST_USE_NVTX) #if defined(XGBOOST_USE_NVTX)
std::string nvtx_name = "xgboost::" + label_ + "::" + name; auto range_handle = nvtx3::start_range_in<common::NvtxDomain>(label_ + "::" + name);
stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str()); stats.nvtx_id = range_handle.get_value();
#endif // defined(XGBOOST_USE_NVTX) #endif // defined(XGBOOST_USE_NVTX)
} }
} }
@ -29,34 +30,32 @@ void Monitor::Stop(const std::string &name) {
stats.timer.Stop(); stats.timer.Stop();
stats.count++; stats.count++;
#if defined(XGBOOST_USE_NVTX) #if defined(XGBOOST_USE_NVTX)
nvtxRangeEnd(stats.nvtx_id); nvtx3::end_range_in<common::NvtxDomain>(nvtx3::range_handle{stats.nvtx_id});
#endif // defined(XGBOOST_USE_NVTX) #endif // defined(XGBOOST_USE_NVTX)
} }
} }
void Monitor::PrintStatistics(StatMap const& statistics) const { void Monitor::PrintStatistics(StatMap const &statistics) const {
for (auto &kv : statistics) { for (auto &kv : statistics) {
if (kv.second.first == 0) { if (kv.second.first == 0) {
LOG(WARNING) << LOG(WARNING) << "Timer for " << kv.first << " did not get stopped properly.";
"Timer for " << kv.first << " did not get stopped properly.";
continue; continue;
} }
LOG(CONSOLE) << kv.first << ": " << static_cast<double>(kv.second.second) / 1e+6 LOG(CONSOLE) << kv.first << ": " << static_cast<double>(kv.second.second) / 1e+6 << "s, "
<< "s, " << kv.second.first << " calls @ " << kv.second.first << " calls @ " << kv.second.second << "us" << std::endl;
<< kv.second.second
<< "us" << std::endl;
} }
} }
void Monitor::Print() const { void Monitor::Print() const {
if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { return; } if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
return;
}
auto rank = collective::GetRank(); auto rank = collective::GetRank();
StatMap stat_map; StatMap stat_map;
for (auto const &kv : statistics_map_) { for (auto const &kv : statistics_map_) {
stat_map[kv.first] = std::make_pair( stat_map[kv.first] = std::make_pair(
kv.second.count, std::chrono::duration_cast<std::chrono::microseconds>( kv.second.count,
kv.second.timer.elapsed) std::chrono::duration_cast<std::chrono::microseconds>(kv.second.timer.elapsed).count());
.count());
} }
if (stat_map.empty()) { if (stat_map.empty()) {
return; return;

View File

@ -404,7 +404,7 @@ size_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bs
LOG(FATAL) << "Concatenating the same Ellpack."; LOG(FATAL) << "Concatenating the same Ellpack.";
return this->n_rows * this->row_stride; return this->n_rows * this->row_stride;
} }
dh::LaunchN(num_elements, CopyPage{this, page, offset}); dh::LaunchN(num_elements, ctx->CUDACtx()->Stream(), CopyPage{this, page, offset});
monitor_.Stop(__func__); monitor_.Stop(__func__);
return num_elements; return num_elements;
} }

View File

@ -6,6 +6,7 @@
#include <cstddef> // for size_t #include <cstddef> // for size_t
#include <vector> // for vector #include <vector> // for vector
#include "../common/cuda_rt_utils.h"
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream #include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc #include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc
#include "../common/ref_resource_view.h" // for ReadVec, WriteVec #include "../common/ref_resource_view.h" // for ReadVec, WriteVec
@ -21,6 +22,8 @@ namespace {
template <typename T> template <typename T>
[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi, [[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi,
common::RefResourceView<T>* vec) { common::RefResourceView<T>* vec) {
xgboost_NVTX_FN_RANGE();
std::uint64_t n{0}; std::uint64_t n{0};
if (!fi->Read(&n)) { if (!fi->Read(&n)) {
return false; return false;
@ -37,7 +40,7 @@ template <typename T>
} }
auto ctx = Context{}.MakeCUDA(common::CurrentDevice()); auto ctx = Context{}.MakeCUDA(common::CurrentDevice());
*vec = common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast<T>(0)); *vec = common::MakeFixedVecWithCudaMalloc<T>(&ctx, n);
dh::safe_cuda(cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream())); dh::safe_cuda(cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
return true; return true;
} }
@ -50,6 +53,7 @@ template <typename T>
[[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page, [[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page,
common::AlignedResourceReadStream* fi) { common::AlignedResourceReadStream* fi) {
xgboost_NVTX_FN_RANGE();
auto* impl = page->Impl(); auto* impl = page->Impl();
impl->SetCuts(this->cuts_); impl->SetCuts(this->cuts_);
@ -69,6 +73,8 @@ template <typename T>
[[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page, [[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page,
common::AlignedFileWriteStream* fo) { common::AlignedFileWriteStream* fo) {
xgboost_NVTX_FN_RANGE();
std::size_t bytes{0}; std::size_t bytes{0};
auto* impl = page.Impl(); auto* impl = page.Impl();
bytes += fo->Write(impl->n_rows); bytes += fo->Write(impl->n_rows);
@ -84,22 +90,30 @@ template <typename T>
} }
[[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page, EllpackHostCacheStream* fi) const { [[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page, EllpackHostCacheStream* fi) const {
xgboost_NVTX_FN_RANGE();
auto* impl = page->Impl(); auto* impl = page->Impl();
CHECK(this->cuts_->cut_values_.DeviceCanRead()); CHECK(this->cuts_->cut_values_.DeviceCanRead());
impl->SetCuts(this->cuts_); impl->SetCuts(this->cuts_);
// Read vector
Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
auto read_vec = [&] {
common::NvtxScopedRange range{common::NvtxEventAttr{"read-vec", common::NvtxRgb{127, 255, 0}}};
bst_idx_t n{0};
RET_IF_NOT(fi->Read(&n));
if (n == 0) {
return true;
}
impl->gidx_buffer = common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(&ctx, n);
RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()));
return true;
};
RET_IF_NOT(read_vec());
RET_IF_NOT(fi->Read(&impl->n_rows)); RET_IF_NOT(fi->Read(&impl->n_rows));
RET_IF_NOT(fi->Read(&impl->is_dense)); RET_IF_NOT(fi->Read(&impl->is_dense));
RET_IF_NOT(fi->Read(&impl->row_stride)); RET_IF_NOT(fi->Read(&impl->row_stride));
// Read vec
Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
bst_idx_t n{0};
RET_IF_NOT(fi->Read(&n));
if (n != 0) {
impl->gidx_buffer =
common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast<common::CompressedByteT>(0));
RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()));
}
RET_IF_NOT(fi->Read(&impl->base_rowid)); RET_IF_NOT(fi->Read(&impl->base_rowid));
dh::DefaultStream().Sync(); dh::DefaultStream().Sync();
@ -108,19 +122,27 @@ template <typename T>
[[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page, [[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page,
EllpackHostCacheStream* fo) const { EllpackHostCacheStream* fo) const {
xgboost_NVTX_FN_RANGE();
bst_idx_t bytes{0}; bst_idx_t bytes{0};
auto* impl = page.Impl(); auto* impl = page.Impl();
// Write vector
auto write_vec = [&] {
common::NvtxScopedRange range{common::NvtxEventAttr{"write-vec", common::NvtxRgb{127, 255, 0}}};
bst_idx_t n = impl->gidx_buffer.size();
bytes += fo->Write(n);
if (!impl->gidx_buffer.empty()) {
bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes());
}
};
write_vec();
bytes += fo->Write(impl->n_rows); bytes += fo->Write(impl->n_rows);
bytes += fo->Write(impl->is_dense); bytes += fo->Write(impl->is_dense);
bytes += fo->Write(impl->row_stride); bytes += fo->Write(impl->row_stride);
// Write vector
bst_idx_t n = impl->gidx_buffer.size();
bytes += fo->Write(n);
if (!impl->gidx_buffer.empty()) {
bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes());
}
bytes += fo->Write(impl->base_rowid); bytes += fo->Write(impl->base_rowid);
dh::DefaultStream().Sync(); dh::DefaultStream().Sync();

View File

@ -37,4 +37,5 @@ dependencies:
- pyspark>=3.4.0 - pyspark>=3.4.0
- cloudpickle - cloudpickle
- pip: - pip:
- setuptools
- sphinx_rtd_theme - sphinx_rtd_theme