[EM] Prevent init with CUDA malloc resource. (#10606)
This commit is contained in:
parent
0846ad860c
commit
cb62f9e73b
@ -3,6 +3,11 @@
|
||||
*/
|
||||
#pragma once
|
||||
#include <cstdint> // for int32_t
|
||||
|
||||
#if defined(XGBOOST_USE_NVTX)
|
||||
#include <nvtx3/nvtx3.hpp>
|
||||
#endif // defined(XGBOOST_USE_NVTX)
|
||||
|
||||
namespace xgboost::common {
|
||||
std::int32_t AllVisibleGPUs();
|
||||
|
||||
@ -18,4 +23,36 @@ bool SupportsAts();
|
||||
void CheckComputeCapability();
|
||||
|
||||
void SetDevice(std::int32_t device);
|
||||
|
||||
struct NvtxDomain {
|
||||
static constexpr char const *name{"libxgboost"}; // NOLINT
|
||||
};
|
||||
|
||||
#if defined(XGBOOST_USE_NVTX)
|
||||
using NvtxScopedRange = ::nvtx3::scoped_range_in<NvtxDomain>;
|
||||
using NvtxEventAttr = ::nvtx3::event_attributes;
|
||||
using NvtxRgb = ::nvtx3::rgb;
|
||||
#else
|
||||
class NvtxScopedRange {
|
||||
public:
|
||||
template <typename... Args>
|
||||
explicit NvtxScopedRange(Args &&...) {}
|
||||
};
|
||||
class NvtxEventAttr {
|
||||
public:
|
||||
template <typename... Args>
|
||||
explicit NvtxEventAttr(Args &&...) {}
|
||||
};
|
||||
class NvtxRgb {
|
||||
public:
|
||||
template <typename... Args>
|
||||
explicit NvtxRgb(Args &&...) {}
|
||||
};
|
||||
#endif // defined(XGBOOST_USE_NVTX)
|
||||
} // namespace xgboost::common
|
||||
|
||||
#if defined(XGBOOST_USE_NVTX)
|
||||
#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::common::NvtxDomain)
|
||||
#else
|
||||
#define xgboost_NVTX_FN_RANGE()
|
||||
#endif // defined(XGBOOST_USE_NVTX)
|
||||
|
||||
@ -16,10 +16,17 @@ namespace xgboost::common {
|
||||
* @brief Make a fixed size `RefResourceView` with cudaMalloc resource.
|
||||
*/
|
||||
template <typename T>
|
||||
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
|
||||
std::size_t n_elements, T const& init) {
|
||||
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const*,
|
||||
std::size_t n_elements) {
|
||||
auto resource = std::make_shared<common::CudaMallocResource>(n_elements * sizeof(T));
|
||||
auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
|
||||
return ref;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
|
||||
std::size_t n_elements, T const& init) {
|
||||
auto ref = MakeFixedVecWithCudaMalloc<T>(ctx, n_elements);
|
||||
thrust::fill_n(ctx->CUDACtx()->CTP(), ref.data(), ref.size(), init);
|
||||
return ref;
|
||||
}
|
||||
|
||||
@ -24,11 +24,9 @@ class CudaMallocResource : public ResourceHandler {
|
||||
}
|
||||
~CudaMallocResource() noexcept(true) override { this->Clear(); }
|
||||
|
||||
void* Data() override { return storage_.data(); }
|
||||
[[nodiscard]] void* Data() override { return storage_.data(); }
|
||||
[[nodiscard]] std::size_t Size() const override { return storage_.size(); }
|
||||
void Resize(std::size_t n_bytes, std::byte init = std::byte{0}) {
|
||||
this->storage_.resize(n_bytes, init);
|
||||
}
|
||||
void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); }
|
||||
};
|
||||
|
||||
class CudaMmapResource : public ResourceHandler {
|
||||
|
||||
@ -6,9 +6,10 @@
|
||||
#include <utility>
|
||||
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "cuda_rt_utils.h"
|
||||
|
||||
#if defined(XGBOOST_USE_NVTX)
|
||||
#include <nvtx3/nvToolsExt.h>
|
||||
#include <nvtx3/nvtx3.hpp>
|
||||
#endif // defined(XGBOOST_USE_NVTX)
|
||||
|
||||
namespace xgboost::common {
|
||||
@ -17,8 +18,8 @@ void Monitor::Start(std::string const &name) {
|
||||
auto &stats = statistics_map_[name];
|
||||
stats.timer.Start();
|
||||
#if defined(XGBOOST_USE_NVTX)
|
||||
std::string nvtx_name = "xgboost::" + label_ + "::" + name;
|
||||
stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str());
|
||||
auto range_handle = nvtx3::start_range_in<common::NvtxDomain>(label_ + "::" + name);
|
||||
stats.nvtx_id = range_handle.get_value();
|
||||
#endif // defined(XGBOOST_USE_NVTX)
|
||||
}
|
||||
}
|
||||
@ -29,34 +30,32 @@ void Monitor::Stop(const std::string &name) {
|
||||
stats.timer.Stop();
|
||||
stats.count++;
|
||||
#if defined(XGBOOST_USE_NVTX)
|
||||
nvtxRangeEnd(stats.nvtx_id);
|
||||
nvtx3::end_range_in<common::NvtxDomain>(nvtx3::range_handle{stats.nvtx_id});
|
||||
#endif // defined(XGBOOST_USE_NVTX)
|
||||
}
|
||||
}
|
||||
|
||||
void Monitor::PrintStatistics(StatMap const& statistics) const {
|
||||
void Monitor::PrintStatistics(StatMap const &statistics) const {
|
||||
for (auto &kv : statistics) {
|
||||
if (kv.second.first == 0) {
|
||||
LOG(WARNING) <<
|
||||
"Timer for " << kv.first << " did not get stopped properly.";
|
||||
LOG(WARNING) << "Timer for " << kv.first << " did not get stopped properly.";
|
||||
continue;
|
||||
}
|
||||
LOG(CONSOLE) << kv.first << ": " << static_cast<double>(kv.second.second) / 1e+6
|
||||
<< "s, " << kv.second.first << " calls @ "
|
||||
<< kv.second.second
|
||||
<< "us" << std::endl;
|
||||
LOG(CONSOLE) << kv.first << ": " << static_cast<double>(kv.second.second) / 1e+6 << "s, "
|
||||
<< kv.second.first << " calls @ " << kv.second.second << "us" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void Monitor::Print() const {
|
||||
if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { return; }
|
||||
if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
|
||||
return;
|
||||
}
|
||||
auto rank = collective::GetRank();
|
||||
StatMap stat_map;
|
||||
for (auto const &kv : statistics_map_) {
|
||||
stat_map[kv.first] = std::make_pair(
|
||||
kv.second.count, std::chrono::duration_cast<std::chrono::microseconds>(
|
||||
kv.second.timer.elapsed)
|
||||
.count());
|
||||
kv.second.count,
|
||||
std::chrono::duration_cast<std::chrono::microseconds>(kv.second.timer.elapsed).count());
|
||||
}
|
||||
if (stat_map.empty()) {
|
||||
return;
|
||||
|
||||
@ -404,7 +404,7 @@ size_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bs
|
||||
LOG(FATAL) << "Concatenating the same Ellpack.";
|
||||
return this->n_rows * this->row_stride;
|
||||
}
|
||||
dh::LaunchN(num_elements, CopyPage{this, page, offset});
|
||||
dh::LaunchN(num_elements, ctx->CUDACtx()->Stream(), CopyPage{this, page, offset});
|
||||
monitor_.Stop(__func__);
|
||||
return num_elements;
|
||||
}
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
#include <cstddef> // for size_t
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../common/cuda_rt_utils.h"
|
||||
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc
|
||||
#include "../common/ref_resource_view.h" // for ReadVec, WriteVec
|
||||
@ -21,6 +22,8 @@ namespace {
|
||||
template <typename T>
|
||||
[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi,
|
||||
common::RefResourceView<T>* vec) {
|
||||
xgboost_NVTX_FN_RANGE();
|
||||
|
||||
std::uint64_t n{0};
|
||||
if (!fi->Read(&n)) {
|
||||
return false;
|
||||
@ -37,7 +40,7 @@ template <typename T>
|
||||
}
|
||||
|
||||
auto ctx = Context{}.MakeCUDA(common::CurrentDevice());
|
||||
*vec = common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast<T>(0));
|
||||
*vec = common::MakeFixedVecWithCudaMalloc<T>(&ctx, n);
|
||||
dh::safe_cuda(cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
|
||||
return true;
|
||||
}
|
||||
@ -50,6 +53,7 @@ template <typename T>
|
||||
|
||||
[[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page,
|
||||
common::AlignedResourceReadStream* fi) {
|
||||
xgboost_NVTX_FN_RANGE();
|
||||
auto* impl = page->Impl();
|
||||
|
||||
impl->SetCuts(this->cuts_);
|
||||
@ -69,6 +73,8 @@ template <typename T>
|
||||
|
||||
[[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page,
|
||||
common::AlignedFileWriteStream* fo) {
|
||||
xgboost_NVTX_FN_RANGE();
|
||||
|
||||
std::size_t bytes{0};
|
||||
auto* impl = page.Impl();
|
||||
bytes += fo->Write(impl->n_rows);
|
||||
@ -84,22 +90,30 @@ template <typename T>
|
||||
}
|
||||
|
||||
[[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page, EllpackHostCacheStream* fi) const {
|
||||
xgboost_NVTX_FN_RANGE();
|
||||
|
||||
auto* impl = page->Impl();
|
||||
CHECK(this->cuts_->cut_values_.DeviceCanRead());
|
||||
impl->SetCuts(this->cuts_);
|
||||
|
||||
// Read vector
|
||||
Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
|
||||
auto read_vec = [&] {
|
||||
common::NvtxScopedRange range{common::NvtxEventAttr{"read-vec", common::NvtxRgb{127, 255, 0}}};
|
||||
bst_idx_t n{0};
|
||||
RET_IF_NOT(fi->Read(&n));
|
||||
if (n == 0) {
|
||||
return true;
|
||||
}
|
||||
impl->gidx_buffer = common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(&ctx, n);
|
||||
RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()));
|
||||
return true;
|
||||
};
|
||||
RET_IF_NOT(read_vec());
|
||||
|
||||
RET_IF_NOT(fi->Read(&impl->n_rows));
|
||||
RET_IF_NOT(fi->Read(&impl->is_dense));
|
||||
RET_IF_NOT(fi->Read(&impl->row_stride));
|
||||
|
||||
// Read vec
|
||||
Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
|
||||
bst_idx_t n{0};
|
||||
RET_IF_NOT(fi->Read(&n));
|
||||
if (n != 0) {
|
||||
impl->gidx_buffer =
|
||||
common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast<common::CompressedByteT>(0));
|
||||
RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()));
|
||||
}
|
||||
RET_IF_NOT(fi->Read(&impl->base_rowid));
|
||||
|
||||
dh::DefaultStream().Sync();
|
||||
@ -108,19 +122,27 @@ template <typename T>
|
||||
|
||||
[[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page,
|
||||
EllpackHostCacheStream* fo) const {
|
||||
xgboost_NVTX_FN_RANGE();
|
||||
|
||||
bst_idx_t bytes{0};
|
||||
auto* impl = page.Impl();
|
||||
bytes += fo->Write(impl->n_rows);
|
||||
bytes += fo->Write(impl->is_dense);
|
||||
bytes += fo->Write(impl->row_stride);
|
||||
|
||||
// Write vector
|
||||
auto write_vec = [&] {
|
||||
common::NvtxScopedRange range{common::NvtxEventAttr{"write-vec", common::NvtxRgb{127, 255, 0}}};
|
||||
bst_idx_t n = impl->gidx_buffer.size();
|
||||
bytes += fo->Write(n);
|
||||
|
||||
if (!impl->gidx_buffer.empty()) {
|
||||
bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes());
|
||||
}
|
||||
};
|
||||
|
||||
write_vec();
|
||||
|
||||
bytes += fo->Write(impl->n_rows);
|
||||
bytes += fo->Write(impl->is_dense);
|
||||
bytes += fo->Write(impl->row_stride);
|
||||
bytes += fo->Write(impl->base_rowid);
|
||||
|
||||
dh::DefaultStream().Sync();
|
||||
|
||||
@ -37,4 +37,5 @@ dependencies:
|
||||
- pyspark>=3.4.0
|
||||
- cloudpickle
|
||||
- pip:
|
||||
- setuptools
|
||||
- sphinx_rtd_theme
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user