diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index 5fbf479c5..ed4b38f99 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -113,6 +113,7 @@ OBJECTS= \ $(PKGROOT)/src/common/charconv.o \ $(PKGROOT)/src/common/column_matrix.o \ $(PKGROOT)/src/common/common.o \ + $(PKGROOT)/src/common/cuda_rt_utils.o \ $(PKGROOT)/src/common/error_msg.o \ $(PKGROOT)/src/common/hist_util.o \ $(PKGROOT)/src/common/host_device_vector.o \ diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win index a5a5c131e..d4330120f 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -113,6 +113,7 @@ OBJECTS= \ $(PKGROOT)/src/common/charconv.o \ $(PKGROOT)/src/common/column_matrix.o \ $(PKGROOT)/src/common/common.o \ + $(PKGROOT)/src/common/cuda_rt_utils.o \ $(PKGROOT)/src/common/error_msg.o \ $(PKGROOT)/src/common/hist_util.o \ $(PKGROOT)/src/common/host_device_vector.o \ diff --git a/include/xgboost/base.h b/include/xgboost/base.h index 9abe72b87..64aab5c41 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -7,6 +7,8 @@ #define XGBOOST_BASE_H_ #include // for omp_uint, omp_ulong +// Put the windefs here to guard as many files as possible. +#include #include // for int32_t, uint64_t, int16_t #include // for ostream diff --git a/include/xgboost/collective/poll_utils.h b/include/xgboost/collective/poll_utils.h index a4d2fbacd..41b674964 100644 --- a/include/xgboost/collective/poll_utils.h +++ b/include/xgboost/collective/poll_utils.h @@ -4,13 +4,14 @@ * \author Tianqi Chen */ #pragma once -#include "xgboost/collective/result.h" -#include "xgboost/collective/socket.h" +#include +#include #if defined(_WIN32) +#include +// Socket API #include #include - #else #include diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h index c5dd977f6..bf5fffdaf 100644 --- a/include/xgboost/collective/socket.h +++ b/include/xgboost/collective/socket.h @@ -1,12 +1,8 @@ /** - * Copyright (c) 2022-2024, XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ #pragma once -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) - #include // errno, EINTR, EBADF #include // HOST_NAME_MAX #include // std::size_t @@ -18,18 +14,12 @@ #if defined(__linux__) #include // for TIOCOUTQ, FIONREAD -#endif // defined(__linux__) - -#if !defined(xgboost_IS_MINGW) - -#if defined(__MINGW32__) -#define xgboost_IS_MINGW 1 -#endif // defined(__MINGW32__) - -#endif // xgboost_IS_MINGW +#endif // defined(__linux__) #if defined(_WIN32) - +// Guard the include. +#include +// Socket API #include #include @@ -41,9 +31,9 @@ using in_port_t = std::uint16_t; #if !defined(xgboost_IS_MINGW) using ssize_t = int; -#endif // !xgboost_IS_MINGW() +#endif // !xgboost_IS_MINGW() -#else // UNIX +#else // UNIX #include // inet_ntop #include // fcntl, F_GETFL, O_NONBLOCK @@ -839,7 +829,3 @@ Result INetNToP(H const &host, std::string *p_out) { } // namespace xgboost #undef xgboost_CHECK_SYS_CALL - -#if defined(xgboost_IS_MINGW) -#undef xgboost_IS_MINGW -#endif diff --git a/include/xgboost/windefs.h b/include/xgboost/windefs.h new file mode 100644 index 000000000..e7e743184 --- /dev/null +++ b/include/xgboost/windefs.h @@ -0,0 +1,33 @@ +/** + * Copyright 2024, XGBoost Contributors + * + * @brief Macro for Windows. + */ +#pragma once + +#if !defined(xgboost_IS_WIN) + +#if defined(_MSC_VER) || defined(__MINGW32__) +#define xgboost_IS_WIN 1 +#endif // defined(_MSC_VER) || defined(__MINGW32__) + +#endif // !defined(xgboost_IS_WIN) + +#if defined(xgboost_IS_WIN) + +#if !defined(NOMINMAX) +#define NOMINMAX +#endif // !defined(NOMINMAX) + +// A macro used inside `windows.h` to avoid conflicts with `winsock2.h` +#define WIN32_LEAN_AND_MEAN + +#if !defined(xgboost_IS_MINGW) + +#if defined(__MINGW32__) +#define xgboost_IS_MINGW 1 +#endif // defined(__MINGW32__) + +#endif // xgboost_IS_MINGW + +#endif // defined(xgboost_IS_WIN) diff --git a/src/c_api/c_api_error.h b/src/c_api/c_api_error.h index 0ad4ac073..a1928e6b1 100644 --- a/src/c_api/c_api_error.h +++ b/src/c_api/c_api_error.h @@ -7,10 +7,9 @@ #define XGBOOST_C_API_C_API_ERROR_H_ #include -#include -#include "c_api_utils.h" -#include "xgboost/collective/result.h" +#include "c_api_utils.h" // for XGBoostAPIGuard +#include "xgboost/logging.h" /*! \brief macro to guard beginning and end section of all functions */ #ifdef LOG_CAPI_INVOCATION diff --git a/src/cli_main.cc b/src/cli_main.cc index 54a345027..1c388cf84 100644 --- a/src/cli_main.cc +++ b/src/cli_main.cc @@ -4,29 +4,26 @@ * \brief The command line interface program of xgboost. * This file is not included in dynamic library. */ -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) - #include - -#include +#include #include #include +#include #include #include -#include -#include -#include #include #include +#include +#include +#include #include + +#include "c_api/c_api_utils.h" #include "common/common.h" #include "common/config.h" #include "common/io.h" #include "common/version.h" -#include "c_api/c_api_utils.h" namespace xgboost { enum CLITask { diff --git a/src/collective/tracker.cc b/src/collective/tracker.cc index 6cb3601db..bbc7a7c5a 100644 --- a/src/collective/tracker.cc +++ b/src/collective/tracker.cc @@ -7,11 +7,10 @@ #include // socket, AF_INET6, AF_INET, connect, getsockname #endif // defined(__unix__) || defined(__APPLE__) -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) - #if defined(_WIN32) +// Guard the include +#include +// Socket API #include #include #endif // defined(_WIN32) diff --git a/src/common/common.cc b/src/common/common.cc index 086f4c00d..10a667070 100644 --- a/src/common/common.cc +++ b/src/common/common.cc @@ -1,5 +1,5 @@ /** - * Copyright 2015-2023 by Contributors + * Copyright 2015-2024, XGBoost Contributors */ #include "common.h" @@ -54,9 +54,4 @@ void EscapeU8(std::string const &string, std::string *p_buffer) { } } } - -#if !defined(XGBOOST_USE_CUDA) -int AllVisibleGPUs() { return 0; } -#endif // !defined(XGBOOST_USE_CUDA) - } // namespace xgboost::common diff --git a/src/common/common.cu b/src/common/common.cu index b6965904a..958f93779 100644 --- a/src/common/common.cu +++ b/src/common/common.cu @@ -1,29 +1,21 @@ -/*! - * Copyright 2018-2022 XGBoost contributors +/** + * Copyright 2018-2024, XGBoost contributors */ +#include +#include + #include "common.h" -namespace xgboost { -namespace common { - -void SetDevice(std::int32_t device) { - if (device >= 0) { - dh::safe_cuda(cudaSetDevice(device)); +namespace dh { +void ThrowOnCudaError(cudaError_t code, const char *file, int line) { + if (code != cudaSuccess) { + std::string f; + if (file != nullptr) { + f = file; + } + LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(), + f + ": " + std::to_string(line)) + .what(); } } - -int AllVisibleGPUs() { - int n_visgpus = 0; - try { - // When compiled with CUDA but running on CPU only device, - // cudaGetDeviceCount will fail. - dh::safe_cuda(cudaGetDeviceCount(&n_visgpus)); - } catch (const dmlc::Error &) { - cudaGetLastError(); // reset error. - return 0; - } - return n_visgpus; -} - -} // namespace common -} // namespace xgboost +} // namespace dh diff --git a/src/common/common.h b/src/common/common.h index 950dee521..93151670b 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -1,5 +1,5 @@ /** - * Copyright 2015-2023 by XGBoost Contributors + * Copyright 2015-2024, XGBoost Contributors * \file common.h * \brief Common utilities */ @@ -19,9 +19,8 @@ #include "xgboost/base.h" // for XGBOOST_DEVICE #include "xgboost/logging.h" // for LOG, LOG_FATAL, LogMessageFatal +// magic to define functions based on the compiler. #if defined(__CUDACC__) -#include -#include #define WITH_CUDA() true @@ -31,23 +30,20 @@ #endif // defined(__CUDACC__) +#if defined(XGBOOST_USE_CUDA) +#include +#endif + namespace dh { -#if defined(__CUDACC__) +#if defined(XGBOOST_USE_CUDA) /* - * Error handling functions + * Error handling functions */ +void ThrowOnCudaError(cudaError_t code, const char *file, int line); + #define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__) -inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file, - int line) { - if (code != cudaSuccess) { - LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(), - std::string{file} + ": " + // NOLINT - std::to_string(line)).what(); - } - return code; -} -#endif // defined(__CUDACC__) +#endif // defined(XGBOOST_USE_CUDA) } // namespace dh namespace xgboost::common { @@ -167,8 +163,6 @@ class Range { Iterator end_; }; -int AllVisibleGPUs(); - inline void AssertGPUSupport() { #ifndef XGBOOST_USE_CUDA LOG(FATAL) << "XGBoost version not compiled with GPU support."; @@ -187,16 +181,6 @@ inline void AssertSYCLSupport() { #endif // XGBOOST_USE_SYCL } -void SetDevice(std::int32_t device); - -#if !defined(XGBOOST_USE_CUDA) -inline void SetDevice(std::int32_t device) { - if (device >= 0) { - AssertGPUSupport(); - } -} -#endif - /** * @brief Last index of a group in a CSR style of index pointer. */ diff --git a/src/common/cuda_rt_utils.cc b/src/common/cuda_rt_utils.cc new file mode 100644 index 000000000..d41981d8f --- /dev/null +++ b/src/common/cuda_rt_utils.cc @@ -0,0 +1,86 @@ +/** + * Copyright 2015-2024, XGBoost Contributors + */ +#include "cuda_rt_utils.h" + +#if defined(XGBOOST_USE_CUDA) +#include +#endif // defined(XGBOOST_USE_CUDA) + +#include // for int32_t + +#include "common.h" // for safe_cuda + +namespace xgboost::common { +#if defined(XGBOOST_USE_CUDA) +std::int32_t AllVisibleGPUs() { + int n_visgpus = 0; + try { + // When compiled with CUDA but running on CPU only device, + // cudaGetDeviceCount will fail. + dh::safe_cuda(cudaGetDeviceCount(&n_visgpus)); + } catch (const dmlc::Error &) { + cudaGetLastError(); // reset error. + return 0; + } + return n_visgpus; +} + +std::int32_t CurrentDevice() { + std::int32_t device = 0; + dh::safe_cuda(cudaGetDevice(&device)); + return device; +} + +// alternatively: `nvidia-smi -q | grep Addressing` +bool SupportsPageableMem() { + std::int32_t res{0}; + dh::safe_cuda(cudaDeviceGetAttribute(&res, cudaDevAttrPageableMemoryAccess, CurrentDevice())); + return res == 1; +} + +bool SupportsAts() { + std::int32_t res{0}; + dh::safe_cuda(cudaDeviceGetAttribute(&res, cudaDevAttrPageableMemoryAccessUsesHostPageTables, + CurrentDevice())); + return res == 1; +} + +void CheckComputeCapability() { + for (std::int32_t d_idx = 0; d_idx < AllVisibleGPUs(); ++d_idx) { + cudaDeviceProp prop; + dh::safe_cuda(cudaGetDeviceProperties(&prop, d_idx)); + std::ostringstream oss; + oss << "CUDA Capability Major/Minor version number: " << prop.major << "." << prop.minor + << " is insufficient. Need >=3.5"; + int failed = prop.major < 3 || (prop.major == 3 && prop.minor < 5); + if (failed) LOG(WARNING) << oss.str() << " for device: " << d_idx; + } +} + +void SetDevice(std::int32_t device) { + if (device >= 0) { + dh::safe_cuda(cudaSetDevice(device)); + } +} +#else +std::int32_t AllVisibleGPUs() { return 0; } + +std::int32_t CurrentDevice() { + AssertGPUSupport(); + return -1; +} + +bool SupportsPageableMem() { return false; } + +bool SupportsAts() { return false; } + +void CheckComputeCapability() {} + +void SetDevice(std::int32_t device) { + if (device >= 0) { + AssertGPUSupport(); + } +} +#endif // !defined(XGBOOST_USE_CUDA) +} // namespace xgboost::common diff --git a/src/common/cuda_rt_utils.h b/src/common/cuda_rt_utils.h new file mode 100644 index 000000000..fa14f8434 --- /dev/null +++ b/src/common/cuda_rt_utils.h @@ -0,0 +1,21 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#pragma once +#include // for int32_t +namespace xgboost::common { +std::int32_t AllVisibleGPUs(); + +std::int32_t CurrentDevice(); + +// Whether the device supports coherently accessing pageable memory without calling +// `cudaHostRegister` on it +bool SupportsPageableMem(); + +// Address Translation Service (ATS) +bool SupportsAts(); + +void CheckComputeCapability(); + +void SetDevice(std::int32_t device); +} // namespace xgboost::common diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 98a76d72a..34faa4eb0 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -157,18 +157,6 @@ inline size_t MaxSharedMemoryOptin(int device_idx) { return static_cast(max_shared_memory); } -inline void CheckComputeCapability() { - for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) { - cudaDeviceProp prop; - safe_cuda(cudaGetDeviceProperties(&prop, d_idx)); - std::ostringstream oss; - oss << "CUDA Capability Major/Minor version number: " << prop.major << "." - << prop.minor << " is insufficient. Need >=3.5"; - int failed = prop.major < 3 || (prop.major == 3 && prop.minor < 5); - if (failed) LOG(WARNING) << oss.str() << " for device: " << d_idx; - } -} - XGBOOST_DEV_INLINE void AtomicOrByte(unsigned int *__restrict__ buffer, size_t ibyte, unsigned char b) { atomicOr(&buffer[ibyte / sizeof(unsigned int)], @@ -273,13 +261,15 @@ void Iota(Container array, cudaStream_t stream) { } // dh::DebugSyncDevice(__FILE__, __LINE__); -inline void DebugSyncDevice(std::string file="", int32_t line = -1) { - if (file != "" && line != -1) { - auto rank = xgboost::collective::GetRank(); - LOG(DEBUG) << "R:" << rank << ": " << file << ":" << line; +inline void DebugSyncDevice(char const *file = __builtin_FILE(), int32_t line = __builtin_LINE()) { + { + auto err = cudaDeviceSynchronize(); + ThrowOnCudaError(err, file, line); + } + { + auto err = cudaGetLastError(); + ThrowOnCudaError(err, file, line); } - safe_cuda(cudaDeviceSynchronize()); - safe_cuda(cudaGetLastError()); } // Faster to instantiate than caching_device_vector and invokes no synchronisation diff --git a/src/common/io.cc b/src/common/io.cc index 1715669b0..4bc8d9de4 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -1,26 +1,21 @@ /** - * Copyright 2019-2023, by XGBoost Contributors + * Copyright 2019-2024, by XGBoost Contributors */ -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) - -#if !defined(xgboost_IS_WIN) - -#if defined(_MSC_VER) || defined(__MINGW32__) -#define xgboost_IS_WIN 1 -#endif // defined(_MSC_VER) || defined(__MINGW32__) - -#endif // !defined(xgboost_IS_WIN) - #if defined(__unix__) || defined(__APPLE__) + #include // for open, O_RDONLY -#include // for mmap, mmap64, munmap +#include // for mmap, mmap64, munmap, madvise #include // for close, getpagesize -#elif defined(xgboost_IS_WIN) -#define WIN32_LEAN_AND_MEAN + +#else + +#include + +#if defined(xgboost_IS_WIN) #include -#endif // defined(__unix__) +#endif // defined(xgboost_IS_WIN) + +#endif // defined(__unix__) || defined(__APPLE__) #include // for copy, transform #include // for tolower @@ -31,8 +26,7 @@ #include // for filesystem, weakly_canonical #include // for ifstream #include // for distance -#include // for numeric_limits -#include // for unique_ptr +#include // for unique_ptr, make_unique #include // for string #include // for error_code, system_category #include // for move @@ -40,7 +34,12 @@ #include "io.h" #include "xgboost/collective/socket.h" // for LastError -#include "xgboost/logging.h" +#include "xgboost/logging.h" // for CHECK_LE +#include "xgboost/string_view.h" // for StringView + +#if !defined(__linux__) && !defined(__GLIBC__) && !defined(xgboost_IS_WIN) +#include // for numeric_limits +#endif namespace xgboost::common { size_t PeekableInStream::Read(void* dptr, size_t size) { @@ -182,39 +181,9 @@ std::string FileExtension(std::string fname, bool lower) { // NVCC 11.8 doesn't allow `noexcept(false) = default` altogether. ResourceHandler::~ResourceHandler() noexcept(false) {} // NOLINT -struct MMAPFile { -#if defined(xgboost_IS_WIN) - HANDLE fd{INVALID_HANDLE_VALUE}; - HANDLE file_map{INVALID_HANDLE_VALUE}; -#else - std::int32_t fd{0}; -#endif - std::byte* base_ptr{nullptr}; - std::size_t base_size{0}; - std::size_t delta{0}; - std::string path; - - MMAPFile() = default; - -#if defined(xgboost_IS_WIN) - MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta, - std::string path) - : fd{fd}, - file_map{fm}, - base_ptr{base_ptr}, - base_size{base_size}, - delta{delta}, - path{std::move(path)} {} -#else - MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta, - std::string path) - : fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {} -#endif -}; - -std::unique_ptr Open(std::string path, std::size_t offset, std::size_t length) { +MMAPFile* detail::OpenMmap(std::string path, std::size_t offset, std::size_t length) { if (length == 0) { - return std::make_unique(); + return new MMAPFile{}; } #if defined(xgboost_IS_WIN) @@ -234,10 +203,8 @@ std::unique_ptr Open(std::string path, std::size_t offset, std::size_t #if defined(__linux__) || defined(__GLIBC__) int prot{PROT_READ}; ptr = reinterpret_cast(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start)); - madvise(ptr, view_size, MADV_WILLNEED); CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg(); - auto handle = - std::make_unique(fd, ptr, view_size, offset - view_start, std::move(path)); + auto handle = new MMAPFile{fd, ptr, view_size, offset - view_start, std::move(path)}; #elif defined(xgboost_IS_WIN) auto file_size = GetFileSize(fd, nullptr); DWORD access = PAGE_READONLY; @@ -248,55 +215,62 @@ std::unique_ptr Open(std::string path, std::size_t offset, std::size_t CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg(); ptr = reinterpret_cast(MapViewOfFile(map_file, access, hoff, loff, view_size)); CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg(); - auto handle = std::make_unique(fd, map_file, ptr, view_size, offset - view_start, - std::move(path)); + auto handle = new MMAPFile{fd, map_file, ptr, view_size, offset - view_start, std::move(path)}; #else CHECK_LE(offset, std::numeric_limits::max()) << "File size has exceeded the limit on the current system."; int prot{PROT_READ}; ptr = reinterpret_cast(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start)); CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg(); - auto handle = - std::make_unique(fd, ptr, view_size, offset - view_start, std::move(path)); -#endif // defined(__linux__) + auto handle = new MMAPFile{fd, ptr, view_size, offset - view_start, std::move(path)}; +#endif // defined(__linux__) || defined(__GLIBC__) return handle; } -MmapResource::MmapResource(std::string path, std::size_t offset, std::size_t length) - : ResourceHandler{kMmap}, handle_{Open(std::move(path), offset, length)}, n_{length} {} - -MmapResource::~MmapResource() noexcept(false) { - if (!handle_) { +void detail::CloseMmap(MMAPFile* handle) { + if (!handle) { return; } #if defined(xgboost_IS_WIN) - if (handle_->base_ptr) { - CHECK(UnmapViewOfFile(handle_->base_ptr)) "Faled to call munmap: " << SystemErrorMsg(); + if (handle->base_ptr) { + CHECK(UnmapViewOfFile(handle->base_ptr)) "Faled to call munmap: " << SystemErrorMsg(); } - if (handle_->fd != INVALID_HANDLE_VALUE) { - CHECK(CloseHandle(handle_->fd)) << "Failed to close handle: " << SystemErrorMsg(); + if (handle->fd != INVALID_HANDLE_VALUE) { + CHECK(CloseHandle(handle->fd)) << "Failed to close handle: " << SystemErrorMsg(); } - if (handle_->file_map != INVALID_HANDLE_VALUE) { - CHECK(CloseHandle(handle_->file_map)) << "Failed to close mapping object: " << SystemErrorMsg(); + if (handle->file_map != INVALID_HANDLE_VALUE) { + CHECK(CloseHandle(handle->file_map)) << "Failed to close mapping object: " << SystemErrorMsg(); } #else - if (handle_->base_ptr) { - CHECK_NE(munmap(handle_->base_ptr, handle_->base_size), -1) - << "Faled to call munmap: " << handle_->path << ". " << SystemErrorMsg(); + if (handle->base_ptr) { + CHECK_NE(munmap(handle->base_ptr, handle->base_size), -1) + << "Faled to call munmap: `" << handle->path << "`. " << SystemErrorMsg(); } - if (handle_->fd != 0) { - CHECK_NE(close(handle_->fd), -1) - << "Faled to close: " << handle_->path << ". " << SystemErrorMsg(); + if (handle->fd != 0) { + CHECK_NE(close(handle->fd), -1) + << "Faled to close: `" << handle->path << "`. " << SystemErrorMsg(); } #endif + delete handle; } +MmapResource::MmapResource(StringView path, std::size_t offset, std::size_t length) + : ResourceHandler{kMmap}, + handle_{detail::OpenMmap(std::string{path}, offset, length), detail::CloseMmap}, + n_{length} { +#if defined(__unix__) || defined(__APPLE__) + madvise(handle_->base_ptr, handle_->base_size, MADV_WILLNEED); +#endif // defined(__unix__) || defined(__APPLE__) +} + +MmapResource::~MmapResource() noexcept(false) = default; + [[nodiscard]] void* MmapResource::Data() { if (!handle_) { return nullptr; } - return handle_->base_ptr + handle_->delta; + return this->handle_->Data(); } [[nodiscard]] std::size_t MmapResource::Size() const { return n_; } @@ -329,7 +303,3 @@ AlignedMemWriteStream::~AlignedMemWriteStream() = default; return this->pimpl_->Tell(); } } // namespace xgboost::common - -#if defined(xgboost_IS_WIN) -#undef xgboost_IS_WIN -#endif // defined(xgboost_IS_WIN) diff --git a/src/common/io.h b/src/common/io.h index 198ce7014..5f2e28336 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -7,7 +7,11 @@ #ifndef XGBOOST_COMMON_IO_H_ #define XGBOOST_COMMON_IO_H_ -#include +#include + +#if defined(xgboost_IS_WIN) +#include +#endif // defined(xgboost_IS_WIN) #include // for min, fill_n, copy_n #include // for array @@ -15,6 +19,7 @@ #include // for malloc, realloc, free #include // for memcpy #include // for ifstream +#include // for function #include // for numeric_limits #include // for unique_ptr #include // for string @@ -23,6 +28,7 @@ #include // for vector #include "common.h" // for DivRoundUp +#include "dmlc/io.h" // for SeekStream #include "xgboost/string_view.h" // for StringView namespace xgboost::common { @@ -224,7 +230,48 @@ inline std::string ReadAll(std::string const &path) { return content; } -struct MMAPFile; +/** + * @brief A handle to mmap file. + */ +struct MMAPFile { +#if defined(xgboost_IS_WIN) + HANDLE fd{INVALID_HANDLE_VALUE}; + HANDLE file_map{INVALID_HANDLE_VALUE}; +#else + std::int32_t fd{0}; +#endif // defined(xgboost_IS_WIN) + std::byte* base_ptr{nullptr}; + std::size_t base_size{0}; + std::size_t delta{0}; + std::string path; + + MMAPFile() = default; + +#if defined(xgboost_IS_WIN) + MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta, + std::string path) + : fd{fd}, + file_map{fm}, + base_ptr{base_ptr}, + base_size{base_size}, + delta{delta}, + path{std::move(path)} {} +#else + MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta, + std::string path) + : fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {} +#endif // defined(xgboost_IS_WIN) + + void const* Data() const { return this->base_ptr + this->delta; } + void* Data() { return this->base_ptr + this->delta; } +}; + +namespace detail { +// call mmap +[[nodiscard]] MMAPFile* OpenMmap(std::string path, std::size_t offset, std::size_t length); +// close the mapped file handle. +void CloseMmap(MMAPFile* handle); +} // namespace detail /** * @brief Handler for one-shot resource. Unlike `std::pmr::*`, the resource handler is @@ -237,6 +284,8 @@ class ResourceHandler { enum Kind : std::uint8_t { kMalloc = 0, kMmap = 1, + kCudaMalloc = 2, + kCudaMmap = 3, }; private: @@ -251,6 +300,20 @@ class ResourceHandler { [[nodiscard]] virtual std::size_t Size() const = 0; [[nodiscard]] auto Type() const { return kind_; } + [[nodiscard]] StringView TypeName() const { + switch (this->Type()) { + case kMalloc: + return "Malloc"; + case kMmap: + return "Mmap"; + case kCudaMalloc: + return "CudaMalloc"; + case kCudaMmap: + return "CudaMmap"; + } + LOG(FATAL) << "Unreachable."; + return {}; + } // Allow exceptions for cleaning up resource. virtual ~ResourceHandler() noexcept(false); @@ -339,11 +402,11 @@ class MallocResource : public ResourceHandler { * @brief A class for wrapping mmap as a resource for RAII. */ class MmapResource : public ResourceHandler { - std::unique_ptr handle_; + std::unique_ptr> handle_; std::size_t n_; public: - MmapResource(std::string path, std::size_t offset, std::size_t length); + MmapResource(StringView path, std::size_t offset, std::size_t length); ~MmapResource() noexcept(false) override; [[nodiscard]] void* Data() override; @@ -471,9 +534,9 @@ class PrivateMmapConstStream : public AlignedResourceReadStream { * @param offset See the `offset` parameter of `mmap` for details. * @param length See the `length` parameter of `mmap` for details. */ - explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length) + explicit PrivateMmapConstStream(StringView path, std::size_t offset, std::size_t length) : AlignedResourceReadStream{std::shared_ptr{ // NOLINT - new MmapResource{std::move(path), offset, length}}} {} + new MmapResource{path, offset, length}}} {} ~PrivateMmapConstStream() noexcept(false) override; }; diff --git a/src/common/ref_resource_view.cuh b/src/common/ref_resource_view.cuh new file mode 100644 index 000000000..ff311c140 --- /dev/null +++ b/src/common/ref_resource_view.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#pragma once + +#include // for size_t +#include // for make_shared + +#include "cuda_context.cuh" // for CUDAContext +#include "ref_resource_view.h" // for RefResourceView +#include "resource.cuh" // for CudaAllocResource +#include "xgboost/context.h" // for Context + +namespace xgboost::common { +/** + * @brief Make a fixed size `RefResourceView` with cudaMalloc resource. + */ +template +[[nodiscard]] RefResourceView MakeFixedVecWithCudaMalloc(Context const* ctx, + std::size_t n_elements, T const& init) { + auto resource = std::make_shared(n_elements * sizeof(T)); + auto ref = RefResourceView{resource->DataAs(), n_elements, resource}; + thrust::fill_n(ctx->CUDACtx()->CTP(), ref.data(), ref.size(), init); + return ref; +} +} // namespace xgboost::common diff --git a/src/common/ref_resource_view.h b/src/common/ref_resource_view.h index 61adfdb7b..81058d923 100644 --- a/src/common/ref_resource_view.h +++ b/src/common/ref_resource_view.h @@ -43,24 +43,16 @@ class RefResourceView { } public: - RefResourceView(value_type* ptr, size_type n, std::shared_ptr mem) - : ptr_{ptr}, size_{n}, mem_{std::move(mem)} { - CHECK_GE(mem_->Size(), n); - } /** * @brief Construct a view on ptr with length n. The ptr is held by the mem resource. * * @param ptr The pointer to view. * @param n The length of the view. * @param mem The owner of the pointer. - * @param init Initialize the view with this value. */ - RefResourceView(value_type* ptr, size_type n, std::shared_ptr mem, - T const& init) - : RefResourceView{ptr, n, mem} { - if (n != 0) { - std::fill_n(ptr_, n, init); - } + RefResourceView(value_type* ptr, size_type n, std::shared_ptr mem) + : ptr_{ptr}, size_{n}, mem_{std::move(mem)} { + CHECK_GE(mem_->Size(), n); } ~RefResourceView() = default; @@ -159,7 +151,9 @@ template template [[nodiscard]] RefResourceView MakeFixedVecWithMalloc(std::size_t n_elements, T const& init) { auto resource = std::make_shared(n_elements * sizeof(T)); - return RefResourceView{resource->DataAs(), n_elements, resource, init}; + auto ref = RefResourceView{resource->DataAs(), n_elements, resource}; + std::fill_n(ref.data(), ref.size(), init); + return ref; } template diff --git a/src/common/resource.cu b/src/common/resource.cu new file mode 100644 index 000000000..ef662e3bd --- /dev/null +++ b/src/common/resource.cu @@ -0,0 +1,43 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#include "device_helpers.cuh" // for CurrentDevice +#include "resource.cuh" +#include "xgboost/string_view.h" // for StringView + +namespace xgboost::common { +CudaMmapResource::CudaMmapResource(StringView path, std::size_t offset, std::size_t length) + : ResourceHandler{kCudaMmap}, + handle_{detail::OpenMmap(std::string{path}, offset, length), + [](MMAPFile* handle) { + // Don't close the mmap while CUDA kernel is running. + if (handle) { + dh::DefaultStream().Sync(); + } + detail::CloseMmap(handle); + }}, + n_{length} { + auto device = dh::CurrentDevice(); + dh::safe_cuda( + cudaMemAdvise(handle_->base_ptr, handle_->base_size, cudaMemAdviseSetReadMostly, device)); + dh::safe_cuda(cudaMemAdvise(handle_->base_ptr, handle_->base_size, + cudaMemAdviseSetPreferredLocation, device)); + dh::safe_cuda( + cudaMemAdvise(handle_->base_ptr, handle_->base_size, cudaMemAdviseSetAccessedBy, device)); + dh::safe_cuda( + cudaMemPrefetchAsync(handle_->base_ptr, handle_->base_size, device, dh::DefaultStream())); +} + +[[nodiscard]] void* CudaMmapResource::Data() { + if (!handle_) { + return nullptr; + } + return this->handle_->Data(); +} + +[[nodiscard]] std::size_t CudaMmapResource::Size() const { return n_; } + +CudaMmapResource::~CudaMmapResource() noexcept(false) = default; + +PrivateCudaMmapConstStream::~PrivateCudaMmapConstStream() noexcept(false) = default; +} // namespace xgboost::common diff --git a/src/common/resource.cuh b/src/common/resource.cuh new file mode 100644 index 000000000..90b9756a9 --- /dev/null +++ b/src/common/resource.cuh @@ -0,0 +1,54 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#pragma once +#include // for size_t +#include // for function + +#include "device_vector.cuh" // for DeviceUVector +#include "io.h" // for ResourceHandler, MMAPFile +#include "xgboost/string_view.h" // for StringView + +namespace xgboost::common { +/** + * @brief Resource backed by `cudaMalloc`. + */ +class CudaMallocResource : public ResourceHandler { + dh::DeviceUVector storage_; + + void Clear() noexcept(true) { this->Resize(0); } + + public: + explicit CudaMallocResource(std::size_t n_bytes) : ResourceHandler{kCudaMalloc} { + this->Resize(n_bytes); + } + ~CudaMallocResource() noexcept(true) override { this->Clear(); } + + void* Data() override { return storage_.data(); } + [[nodiscard]] std::size_t Size() const override { return storage_.size(); } + void Resize(std::size_t n_bytes, std::byte init = std::byte{0}) { + this->storage_.resize(n_bytes, init); + } +}; + +class CudaMmapResource : public ResourceHandler { + std::unique_ptr> handle_; + std::size_t n_; + + public: + CudaMmapResource() : ResourceHandler{kCudaMmap} {} + CudaMmapResource(StringView path, std::size_t offset, std::size_t length); + ~CudaMmapResource() noexcept(false) override; + + [[nodiscard]] void* Data() override; + [[nodiscard]] std::size_t Size() const override; +}; + +class PrivateCudaMmapConstStream : public AlignedResourceReadStream { + public: + explicit PrivateCudaMmapConstStream(StringView path, std::size_t offset, std::size_t length) + : AlignedResourceReadStream{ + std::shared_ptr{new CudaMmapResource{path, offset, length}}} {} + ~PrivateCudaMmapConstStream() noexcept(false) override; +}; +} // namespace xgboost::common diff --git a/src/context.cc b/src/context.cc index ef7110e7c..19060d5fc 100644 --- a/src/context.cc +++ b/src/context.cc @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023 by XGBoost Contributors + * Copyright 2014-2024, XGBoost Contributors * * \brief Context object used for controlling runtime parameters. */ @@ -11,8 +11,9 @@ #include // for optional #include // for regex_replace, regex_match -#include "common/common.h" // AssertGPUSupport -#include "common/error_msg.h" // WarnDeprecatedGPUId +#include "common/common.h" // AssertGPUSupport +#include "common/cuda_rt_utils.h" // for AllVisibleGPUs +#include "common/error_msg.h" // WarnDeprecatedGPUId #include "common/threading_utils.h" #include "xgboost/string_view.h" diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 81656284e..7d3f4c820 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -11,8 +11,9 @@ #include "../common/categorical.h" #include "../common/cuda_context.cuh" #include "../common/hist_util.cuh" -#include "../common/transform_iterator.h" // MakeIndexTransformIter -#include "device_adapter.cuh" // for NoInfInData +#include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc +#include "../common/transform_iterator.h" // MakeIndexTransformIter +#include "device_adapter.cuh" // for NoInfInData #include "ellpack_page.cuh" #include "ellpack_page.h" #include "gradient_index.h" @@ -43,21 +44,19 @@ __global__ void CompressBinEllpackKernel( common::CompressedBufferWriter wr, common::CompressedByteT* __restrict__ buffer, // gidx_buffer const size_t* __restrict__ row_ptrs, // row offset of input data - const Entry* __restrict__ entries, // One batch of input data - const float* __restrict__ cuts, // HistogramCuts::cut_values_ - const uint32_t* __restrict__ cut_ptrs, // HistogramCuts::cut_ptrs_ + const Entry* __restrict__ entries, // One batch of input data + const float* __restrict__ cuts, // HistogramCuts::cut_values_ + const uint32_t* __restrict__ cut_ptrs, // HistogramCuts::cut_ptrs_ common::Span feature_types, - size_t base_row, // batch_row_begin - size_t n_rows, - size_t row_stride, - unsigned int null_gidx_value) { + size_t base_row, // batch_row_begin + size_t n_rows, size_t row_stride, std::uint32_t null_gidx_value) { size_t irow = threadIdx.x + blockIdx.x * blockDim.x; int ifeature = threadIdx.y + blockIdx.y * blockDim.y; if (irow >= n_rows || ifeature >= row_stride) { return; } int row_length = static_cast(row_ptrs[irow + 1] - row_ptrs[irow]); - unsigned int bin = null_gidx_value; + std::uint32_t bin = null_gidx_value; if (ifeature < row_length) { Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + ifeature]; int feature = entry.index; @@ -89,25 +88,23 @@ __global__ void CompressBinEllpackKernel( } // Construct an ELLPACK matrix with the given number of empty rows. -EllpackPageImpl::EllpackPageImpl(DeviceOrd device, +EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows) : is_dense(is_dense), cuts_(std::move(cuts)), row_stride{row_stride}, n_rows{n_rows} { monitor_.Init("ellpack_page"); - dh::safe_cuda(cudaSetDevice(device.ordinal)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); - monitor_.Start("InitCompressedData"); - this->InitCompressedData(device); - monitor_.Stop("InitCompressedData"); + this->InitCompressedData(ctx); } -EllpackPageImpl::EllpackPageImpl(DeviceOrd device, +EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, const SparsePage& page, bool is_dense, size_t row_stride, common::Span feature_types) : cuts_(std::move(cuts)), is_dense(is_dense), n_rows(page.Size()), row_stride(row_stride) { - this->InitCompressedData(device); - this->CreateHistIndices(device, page, feature_types); + this->InitCompressedData(ctx); + this->CreateHistIndices(ctx->Device(), page, feature_types); } // Construct an ELLPACK matrix in memory. @@ -129,9 +126,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP } monitor_.Stop("Quantiles"); - monitor_.Start("InitCompressedData"); - this->InitCompressedData(ctx->Device()); - monitor_.Stop("InitCompressedData"); + this->InitCompressedData(ctx); dmat->Info().feature_types.SetDevice(ctx->Device()); auto ft = dmat->Info().feature_types.ConstDeviceSpan(); @@ -234,7 +229,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::SpanGetDeviceAccessor(device); common::CompressedBufferWriter writer(device_accessor.NumSymbols()); - auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); + auto d_compressed_buffer = dst->gidx_buffer.data(); // We redirect the scan output into this functor to do the actual writing WriteCompressedEllpackFunctor functor( @@ -275,7 +270,7 @@ void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::SpanGetDeviceAccessor(device); common::CompressedBufferWriter writer(device_accessor.NumSymbols()); - auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); + auto d_compressed_buffer = dst->gidx_buffer.data(); auto row_stride = dst->row_stride; dh::LaunchN(row_stride * dst->n_rows, [=] __device__(size_t idx) { // For some reason this variable got captured as const @@ -290,20 +285,20 @@ void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span -EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense, - common::Span row_counts_span, +EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, + bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, size_t n_rows, std::shared_ptr cuts) { - dh::safe_cuda(cudaSetDevice(device.ordinal)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); - *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows); - CopyDataToEllpack(batch, feature_types, this, device, missing); - WriteNullValues(this, device, row_counts_span); + *this = EllpackPageImpl(ctx, cuts, is_dense, row_stride, n_rows); + CopyDataToEllpack(batch, feature_types, this, ctx->Device(), missing); + WriteNullValues(this, ctx->Device(), row_counts_span); } #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ template EllpackPageImpl::EllpackPageImpl( \ - __BATCH_T batch, float missing, DeviceOrd device, bool is_dense, \ + Context const* ctx, __BATCH_T batch, float missing, bool is_dense, \ common::Span row_counts_span, common::Span feature_types, \ size_t row_stride, size_t n_rows, std::shared_ptr cuts); @@ -365,12 +360,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag row_stride = *std::max_element(it, it + page.Size()); CHECK(ctx->IsCUDA()); - monitor_.Start("InitCompressedData"); - InitCompressedData(ctx->Device()); - monitor_.Stop("InitCompressedData"); + InitCompressedData(ctx); // copy gidx - common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer(); + common::CompressedByteT* d_compressed_buffer = gidx_buffer.data(); dh::device_vector row_ptr(page.row_ptr.size()); auto d_row_ptr = dh::ToSpan(row_ptr); dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(), @@ -389,20 +382,20 @@ struct CopyPage { // The number of elements to skip. size_t offset; - CopyPage(EllpackPageImpl *dst, EllpackPageImpl const *src, size_t offset) - : cbw{dst->NumSymbols()}, dst_data_d{dst->gidx_buffer.DevicePointer()}, - src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()}, + CopyPage(EllpackPageImpl* dst, EllpackPageImpl const* src, size_t offset) + : cbw{dst->NumSymbols()}, + dst_data_d{dst->gidx_buffer.data()}, + src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()}, offset(offset) {} __device__ void operator()(size_t element_id) { - cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], - element_id + offset); + cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset); } }; // Copy the data from the given EllpackPage to the current page. -size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) { - monitor_.Start("Copy"); +size_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset) { + monitor_.Start(__func__); bst_idx_t num_elements = page->n_rows * page->row_stride; CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(NumSymbols(), page->NumSymbols()); @@ -411,10 +404,8 @@ size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size LOG(FATAL) << "Concatenating the same Ellpack."; return this->n_rows * this->row_stride; } - gidx_buffer.SetDevice(device); - page->gidx_buffer.SetDevice(device); - dh::LaunchN(num_elements, CopyPage(this, page, offset)); - monitor_.Stop("Copy"); + dh::LaunchN(num_elements, CopyPage{this, page, offset}); + monitor_.Stop(__func__); return num_elements; } @@ -423,8 +414,8 @@ struct CompactPage { common::CompressedBufferWriter cbw; common::CompressedByteT* dst_data_d; common::CompressedIterator src_iterator_d; - /*! \brief An array that maps the rows from the full DMatrix to the compacted - * page. + /** + * @brief An array that maps the rows from the full DMatrix to the compacted page. * * The total size is the number of rows in the original, uncompacted DMatrix. * Elements are the row ids in the compacted page. Rows not needed are set to @@ -438,24 +429,24 @@ struct CompactPage { size_t base_rowid; size_t row_stride; - CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src, - common::Span row_indexes) + CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src, common::Span row_indexes) : cbw{dst->NumSymbols()}, - dst_data_d{dst->gidx_buffer.DevicePointer()}, - src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()}, + dst_data_d{dst->gidx_buffer.data()}, + src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()}, row_indexes(row_indexes), base_rowid{src->base_rowid}, row_stride{src->row_stride} {} - __device__ void operator()(size_t row_id) { + __device__ void operator()(bst_idx_t row_id) { size_t src_row = base_rowid + row_id; size_t dst_row = row_indexes[src_row]; - if (dst_row == SIZE_MAX) return; + if (dst_row == SIZE_MAX) { + return; + } size_t dst_offset = dst_row * row_stride; size_t src_offset = row_id * row_stride; for (size_t j = 0; j < row_stride; j++) { - cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], - dst_offset + j); + cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], dst_offset + j); } } }; @@ -467,28 +458,22 @@ void EllpackPageImpl::Compact(Context const* ctx, EllpackPageImpl const* page, CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(NumSymbols(), page->NumSymbols()); CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size()); - gidx_buffer.SetDevice(ctx->Device()); - page->gidx_buffer.SetDevice(ctx->Device()); auto cuctx = ctx->CUDACtx(); - dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage(this, page, row_indexes)); + dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage{this, page, row_indexes}); monitor_.Stop(__func__); } // Initialize the buffer to stored compressed features. -void EllpackPageImpl::InitCompressedData(DeviceOrd device) { - size_t num_symbols = NumSymbols(); +void EllpackPageImpl::InitCompressedData(Context const* ctx) { + monitor_.Start(__func__); + auto num_symbols = NumSymbols(); // Required buffer size for storing data matrix in ELLPack format. - size_t compressed_size_bytes = + std::size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols); - gidx_buffer.SetDevice(device); - // Don't call fill unnecessarily - if (gidx_buffer.Size() == 0) { - gidx_buffer.Resize(compressed_size_bytes, 0); - } else { - gidx_buffer.Resize(compressed_size_bytes, 0); - thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0); - } + auto init = static_cast(0); + gidx_buffer = common::MakeFixedVecWithCudaMalloc(ctx, compressed_size_bytes, init); + monitor_.Stop(__func__); } // Compress a CSR page into ELLPACK. @@ -496,7 +481,7 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device, const SparsePage& row_batch, common::Span feature_types) { if (row_batch.Size() == 0) return; - unsigned int null_gidx_value = NumSymbols() - 1; + std::uint32_t null_gidx_value = NumSymbols() - 1; const auto& offset_vec = row_batch.offset.ConstHostVector(); @@ -541,13 +526,11 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device, const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x), common::DivRoundUp(row_stride, block3.y), 1); auto device_accessor = GetDeviceAccessor(device); - dh::LaunchKernel {grid3, block3}( - CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()), - gidx_buffer.DevicePointer(), row_ptrs.data().get(), - entries_d.data().get(), device_accessor.gidx_fvalue_map.data(), - device_accessor.feature_segments.data(), feature_types, - batch_row_begin, batch_nrows, row_stride, - null_gidx_value); + dh::LaunchKernel{grid3, block3}( // NOLINT + CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()), gidx_buffer.data(), + row_ptrs.data().get(), entries_d.data().get(), device_accessor.gidx_fvalue_map.data(), + device_accessor.feature_segments.data(), feature_types, batch_row_begin, batch_nrows, + row_stride, null_gidx_value); } } @@ -566,26 +549,31 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride, EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( DeviceOrd device, common::Span feature_types) const { - gidx_buffer.SetDevice(device); return {device, cuts_, is_dense, row_stride, base_rowid, n_rows, - common::CompressedIterator(gidx_buffer.ConstDevicePointer(), - NumSymbols()), + common::CompressedIterator(gidx_buffer.data(), NumSymbols()), feature_types}; } + EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor( + Context const* ctx, std::vector* h_gidx_buffer, common::Span feature_types) const { + h_gidx_buffer->resize(gidx_buffer.size()); + CHECK_EQ(h_gidx_buffer->size(), gidx_buffer.size()); + CHECK_NE(gidx_buffer.size(), 0); + dh::safe_cuda(cudaMemcpyAsync(h_gidx_buffer->data(), gidx_buffer.data(), gidx_buffer.size_bytes(), + cudaMemcpyDefault, dh::DefaultStream())); return {DeviceOrd::CPU(), cuts_, is_dense, row_stride, base_rowid, n_rows, - common::CompressedIterator(gidx_buffer.ConstHostPointer(), NumSymbols()), + common::CompressedIterator(h_gidx_buffer->data(), NumSymbols()), feature_types}; } } // namespace xgboost diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index d1f9472df..18b9384af 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -1,23 +1,25 @@ /** - * Copyright 2019-2023, XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ - #ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_ #define XGBOOST_DATA_ELLPACK_PAGE_CUH_ #include -#include #include "../common/categorical.h" #include "../common/compressed_iterator.h" #include "../common/device_helpers.cuh" #include "../common/hist_util.h" +#include "../common/ref_resource_view.h" // for RefResourceView #include "ellpack_page.h" +#include "xgboost/data.h" namespace xgboost { -/** \brief Struct for accessing and manipulating an ELLPACK matrix on the - * device. Does not own underlying memory and may be trivially copied into - * kernels.*/ +/** + * @brief Struct for accessing and manipulating an ELLPACK matrix on the device. + * + * Does not own underlying memory and may be trivially copied into kernels. + */ struct EllpackDeviceAccessor { /*! \brief Whether or not if the matrix is dense. */ bool is_dense; @@ -128,31 +130,31 @@ class GHistIndexMatrix; class EllpackPageImpl { public: - /*! - * \brief Default constructor. + /** + * @brief Default constructor. * * This is used in the external memory case. An empty ELLPACK page is constructed with its content * set later by the reader. */ EllpackPageImpl() = default; - /*! - * \brief Constructor from an existing EllpackInfo. + /** + * @brief Constructor from an existing EllpackInfo. * - * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo - * and the given number of rows. + * This is used in the sampling case. The ELLPACK page is constructed from an existing + * Ellpack page and the given number of rows. */ - EllpackPageImpl(DeviceOrd device, std::shared_ptr cuts, + EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows); - /*! - * \brief Constructor used for external memory. + /** + * @brief Constructor used for external memory. */ - EllpackPageImpl(DeviceOrd device, std::shared_ptr cuts, + EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, const SparsePage& page, bool is_dense, size_t row_stride, common::Span feature_types); - /*! - * \brief Constructor from an existing DMatrix. + /** + * @brief Constructor from an existing DMatrix. * * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix * in CSR format. @@ -160,37 +162,39 @@ class EllpackPageImpl { explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm); template - explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense, + explicit EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, size_t n_rows, std::shared_ptr cuts); /** - * \brief Constructor from an existing CPU gradient index. + * @brief Constructor from an existing CPU gradient index. */ explicit EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& page, common::Span ft); - /*! \brief Copy the elements of the given ELLPACK page into this page. + /** + * @brief Copy the elements of the given ELLPACK page into this page. * - * @param device The GPU device to use. + * @param ctx The GPU context. * @param page The ELLPACK page to copy from. * @param offset The number of elements to skip before copying. * @returns The number of elements copied. */ - size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset); + bst_idx_t Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset); - /*! \brief Compact the given ELLPACK page into the current page. + /** + * @brief Compact the given ELLPACK page into the current page. * - * @param context The GPU context. + * @param ctx The GPU context. * @param page The ELLPACK page to compact from. * @param row_indexes Row indexes for the compacted page. */ void Compact(Context const* ctx, EllpackPageImpl const* page, common::Span row_indexes); - /*! \return Number of instances in the page. */ + /** @return Number of instances in the page. */ [[nodiscard]] bst_idx_t Size() const; - /*! \brief Set the base row id for this page. */ + /** @brief Set the base row id for this page. */ void SetBaseRowId(std::size_t row_id) { base_rowid = row_id; } @@ -199,43 +203,54 @@ class EllpackPageImpl { [[nodiscard]] std::shared_ptr CutsShared() const { return cuts_; } void SetCuts(std::shared_ptr cuts) { cuts_ = cuts; } - /*! \return Estimation of memory cost of this page. */ + /** @return Estimation of memory cost of this page. */ static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ; - /*! \brief Return the total number of symbols (total number of bins plus 1 for - * not found). */ + /** + * @brief Return the total number of symbols (total number of bins plus 1 for not + * found). + */ [[nodiscard]] std::size_t NumSymbols() const { return cuts_->TotalBins() + 1; } - + /** + * @brief Get an accessor that can be passed into CUDA kernels. + */ [[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor( DeviceOrd device, common::Span feature_types = {}) const; + /** + * @brief Get an accessor for host code. + */ [[nodiscard]] EllpackDeviceAccessor GetHostAccessor( + Context const* ctx, std::vector* h_gidx_buffer, common::Span feature_types = {}) const; private: - /*! - * \brief Compress a single page of CSR data into ELLPACK. + /** + * @brief Compress a single page of CSR data into ELLPACK. * * @param device The GPU device to use. * @param row_batch The CSR page. */ - void CreateHistIndices(DeviceOrd device, - const SparsePage& row_batch, + void CreateHistIndices(DeviceOrd device, const SparsePage& row_batch, common::Span feature_types); - /*! - * \brief Initialize the buffer to store compressed features. + /** + * @brief Initialize the buffer to store compressed features. */ - void InitCompressedData(DeviceOrd device); + void InitCompressedData(Context const* ctx); public: - /*! \brief Whether or not if the matrix is dense. */ + /** @brief Whether or not if the matrix is dense. */ bool is_dense; - /*! \brief Row length for ELLPACK. */ + /** @brief Row length for ELLPACK. */ bst_idx_t row_stride; bst_idx_t base_rowid{0}; - bst_idx_t n_rows{}; - /*! \brief global index of histogram, which is stored in ELLPACK format. */ - HostDeviceVector gidx_buffer; + bst_idx_t n_rows{0}; + /** + * @brief Index of the gradient histogram, which is stored in ELLPACK format. + * + * This can be backed by various storage types. + */ + common::RefResourceView gidx_buffer; private: std::shared_ptr cuts_; diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu index 059dd9f21..3f23c5d8d 100644 --- a/src/data/ellpack_page_raw_format.cu +++ b/src/data/ellpack_page_raw_format.cu @@ -4,11 +4,12 @@ #include #include // for size_t -#include // for uint64_t +#include // for vector -#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream -#include "../common/ref_resource_view.h" // for ReadVec, WriteVec -#include "ellpack_page.cuh" // for EllpackPage +#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream +#include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc +#include "../common/ref_resource_view.h" // for ReadVec, WriteVec +#include "ellpack_page.cuh" // for EllpackPage #include "ellpack_page_raw_format.h" #include "ellpack_page_source.h" @@ -16,8 +17,10 @@ namespace xgboost::data { DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format); namespace { +// Function to support system without HMM or ATS template -[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi, HostDeviceVector* vec) { +[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi, + common::RefResourceView* vec) { std::uint64_t n{0}; if (!fi->Read(&n)) { return false; @@ -33,34 +36,34 @@ template return false; } - vec->Resize(n); - auto d_vec = vec->DeviceSpan(); - dh::safe_cuda( - cudaMemcpyAsync(d_vec.data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream())); + auto ctx = Context{}.MakeCUDA(common::CurrentDevice()); + *vec = common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast(0)); + dh::safe_cuda(cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream())); return true; } } // namespace +#define RET_IF_NOT(expr) \ + if (!(expr)) { \ + return false; \ + } + [[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page, common::AlignedResourceReadStream* fi) { auto* impl = page->Impl(); + impl->SetCuts(this->cuts_); - if (!fi->Read(&impl->n_rows)) { - return false; - } - if (!fi->Read(&impl->is_dense)) { - return false; - } - if (!fi->Read(&impl->row_stride)) { - return false; - } - impl->gidx_buffer.SetDevice(device_); - if (!ReadDeviceVec(fi, &impl->gidx_buffer)) { - return false; - } - if (!fi->Read(&impl->base_rowid)) { - return false; + RET_IF_NOT(fi->Read(&impl->n_rows)); + RET_IF_NOT(fi->Read(&impl->is_dense)); + RET_IF_NOT(fi->Read(&impl->row_stride)); + + if (has_hmm_ats_) { + RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer)); + } else { + RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer)); } + RET_IF_NOT(fi->Read(&impl->base_rowid)); + dh::DefaultStream().Sync(); return true; } @@ -71,8 +74,10 @@ template bytes += fo->Write(impl->n_rows); bytes += fo->Write(impl->is_dense); bytes += fo->Write(impl->row_stride); - CHECK(!impl->gidx_buffer.ConstHostVector().empty()); - bytes += common::WriteVec(fo, impl->gidx_buffer.HostVector()); + std::vector h_gidx_buffer; + Context ctx = Context{}.MakeCUDA(common::CurrentDevice()); + [[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer); + bytes += common::WriteVec(fo, h_gidx_buffer); bytes += fo->Write(impl->base_rowid); dh::DefaultStream().Sync(); return bytes; @@ -82,33 +87,20 @@ template auto* impl = page->Impl(); CHECK(this->cuts_->cut_values_.DeviceCanRead()); impl->SetCuts(this->cuts_); - if (!fi->Read(&impl->n_rows)) { - return false; - } - if (!fi->Read(&impl->is_dense)) { - return false; - } - if (!fi->Read(&impl->row_stride)) { - return false; - } + RET_IF_NOT(fi->Read(&impl->n_rows)); + RET_IF_NOT(fi->Read(&impl->is_dense)); + RET_IF_NOT(fi->Read(&impl->row_stride)); // Read vec + Context ctx = Context{}.MakeCUDA(common::CurrentDevice()); bst_idx_t n{0}; - if (!fi->Read(&n)) { - return false; - } + RET_IF_NOT(fi->Read(&n)); if (n != 0) { - impl->gidx_buffer.SetDevice(device_); - impl->gidx_buffer.Resize(n); - auto span = impl->gidx_buffer.DeviceSpan(); - if (!fi->Read(span.data(), span.size_bytes())) { - return false; - } - } - - if (!fi->Read(&impl->base_rowid)) { - return false; + impl->gidx_buffer = + common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast(0)); + RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes())); } + RET_IF_NOT(fi->Read(&impl->base_rowid)); dh::DefaultStream().Sync(); return true; @@ -123,16 +115,17 @@ template bytes += fo->Write(impl->row_stride); // Write vector - bst_idx_t n = impl->gidx_buffer.Size(); + bst_idx_t n = impl->gidx_buffer.size(); bytes += fo->Write(n); - if (!impl->gidx_buffer.Empty()) { - auto span = impl->gidx_buffer.ConstDeviceSpan(); - bytes += fo->Write(span.data(), span.size_bytes()); + if (!impl->gidx_buffer.empty()) { + bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()); } bytes += fo->Write(impl->base_rowid); dh::DefaultStream().Sync(); return bytes; } + +#undef RET_IF_NOT } // namespace xgboost::data diff --git a/src/data/ellpack_page_raw_format.h b/src/data/ellpack_page_raw_format.h index 8c3f89f0c..e2761c73f 100644 --- a/src/data/ellpack_page_raw_format.h +++ b/src/data/ellpack_page_raw_format.h @@ -26,10 +26,13 @@ class EllpackHostCacheStream; class EllpackPageRawFormat : public SparsePageFormat { std::shared_ptr cuts_; DeviceOrd device_; + // Supports CUDA HMM or ATS + bool has_hmm_ats_{false}; public: - explicit EllpackPageRawFormat(std::shared_ptr cuts, DeviceOrd device) - : cuts_{std::move(cuts)}, device_{device} {} + explicit EllpackPageRawFormat(std::shared_ptr cuts, DeviceOrd device, + bool has_hmm_ats) + : cuts_{std::move(cuts)}, device_{device}, has_hmm_ats_{has_hmm_ats} {} [[nodiscard]] bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override; [[nodiscard]] std::size_t Write(const EllpackPage& page, common::AlignedFileWriteStream* fo) override; diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index f53ae3ef1..a70d9150c 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -11,6 +11,7 @@ #include "../common/common.h" // for safe_cuda #include "../common/cuda_pinned_allocator.h" // for pinned_allocator #include "../common/device_helpers.cuh" // for CUDAStreamView, DefaultStream +#include "../common/resource.cuh" // for PrivateCudaMmapConstStream #include "ellpack_page.cuh" // for EllpackPageImpl #include "ellpack_page.h" // for EllpackPage #include "ellpack_page_source.h" @@ -86,16 +87,16 @@ void EllpackHostCacheStream::Seek(bst_idx_t offset_bytes) { this->p_impl_->Seek( void EllpackHostCacheStream::Bound(bst_idx_t offset_bytes) { this->p_impl_->Bound(offset_bytes); } /** - * EllpackFormatType + * EllpackCacheStreamPolicy */ template typename F> -EllpackFormatStreamPolicy::EllpackFormatStreamPolicy() +EllpackCacheStreamPolicy::EllpackCacheStreamPolicy() : p_cache_{std::make_shared()} {} template typename F> -[[nodiscard]] std::unique_ptr::WriterT> -EllpackFormatStreamPolicy::CreateWriter(StringView, std::uint32_t iter) { +[[nodiscard]] std::unique_ptr::WriterT> +EllpackCacheStreamPolicy::CreateWriter(StringView, std::uint32_t iter) { auto fo = std::make_unique(this->p_cache_); if (iter == 0) { CHECK(this->p_cache_->cache.empty()); @@ -106,9 +107,8 @@ EllpackFormatStreamPolicy::CreateWriter(StringView, std::uint32_t iter) { } template typename F> -[[nodiscard]] std::unique_ptr::ReaderT> -EllpackFormatStreamPolicy::CreateReader(StringView, bst_idx_t offset, - bst_idx_t length) const { +[[nodiscard]] std::unique_ptr::ReaderT> +EllpackCacheStreamPolicy::CreateReader(StringView, bst_idx_t offset, bst_idx_t length) const { auto fi = std::make_unique(this->p_cache_); fi->Seek(offset); fi->Bound(offset + length); @@ -117,18 +117,40 @@ EllpackFormatStreamPolicy::CreateReader(StringView, bst_idx_t offset, } // Instantiation -template EllpackFormatStreamPolicy::EllpackFormatStreamPolicy(); +template EllpackCacheStreamPolicy::EllpackCacheStreamPolicy(); template std::unique_ptr< - typename EllpackFormatStreamPolicy::WriterT> -EllpackFormatStreamPolicy::CreateWriter(StringView name, - std::uint32_t iter); + typename EllpackCacheStreamPolicy::WriterT> +EllpackCacheStreamPolicy::CreateWriter(StringView name, + std::uint32_t iter); template std::unique_ptr< - typename EllpackFormatStreamPolicy::ReaderT> -EllpackFormatStreamPolicy::CreateReader( + typename EllpackCacheStreamPolicy::ReaderT> +EllpackCacheStreamPolicy::CreateReader( StringView name, std::uint64_t offset, std::uint64_t length) const; +/** + * EllpackMmapStreamPolicy + */ + +template typename F> +[[nodiscard]] std::unique_ptr::ReaderT> +EllpackMmapStreamPolicy::CreateReader(StringView name, bst_idx_t offset, + bst_idx_t length) const { + if (has_hmm_) { + return std::make_unique(name, offset, length); + } else { + return std::make_unique(name, offset, length); + } +} + +// Instantiation +template std::unique_ptr< + typename EllpackMmapStreamPolicy::ReaderT> +EllpackMmapStreamPolicy::CreateReader(StringView name, + bst_idx_t offset, + bst_idx_t length) const; + /** * EllpackPageSourceImpl */ @@ -146,8 +168,8 @@ void EllpackPageSourceImpl::Fetch() { auto const& csr = this->source_->Page(); this->page_.reset(new EllpackPage{}); auto* impl = this->page_->Impl(); - *impl = EllpackPageImpl{this->Device(), this->GetCuts(), *csr, - is_dense_, row_stride_, feature_types_}; + Context ctx = Context{}.MakeCUDA(this->Device().ordinal); + *impl = EllpackPageImpl{&ctx, this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_}; this->page_->SetBaseRowId(csr->base_rowid); this->WriteCache(); } @@ -157,5 +179,7 @@ void EllpackPageSourceImpl::Fetch() { template void EllpackPageSourceImpl>::Fetch(); template void -EllpackPageSourceImpl>::Fetch(); +EllpackPageSourceImpl>::Fetch(); +template void +EllpackPageSourceImpl>::Fetch(); } // namespace xgboost::data diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 7f50899b9..1436f9151 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -9,6 +9,7 @@ #include // for shared_ptr #include // for move +#include "../common/cuda_rt_utils.h" // for SupportsPageableMem #include "../common/hist_util.h" // for HistogramCuts #include "ellpack_page.h" // for EllpackPage #include "ellpack_page_raw_format.h" // for EllpackPageRawFormat @@ -59,14 +60,19 @@ template class EllpackFormatPolicy { std::shared_ptr cuts_{nullptr}; DeviceOrd device_; + bool has_hmm_{common::SupportsPageableMem()}; public: using FormatT = EllpackPageRawFormat; public: + EllpackFormatPolicy() = default; + // For testing with the HMM flag. + explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {} + [[nodiscard]] auto CreatePageFormat() const { CHECK_EQ(cuts_->cut_values_.Device(), device_); - std::unique_ptr fmt{new EllpackPageRawFormat{cuts_, device_}}; + std::unique_ptr fmt{new EllpackPageRawFormat{cuts_, device_, has_hmm_}}; return fmt; } @@ -83,7 +89,7 @@ class EllpackFormatPolicy { }; template typename F> -class EllpackFormatStreamPolicy : public F { +class EllpackCacheStreamPolicy : public F { std::shared_ptr p_cache_; public: @@ -91,13 +97,42 @@ class EllpackFormatStreamPolicy : public F { using ReaderT = EllpackHostCacheStream; public: - EllpackFormatStreamPolicy(); + EllpackCacheStreamPolicy(); [[nodiscard]] std::unique_ptr CreateWriter(StringView name, std::uint32_t iter); [[nodiscard]] std::unique_ptr CreateReader(StringView name, bst_idx_t offset, bst_idx_t length) const; }; +template typename F> +class EllpackMmapStreamPolicy : public F { + bool has_hmm_{common::SupportsPageableMem()}; + + public: + using WriterT = common::AlignedFileWriteStream; + using ReaderT = common::AlignedResourceReadStream; + + public: + EllpackMmapStreamPolicy() = default; + // For testing with the HMM flag. + template < + typename std::enable_if_t, EllpackFormatPolicy>>* = nullptr> + explicit EllpackMmapStreamPolicy(bool has_hmm) : F{has_hmm}, has_hmm_{has_hmm} {} + + [[nodiscard]] std::unique_ptr CreateWriter(StringView name, std::uint32_t iter) { + std::unique_ptr fo; + if (iter == 0) { + fo = std::make_unique(name, "wb"); + } else { + fo = std::make_unique(name, "ab"); + } + return fo; + } + + [[nodiscard]] std::unique_ptr CreateReader(StringView name, bst_idx_t offset, + bst_idx_t length) const; +}; + template class EllpackPageSourceImpl : public PageSourceIncMixIn { using Super = PageSourceIncMixIn; @@ -128,11 +163,11 @@ class EllpackPageSourceImpl : public PageSourceIncMixIn { // Cache to host using EllpackPageHostSource = - EllpackPageSourceImpl>; + EllpackPageSourceImpl>; // Cache to disk using EllpackPageSource = - EllpackPageSourceImpl>; + EllpackPageSourceImpl>; #if !defined(XGBOOST_USE_CUDA) template diff --git a/src/data/gradient_index.cu b/src/data/gradient_index.cu index 42018eab4..f8c8f8d48 100644 --- a/src/data/gradient_index.cu +++ b/src/data/gradient_index.cu @@ -16,7 +16,8 @@ template void SetIndexData(Context const* ctx, EllpackPageImpl const* page, std::vector* p_hit_count_tloc, CompressOffset&& get_offset, GHistIndexMatrix* out) { - auto accessor = page->GetHostAccessor(); + std::vector h_gidx_buffer; + auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer); auto const kNull = static_cast(accessor.NullValue()); common::Span index_data_span = {out->index.data(), out->index.Size()}; @@ -47,7 +48,8 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page, if (page->is_dense) { std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride); } else { - auto accessor = page->GetHostAccessor(); + std::vector h_gidx_buffer; + auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer); auto const kNull = static_cast(accessor.NullValue()); common::ParallelFor(page->Size(), ctx->Threads(), [&](auto i) { diff --git a/src/data/histogram_cut_format.h b/src/data/histogram_cut_format.h deleted file mode 100644 index d4eb81ad2..000000000 --- a/src/data/histogram_cut_format.h +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Copyright 2021-2024, XGBoost contributors - */ -#ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ -#define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ - -#include // for Stream - -#include // for size_t - -#include "../common/hist_util.h" // for HistogramCuts -#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream -#include "../common/ref_resource_view.h" // for WriteVec, ReadVec - -namespace xgboost::data { -inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResourceReadStream *fi) { - if (!common::ReadVec(fi, &cuts->cut_values_.HostVector())) { - return false; - } - if (!common::ReadVec(fi, &cuts->cut_ptrs_.HostVector())) { - return false; - } - if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) { - return false; - } - bool has_cat{false}; - if (!fi->Read(&has_cat)) { - return false; - } - decltype(cuts->MaxCategory()) max_cat{0}; - if (!fi->Read(&max_cat)) { - return false; - } - cuts->SetCategorical(has_cat, max_cat); - return true; -} - -inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts, - common::AlignedFileWriteStream *fo) { - std::size_t bytes = 0; - bytes += common::WriteVec(fo, cuts.Values()); - bytes += common::WriteVec(fo, cuts.Ptrs()); - bytes += common::WriteVec(fo, cuts.MinValues()); - bytes += fo->Write(cuts.HasCategorical()); - bytes += fo->Write(cuts.MaxCategory()); - return bytes; -} -} // namespace xgboost::data -#endif // XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 2e8da2c7e..0cb32c5aa 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -5,6 +5,7 @@ #include #include "../collective/allreduce.h" +#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../common/hist_util.cuh" #include "batch_utils.h" // for RegenGHist #include "device_adapter.cuh" @@ -45,11 +46,17 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, int32_t current_device; dh::safe_cuda(cudaGetDevice(¤t_device)); + auto get_ctx = [&]() { + Context d_ctx = (ctx->IsCUDA()) ? *ctx : Context{}.MakeCUDA(current_device); + CHECK(!d_ctx.IsCPU()); + return d_ctx; + }; auto get_device = [&]() { auto d = (ctx->IsCUDA()) ? ctx->Device() : DeviceOrd::CUDA(current_device); CHECK(!d.IsCPU()); return d; }; + fmat_ctx_ = get_ctx(); /** * Generate quantiles @@ -118,7 +125,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, // that case device id is invalid. ellpack_.reset(new EllpackPage); *(ellpack_->Impl()) = - EllpackPageImpl(get_device(), cuts, this->IsDense(), row_stride, accumulated_rows); + EllpackPageImpl(&fmat_ctx_, cuts, this->IsDense(), row_stride, accumulated_rows); } }; @@ -142,10 +149,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, proxy->Info().feature_types.SetDevice(get_device()); auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan(); auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) { - return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span, - d_feature_types, row_stride, rows, cuts); + return EllpackPageImpl(&fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types, + row_stride, rows, cuts); }); - size_t num_elements = ellpack_->Impl()->Copy(get_device(), &new_impl, offset); + std::size_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset); offset += num_elements; proxy->Info().num_row_ = num_rows(); diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index 550631b72..62b39886e 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -226,7 +226,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol } // An heuristic for number of pre-fetched batches. We can make it part of BatchParam // to let user adjust number of pre-fetched batches when needed. - std::int32_t kPrefetches = 3; + std::int32_t constexpr kPrefetches = 3; std::int32_t n_prefetches = std::min(nthreads_, kPrefetches); n_prefetches = std::max(n_prefetches, 1); std::int32_t n_prefetch_batches = std::min(static_cast(n_prefetches), n_batches_); diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index 71905debc..2d288fa9d 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -10,12 +10,12 @@ #include #include -#include #include #include #include "../common/common.h" -#include "../common/error_msg.h" // NoCategorical, DeprecatedFunc +#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs +#include "../common/error_msg.h" // NoCategorical, DeprecatedFunc #include "../common/threading_utils.h" #include "../common/timer.h" #include "gblinear_model.h" diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 9ff4abb4d..26c768faf 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023 by Contributors + * Copyright 2014-2024, XGBoost Contributors * \file gbtree.cc * \brief gradient boosted tree implementation. * \author Tianqi Chen @@ -10,14 +10,14 @@ #include #include // for equal -#include // for uint32_t -#include +#include // for uint32_t #include #include #include #include #include "../common/common.h" +#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../common/error_msg.h" // for UnknownDevice, WarnOldSerialization, InplacePredictProxy #include "../common/random.h" #include "../common/threading_utils.h" @@ -347,7 +347,7 @@ void GBTree::LoadConfig(Json const& in) { // This would cause all trees to be pushed to trees_to_update // e.g. updating a model, then saving and loading it would result in an empty model tparam_.process_type = TreeProcessType::kDefault; - std::int32_t const n_gpus = xgboost::common::AllVisibleGPUs(); + std::int32_t const n_gpus = common::AllVisibleGPUs(); auto msg = StringView{ R"( diff --git a/src/learner.cc b/src/learner.cc index 93db7f801..542bf1dc6 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1321,7 +1321,7 @@ class LearnerImpl : public LearnerIO { std::ostringstream os; os.precision(std::numeric_limits::max_digits10); os << '[' << iter << ']' << std::setiosflags(std::ios::fixed); - if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) { + if (metrics_.empty() && !tparam_.disable_default_eval_metric) { metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_)); auto config = obj_->DefaultMetricConfig(); if (!IsA(config)) { diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 29fb6bb6a..fe46e19ec 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -16,6 +16,7 @@ #include "../common/categorical.h" #include "../common/common.h" #include "../common/cuda_context.cuh" // for CUDAContext +#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../common/device_helpers.cuh" #include "../common/error_msg.h" // for InplacePredictProxy #include "../data/device_adapter.cuh" diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu index dd71465df..4f1f994a6 100644 --- a/src/tree/fit_stump.cu +++ b/src/tree/fit_stump.cu @@ -3,9 +3,6 @@ * * @brief Utilities for estimating initial score. */ -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) #include // cuda::par #include // thrust::make_counting_iterator diff --git a/src/tree/fit_stump.h b/src/tree/fit_stump.h index 2af779f77..ab947a659 100644 --- a/src/tree/fit_stump.h +++ b/src/tree/fit_stump.h @@ -1,5 +1,5 @@ /** - * Copyright 2022 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors * * \brief Utilities for estimating initial score. */ @@ -7,18 +7,12 @@ #ifndef XGBOOST_TREE_FIT_STUMP_H_ #define XGBOOST_TREE_FIT_STUMP_H_ -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) - #include // std::max -#include "../common/common.h" // AssertGPUSupport -#include "xgboost/base.h" // GradientPair -#include "xgboost/context.h" // Context -#include "xgboost/data.h" // MetaInfo -#include "xgboost/host_device_vector.h" // HostDeviceVector -#include "xgboost/linalg.h" // TensorView +#include "xgboost/base.h" // GradientPair +#include "xgboost/context.h" // Context +#include "xgboost/data.h" // MetaInfo +#include "xgboost/linalg.h" // TensorView namespace xgboost { namespace tree { diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index d2031ca21..3235e9ec3 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -163,14 +163,14 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx, if (!page_concatenated_) { // Concatenate all the external memory ELLPACK pages into a single in-memory page. page_.reset(nullptr); - size_t offset = 0; + bst_idx_t offset = 0; for (auto& batch : dmat->GetBatches(ctx, batch_param_)) { auto page = batch.Impl(); if (!page_) { - page_ = std::make_unique(ctx->Device(), page->CutsShared(), page->is_dense, + page_ = std::make_unique(ctx, page->CutsShared(), page->is_dense, page->row_stride, dmat->Info().num_row_); } - size_t num_elements = page_->Copy(ctx->Device(), page, offset); + bst_idx_t num_elements = page_->Copy(ctx, page, offset); offset += num_elements; } page_concatenated_ = true; @@ -228,11 +228,11 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx, auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating - page_.reset(new EllpackPageImpl(ctx->Device(), first_page->CutsShared(), first_page->is_dense, + page_.reset(new EllpackPageImpl(ctx, first_page->CutsShared(), first_page->is_dense, first_page->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. - thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); + thrust::fill(cuctx->CTP(), page_->gidx_buffer.begin(), page_->gidx_buffer.end(), 0); for (auto& batch : batch_iterator) { page_->Compact(ctx, batch.Impl(), dh::ToSpan(sample_row_index_)); } @@ -283,10 +283,10 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c // Perform Poisson sampling in place. thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), thrust::counting_iterator(0), dh::tbegin(gpair), - PoissonSampling(dh::ToSpan(threshold_), threshold_index, - RandomWeight(common::GlobalRandom()()))); + PoissonSampling{dh::ToSpan(threshold_), threshold_index, + RandomWeight(common::GlobalRandom()())}); // Count the sampled rows. - size_t sample_rows = + bst_idx_t sample_rows = thrust::count_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), IsNonZero()); // Compact gradient pairs. gpair_.resize(sample_rows); @@ -302,10 +302,10 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating - page_.reset(new EllpackPageImpl(ctx->Device(), first_page->CutsShared(), dmat->IsDense(), - first_page->row_stride, sample_rows)); + page_.reset(new EllpackPageImpl{ctx, first_page->CutsShared(), dmat->IsDense(), + first_page->row_stride, sample_rows}); // Compact the ELLPACK pages into the single sample page. - thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); + thrust::fill(cuctx->CTP(), page_->gidx_buffer.begin(), page_->gidx_buffer.end(), 0); for (auto& batch : batch_iterator) { page_->Compact(ctx, batch.Impl(), dh::ToSpan(sample_row_index_)); } diff --git a/src/tree/gpu_hist/gradient_based_sampler.cuh b/src/tree/gpu_hist/gradient_based_sampler.cuh index 5a57e2ae8..79008b1ae 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cuh +++ b/src/tree/gpu_hist/gradient_based_sampler.cuh @@ -1,20 +1,19 @@ /** - * Copyright 2019-2023, XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #pragma once -#include -#include -#include +#include // for size_t -#include "../../common/device_helpers.cuh" -#include "../../data/ellpack_page.cuh" - -namespace xgboost { -namespace tree { +#include "../../common/device_vector.cuh" // for device_vector, caching_device_vector +#include "../../data/ellpack_page.cuh" // for EllpackPageImpl +#include "xgboost/base.h" // for GradientPair +#include "xgboost/data.h" // for BatchParam +#include "xgboost/span.h" // for Span +namespace xgboost::tree { struct GradientBasedSample { /*!\brief Number of sampled rows. */ - size_t sample_rows; + std::size_t sample_rows; /*!\brief Sampled rows in ELLPACK format. */ EllpackPageImpl const* page; /*!\brief Gradient pairs for the sampled rows. */ @@ -137,5 +136,4 @@ class GradientBasedSampler { common::Monitor monitor_; std::unique_ptr strategy_; }; -}; // namespace tree -}; // namespace xgboost +}; // namespace xgboost::tree diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 199578572..7d566c3b4 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -16,7 +16,8 @@ #include "../collective/broadcast.h" #include "../common/bitfield.h" #include "../common/categorical.h" -#include "../common/cuda_context.cuh" // CUDAContext +#include "../common/cuda_context.cuh" // for CUDAContext +#include "../common/cuda_rt_utils.h" // for CheckComputeCapability #include "../common/device_helpers.cuh" #include "../common/hist_util.h" #include "../common/random.h" // for ColumnSampler, GlobalRandom @@ -826,7 +827,7 @@ class GPUHistMaker : public TreeUpdater { // Used in test to count how many configurations are performed LOG(DEBUG) << "[GPU Hist]: Configure"; hist_maker_param_.UpdateAllowUnknown(args); - dh::CheckComputeCapability(); + common::CheckComputeCapability(); initialised_ = false; monitor_.Init("updater_gpu_hist"); @@ -852,17 +853,13 @@ class GPUHistMaker : public TreeUpdater { CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented(); auto gpair_hdv = gpair->Data(); // build tree - try { - std::size_t t_idx{0}; - for (xgboost::RegTree* tree : trees) { - this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]); - this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree); - ++t_idx; - } - dh::safe_cuda(cudaGetLastError()); - } catch (const std::exception& e) { - LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl; + std::size_t t_idx{0}; + for (xgboost::RegTree* tree : trees) { + this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]); + this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree); + ++t_idx; } + dh::safe_cuda(cudaGetLastError()); monitor_.Stop("Update"); } @@ -958,7 +955,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) { LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU."; } - dh::CheckComputeCapability(); + common::CheckComputeCapability(); initialised_ = false; monitor_.Init(this->Name()); diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h index 66c6ce9bf..243091190 100644 --- a/tests/cpp/collective/test_worker.h +++ b/tests/cpp/collective/test_worker.h @@ -15,7 +15,7 @@ #include "../../../src/collective/comm.h" #include "../../../src/collective/communicator-inl.h" // for Init, Finalize #include "../../../src/collective/tracker.h" // for GetHostAddress -#include "../../../src/common/common.h" // for AllVisibleGPUs +#include "../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../helpers.h" // for FileExists #if defined(XGBOOST_USE_FEDERATED) diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index a0aa5fa11..c730390c3 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -4,10 +4,11 @@ #include #include #include - -#include "../../../src/common/device_helpers.cuh" #include +#include "../../../src/common/cuda_rt_utils.h" // for SetDevice +#include "../../../src/common/device_helpers.cuh" + namespace xgboost::common { namespace { void SetDeviceForTest(DeviceOrd device) { diff --git a/tests/cpp/common/test_ref_resource_view.cc b/tests/cpp/common/test_ref_resource_view.cc index 9ae55fdec..b201f6913 100644 --- a/tests/cpp/common/test_ref_resource_view.cc +++ b/tests/cpp/common/test_ref_resource_view.cc @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include @@ -16,17 +16,16 @@ TEST(RefResourceView, Basic) { std::size_t n_bytes = 1024; auto mem = std::make_shared(n_bytes); { - RefResourceView view{reinterpret_cast(mem->Data()), mem->Size() / sizeof(float), mem}; + RefResourceView view{static_cast(mem->Data()), mem->Size() / sizeof(float), mem}; - RefResourceView kview{reinterpret_cast(mem->Data()), mem->Size() / sizeof(float), - mem}; + RefResourceView kview{static_cast(mem->Data()), mem->Size() / sizeof(float), mem}; ASSERT_EQ(mem.use_count(), 3); ASSERT_EQ(view.size(), n_bytes / sizeof(1024)); ASSERT_EQ(kview.size(), n_bytes / sizeof(1024)); } { - RefResourceView view{reinterpret_cast(mem->Data()), mem->Size() / sizeof(float), mem, - 1.5f}; + RefResourceView view{static_cast(mem->Data()), mem->Size() / sizeof(float), mem}; + std::fill_n(static_cast(mem->Data()), mem->Size() / sizeof(float), 1.5f); for (auto v : view) { ASSERT_EQ(v, 1.5f); } diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 9d9687dda..8aab51b72 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -27,15 +27,15 @@ TEST(EllpackPage, EmptyDMatrix) { auto impl = page.Impl(); ASSERT_EQ(impl->row_stride, 0); ASSERT_EQ(impl->Cuts().TotalBins(), 0); - ASSERT_EQ(impl->gidx_buffer.Size(), 4); + ASSERT_EQ(impl->gidx_buffer.size(), 4); } TEST(EllpackPage, BuildGidxDense) { int constexpr kNRows = 16, kNCols = 8; - auto page = BuildEllpackPage(kNRows, kNCols); - - std::vector h_gidx_buffer(page->gidx_buffer.HostVector()); - common::CompressedIterator gidx(h_gidx_buffer.data(), page->NumSymbols()); + auto ctx = MakeCUDACtx(0); + auto page = BuildEllpackPage(&ctx, kNRows, kNCols); + std::vector h_gidx_buffer; + auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer); ASSERT_EQ(page->row_stride, kNCols); @@ -58,16 +58,17 @@ TEST(EllpackPage, BuildGidxDense) { 1, 4, 7, 10, 14, 16, 19, 21, }; for (size_t i = 0; i < kNRows * kNCols; ++i) { - ASSERT_EQ(solution[i], gidx[i]); + ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]); } } TEST(EllpackPage, BuildGidxSparse) { int constexpr kNRows = 16, kNCols = 8; - auto page = BuildEllpackPage(kNRows, kNCols, 0.9f); + auto ctx = MakeCUDACtx(0); + auto page = BuildEllpackPage(&ctx, kNRows, kNCols, 0.9f); - std::vector h_gidx_buffer(page->gidx_buffer.HostVector()); - common::CompressedIterator gidx(h_gidx_buffer.data(), 25); + std::vector h_gidx_buffer; + auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer); ASSERT_LE(page->row_stride, 3); @@ -78,7 +79,7 @@ TEST(EllpackPage, BuildGidxSparse) { 24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24 }; for (size_t i = 0; i < kNRows * page->row_stride; ++i) { - ASSERT_EQ(solution[i], gidx[i]); + ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]); } } @@ -94,7 +95,7 @@ TEST(EllpackPage, FromCategoricalBasic) { Context ctx{MakeCUDACtx(0)}; auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()}; auto ellpack = EllpackPage(&ctx, m.get(), p); - auto accessor = ellpack.Impl()->GetDeviceAccessor(FstCU()); + auto accessor = ellpack.Impl()->GetDeviceAccessor(ctx.Device()); ASSERT_EQ(kCats, accessor.NumBins()); auto x_copy = x; @@ -110,13 +111,11 @@ TEST(EllpackPage, FromCategoricalBasic) { ASSERT_EQ(h_cuts_ptr.size(), 2); ASSERT_EQ(h_cuts_values.size(), kCats); - std::vector const &h_gidx_buffer = - ellpack.Impl()->gidx_buffer.HostVector(); - auto h_gidx_iter = common::CompressedIterator( - h_gidx_buffer.data(), accessor.NumSymbols()); + std::vector h_gidx_buffer; + auto h_accessor = ellpack.Impl()->GetHostAccessor(&ctx, &h_gidx_buffer); for (size_t i = 0; i < x.size(); ++i) { - auto bin = h_gidx_iter[i]; + auto bin = h_accessor.gidx_iter[i]; auto bin_value = h_cuts_values.at(bin); ASSERT_EQ(AsCat(x[i]), AsCat(bin_value)); } @@ -152,12 +151,12 @@ TEST(EllpackPage, Copy) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(FstCU(), page->CutsShared(), page->is_dense, page->row_stride, kRows); + EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, kRows); // Copy batch pages into the result page. size_t offset = 0; for (auto& batch : dmat->GetBatches(&ctx, param)) { - size_t num_elements = result.Copy(FstCU(), batch.Impl(), offset); + size_t num_elements = result.Copy(&ctx, batch.Impl(), offset); offset += num_elements; } @@ -171,11 +170,11 @@ TEST(EllpackPage, Copy) { EXPECT_EQ(impl->base_rowid, current_row); for (size_t i = 0; i < impl->Size(); i++) { - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row, + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row, row_d.data().get())); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(FstCU()), current_row, + dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), current_row, row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); @@ -200,7 +199,7 @@ TEST(EllpackPage, Compact) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(ctx.Device(), page->CutsShared(), page->is_dense, page->row_stride, + EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, kCompactedRows); // Compact batch pages into the result page. @@ -229,14 +228,13 @@ TEST(EllpackPage, Compact) { continue; } - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), - current_row, row_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row, + row_d.data().get())); dh::safe_cuda(cudaDeviceSynchronize()); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, - ReadRowFunction(result.GetDeviceAccessor(FstCU()), compacted_row, - row_result_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), compacted_row, + row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); EXPECT_EQ(row, row_result); @@ -269,16 +267,13 @@ class EllpackPageTest : public testing::TestWithParam { ASSERT_EQ(from_sparse_page->base_rowid, 0); ASSERT_EQ(from_sparse_page->base_rowid, from_ghist->base_rowid); ASSERT_EQ(from_sparse_page->n_rows, from_ghist->n_rows); - ASSERT_EQ(from_sparse_page->gidx_buffer.Size(), from_ghist->gidx_buffer.Size()); - auto const& h_gidx_from_sparse = from_sparse_page->gidx_buffer.HostVector(); - auto const& h_gidx_from_ghist = from_ghist->gidx_buffer.HostVector(); + ASSERT_EQ(from_sparse_page->gidx_buffer.size(), from_ghist->gidx_buffer.size()); + std::vector h_gidx_from_sparse, h_gidx_from_ghist; + auto from_ghist_acc = from_ghist->GetHostAccessor(&gpu_ctx, &h_gidx_from_ghist); + auto from_sparse_acc = from_sparse_page->GetHostAccessor(&gpu_ctx, &h_gidx_from_sparse); ASSERT_EQ(from_sparse_page->NumSymbols(), from_ghist->NumSymbols()); - common::CompressedIterator from_ghist_it(h_gidx_from_ghist.data(), - from_ghist->NumSymbols()); - common::CompressedIterator from_sparse_it(h_gidx_from_sparse.data(), - from_sparse_page->NumSymbols()); for (size_t i = 0; i < from_ghist->n_rows * from_ghist->row_stride; ++i) { - EXPECT_EQ(from_ghist_it[i], from_sparse_it[i]); + EXPECT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]); } } } diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu index d5ff721f8..b7bb5f902 100644 --- a/tests/cpp/data/test_ellpack_page_raw_format.cu +++ b/tests/cpp/data/test_ellpack_page_raw_format.cu @@ -14,9 +14,8 @@ namespace xgboost::data { namespace { template -void TestEllpackPageRawFormat() { - FormatStreamPolicy policy; - +void TestEllpackPageRawFormat(FormatStreamPolicy *p_policy) { + auto &policy = *p_policy; Context ctx{MakeCUDACtx(0)}; auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; @@ -55,16 +54,30 @@ void TestEllpackPageRawFormat() { ASSERT_EQ(loaded->Cuts().Values(), orig->Cuts().Values()); ASSERT_EQ(loaded->base_rowid, orig->base_rowid); ASSERT_EQ(loaded->row_stride, orig->row_stride); - ASSERT_EQ(loaded->gidx_buffer.HostVector(), orig->gidx_buffer.HostVector()); + std::vector h_loaded, h_orig; + [[maybe_unused]] auto h_loaded_acc = loaded->GetHostAccessor(&ctx, &h_loaded); + [[maybe_unused]] auto h_orig_acc = orig->GetHostAccessor(&ctx, &h_orig); + ASSERT_EQ(h_loaded, h_orig); } } } // anonymous namespace TEST(EllpackPageRawFormat, DiskIO) { - TestEllpackPageRawFormat>(); + EllpackMmapStreamPolicy policy{false}; + TestEllpackPageRawFormat(&policy); +} + +TEST(EllpackPageRawFormat, DiskIOHmm) { + if (common::SupportsPageableMem()) { + EllpackMmapStreamPolicy policy{true}; + TestEllpackPageRawFormat(&policy); + } else { + GTEST_SKIP_("HMM is not supported."); + } } TEST(EllpackPageRawFormat, HostIO) { - TestEllpackPageRawFormat>(); + EllpackCacheStreamPolicy policy; + TestEllpackPageRawFormat(&policy); } } // namespace xgboost::data diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index 503cb7696..5fb90a5c1 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023, XGBoost contributors + * Copyright 2020-2024, XGBoost contributors */ #include @@ -21,10 +21,10 @@ void TestEquivalent(float sparsity) { std::size_t offset = 0; auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl(); std::unique_ptr page_concatenated{new EllpackPageImpl( - ctx.Device(), first->CutsShared(), first->is_dense, first->row_stride, 1000 * 100)}; + &ctx, first->CutsShared(), first->is_dense, first->row_stride, 1000 * 100)}; for (auto& batch : m.GetBatches(&ctx, {})) { auto page = batch.Impl(); - size_t num_elements = page_concatenated->Copy(ctx.Device(), page, offset); + size_t num_elements = page_concatenated->Copy(&ctx, page, offset); offset += num_elements; } auto from_iter = page_concatenated->GetDeviceAccessor(ctx.Device()); @@ -66,18 +66,15 @@ void TestEquivalent(float sparsity) { ASSERT_EQ(cut_ptrs_iter[i], cut_ptrs_data[i]); } - auto const& buffer_from_iter = page_concatenated->gidx_buffer; - auto const& buffer_from_data = ellpack.Impl()->gidx_buffer; - ASSERT_NE(buffer_from_data.Size(), 0); - - common::CompressedIterator data_buf{ - buffer_from_data.ConstHostPointer(), from_data.NumSymbols()}; - common::CompressedIterator data_iter{ - buffer_from_iter.ConstHostPointer(), from_iter.NumSymbols()}; + std::vector buffer_from_iter, buffer_from_data; + auto data_iter = page_concatenated->GetHostAccessor(&ctx, &buffer_from_iter); + auto data_buf = ellpack.Impl()->GetHostAccessor(&ctx, &buffer_from_data); + ASSERT_NE(buffer_from_data.size(), 0); + ASSERT_NE(buffer_from_iter.size(), 0); CHECK_EQ(from_data.NumSymbols(), from_iter.NumSymbols()); CHECK_EQ(from_data.n_rows * from_data.row_stride, from_data.n_rows * from_iter.row_stride); for (size_t i = 0; i < from_data.n_rows * from_data.row_stride; ++i) { - CHECK_EQ(data_buf[i], data_iter[i]); + CHECK_EQ(data_buf.gidx_iter[i], data_iter.gidx_iter[i]); } } } @@ -97,8 +94,8 @@ TEST(IterativeDeviceDMatrix, RowMajor) { for (auto& ellpack : m.GetBatches(&ctx, {})) { n_batches ++; auto impl = ellpack.Impl(); - common::CompressedIterator iterator( - impl->gidx_buffer.HostVector().data(), impl->NumSymbols()); + std::vector h_gidx; + auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx); auto cols = CudaArrayIterForTest::Cols(); auto rows = CudaArrayIterForTest::Rows(); @@ -111,7 +108,7 @@ TEST(IterativeDeviceDMatrix, RowMajor) { for(auto i = 0ull; i < rows * cols; i++) { int column_idx = i % cols; - EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), iterator[i]); + EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), h_accessor.gidx_iter[i]); } EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); @@ -147,12 +144,12 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) { *m.GetBatches(&ctx, BatchParam{256, tree::TrainParam::DftSparseThreshold()}) .begin(); auto impl = ellpack.Impl(); - common::CompressedIterator iterator( - impl->gidx_buffer.HostVector().data(), impl->NumSymbols()); - EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(ctx.Device()).NullValue()); - EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(ctx.Device()).NullValue()); + std::vector h_gidx; + auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx); + EXPECT_EQ(h_accessor.gidx_iter[1], impl->GetDeviceAccessor(ctx.Device()).NullValue()); + EXPECT_EQ(h_accessor.gidx_iter[5], impl->GetDeviceAccessor(ctx.Device()).NullValue()); // null values get placed after valid values in a row - EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(ctx.Device()).NullValue()); + EXPECT_EQ(h_accessor.gidx_iter[7], impl->GetDeviceAccessor(ctx.Device()).NullValue()); EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3); diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index 327f2ba63..046c4eed4 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -154,13 +154,18 @@ TEST(SparsePageDMatrix, RetainEllpackPage) { for (auto it = begin; it != end; ++it) { iterators.push_back(it.Page()); gidx_buffers.emplace_back(); - gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.Size()); - gidx_buffers.back().Copy((*it).Impl()->gidx_buffer); + gidx_buffers.back().SetDevice(ctx.Device()); + gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.size()); + auto d_dst = gidx_buffers.back().DevicePointer(); + auto const& d_src = (*it).Impl()->gidx_buffer; + dh::safe_cuda(cudaMemcpyAsync(d_dst, d_src.data(), d_src.size_bytes(), cudaMemcpyDefault)); } ASSERT_GE(iterators.size(), 2); for (size_t i = 0; i < iterators.size(); ++i) { - ASSERT_EQ((*iterators[i]).Impl()->gidx_buffer.HostVector(), gidx_buffers.at(i).HostVector()); + std::vector h_buf; + [[maybe_unused]] auto h_acc = (*iterators[i]).Impl()->GetHostAccessor(&ctx, &h_buf); + ASSERT_EQ(h_buf, gidx_buffers.at(i).HostVector()); ASSERT_EQ(iterators[i].use_count(), 1); } @@ -210,11 +215,11 @@ class TestEllpackPageExt : public ::testing::TestWithParamGetBatches(&ctx, param)) { if (!impl_ext) { - impl_ext = std::make_unique( - batch.Impl()->gidx_buffer.Device(), batch.Impl()->CutsShared(), batch.Impl()->is_dense, - batch.Impl()->row_stride, kRows); + impl_ext = std::make_unique(&ctx, batch.Impl()->CutsShared(), + batch.Impl()->is_dense, + batch.Impl()->row_stride, kRows); } - auto n_elems = impl_ext->Copy(ctx.Device(), batch.Impl(), offset); + auto n_elems = impl_ext->Copy(&ctx, batch.Impl(), offset); offset += n_elems; } ASSERT_EQ(impl_ext->base_rowid, 0); @@ -223,8 +228,10 @@ class TestEllpackPageExt : public ::testing::TestWithParamrow_stride, 2); ASSERT_EQ(impl_ext->Cuts().TotalBins(), 4); - std::vector buffer(impl->gidx_buffer.HostVector()); - std::vector buffer_ext(impl_ext->gidx_buffer.HostVector()); + std::vector buffer; + [[maybe_unused]] auto h_acc = impl->GetHostAccessor(&ctx, &buffer); + std::vector buffer_ext; + [[maybe_unused]] auto h_ext_acc = impl_ext->GetHostAccessor(&ctx, &buffer_ext); ASSERT_EQ(buffer, buffer_ext); } }; diff --git a/tests/cpp/filesystem.h b/tests/cpp/filesystem.h index c8d144291..fafc8c7d1 100644 --- a/tests/cpp/filesystem.h +++ b/tests/cpp/filesystem.h @@ -1,13 +1,10 @@ -/*! - * Copyright (c) 2022 by XGBoost Contributors +/** + * Copyright 2022-2024, XGBoost Contributors */ #ifndef XGBOOST_TESTS_CPP_FILESYSTEM_H #define XGBOOST_TESTS_CPP_FILESYSTEM_H -// A macro used inside `windows.h` to avoid conflicts with `winsock2.h` -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif // WIN32_LEAN_AND_MEAN +#include #include "dmlc/filesystem.h" diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 2211b2d00..b2e9e08cd 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -21,14 +21,11 @@ #if defined(__CUDACC__) #include "../../src/collective/communicator-inl.h" // for GetRank -#include "../../src/common/common.h" // for AllVisibleGPUs +#include "../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #endif // defined(__CUDACC__) #include "filesystem.h" // dmlc::TemporaryDirectory #include "xgboost/linalg.h" -#if !defined(_OPENMP) -#include -#endif #if defined(__CUDACC__) #define DeclareUnifiedTest(name) GPU ## name diff --git a/tests/cpp/histogram_helpers.h b/tests/cpp/histogram_helpers.h index a33d6958a..ff021e819 100644 --- a/tests/cpp/histogram_helpers.h +++ b/tests/cpp/histogram_helpers.h @@ -23,7 +23,7 @@ class HistogramCutsWrapper : public common::HistogramCuts { }; } // namespace detail -inline std::unique_ptr BuildEllpackPage(int n_rows, int n_cols, +inline std::unique_ptr BuildEllpackPage(Context const* ctx, int n_rows, int n_cols, bst_float sparsity = 0) { auto dmat = RandomDataGenerator(n_rows, n_cols, sparsity).Seed(3).GenerateDMatrix(); const SparsePage& batch = *dmat->GetBatches().begin(); @@ -48,7 +48,7 @@ inline std::unique_ptr BuildEllpackPage(int n_rows, int n_cols, } auto page = std::unique_ptr( - new EllpackPageImpl(DeviceOrd::CUDA(0), cmat, batch, dmat->IsDense(), row_stride, {})); + new EllpackPageImpl(ctx, cmat, batch, dmat->IsDense(), row_stride, {})); return page; } diff --git a/tests/cpp/objective/test_aft_obj.cc b/tests/cpp/objective/test_aft_obj.cc index 972dfc53f..f31debb21 100644 --- a/tests/cpp/objective/test_aft_obj.cc +++ b/tests/cpp/objective/test_aft_obj.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023, XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #include #include @@ -10,7 +10,6 @@ #include "xgboost/objective.h" #include "xgboost/logging.h" #include "../helpers.h" -#include "../../../src/common/survival_util.h" namespace xgboost::common { TEST(Objective, DeclareUnifiedTest(AFTObjConfiguration)) { diff --git a/tests/cpp/plugin/federated/test_federated_coll.cu b/tests/cpp/plugin/federated/test_federated_coll.cu index f3b906613..31760a97f 100644 --- a/tests/cpp/plugin/federated/test_federated_coll.cu +++ b/tests/cpp/plugin/federated/test_federated_coll.cu @@ -6,7 +6,7 @@ #include // for Result #include "../../../../src/collective/allreduce.h" -#include "../../../../src/common/common.h" // for AllVisibleGPUs +#include "../../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../../../../src/common/device_helpers.cuh" // for device_vector #include "../../../../src/common/type.h" // for EraseType #include "../../collective/test_worker.h" // for SocketTest diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cc b/tests/cpp/plugin/federated/test_federated_comm_group.cc index 9bfbdd3ae..511b3d8d1 100644 --- a/tests/cpp/plugin/federated/test_federated_comm_group.cc +++ b/tests/cpp/plugin/federated/test_federated_comm_group.cc @@ -1,11 +1,11 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include #include // for Json #include "../../../../src/collective/comm_group.h" -#include "../../helpers.h" +#include "../../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #include "test_worker.h" namespace xgboost::collective { diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cu b/tests/cpp/plugin/federated/test_federated_comm_group.cu index 747adb6fd..c6fd8921c 100644 --- a/tests/cpp/plugin/federated/test_federated_comm_group.cu +++ b/tests/cpp/plugin/federated/test_federated_comm_group.cu @@ -1,10 +1,11 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include #include // for Json #include "../../../../src/collective/comm_group.h" +#include "../../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../../helpers.h" #include "test_worker.h" diff --git a/tests/cpp/test_context.cu b/tests/cpp/test_context.cu index 7684ff467..077698035 100644 --- a/tests/cpp/test_context.cu +++ b/tests/cpp/test_context.cu @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include #include // for Args @@ -8,7 +8,7 @@ #include // for string, to_string -#include "../../src/common/common.h" // for AllVisibleGPUs +#include "../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs namespace xgboost { namespace { diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu index 9a0304f87..85bea39c5 100644 --- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu +++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023, XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #include @@ -102,19 +102,17 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) { EXPECT_EQ(sample.gpair.data(), gpair.DevicePointer()); EXPECT_EQ(sampled_page->n_rows, kRows); - std::vector buffer(sampled_page->gidx_buffer.HostVector()); - common::CompressedIterator - ci(buffer.data(), sampled_page->NumSymbols()); + std::vector h_gidx_buffer; + auto h_accessor = sampled_page->GetHostAccessor(&ctx, &h_gidx_buffer); - size_t offset = 0; + std::size_t offset = 0; for (auto& batch : dmat->GetBatches(&ctx, param)) { auto page = batch.Impl(); - std::vector page_buffer(page->gidx_buffer.HostVector()); - common::CompressedIterator - page_ci(page_buffer.data(), page->NumSymbols()); + std::vector h_page_gidx_buffer; + auto page_accessor = page->GetHostAccessor(&ctx, &h_page_gidx_buffer); size_t num_elements = page->n_rows * page->row_stride; for (size_t i = 0; i < num_elements; i++) { - EXPECT_EQ(ci[i + offset], page_ci[i]); + EXPECT_EQ(h_accessor.gidx_iter[i + offset], page_accessor.gidx_iter[i]); } offset += num_elements; } diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index d11284466..860e4bfd4 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -328,8 +328,7 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParamGetBatches()) { concat.Push(page); } - EllpackPageImpl page{ - ctx.Device(), cuts, concat, p_fmat->IsDense(), p_fmat->Info().num_col_, {}}; + EllpackPageImpl page{&ctx, cuts, concat, p_fmat->IsDense(), p_fmat->Info().num_col_, {}}; auto ridx = partitioner.GetRows(0); auto d_histogram = dh::ToSpan(single_hist); DeviceHistogramBuilder builder; diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 200fb39fb..291b46ede 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -81,6 +81,7 @@ std::vector GetHostHistGpair() { template void TestBuildHist(bool use_shared_memory_histograms) { int const kNRows = 16, kNCols = 8; + Context ctx{MakeCUDACtx(0)}; TrainParam param; Args args{ @@ -89,9 +90,8 @@ void TestBuildHist(bool use_shared_memory_histograms) { }; param.Init(args); - auto page = BuildEllpackPage(kNRows, kNCols); + auto page = BuildEllpackPage(&ctx, kNRows, kNCols); BatchParam batch_param{}; - Context ctx{MakeCUDACtx(0)}; auto cs = std::make_shared(0); GPUHistMakerDevice maker(&ctx, /*is_external_memory=*/false, {}, kNRows, param, cs, kNCols, batch_param, MetaInfo()); @@ -105,7 +105,6 @@ void TestBuildHist(bool use_shared_memory_histograms) { } gpair.SetDevice(ctx.Device()); - thrust::host_vector h_gidx_buffer(page->gidx_buffer.HostVector()); maker.row_partitioner = std::make_unique(&ctx, kNRows, 0); maker.hist.Init(ctx.Device(), page->Cuts().TotalBins()); @@ -198,14 +197,12 @@ void TestHistogramIndexImpl() { auto grad = GenerateRandomGradients(kNRows); grad.SetDevice(DeviceOrd::CUDA(0)); maker->Reset(&grad, hist_maker_dmat.get(), kNCols); - std::vector h_gidx_buffer(maker->page->gidx_buffer.HostVector()); const auto &maker_ext = hist_maker_ext.maker; maker_ext->Reset(&grad, hist_maker_ext_dmat.get(), kNCols); - std::vector h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector()); ASSERT_EQ(maker->page->Cuts().TotalBins(), maker_ext->page->Cuts().TotalBins()); - ASSERT_EQ(maker->page->gidx_buffer.Size(), maker_ext->page->gidx_buffer.Size()); + ASSERT_EQ(maker->page->gidx_buffer.size(), maker_ext->page->gidx_buffer.size()); } TEST(GpuHist, TestHistogramIndex) {