initial merge
This commit is contained in:
@@ -2,6 +2,9 @@
|
||||
* Copyright 2017-2023 XGBoost contributors
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
|
||||
#include <thrust/binary_search.h> // thrust::upper_bound
|
||||
#include <thrust/device_malloc_allocator.h>
|
||||
#include <thrust/device_ptr.h>
|
||||
@@ -95,20 +98,23 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) { // NOLINT
|
||||
}
|
||||
namespace dh {
|
||||
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
#ifdef XGBOOST_USE_RCCL
|
||||
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
|
||||
|
||||
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
|
||||
int line) {
|
||||
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
|
||||
if (code != ncclSuccess) {
|
||||
std::stringstream ss;
|
||||
ss << "NCCL failure :" << ncclGetErrorString(code);
|
||||
ss << "RCCL failure: " << ncclGetErrorString(code) << ".";
|
||||
ss << " " << file << "(" << line << ")\n";
|
||||
if (code == ncclUnhandledCudaError) {
|
||||
// nccl usually preserves the last error so we can get more details.
|
||||
auto err = hipPeekAtLastError();
|
||||
ss << " " << thrust::system_error(err, thrust::hip_category()).what();
|
||||
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
|
||||
} else if (code == ncclSystemError) {
|
||||
ss << " This might be caused by a network configuration issue. Please consider specifying "
|
||||
"the network interface for NCCL via environment variables listed in its reference: "
|
||||
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
|
||||
}
|
||||
ss << " " << file << "(" << line << ")";
|
||||
LOG(FATAL) << ss.str();
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user