diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index f2e6ac629..36249d5f2 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -110,12 +110,17 @@ namespace dh { #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__) inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, - int line) { + int line) { if (code != ncclSuccess) { std::stringstream ss; - ss << "NCCL failure :" << ncclGetErrorString(code) << " "; - ss << file << "(" << line << ")"; - throw std::runtime_error(ss.str()); + ss << "NCCL failure :" << ncclGetErrorString(code); + if (code == ncclUnhandledCudaError) { + // nccl usually preserves the last error so we can get more details. + auto err = cudaPeekAtLastError(); + ss << " " << thrust::system_error(err, thrust::cuda_category()).what(); + } + ss << " " << file << "(" << line << ")"; + LOG(FATAL) << ss.str(); } return code;