Better error message for ncclUnhandledCudaError. (#7190)

This commit is contained in:
Jiaming Yuan 2021-08-27 10:29:22 +08:00 committed by GitHub
parent b70e07da1f
commit e7d7ab6bc3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -110,12 +110,17 @@ namespace dh {
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__) #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
int line) { int line) {
if (code != ncclSuccess) { if (code != ncclSuccess) {
std::stringstream ss; std::stringstream ss;
ss << "NCCL failure :" << ncclGetErrorString(code) << " "; ss << "NCCL failure :" << ncclGetErrorString(code);
ss << file << "(" << line << ")"; if (code == ncclUnhandledCudaError) {
throw std::runtime_error(ss.str()); // nccl usually preserves the last error so we can get more details.
auto err = cudaPeekAtLastError();
ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
}
ss << " " << file << "(" << line << ")";
LOG(FATAL) << ss.str();
} }
return code; return code;