Better error message for ncclUnhandledCudaError. (#7190)
This commit is contained in:
parent
b70e07da1f
commit
e7d7ab6bc3
@@ -110,12 +110,17 @@ namespace dh {
|
|||||||
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
|
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
|
||||||
|
|
||||||
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
|
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
|
||||||
int line) {
|
int line) {
|
||||||
if (code != ncclSuccess) {
|
if (code != ncclSuccess) {
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << "NCCL failure :" << ncclGetErrorString(code) << " ";
|
ss << "NCCL failure :" << ncclGetErrorString(code);
|
||||||
ss << file << "(" << line << ")";
|
if (code == ncclUnhandledCudaError) {
|
||||||
throw std::runtime_error(ss.str());
|
// nccl usually preserves the last error so we can get more details.
|
||||||
|
auto err = cudaPeekAtLastError();
|
||||||
|
ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
|
||||||
|
}
|
||||||
|
ss << " " << file << "(" << line << ")";
|
||||||
|
LOG(FATAL) << ss.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
return code;
|
return code;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user