Better error message for ncclUnhandledCudaError. (#7190)
This commit is contained in:
parent
b70e07da1f
commit
e7d7ab6bc3
@ -113,9 +113,14 @@ inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
|
||||
int line) {
|
||||
if (code != ncclSuccess) {
|
||||
std::stringstream ss;
|
||||
ss << "NCCL failure :" << ncclGetErrorString(code) << " ";
|
||||
ss << file << "(" << line << ")";
|
||||
throw std::runtime_error(ss.str());
|
||||
ss << "NCCL failure :" << ncclGetErrorString(code);
|
||||
if (code == ncclUnhandledCudaError) {
|
||||
// NCCL usually preserves the last CUDA error, so we can peek at it to get more details.
|
||||
auto err = cudaPeekAtLastError();
|
||||
ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
|
||||
}
|
||||
ss << " " << file << "(" << line << ")";
|
||||
LOG(FATAL) << ss.str();
|
||||
}
|
||||
|
||||
return code;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user