[doc] [dask] Troubleshooting NCCL errors. (#8943)

This commit is contained in:
Jiaming Yuan
2023-03-22 22:17:26 +08:00
committed by GitHub
parent a551bed803
commit ea04d4c46c
3 changed files with 44 additions and 20 deletions

View File

@@ -118,17 +118,20 @@ namespace dh {
#ifdef XGBOOST_USE_NCCL
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
int line) {
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
if (code != ncclSuccess) {
std::stringstream ss;
ss << "NCCL failure :" << ncclGetErrorString(code);
ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
ss << " " << file << "(" << line << ")\n";
if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details.
auto err = cudaPeekAtLastError();
ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
} else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for NCCL via environment variables listed in its reference: "
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
}
ss << " " << file << "(" << line << ")";
LOG(FATAL) << ss.str();
}