From ea04d4c46c7b28f7e8459ba07db79a18e6200cc6 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 22 Mar 2023 22:17:26 +0800 Subject: [PATCH] [doc] [dask] Troubleshooting NCCL errors. (#8943) --- doc/tutorials/dask.rst | 35 ++++++++++++------- src/common/device_helpers.cuh | 13 ++++--- .../test_nccl_device_communicator.cu | 16 +++++++-- 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index ba0da9089..c66c6131f 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -190,9 +190,9 @@ Scikit-Learn wrapper object: booster = cls.get_booster() -********************** -Scikit-Learn interface -********************** +******************************** +Scikit-Learn Estimator Interface +******************************** As mentioned previously, there's another interface that mimics the scikit-learn estimators with higher level of of abstraction. The interface is easier to use compared to the @@ -488,12 +488,13 @@ with dask and optuna. Troubleshooting *************** -.. versionadded:: 1.6.0 -In some environments XGBoost might fail to resolve the IP address of the scheduler, a -symptom is user receiving ``OSError: [Errno 99] Cannot assign requested address`` error -during training. A quick workaround is to specify the address explicitly. To do that -dask config is used: +- In some environments XGBoost might fail to resolve the IP address of the scheduler, a + symptom is user receiving ``OSError: [Errno 99] Cannot assign requested address`` error + during training. A quick workaround is to specify the address explicitly. To do that + dask config is used: + + .. versionadded:: 1.6.0 .. code-block:: python @@ -511,10 +512,20 @@ dask config is used: reg = dxgb.DaskXGBRegressor() -Please note that XGBoost requires a different port than dask. By default, on a unix-like -system XGBoost uses the port 0 to find available ports, which may fail if a user is -running in a restricted docker environment. 
In this case, please open additional ports in -the container and specify it as in the above snippet. +- Please note that XGBoost requires a different port than dask. By default, on a unix-like + system XGBoost uses the port 0 to find available ports, which may fail if a user is + running in a restricted docker environment. In this case, please open additional ports + in the container and specify it as in the above snippet. + +- If you encounter a NCCL system error while training with GPU enabled, which usually + includes the error message `NCCL failure: unhandled system error`, you can specify its + network configuration using one of the environment variables listed in the `NCCL + document <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`__ such as + the ``NCCL_SOCKET_IFNAME``. In addition, you can use ``NCCL_DEBUG`` to obtain debug + logs. + +- MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message + that includes `Multiple processes within a communication group ...` upon initialization. ************ IPv6 Support diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 58300d06c..f048aed43 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -118,17 +118,20 @@ namespace dh { #ifdef XGBOOST_USE_NCCL #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__) -inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, - int line) { +inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) { if (code != ncclSuccess) { std::stringstream ss; - ss << "NCCL failure :" << ncclGetErrorString(code); + ss << "NCCL failure: " << ncclGetErrorString(code) << "."; + ss << " " << file << "(" << line << ")\n"; if (code == ncclUnhandledCudaError) { // nccl usually preserves the last error so we can get more details. 
auto err = cudaPeekAtLastError(); - ss << " " << thrust::system_error(err, thrust::cuda_category()).what(); + ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; + } else if (code == ncclSystemError) { + ss << " This might be caused by a network configuration issue. Please consider specifying " + "the network interface for NCCL via environment variables listed in its reference: " + "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n"; } - ss << " " << file << "(" << line << ")"; LOG(FATAL) << ss.str(); } diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu index 47de054c6..8ce877aef 100644 --- a/tests/cpp/collective/test_nccl_device_communicator.cu +++ b/tests/cpp/collective/test_nccl_device_communicator.cu @@ -1,10 +1,12 @@ -/*! - * Copyright 2022 XGBoost contributors +/** + * Copyright 2022-2023, XGBoost contributors */ #ifdef XGBOOST_USE_NCCL #include <gtest/gtest.h> +#include <string> // for string + #include "../../../src/collective/nccl_device_communicator.cuh" namespace xgboost { @@ -20,7 +22,15 @@ TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) { EXPECT_THROW(construct(), dmlc::Error); } +TEST(NcclDeviceCommunicatorSimpleTest, SystemError) { + try { + dh::safe_nccl(ncclSystemError); + } catch (dmlc::Error const& e) { + auto str = std::string{e.what()}; + ASSERT_TRUE(str.find("environment variables") != std::string::npos); + } +} } // namespace collective } // namespace xgboost -#endif +#endif // XGBOOST_USE_NCCL