[doc] [dask] Troubleshooting NCCL errors. (#8943)
This commit is contained in:
parent
a551bed803
commit
ea04d4c46c
@ -190,9 +190,9 @@ Scikit-Learn wrapper object:
|
||||
booster = cls.get_booster()
|
||||
|
||||
|
||||
**********************
|
||||
Scikit-Learn interface
|
||||
**********************
|
||||
********************************
|
||||
Scikit-Learn Estimator Interface
|
||||
********************************
|
||||
|
||||
As mentioned previously, there's another interface that mimics the scikit-learn estimators
|
||||
with a higher level of abstraction. The interface is easier to use compared to the
|
||||
@ -488,12 +488,13 @@ with dask and optuna.
|
||||
Troubleshooting
|
||||
***************
|
||||
|
||||
.. versionadded:: 1.6.0
|
||||
|
||||
In some environments XGBoost might fail to resolve the IP address of the scheduler, a
|
||||
a symptom is the user receiving an ``OSError: [Errno 99] Cannot assign requested address`` error
|
||||
during training. A quick workaround is to specify the address explicitly. To do that
|
||||
dask config is used:
|
||||
- In some environments XGBoost might fail to resolve the IP address of the scheduler, a
|
||||
a symptom is the user receiving an ``OSError: [Errno 99] Cannot assign requested address`` error
|
||||
during training. A quick workaround is to specify the address explicitly. To do that
|
||||
dask config is used:
|
||||
|
||||
.. versionadded:: 1.6.0
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@ -511,10 +512,20 @@ dask config is used:
|
||||
reg = dxgb.DaskXGBRegressor()
|
||||
|
||||
|
||||
Please note that XGBoost requires a different port than dask. By default, on a unix-like
|
||||
system XGBoost uses the port 0 to find available ports, which may fail if a user is
|
||||
running in a restricted docker environment. In this case, please open additional ports in
|
||||
the container and specify it as in the above snippet.
|
||||
- Please note that XGBoost requires a different port than dask. By default, on a unix-like
|
||||
system XGBoost uses the port 0 to find available ports, which may fail if a user is
|
||||
running in a restricted docker environment. In this case, please open additional ports
|
||||
in the container and specify it as in the above snippet.
|
||||
|
||||
- If you encounter a NCCL system error while training with GPU enabled, which usually
|
||||
includes the error message `NCCL failure: unhandled system error`, you can specify its
|
||||
network configuration using one of the environment variables listed in the `NCCL
|
||||
document <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`__ such as
|
||||
the ``NCCL_SOCKET_IFNAME``. In addition, you can use ``NCCL_DEBUG`` to obtain debug
|
||||
logs.
|
||||
|
||||
- MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message
|
||||
that includes `Multiple processes within a communication group ...` upon initialization.
|
||||
|
||||
************
|
||||
IPv6 Support
|
||||
|
||||
@ -118,17 +118,20 @@ namespace dh {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
|
||||
|
||||
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
|
||||
int line) {
|
||||
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
|
||||
if (code != ncclSuccess) {
|
||||
std::stringstream ss;
|
||||
ss << "NCCL failure :" << ncclGetErrorString(code);
|
||||
ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
|
||||
ss << " " << file << "(" << line << ")\n";
|
||||
if (code == ncclUnhandledCudaError) {
|
||||
// nccl usually preserves the last error so we can get more details.
|
||||
auto err = cudaPeekAtLastError();
|
||||
ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
|
||||
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
|
||||
} else if (code == ncclSystemError) {
|
||||
ss << " This might be caused by a network configuration issue. Please consider specifying "
|
||||
"the network interface for NCCL via environment variables listed in its reference: "
|
||||
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
|
||||
}
|
||||
ss << " " << file << "(" << line << ")";
|
||||
LOG(FATAL) << ss.str();
|
||||
}
|
||||
|
||||
|
||||
@ -1,10 +1,12 @@
|
||||
/*!
|
||||
* Copyright 2022 XGBoost contributors
|
||||
/**
|
||||
* Copyright 2022-2023, XGBoost contributors
|
||||
*/
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <string> // for string
|
||||
|
||||
#include "../../../src/collective/nccl_device_communicator.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
@ -20,7 +22,15 @@ TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) {
|
||||
EXPECT_THROW(construct(), dmlc::Error);
|
||||
}
|
||||
|
||||
// Verify that a NCCL system error surfaces the troubleshooting hint pointing users at
// the NCCL environment-variable reference (added alongside the dask troubleshooting doc).
TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
  bool thrown{false};
  try {
    dh::safe_nccl(ncclSystemError);
  } catch (dmlc::Error const& e) {
    thrown = true;
    auto str = std::string{e.what()};
    // The message must steer users toward NCCL's network configuration env vars.
    ASSERT_TRUE(str.find("environment variables") != std::string::npos);
  }
  // Original test passed vacuously when no exception was thrown; require the throw.
  ASSERT_TRUE(thrown);
}
|
||||
} // namespace collective
|
||||
} // namespace xgboost
|
||||
|
||||
#endif
|
||||
#endif // XGBOOST_USE_NCCL
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user