[doc] [dask] Troubleshooting NCCL errors. (#8943)

Author: Jiaming Yuan (committed 2023-03-22 22:17:26 +08:00, via GitHub)
Parent: a551bed803
Commit: ea04d4c46c
3 changed files with 44 additions and 20 deletions

File 1 of 3: Dask tutorial documentation (reStructuredText)

@@ -190,9 +190,9 @@ Scikit-Learn wrapper object:
     booster = cls.get_booster()
 
-**********************
-Scikit-Learn interface
-**********************
+********************************
+Scikit-Learn Estimator Interface
+********************************
 
 As mentioned previously, there's another interface that mimics the scikit-learn estimators
 with a higher level of abstraction. The interface is easier to use compared to the

@@ -488,12 +488,13 @@ with dask and optuna.
 Troubleshooting
 ***************
 
-.. versionadded:: 1.6.0
-
-In some environments XGBoost might fail to resolve the IP address of the scheduler, a
-symptom is user receiving ``OSError: [Errno 99] Cannot assign requested address`` error
-during training. A quick workaround is to specify the address explicitly. To do that
-dask config is used:
+- In some environments XGBoost might fail to resolve the IP address of the scheduler; a
+  symptom is the user receiving an ``OSError: [Errno 99] Cannot assign requested address``
+  error during training. A quick workaround is to specify the address explicitly through
+  the dask config:
+
+  .. versionadded:: 1.6.0
 
 .. code-block:: python

@@ -511,10 +512,20 @@ dask config is used:
     reg = dxgb.DaskXGBRegressor()
 
-Please note that XGBoost requires a different port than dask. By default, on a unix-like
-system XGBoost uses the port 0 to find available ports, which may fail if a user is
-running in a restricted docker environment. In this case, please open additional ports in
-the container and specify it as in the above snippet.
+- Please note that XGBoost requires a different port than dask. By default, on a unix-like
+  system XGBoost uses port 0 to find available ports, which may fail if a user is
+  running in a restricted docker environment. In this case, please open additional ports
+  in the container and specify them as in the above snippet.
+
+- If you encounter an NCCL system error while training with GPU enabled, which usually
+  includes the error message ``NCCL failure: unhandled system error``, you can specify the
+  network configuration using one of the environment variables listed in the `NCCL
+  documentation <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`__,
+  such as ``NCCL_SOCKET_IFNAME``. In addition, ``NCCL_DEBUG`` can be used to obtain debug
+  logs.
+
+- MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message
+  that includes ``Multiple processes within a communication group ...`` upon initialization.
 
 ************
 IPv6 Support
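
(Aside: the first bullet above resolves to a one-line dask configuration change. A minimal
sketch follows, assuming the ``xgboost.scheduler_address`` config key that the tutorial's
elided snippet uses; the address ``192.0.0.100`` and port ``12345`` are placeholders for
illustration only.)

    import dask
    from distributed import Client
    from xgboost import dask as dxgb

    # Point XGBoost's tracker at the scheduler address explicitly instead of
    # letting it resolve the IP on its own.
    dask.config.set({"xgboost.scheduler_address": "192.0.0.100"})

    # When port 0 is unusable (e.g. in a restricted docker environment, as in
    # the second bullet above), a specific port can be pinned as well:
    dask.config.set({"xgboost.scheduler_address": "192.0.0.100:12345"})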

File 2 of 3: NCCL error handling in the CUDA device helpers (C++)

@@ -118,17 +118,20 @@ namespace dh {
 #ifdef XGBOOST_USE_NCCL
 #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
 
-inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
-                                     int line) {
+inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
   if (code != ncclSuccess) {
     std::stringstream ss;
-    ss << "NCCL failure :" << ncclGetErrorString(code);
-    ss << " " << file << "(" << line << ")\n";
+    ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
     if (code == ncclUnhandledCudaError) {
       // nccl usually preserves the last error so we can get more details.
       auto err = cudaPeekAtLastError();
-      ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
+      ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
+    } else if (code == ncclSystemError) {
+      ss << " This might be caused by a network configuration issue. Please consider specifying "
+            "the network interface for NCCL via environment variables listed in its reference: "
+            "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
     }
+    ss << " " << file << "(" << line << ")";
     LOG(FATAL) << ss.str();
   }
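
(Aside: the new ``ncclSystemError`` branch directs users to NCCL's environment variables.
As a hedged illustration of that advice, not part of the commit, the variables could be
applied to a single-node dask GPU setup as below; ``dask_cuda`` and the interface name
``eth0`` are assumptions, and multi-node deployments would instead need the variables set
in each worker's environment.)

    import os
    from dask_cuda import LocalCUDACluster
    from distributed import Client

    # NCCL reads these from the environment of the process it initializes in.
    # Workers spawned by LocalCUDACluster inherit this process's environment,
    # so set the variables before creating the cluster.
    os.environ["NCCL_SOCKET_IFNAME"] = "eth0"  # the interface that connects your nodes
    os.environ["NCCL_DEBUG"] = "INFO"          # ask NCCL to print its own debug logs

    with LocalCUDACluster() as cluster, Client(cluster) as client:
        ...  # run XGBoost dask training as usual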

File 3 of 3: C++ test for the NCCL device communicator

@@ -1,10 +1,12 @@
-/*!
- * Copyright 2022 XGBoost contributors
+/**
+ * Copyright 2022-2023, XGBoost contributors
  */
 #ifdef XGBOOST_USE_NCCL
 
 #include <gtest/gtest.h>
 
+#include <string>  // for string
+
 #include "../../../src/collective/nccl_device_communicator.cuh"
 
 namespace xgboost {

@@ -20,7 +22,15 @@ TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) {
   EXPECT_THROW(construct(), dmlc::Error);
 }
 
+TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
+  try {
+    dh::safe_nccl(ncclSystemError);
+  } catch (dmlc::Error const& e) {
+    auto str = std::string{e.what()};
+    ASSERT_TRUE(str.find("environment variables") != std::string::npos);
+  }
+}
+
 }  // namespace collective
 }  // namespace xgboost
 
-#endif
+#endif  // XGBOOST_USE_NCCL