From ea04d4c46c7b28f7e8459ba07db79a18e6200cc6 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 22 Mar 2023 22:17:26 +0800 Subject: [PATCH] [doc] [dask] Troubleshooting NCCL errors. (#8943) --- doc/tutorials/dask.rst | 35 ++++++++++++------- src/common/device_helpers.cuh | 13 ++++--- .../test_nccl_device_communicator.cu | 16 +++++++-- 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index ba0da9089..c66c6131f 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -190,9 +190,9 @@ Scikit-Learn wrapper object: booster = cls.get_booster() -********************** -Scikit-Learn interface -********************** +******************************** +Scikit-Learn Estimator Interface +******************************** As mentioned previously, there's another interface that mimics the scikit-learn estimators with higher level of of abstraction. The interface is easier to use compared to the @@ -488,12 +488,13 @@ with dask and optuna. Troubleshooting *************** -.. versionadded:: 1.6.0 -In some environments XGBoost might fail to resolve the IP address of the scheduler, a -symptom is user receiving ``OSError: [Errno 99] Cannot assign requested address`` error -during training. A quick workaround is to specify the address explicitly. To do that -dask config is used: +- In some environments XGBoost might fail to resolve the IP address of the scheduler, a + symptom is user receiving ``OSError: [Errno 99] Cannot assign requested address`` error + during training. A quick workaround is to specify the address explicitly. To do that + dask config is used: + + .. versionadded:: 1.6.0 .. code-block:: python @@ -511,10 +512,20 @@ dask config is used: reg = dxgb.DaskXGBRegressor() -Please note that XGBoost requires a different port than dask. By default, on a unix-like -system XGBoost uses the port 0 to find available ports, which may fail if a user is -running in a restricted docker environment. 
In this case, please open additional ports in -the container and specify it as in the above snippet. +- Please note that XGBoost requires a different port than dask. By default, on a unix-like + system XGBoost uses the port 0 to find available ports, which may fail if a user is + running in a restricted docker environment. In this case, please open additional ports + in the container and specify it as in the above snippet. + +- If you encounter a NCCL system error while training with GPU enabled, which usually + includes the error message `NCCL failure: unhandled system error`, you can specify its + network configuration using one of the environment variables listed in the `NCCL + document <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`__ such as + the ``NCCL_SOCKET_IFNAME``. In addition, you can use ``NCCL_DEBUG`` to obtain debug + logs. + +- MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message + that includes `Multiple processes within a communication group ...` upon initialization. ************ IPv6 Support diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 58300d06c..f048aed43 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -118,17 +118,20 @@ namespace dh { #ifdef XGBOOST_USE_NCCL #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__) -inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, - int line) { +inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) { if (code != ncclSuccess) { std::stringstream ss; - ss << "NCCL failure :" << ncclGetErrorString(code); + ss << "NCCL failure: " << ncclGetErrorString(code) << "."; + ss << " " << file << "(" << line << ")\n"; if (code == ncclUnhandledCudaError) { // nccl usually preserves the last error so we can get more details. 
auto err = cudaPeekAtLastError(); - ss << " " << thrust::system_error(err, thrust::cuda_category()).what(); + ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; + } else if (code == ncclSystemError) { + ss << " This might be caused by a network configuration issue. Please consider specifying " + "the network interface for NCCL via environment variables listed in its reference: " + "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n"; } - ss << " " << file << "(" << line << ")"; LOG(FATAL) << ss.str(); } diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu index 47de054c6..8ce877aef 100644 --- a/tests/cpp/collective/test_nccl_device_communicator.cu +++ b/tests/cpp/collective/test_nccl_device_communicator.cu @@ -1,10 +1,12 @@ -/*! - * Copyright 2022 XGBoost contributors +/** + * Copyright 2022-2023, XGBoost contributors */ #ifdef XGBOOST_USE_NCCL #include <gtest/gtest.h> +#include <string> // for string + #include "../../../src/collective/nccl_device_communicator.cuh" namespace xgboost { @@ -20,7 +22,15 @@ TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) { EXPECT_THROW(construct(), dmlc::Error); } +TEST(NcclDeviceCommunicatorSimpleTest, SystemError) { + try { + dh::safe_nccl(ncclSystemError); + } catch (dmlc::Error const& e) { + auto str = std::string{e.what()}; + ASSERT_TRUE(str.find("environment variables") != std::string::npos); + } +} } // namespace collective } // namespace xgboost -#endif +#endif // XGBOOST_USE_NCCL