[doc] [dask] Troubleshooting NCCL errors. (#8943)

Author: Jiaming Yuan (committed 2023-03-22 22:17:26 +08:00, via GitHub)
Parent: a551bed803
Commit: ea04d4c46c
3 changed files with 44 additions and 20 deletions

File 1 of 3: Dask tutorial documentation (reStructuredText)

@@ -190,9 +190,9 @@ Scikit-Learn wrapper object:
     booster = cls.get_booster()
 
-**********************
-Scikit-Learn interface
-**********************
+********************************
+Scikit-Learn Estimator Interface
+********************************
 
 As mentioned previously, there's another interface that mimics the scikit-learn estimators
 with a higher level of abstraction. The interface is easier to use compared to the

@@ -488,12 +488,13 @@ with dask and optuna.
 Troubleshooting
 ***************
 
-.. versionadded:: 1.6.0
-
-In some environments XGBoost might fail to resolve the IP address of the scheduler, a
-symptom is user receiving ``OSError: [Errno 99] Cannot assign requested address`` error
-during training. A quick workaround is to specify the address explicitly. To do that
-dask config is used:
+- In some environments XGBoost might fail to resolve the IP address of the scheduler; a
+  symptom is the user receiving an ``OSError: [Errno 99] Cannot assign requested address``
+  error during training. A quick workaround is to specify the address explicitly through
+  the dask config:
+
+  .. versionadded:: 1.6.0
 
 .. code-block:: python

@@ -511,10 +512,20 @@ dask config is used:
     reg = dxgb.DaskXGBRegressor()
 
-Please note that XGBoost requires a different port than dask. By default, on a unix-like
-system XGBoost uses the port 0 to find available ports, which may fail if a user is
-running in a restricted docker environment. In this case, please open additional ports in
-the container and specify it as in the above snippet.
+- Please note that XGBoost requires a different port than dask. By default, on a unix-like
+  system XGBoost uses port 0 to find available ports, which may fail if a user is
+  running in a restricted docker environment. In this case, please open additional ports
+  in the container and specify them as in the above snippet.
+
+- If you encounter an NCCL system error while training with GPU enabled, which usually
+  includes the error message ``NCCL failure: unhandled system error``, you can specify the
+  network configuration using one of the environment variables listed in the `NCCL
+  documentation <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`__,
+  such as ``NCCL_SOCKET_IFNAME``. In addition, ``NCCL_DEBUG`` can be used to obtain debug
+  logs.
+
+- MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message
+  that includes ``Multiple processes within a communication group ...`` upon initialization.
 
 ************
 IPv6 Support
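
(Aside: the first bullet above resolves to a one-line dask configuration change. A minimal
sketch follows, assuming the ``xgboost.scheduler_address`` config key that the tutorial's
elided snippet uses; the address ``192.0.0.100`` and port ``12345`` are placeholders for
illustration only.)

    import dask
    from distributed import Client
    from xgboost import dask as dxgb

    # Point XGBoost's tracker at the scheduler address explicitly instead of
    # letting it resolve the IP on its own.
    dask.config.set({"xgboost.scheduler_address": "192.0.0.100"})

    # When port 0 is unusable (e.g. in a restricted docker environment, as in
    # the second bullet above), a specific port can be pinned as well:
    dask.config.set({"xgboost.scheduler_address": "192.0.0.100:12345"})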

File 2 of 3: NCCL error handling in the CUDA device helpers (C++)

@@ -118,17 +118,20 @@ namespace dh {
 #ifdef XGBOOST_USE_NCCL
 #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
 
-inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
-                                     int line) {
+inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
   if (code != ncclSuccess) {
     std::stringstream ss;
-    ss << "NCCL failure :" << ncclGetErrorString(code);
-    ss << " " << file << "(" << line << ")\n";
+    ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
     if (code == ncclUnhandledCudaError) {
       // nccl usually preserves the last error so we can get more details.
       auto err = cudaPeekAtLastError();
-      ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
+      ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
+    } else if (code == ncclSystemError) {
+      ss << " This might be caused by a network configuration issue. Please consider specifying "
+            "the network interface for NCCL via environment variables listed in its reference: "
+            "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
     }
+    ss << " " << file << "(" << line << ")";
     LOG(FATAL) << ss.str();
   }
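
(Aside: the new ``ncclSystemError`` branch directs users to NCCL's environment variables.
As a hedged illustration of that advice, not part of the commit, the variables could be
applied to a single-node dask GPU setup as below; ``dask_cuda`` and the interface name
``eth0`` are assumptions, and multi-node deployments would instead need the variables set
in each worker's environment.)

    import os
    from dask_cuda import LocalCUDACluster
    from distributed import Client

    # NCCL reads these from the environment of the process it initializes in.
    # Workers spawned by LocalCUDACluster inherit this process's environment,
    # so set the variables before creating the cluster.
    os.environ["NCCL_SOCKET_IFNAME"] = "eth0"  # the interface that connects your nodes
    os.environ["NCCL_DEBUG"] = "INFO"          # ask NCCL to print its own debug logs

    with LocalCUDACluster() as cluster, Client(cluster) as client:
        ...  # run XGBoost dask training as usual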

File 3 of 3: C++ test for the NCCL device communicator

@@ -1,10 +1,12 @@
-/*!
- * Copyright 2022 XGBoost contributors
+/**
+ * Copyright 2022-2023, XGBoost contributors
  */
 #ifdef XGBOOST_USE_NCCL
 
 #include <gtest/gtest.h>
 
+#include <string>  // for string
+
 #include "../../../src/collective/nccl_device_communicator.cuh"
 
 namespace xgboost {

@@ -20,7 +22,15 @@ TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) {
   EXPECT_THROW(construct(), dmlc::Error);
 }
 
+TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
+  try {
+    dh::safe_nccl(ncclSystemError);
+  } catch (dmlc::Error const& e) {
+    auto str = std::string{e.what()};
+    ASSERT_TRUE(str.find("environment variables") != std::string::npos);
+  }
+}
+
 }  // namespace collective
 }  // namespace xgboost
 
-#endif
+#endif  // XGBOOST_USE_NCCL