[doc] [dask] Troubleshooting NCCL errors. (#8943)
This commit is contained in:
parent
a551bed803
commit
ea04d4c46c
@ -190,9 +190,9 @@ Scikit-Learn wrapper object:
|
||||
booster = cls.get_booster()
|
||||
|
||||
|
||||
**********************
|
||||
Scikit-Learn interface
|
||||
**********************
|
||||
********************************
|
||||
Scikit-Learn Estimator Interface
|
||||
********************************
|
||||
|
||||
As mentioned previously, there's another interface that mimics the scikit-learn estimators
|
||||
with a higher level of abstraction. The interface is easier to use compared to the
|
||||
@ -488,12 +488,13 @@ with dask and optuna.
|
||||
Troubleshooting
|
||||
***************
|
||||
|
||||
.. versionadded:: 1.6.0
|
||||
|
||||
In some environments XGBoost might fail to resolve the IP address of the scheduler, a
|
||||
a symptom is the user receiving an ``OSError: [Errno 99] Cannot assign requested address`` error
|
||||
during training. A quick workaround is to specify the address explicitly. To do that
|
||||
dask config is used:
|
||||
- In some environments XGBoost might fail to resolve the IP address of the scheduler, a
|
||||
a symptom is the user receiving an ``OSError: [Errno 99] Cannot assign requested address`` error
|
||||
during training. A quick workaround is to specify the address explicitly. To do that
|
||||
dask config is used:
|
||||
|
||||
.. versionadded:: 1.6.0
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@ -511,10 +512,20 @@ dask config is used:
|
||||
reg = dxgb.DaskXGBRegressor()
|
||||
|
||||
|
||||
Please note that XGBoost requires a different port than dask. By default, on a unix-like
|
||||
system XGBoost uses the port 0 to find available ports, which may fail if a user is
|
||||
running in a restricted docker environment. In this case, please open additional ports in
|
||||
the container and specify it as in the above snippet.
|
||||
- Please note that XGBoost requires a different port than dask. By default, on a unix-like
|
||||
system XGBoost uses the port 0 to find available ports, which may fail if a user is
|
||||
running in a restricted docker environment. In this case, please open additional ports
|
||||
in the container and specify it as in the above snippet.
|
||||
|
||||
- If you encounter a NCCL system error while training with GPU enabled, which usually
|
||||
includes the error message `NCCL failure: unhandled system error`, you can specify its
|
||||
network configuration using one of the environment variables listed in the `NCCL
|
||||
document <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`__ such as
|
||||
the ``NCCL_SOCKET_IFNAME``. In addition, you can use ``NCCL_DEBUG`` to obtain debug
|
||||
logs.
|
||||
|
||||
- MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message
|
||||
that includes `Multiple processes within a communication group ...` upon initialization.
|
||||
|
||||
************
|
||||
IPv6 Support
|
||||
|
||||
@ -118,17 +118,20 @@ namespace dh {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
|
||||
|
||||
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
|
||||
int line) {
|
||||
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
|
||||
if (code != ncclSuccess) {
|
||||
std::stringstream ss;
|
||||
ss << "NCCL failure :" << ncclGetErrorString(code);
|
||||
ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
|
||||
ss << " " << file << "(" << line << ")\n";
|
||||
if (code == ncclUnhandledCudaError) {
|
||||
// nccl usually preserves the last error so we can get more details.
|
||||
auto err = cudaPeekAtLastError();
|
||||
ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
|
||||
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
|
||||
} else if (code == ncclSystemError) {
|
||||
ss << " This might be caused by a network configuration issue. Please consider specifying "
|
||||
"the network interface for NCCL via environment variables listed in its reference: "
|
||||
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
|
||||
}
|
||||
ss << " " << file << "(" << line << ")";
|
||||
LOG(FATAL) << ss.str();
|
||||
}
|
||||
|
||||
|
||||
@ -1,10 +1,12 @@
|
||||
/*!
|
||||
* Copyright 2022 XGBoost contributors
|
||||
/**
|
||||
* Copyright 2022-2023, XGBoost contributors
|
||||
*/
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <string> // for string
|
||||
|
||||
#include "../../../src/collective/nccl_device_communicator.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
@ -20,7 +22,15 @@ TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) {
|
||||
EXPECT_THROW(construct(), dmlc::Error);
|
||||
}
|
||||
|
||||
// Verify that a NCCL system error surfaces the troubleshooting hint pointing users at
// the NCCL environment-variable reference (added alongside the dask troubleshooting doc).
TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
  bool thrown{false};
  try {
    dh::safe_nccl(ncclSystemError);
  } catch (dmlc::Error const& e) {
    thrown = true;
    auto str = std::string{e.what()};
    // The message must steer users toward NCCL's network configuration env vars.
    ASSERT_TRUE(str.find("environment variables") != std::string::npos);
  }
  // Original test passed vacuously when no exception was thrown; require the throw.
  ASSERT_TRUE(thrown);
}
|
||||
} // namespace collective
|
||||
} // namespace xgboost
|
||||
|
||||
#endif
|
||||
#endif // XGBOOST_USE_NCCL
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user