[doc] [dask] Troubleshooting NCCL errors. (#8943)
This commit is contained in:
parent
a551bed803
commit
ea04d4c46c
@ -190,9 +190,9 @@ Scikit-Learn wrapper object:
|
|||||||
booster = cls.get_booster()
|
booster = cls.get_booster()
|
||||||
|
|
||||||
|
|
||||||
**********************
|
********************************
|
||||||
Scikit-Learn interface
|
Scikit-Learn Estimator Interface
|
||||||
**********************
|
********************************
|
||||||
|
|
||||||
As mentioned previously, there's another interface that mimics the scikit-learn estimators
|
As mentioned previously, there's another interface that mimics the scikit-learn estimators
|
||||||
with a higher level of abstraction. The interface is easier to use compared to the
|
with a higher level of abstraction. The interface is easier to use compared to the
|
||||||
@ -488,12 +488,13 @@ with dask and optuna.
|
|||||||
Troubleshooting
|
Troubleshooting
|
||||||
***************
|
***************
|
||||||
|
|
||||||
.. versionadded:: 1.6.0
|
|
||||||
|
|
||||||
In some environments XGBoost might fail to resolve the IP address of the scheduler, a
|
- In some environments XGBoost might fail to resolve the IP address of the scheduler, a
|
||||||
symptom is the user receiving an ``OSError: [Errno 99] Cannot assign requested address`` error
|
symptom is the user receiving an ``OSError: [Errno 99] Cannot assign requested address`` error
|
||||||
during training. A quick workaround is to specify the address explicitly. To do that
|
during training. A quick workaround is to specify the address explicitly. To do that
|
||||||
dask config is used:
|
dask config is used:
|
||||||
|
|
||||||
|
.. versionadded:: 1.6.0
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
@ -511,10 +512,20 @@ dask config is used:
|
|||||||
reg = dxgb.DaskXGBRegressor()
|
reg = dxgb.DaskXGBRegressor()
|
||||||
|
|
||||||
|
|
||||||
Please note that XGBoost requires a different port than dask. By default, on a unix-like
|
- Please note that XGBoost requires a different port than dask. By default, on a unix-like
|
||||||
system XGBoost uses the port 0 to find available ports, which may fail if a user is
|
system XGBoost uses the port 0 to find available ports, which may fail if a user is
|
||||||
running in a restricted docker environment. In this case, please open additional ports in
|
running in a restricted docker environment. In this case, please open additional ports
|
||||||
the container and specify them as in the above snippet.
|
in the container and specify them as in the above snippet.
|
||||||
|
|
||||||
|
- If you encounter an NCCL system error while training with GPU enabled, which usually
|
||||||
|
includes the error message ``NCCL failure: unhandled system error``, you can specify its
|
||||||
|
network configuration using one of the environment variables listed in the `NCCL
|
||||||
|
document <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`__ such as
|
||||||
|
the ``NCCL_SOCKET_IFNAME``. In addition, you can use ``NCCL_DEBUG`` to obtain debug
|
||||||
|
logs.
|
||||||
|
|
||||||
|
- MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message
|
||||||
|
that includes ``Multiple processes within a communication group ...`` upon initialization.
|
||||||
|
|
||||||
************
|
************
|
||||||
IPv6 Support
|
IPv6 Support
|
||||||
|
|||||||
@ -118,17 +118,20 @@ namespace dh {
|
|||||||
#ifdef XGBOOST_USE_NCCL
|
#ifdef XGBOOST_USE_NCCL
|
||||||
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
|
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
|
||||||
|
|
||||||
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
|
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
|
||||||
int line) {
|
|
||||||
if (code != ncclSuccess) {
|
if (code != ncclSuccess) {
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << "NCCL failure :" << ncclGetErrorString(code);
|
ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
|
||||||
|
ss << " " << file << "(" << line << ")\n";
|
||||||
if (code == ncclUnhandledCudaError) {
|
if (code == ncclUnhandledCudaError) {
|
||||||
// nccl usually preserves the last error so we can get more details.
|
// nccl usually preserves the last error so we can get more details.
|
||||||
auto err = cudaPeekAtLastError();
|
auto err = cudaPeekAtLastError();
|
||||||
ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
|
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
|
||||||
|
} else if (code == ncclSystemError) {
|
||||||
|
ss << " This might be caused by a network configuration issue. Please consider specifying "
|
||||||
|
"the network interface for NCCL via environment variables listed in its reference: "
|
||||||
|
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
|
||||||
}
|
}
|
||||||
ss << " " << file << "(" << line << ")";
|
|
||||||
LOG(FATAL) << ss.str();
|
LOG(FATAL) << ss.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,10 +1,12 @@
|
|||||||
/*!
|
/**
|
||||||
* Copyright 2022 XGBoost contributors
|
* Copyright 2022-2023, XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#ifdef XGBOOST_USE_NCCL
|
#ifdef XGBOOST_USE_NCCL
|
||||||
|
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
#include <string> // for string
|
||||||
|
|
||||||
#include "../../../src/collective/nccl_device_communicator.cuh"
|
#include "../../../src/collective/nccl_device_communicator.cuh"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
@ -20,7 +22,15 @@ TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) {
|
|||||||
EXPECT_THROW(construct(), dmlc::Error);
|
EXPECT_THROW(construct(), dmlc::Error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
|
||||||
|
try {
|
||||||
|
dh::safe_nccl(ncclSystemError);
|
||||||
|
} catch (dmlc::Error const& e) {
|
||||||
|
auto str = std::string{e.what()};
|
||||||
|
ASSERT_TRUE(str.find("environment variables") != std::string::npos);
|
||||||
|
}
|
||||||
|
}
|
||||||
} // namespace collective
|
} // namespace collective
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|
||||||
#endif
|
#endif // XGBOOST_USE_NCCL
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user