diff --git a/rabit/src/engine_mpi.cc b/rabit/src/engine_mpi.cc
index c63f13a78..2b7bd78fd 100644
--- a/rabit/src/engine_mpi.cc
+++ b/rabit/src/engine_mpi.cc
@@ -22,34 +22,21 @@ class MPIEngine : public IEngine {
   MPIEngine(void) {
     version_number = 0;
   }
-  virtual void Allgather(void *sendrecvbuf_,
-                         size_t total_size,
-                         size_t slice_begin,
-                         size_t slice_end,
-                         size_t size_prev_slice,
-                         const char* _file,
-                         const int _line,
-                         const char* _caller) {
+  void Allgather(void *sendrecvbuf_, size_t total_size, size_t slice_begin,
+                 size_t slice_end, size_t size_prev_slice) override {
     utils::Error("MPIEngine:: Allgather is not supported");
   }
-  virtual void Allreduce(void *sendrecvbuf_,
-                         size_t type_nbytes,
-                         size_t count,
-                         ReduceFunction reducer,
-                         PreprocFunction prepare_fun,
-                         void *prepare_arg,
-                         const char* _file,
-                         const int _line,
-                         const char* _caller) {
+  void Allreduce(void *sendrecvbuf_, size_t type_nbytes, size_t count,
+                 ReduceFunction reducer, PreprocFunction prepare_fun,
+                 void *prepare_arg) override {
     utils::Error("MPIEngine:: Allreduce is not supported,"\
                  "use Allreduce_ instead");
   }
-  virtual int GetRingPrevRank(void) const {
+  int GetRingPrevRank(void) const override {
     utils::Error("MPIEngine:: GetRingPrevRank is not supported");
+    return -1;
   }
-  virtual void Broadcast(void *sendrecvbuf_, size_t size, int root,
-                         const char* _file, const int _line,
-                         const char* _caller) {
+  void Broadcast(void *sendrecvbuf_, size_t size, int root) override {
     MPI::COMM_WORLD.Bcast(sendrecvbuf_, size, MPI::CHAR, root);
   }
   virtual void InitAfterException(void) {
@@ -166,10 +153,7 @@ void Allreduce_(void *sendrecvbuf,
                 mpi::DataType dtype,
                 mpi::OpType op,
                 IEngine::PreprocFunction prepare_fun,
-                void *prepare_arg,
-                const char* _file,
-                const int _line,
-                const char* _caller) {
+                void *prepare_arg) {
   if (prepare_fun != NULL) prepare_fun(prepare_arg);
   MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count,
                             GetType(dtype), GetOp(op));
@@ -180,14 +164,35 @@ ReduceHandle::ReduceHandle(void)
     : handle_(NULL), redfunc_(NULL), htype_(NULL) {
 }
 ReduceHandle::~ReduceHandle(void) {
+  /* !WARNING!
+
+     A handle can be held by a tree method/Learner from xgboost. The booster might not
+     be freed until program exit, while (good) users call rabit.finalize() before
+     reaching the end of the program. So op->Free() might be called after finalization
+     and result in the following error:
+
+       ```
+       Attempting to use an MPI routine after finalizing MPICH
+       ```
+
+     Here we skip calling Free if MPI has already been finalized to work around the
+     issue. This is a potential memory leak. The best way to resolve it is to
+     eliminate all use of long-lived handles.
+  */
+  int finalized = 0;
+  CHECK_EQ(MPI_Finalized(&finalized), MPI_SUCCESS);
   if (handle_ != NULL) {
     MPI::Op *op = reinterpret_cast<MPI::Op*>(handle_);
-    op->Free();
+    if (!finalized) {
+      op->Free();
+    }
     delete op;
   }
   if (htype_ != NULL) {
     MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype_);
-    dtype->Free();
+    if (!finalized) {
+      dtype->Free();
+    }
     delete dtype;
   }
 }
@@ -217,10 +222,7 @@ void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {
 void ReduceHandle::Allreduce(void *sendrecvbuf,
                              size_t type_nbytes, size_t count,
                              IEngine::PreprocFunction prepare_fun,
-                             void *prepare_arg,
-                             const char* _file,
-                             const int _line,
-                             const char* _caller) {
+                             void *prepare_arg) {
   utils::Assert(handle_ != NULL, "must intialize handle to call AllReduce");
   MPI::Op *op = reinterpret_cast<MPI::Op*>(handle_);
   MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype_);
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 75c1f2064..c892457be 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -78,6 +78,11 @@ if (USE_OPENMP OR USE_CUDA) # CUDA requires OpenMP
   target_link_libraries(objxgboost PUBLIC OpenMP::OpenMP_CXX)
 endif (USE_OPENMP OR USE_CUDA)
 
+if (RABIT_BUILD_MPI)
+  find_package(MPI REQUIRED)
+  target_link_libraries(objxgboost PUBLIC MPI::MPI_CXX)
+endif (RABIT_BUILD_MPI)
+
 # For MSVC: Call msvc_use_static_runtime() once again to completely
 # replace /MD with /MT. See https://github.com/dmlc/xgboost/issues/4462
 # for issues caused by mixing of /MD and /MT flags
diff --git a/tests/distributed/runtests-mpi.sh b/tests/distributed/runtests-mpi.sh
new file mode 100755
index 000000000..d99da4417
--- /dev/null
+++ b/tests/distributed/runtests-mpi.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+rm -f *.model*
+
+export DMLC_SUBMIT_CLUSTER=mpi
+
+submit="timeout 5 python ../../dmlc-core/tracker/dmlc-submit"
+
+echo "====== 1. Basic distributed test with Python ======"
+$submit --cluster=local --num-workers=3 python test_basic.py
+
+echo "====== 2. Regression test for issue #3402 ======"
+$submit --cluster=local --num-workers=2 --worker-cores=1 python test_issue3402.py
diff --git a/tests/distributed/test_issue3402.py b/tests/distributed/test_issue3402.py
index e6c498331..e3b87931b 100644
--- a/tests/distributed/test_issue3402.py
+++ b/tests/distributed/test_issue3402.py
@@ -65,7 +65,7 @@
 y = [1, 0]
 
 dtrain = xgb.DMatrix(X, label=y)
-param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic' }
+param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic' }
 watchlist = [(dtrain,'train')]
 num_round = 2
 bst = xgb.train(param, dtrain, num_round, watchlist)
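Side note on the destructor change in `engine_mpi.cc`: the new guard checks `MPI_Finalized` before freeing MPI handles and deliberately leaks them once MPI has shut down. Below is a minimal standalone sketch of that pattern; it uses the plain MPI C API and a hypothetical `OpHolder` type rather than rabit's `ReduceHandle`, so it illustrates the technique only and is not code from this diff.

```
#include <mpi.h>

// Hypothetical holder owning an MPI_Op, mirroring the guard added to
// ReduceHandle::~ReduceHandle: once MPI_Finalize() has run, calling any MPI
// routine (including MPI_Op_free) is an error, so the free is skipped instead.
struct OpHolder {
  MPI_Op op = MPI_OP_NULL;

  ~OpHolder() {
    int finalized = 0;
    MPI_Finalized(&finalized);  // querying the finalized state is always legal
    if (!finalized && op != MPI_OP_NULL) {
      MPI_Op_free(&op);         // free the handle only while MPI is still alive
    }
    // If MPI was already finalized, the handle is intentionally leaked:
    // a small one-time leak at process exit instead of the
    // "Attempting to use an MPI routine after finalizing MPICH" abort.
  }
};
```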