Fix MPI build. (#6403)

Jiaming Yuan 2020-11-21 13:38:21 +08:00 committed by GitHub
parent 2ce2a1a4d8
commit 42d31d9dcb
4 changed files with 52 additions and 32 deletions

View File

@@ -22,34 +22,21 @@ class MPIEngine : public IEngine {
   MPIEngine(void) {
     version_number = 0;
   }
-  virtual void Allgather(void *sendrecvbuf_,
-                         size_t total_size,
-                         size_t slice_begin,
-                         size_t slice_end,
-                         size_t size_prev_slice,
-                         const char* _file,
-                         const int _line,
-                         const char* _caller) {
+  void Allgather(void *sendrecvbuf_, size_t total_size, size_t slice_begin,
+                 size_t slice_end, size_t size_prev_slice) override {
     utils::Error("MPIEngine:: Allgather is not supported");
   }
-  virtual void Allreduce(void *sendrecvbuf_,
-                         size_t type_nbytes,
-                         size_t count,
-                         ReduceFunction reducer,
-                         PreprocFunction prepare_fun,
-                         void *prepare_arg,
-                         const char* _file,
-                         const int _line,
-                         const char* _caller) {
+  void Allreduce(void *sendrecvbuf_, size_t type_nbytes, size_t count,
+                 ReduceFunction reducer, PreprocFunction prepare_fun,
+                 void *prepare_arg) override {
     utils::Error("MPIEngine:: Allreduce is not supported,"\
                  "use Allreduce_ instead");
   }
-  virtual int GetRingPrevRank(void) const {
+  int GetRingPrevRank(void) const override {
     utils::Error("MPIEngine:: GetRingPrevRank is not supported");
     return -1;
   }
-  virtual void Broadcast(void *sendrecvbuf_, size_t size, int root,
-                         const char* _file, const int _line,
-                         const char* _caller) {
+  void Broadcast(void *sendrecvbuf_, size_t size, int root) override {
     MPI::COMM_WORLD.Bcast(sendrecvbuf_, size, MPI::CHAR, root);
   }
   virtual void InitAfterException(void) {
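
The pattern in this hunk, dropping `virtual` in favor of `override`, is what keeps the MPI engine from silently drifting out of sync with `IEngine` again. A minimal sketch of why that matters follows; it is not part of the commit, and `IEngineLike` / `MpiEngineLike` are hypothetical stand-ins for rabit's real classes.

```
// Hypothetical stand-ins, only to illustrate what `override` buys here.
#include <cstddef>

struct IEngineLike {
  virtual void Broadcast(void *sendrecvbuf, std::size_t size, int root) = 0;
  virtual ~IEngineLike() = default;
};

struct MpiEngineLike : public IEngineLike {
  // With `override`, this compiles only while the base signature matches exactly.
  // If IEngineLike::Broadcast gains or loses a parameter (as IEngine did with the
  // _file/_line/_caller arguments), the build fails right here instead of silently
  // declaring a new, unrelated virtual function.
  void Broadcast(void *sendrecvbuf, std::size_t size, int root) override {
    // a real engine would perform the MPI broadcast here
    (void)sendrecvbuf; (void)size; (void)root;
  }
};
```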
@@ -166,10 +153,7 @@ void Allreduce_(void *sendrecvbuf,
                 mpi::DataType dtype,
                 mpi::OpType op,
                 IEngine::PreprocFunction prepare_fun,
-                void *prepare_arg,
-                const char* _file,
-                const int _line,
-                const char* _caller) {
+                void *prepare_arg) {
   if (prepare_fun != NULL) prepare_fun(prepare_arg);
   MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf,
                             count, GetType(dtype), GetOp(op));
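
For context, `MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, ...)` above uses the MPI C++ bindings that rabit still targets (deprecated in MPI-2.2 and removed in MPI-3.0). The same in-place reduction with the plain C API looks roughly like the sketch below; it is illustrative only, not part of the commit.

```
#include <mpi.h>
#include <vector>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  std::vector<double> buf(16, 1.0);  // every rank contributes its own values
  // MPI_IN_PLACE: the send and receive buffers share the same memory, so after
  // the call each rank holds the element-wise sum over all ranks.
  MPI_Allreduce(MPI_IN_PLACE, buf.data(), static_cast<int>(buf.size()),
                MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

  MPI_Finalize();
  return 0;
}
```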
@@ -180,14 +164,35 @@ ReduceHandle::ReduceHandle(void)
     : handle_(NULL), redfunc_(NULL), htype_(NULL) {
 }
 ReduceHandle::~ReduceHandle(void) {
+  /* !WARNING!
+     A handle can be held by a tree method/Learner from xgboost. The booster might
+     not be freed until program exit, while (well-behaved) users call
+     rabit.finalize() before reaching the end of the program. So op->Free() might be
+     called after finalization, which results in the following error:
+     ```
+     Attempting to use an MPI routine after finalizing MPICH
+     ```
+     Here we skip calling Free if MPI has already been finalized to work around the
+     issue. This is a potential memory leak. The best way to resolve it is to
+     eliminate all use of long-lived handles.
+  */
+  int finalized = 0;
+  CHECK_EQ(MPI_Finalized(&finalized), MPI_SUCCESS);
   if (handle_ != NULL) {
     MPI::Op *op = reinterpret_cast<MPI::Op*>(handle_);
-    op->Free();
+    if (!finalized) {
+      op->Free();
+    }
     delete op;
   }
   if (htype_ != NULL) {
     MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype_);
-    dtype->Free();
+    if (!finalized) {
+      dtype->Free();
+    }
     delete dtype;
   }
 }
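
The destructor guard added above boils down to one pattern: never call an MPI free routine once `MPI_Finalized` reports true, and accept the small leak instead. A standalone sketch of the same idea using the MPI C API is shown below; `FreeOpSafely` is a hypothetical helper, not rabit code.

```
#include <mpi.h>

// Hypothetical helper mirroring the guard in ~ReduceHandle().
void FreeOpSafely(MPI_Op *op) {
  int finalized = 0;
  MPI_Finalized(&finalized);  // one of the few MPI calls legal after MPI_Finalize()
  if (op != nullptr && *op != MPI_OP_NULL) {
    if (!finalized) {
      MPI_Op_free(op);        // frees the op and resets it to MPI_OP_NULL
    }
    // If MPI is already finalized, deliberately leak the handle rather than hit
    // "Attempting to use an MPI routine after finalizing MPICH".
  }
}
```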
@@ -217,10 +222,7 @@ void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {
 void ReduceHandle::Allreduce(void *sendrecvbuf,
                              size_t type_nbytes, size_t count,
                              IEngine::PreprocFunction prepare_fun,
-                             void *prepare_arg,
-                             const char* _file,
-                             const int _line,
-                             const char* _caller) {
+                             void *prepare_arg) {
   utils::Assert(handle_ != NULL, "must initialize handle to call AllReduce");
   MPI::Op *op = reinterpret_cast<MPI::Op*>(handle_);
   MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype_);

View File

@@ -78,6 +78,11 @@ if (USE_OPENMP OR USE_CUDA) # CUDA requires OpenMP
   target_link_libraries(objxgboost PUBLIC OpenMP::OpenMP_CXX)
 endif (USE_OPENMP OR USE_CUDA)
+if (RABIT_BUILD_MPI)
+  find_package(MPI REQUIRED)
+  target_link_libraries(objxgboost PUBLIC MPI::MPI_CXX)
+endif (RABIT_BUILD_MPI)
 # For MSVC: Call msvc_use_static_runtime() once again to completely
 # replace /MD with /MT. See https://github.com/dmlc/xgboost/issues/4462
 # for issues caused by mixing of /MD and /MT flags

View File

@@ -0,0 +1,13 @@
+#!/bin/bash
+rm -f *.model*
+export DMLC_SUBMIT_CLUSTER=mpi
+submit="timeout 5 python ../../dmlc-core/tracker/dmlc-submit"
+echo "====== 1. Basic distributed test with Python ======"
+$submit --cluster=local --num-workers=3 python test_basic.py
+echo "====== 2. Regression test for issue #3402 ======"
+$submit --cluster=local --num-workers=2 --worker-cores=1 python test_issue3402.py

View File

@@ -65,7 +65,7 @@ y = [1, 0]
 dtrain = xgb.DMatrix(X, label=y)
-param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic' }
+param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic' }
 watchlist = [(dtrain,'train')]
 num_round = 2
 bst = xgb.train(param, dtrain, num_round, watchlist)