Drop single point model recovery (#6262)
* Pass rabit params in JVM package. * Implement timeout using poll timeout parameter. * Remove OOB data check.
This commit is contained in:
@@ -1,12 +0,0 @@
|
||||
# Originally an example in demo/regression/
|
||||
tree_method=approx
|
||||
eta = 0.5
|
||||
gamma = 1.0
|
||||
seed = 0
|
||||
min_child_weight = 0
|
||||
max_depth = 5
|
||||
|
||||
num_round = 12
|
||||
save_period = 100
|
||||
data = "demo/data/agaricus.txt.train"
|
||||
eval[test] = "demo/data/agaricus.txt.test"
|
||||
@@ -1,13 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
source activate cpu_test
|
||||
|
||||
export DMLC_SUBMIT_CLUSTER=local
|
||||
|
||||
submit="python3 dmlc-core/tracker/dmlc-submit"
|
||||
# build xgboost with librabit mock
|
||||
# define max worker retry with dmlc-core local num attempt
|
||||
# instrument worker failure with mock=xxxx
|
||||
# check if host recovered from expected iteration
|
||||
echo "====== 1. Fault recovery distributed test ======"
|
||||
exec $submit --cluster=local --num-workers=10 --local-num-attempt=10 $1 $2 mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1
|
||||
@@ -1,53 +0,0 @@
|
||||
#define RABIT_CXXTESTDEFS_H
|
||||
#if !defined(_WIN32)
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include "../../../rabit/src/allreduce_mock.h"
|
||||
|
||||
// Initialises a mock engine with "mock=0,0,0,0", sets rank 0, and expects the
// subsequent Allreduce call to throw dmlc::Error.
TEST(AllreduceMock, MockAllreduce)
{
  rabit::engine::AllreduceMock m;

  // A std::string's buffer is mutable and NUL-terminated, so it can back argv
  // directly; this avoids a variable-length char array, which is not standard
  // C++, and the manual copy/terminate steps that went with it.
  std::string mock_str = "mock=0,0,0,0";
  char* argv[] = {&mock_str[0]};

  m.Init(1, argv);
  m.rank = 0;
  EXPECT_THROW(m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr), dmlc::Error);
}
|
||||
|
||||
// Initialises a mock engine with "mock=0,1,2,0", sets rank 0, version 1 and
// sequence counter 2, and expects Broadcast to throw dmlc::Error.
// NOTE(review): the mock fields presumably map to rank/version/seq — confirm
// against AllreduceMock's parameter parsing.
TEST(AllreduceMock, MockBroadcast)
{
  rabit::engine::AllreduceMock m;

  // The std::string buffer is mutable and NUL-terminated, so it serves as the
  // argv entry without a non-standard variable-length char array.
  std::string mock_str = "mock=0,1,2,0";
  char* argv[] = {&mock_str[0]};

  m.Init(1, argv);
  m.rank = 0;
  m.version_number = 1;
  m.seq_counter = 2;
  EXPECT_THROW(m.Broadcast(nullptr,0,0), dmlc::Error);
}
|
||||
|
||||
// Initialises a mock engine with "mock=3,13,22,0", sets rank 3, version 13 and
// sequence counter 22, and expects Allgather to throw dmlc::Error.
// NOTE(review): the mock fields presumably map to rank/version/seq — confirm
// against AllreduceMock's parameter parsing.
TEST(AllreduceMock, MockGather)
{
  rabit::engine::AllreduceMock m;

  // The std::string buffer is mutable and NUL-terminated, so it serves as the
  // argv entry without a non-standard variable-length char array.
  std::string mock_str = "mock=3,13,22,0";
  char* argv[] = {&mock_str[0]};

  m.Init(1, argv);
  m.rank = 3;
  m.version_number = 13;
  m.seq_counter = 22;
  EXPECT_THROW({m.Allgather(nullptr,0,0,0,0);}, dmlc::Error);
}
|
||||
#endif // !defined(_WIN32)
|
||||
@@ -1,235 +0,0 @@
|
||||
#define RABIT_CXXTESTDEFS_H
|
||||
#if !defined(_WIN32)
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include "../../../rabit/src/allreduce_robust.h"
|
||||
|
||||
// Replacement error callback for the tests: instead of reporting anything, it
// checks (ignoring case) that the format string it was handed is the
// time-out exit message. The variadic arguments are not inspected.
inline void MockErr(const char *fmt, ...)
{
  EXPECT_STRCASEEQ(fmt, "[%d] exit due to time out %d s\n");
}
|
||||
// Replacement assertion callback for the tests: accepts the condition and
// message but deliberately does nothing with them.
inline void MockAssert(bool val, const char *fmt, ...)
{
  (void)val;
  (void)fmt;
}
|
||||
// Return value built from kSockError; the tests in this file pass it to
// CheckAndRecover and expect that call to return false.
rabit::engine::AllreduceRobust::ReturnType err_type(rabit::engine::AllreduceRobust::ReturnTypeEnum::kSockError);
|
||||
// Return value built from kSuccess; the tests in this file pass it to
// CheckAndRecover and expect that call to return true.
rabit::engine::AllreduceRobust::ReturnType succ_type(rabit::engine::AllreduceRobust::ReturnTypeEnum::kSuccess);
|
||||
|
||||
// With rabit_timeout=1 and rabit_timeout_sec=1, reports an error to
// CheckAndRecover, waits 1.5 s, and expects the timeout task to yield false.
TEST(AllreduceRobust, SyncErrorTimeout)
{
  rabit::engine::AllreduceRobust m;

  // std::string buffers are mutable and NUL-terminated, so they can back argv
  // directly instead of non-standard variable-length char arrays.
  std::string rabit_timeout = "rabit_timeout=1";
  std::string rabit_timeout_sec = "rabit_timeout_sec=1";
  char* argv[] = {&rabit_timeout[0], &rabit_timeout_sec[0]};

  m.Init(2, argv);
  m.rank = 0;
  m.rabit_bootstrap_cache = true;
  // Swap in the mock callbacks defined in this file.
  m.error_ = MockErr;
  m.assert_ = MockAssert;
  EXPECT_EQ(m.CheckAndRecover(err_type), false);
  std::this_thread::sleep_for(std::chrono::milliseconds(1500));
  EXPECT_EQ(m.rabit_timeout_task_.get(), false);
}
|
||||
|
||||
// Reports an error, then 100 ms later a success, and expects the timeout task
// to yield true before shutting the engine down.
TEST(AllreduceRobust, SyncErrorReset)
{
  rabit::engine::AllreduceRobust m;

  // std::string buffers are mutable and NUL-terminated, so they can back argv
  // directly instead of non-standard variable-length char arrays.
  std::string rabit_timeout = "rabit_timeout=1";
  std::string rabit_timeout_sec = "rabit_timeout_sec=1";
  std::string rabit_debug = "rabit_debug=1";
  char* argv[] = {&rabit_timeout[0], &rabit_timeout_sec[0], &rabit_debug[0]};

  m.Init(3, argv);
  m.rank = 0;
  m.assert_ = MockAssert;
  EXPECT_EQ(m.CheckAndRecover(err_type), false);
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
  EXPECT_EQ(m.CheckAndRecover(succ_type), true);
  EXPECT_EQ(m.rabit_timeout_task_.get(), true);
  m.Shutdown();
}
|
||||
|
||||
// Reports a success, then 100 ms later an error, waits 1.5 s, and expects the
// timeout task to yield false.
TEST(AllreduceRobust, SyncSuccessErrorTimeout)
{
  rabit::engine::AllreduceRobust m;

  // std::string buffers are mutable and NUL-terminated, so they can back argv
  // directly instead of non-standard variable-length char arrays.
  std::string rabit_timeout = "rabit_timeout=1";
  std::string rabit_timeout_sec = "rabit_timeout_sec=1";
  std::string rabit_debug = "rabit_debug=1";
  char* argv[] = {&rabit_timeout[0], &rabit_timeout_sec[0], &rabit_debug[0]};

  m.Init(3, argv);
  m.rank = 0;
  m.rabit_bootstrap_cache = true;
  // Swap in the mock callbacks defined in this file.
  m.assert_ = MockAssert;
  m.error_ = MockErr;
  EXPECT_EQ(m.CheckAndRecover(succ_type), true);
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
  EXPECT_EQ(m.CheckAndRecover(err_type), false);
  std::this_thread::sleep_for(std::chrono::milliseconds(1500));
  EXPECT_EQ(m.rabit_timeout_task_.get(), false);
}
|
||||
|
||||
// Reports success, error, success at 10 ms intervals, waits 1.1 s, and
// expects the timeout task to yield true before shutting the engine down.
TEST(AllreduceRobust, SyncSuccessErrorSuccess)
{
  rabit::engine::AllreduceRobust m;

  // std::string buffers are mutable and NUL-terminated, so they can back argv
  // directly instead of non-standard variable-length char arrays.
  std::string rabit_timeout = "rabit_timeout=1";
  std::string rabit_timeout_sec = "rabit_timeout_sec=1";
  std::string rabit_debug = "rabit_debug=1";
  char* argv[] = {&rabit_timeout[0], &rabit_timeout_sec[0], &rabit_debug[0]};

  m.Init(3, argv);
  m.rank = 0;
  m.rabit_bootstrap_cache = true;
  m.assert_ = MockAssert;
  EXPECT_EQ(m.CheckAndRecover(succ_type), true);
  std::this_thread::sleep_for(std::chrono::milliseconds(10));

  EXPECT_EQ(m.CheckAndRecover(err_type), false);
  std::this_thread::sleep_for(std::chrono::milliseconds(10));
  EXPECT_EQ(m.CheckAndRecover(succ_type), true);
  std::this_thread::sleep_for(std::chrono::milliseconds(1100));
  EXPECT_EQ(m.rabit_timeout_task_.get(), true);
  m.Shutdown();
}
|
||||
|
||||
// Reports two errors 1.1 s apart and checks that the timeout task yields
// false in under 2 s overall, i.e. the second error did not restart it.
TEST(AllreduceRobust, SyncErrorNoResetTimeout)
{
  rabit::engine::AllreduceRobust m;

  // std::string buffers are mutable and NUL-terminated, so they can back argv
  // directly instead of non-standard variable-length char arrays.
  std::string rabit_timeout = "rabit_timeout=1";
  std::string rabit_timeout_sec = "rabit_timeout_sec=1";
  std::string rabit_debug = "rabit_debug=1";
  char* argv[] = {&rabit_timeout[0], &rabit_timeout_sec[0], &rabit_debug[0]};

  m.Init(3, argv);
  m.rank = 0;
  m.rabit_bootstrap_cache = true;
  // Swap in the mock callbacks defined in this file.
  m.assert_ = MockAssert;
  m.error_ = MockErr;

  // steady_clock is monotonic, so the measured interval cannot be skewed by
  // wall-clock adjustments made while the test is sleeping.
  auto start = std::chrono::steady_clock::now();

  EXPECT_EQ(m.CheckAndRecover(err_type), false);
  std::this_thread::sleep_for(std::chrono::milliseconds(1100));

  EXPECT_EQ(m.CheckAndRecover(err_type), false);

  m.rabit_timeout_task_.wait();
  auto end = std::chrono::steady_clock::now();
  std::chrono::duration<double> diff = end-start;

  EXPECT_EQ(m.rabit_timeout_task_.get(), false);
  // Expect that the second error does not overwrite/reset the timeout task.
  EXPECT_LT(diff.count(), 2);
}
|
||||
|
||||
// Reports a single success, waits 10 ms, and shuts the engine down; only the
// CheckAndRecover result is asserted.
TEST(AllreduceRobust, NoTimeoutShutDown)
{
  rabit::engine::AllreduceRobust m;

  // std::string buffers are mutable and NUL-terminated, so they can back argv
  // directly instead of non-standard variable-length char arrays.
  std::string rabit_timeout = "rabit_timeout=1";
  std::string rabit_timeout_sec = "rabit_timeout_sec=1";
  std::string rabit_debug = "rabit_debug=1";
  char* argv[] = {&rabit_timeout[0], &rabit_timeout_sec[0], &rabit_debug[0]};

  m.Init(3, argv);
  m.rank = 0;

  EXPECT_EQ(m.CheckAndRecover(succ_type), true);
  std::this_thread::sleep_for(std::chrono::milliseconds(10));
  m.Shutdown();
}
|
||||
|
||||
// Reports an error with err_link pointing at a local LinkRecord, waits 10 ms
// (less than the configured 1 s timeout), and shuts the engine down.
TEST(AllreduceRobust, ShutDownBeforeTimeout)
{
  rabit::engine::AllreduceRobust m;

  // std::string buffers are mutable and NUL-terminated, so they can back argv
  // directly instead of non-standard variable-length char arrays.
  std::string rabit_timeout = "rabit_timeout=1";
  std::string rabit_timeout_sec = "rabit_timeout_sec=1";
  std::string rabit_debug = "rabit_debug=1";
  char* argv[] = {&rabit_timeout[0], &rabit_timeout_sec[0], &rabit_debug[0]};

  m.Init(3, argv);
  m.rank = 0;
  rabit::engine::AllreduceRobust::LinkRecord a;
  m.err_link = &a;

  EXPECT_EQ(m.CheckAndRecover(err_type), false);
  std::this_thread::sleep_for(std::chrono::milliseconds(10));
  m.Shutdown();
}
|
||||
#endif // !defined(_WIN32)
|
||||
@@ -1,8 +1,8 @@
|
||||
"""Distributed GPU tests."""
|
||||
import sys
|
||||
import time
|
||||
import xgboost as xgb
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
|
||||
def run_test(name, params_fun):
|
||||
@@ -28,7 +28,7 @@ def run_test(name, params_fun):
|
||||
# Have each worker save its model
|
||||
model_name = "test.model.%s.%d" % (name, rank)
|
||||
bst.dump_model(model_name, with_stats=True)
|
||||
time.sleep(2)
|
||||
xgb.rabit.allreduce(np.ones((1, 1)), xgb.rabit.Op.MAX) # sync
|
||||
xgb.rabit.tracker_print("Finished training\n")
|
||||
|
||||
if (rank == 0):
|
||||
@@ -49,9 +49,6 @@ def run_test(name, params_fun):
|
||||
|
||||
xgb.rabit.finalize()
|
||||
|
||||
if os.path.exists(model_name):
|
||||
os.remove(model_name)
|
||||
|
||||
|
||||
base_params = {
|
||||
'tree_method': 'gpu_hist',
|
||||
|
||||
@@ -7,6 +7,8 @@ submit="timeout 30 python ../../dmlc-core/tracker/dmlc-submit"
|
||||
|
||||
echo -e "\n ====== 1. Basic distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
|
||||
$submit --num-workers=$(nvidia-smi -L | wc -l) python distributed_gpu.py basic_1x4 || exit 1
|
||||
rm test.model.*
|
||||
|
||||
echo -e "\n ====== 2. RF distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
|
||||
$submit --num-workers=$(nvidia-smi -L | wc -l) python distributed_gpu.py rf_1x4 || exit 1
|
||||
rm test.model.*
|
||||
|
||||
Reference in New Issue
Block a user