Drop single point model recovery (#6262)

* Pass rabit params in JVM package.
* Implement timeout using poll timeout parameter.
* Remove OOB data check.
2020-10-21 15:27:03 +08:00
parent 81c37c28d5
commit b5c2a47b20
22 changed files with 63 additions and 2879 deletions
--- a/tests/distributed/distributed_gpu.py
+++ b/tests/distributed/distributed_gpu.py
@@ -1,8 +1,8 @@
 """Distributed GPU tests."""
 import sys
-import time
 import xgboost as xgb
 import os
+import numpy as np


 def run_test(name, params_fun):
@@ -28,7 +28,7 @@ def run_test(name, params_fun):
    # Have each worker save its model
    model_name = "test.model.%s.%d" % (name, rank)
    bst.dump_model(model_name, with_stats=True)
-    time.sleep(2)
+    xgb.rabit.allreduce(np.ones((1, 1)), xgb.rabit.Op.MAX)  # sync
    xgb.rabit.tracker_print("Finished training\n")

    if (rank == 0):
@@ -49,9 +49,6 @@ def run_test(name, params_fun):

    xgb.rabit.finalize()

-    if os.path.exists(model_name):
-        os.remove(model_name)
-

 base_params = {
    'tree_method': 'gpu_hist',
--- a/tests/distributed/runtests-gpu.sh
+++ b/tests/distributed/runtests-gpu.sh
@@ -7,6 +7,8 @@ submit="timeout 30 python ../../dmlc-core/tracker/dmlc-submit"

 echo -e "\n ====== 1. Basic distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
 $submit --num-workers=$(nvidia-smi -L | wc -l) python distributed_gpu.py basic_1x4 || exit 1
+rm test.model.*

 echo -e "\n ====== 2. RF distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
 $submit --num-workers=$(nvidia-smi -L | wc -l) python distributed_gpu.py rf_1x4 || exit 1
+rm test.model.*