Drop single point model recovery (#6262)

* Pass rabit params in JVM package.
* Implement timeout using poll timeout parameter.
* Remove OOB data check.
This commit is contained in:
Jiaming Yuan
2020-10-21 15:27:03 +08:00
committed by GitHub
parent 81c37c28d5
commit b5c2a47b20
22 changed files with 63 additions and 2879 deletions

View File

@@ -1,8 +1,8 @@
"""Distributed GPU tests."""
import sys
import time
import xgboost as xgb
import os
import numpy as np
def run_test(name, params_fun):
@@ -28,7 +28,7 @@ def run_test(name, params_fun):
# Have each worker save its model
model_name = "test.model.%s.%d" % (name, rank)
bst.dump_model(model_name, with_stats=True)
time.sleep(2)
xgb.rabit.allreduce(np.ones((1, 1)), xgb.rabit.Op.MAX) # sync
xgb.rabit.tracker_print("Finished training\n")
if (rank == 0):
@@ -49,9 +49,6 @@ def run_test(name, params_fun):
xgb.rabit.finalize()
if os.path.exists(model_name):
os.remove(model_name)
base_params = {
'tree_method': 'gpu_hist',

View File

@@ -7,6 +7,8 @@ submit="timeout 30 python ../../dmlc-core/tracker/dmlc-submit"
# Test 1: basic distributed GPU run of distributed_gpu.py (config name basic_1x4).
# NOTE: the banner says "4 workers", but the actual worker count is the number of
# lines printed by `nvidia-smi -L`, so it follows the GPUs present on the machine.
echo -e "\n ====== 1. Basic distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
# Abort the whole script with status 1 if the submit command fails.
$submit --num-workers=$(nvidia-smi -L | wc -l) python distributed_gpu.py basic_1x4 || exit 1
# Remove the test.model.* files left in the working directory so they do not leak into the next test.
rm test.model.*
# Test 2: same launch as above, with the rf_1x4 config name.
echo -e "\n ====== 2. RF distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
# Abort the whole script with status 1 if the submit command fails.
$submit --num-workers=$(nvidia-smi -L | wc -l) python distributed_gpu.py rf_1x4 || exit 1
# Clean up the test.model.* files from this run as well.
rm test.model.*