From 7fefd6865d861e7dd1b76cc6bd12c27c0b7f4d53 Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho
Date: Sat, 4 Aug 2018 19:20:04 -0700
Subject: [PATCH] Fix #3402: wrong fid crashes distributed algorithm (#3535)

* Fix #3402: wrong fid crashes distributed algorithm

The bug was introduced by the recent DMatrix refactor (#3301). It was
partially fixed by #3408, but the example in #3402 was still failing;
it succeeds once this fix is applied.

* Explicitly specify "this" to prevent compile error

* Add regression test

* Add distributed test to Travis matrix

* Install kubernetes Python package as dependency of dmlc tracker

* Add Python dependencies

* Add compile step

* Reduce size of regression test case

* Further reduce size of test
---
 .travis.yml                         |  4 ++
 src/tree/updater_histmaker.cc       |  5 +-
 tests/distributed/runtests.sh       |  5 ++
 tests/distributed/test_issue3402.py | 77 +++++++++++++++++++++++++++++
 tests/travis/run_test.sh            |  7 +++
 tests/travis/setup.sh               |  4 ++
 6 files changed, 100 insertions(+), 2 deletions(-)
 create mode 100644 tests/distributed/test_issue3402.py

diff --git a/.travis.yml b/.travis.yml
index d607f8d6e..e86cfd048 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -26,6 +26,8 @@ env:
   - TASK=cmake_test
   # c++ test
   - TASK=cpp_test
+  # distributed test
+  - TASK=distributed_test
 
 matrix:
   exclude:
@@ -39,6 +41,8 @@ matrix:
       env: TASK=python_lightweight_test
     - os: osx
       env: TASK=cpp_test
+    - os: osx
+      env: TASK=distributed_test
 
 # dependent apt packages
 addons:
diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc
index 3cc0a760b..638017355 100644
--- a/src/tree/updater_histmaker.cc
+++ b/src/tree/updater_histmaker.cc
@@ -725,9 +725,10 @@ class GlobalProposalHistMaker: public CQHistMaker {
       const auto nsize = static_cast<bst_omp_uint>(this->work_set_.size());
       #pragma omp parallel for schedule(dynamic, 1)
       for (bst_omp_uint i = 0; i < nsize; ++i) {
-        int offset = this->feat2workindex_[this->work_set_[i]];
+        int fid = this->work_set_[i];
+        int offset = this->feat2workindex_[fid];
         if (offset >= 0) {
-          this->UpdateHistCol(gpair, batch[i], info, tree,
+          this->UpdateHistCol(gpair, batch[fid], info, tree,
                               fset, offset,
                               &this->thread_hist_[omp_get_thread_num()]);
         }
diff --git a/tests/distributed/runtests.sh b/tests/distributed/runtests.sh
index 997fb1893..a798802c8 100755
--- a/tests/distributed/runtests.sh
+++ b/tests/distributed/runtests.sh
@@ -1,4 +1,9 @@
 #!/bin/bash
 
+echo "====== 1. Basic distributed test with Python ======"
 PYTHONPATH=../../python-package/ ../../dmlc-core/tracker/dmlc-submit --cluster=local --num-workers=3\
     python test_basic.py
+
+echo "====== 2. Regression test for issue #3402 ======"
+PYTHONPATH=../../python-package/ ../../dmlc-core/tracker/dmlc-submit --cluster=local --num-workers=2 --worker-cores=1\
+    python test_issue3402.py
diff --git a/tests/distributed/test_issue3402.py b/tests/distributed/test_issue3402.py
new file mode 100644
index 000000000..3dd895226
--- /dev/null
+++ b/tests/distributed/test_issue3402.py
@@ -0,0 +1,77 @@
+#!/usr/bin/python
+import xgboost as xgb
+
+xgb.rabit.init()
+
+X = [
+    [15.00,28.90,29.00,3143.70,0.00,0.10,69.90,90.00,13726.07,0.00,2299.70,0.00,0.05,
+     4327.03,0.00,24.00,0.18,3.00,0.41,3.77,0.00,0.00,4.00,0.00,150.92,0.00,2.00,0.00,
+     0.01,138.00,1.00,0.02,69.90,0.00,0.83,5.00,0.01,0.12,47.30,0.00,296.00,0.16,0.00,
+     0.00,27.70,7.00,7.25,4406.16,1.00,0.54,245.28,3.00,0.06,306.50,5143.00,29.00,23.74,
+     548.00,2.00,68.00,70.90,25.45,0.39,0.00,0.01,497.11,0.00,42.00,83.00,4.00,0.00,1.00,
+     0.00,104.35,94.12,0.03,79.23,237.69,1.00,0.04,0.01,0.02,2.00,108.81,7.00,12.00,0.46,
+     31.00,0.00,0.15,74.59,0.00,19.50,0.00,0.75,0.06,0.08,118.00,35.90,0.01,0.07,1.00,
+     0.03,81.18,13.33,0.00,0.00,0.00,0.00,0.00,0.41,0.00,0.15,57.00,0.00,22.00,449.68,
+     0.00,0.00,2.00,195.26,51.58,306.50,0.10,1.00,0.00,258.00,21.00,0.43,3.00,16.00,0.00,
+     0.00,0.00,0.00,1.00,74.51,4.00,0.02,35.90,30.00,8.69,0.00,0.36,5.00,2.00,3.00,0.26,
+     9.50,8.00,11.00,11918.15,0.00,258.00,13.00,9.04,0.14,604.65,0.92,74.59,0.00,0.00,
+     72.76,1.00,0.22,64.00,2.00,0.00,0.00,0.02,0.00,305.50,27.70,0.02,0.00,177.00,14.00,
+     0.00,0.05,90.00,0.03,0.00,1.00,0.43,4.00,0.05,0.09,431.00,0.00,2.00,0.00,0.00,1.00,
+     0.25,0.17,0.00,0.00,21.00,94.12,0.17,0.00,0.00,0.00,548.00,0.00,68.00,0.00,0.00,9.50,
+     25.45,1390.31,7.00,0.00,2.00,310.70,0.00,0.01,0.01,0.03,81.40,1.00,0.02,0.00,9.00,
+     6.00,0.00,175.76,36.00,0.00,20.75,2.00,0.00,0.00,0.00,0.22,74.16,0.10,56.81,0.00,
+     2197.03,0.00,197.66,0.00,55.00,20.00,367.18,22.00,0.00,0.01,1510.26,0.24,0.00,0.01,
+     0.00,11.00,278.10,61.70,278.10,0.00,0.08,0.57,1.00,0.65,255.60,0.00,0.86,0.25,70.95,
+     2299.70,0.23,0.05,92.70,1.00,38.00,0.00,0.00,56.81,21.85,0.00,23.74,0.00,2.00,0.03,
+     2.00,0.00,347.58,30.00,243.55,109.00,0.00,296.00,6.00,6.00,0.00,0.00,109.00,2299.70,
+     0.00,0.01,0.08,1.00,4745.09,4.00,0.18,0.00,0.17,0.02,0.00,1.00,147.13,71.07,2115.16,
+     0.00,0.26,0.00,43.00,604.90,49.44,4327.03,0.68,0.75,0.10,86.36,52.98,0.20,0.00,22.50,
+     305.50,0.00,1.00,0.00,7.00,0.78,0.00,296.00,22.50,0.00,5.00,2979.54,1.00,14.00,51.00,
+     0.42,0.11,0.00,1.00,0.00,0.00,70.90,37.84,0.02,548.40,0.00,46.35,5.00,1.66,0.29,0.00,
+     0.02,2255.69,160.53,790.64,6775.15,0.68,19.50,2299.70,79.87,6.00,0.00,60.00,0.27,
+     233.77,10.00,0.00,0.00,23.00,82.27,1.00,0.00,1.00,0.42,1.00,0.01,0.40,0.41,9.50,2299.70,
+     46.30,0.00,0.00,2299.70,3.00,0.00,0.00,83.00,1.00],
+    [48.00,80.89,69.90,11570.00,26.00,0.40,468.00,0.00,5739.46,0.00,1480.00,90.89,0.00,
+     14042.09,3600.08,120.00,0.09,31.00,0.25,2.36,0.00,7.00,22.00,0.00,257.59,0.00,6.00,
+     260.00,0.05,313.00,1.00,0.07,468.00,0.00,0.67,11.00,0.02,0.32,0.00,0.00,1387.61,0.34,
+     0.00,0.00,158.04,6.00,13.98,12380.05,0.00,0.16,122.74,3.00,0.18,291.33,7517.79,124.00,
+     45.08,900.00,1.00,0.00,577.25,79.75,0.39,0.00,0.00,244.62,0.00,57.00,178.00,19.00,
+     0.00,1.00,386.10,103.51,480.00,0.06,129.41,334.31,1.00,0.06,0.00,0.06,3.00,125.55,
+     0.00,76.00,0.14,30.00,0.00,0.03,411.29,791.33,55.00,0.12,3.80,0.07,0.01,188.00,221.11,
+     0.01,0.15,1.00,0.18,144.32,15.00,0.00,0.05,0.00,3.00,0.00,0.20,0.00,0.14,62.00,0.06,
+     55.00,239.35,0.00,0.00,2.00,534.20,747.50,400.57,0.40,0.00,0.00,219.98,30.00,0.25,
+     1.00,70.00,0.02,0.04,0.00,0.00,7.00,747.50,8.67,0.06,271.01,28.00,5.63,75.39,0.46,
+     11.00,3.00,19.00,0.38,131.74,23.00,39.00,30249.41,0.00,202.68,2.00,64.94,0.03,2787.68,
+     0.54,35.00,0.02,106.03,25.00,1.00,0.10,45.00,2.00,0.00,0.00,0.00,0.00,449.27,172.38,
+     0.05,0.00,550.00,130.00,2006.55,0.07,0.00,0.03,0.00,5.00,0.21,22.00,0.05,0.01,1011.40,
+     0.00,4.00,3600.08,0.00,1.00,1.00,1.00,0.00,3.00,9.00,270.00,0.12,0.03,0.00,0.00,820.00,
+     1827.50,0.00,100.33,0.00,131.74,53.16,9557.97,7.00,0.00,11.00,180.81,0.00,0.01,0.04,
+     0.02,1480.00,0.92,0.05,0.00,15.00,6.00,0.00,161.42,28.00,169.00,35.60,4.00,0.12,0.00,
+     0.00,0.27,230.56,0.42,171.90,0.00,28407.51,1.00,883.10,0.00,261.00,9.00,1031.67,38.00,
+     0.00,0.04,1607.68,0.32,791.33,0.04,1403.00,2.00,2260.50,88.08,2260.50,0.00,0.12,0.75,
+     3.00,0.00,1231.68,0.07,0.60,0.24,0.00,0.00,0.15,0.14,753.50,1.00,95.00,7.00,0.26,
+     77.63,38.45,0.00,42.65,0.00,14.00,0.07,6.00,0.00,1911.59,43.00,386.77,1324.80,0.00,
+     518.00,10.00,10.00,0.11,0.00,1324.80,0.00,0.00,0.02,0.16,1.00,10492.12,5.00,0.94,
+     5.00,0.08,0.10,1.00,0.92,3731.49,105.81,6931.39,0.00,0.43,0.00,118.00,5323.71,81.66,
+     14042.09,0.08,0.20,0.40,96.64,0.00,0.08,4.00,1028.82,353.00,0.00,2.00,32.00,43.00,
+     5.16,75.39,900.00,232.10,3.00,5.00,6049.88,1.00,126.00,46.00,0.59,0.15,0.00,8.00,
+     7.00,0.00,577.25,0.00,0.07,2415.10,0.00,83.72,9.00,1.76,0.20,0.00,0.17,3278.65,155.26,
+     4415.50,22731.62,1.00,55.00,0.00,499.94,22.00,0.58,67.00,0.21,341.72,16.00,0.00,965.07,
+     17.00,138.41,0.00,0.00,1.00,0.14,1.00,0.02,0.35,1.69,369.00,1300.00,25.00,0.00,0.01,
+     0.00,0.00,0.00,0.00,52.00,8.00]]
+y = [1, 0]
+
+dtrain = xgb.DMatrix(X, label=y)
+
+param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic' }
+watchlist = [(dtrain,'train')]
+num_round = 2
+bst = xgb.train(param, dtrain, num_round, watchlist)
+
+if xgb.rabit.get_rank() == 0:
+    bst.save_model("test_issue3402.model")
+    xgb.rabit.tracker_print("Finished training\n")
+
+# Notify the tracker all training has been successful
+# This is only needed in distributed training.
+xgb.rabit.finalize()
diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh
index 32caf1825..fa02e9266 100755
--- a/tests/travis/run_test.sh
+++ b/tests/travis/run_test.sh
@@ -147,3 +147,10 @@ if [ ${TASK} == "cpp_test" ]; then
     echo "GTEST_PATH="${CACHE_PREFIX} >> config.mk
     make cover
 fi
+
+if [ ${TASK} == "distributed_test" ]; then
+    set -e
+    make all || exit -1
+    cd tests/distributed
+    ./runtests.sh
+fi
diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh
index 086ca6954..546e80344 100755
--- a/tests/travis/setup.sh
+++ b/tests/travis/setup.sh
@@ -27,3 +27,7 @@ if [ ${TASK} == "python_test" ] || [ ${TASK} == "python_lightweight_test" ]; the
     conda create -n python3 python=3.5
     conda create -n python2 python=2.7
 fi
+
+if [ ${TASK} == "distributed_test" ]; then
+    pip install --user kubernetes numpy scipy
+fi
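
Background on the core change in src/tree/updater_histmaker.cc: work_set_
stores feature IDs, and the column batch is indexed by feature ID, whereas
the loop counter i is only a position within work_set_. The two coincide
only when the work set covers every feature; GlobalProposalHistMaker works
over a proposed subset, so batch[i] read the wrong column, which is how the
wrong fid crashed the distributed algorithm. Below is a minimal standalone
C++ sketch of the indexing mistake; Column, columns, and work_set are
hypothetical names chosen for illustration, not xgboost's own.

    #include <cstdio>
    #include <vector>

    // Stand-in for one column of a column-major batch:
    // columns[fid] holds the data of feature fid.
    struct Column { int feature_id; };

    int main() {
      std::vector<Column> columns = {{0}, {1}, {2}, {3}};
      // A work set selecting a subset of features; note that
      // work_set[i] != i in general.
      std::vector<int> work_set = {1, 3};
      for (std::size_t i = 0; i < work_set.size(); ++i) {
        int fid = work_set[i];
        // The buggy pattern indexes by position (columns[i]);
        // the fix indexes by feature ID (columns[fid]).
        std::printf("i=%zu  buggy -> feature %d, fixed -> feature %d\n",
                    i, columns[i].feature_id, columns[fid].feature_id);
      }
      return 0;
    }

With work_set = {1, 3}, the buggy indexing reads features 0 and 1 instead of
1 and 3, which is the wrong-fid access that this patch removes.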