From 2715baef64c07546314977caf858d1cce1043b66 Mon Sep 17 00:00:00 2001 From: Philip Cho Date: Sun, 2 Apr 2017 11:37:40 -0700 Subject: [PATCH] Fix bugs in multithreaded ApplySplitSparseData() (#2161) * Bugfix 1: Fix segfault in multithreaded ApplySplitSparseData() When there are more threads than rows in rowset, some threads end up with empty ranges, causing them to crash. (iend - 1 needs to be accessible as part of algorithm) Fix: run only those threads with nonempty ranges. * Add regression test for Bugfix 1 * Moving python_omp_test to existing python test group Turns out you don't need to set "OMP_NUM_THREADS" to enable multithreading. Just add nthread parameter. * Bugfix 2: Fix corner case of ApplySplitSparseData() for categorical feature When split value is less than all cut points, split_cond is set incorrectly. Fix: set split_cond = -1 to indicate this scenario * Bugfix 3: Initialize data layout indicator before using it data_layout_ is accessed before being set; this variable determines whether feature 0 is included in feat_set. Fix: re-order code in InitData() to initialize data_layout_ first * Adding regression test for Bugfix 2 Unfortunately, no regression test for Bugfix 3, as there is no way to deterministically assign value to an uninitialized variable. --- include/xgboost/base.h | 1 + src/tree/updater_fast_hist.cc | 136 +++++++++++++++++----------------- tests/python/test_openmp.py | 43 +++++++++++ 3 files changed, 113 insertions(+), 67 deletions(-) create mode 100644 tests/python/test_openmp.py diff --git a/include/xgboost/base.h b/include/xgboost/base.h index fff566876..17dae0659 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -64,6 +64,7 @@ namespace xgboost { * used for feature index and row index. */ typedef uint32_t bst_uint; +typedef int32_t bst_int; /*! \brief long integers */ typedef uint64_t bst_ulong; // NOLINT(*) /*! \brief float type, used for storing statistics */ diff --git a/src/tree/updater_fast_hist.cc b/src/tree/updater_fast_hist.cc index 5abcbbb87..3ef812c9a 100644 --- a/src/tree/updater_fast_hist.cc +++ b/src/tree/updater_fast_hist.cc @@ -374,6 +374,24 @@ class FastHistMaker: public TreeUpdater { row_set_collection_.Init(); } + { + /* determine layout of data */ + const auto nrow = info.num_row; + const auto ncol = info.num_col; + const auto nnz = info.num_nonzero; + // number of discrete bins for feature 0 + const unsigned nbins_f0 = gmat.cut->row_ptr[1] - gmat.cut->row_ptr[0]; + if (nrow * ncol == nnz) { + // dense data with zero-based indexing + data_layout_ = kDenseDataZeroBased; + } else if (nbins_f0 == 0 && nrow * (ncol - 1) == nnz) { + // dense data with one-based indexing + data_layout_ = kDenseDataOneBased; + } else { + // sparse data + data_layout_ = kSparseData; + } + } { // store a pointer to the tree p_last_tree_ = &tree; @@ -398,24 +416,6 @@ class FastHistMaker: public TreeUpdater { << " is too small that no feature can be included"; feat_index.resize(n); } - { - /* determine layout of data */ - const auto nrow = info.num_row; - const auto ncol = info.num_col; - const auto nnz = info.num_nonzero; - // number of discrete bins for feature 0 - const unsigned nbins_f0 = gmat.cut->row_ptr[1] - gmat.cut->row_ptr[0]; - if (nrow * ncol == nnz) { - // dense data with zero-based indexing - data_layout_ = kDenseDataZeroBased; - } else if (nbins_f0 == 0 && nrow * (ncol - 1) == nnz) { - // dense data with one-based indexing - data_layout_ = kDenseDataOneBased; - } else { - // sparse data - data_layout_ = kSparseData; - } - } if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) { /* specialized code for dense data: choose the column that has a least positive number of discrete bins. @@ -520,11 +520,11 @@ class FastHistMaker: public TreeUpdater { const bst_float split_pt = (*p_tree)[nid].split_cond(); const bst_uint lower_bound = gmat.cut->row_ptr[fid]; const bst_uint upper_bound = gmat.cut->row_ptr[fid + 1]; - // set the split condition correctly - bst_uint split_cond = 0; - // set the condition + bst_int split_cond = -1; + // convert floating-point split_pt into corresponding bin_id + // split_cond = -1 indicates that split_pt is less than all known cut points for (unsigned i = gmat.cut->row_ptr[fid]; i < gmat.cut->row_ptr[fid + 1]; ++i) { - if (split_pt == gmat.cut->cut[i]) split_cond = i; + if (split_pt == gmat.cut->cut[i]) split_cond = static_cast(i); } const auto& rowset = row_set_collection_[nid]; @@ -547,7 +547,7 @@ class FastHistMaker: public TreeUpdater { const GHistIndexMatrix& gmat, std::vector* p_row_split_tloc, const Column& column, - bst_uint split_cond, + bst_int split_cond, bool default_left) { std::vector& row_split_tloc = *p_row_split_tloc; const int K = 8; // loop unrolling factor @@ -575,7 +575,7 @@ class FastHistMaker: public TreeUpdater { right.push_back(rid[k]); } } else { - if (rbin[k] + column.index_base <= split_cond) { + if (static_cast(rbin[k] + column.index_base) <= split_cond) { left.push_back(rid[k]); } else { right.push_back(rid[k]); @@ -595,7 +595,7 @@ class FastHistMaker: public TreeUpdater { right.push_back(rid); } } else { - if (rbin + column.index_base <= split_cond) { + if (static_cast(rbin + column.index_base) <= split_cond) { left.push_back(rid); } else { right.push_back(rid); @@ -609,7 +609,7 @@ class FastHistMaker: public TreeUpdater { std::vector* p_row_split_tloc, bst_uint lower_bound, bst_uint upper_bound, - bst_uint split_cond, + bst_int split_cond, bool default_left) { std::vector& row_split_tloc = *p_row_split_tloc; const int K = 8; // loop unrolling factor @@ -634,7 +634,7 @@ class FastHistMaker: public TreeUpdater { } for (int k = 0; k < K; ++k) { if (p[k] != row[k].index + row[k].size && *p[k] < upper_bound) { - if (*p[k] <= split_cond) { + if (static_cast(*p[k]) <= split_cond) { left.push_back(rid[k]); } else { right.push_back(rid[k]); @@ -655,7 +655,7 @@ class FastHistMaker: public TreeUpdater { auto& left = row_split_tloc[0].left; auto& right = row_split_tloc[0].right; if (p != row.index + row.size && *p < upper_bound) { - if (*p <= split_cond) { + if (static_cast(*p) <= split_cond) { left.push_back(rid); } else { right.push_back(rid); @@ -677,7 +677,7 @@ class FastHistMaker: public TreeUpdater { const Column& column, bst_uint lower_bound, bst_uint upper_bound, - bst_uint split_cond, + bst_int split_cond, bool default_left) { std::vector& row_split_tloc = *p_row_split_tloc; const bst_omp_uint nrows = rowset.end - rowset.begin; @@ -687,52 +687,54 @@ class FastHistMaker: public TreeUpdater { const bst_uint tid = omp_get_thread_num(); const bst_omp_uint ibegin = tid * nrows / nthread; const bst_omp_uint iend = (tid + 1) * nrows / nthread; - // search first nonzero row with index >= rowset[ibegin] - const uint32_t* p = std::lower_bound(column.row_ind, - column.row_ind + column.len, - rowset.begin[ibegin]); + if (ibegin < iend) { // ensure that [ibegin, iend) is nonempty range + // search first nonzero row with index >= rowset[ibegin] + const uint32_t* p = std::lower_bound(column.row_ind, + column.row_ind + column.len, + rowset.begin[ibegin]); - auto& left = row_split_tloc[tid].left; - auto& right = row_split_tloc[tid].right; - if (p != column.row_ind + column.len && *p <= rowset.begin[iend - 1]) { - bst_omp_uint cursor = p - column.row_ind; + auto& left = row_split_tloc[tid].left; + auto& right = row_split_tloc[tid].right; + if (p != column.row_ind + column.len && *p <= rowset.begin[iend - 1]) { + bst_omp_uint cursor = p - column.row_ind; - for (bst_omp_uint i = ibegin; i < iend; ++i) { - const bst_uint rid = rowset.begin[i]; - while (cursor < column.len - && column.row_ind[cursor] < rid - && column.row_ind[cursor] <= rowset.begin[iend - 1]) { - ++cursor; - } - if (cursor < column.len && column.row_ind[cursor] == rid) { - const T rbin = column.index[cursor]; - if (rbin + column.index_base <= split_cond) { - left.push_back(rid); - } else { - right.push_back(rid); + for (bst_omp_uint i = ibegin; i < iend; ++i) { + const bst_uint rid = rowset.begin[i]; + while (cursor < column.len + && column.row_ind[cursor] < rid + && column.row_ind[cursor] <= rowset.begin[iend - 1]) { + ++cursor; + } + if (cursor < column.len && column.row_ind[cursor] == rid) { + const T rbin = column.index[cursor]; + if (static_cast(rbin + column.index_base) <= split_cond) { + left.push_back(rid); + } else { + right.push_back(rid); + } + ++cursor; + } else { + // missing value + if (default_left) { + left.push_back(rid); + } else { + right.push_back(rid); + } + } + } + } else { // all rows in [ibegin, iend) have missing values + if (default_left) { + for (bst_omp_uint i = ibegin; i < iend; ++i) { + const bst_uint rid = rowset.begin[i]; + left.push_back(rid); } - ++cursor; } else { - // missing value - if (default_left) { - left.push_back(rid); - } else { + for (bst_omp_uint i = ibegin; i < iend; ++i) { + const bst_uint rid = rowset.begin[i]; right.push_back(rid); } } } - } else { // all rows in [ibegin, iend) have missing values - if (default_left) { - for (bst_omp_uint i = ibegin; i < iend; ++i) { - const bst_uint rid = rowset.begin[i]; - left.push_back(rid); - } - } else { - for (bst_omp_uint i = ibegin; i < iend; ++i) { - const bst_uint rid = rowset.begin[i]; - right.push_back(rid); - } - } } } } diff --git a/tests/python/test_openmp.py b/tests/python/test_openmp.py new file mode 100644 index 000000000..eb73daf2c --- /dev/null +++ b/tests/python/test_openmp.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +from scipy.sparse import csr_matrix +import xgboost as xgb +import unittest + + +class TestOMP(unittest.TestCase): + def test_omp(self): + # a contrived example where one node has an instance set of size 2. + data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + indices = [2, 1, 1, 2, 0, 0, 2, 0, 1, 3] + indptr = [0, 1, 2, 4, 5, 7, 9, 10] + A = csr_matrix((data, indices, indptr), shape=(7, 4)) + y = [1, 1, 0, 0, 0, 1, 1] + dtrain = xgb.DMatrix(A, label=y) + + # 1. use 3 threads to train a tree with an instance set of size 2 + param = {'booster': 'gbtree', + 'objective': 'binary:logistic', + 'grow_policy': 'lossguide', + 'tree_method': 'hist', + 'eval_metric': 'auc', + 'max_depth': 0, + 'max_leaves': 1024, + 'min_child_weight': 0, + 'nthread': 3} + + watchlist = [(dtrain, 'train')] + num_round = 1 + res = {} + xgb.train(param, dtrain, num_round, watchlist, evals_result=res) + assert res['train']['auc'][-1] > 0.99 + + # 2. vary number of threads and test whether you get the same result + param['nthread'] = 1 + res2 = {} + xgb.train(param, dtrain, num_round, watchlist, evals_result=res2) + assert res['train']['auc'][-1] == res2['train']['auc'][-1] + + param['nthread'] = 2 + res3 = {} + xgb.train(param, dtrain, num_round, watchlist, evals_result=res3) + assert res['train']['auc'][-1] == res3['train']['auc'][-1]