From 2715baef64c07546314977caf858d1cce1043b66 Mon Sep 17 00:00:00 2001
From: Philip Cho <chohyu01@cs.washington.edu>
Date: Sun, 2 Apr 2017 11:37:40 -0700
Subject: [PATCH] Fix bugs in multithreaded ApplySplitSparseData() (#2161)

* Bugfix 1: Fix segfault in multithreaded ApplySplitSparseData()

When there are more threads than rows in rowset, some threads end up
with empty ranges, causing them to crash. (iend - 1 needs to be
accessible as part of algorithm)

Fix: run only those threads with nonempty ranges.

* Add regression test for Bugfix 1

* Moving python_omp_test to existing python test group

Turns out you don't need to set "OMP_NUM_THREADS" to enable
multithreading. Just add nthread parameter.

* Bugfix 2: Fix corner case of ApplySplitSparseData() for categorical feature

When split value is less than all cut points, split_cond is set
incorrectly.

Fix: set split_cond = -1 to indicate this scenario

* Bugfix 3: Initialize data layout indicator before using it

data_layout_ is accessed before being set; this variable determines
whether feature 0 is included in feat_set.

Fix: re-order code in InitData() to initialize data_layout_ first

* Adding regression test for Bugfix 2

Unfortunately, no regression test for Bugfix 3, as there is no
way to deterministically assign value to an uninitialized variable.
---
 include/xgboost/base.h        |   1 +
 src/tree/updater_fast_hist.cc | 136 +++++++++++++++++-----------------
 tests/python/test_openmp.py   |  43 +++++++++++
 3 files changed, 113 insertions(+), 67 deletions(-)
 create mode 100644 tests/python/test_openmp.py

diff --git a/include/xgboost/base.h b/include/xgboost/base.h
index fff566876..17dae0659 100644
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -64,6 +64,7 @@ namespace xgboost {
  *  used for feature index and row index.
  */
 typedef uint32_t bst_uint;
+typedef int32_t bst_int;
 /*! \brief long integers */
 typedef uint64_t bst_ulong;  // NOLINT(*)
 /*! \brief float type, used for storing statistics */
diff --git a/src/tree/updater_fast_hist.cc b/src/tree/updater_fast_hist.cc
index 5abcbbb87..3ef812c9a 100644
--- a/src/tree/updater_fast_hist.cc
+++ b/src/tree/updater_fast_hist.cc
@@ -374,6 +374,24 @@ class FastHistMaker: public TreeUpdater {
         row_set_collection_.Init();
       }
 
+      {
+        /* determine layout of data */
+        const auto nrow = info.num_row;
+        const auto ncol = info.num_col;
+        const auto nnz = info.num_nonzero;
+        // number of discrete bins for feature 0
+        const unsigned nbins_f0 = gmat.cut->row_ptr[1] - gmat.cut->row_ptr[0];
+        if (nrow * ncol == nnz) {
+          // dense data with zero-based indexing
+          data_layout_ = kDenseDataZeroBased;
+        } else if (nbins_f0 == 0 && nrow * (ncol - 1) == nnz) {
+          // dense data with one-based indexing
+          data_layout_ = kDenseDataOneBased;
+        } else {
+          // sparse data
+          data_layout_ = kSparseData;
+        }
+      }
       {
         // store a pointer to the tree
         p_last_tree_ = &tree;
@@ -398,24 +416,6 @@ class FastHistMaker: public TreeUpdater {
             << " is too small that no feature can be included";
         feat_index.resize(n);
       }
-      {
-        /* determine layout of data */
-        const auto nrow = info.num_row;
-        const auto ncol = info.num_col;
-        const auto nnz = info.num_nonzero;
-        // number of discrete bins for feature 0
-        const unsigned nbins_f0 = gmat.cut->row_ptr[1] - gmat.cut->row_ptr[0];
-        if (nrow * ncol == nnz) {
-          // dense data with zero-based indexing
-          data_layout_ = kDenseDataZeroBased;
-        } else if (nbins_f0 == 0 && nrow * (ncol - 1) == nnz) {
-          // dense data with one-based indexing
-          data_layout_ = kDenseDataOneBased;
-        } else {
-          // sparse data
-          data_layout_ = kSparseData;
-        }
-      }
       if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
         /* specialized code for dense data:
            choose the column that has a least positive number of discrete bins.
@@ -520,11 +520,11 @@ class FastHistMaker: public TreeUpdater {
       const bst_float split_pt = (*p_tree)[nid].split_cond();
       const bst_uint lower_bound = gmat.cut->row_ptr[fid];
       const bst_uint upper_bound = gmat.cut->row_ptr[fid + 1];
-      // set the split condition correctly
-      bst_uint split_cond = 0;
-      // set the condition
+      bst_int split_cond = -1;
+      // convert floating-point split_pt into corresponding bin_id
+      // split_cond = -1 indicates that split_pt is less than all known cut points
       for (unsigned i = gmat.cut->row_ptr[fid]; i < gmat.cut->row_ptr[fid + 1]; ++i) {
-        if (split_pt == gmat.cut->cut[i]) split_cond = i;
+        if (split_pt == gmat.cut->cut[i]) split_cond = static_cast<bst_int>(i);
       }
 
       const auto& rowset = row_set_collection_[nid];
@@ -547,7 +547,7 @@ class FastHistMaker: public TreeUpdater {
                                     const GHistIndexMatrix& gmat,
                                     std::vector<RowSetCollection::Split>* p_row_split_tloc,
                                     const Column<T>& column,
-                                    bst_uint split_cond,
+                                    bst_int split_cond,
                                     bool default_left) {
       std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
       const int K = 8;  // loop unrolling factor
@@ -575,7 +575,7 @@ class FastHistMaker: public TreeUpdater {
               right.push_back(rid[k]);
             }
           } else {
-            if (rbin[k] + column.index_base <= split_cond) {
+            if (static_cast<bst_int>(rbin[k] + column.index_base) <= split_cond) {
               left.push_back(rid[k]);
             } else {
               right.push_back(rid[k]);
@@ -595,7 +595,7 @@ class FastHistMaker: public TreeUpdater {
             right.push_back(rid);
           }
         } else {
-          if (rbin + column.index_base <= split_cond) {
+          if (static_cast<bst_int>(rbin + column.index_base) <= split_cond) {
             left.push_back(rid);
           } else {
             right.push_back(rid);
@@ -609,7 +609,7 @@ class FastHistMaker: public TreeUpdater {
                                         std::vector<RowSetCollection::Split>* p_row_split_tloc,
                                         bst_uint lower_bound,
                                         bst_uint upper_bound,
-                                        bst_uint split_cond,
+                                        bst_int split_cond,
                                         bool default_left) {
       std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
       const int K = 8;  // loop unrolling factor
@@ -634,7 +634,7 @@ class FastHistMaker: public TreeUpdater {
         }
         for (int k = 0; k < K; ++k) {
           if (p[k] != row[k].index + row[k].size && *p[k] < upper_bound) {
-            if (*p[k] <= split_cond) {
+            if (static_cast<bst_int>(*p[k]) <= split_cond) {
               left.push_back(rid[k]);
             } else {
               right.push_back(rid[k]);
@@ -655,7 +655,7 @@ class FastHistMaker: public TreeUpdater {
         auto& left = row_split_tloc[0].left;
         auto& right = row_split_tloc[0].right;
         if (p != row.index + row.size && *p < upper_bound) {
-          if (*p <= split_cond) {
+          if (static_cast<bst_int>(*p) <= split_cond) {
             left.push_back(rid);
           } else {
             right.push_back(rid);
@@ -677,7 +677,7 @@ class FastHistMaker: public TreeUpdater {
                                     const Column<T>& column,
                                     bst_uint lower_bound,
                                     bst_uint upper_bound,
-                                    bst_uint split_cond,
+                                    bst_int split_cond,
                                     bool default_left) {
       std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
       const bst_omp_uint nrows = rowset.end - rowset.begin;
@@ -687,52 +687,54 @@ class FastHistMaker: public TreeUpdater {
         const bst_uint tid = omp_get_thread_num();
         const bst_omp_uint ibegin = tid * nrows / nthread;
         const bst_omp_uint iend = (tid + 1) * nrows / nthread;
-        // search first nonzero row with index >= rowset[ibegin]
-        const uint32_t* p = std::lower_bound(column.row_ind,
-                                             column.row_ind + column.len,
-                                             rowset.begin[ibegin]);
+        if (ibegin < iend) {  // ensure that [ibegin, iend) is nonempty range
+          // search first nonzero row with index >= rowset[ibegin]
+          const uint32_t* p = std::lower_bound(column.row_ind,
+                                               column.row_ind + column.len,
+                                               rowset.begin[ibegin]);
 
-        auto& left = row_split_tloc[tid].left;
-        auto& right = row_split_tloc[tid].right;
-        if (p != column.row_ind + column.len && *p <= rowset.begin[iend - 1]) {
-          bst_omp_uint cursor = p - column.row_ind;
+          auto& left = row_split_tloc[tid].left;
+          auto& right = row_split_tloc[tid].right;
+          if (p != column.row_ind + column.len && *p <= rowset.begin[iend - 1]) {
+            bst_omp_uint cursor = p - column.row_ind;
 
-          for (bst_omp_uint i = ibegin; i < iend; ++i) {
-            const bst_uint rid = rowset.begin[i];
-            while (cursor < column.len
-                   && column.row_ind[cursor] < rid
-                   && column.row_ind[cursor] <= rowset.begin[iend - 1]) {
-              ++cursor;
-            }
-            if (cursor < column.len && column.row_ind[cursor] == rid) {
-              const T rbin = column.index[cursor];
-              if (rbin + column.index_base <= split_cond) {
-                left.push_back(rid);
-              } else {
-                right.push_back(rid);
+            for (bst_omp_uint i = ibegin; i < iend; ++i) {
+              const bst_uint rid = rowset.begin[i];
+              while (cursor < column.len
+                     && column.row_ind[cursor] < rid
+                     && column.row_ind[cursor] <= rowset.begin[iend - 1]) {
+                ++cursor;
+              }
+              if (cursor < column.len && column.row_ind[cursor] == rid) {
+                const T rbin = column.index[cursor];
+                if (static_cast<bst_int>(rbin + column.index_base) <= split_cond) {
+                  left.push_back(rid);
+                } else {
+                  right.push_back(rid);
+                }
+                ++cursor;
+              } else {
+                // missing value
+                if (default_left) {
+                  left.push_back(rid);
+                } else {
+                  right.push_back(rid);
+                }
+              }
+            }
+          } else {  // all rows in [ibegin, iend) have missing values
+            if (default_left) {
+              for (bst_omp_uint i = ibegin; i < iend; ++i) {
+                const bst_uint rid = rowset.begin[i];
+                left.push_back(rid);
               }
-              ++cursor;
             } else {
-              // missing value
-              if (default_left) {
-                left.push_back(rid);
-              } else {
+              for (bst_omp_uint i = ibegin; i < iend; ++i) {
+                const bst_uint rid = rowset.begin[i];
                 right.push_back(rid);
               }
             }
           }
-        } else {  // all rows in [ibegin, iend) have missing values
-          if (default_left) {
-            for (bst_omp_uint i = ibegin; i < iend; ++i) {
-              const bst_uint rid = rowset.begin[i];
-              left.push_back(rid);
-            }
-          } else {
-            for (bst_omp_uint i = ibegin; i < iend; ++i) {
-              const bst_uint rid = rowset.begin[i];
-              right.push_back(rid);
-            }
-          }
         }
       }
     }
diff --git a/tests/python/test_openmp.py b/tests/python/test_openmp.py
new file mode 100644
index 000000000..eb73daf2c
--- /dev/null
+++ b/tests/python/test_openmp.py
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+from scipy.sparse import csr_matrix
+import xgboost as xgb
+import unittest
+
+
+class TestOMP(unittest.TestCase):
+    def test_omp(self):
+        # a contrived example where one node has an instance set of size 2.
+        data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+        indices = [2, 1, 1, 2, 0, 0, 2, 0, 1, 3]
+        indptr = [0, 1, 2, 4, 5, 7, 9, 10]
+        A = csr_matrix((data, indices, indptr), shape=(7, 4))
+        y = [1, 1, 0, 0, 0, 1, 1]
+        dtrain = xgb.DMatrix(A, label=y)
+
+        # 1. use 3 threads to train a tree with an instance set of size 2
+        param = {'booster': 'gbtree',
+                 'objective': 'binary:logistic',
+                 'grow_policy': 'lossguide',
+                 'tree_method': 'hist',
+                 'eval_metric': 'auc',
+                 'max_depth': 0,
+                 'max_leaves': 1024,
+                 'min_child_weight': 0,
+                 'nthread': 3}
+
+        watchlist = [(dtrain, 'train')]
+        num_round = 1
+        res = {}
+        xgb.train(param, dtrain, num_round, watchlist, evals_result=res)
+        assert res['train']['auc'][-1] > 0.99
+
+        # 2. vary number of threads and test whether you get the same result
+        param['nthread'] = 1
+        res2 = {}
+        xgb.train(param, dtrain, num_round, watchlist, evals_result=res2)
+        assert res['train']['auc'][-1] == res2['train']['auc'][-1]
+
+        param['nthread'] = 2
+        res3 = {}
+        xgb.train(param, dtrain, num_round, watchlist, evals_result=res3)
+        assert res['train']['auc'][-1] == res3['train']['auc'][-1]