From 549c8d6ae998a5aa4578102a517e4b4def8761c9 Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Date: Sun, 17 Feb 2019 16:01:07 -0800
Subject: [PATCH] Prevent empty quantiles in fast hist (#4155)

* Prevent empty quantiles

* Revise and improve unit tests for quantile hist

* Remove unnecessary comment

* Add #2943 as a test case

* Skip test if no sklearn

* Revise misleading comments
---
 src/common/hist_util.cc              |  21 ++--
 tests/cpp/tree/test_quantile_hist.cc | 182 ++++++++++++++++++++++-----
 tests/python/test_updaters.py        |  15 +++
 3 files changed, 179 insertions(+), 39 deletions(-)
diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index ad2e1696e..8e1340313 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -148,14 +148,17 @@ void HistCutMatrix::Init
       }
     }
     // push a value that is greater than anything
-    if (a.size != 0) {
-      bst_float cpt = a.data[a.size - 1].value;
-      // this must be bigger than last value in a scale
-      bst_float last = cpt + (fabs(cpt) + 1e-5);
-      cut.push_back(last);
-    }
+    const bst_float cpt
+      = (a.size > 0) ? a.data[a.size - 1].value : this->min_val[fid];
+    // this must be bigger than last value in a scale
+    const bst_float last = cpt + (fabs(cpt) + 1e-5);
+    cut.push_back(last);
 
-    row_ptr.push_back(static_cast<bst_uint>(cut.size()));
+    // Ensure that every feature gets at least one quantile point
+    CHECK_LE(cut.size(), std::numeric_limits<uint32_t>::max());
+    auto cut_size = static_cast<uint32_t>(cut.size());
+    CHECK_GT(cut_size, row_ptr.back());
+    row_ptr.push_back(cut_size);
   }
 }
 
@@ -165,7 +168,9 @@ uint32_t HistCutMatrix::GetBinIdx(const Entry& e) {
   auto cend = cut.begin() + row_ptr[fid + 1];
   CHECK(cbegin != cend);
   auto it = std::upper_bound(cbegin, cend, e.fvalue);
-  if (it == cend) it = cend - 1;
+  if (it == cend) {
+    it = cend - 1;
+  }
   uint32_t idx = static_cast<uint32_t>(it - cut.begin());
   return idx;
 }
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index 47ed36c92..819860e8a 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -4,11 +4,13 @@
 #include "../helpers.h"
 #include "../../../src/tree/param.h"
 #include "../../../src/tree/updater_quantile_hist.h"
+#include "../../../src/tree/split_evaluator.h"
 #include "../../../src/common/host_device_vector.h"
 
 #include <xgboost/tree_updater.h>
 #include <gtest/gtest.h>
 
+#include <algorithm>
 #include <vector>
 #include <string>
 
@@ -22,43 +24,105 @@ class QuantileHistMock : public QuantileHistMaker {
     using RealImpl = QuantileHistMaker::Builder;
 
     BuilderMock(const TrainParam& param,
-                     std::unique_ptr<TreeUpdater> pruner,
-                     std::unique_ptr<SplitEvaluator> spliteval)
+                std::unique_ptr<TreeUpdater> pruner,
+                std::unique_ptr<SplitEvaluator> spliteval)
         : RealImpl(param, std::move(pruner), std::move(spliteval)) {}
 
    public:
     void TestInitData(const GHistIndexMatrix& gmat,
-                  const std::vector<GradientPair>& gpair,
-                  const DMatrix& fmat,
-                  const RegTree& tree) {
-      RealImpl::InitData(gmat, gpair, fmat, tree);
+                      const std::vector<GradientPair>& gpair,
+                      DMatrix* p_fmat,
+                      const RegTree& tree) {
+      RealImpl::InitData(gmat, gpair, *p_fmat, tree);
       ASSERT_EQ(data_layout_, kSparseData);
+
+      /* The creation of HistCutMatrix and GHistIndexMatrix are not technically
+       * part of QuantileHist updater logic, but we include it here because
+       * QuantileHist updater object currently stores GHistIndexMatrix
+       * internally. According to https://github.com/dmlc/xgboost/pull/3803,
+       * we should eventually move GHistIndexMatrix out of the QuantileHist
+       * updater. */
+
+      const size_t num_row = p_fmat->Info().num_row_;
+      const size_t num_col = p_fmat->Info().num_col_;
+      /* Validate HistCutMatrix */
+      ASSERT_EQ(gmat.cut.row_ptr.size(), num_col + 1);
+      for (size_t fid = 0; fid < num_col; ++fid) {
+        // Each feature must have at least one quantile point (cut)
+        const size_t ibegin = gmat.cut.row_ptr[fid];
+        const size_t iend = gmat.cut.row_ptr[fid + 1];
+        ASSERT_LT(ibegin, iend);
+        for (size_t i = ibegin; i < iend - 1; ++i) {
+          // Quantile points must be sorted in ascending order
+          // No duplicates allowed
+          ASSERT_LT(gmat.cut.cut[i], gmat.cut.cut[i + 1]);
+        }
+      }
+
+      /* Validate GHistIndexMatrix */
+      ASSERT_EQ(gmat.row_ptr.size(), num_row + 1);
+      ASSERT_LT(*std::max_element(gmat.index.begin(), gmat.index.end()),
+                gmat.cut.row_ptr.back());
+      for (const auto& batch : p_fmat->GetRowBatches()) {
+        for (size_t i = 0; i < batch.Size(); ++i) {
+          const size_t rid = batch.base_rowid + i;
+          ASSERT_LT(rid, num_row);
+          const size_t gmat_row_offset = gmat.row_ptr[rid];
+          ASSERT_LT(gmat_row_offset, gmat.index.size());
+          SparsePage::Inst inst = batch[i];
+          ASSERT_EQ(gmat.row_ptr[rid] + inst.size(), gmat.row_ptr[rid + 1]);
+          for (size_t j = 0; j < inst.size(); ++j) {
+            // Each entry of GHistIndexMatrix represents a bin ID
+            const size_t bin_id = gmat.index[gmat_row_offset + j];
+            const size_t fid = inst[j].index;
+            // The bin ID must correspond to correct feature
+            ASSERT_GE(bin_id, gmat.cut.row_ptr[fid]);
+            ASSERT_LT(bin_id, gmat.cut.row_ptr[fid + 1]);
+            // The bin ID must correspond to a region between two
+            // suitable quantile points
+            ASSERT_LT(inst[j].fvalue, gmat.cut.cut[bin_id]);
+            if (bin_id > gmat.cut.row_ptr[fid]) {
+              ASSERT_GE(inst[j].fvalue, gmat.cut.cut[bin_id - 1]);
+            } else {
+              ASSERT_GE(inst[j].fvalue, gmat.cut.min_val[fid]);
+            }
+          }
+        }
+      }
     }
 
     void TestBuildHist(int nid,
                        const GHistIndexMatrix& gmat,
                        const DMatrix& fmat,
                        const RegTree& tree) {
-      std::vector<GradientPair> gpair =
+      const std::vector<GradientPair> gpair =
           { {0.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {0.27f, 0.28f},
             {0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f} };
       RealImpl::InitData(gmat, gpair, fmat, tree);
-      GHistIndexBlockMatrix quantile_index_block;
+      GHistIndexBlockMatrix dummy;
       hist_.AddHistRow(nid);
       BuildHist(gpair, row_set_collection_[nid],
-                gmat, quantile_index_block, hist_[nid], false);
-      std::vector<GradientPairPrecise> solution {
-        {0.27f, 0.29f}, {0.27f, 0.29f}, {0.47f, 0.49f},
-        {0.27f, 0.29f}, {0.57f, 0.59f}, {0.26f, 0.27f},
-        {0.37f, 0.39f}, {0.23f, 0.24f}, {0.37f, 0.39f},
-        {0.27f, 0.28f}, {0.27f, 0.29f}, {0.37f, 0.39f},
-        {0.26f, 0.27f}, {0.23f, 0.24f}, {0.57f, 0.59f},
-        {0.47f, 0.49f}, {0.47f, 0.49f}, {0.37f, 0.39f},
-        {0.26f, 0.27f}, {0.23f, 0.24f}, {0.27f, 0.28f},
-        {0.57f, 0.59f}, {0.23f, 0.24f}, {0.47f, 0.49f}};
+                gmat, dummy, hist_[nid], false);
 
+      // Check if number of histogram bins is correct
+      ASSERT_EQ(hist_[nid].size(), gmat.cut.row_ptr.back());
+      std::vector<GradientPairPrecise> histogram_expected(hist_[nid].size());
+
+      // Compute the correct histogram (histogram_expected)
+      const size_t num_row = fmat.Info().num_row_;
+      CHECK_EQ(gpair.size(), num_row);
+      for (size_t rid = 0; rid < num_row; ++rid) {
+        const size_t ibegin = gmat.row_ptr[rid];
+        const size_t iend = gmat.row_ptr[rid + 1];
+        for (size_t i = ibegin; i < iend; ++i) {
+          const size_t bin_id = gmat.index[i];
+          histogram_expected[bin_id] += GradientPairPrecise(gpair[rid]);
+        }
+      }
+
+      // Now validate the computed histogram returned by BuildHist
       for (size_t i = 0; i < hist_[nid].size(); ++i) {
-        GradientPairPrecise sol = solution[i];
+        GradientPairPrecise sol = histogram_expected[i];
         ASSERT_NEAR(sol.GetGrad(), hist_[nid][i].GetGrad(), kEps);
         ASSERT_NEAR(sol.GetHess(), hist_[nid][i].GetHess(), kEps);
       }
@@ -67,10 +131,11 @@ class QuantileHistMock : public QuantileHistMaker {
     void TestEvaluateSplit(const GHistIndexBlockMatrix& quantile_index_block,
                            const RegTree& tree) {
       std::vector<GradientPair> row_gpairs =
-          { {0.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {0.27f, 0.28f},
-            {0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f} };
+          { {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f},
+            {0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f} };
       size_t constexpr max_bins = 4;
-      auto dmat = CreateDMatrix(n_rows, n_cols, 0, 3);  // dense
+      auto dmat = CreateDMatrix(n_rows, n_cols, 0, 3);
+        // dense, no missing values
 
       common::GHistIndexMatrix gmat;
       gmat.Init((*dmat).get(), max_bins);
@@ -82,14 +147,67 @@ class QuantileHistMock : public QuantileHistMaker {
                 gmat, quantile_index_block, hist_[0], false);
 
       RealImpl::InitNewNode(0, gmat, row_gpairs, *(*dmat), tree);
-      // Manipulate the root_gain so that I don't have to invent an actual
-      // split.  Yes, I'm cheating.
-      snode_[0].root_gain = 0.8;
-      RealImpl::EvaluateSplit(0, gmat, hist_, *(*dmat), tree);
 
-      ASSERT_NEAR(snode_.at(0).best.loss_chg, 0.7128048, kEps);
-      ASSERT_EQ(snode_.at(0).best.SplitIndex(), 10);
-      ASSERT_NEAR(snode_.at(0).best.split_value, 0.182258, kEps);
+      /* Compute correct split (best_split) using the computed histogram */
+      const size_t num_row = dmat->get()->Info().num_row_;
+      const size_t num_feature = dmat->get()->Info().num_col_;
+      CHECK_EQ(num_row, row_gpairs.size());
+      // Compute total gradient for all data points
+      GradientPairPrecise total_gpair;
+      for (const auto& e : row_gpairs) {
+        total_gpair += GradientPairPrecise(e);
+      }
+      // Initialize split evaluator
+      std::unique_ptr<SplitEvaluator> evaluator(SplitEvaluator::Create("elastic_net"));
+      evaluator->Init({});
+
+      // Now enumerate all feature*threshold combination to get best split
+      // To simplify logic, we make some assumptions:
+      // 1) no missing values in data
+      // 2) no regularization, i.e. set min_child_weight, reg_lambda, reg_alpha,
+      //    and max_delta_step to 0.
+      bst_float best_split_gain = 0.0f;
+      size_t best_split_threshold, best_split_feature;
+      // Enumerate all features
+      for (size_t fid = 0; fid < num_feature; ++fid) {
+        const size_t bin_id_min = gmat.cut.row_ptr[fid];
+        const size_t bin_id_max = gmat.cut.row_ptr[fid + 1];
+        // Enumerate all bin ID in [bin_id_min, bin_id_max), i.e. every possible
+        // choice of thresholds for feature fid
+        for (size_t split_thresh = bin_id_min;
+             split_thresh < bin_id_max; ++split_thresh) {
+          // left_sum, right_sum: Gradient sums for data points whose feature
+          //                      value is left/right side of the split threshold
+          GradientPairPrecise left_sum, right_sum;
+          for (size_t rid = 0; rid < num_row; ++rid) {
+            for (size_t offset = gmat.row_ptr[rid];
+                 offset < gmat.row_ptr[rid + 1]; ++offset) {
+              const size_t bin_id = gmat.index[offset];
+              if (bin_id >= bin_id_min && bin_id < bin_id_max) {
+                if (bin_id <= split_thresh) {
+                  left_sum += GradientPairPrecise(row_gpairs[rid]);
+                } else {
+                  right_sum += GradientPairPrecise(row_gpairs[rid]);
+                }
+              }
+            }
+          }
+          // Now compute gain (change in loss)
+          const auto split_gain
+            = evaluator->ComputeSplitScore(0, fid, GradStats(left_sum),
+                                           GradStats(right_sum));
+          if (split_gain > best_split_gain) {
+            best_split_gain = split_gain;
+            best_split_feature = fid;
+            best_split_threshold = split_thresh;
+          }
+        }
+      }
+
+      /* Now compare against result given by EvaluateSplit() */
+      RealImpl::EvaluateSplit(0, gmat, hist_, *(*dmat), tree);
+      ASSERT_EQ(snode_[0].best.SplitIndex(), best_split_feature);
+      ASSERT_EQ(snode_[0].best.split_value, gmat.cut.cut[best_split_threshold]);
 
       delete dmat;
     }
@@ -128,7 +246,7 @@ class QuantileHistMock : public QuantileHistMaker {
         { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f},
           {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} };
 
-    builder_->TestInitData(gmat, gpair, *(*dmat), tree);
+    builder_->TestInitData(gmat, gpair, dmat->get(), tree);
   }
 
   void TestBuildHist() {
@@ -169,7 +287,9 @@ TEST(Updater, QuantileHist_BuildHist) {
 TEST(Updater, QuantileHist_EvalSplits) {
   std::vector<std::pair<std::string, std::string>> cfg
       {{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())},
-       {"split_evaluator", "elastic_net"}};
+       {"split_evaluator", "elastic_net"},
+       {"reg_lambda", "0"}, {"reg_alpha", "0"}, {"max_delta_step", "0"},
+       {"min_child_weight", "0"}};
   QuantileHistMock maker(cfg);
   maker.TestEvaluateSplit();
 }
diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py
index b19710e35..5cc41e551 100644
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -2,6 +2,7 @@ import testing as tm
 import unittest
 import pytest
 import xgboost as xgb
+import numpy as np
 
 try:
     from regression_test_utilities import run_suite, parameter_combinations, \
@@ -59,3 +60,17 @@ class TestUpdaters(unittest.TestCase):
                   evals_result=exact_res)
         assert hist_res['train']['auc'] == exact_res['train']['auc']
         assert hist_res['test']['auc'] == exact_res['test']['auc']
+
+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_fast_histmaker_degenerate_case(self):
+        # Test a degenerate case where the quantile sketcher won't return any
+        # quantile points for a particular feature (the second feature in
+        # this example). Source: https://github.com/dmlc/xgboost/issues/2943
+        nan = np.nan
+        param = {'missing': nan, 'tree_method': 'hist'}
+        model = xgb.XGBRegressor(**param)
+        X = [[6.18827160e+05, 1.73000000e+02], [6.37345679e+05, nan],
+             [6.38888889e+05, nan], [6.28086420e+05, nan]]
+        y = [1000000., 0., 0., 500000.]
+        w = [0, 0, 1, 0]
+        model.fit(X, y, sample_weight=w)