Make `HistCutMatrix::Init' be aware of groups. (#4115)

* Add checks for group size.
* Simple docs.
* Search group index during hist cut matrix initialization.

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
2019-02-16 04:39:41 +08:00
parent 37ddfd7d6e
commit 754fe8142b
6 changed files with 188 additions and 22 deletions
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -24,7 +24,25 @@
 namespace xgboost {
 namespace common {

+HistCutMatrix::HistCutMatrix() {  // Default constructor; its only work is initialising the monitor member.
+  monitor_.Init("HistCutMatrix");  // presumably the label under which the Start("Init")/Stop("Init") timings are reported -- confirm in timer.h
+}
+
+size_t HistCutMatrix::SearchGroupIndFromBaseRow(  // Locate the group_ptr index to start from for a batch whose first row is base_rowid.
+    std::vector<bst_uint> const& group_ptr, size_t const base_rowid) const {  // assumes group_ptr is non-empty and ascending (needed by cend() - 1 and lower_bound) -- TODO confirm
+  using KIt = std::vector<bst_uint>::const_iterator;
+  KIt res = std::lower_bound(group_ptr.cbegin(), group_ptr.cend() - 1, base_rowid);  // first boundary >= base_rowid; the last entry is left out of the search
+  // Compare iterators by hand: CHECK_NE would try to print them in its failure message.
+  bool const found = res != group_ptr.cend() - 1;
+  if (!found) {  // every searched boundary is smaller than base_rowid
+    LOG(FATAL) << "Row " << base_rowid << " does not lie in any group!\n";
+  }
+  size_t group_ind = std::distance(group_ptr.cbegin(), res);  // NOTE(review): for a base_rowid strictly between two boundaries this is the index of the later one -- confirm that is intended
+  return group_ind;
+}
+
 void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
+  monitor_.Start("Init");
  const MetaInfo& info = p_fmat->Info();

  // safe factor for better accuracy
@@ -33,30 +51,50 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {

  const int nthread = omp_get_max_threads();

-  auto nstep = static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
-  auto ncol = static_cast<unsigned>(info.num_col_);
+  unsigned const nstep =
+      static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
+  unsigned const ncol = static_cast<unsigned>(info.num_col_);
  sketchs.resize(info.num_col_);
  for (auto& s : sketchs) {
    s.Init(info.num_row_, 1.0 / (max_num_bins * kFactor));
  }

  const auto& weights = info.weights_.HostVector();
+
+  // Data groups, used in ranking.
+  std::vector<bst_uint> const& group_ptr = info.group_ptr_;
+  size_t const num_groups = group_ptr.size() == 0 ? 0 : group_ptr.size() - 1;
+  // Use group index for weights?
+  bool const use_group_ind = num_groups != 0 && weights.size() != info.num_row_;
+
  for (const auto &batch : p_fmat->GetRowBatches()) {
-    #pragma omp parallel num_threads(nthread)
+    size_t group_ind = 0;
+    if (use_group_ind) {
+      group_ind = this->SearchGroupIndFromBaseRow(group_ptr, batch.base_rowid);
+    }
+#pragma omp parallel num_threads(nthread) firstprivate(group_ind, use_group_ind)
    {
      CHECK_EQ(nthread, omp_get_num_threads());
      auto tid = static_cast<unsigned>(omp_get_thread_num());
      unsigned begin = std::min(nstep * tid, ncol);
      unsigned end = std::min(nstep * (tid + 1), ncol);
+
      // do not iterate if no columns are assigned to the thread
      if (begin < end && end <= ncol) {
        for (size_t i = 0; i < batch.Size(); ++i) { // NOLINT(*)
-          size_t ridx = batch.base_rowid + i;
-          SparsePage::Inst inst = batch[i];
-          for (auto& ins : inst) {
-            if (ins.index >= begin && ins.index < end) {
-              sketchs[ins.index].Push(ins.fvalue,
-                                      weights.size() > 0 ? weights[ridx] : 1.0f);
+          size_t const ridx = batch.base_rowid + i;
+          SparsePage::Inst const inst = batch[i];
+          if (use_group_ind &&
+              group_ptr[group_ind] == ridx &&
+              // maximum equals to weights.size() - 1
+              group_ind < num_groups - 1) {
+            // move to next group
+            group_ind++;
+          }
+          for (auto const& entry : inst) {
+            if (entry.index >= begin && entry.index < end) {
+              size_t w_idx = use_group_ind ? group_ind : ridx;
+              sketchs[entry.index].Push(entry.fvalue, info.GetWeight(w_idx));
            }
          }
        }
@@ -65,6 +103,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
  }

  Init(&sketchs, max_num_bins);
+  monitor_.Stop("Init");
 }

 void HistCutMatrix::Init
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -13,6 +13,7 @@
 #include "row_set.h"
 #include "../tree/param.h"
 #include "./quantile.h"
+#include "./timer.h"
 #include "../include/rabit/rabit.h"

 namespace xgboost {
@@ -35,6 +36,14 @@ struct HistCutMatrix {
  void Init(DMatrix* p_fmat, uint32_t max_num_bins);

  void Init(std::vector<WXQSketch>* sketchs, uint32_t max_num_bins);
+
+  HistCutMatrix();
+
+ protected:
+  virtual size_t SearchGroupIndFromBaseRow(
+      std::vector<bst_uint> const& group_ptr, size_t const base_rowid) const;
+
+  Monitor monitor_;
 };

 /*! \brief Builds the cut matrix on the GPU */
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -474,12 +474,16 @@ class LearnerImpl : public Learner {

  void UpdateOneIter(int iter, DMatrix* train) override {
    monitor_.Start("UpdateOneIter");
+
+    // TODO(trivialfis): Merge the duplicated code with BoostOneIter
    CHECK(ModelInitialized())
        << "Always call InitModel or LoadModel before update";
    if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
      common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
    }
+    this->ValidateDMatrix(train);
    this->PerformTreeMethodHeuristic(train);
+
    monitor_.Start("PredictRaw");
    this->PredictRaw(train, &preds_);
    monitor_.Stop("PredictRaw");
@@ -493,10 +497,15 @@ class LearnerImpl : public Learner {
  void BoostOneIter(int iter, DMatrix* train,
                    HostDeviceVector<GradientPair>* in_gpair) override {
    monitor_.Start("BoostOneIter");
+
+    CHECK(ModelInitialized())  // same precondition that UpdateOneIter checks before doing any work
+        << "Always call InitModel or LoadModel before boost.";
    if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
      common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
    }
+    this->ValidateDMatrix(train);  // CHECK-fails if group and weight metadata disagree, before the booster runs
    this->PerformTreeMethodHeuristic(train);
+
    gbm_->DoBoost(train, in_gpair);
    monitor_.Stop("BoostOneIter");
  }
@@ -711,7 +720,7 @@ class LearnerImpl : public Learner {
      mparam_.num_feature = num_feature;
    }
    CHECK_NE(mparam_.num_feature, 0)
-        << "0 feature is supplied.  Are you using raw Booster?";
+        << "0 feature is supplied.  Are you using raw Booster interface?";
    // setup
    cfg_["num_feature"] = common::ToString(mparam_.num_feature);
    CHECK(obj_ == nullptr && gbm_ == nullptr);
@@ -736,6 +745,19 @@ class LearnerImpl : public Learner {
    gbm_->PredictBatch(data, out_preds, ntree_limit);
  }

+  void ValidateDMatrix(DMatrix* p_fmat) {  // Sanity-check the group/weight metadata of a training matrix; CHECK-fails on mismatch.
+    MetaInfo const& info = p_fmat->Info();
+    auto const& weights = info.weights_.HostVector();
+    if (info.group_ptr_.size() != 0 && weights.size() != 0) {  // nothing to verify unless both groups and weights are present
+      CHECK(weights.size() == info.group_ptr_.size() - 1)  // group_ptr_ has one more entry than there are groups
+          << "\n"
+          << "weights size: " << weights.size()            << ", "
+          << "groups size: "  << info.group_ptr_.size() -1 << ", "
+          << "num rows: "     << p_fmat->Info().num_row_   << "\n"  // row count is printed for context only; it is not part of the check
+          << "Number of weights should be equal to number of groups in ranking task.";
+    }
+  }
+
  // model parameter
  LearnerModelParam mparam_;
  // training parameter