Make `HistCutMatrix::Init` aware of groups. (#4115)

* Add checks for group size.
* Simple docs.
* Search group index during hist cut matrix initialization.

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Jiaming Yuan
2019-02-16 04:39:41 +08:00
committed by GitHub
parent 37ddfd7d6e
commit 754fe8142b
6 changed files with 188 additions and 22 deletions

View File

@@ -24,7 +24,25 @@
namespace xgboost {
namespace common {
/*! \brief Default constructor; calls `monitor_.Init` with the label "HistCutMatrix". */
HistCutMatrix::HistCutMatrix() {
monitor_.Init("HistCutMatrix");
}
size_t HistCutMatrix::SearchGroupIndFromBaseRow(
std::vector<bst_uint> const& group_ptr, size_t const base_rowid) const {
using KIt = std::vector<bst_uint>::const_iterator;
KIt res = std::lower_bound(group_ptr.cbegin(), group_ptr.cend() - 1, base_rowid);
// Cannot use CHECK_NE because it will try to print the iterator.
bool const found = res != group_ptr.cend() - 1;
if (!found) {
LOG(FATAL) << "Row " << base_rowid << " does not lie in any group!\n";
}
size_t group_ind = std::distance(group_ptr.cbegin(), res);
return group_ind;
}
void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
monitor_.Start("Init");
const MetaInfo& info = p_fmat->Info();
// safe factor for better accuracy
@@ -33,30 +51,50 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
const int nthread = omp_get_max_threads();
auto nstep = static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
auto ncol = static_cast<unsigned>(info.num_col_);
unsigned const nstep =
static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
unsigned const ncol = static_cast<unsigned>(info.num_col_);
sketchs.resize(info.num_col_);
for (auto& s : sketchs) {
s.Init(info.num_row_, 1.0 / (max_num_bins * kFactor));
}
const auto& weights = info.weights_.HostVector();
// Data groups, used in ranking.
std::vector<bst_uint> const& group_ptr = info.group_ptr_;
size_t const num_groups = group_ptr.size() == 0 ? 0 : group_ptr.size() - 1;
// Use group index for weights?
bool const use_group_ind = num_groups != 0 && weights.size() != info.num_row_;
for (const auto &batch : p_fmat->GetRowBatches()) {
#pragma omp parallel num_threads(nthread)
size_t group_ind = 0;
if (use_group_ind) {
group_ind = this->SearchGroupIndFromBaseRow(group_ptr, batch.base_rowid);
}
#pragma omp parallel num_threads(nthread) firstprivate(group_ind, use_group_ind)
{
CHECK_EQ(nthread, omp_get_num_threads());
auto tid = static_cast<unsigned>(omp_get_thread_num());
unsigned begin = std::min(nstep * tid, ncol);
unsigned end = std::min(nstep * (tid + 1), ncol);
// do not iterate if no columns are assigned to the thread
if (begin < end && end <= ncol) {
for (size_t i = 0; i < batch.Size(); ++i) { // NOLINT(*)
size_t ridx = batch.base_rowid + i;
SparsePage::Inst inst = batch[i];
for (auto& ins : inst) {
if (ins.index >= begin && ins.index < end) {
sketchs[ins.index].Push(ins.fvalue,
weights.size() > 0 ? weights[ridx] : 1.0f);
size_t const ridx = batch.base_rowid + i;
SparsePage::Inst const inst = batch[i];
if (use_group_ind &&
group_ptr[group_ind] == ridx &&
// maximum equals to weights.size() - 1
group_ind < num_groups - 1) {
// move to next group
group_ind++;
}
for (auto const& entry : inst) {
if (entry.index >= begin && entry.index < end) {
size_t w_idx = use_group_ind ? group_ind : ridx;
sketchs[entry.index].Push(entry.fvalue, info.GetWeight(w_idx));
}
}
}
@@ -65,6 +103,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
}
Init(&sketchs, max_num_bins);
monitor_.Stop("Init");
}
void HistCutMatrix::Init

View File

@@ -13,6 +13,7 @@
#include "row_set.h"
#include "../tree/param.h"
#include "./quantile.h"
#include "./timer.h"
#include "../include/rabit/rabit.h"
namespace xgboost {
@@ -35,6 +36,14 @@ struct HistCutMatrix {
void Init(DMatrix* p_fmat, uint32_t max_num_bins);
void Init(std::vector<WXQSketch>* sketchs, uint32_t max_num_bins);
HistCutMatrix();
protected:
virtual size_t SearchGroupIndFromBaseRow(
std::vector<bst_uint> const& group_ptr, size_t const base_rowid) const;
Monitor monitor_;
};
/*! \brief Builds the cut matrix on the GPU */