Make `HistCutMatrix::Init` aware of groups. (#4115)
* Add checks for group size. * Simple docs. * Search group index during hist cut matrix initialization. Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com> Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -24,7 +24,25 @@
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
/*!
 * \brief Default constructor.
 *
 * Initialises the member monitor with the label "HistCutMatrix"; the same
 * monitor is started/stopped around `Init` elsewhere in this class.
 */
HistCutMatrix::HistCutMatrix() {
  this->monitor_.Init("HistCutMatrix");
}
|
||||
|
||||
size_t HistCutMatrix::SearchGroupIndFromBaseRow(
|
||||
std::vector<bst_uint> const& group_ptr, size_t const base_rowid) const {
|
||||
using KIt = std::vector<bst_uint>::const_iterator;
|
||||
KIt res = std::lower_bound(group_ptr.cbegin(), group_ptr.cend() - 1, base_rowid);
|
||||
// Cannot use CHECK_NE because it will try to print the iterator.
|
||||
bool const found = res != group_ptr.cend() - 1;
|
||||
if (!found) {
|
||||
LOG(FATAL) << "Row " << base_rowid << " does not lie in any group!\n";
|
||||
}
|
||||
size_t group_ind = std::distance(group_ptr.cbegin(), res);
|
||||
return group_ind;
|
||||
}
|
||||
|
||||
void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
|
||||
monitor_.Start("Init");
|
||||
const MetaInfo& info = p_fmat->Info();
|
||||
|
||||
// safe factor for better accuracy
|
||||
@@ -33,30 +51,50 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
|
||||
|
||||
const int nthread = omp_get_max_threads();
|
||||
|
||||
auto nstep = static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
|
||||
auto ncol = static_cast<unsigned>(info.num_col_);
|
||||
unsigned const nstep =
|
||||
static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
|
||||
unsigned const ncol = static_cast<unsigned>(info.num_col_);
|
||||
sketchs.resize(info.num_col_);
|
||||
for (auto& s : sketchs) {
|
||||
s.Init(info.num_row_, 1.0 / (max_num_bins * kFactor));
|
||||
}
|
||||
|
||||
const auto& weights = info.weights_.HostVector();
|
||||
|
||||
// Data groups, used in ranking.
|
||||
std::vector<bst_uint> const& group_ptr = info.group_ptr_;
|
||||
size_t const num_groups = group_ptr.size() == 0 ? 0 : group_ptr.size() - 1;
|
||||
// Use group index for weights?
|
||||
bool const use_group_ind = num_groups != 0 && weights.size() != info.num_row_;
|
||||
|
||||
for (const auto &batch : p_fmat->GetRowBatches()) {
|
||||
#pragma omp parallel num_threads(nthread)
|
||||
size_t group_ind = 0;
|
||||
if (use_group_ind) {
|
||||
group_ind = this->SearchGroupIndFromBaseRow(group_ptr, batch.base_rowid);
|
||||
}
|
||||
#pragma omp parallel num_threads(nthread) firstprivate(group_ind, use_group_ind)
|
||||
{
|
||||
CHECK_EQ(nthread, omp_get_num_threads());
|
||||
auto tid = static_cast<unsigned>(omp_get_thread_num());
|
||||
unsigned begin = std::min(nstep * tid, ncol);
|
||||
unsigned end = std::min(nstep * (tid + 1), ncol);
|
||||
|
||||
// do not iterate if no columns are assigned to the thread
|
||||
if (begin < end && end <= ncol) {
|
||||
for (size_t i = 0; i < batch.Size(); ++i) { // NOLINT(*)
|
||||
size_t ridx = batch.base_rowid + i;
|
||||
SparsePage::Inst inst = batch[i];
|
||||
for (auto& ins : inst) {
|
||||
if (ins.index >= begin && ins.index < end) {
|
||||
sketchs[ins.index].Push(ins.fvalue,
|
||||
weights.size() > 0 ? weights[ridx] : 1.0f);
|
||||
size_t const ridx = batch.base_rowid + i;
|
||||
SparsePage::Inst const inst = batch[i];
|
||||
if (use_group_ind &&
|
||||
group_ptr[group_ind] == ridx &&
|
||||
// maximum equals to weights.size() - 1
|
||||
group_ind < num_groups - 1) {
|
||||
// move to next group
|
||||
group_ind++;
|
||||
}
|
||||
for (auto const& entry : inst) {
|
||||
if (entry.index >= begin && entry.index < end) {
|
||||
size_t w_idx = use_group_ind ? group_ind : ridx;
|
||||
sketchs[entry.index].Push(entry.fvalue, info.GetWeight(w_idx));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -65,6 +103,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
|
||||
}
|
||||
|
||||
Init(&sketchs, max_num_bins);
|
||||
monitor_.Stop("Init");
|
||||
}
|
||||
|
||||
void HistCutMatrix::Init
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "row_set.h"
|
||||
#include "../tree/param.h"
|
||||
#include "./quantile.h"
|
||||
#include "./timer.h"
|
||||
#include "../include/rabit/rabit.h"
|
||||
|
||||
namespace xgboost {
|
||||
@@ -35,6 +36,14 @@ struct HistCutMatrix {
|
||||
void Init(DMatrix* p_fmat, uint32_t max_num_bins);
|
||||
|
||||
void Init(std::vector<WXQSketch>* sketchs, uint32_t max_num_bins);
|
||||
|
||||
HistCutMatrix();
|
||||
|
||||
protected:
|
||||
virtual size_t SearchGroupIndFromBaseRow(
|
||||
std::vector<bst_uint> const& group_ptr, size_t const base_rowid) const;
|
||||
|
||||
Monitor monitor_;
|
||||
};
|
||||
|
||||
/*! \brief Builds the cut matrix on the GPU */
|
||||
|
||||
@@ -474,12 +474,16 @@ class LearnerImpl : public Learner {
|
||||
|
||||
void UpdateOneIter(int iter, DMatrix* train) override {
|
||||
monitor_.Start("UpdateOneIter");
|
||||
|
||||
// TODO(trivialfis): Merge the duplicated code with BoostOneIter
|
||||
CHECK(ModelInitialized())
|
||||
<< "Always call InitModel or LoadModel before update";
|
||||
if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
|
||||
common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
|
||||
}
|
||||
this->ValidateDMatrix(train);
|
||||
this->PerformTreeMethodHeuristic(train);
|
||||
|
||||
monitor_.Start("PredictRaw");
|
||||
this->PredictRaw(train, &preds_);
|
||||
monitor_.Stop("PredictRaw");
|
||||
@@ -493,10 +497,15 @@ class LearnerImpl : public Learner {
|
||||
void BoostOneIter(int iter, DMatrix* train,
|
||||
HostDeviceVector<GradientPair>* in_gpair) override {
|
||||
monitor_.Start("BoostOneIter");
|
||||
|
||||
CHECK(ModelInitialized())
|
||||
<< "Always call InitModel or LoadModel before boost.";
|
||||
if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
|
||||
common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
|
||||
}
|
||||
this->ValidateDMatrix(train);
|
||||
this->PerformTreeMethodHeuristic(train);
|
||||
|
||||
gbm_->DoBoost(train, in_gpair);
|
||||
monitor_.Stop("BoostOneIter");
|
||||
}
|
||||
@@ -711,7 +720,7 @@ class LearnerImpl : public Learner {
|
||||
mparam_.num_feature = num_feature;
|
||||
}
|
||||
CHECK_NE(mparam_.num_feature, 0)
|
||||
<< "0 feature is supplied. Are you using raw Booster?";
|
||||
<< "0 feature is supplied. Are you using raw Booster interface?";
|
||||
// setup
|
||||
cfg_["num_feature"] = common::ToString(mparam_.num_feature);
|
||||
CHECK(obj_ == nullptr && gbm_ == nullptr);
|
||||
@@ -736,6 +745,19 @@ class LearnerImpl : public Learner {
|
||||
gbm_->PredictBatch(data, out_preds, ntree_limit);
|
||||
}
|
||||
|
||||
/*!
 * \brief Sanity-check the meta information of a data matrix.
 *
 * When both group boundaries and weights are present, the number of weights
 * must equal the number of groups (`group_ptr_.size() - 1`); otherwise the
 * check fails with a message listing the weight count, group count and row
 * count. Matrices without groups or without weights are accepted as-is.
 *
 * \param p_fmat Matrix whose `Info()` is inspected.
 */
void ValidateDMatrix(DMatrix* p_fmat) {
  MetaInfo const& info = p_fmat->Info();
  auto const& weights = info.weights_.HostVector();

  // Nothing to cross-check unless both groups and weights were supplied.
  if (info.group_ptr_.empty() || weights.empty()) {
    return;
  }

  // group_ptr_ holds one more entry than there are groups.
  auto const n_groups = info.group_ptr_.size() - 1;
  CHECK(weights.size() == n_groups)
      << "\n"
      << "weights size: " << weights.size() << ", "
      << "groups size: " << n_groups << ", "
      << "num rows: " << info.num_row_ << "\n"
      << "Number of weights should be equal to number of groups in ranking task.";
}
|
||||
|
||||
// model parameter
|
||||
LearnerModelParam mparam_;
|
||||
// training parameter
|
||||
|
||||
Reference in New Issue
Block a user