cqmaker ok
This commit is contained in:
parent
08e9813c9b
commit
fa1581b94c
@ -18,8 +18,7 @@ IUpdater* CreateUpdater(const char *name) {
|
||||
if (!strcmp(name, "sync")) return new TreeSyncher();
|
||||
if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
|
||||
if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
|
||||
if (!strcmp(name, "grow_qhistmaker")) return new QuantileHistMaker<GradStats>();
|
||||
if (!strcmp(name, "grow_cqmaker")) return new CQHistMaker<GradStats>();
|
||||
if (!strcmp(name, "grow_histmaker")) return new CQHistMaker<GradStats>();
|
||||
if (!strcmp(name, "grow_skmaker")) return new SketchMaker();
|
||||
if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
|
||||
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "../utils/random.h"
|
||||
#include "../utils/quantile.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
@ -238,7 +239,81 @@ class BaseMaker: public IUpdater {
|
||||
s.Add(thread_temp[tid][nid]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/*! \brief common helper data structure to build sketch*/
|
||||
struct SketchEntry {
|
||||
/*! \brief total sum of amount to be met */
|
||||
bst_float sum_total;
|
||||
/*! \brief statistics used in the sketch */
|
||||
bst_float rmin, wmin;
|
||||
/*! \brief last seen feature value */
|
||||
bst_float last_fvalue;
|
||||
/*! \brief current size of sketch */
|
||||
bst_float next_goal;
|
||||
// pointer to the sketch to put things in
|
||||
utils::WXQuantileSketch<bst_float, bst_float> *sketch;
|
||||
// initialize the space
|
||||
inline void Init(unsigned max_size) {
|
||||
next_goal = -1.0f;
|
||||
rmin = wmin = 0.0f;
|
||||
sketch->temp.Reserve(max_size + 1);
|
||||
sketch->temp.size = 0;
|
||||
}
|
||||
/*!
|
||||
* \brief push a new element to sketch
|
||||
* \param fvalue feature value, comes in sorted ascending order
|
||||
* \param w weight
|
||||
* \param max_size
|
||||
*/
|
||||
inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
|
||||
if (next_goal == -1.0f) {
|
||||
next_goal = 0.0f;
|
||||
last_fvalue = fvalue;
|
||||
wmin = w;
|
||||
return;
|
||||
}
|
||||
if (last_fvalue != fvalue) {
|
||||
bst_float rmax = rmin + wmin;
|
||||
if (rmax >= next_goal) {
|
||||
if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
|
||||
// push to sketch
|
||||
sketch->temp.data[sketch->temp.size] =
|
||||
utils::WXQuantileSketch<bst_float, bst_float>::
|
||||
Entry(rmin, rmax, wmin, last_fvalue);
|
||||
utils::Assert(sketch->temp.size < max_size,
|
||||
"invalid maximum size max_size=%u, stemp.size=%lu\n",
|
||||
max_size, sketch->temp.size);
|
||||
++sketch->temp.size;
|
||||
}
|
||||
if (sketch->temp.size == max_size) {
|
||||
next_goal = sum_total * 2.0f + 1e-5f;
|
||||
} else{
|
||||
next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
|
||||
}
|
||||
}
|
||||
rmin = rmax;
|
||||
wmin = w;
|
||||
last_fvalue = fvalue;
|
||||
} else {
|
||||
wmin += w;
|
||||
}
|
||||
}
|
||||
/*! \brief push final unfinished value to the sketch */
|
||||
inline void Finalize(unsigned max_size) {
|
||||
bst_float rmax = rmin + wmin;
|
||||
if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
|
||||
utils::Assert(sketch->temp.size <= max_size,
|
||||
"Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
|
||||
sketch->temp.size, max_size );
|
||||
// push to sketch
|
||||
sketch->temp.data[sketch->temp.size] =
|
||||
utils::WXQuantileSketch<bst_float, bst_float>::
|
||||
Entry(rmin, rmax, wmin, last_fvalue);
|
||||
++sketch->temp.size;
|
||||
}
|
||||
sketch->PushTemp();
|
||||
}
|
||||
};
|
||||
/*! \brief training parameter of tree grower */
|
||||
TrainParam param;
|
||||
/*! \brief queue of nodes to be expanded */
|
||||
|
||||
@ -41,7 +41,9 @@ class HistMaker: public BaseMaker {
|
||||
/*! \brief content of statistics data */
|
||||
TStats *data;
|
||||
/*! \brief size of histogram */
|
||||
const unsigned size;
|
||||
unsigned size;
|
||||
// default constructor
|
||||
HistUnit(void) {}
|
||||
// constructor
|
||||
HistUnit(const bst_float *cut, TStats *data, unsigned size)
|
||||
: cut(cut), data(data), size(size) {}
|
||||
@ -79,7 +81,7 @@ class HistMaker: public BaseMaker {
|
||||
/*! \brief cut field */
|
||||
std::vector<bst_float> cut;
|
||||
// per thread histset
|
||||
std::vector<HistSet> hset;
|
||||
std::vector<HistSet> hset;
|
||||
// initialize the hist set
|
||||
inline void Init(const TrainParam ¶m, int nthread) {
|
||||
hset.resize(nthread);
|
||||
@ -111,28 +113,26 @@ class HistMaker: public BaseMaker {
|
||||
inline size_t Size(void) const {
|
||||
return rptr.size() - 1;
|
||||
}
|
||||
};
|
||||
};
|
||||
// workspace of thread
|
||||
ThreadWSpace wspace;
|
||||
// reducer for histogram
|
||||
sync::Reducer<TStats> histred;
|
||||
|
||||
// this function does two jobs
|
||||
// (1) reset the position in array position, to be the latest leaf id
|
||||
// (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly
|
||||
virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
|
||||
IFMatrix *p_fmat,
|
||||
const BoosterInfo &info,
|
||||
const RegTree &tree) = 0;
|
||||
|
||||
sync::Reducer<TStats> histred;
|
||||
// update function implementation
|
||||
virtual void Update(const std::vector<bst_gpair> &gpair,
|
||||
IFMatrix *p_fmat,
|
||||
const BoosterInfo &info,
|
||||
RegTree *p_tree) {
|
||||
this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
|
||||
for (int depth = 0; depth < param.max_depth; ++depth) {
|
||||
// reset and propose candidate split
|
||||
this->ResetPosAndPropose(gpair, p_fmat, info, *p_tree);
|
||||
// create histogram
|
||||
this->CreateHist(gpair, p_fmat, info, *p_tree);
|
||||
// find split based on histogram statistics
|
||||
this->FindSplit(depth, gpair, p_fmat, info, p_tree);
|
||||
this->ResetPositionCol(this->qexpand, p_fmat, *p_tree);
|
||||
// reset position after split
|
||||
this->ResetPositionAfterSplit(p_fmat, *p_tree);
|
||||
this->UpdateQueueExpand(*p_tree);
|
||||
// if nothing left to be expand, break
|
||||
if (qexpand.size() == 0) break;
|
||||
@ -142,20 +142,24 @@ class HistMaker: public BaseMaker {
|
||||
(*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
inline void CreateHist(const std::vector<bst_gpair> &gpair,
|
||||
IFMatrix *p_fmat,
|
||||
const BoosterInfo &info,
|
||||
const RegTree &tree) {
|
||||
// this function does two jobs
|
||||
// (1) reset the position in array position, to be the latest leaf id
|
||||
// (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly
|
||||
virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
|
||||
IFMatrix *p_fmat,
|
||||
const BoosterInfo &info,
|
||||
const RegTree &tree) = 0;
|
||||
// reset position after split, this is not a must, depending on implementation
|
||||
virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
|
||||
const RegTree &tree) {
|
||||
}
|
||||
virtual void CreateHist(const std::vector<bst_gpair> &gpair,
|
||||
IFMatrix *p_fmat,
|
||||
const BoosterInfo &info,
|
||||
const RegTree &tree) {
|
||||
bst_uint num_feature = tree.param.num_feature;
|
||||
int nthread;
|
||||
#pragma omp parallel
|
||||
{
|
||||
nthread = omp_get_num_threads();
|
||||
}
|
||||
// initialize work space
|
||||
wspace.Init(param, nthread);
|
||||
wspace.Init(param, this->get_nthread());
|
||||
// start accumulating statistics
|
||||
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
@ -190,6 +194,8 @@ class HistMaker: public BaseMaker {
|
||||
// sync the histogram
|
||||
histred.AllReduce(BeginPtr(wspace.hset[0].data), wspace.hset[0].data.size());
|
||||
}
|
||||
|
||||
private:
|
||||
inline void EnumerateSplit(const HistUnit &hist,
|
||||
const TStats &node_sum,
|
||||
bst_uint fid,
|
||||
@ -231,10 +237,6 @@ class HistMaker: public BaseMaker {
|
||||
const BoosterInfo &info,
|
||||
RegTree *p_tree) {
|
||||
const bst_uint num_feature = p_tree->param.num_feature;
|
||||
// reset and propose candidate split
|
||||
this->ResetPosAndPropose(gpair, p_fmat, info, *p_tree);
|
||||
// create histogram
|
||||
this->CreateHist(gpair, p_fmat, info, *p_tree);
|
||||
// get the best split condition for each node
|
||||
std::vector<SplitEntry> sol(qexpand.size());
|
||||
std::vector<TStats> left_sum(qexpand.size());
|
||||
@ -247,7 +249,7 @@ class HistMaker: public BaseMaker {
|
||||
SplitEntry &best = sol[wid];
|
||||
TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
|
||||
for (bst_uint fid = 0; fid < num_feature; ++ fid) {
|
||||
EnumerateSplit(wspace.hset[0][fid + wid * (num_feature+1)],
|
||||
EnumerateSplit(this->wspace.hset[0][fid + wid * (num_feature+1)],
|
||||
node_sum, fid, &best, &left_sum[wid]);
|
||||
}
|
||||
}
|
||||
@ -279,26 +281,37 @@ class HistMaker: public BaseMaker {
|
||||
}
|
||||
|
||||
inline void SetStats(RegTree *p_tree, int nid, const TStats &node_sum) {
|
||||
p_tree->stat(nid).base_weight = node_sum.CalcWeight(param);
|
||||
p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
|
||||
node_sum.SetLeafVec(param, p_tree->leafvec(nid));
|
||||
p_tree->stat(nid).base_weight = node_sum.CalcWeight(param);
|
||||
p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
|
||||
node_sum.SetLeafVec(param, p_tree->leafvec(nid));
|
||||
}
|
||||
};
|
||||
|
||||
template<typename TStats>
|
||||
class CQHistMaker: public HistMaker<TStats> {
|
||||
protected:
|
||||
virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
|
||||
IFMatrix *p_fmat,
|
||||
const BoosterInfo &info,
|
||||
const RegTree &tree) {
|
||||
sketchs.resize(this->qexpand.size() * tree.param.num_feature);
|
||||
for (size_t i = 0; i < sketchs.size(); ++i) {
|
||||
sketchs[i].Init(info.num_row, this->param.sketch_eps);
|
||||
struct HistEntry {
|
||||
typename HistMaker<TStats>::HistUnit hist;
|
||||
unsigned istart;
|
||||
/*!
|
||||
* \brief add a histogram to data,
|
||||
* do linear scan, start from istart
|
||||
*/
|
||||
inline void Add(bst_float fv,
|
||||
const std::vector<bst_gpair> &gpair,
|
||||
const BoosterInfo &info,
|
||||
const bst_uint ridx) {
|
||||
while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
|
||||
utils::Assert(istart != hist.size, "the bound variable must be max");
|
||||
hist.data[istart].Add(gpair, info, ridx);
|
||||
}
|
||||
thread_temp.resize(this->get_nthread());
|
||||
std::vector<bst_float> root_stats;
|
||||
this->GetRootStats(gpair, *p_fmat, tree, &root_stats);
|
||||
};
|
||||
virtual void CreateHist(const std::vector<bst_gpair> &gpair,
|
||||
IFMatrix *p_fmat,
|
||||
const BoosterInfo &info,
|
||||
const RegTree &tree) {
|
||||
this->wspace.Init(this->param, 1);
|
||||
thread_hist.resize(this->get_nthread());
|
||||
// start accumulating statistics
|
||||
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
|
||||
iter->BeforeFirst();
|
||||
@ -308,15 +321,55 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
#pragma omp parallel for schedule(dynamic, 1)
|
||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||
this->MakeSketch(gpair, batch[i], tree,
|
||||
root_stats,
|
||||
batch.col_index[i],
|
||||
p_fmat->GetColDensity(batch.col_index[i]),
|
||||
&thread_temp[omp_get_thread_num()]);
|
||||
this->UpdateHistCol(gpair, batch[i], info, tree,
|
||||
batch.col_index[i],
|
||||
&thread_hist[omp_get_thread_num()]);
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const int nid = this->qexpand[i];
|
||||
const int wid = this->node2workindex[nid];
|
||||
this->wspace.hset[0][tree.param.num_feature + wid * (tree.param.num_feature+1)]
|
||||
.data[0] = node_stats[nid];
|
||||
}
|
||||
// sync the histogram
|
||||
this->histred.AllReduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());
|
||||
}
|
||||
virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
|
||||
const RegTree &tree) {
|
||||
this->ResetPositionCol(this->qexpand, p_fmat, tree);
|
||||
}
|
||||
virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
|
||||
IFMatrix *p_fmat,
|
||||
const BoosterInfo &info,
|
||||
const RegTree &tree) {
|
||||
this->GetNodeStats(gpair, *p_fmat, tree, info,
|
||||
&thread_stats, &node_stats);
|
||||
sketchs.resize(this->qexpand.size() * tree.param.num_feature);
|
||||
for (size_t i = 0; i < sketchs.size(); ++i) {
|
||||
sketchs[i].Init(info.num_row, this->param.sketch_eps);
|
||||
}
|
||||
thread_sketch.resize(this->get_nthread());
|
||||
// number of rows in the buffered row set
|
||||
const size_t nrows = p_fmat->buffered_rowset().size();
|
||||
// start accumulating statistics
|
||||
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const ColBatch &batch = iter->Value();
|
||||
// start enumeration
|
||||
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
#pragma omp parallel for schedule(dynamic, 1)
|
||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||
this->UpdateSketchCol(gpair, batch[i], tree,
|
||||
node_stats,
|
||||
batch.col_index[i],
|
||||
batch[i].length == nrows,
|
||||
&thread_sketch[omp_get_thread_num()]);
|
||||
}
|
||||
}
|
||||
// setup maximum size
|
||||
size_t max_size = static_cast<size_t>(this->param.sketch_ratio / this->param.sketch_eps);
|
||||
unsigned max_size = this->param.max_sketch_size();
|
||||
// synchronize sketch
|
||||
summary_array.Init(sketchs.size(), max_size);
|
||||
for (size_t i = 0; i < sketchs.size(); ++i) {
|
||||
@ -356,93 +409,42 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
(tree.param.num_feature + 1) * this->qexpand.size() + 1,
|
||||
"cut space inconsistent");
|
||||
}
|
||||
// temporary space to build a sketch
|
||||
struct SketchEntry {
|
||||
/*! \brief total sum of */
|
||||
bst_float sum_total;
|
||||
/*! \brief statistics used in the sketch */
|
||||
bst_float rmin, wmin;
|
||||
/*! \brief last seen feature value */
|
||||
bst_float last_fvalue;
|
||||
/*! \brief current size of sketch */
|
||||
bst_float next_goal;
|
||||
// pointer to the sketch to put things in
|
||||
utils::WXQuantileSketch<bst_float, bst_float> *sketch;
|
||||
// initialize the space
|
||||
inline void Init(unsigned max_size) {
|
||||
next_goal = 0.0f;
|
||||
rmin = wmin = 0.0f;
|
||||
sketch->temp.Reserve(max_size + 1);
|
||||
sketch->temp.size = 0;
|
||||
}
|
||||
/*!
|
||||
* \brief push a new element to sketch
|
||||
* \param fvalue feature value, comes in sorted ascending order
|
||||
* \param w weight
|
||||
* \param max_size
|
||||
*/
|
||||
inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
|
||||
if (wmin == 0.0f) {
|
||||
last_fvalue = fvalue;
|
||||
wmin = w;
|
||||
return;
|
||||
}
|
||||
if (last_fvalue != fvalue) {
|
||||
bst_float rmax = rmin + wmin;
|
||||
if (rmax >= next_goal) {
|
||||
if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
|
||||
// push to sketch
|
||||
sketch->temp.data[sketch->temp.size] =
|
||||
utils::WXQuantileSketch<bst_float, bst_float>::
|
||||
Entry(rmin, rmax, wmin, last_fvalue);
|
||||
utils::Assert(sketch->temp.size < max_size,
|
||||
"invalid maximum size max_size=%u, stemp.size=%lu\n",
|
||||
max_size, sketch->temp.size);
|
||||
++sketch->temp.size;
|
||||
}
|
||||
if (sketch->temp.size == max_size) {
|
||||
next_goal = sum_total * 2.0f + 1e-5f;
|
||||
} else{
|
||||
next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
|
||||
}
|
||||
}
|
||||
rmin = rmax;
|
||||
wmin = w;
|
||||
last_fvalue = fvalue;
|
||||
} else {
|
||||
wmin += w;
|
||||
}
|
||||
}
|
||||
/*! \brief push final unfinished value to the sketch */
|
||||
inline void Finalize(unsigned max_size) {
|
||||
bst_float rmax = rmin + wmin;
|
||||
//utils::Assert(fabs(rmax - sum_total) < 1e-4 + sum_total * 1e-5,
|
||||
//"invalid sum value, rmax=%f, sum_total=%lf", rmax, sum_total);
|
||||
if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
|
||||
utils::Assert(sketch->temp.size <= max_size,
|
||||
"Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
|
||||
sketch->temp.size, max_size );
|
||||
// push to sketch
|
||||
sketch->temp.data[sketch->temp.size] =
|
||||
utils::WXQuantileSketch<bst_float, bst_float>::
|
||||
Entry(rmin, rmax, wmin, last_fvalue);
|
||||
++sketch->temp.size;
|
||||
}
|
||||
sketch->PushTemp();
|
||||
}
|
||||
};
|
||||
|
||||
private:
|
||||
inline void MakeSketch(const std::vector<bst_gpair> &gpair,
|
||||
const ColBatch::Inst &c,
|
||||
const RegTree &tree,
|
||||
const std::vector<bst_float> &root_stats,
|
||||
bst_uint fid,
|
||||
float col_density,
|
||||
std::vector<SketchEntry> *p_temp) {
|
||||
inline void UpdateHistCol(const std::vector<bst_gpair> &gpair,
|
||||
const ColBatch::Inst &c,
|
||||
const BoosterInfo &info,
|
||||
const RegTree &tree,
|
||||
bst_uint fid,
|
||||
std::vector<HistEntry> *p_temp) {
|
||||
if (c.length == 0) return;
|
||||
// initialize sbuilder for use
|
||||
std::vector<SketchEntry> &sbuilder = *p_temp;
|
||||
std::vector<HistEntry> &hbuilder = *p_temp;
|
||||
hbuilder.resize(tree.param.num_nodes);
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const unsigned nid = this->qexpand[i];
|
||||
const unsigned wid = this->node2workindex[nid];
|
||||
hbuilder[nid].istart = 0;
|
||||
hbuilder[nid].hist = this->wspace.hset[0][fid + wid * (tree.param.num_feature+1)];
|
||||
}
|
||||
for (bst_uint j = 0; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
if (nid >= 0) {
|
||||
hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx);
|
||||
}
|
||||
}
|
||||
}
|
||||
inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
|
||||
const ColBatch::Inst &c,
|
||||
const RegTree &tree,
|
||||
const std::vector<TStats> &nstats,
|
||||
bst_uint fid,
|
||||
bool col_full,
|
||||
std::vector<BaseMaker::SketchEntry> *p_temp) {
|
||||
if (c.length == 0) return;
|
||||
// initialize sbuilder for use
|
||||
std::vector<BaseMaker::SketchEntry> &sbuilder = *p_temp;
|
||||
sbuilder.resize(tree.param.num_nodes);
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const unsigned nid = this->qexpand[i];
|
||||
@ -451,7 +453,7 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
sbuilder[nid].sketch = &sketchs[wid * tree.param.num_feature + fid];
|
||||
}
|
||||
|
||||
if (col_density != 1.0f) {
|
||||
if (!col_full) {
|
||||
// first pass, get sum of weight, TODO, optimization to skip first pass
|
||||
for (bst_uint j = 0; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
@ -463,7 +465,7 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
} else {
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const unsigned nid = this->qexpand[i];
|
||||
sbuilder[nid].sum_total = root_stats[nid];
|
||||
sbuilder[nid].sum_total = nstats[nid].sum_hess;
|
||||
}
|
||||
}
|
||||
// if only one value, no need to do second pass
|
||||
@ -475,7 +477,7 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
return;
|
||||
}
|
||||
// two pass scan
|
||||
unsigned max_size = static_cast<unsigned>(this->param.sketch_ratio / this->param.sketch_eps);
|
||||
unsigned max_size = this->param.max_sketch_size();
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const int nid = this->qexpand[i];
|
||||
sbuilder[nid].Init(max_size);
|
||||
@ -493,47 +495,16 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
sbuilder[nid].Finalize(max_size);
|
||||
}
|
||||
}
|
||||
inline void GetRootStats(const std::vector<bst_gpair> &gpair,
|
||||
const IFMatrix &fmat,
|
||||
const RegTree &tree,
|
||||
std::vector<float> *p_snode) {
|
||||
std::vector<float> &snode = *p_snode;
|
||||
thread_temp.resize(this->get_nthread());
|
||||
snode.resize(tree.param.num_nodes);
|
||||
#pragma omp parallel
|
||||
{
|
||||
const int tid = omp_get_thread_num();
|
||||
thread_temp[tid].resize(tree.param.num_nodes);
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const unsigned nid = this->qexpand[i];
|
||||
thread_temp[tid][nid].sum_total = 0.0f;
|
||||
}
|
||||
}
|
||||
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
|
||||
// setup position
|
||||
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||
const bst_uint ridx = rowset[i];
|
||||
const int tid = omp_get_thread_num();
|
||||
if (this->position[ridx] < 0) continue;
|
||||
thread_temp[tid][this->position[ridx]].sum_total += gpair[ridx].hess;
|
||||
}
|
||||
// sum the per thread statistics together
|
||||
for (size_t j = 0; j < this->qexpand.size(); ++j) {
|
||||
const int nid = this->qexpand[j];
|
||||
double wsum = 0.0f;
|
||||
for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
|
||||
wsum += thread_temp[tid][nid].sum_total;
|
||||
}
|
||||
// update node statistics
|
||||
snode[nid] = static_cast<bst_float>(wsum);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
|
||||
// thread temp data
|
||||
std::vector< std::vector<SketchEntry> > thread_temp;
|
||||
std::vector< std::vector<BaseMaker::SketchEntry> > thread_sketch;
|
||||
// used to hold statistics
|
||||
std::vector< std::vector<TStats> > thread_stats;
|
||||
// used to hold start pointer
|
||||
std::vector< std::vector<HistEntry> > thread_hist;
|
||||
// node statistics
|
||||
std::vector<TStats> node_stats;
|
||||
// summary array
|
||||
WXQSketch::SummaryArray summary_array;
|
||||
// reducer for summary
|
||||
|
||||
@ -130,80 +130,6 @@ class SketchMaker: public BaseMaker {
|
||||
inline void SetLeafVec(const TrainParam ¶m, bst_float *vec) const {
|
||||
}
|
||||
};
|
||||
// temporary space to build a sketch
|
||||
struct SketchEntry {
|
||||
/*! \brief total sum of amount to be met */
|
||||
bst_float sum_total;
|
||||
/*! \brief statistics used in the sketch */
|
||||
bst_float rmin, wmin;
|
||||
/*! \brief last seen feature value */
|
||||
bst_float last_fvalue;
|
||||
/*! \brief current size of sketch */
|
||||
bst_float next_goal;
|
||||
// pointer to the sketch to put things in
|
||||
utils::WXQuantileSketch<bst_float, bst_float> *sketch;
|
||||
// initialize the space
|
||||
inline void Init(unsigned max_size) {
|
||||
next_goal = -1.0f;
|
||||
rmin = wmin = 0.0f;
|
||||
sketch->temp.Reserve(max_size + 1);
|
||||
sketch->temp.size = 0;
|
||||
}
|
||||
/*!
|
||||
* \brief push a new element to sketch
|
||||
* \param fvalue feature value, comes in sorted ascending order
|
||||
* \param w weight
|
||||
* \param max_size
|
||||
*/
|
||||
inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
|
||||
if (next_goal == -1.0f) {
|
||||
next_goal = 0.0f;
|
||||
last_fvalue = fvalue;
|
||||
wmin = w;
|
||||
return;
|
||||
}
|
||||
if (last_fvalue != fvalue) {
|
||||
bst_float rmax = rmin + wmin;
|
||||
if (rmax >= next_goal) {
|
||||
if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
|
||||
// push to sketch
|
||||
sketch->temp.data[sketch->temp.size] =
|
||||
utils::WXQuantileSketch<bst_float, bst_float>::
|
||||
Entry(rmin, rmax, wmin, last_fvalue);
|
||||
utils::Assert(sketch->temp.size < max_size,
|
||||
"invalid maximum size max_size=%u, stemp.size=%lu\n",
|
||||
max_size, sketch->temp.size);
|
||||
++sketch->temp.size;
|
||||
}
|
||||
if (sketch->temp.size == max_size) {
|
||||
next_goal = sum_total * 2.0f + 1e-5f;
|
||||
} else{
|
||||
next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
|
||||
}
|
||||
}
|
||||
rmin = rmax;
|
||||
wmin = w;
|
||||
last_fvalue = fvalue;
|
||||
} else {
|
||||
wmin += w;
|
||||
}
|
||||
}
|
||||
/*! \brief push final unfinished value to the sketch */
|
||||
inline void Finalize(unsigned max_size) {
|
||||
bst_float rmax = rmin + wmin;
|
||||
if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
|
||||
utils::Assert(sketch->temp.size <= max_size,
|
||||
"Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
|
||||
sketch->temp.size, max_size );
|
||||
// push to sketch
|
||||
sketch->temp.data[sketch->temp.size] =
|
||||
utils::WXQuantileSketch<bst_float, bst_float>::
|
||||
Entry(rmin, rmax, wmin, last_fvalue);
|
||||
++sketch->temp.size;
|
||||
}
|
||||
sketch->PushTemp();
|
||||
}
|
||||
};
|
||||
inline void BuildSketch(const std::vector<bst_gpair> &gpair,
|
||||
IFMatrix *p_fmat,
|
||||
const BoosterInfo &info,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user