only need to add in create hist col base

This commit is contained in:
tqchen 2014-11-18 22:21:41 -08:00
parent 08e9813c9b
commit 32beb56ba3
3 changed files with 135 additions and 231 deletions

View File

@ -8,6 +8,7 @@
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
#include "../utils/random.h" #include "../utils/random.h"
#include "../utils/quantile.h"
namespace xgboost { namespace xgboost {
namespace tree { namespace tree {
@ -239,6 +240,80 @@ class BaseMaker: public IUpdater {
} }
} }
} }
/*!
 * \brief common helper data structure to build sketch
 *
 * Collects (feature value, weight) pairs that arrive in ascending feature
 * value order and records selected ones as entries of sketch->temp.
 * Expected call order: set sketch and sum_total (neither is assigned inside
 * this struct), call Init, call Push once per value, then call Finalize.
 */
struct SketchEntry {
  /*! \brief total sum of amount to be met */
  bst_float sum_total;
  /*!
   * \brief statistics used in the sketch:
   *  rmin is the weight accumulated before last_fvalue,
   *  wmin is the weight accumulated at last_fvalue
   */
  bst_float rmin, wmin;
  /*! \brief last seen feature value */
  bst_float last_fvalue;
  /*!
   * \brief threshold that rmin + wmin has to reach before the pending value
   *  is considered for recording; -1.0f marks that nothing was pushed since Init
   */
  bst_float next_goal;
  // pointer to the sketch to put things in
  utils::WXQuantileSketch<bst_float, bst_float> *sketch;
  /*!
   * \brief initialize the space, sketch must already point to a sketch
   * \param max_size maximum number of entries Push is allowed to record
   */
  inline void Init(unsigned max_size) {
    next_goal = -1.0f;
    rmin = wmin = 0.0f;
    // one slot more than max_size: Finalize may append one entry when
    // temp.size already equals max_size
    sketch->temp.Reserve(max_size + 1);
    sketch->temp.size = 0;
  }
  /*!
   * \brief push a new element to sketch
   * \param fvalue feature value, comes in sorted ascending order
   * \param w weight
   * \param max_size maximum number of entries that may be recorded by Push
   */
  inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
    if (next_goal == -1.0f) {
      // first value since Init: only remember it, nothing to record yet
      next_goal = 0.0f;
      last_fvalue = fvalue;
      wmin = w;
      return;
    }
    if (last_fvalue != fvalue) {
      // the run of last_fvalue is complete, decide whether to record it
      bst_float rmax = rmin + wmin;
      if (rmax >= next_goal) {
        if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
          // verify there is room before the slot is written
          utils::Assert(sketch->temp.size < max_size,
                        "invalid maximum size max_size=%u, stemp.size=%lu\n",
                        max_size, sketch->temp.size);
          // push to sketch
          sketch->temp.data[sketch->temp.size] =
              utils::WXQuantileSketch<bst_float, bst_float>::
              Entry(rmin, rmax, wmin, last_fvalue);
          ++sketch->temp.size;
        }
        if (sketch->temp.size == max_size) {
          // all max_size slots are used: move the goal above 2 * sum_total
          // NOTE(review): presumably unreachable so Push records nothing more - confirm
          next_goal = sum_total * 2.0f + 1e-5f;
        } else {
          next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
        }
      }
      // start accumulating the new value
      rmin = rmax;
      wmin = w;
      last_fvalue = fvalue;
    } else {
      // same value as before, merge the weight
      wmin += w;
    }
  }
  /*!
   * \brief push final unfinished value to the sketch
   * \param max_size the same bound that was passed to Init and Push
   */
  inline void Finalize(unsigned max_size) {
    bst_float rmax = rmin + wmin;
    if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
      // arguments follow the order of the format string: max_size, then size
      utils::Assert(sketch->temp.size <= max_size,
                    "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
                    max_size, sketch->temp.size);
      // push to sketch
      sketch->temp.data[sketch->temp.size] =
          utils::WXQuantileSketch<bst_float, bst_float>::
          Entry(rmin, rmax, wmin, last_fvalue);
      ++sketch->temp.size;
    }
    sketch->PushTemp();
  }
};
/*! \brief training parameter of tree grower */ /*! \brief training parameter of tree grower */
TrainParam param; TrainParam param;
/*! \brief queue of nodes to be expanded */ /*! \brief queue of nodes to be expanded */

View File

@ -116,23 +116,21 @@ class HistMaker: public BaseMaker {
ThreadWSpace wspace; ThreadWSpace wspace;
// reducer for histogram // reducer for histogram
sync::Reducer<TStats> histred; sync::Reducer<TStats> histred;
// update function implementation
// this function does two jobs
// (1) reset the position in array position, to be the latest leaf id
// (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly
virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const RegTree &tree) = 0;
virtual void Update(const std::vector<bst_gpair> &gpair, virtual void Update(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
RegTree *p_tree) { RegTree *p_tree) {
this->InitData(gpair, *p_fmat, info.root_index, *p_tree); this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
for (int depth = 0; depth < param.max_depth; ++depth) { for (int depth = 0; depth < param.max_depth; ++depth) {
// reset and propose candidate split
this->ResetPosAndPropose(gpair, p_fmat, info, *p_tree);
// create histogram
this->CreateHist(gpair, p_fmat, info, *p_tree);
// find split based on histogram statistics
this->FindSplit(depth, gpair, p_fmat, info, p_tree); this->FindSplit(depth, gpair, p_fmat, info, p_tree);
this->ResetPositionCol(this->qexpand, p_fmat, *p_tree); // reset position after split
this->ResetPositionAfterSplit(p_fmat, *p_tree);
this->UpdateQueueExpand(*p_tree); this->UpdateQueueExpand(*p_tree);
// if nothing left to be expand, break // if nothing left to be expand, break
if (qexpand.size() == 0) break; if (qexpand.size() == 0) break;
@ -142,12 +140,21 @@ class HistMaker: public BaseMaker {
(*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate); (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
} }
} }
// this function does two jobs
private: // (1) reset the position in array position, to be the latest leaf id
inline void CreateHist(const std::vector<bst_gpair> &gpair, // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly
IFMatrix *p_fmat, virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
const BoosterInfo &info, IFMatrix *p_fmat,
const RegTree &tree) { const BoosterInfo &info,
const RegTree &tree) = 0;
// reset position after split, this is not a must, depending on implementation
// Update calls this hook once per depth, right after FindSplit.
// The default implementation does nothing; p_fmat and tree are unused here.
virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
const RegTree &tree) {
}
virtual void CreateHist(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const RegTree &tree) {
bst_uint num_feature = tree.param.num_feature; bst_uint num_feature = tree.param.num_feature;
int nthread; int nthread;
#pragma omp parallel #pragma omp parallel
@ -190,6 +197,8 @@ class HistMaker: public BaseMaker {
// sync the histogram // sync the histogram
histred.AllReduce(BeginPtr(wspace.hset[0].data), wspace.hset[0].data.size()); histred.AllReduce(BeginPtr(wspace.hset[0].data), wspace.hset[0].data.size());
} }
private:
inline void EnumerateSplit(const HistUnit &hist, inline void EnumerateSplit(const HistUnit &hist,
const TStats &node_sum, const TStats &node_sum,
bst_uint fid, bst_uint fid,
@ -231,10 +240,6 @@ class HistMaker: public BaseMaker {
const BoosterInfo &info, const BoosterInfo &info,
RegTree *p_tree) { RegTree *p_tree) {
const bst_uint num_feature = p_tree->param.num_feature; const bst_uint num_feature = p_tree->param.num_feature;
// reset and propose candidate split
this->ResetPosAndPropose(gpair, p_fmat, info, *p_tree);
// create histogram
this->CreateHist(gpair, p_fmat, info, *p_tree);
// get the best split condition for each node // get the best split condition for each node
std::vector<SplitEntry> sol(qexpand.size()); std::vector<SplitEntry> sol(qexpand.size());
std::vector<TStats> left_sum(qexpand.size()); std::vector<TStats> left_sum(qexpand.size());
@ -288,17 +293,23 @@ class HistMaker: public BaseMaker {
template<typename TStats> template<typename TStats>
class CQHistMaker: public HistMaker<TStats> { class CQHistMaker: public HistMaker<TStats> {
protected: protected:
// reset position after split: forwards to ResetPositionCol,
// passing the current expand queue (qexpand) together with p_fmat and tree
virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
const RegTree &tree) {
this->ResetPositionCol(this->qexpand, p_fmat, tree);
}
virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair, virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
const RegTree &tree) { const RegTree &tree) {
this->GetNodeStats(gpair, *p_fmat, tree, info,
&thread_stats, &node_stats);
sketchs.resize(this->qexpand.size() * tree.param.num_feature); sketchs.resize(this->qexpand.size() * tree.param.num_feature);
for (size_t i = 0; i < sketchs.size(); ++i) { for (size_t i = 0; i < sketchs.size(); ++i) {
sketchs[i].Init(info.num_row, this->param.sketch_eps); sketchs[i].Init(info.num_row, this->param.sketch_eps);
} }
thread_temp.resize(this->get_nthread()); thread_sketch.resize(this->get_nthread());
std::vector<bst_float> root_stats; // number of rows in
this->GetRootStats(gpair, *p_fmat, tree, &root_stats); const size_t nrows = p_fmat->buffered_rowset().size();
// start accumulating statistics // start accumulating statistics
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(); utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
iter->BeforeFirst(); iter->BeforeFirst();
@ -308,15 +319,15 @@ class CQHistMaker: public HistMaker<TStats> {
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size); const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(dynamic, 1) #pragma omp parallel for schedule(dynamic, 1)
for (bst_omp_uint i = 0; i < nsize; ++i) { for (bst_omp_uint i = 0; i < nsize; ++i) {
this->MakeSketch(gpair, batch[i], tree, this->UpdateSketchCol(gpair, batch[i], tree,
root_stats, node_stats,
batch.col_index[i], batch.col_index[i],
p_fmat->GetColDensity(batch.col_index[i]), batch[i].length == nrows,
&thread_temp[omp_get_thread_num()]); &thread_sketch[omp_get_thread_num()]);
} }
} }
// setup maximum size // setup maximum size
size_t max_size = static_cast<size_t>(this->param.sketch_ratio / this->param.sketch_eps); unsigned max_size = this->param.max_sketch_size();
// synchronize sketch // synchronize sketch
summary_array.Init(sketchs.size(), max_size); summary_array.Init(sketchs.size(), max_size);
for (size_t i = 0; i < sketchs.size(); ++i) { for (size_t i = 0; i < sketchs.size(); ++i) {
@ -356,93 +367,18 @@ class CQHistMaker: public HistMaker<TStats> {
(tree.param.num_feature + 1) * this->qexpand.size() + 1, (tree.param.num_feature + 1) * this->qexpand.size() + 1,
"cut space inconsistent"); "cut space inconsistent");
} }
// temporary space to build a sketch
// Collects (feature value, weight) pairs that arrive in ascending feature
// value order and records selected ones as entries of sketch->temp.
// sketch and sum_total are not assigned inside this struct; they have to be
// set before Init / Push / Finalize are used.
struct SketchEntry {
  /*! \brief total sum of weight to be met */
  bst_float sum_total;
  /*!
   * \brief statistics used in the sketch:
   *  rmin is the weight accumulated before last_fvalue,
   *  wmin is the weight accumulated at last_fvalue
   */
  bst_float rmin, wmin;
  /*! \brief last seen feature value */
  bst_float last_fvalue;
  /*!
   * \brief threshold that rmin + wmin has to reach before the pending value
   *  is considered for recording
   */
  bst_float next_goal;
  // pointer to the sketch to put things in
  utils::WXQuantileSketch<bst_float, bst_float> *sketch;
  /*!
   * \brief initialize the space, sketch must already point to a sketch
   * \param max_size maximum number of entries Push is allowed to record
   */
  inline void Init(unsigned max_size) {
    next_goal = 0.0f;
    rmin = wmin = 0.0f;
    // one slot more than max_size: Finalize may append one entry when
    // temp.size already equals max_size
    sketch->temp.Reserve(max_size + 1);
    sketch->temp.size = 0;
  }
  /*!
   * \brief push a new element to sketch
   * \param fvalue feature value, comes in sorted ascending order
   * \param w weight
   * \param max_size maximum number of entries that may be recorded by Push
   */
  inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
    // a zero wmin is taken to mean that no value is pending
    // NOTE(review): a value whose accumulated weight is exactly zero is
    // therefore replaced by the next pushed value without being recorded
    if (wmin == 0.0f) {
      last_fvalue = fvalue;
      wmin = w;
      return;
    }
    if (last_fvalue != fvalue) {
      // the run of last_fvalue is complete, decide whether to record it
      bst_float rmax = rmin + wmin;
      if (rmax >= next_goal) {
        if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
          // verify there is room before the slot is written
          utils::Assert(sketch->temp.size < max_size,
                        "invalid maximum size max_size=%u, stemp.size=%lu\n",
                        max_size, sketch->temp.size);
          // push to sketch
          sketch->temp.data[sketch->temp.size] =
              utils::WXQuantileSketch<bst_float, bst_float>::
              Entry(rmin, rmax, wmin, last_fvalue);
          ++sketch->temp.size;
        }
        if (sketch->temp.size == max_size) {
          // all max_size slots are used: move the goal above 2 * sum_total
          // NOTE(review): presumably unreachable so Push records nothing more - confirm
          next_goal = sum_total * 2.0f + 1e-5f;
        } else {
          next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
        }
      }
      // start accumulating the new value
      rmin = rmax;
      wmin = w;
      last_fvalue = fvalue;
    } else {
      // same value as before, merge the weight
      wmin += w;
    }
  }
  /*!
   * \brief push final unfinished value to the sketch
   * \param max_size the same bound that was passed to Init and Push
   */
  inline void Finalize(unsigned max_size) {
    bst_float rmax = rmin + wmin;
    //utils::Assert(fabs(rmax - sum_total) < 1e-4 + sum_total * 1e-5,
    //"invalid sum value, rmax=%f, sum_total=%lf", rmax, sum_total);
    if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
      // arguments follow the order of the format string: max_size, then size
      utils::Assert(sketch->temp.size <= max_size,
                    "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
                    max_size, sketch->temp.size);
      // push to sketch
      sketch->temp.data[sketch->temp.size] =
          utils::WXQuantileSketch<bst_float, bst_float>::
          Entry(rmin, rmax, wmin, last_fvalue);
      ++sketch->temp.size;
    }
    sketch->PushTemp();
  }
};
private: private:
inline void MakeSketch(const std::vector<bst_gpair> &gpair, inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
const ColBatch::Inst &c, const ColBatch::Inst &c,
const RegTree &tree, const RegTree &tree,
const std::vector<bst_float> &root_stats, const std::vector<TStats> &nstats,
bst_uint fid, bst_uint fid,
float col_density, bool col_full,
std::vector<SketchEntry> *p_temp) { std::vector<BaseMaker::SketchEntry> *p_temp) {
if (c.length == 0) return; if (c.length == 0) return;
// initialize sbuilder for use // initialize sbuilder for use
std::vector<SketchEntry> &sbuilder = *p_temp; std::vector<BaseMaker::SketchEntry> &sbuilder = *p_temp;
sbuilder.resize(tree.param.num_nodes); sbuilder.resize(tree.param.num_nodes);
for (size_t i = 0; i < this->qexpand.size(); ++i) { for (size_t i = 0; i < this->qexpand.size(); ++i) {
const unsigned nid = this->qexpand[i]; const unsigned nid = this->qexpand[i];
@ -451,7 +387,7 @@ class CQHistMaker: public HistMaker<TStats> {
sbuilder[nid].sketch = &sketchs[wid * tree.param.num_feature + fid]; sbuilder[nid].sketch = &sketchs[wid * tree.param.num_feature + fid];
} }
if (col_density != 1.0f) { if (!col_full) {
// first pass, get sum of weight, TODO, optimization to skip first pass // first pass, get sum of weight, TODO, optimization to skip first pass
for (bst_uint j = 0; j < c.length; ++j) { for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index; const bst_uint ridx = c[j].index;
@ -463,7 +399,7 @@ class CQHistMaker: public HistMaker<TStats> {
} else { } else {
for (size_t i = 0; i < this->qexpand.size(); ++i) { for (size_t i = 0; i < this->qexpand.size(); ++i) {
const unsigned nid = this->qexpand[i]; const unsigned nid = this->qexpand[i];
sbuilder[nid].sum_total = root_stats[nid]; sbuilder[nid].sum_total = nstats[nid].sum_hess;
} }
} }
// if only one value, no need to do second pass // if only one value, no need to do second pass
@ -475,7 +411,7 @@ class CQHistMaker: public HistMaker<TStats> {
return; return;
} }
// two pass scan // two pass scan
unsigned max_size = static_cast<unsigned>(this->param.sketch_ratio / this->param.sketch_eps); unsigned max_size = this->param.max_sketch_size();
for (size_t i = 0; i < this->qexpand.size(); ++i) { for (size_t i = 0; i < this->qexpand.size(); ++i) {
const int nid = this->qexpand[i]; const int nid = this->qexpand[i];
sbuilder[nid].Init(max_size); sbuilder[nid].Init(max_size);
@ -493,47 +429,14 @@ class CQHistMaker: public HistMaker<TStats> {
sbuilder[nid].Finalize(max_size); sbuilder[nid].Finalize(max_size);
} }
} }
inline void GetRootStats(const std::vector<bst_gpair> &gpair,
const IFMatrix &fmat,
const RegTree &tree,
std::vector<float> *p_snode) {
std::vector<float> &snode = *p_snode;
thread_temp.resize(this->get_nthread());
snode.resize(tree.param.num_nodes);
#pragma omp parallel
{
const int tid = omp_get_thread_num();
thread_temp[tid].resize(tree.param.num_nodes);
for (size_t i = 0; i < this->qexpand.size(); ++i) {
const unsigned nid = this->qexpand[i];
thread_temp[tid][nid].sum_total = 0.0f;
}
}
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
// setup position
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
const bst_uint ridx = rowset[i];
const int tid = omp_get_thread_num();
if (this->position[ridx] < 0) continue;
thread_temp[tid][this->position[ridx]].sum_total += gpair[ridx].hess;
}
// sum the per thread statistics together
for (size_t j = 0; j < this->qexpand.size(); ++j) {
const int nid = this->qexpand[j];
double wsum = 0.0f;
for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
wsum += thread_temp[tid][nid].sum_total;
}
// update node statistics
snode[nid] = static_cast<bst_float>(wsum);
}
}
typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch; typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
// thread temp data // thread temp data
std::vector< std::vector<SketchEntry> > thread_temp; std::vector< std::vector<BaseMaker::SketchEntry> > thread_sketch;
// used to hold statistics
std::vector< std::vector<TStats> > thread_stats;
// node statistics
std::vector<TStats> node_stats;
// summary array // summary array
WXQSketch::SummaryArray summary_array; WXQSketch::SummaryArray summary_array;
// reducer for summary // reducer for summary

View File

@ -130,80 +130,6 @@ class SketchMaker: public BaseMaker {
inline void SetLeafVec(const TrainParam &param, bst_float *vec) const { inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
} }
}; };
// temporary space to build a sketch
// Collects (feature value, weight) pairs that arrive in ascending feature
// value order and records selected ones as entries of sketch->temp.
// sketch and sum_total are not assigned inside this struct; they have to be
// set before Init / Push / Finalize are used.
struct SketchEntry {
  /*! \brief total sum of amount to be met */
  bst_float sum_total;
  /*!
   * \brief statistics used in the sketch:
   *  rmin is the weight accumulated before last_fvalue,
   *  wmin is the weight accumulated at last_fvalue
   */
  bst_float rmin, wmin;
  /*! \brief last seen feature value */
  bst_float last_fvalue;
  /*!
   * \brief threshold that rmin + wmin has to reach before the pending value
   *  is considered for recording; -1.0f marks that nothing was pushed since Init
   */
  bst_float next_goal;
  // pointer to the sketch to put things in
  utils::WXQuantileSketch<bst_float, bst_float> *sketch;
  /*!
   * \brief initialize the space, sketch must already point to a sketch
   * \param max_size maximum number of entries Push is allowed to record
   */
  inline void Init(unsigned max_size) {
    next_goal = -1.0f;
    rmin = wmin = 0.0f;
    // one slot more than max_size: Finalize may append one entry when
    // temp.size already equals max_size
    sketch->temp.Reserve(max_size + 1);
    sketch->temp.size = 0;
  }
  /*!
   * \brief push a new element to sketch
   * \param fvalue feature value, comes in sorted ascending order
   * \param w weight
   * \param max_size maximum number of entries that may be recorded by Push
   */
  inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
    if (next_goal == -1.0f) {
      // first value since Init: only remember it, nothing to record yet
      next_goal = 0.0f;
      last_fvalue = fvalue;
      wmin = w;
      return;
    }
    if (last_fvalue != fvalue) {
      // the run of last_fvalue is complete, decide whether to record it
      bst_float rmax = rmin + wmin;
      if (rmax >= next_goal) {
        if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
          // verify there is room before the slot is written
          utils::Assert(sketch->temp.size < max_size,
                        "invalid maximum size max_size=%u, stemp.size=%lu\n",
                        max_size, sketch->temp.size);
          // push to sketch
          sketch->temp.data[sketch->temp.size] =
              utils::WXQuantileSketch<bst_float, bst_float>::
              Entry(rmin, rmax, wmin, last_fvalue);
          ++sketch->temp.size;
        }
        if (sketch->temp.size == max_size) {
          // all max_size slots are used: move the goal above 2 * sum_total
          // NOTE(review): presumably unreachable so Push records nothing more - confirm
          next_goal = sum_total * 2.0f + 1e-5f;
        } else {
          next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
        }
      }
      // start accumulating the new value
      rmin = rmax;
      wmin = w;
      last_fvalue = fvalue;
    } else {
      // same value as before, merge the weight
      wmin += w;
    }
  }
  /*!
   * \brief push final unfinished value to the sketch
   * \param max_size the same bound that was passed to Init and Push
   */
  inline void Finalize(unsigned max_size) {
    bst_float rmax = rmin + wmin;
    if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
      // arguments follow the order of the format string: max_size, then size
      utils::Assert(sketch->temp.size <= max_size,
                    "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
                    max_size, sketch->temp.size);
      // push to sketch
      sketch->temp.data[sketch->temp.size] =
          utils::WXQuantileSketch<bst_float, bst_float>::
          Entry(rmin, rmax, wmin, last_fvalue);
      ++sketch->temp.size;
    }
    sketch->PushTemp();
  }
};
inline void BuildSketch(const std::vector<bst_gpair> &gpair, inline void BuildSketch(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,