checkin some micro optimization
This commit is contained in:
parent
9c52fc8e22
commit
792cff5abc
2
Makefile
2
Makefile
@ -2,7 +2,7 @@ export CC = gcc
|
||||
export CXX = g++
|
||||
export MPICXX = mpicxx
|
||||
export LDFLAGS= -pthread -lm
|
||||
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas
|
||||
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops
|
||||
|
||||
ifeq ($(OS), Windows_NT)
|
||||
export CXX = g++ -m64
|
||||
|
||||
@ -48,6 +48,8 @@ struct TrainParam{
|
||||
int size_leaf_vector;
|
||||
// option for parallelization
|
||||
int parallel_option;
|
||||
// option to open cacheline optimizaton
|
||||
int cache_opt;
|
||||
// number of threads to be used for tree construction,
|
||||
// if OpenMP is enabled, if equals 0, use system default
|
||||
int nthread;
|
||||
@ -70,6 +72,7 @@ struct TrainParam{
|
||||
parallel_option = 2;
|
||||
sketch_eps = 0.1f;
|
||||
sketch_ratio = 2.0f;
|
||||
cache_opt = 0;
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
@ -96,6 +99,7 @@ struct TrainParam{
|
||||
if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
|
||||
if (!strcmp(name, "cache_opt")) cache_opt = atoi(val);
|
||||
if (!strcmp(name, "max_depth")) max_depth = atoi(val);
|
||||
if (!strcmp(name, "nthread")) nthread = atoi(val);
|
||||
if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
|
||||
@ -192,6 +196,11 @@ struct GradStats {
|
||||
double sum_grad;
|
||||
/*! \brief sum hessian statistics */
|
||||
double sum_hess;
|
||||
/*!
|
||||
* \brief whether this is simply statistics and we only need to call
|
||||
* Add(gpair), instead of Add(gpair, info, ridx)
|
||||
*/
|
||||
static const int kSimpleStats = 1;
|
||||
/*! \brief constructor, the object must be cleared during construction */
|
||||
explicit GradStats(const TrainParam ¶m) {
|
||||
this->Clear();
|
||||
@ -204,7 +213,14 @@ struct GradStats {
|
||||
inline static void CheckInfo(const BoosterInfo &info) {
|
||||
}
|
||||
/*!
|
||||
* \brief accumulate statistics,
|
||||
* \brief accumulate statistics
|
||||
* \param p the gradient pair
|
||||
*/
|
||||
inline void Add(bst_gpair p) {
|
||||
this->Add(p.grad, p.hess);
|
||||
}
|
||||
/*!
|
||||
* \brief accumulate statistics, more complicated version
|
||||
* \param gpair the vector storing the gradient statistics
|
||||
* \param info the additional information
|
||||
* \param ridx instance index of this instance
|
||||
|
||||
@ -356,7 +356,100 @@ class ColMaker: public IUpdater {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// update enumeration solution
|
||||
inline void UpdateEnumeration(int nid, bst_gpair gstats,
|
||||
float fvalue, int d_step, bst_uint fid,
|
||||
TStats &c, std::vector<ThreadEntry> &temp) {
|
||||
// get the statistics of nid
|
||||
ThreadEntry &e = temp[nid];
|
||||
// test if first hit, this is fine, because we set 0 during init
|
||||
if (e.stats.Empty()) {
|
||||
e.stats.Add(gstats);
|
||||
e.last_fvalue = fvalue;
|
||||
} else {
|
||||
// try to find a split
|
||||
if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) {
|
||||
c.SetSubstract(snode[nid].stats, e.stats);
|
||||
if (c.sum_hess >= param.min_child_weight) {
|
||||
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
|
||||
e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
|
||||
}
|
||||
}
|
||||
// update the statistics
|
||||
e.stats.Add(gstats);
|
||||
e.last_fvalue = fvalue;
|
||||
}
|
||||
}
|
||||
// same as EnumerateSplit, with cacheline prefetch optimization
|
||||
inline void EnumerateSplitCacheOpt(const ColBatch::Entry *begin,
|
||||
const ColBatch::Entry *end,
|
||||
int d_step,
|
||||
bst_uint fid,
|
||||
const std::vector<bst_gpair> &gpair,
|
||||
std::vector<ThreadEntry> &temp) {
|
||||
const std::vector<int> &qexpand = qexpand_;
|
||||
// clear all the temp statistics
|
||||
for (size_t j = 0; j < qexpand.size(); ++j) {
|
||||
temp[qexpand[j]].stats.Clear();
|
||||
}
|
||||
// left statistics
|
||||
TStats c(param);
|
||||
// local cache buffer for position and gradient pair
|
||||
const int kBuffer = 32;
|
||||
int buf_position[kBuffer];
|
||||
bst_gpair buf_gpair[kBuffer];
|
||||
// aligned ending position
|
||||
const ColBatch::Entry *align_end;
|
||||
if (d_step > 0) {
|
||||
align_end = begin + (end - begin) / kBuffer * kBuffer;
|
||||
} else {
|
||||
align_end = begin - (begin - end) / kBuffer * kBuffer;
|
||||
}
|
||||
int i;
|
||||
const ColBatch::Entry *it;
|
||||
const int align_step = d_step * kBuffer;
|
||||
// internal cached loop
|
||||
for (it = begin; it != align_end; it += align_step) {
|
||||
const ColBatch::Entry *p;
|
||||
for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
|
||||
buf_position[i] = position[p->index];
|
||||
buf_gpair[i] = gpair[p->index];
|
||||
}
|
||||
for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
|
||||
const int nid = buf_position[i];
|
||||
if (nid < 0) continue;
|
||||
this->UpdateEnumeration(nid, buf_gpair[i],
|
||||
p->fvalue, d_step,
|
||||
fid, c, temp);
|
||||
}
|
||||
}
|
||||
// finish up the ending piece
|
||||
for (it = align_end, i = 0; it != end; ++i, it += d_step) {
|
||||
buf_position[i] = position[it->index];
|
||||
buf_gpair[i] = gpair[it->index];
|
||||
}
|
||||
for (it = align_end, i = 0; it != end; ++i, it += d_step) {
|
||||
const int nid = buf_position[i];
|
||||
if (nid < 0) continue;
|
||||
this->UpdateEnumeration(nid, buf_gpair[i],
|
||||
it->fvalue, d_step,
|
||||
fid, c, temp);
|
||||
}
|
||||
// finish updating all statistics, check if it is possible to include all sum statistics
|
||||
for (size_t i = 0; i < qexpand.size(); ++i) {
|
||||
const int nid = qexpand[i];
|
||||
ThreadEntry &e = temp[nid];
|
||||
c.SetSubstract(snode[nid].stats, e.stats);
|
||||
if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
|
||||
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
|
||||
const float gap = std::abs(e.last_fvalue) + rt_eps;
|
||||
const float delta = d_step == +1 ? gap: -gap;
|
||||
e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// enumerate the split values of specific feature
|
||||
inline void EnumerateSplit(const ColBatch::Entry *begin,
|
||||
const ColBatch::Entry *end,
|
||||
@ -365,6 +458,11 @@ class ColMaker: public IUpdater {
|
||||
const std::vector<bst_gpair> &gpair,
|
||||
const BoosterInfo &info,
|
||||
std::vector<ThreadEntry> &temp) {
|
||||
// use cacheline aware optimization
|
||||
if (TStats::kSimpleStats != 0 && param.cache_opt != 0) {
|
||||
EnumerateSplitCacheOpt(begin, end, d_step, fid, gpair, temp);
|
||||
return;
|
||||
}
|
||||
const std::vector<int> &qexpand = qexpand_;
|
||||
// clear all the temp statistics
|
||||
for (size_t j = 0; j < qexpand.size(); ++j) {
|
||||
@ -411,6 +509,7 @@ class ColMaker: public IUpdater {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// update the solution candidate
|
||||
virtual void UpdateSolution(const ColBatch &batch,
|
||||
const std::vector<bst_gpair> &gpair,
|
||||
@ -550,8 +649,8 @@ class ColMaker: public IUpdater {
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
||||
const bst_uint ridx = col[j].index;
|
||||
const float fvalue = col[j].fvalue;
|
||||
const int nid = this->DecodePosition(ridx);
|
||||
const float fvalue = col[j].fvalue;
|
||||
// go back to parent, correct those who are not default
|
||||
if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
|
||||
if(fvalue < tree[nid].split_cond()) {
|
||||
|
||||
@ -282,6 +282,16 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
utils::Assert(istart != hist.size, "the bound variable must be max");
|
||||
hist.data[istart].Add(gpair, info, ridx);
|
||||
}
|
||||
/*!
|
||||
* \brief add a histogram to data,
|
||||
* do linear scan, start from istart
|
||||
*/
|
||||
inline void Add(bst_float fv,
|
||||
bst_gpair gstats) {
|
||||
while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
|
||||
utils::Assert(istart != hist.size, "the bound variable must be max");
|
||||
hist.data[istart].Add(gstats);
|
||||
}
|
||||
};
|
||||
// sketch type used for this
|
||||
typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
|
||||
@ -479,11 +489,38 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
hbuilder[nid].istart = 0;
|
||||
hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)];
|
||||
}
|
||||
for (bst_uint j = 0; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
if (nid >= 0) {
|
||||
hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx);
|
||||
if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
|
||||
const bst_uint kBuffer = 32;
|
||||
bst_uint align_length = c.length / kBuffer * kBuffer;
|
||||
int buf_position[kBuffer];
|
||||
bst_gpair buf_gpair[kBuffer];
|
||||
for (bst_uint j = 0; j < align_length; j += kBuffer) {
|
||||
for (bst_uint i = 0; i < kBuffer; ++i) {
|
||||
bst_uint ridx = c[j + i].index;
|
||||
buf_position[i] = this->position[ridx];
|
||||
buf_gpair[i] = gpair[ridx];
|
||||
}
|
||||
for (bst_uint i = 0; i < kBuffer; ++i) {
|
||||
const int nid = buf_position[i];
|
||||
if (nid >= 0) {
|
||||
hbuilder[nid].Add(c[j + i].fvalue, buf_gpair[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (bst_uint j = align_length; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
if (nid >= 0) {
|
||||
hbuilder[nid].Add(c[j].fvalue, gpair[ridx]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (bst_uint j = 0; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
if (nid >= 0) {
|
||||
hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -536,11 +573,38 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
sbuilder[nid].Init(max_size);
|
||||
}
|
||||
// second pass, build the sketch
|
||||
for (bst_uint j = 0; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
if (nid >= 0) {
|
||||
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
|
||||
if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
|
||||
const bst_uint kBuffer = 32;
|
||||
bst_uint align_length = c.length / kBuffer * kBuffer;
|
||||
int buf_position[kBuffer];
|
||||
bst_float buf_hess[kBuffer];
|
||||
for (bst_uint j = 0; j < align_length; j += kBuffer) {
|
||||
for (bst_uint i = 0; i < kBuffer; ++i) {
|
||||
bst_uint ridx = c[j + i].index;
|
||||
buf_position[i] = this->position[ridx];
|
||||
buf_hess[i] = gpair[ridx].hess;
|
||||
}
|
||||
for (bst_uint i = 0; i < kBuffer; ++i) {
|
||||
const int nid = buf_position[i];
|
||||
if (nid >= 0) {
|
||||
sbuilder[nid].Push(c[j + i].fvalue, buf_hess[i], max_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (bst_uint j = align_length; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
if (nid >= 0) {
|
||||
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (bst_uint j = 0; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
if (nid >= 0) {
|
||||
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user