checkin some micro optimization
This commit is contained in:
parent
9c52fc8e22
commit
792cff5abc
2
Makefile
2
Makefile
@ -2,7 +2,7 @@ export CC = gcc
|
|||||||
export CXX = g++
|
export CXX = g++
|
||||||
export MPICXX = mpicxx
|
export MPICXX = mpicxx
|
||||||
export LDFLAGS= -pthread -lm
|
export LDFLAGS= -pthread -lm
|
||||||
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas
|
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops
|
||||||
|
|
||||||
ifeq ($(OS), Windows_NT)
|
ifeq ($(OS), Windows_NT)
|
||||||
export CXX = g++ -m64
|
export CXX = g++ -m64
|
||||||
|
|||||||
@ -48,6 +48,8 @@ struct TrainParam{
|
|||||||
int size_leaf_vector;
|
int size_leaf_vector;
|
||||||
// option for parallelization
|
// option for parallelization
|
||||||
int parallel_option;
|
int parallel_option;
|
||||||
|
// option to enable cacheline optimization
|
||||||
|
int cache_opt;
|
||||||
// number of threads to be used for tree construction,
|
// number of threads to be used for tree construction,
|
||||||
// if OpenMP is enabled, if equals 0, use system default
|
// if OpenMP is enabled, if equals 0, use system default
|
||||||
int nthread;
|
int nthread;
|
||||||
@ -70,6 +72,7 @@ struct TrainParam{
|
|||||||
parallel_option = 2;
|
parallel_option = 2;
|
||||||
sketch_eps = 0.1f;
|
sketch_eps = 0.1f;
|
||||||
sketch_ratio = 2.0f;
|
sketch_ratio = 2.0f;
|
||||||
|
cache_opt = 0;
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief set parameters from outside
|
* \brief set parameters from outside
|
||||||
@ -96,6 +99,7 @@ struct TrainParam{
|
|||||||
if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast<float>(atof(val));
|
if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast<float>(atof(val));
|
||||||
if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
|
if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
|
||||||
if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
|
if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
|
||||||
|
if (!strcmp(name, "cache_opt")) cache_opt = atoi(val);
|
||||||
if (!strcmp(name, "max_depth")) max_depth = atoi(val);
|
if (!strcmp(name, "max_depth")) max_depth = atoi(val);
|
||||||
if (!strcmp(name, "nthread")) nthread = atoi(val);
|
if (!strcmp(name, "nthread")) nthread = atoi(val);
|
||||||
if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
|
if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
|
||||||
@ -192,6 +196,11 @@ struct GradStats {
|
|||||||
double sum_grad;
|
double sum_grad;
|
||||||
/*! \brief sum hessian statistics */
|
/*! \brief sum hessian statistics */
|
||||||
double sum_hess;
|
double sum_hess;
|
||||||
|
/*!
|
||||||
|
* \brief whether this is simply statistics and we only need to call
|
||||||
|
* Add(gpair), instead of Add(gpair, info, ridx)
|
||||||
|
*/
|
||||||
|
static const int kSimpleStats = 1;
|
||||||
/*! \brief constructor, the object must be cleared during construction */
|
/*! \brief constructor, the object must be cleared during construction */
|
||||||
explicit GradStats(const TrainParam ¶m) {
|
explicit GradStats(const TrainParam ¶m) {
|
||||||
this->Clear();
|
this->Clear();
|
||||||
@ -204,7 +213,14 @@ struct GradStats {
|
|||||||
inline static void CheckInfo(const BoosterInfo &info) {
|
inline static void CheckInfo(const BoosterInfo &info) {
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief accumulate statistics,
|
* \brief accumulate statistics
|
||||||
|
* \param p the gradient pair
|
||||||
|
*/
|
||||||
|
inline void Add(bst_gpair p) {
  // unpack the pair and forward to the (grad, hess) overload
  Add(p.grad, p.hess);
}
|
||||||
|
/*!
|
||||||
|
* \brief accumulate statistics, more complicated version
|
||||||
* \param gpair the vector storing the gradient statistics
|
* \param gpair the vector storing the gradient statistics
|
||||||
* \param info the additional information
|
* \param info the additional information
|
||||||
* \param ridx instance index of this instance
|
* \param ridx instance index of this instance
|
||||||
|
|||||||
@ -357,6 +357,99 @@ class ColMaker: public IUpdater {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// update enumeration solution
|
||||||
|
inline void UpdateEnumeration(int nid, bst_gpair gstats,
|
||||||
|
float fvalue, int d_step, bst_uint fid,
|
||||||
|
TStats &c, std::vector<ThreadEntry> &temp) {
|
||||||
|
// get the statistics of nid
|
||||||
|
ThreadEntry &e = temp[nid];
|
||||||
|
// test if first hit, this is fine, because we set 0 during init
|
||||||
|
if (e.stats.Empty()) {
|
||||||
|
e.stats.Add(gstats);
|
||||||
|
e.last_fvalue = fvalue;
|
||||||
|
} else {
|
||||||
|
// try to find a split
|
||||||
|
if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) {
|
||||||
|
c.SetSubstract(snode[nid].stats, e.stats);
|
||||||
|
if (c.sum_hess >= param.min_child_weight) {
|
||||||
|
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
|
||||||
|
e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// update the statistics
|
||||||
|
e.stats.Add(gstats);
|
||||||
|
e.last_fvalue = fvalue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// same as EnumerateSplit, with cacheline prefetch optimization
|
||||||
|
inline void EnumerateSplitCacheOpt(const ColBatch::Entry *begin,
|
||||||
|
const ColBatch::Entry *end,
|
||||||
|
int d_step,
|
||||||
|
bst_uint fid,
|
||||||
|
const std::vector<bst_gpair> &gpair,
|
||||||
|
std::vector<ThreadEntry> &temp) {
|
||||||
|
const std::vector<int> &qexpand = qexpand_;
|
||||||
|
// clear all the temp statistics
|
||||||
|
for (size_t j = 0; j < qexpand.size(); ++j) {
|
||||||
|
temp[qexpand[j]].stats.Clear();
|
||||||
|
}
|
||||||
|
// left statistics
|
||||||
|
TStats c(param);
|
||||||
|
// local cache buffer for position and gradient pair
|
||||||
|
const int kBuffer = 32;
|
||||||
|
int buf_position[kBuffer];
|
||||||
|
bst_gpair buf_gpair[kBuffer];
|
||||||
|
// aligned ending position
|
||||||
|
const ColBatch::Entry *align_end;
|
||||||
|
if (d_step > 0) {
|
||||||
|
align_end = begin + (end - begin) / kBuffer * kBuffer;
|
||||||
|
} else {
|
||||||
|
align_end = begin - (begin - end) / kBuffer * kBuffer;
|
||||||
|
}
|
||||||
|
int i;
|
||||||
|
const ColBatch::Entry *it;
|
||||||
|
const int align_step = d_step * kBuffer;
|
||||||
|
// internal cached loop
|
||||||
|
for (it = begin; it != align_end; it += align_step) {
|
||||||
|
const ColBatch::Entry *p;
|
||||||
|
for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
|
||||||
|
buf_position[i] = position[p->index];
|
||||||
|
buf_gpair[i] = gpair[p->index];
|
||||||
|
}
|
||||||
|
for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
|
||||||
|
const int nid = buf_position[i];
|
||||||
|
if (nid < 0) continue;
|
||||||
|
this->UpdateEnumeration(nid, buf_gpair[i],
|
||||||
|
p->fvalue, d_step,
|
||||||
|
fid, c, temp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// finish up the ending piece
|
||||||
|
for (it = align_end, i = 0; it != end; ++i, it += d_step) {
|
||||||
|
buf_position[i] = position[it->index];
|
||||||
|
buf_gpair[i] = gpair[it->index];
|
||||||
|
}
|
||||||
|
for (it = align_end, i = 0; it != end; ++i, it += d_step) {
|
||||||
|
const int nid = buf_position[i];
|
||||||
|
if (nid < 0) continue;
|
||||||
|
this->UpdateEnumeration(nid, buf_gpair[i],
|
||||||
|
it->fvalue, d_step,
|
||||||
|
fid, c, temp);
|
||||||
|
}
|
||||||
|
// finish updating all statistics, check if it is possible to include all sum statistics
|
||||||
|
for (size_t i = 0; i < qexpand.size(); ++i) {
|
||||||
|
const int nid = qexpand[i];
|
||||||
|
ThreadEntry &e = temp[nid];
|
||||||
|
c.SetSubstract(snode[nid].stats, e.stats);
|
||||||
|
if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
|
||||||
|
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
|
||||||
|
const float gap = std::abs(e.last_fvalue) + rt_eps;
|
||||||
|
const float delta = d_step == +1 ? gap: -gap;
|
||||||
|
e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// enumerate the split values of specific feature
|
// enumerate the split values of specific feature
|
||||||
inline void EnumerateSplit(const ColBatch::Entry *begin,
|
inline void EnumerateSplit(const ColBatch::Entry *begin,
|
||||||
const ColBatch::Entry *end,
|
const ColBatch::Entry *end,
|
||||||
@ -365,6 +458,11 @@ class ColMaker: public IUpdater {
|
|||||||
const std::vector<bst_gpair> &gpair,
|
const std::vector<bst_gpair> &gpair,
|
||||||
const BoosterInfo &info,
|
const BoosterInfo &info,
|
||||||
std::vector<ThreadEntry> &temp) {
|
std::vector<ThreadEntry> &temp) {
|
||||||
|
// use cacheline aware optimization
|
||||||
|
if (TStats::kSimpleStats != 0 && param.cache_opt != 0) {
|
||||||
|
EnumerateSplitCacheOpt(begin, end, d_step, fid, gpair, temp);
|
||||||
|
return;
|
||||||
|
}
|
||||||
const std::vector<int> &qexpand = qexpand_;
|
const std::vector<int> &qexpand = qexpand_;
|
||||||
// clear all the temp statistics
|
// clear all the temp statistics
|
||||||
for (size_t j = 0; j < qexpand.size(); ++j) {
|
for (size_t j = 0; j < qexpand.size(); ++j) {
|
||||||
@ -411,6 +509,7 @@ class ColMaker: public IUpdater {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// update the solution candidate
|
// update the solution candidate
|
||||||
virtual void UpdateSolution(const ColBatch &batch,
|
virtual void UpdateSolution(const ColBatch &batch,
|
||||||
const std::vector<bst_gpair> &gpair,
|
const std::vector<bst_gpair> &gpair,
|
||||||
@ -550,8 +649,8 @@ class ColMaker: public IUpdater {
|
|||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
||||||
const bst_uint ridx = col[j].index;
|
const bst_uint ridx = col[j].index;
|
||||||
const float fvalue = col[j].fvalue;
|
|
||||||
const int nid = this->DecodePosition(ridx);
|
const int nid = this->DecodePosition(ridx);
|
||||||
|
const float fvalue = col[j].fvalue;
|
||||||
// go back to parent, correct those who are not default
|
// go back to parent, correct those who are not default
|
||||||
if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
|
if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
|
||||||
if(fvalue < tree[nid].split_cond()) {
|
if(fvalue < tree[nid].split_cond()) {
|
||||||
|
|||||||
@ -282,6 +282,16 @@ class CQHistMaker: public HistMaker<TStats> {
|
|||||||
utils::Assert(istart != hist.size, "the bound variable must be max");
|
utils::Assert(istart != hist.size, "the bound variable must be max");
|
||||||
hist.data[istart].Add(gpair, info, ridx);
|
hist.data[istart].Add(gpair, info, ridx);
|
||||||
}
|
}
|
||||||
|
/*!
|
||||||
|
* \brief add a histogram to data,
|
||||||
|
* do linear scan, start from istart
|
||||||
|
*/
|
||||||
|
inline void Add(bst_float fv,
|
||||||
|
bst_gpair gstats) {
|
||||||
|
while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
|
||||||
|
utils::Assert(istart != hist.size, "the bound variable must be max");
|
||||||
|
hist.data[istart].Add(gstats);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
// sketch type used for this
|
// sketch type used for this
|
||||||
typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
|
typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
|
||||||
@ -479,11 +489,38 @@ class CQHistMaker: public HistMaker<TStats> {
|
|||||||
hbuilder[nid].istart = 0;
|
hbuilder[nid].istart = 0;
|
||||||
hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)];
|
hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)];
|
||||||
}
|
}
|
||||||
for (bst_uint j = 0; j < c.length; ++j) {
|
if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
|
||||||
const bst_uint ridx = c[j].index;
|
const bst_uint kBuffer = 32;
|
||||||
const int nid = this->position[ridx];
|
bst_uint align_length = c.length / kBuffer * kBuffer;
|
||||||
if (nid >= 0) {
|
int buf_position[kBuffer];
|
||||||
hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx);
|
bst_gpair buf_gpair[kBuffer];
|
||||||
|
for (bst_uint j = 0; j < align_length; j += kBuffer) {
|
||||||
|
for (bst_uint i = 0; i < kBuffer; ++i) {
|
||||||
|
bst_uint ridx = c[j + i].index;
|
||||||
|
buf_position[i] = this->position[ridx];
|
||||||
|
buf_gpair[i] = gpair[ridx];
|
||||||
|
}
|
||||||
|
for (bst_uint i = 0; i < kBuffer; ++i) {
|
||||||
|
const int nid = buf_position[i];
|
||||||
|
if (nid >= 0) {
|
||||||
|
hbuilder[nid].Add(c[j + i].fvalue, buf_gpair[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (bst_uint j = align_length; j < c.length; ++j) {
|
||||||
|
const bst_uint ridx = c[j].index;
|
||||||
|
const int nid = this->position[ridx];
|
||||||
|
if (nid >= 0) {
|
||||||
|
hbuilder[nid].Add(c[j].fvalue, gpair[ridx]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (bst_uint j = 0; j < c.length; ++j) {
|
||||||
|
const bst_uint ridx = c[j].index;
|
||||||
|
const int nid = this->position[ridx];
|
||||||
|
if (nid >= 0) {
|
||||||
|
hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -536,11 +573,38 @@ class CQHistMaker: public HistMaker<TStats> {
|
|||||||
sbuilder[nid].Init(max_size);
|
sbuilder[nid].Init(max_size);
|
||||||
}
|
}
|
||||||
// second pass, build the sketch
|
// second pass, build the sketch
|
||||||
for (bst_uint j = 0; j < c.length; ++j) {
|
if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
|
||||||
const bst_uint ridx = c[j].index;
|
const bst_uint kBuffer = 32;
|
||||||
const int nid = this->position[ridx];
|
bst_uint align_length = c.length / kBuffer * kBuffer;
|
||||||
if (nid >= 0) {
|
int buf_position[kBuffer];
|
||||||
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
|
bst_float buf_hess[kBuffer];
|
||||||
|
for (bst_uint j = 0; j < align_length; j += kBuffer) {
|
||||||
|
for (bst_uint i = 0; i < kBuffer; ++i) {
|
||||||
|
bst_uint ridx = c[j + i].index;
|
||||||
|
buf_position[i] = this->position[ridx];
|
||||||
|
buf_hess[i] = gpair[ridx].hess;
|
||||||
|
}
|
||||||
|
for (bst_uint i = 0; i < kBuffer; ++i) {
|
||||||
|
const int nid = buf_position[i];
|
||||||
|
if (nid >= 0) {
|
||||||
|
sbuilder[nid].Push(c[j + i].fvalue, buf_hess[i], max_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (bst_uint j = align_length; j < c.length; ++j) {
|
||||||
|
const bst_uint ridx = c[j].index;
|
||||||
|
const int nid = this->position[ridx];
|
||||||
|
if (nid >= 0) {
|
||||||
|
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (bst_uint j = 0; j < c.length; ++j) {
|
||||||
|
const bst_uint ridx = c[j].index;
|
||||||
|
const int nid = this->position[ridx];
|
||||||
|
if (nid >= 0) {
|
||||||
|
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user