diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 82a373352..df43d3b7f 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -18,7 +18,7 @@ struct RowBatchPage {
     utils::Assert(data_ != NULL, "fail to allocate row batch page");
     this->Clear();
   }
-  ~RowBatchPage(void) {
+  ~BinaryPage(void) {
     if (data_ != NULL) delete [] data_;
   }
   /*!
diff --git a/src/tree/param.h b/src/tree/param.h
index 52c273749..92bc1c990 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -37,7 +37,9 @@ struct TrainParam{
   // speed optimization for dense column
   float opt_dense_col;
   // leaf vector size
-  int size_leaf_vector;
+  int size_leaf_vector;
+  // option for parallelization
+  int parallel_option;
   // number of threads to be used for tree construction,
   // if OpenMP is enabled, if equals 0, use system default
   int nthread;
@@ -55,6 +57,7 @@ struct TrainParam{
     opt_dense_col = 1.0f;
     nthread = 0;
     size_leaf_vector = 0;
+    parallel_option = 0;
   }
   /*!
   * \brief set parameters from outside
@@ -79,6 +82,7 @@ struct TrainParam{
     if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
     if (!strcmp(name, "max_depth")) max_depth = atoi(val);
     if (!strcmp(name, "nthread")) nthread = atoi(val);
+    if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
     if (!strcmp(name, "default_direction")) {
       if (!strcmp(val, "learn")) default_direction = 0;
       if (!strcmp(val, "left")) default_direction = 1;
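Note: the new `parallel_option` key follows the same pattern as the existing keys above, i.e. a default of 0 in the constructor block and an `atoi` branch in the string setter. A minimal usage sketch, assuming the setter in the last hunk is the usual `TrainParam::SetParam(const char *name, const char *val)` and that the `xgboost::tree` namespace applies (neither is shown in these hunks):

    xgboost::tree::TrainParam param;          // constructor leaves parallel_option = 0
    param.SetParam("parallel_option", "1");   // request the row-parallel split search below
    param.SetParam("nthread", "4");           // OpenMP thread count, set as before

In a configuration file this should correspond to a line such as `parallel_option = 1`.
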
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index a8cf6ea7f..bf93cb7b5 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -45,15 +45,19 @@ class ColMaker: public IUpdater {
   // data structure
   /*! \brief per thread x per node entry to store tmp data */
   struct ThreadEntry {
-    /*! \brief statistics of data*/
+    /*! \brief statistics of data */
     TStats stats;
+    /*! \brief extra statistics of data */
+    TStats stats_extra;
     /*! \brief last feature value scanned */
     float last_fvalue;
+    /*! \brief first feature value scanned */
+    float first_fvalue;
     /*! \brief current best solution */
     SplitEntry best;
     // constructor
     explicit ThreadEntry(const TrainParam &param)
-        : stats(param) {
+        : stats(param), stats_extra(param) {
     }
   };
   struct NodeEntry {
@@ -219,7 +223,137 @@ class ColMaker: public IUpdater {
       }
       // use new nodes for qexpand
       qexpand = newnodes;
-    }
+    }
+    // parallel find the best split of current fid
+    // this function does not support nested functions
+    inline void ParallelFindSplit(const ColBatch::Inst &col,
+                                  bst_uint fid,
+                                  const IFMatrix &fmat,
+                                  const std::vector<bst_gpair> &gpair,
+                                  const BoosterInfo &info) {
+      bool need_forward = param.need_forward_search(fmat.GetColDensity(fid));
+      bool need_backward = param.need_backward_search(fmat.GetColDensity(fid));
+      int nthread;
+      #pragma omp parallel
+      {
+        const int tid = omp_get_thread_num();
+        std::vector<ThreadEntry> &temp = stemp[tid];
+        // cleanup temp statistics
+        for (size_t j = 0; j < qexpand.size(); ++j) {
+          temp[qexpand[j]].stats.Clear();
+        }
+        nthread = omp_get_num_threads();
+        bst_uint step = (col.length + nthread - 1) / nthread;
+        bst_uint end = std::min(col.length, step * (tid + 1));
+        for (bst_uint i = tid * step; i < end; ++i) {
+          const bst_uint ridx = col[i].index;
+          const int nid = position[ridx];
+          if (nid < 0) continue;
+          const float fvalue = col[i].fvalue;
+          if (temp[nid].stats.Empty()) {
+            temp[nid].first_fvalue = fvalue;
+          }
+          temp[nid].stats.Add(gpair, info, ridx);
+          temp[nid].last_fvalue = fvalue;
+        }
+      }
+      // start collecting the partial sum statistics
+      bst_omp_uint nnode = static_cast<bst_omp_uint>(qexpand.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint j = 0; j < nnode; ++j) {
+        const int nid = qexpand[j];
+        TStats sum(param), tmp(param), c(param);
+        for (int tid = 0; tid < nthread; ++tid) {
+          tmp = stemp[tid][nid].stats;
+          stemp[tid][nid].stats = sum;
+          sum.Add(tmp);
+          if (tid != 0) {
+            std::swap(stemp[tid - 1][nid].last_fvalue, stemp[tid][nid].first_fvalue);
+          }
+        }
+        for (int tid = 0; tid < nthread; ++tid) {
+          stemp[tid][nid].stats_extra = sum;
+          ThreadEntry &e = stemp[tid][nid];
+          float fsplit;
+          if (tid != 0) {
+            if (fabsf(stemp[tid - 1][nid].last_fvalue - e.first_fvalue) > rt_2eps) {
+              fsplit = (stemp[tid - 1][nid].last_fvalue - e.first_fvalue) * 0.5f;
+            } else {
+              continue;
+            }
+          } else {
+            fsplit = e.first_fvalue - rt_eps;
+          }
+          if (need_forward && tid != 0) {
+            c.SetSubstract(snode[nid].stats, e.stats);
+            if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) {
+              bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+              e.best.Update(loss_chg, fid, fsplit, false);
+            }
+          }
+          if (need_backward) {
+            tmp.SetSubstract(sum, e.stats);
+            c.SetSubstract(snode[nid].stats, tmp);
+            if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) {
+              bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+              e.best.Update(loss_chg, fid, fsplit, true);
+            }
+          }
+        }
+        if (need_backward) {
+          tmp = sum;
+          ThreadEntry &e = stemp[nthread - 1][nid];
+          c.SetSubstract(snode[nid].stats, tmp);
+          if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) {
+            bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+            e.best.Update(loss_chg, fid, e.last_fvalue + rt_eps, true);
+          }
+        }
+      }
+      // rescan, generate candidate split
+      #pragma omp parallel
+      {
+        TStats c(param), cright(param);
+        const int tid = omp_get_thread_num();
+        std::vector<ThreadEntry> &temp = stemp[tid];
+        nthread = static_cast<int>(omp_get_num_threads());
+        bst_uint step = (col.length + nthread - 1) / nthread;
+        bst_uint end = std::min(col.length, step * (tid + 1));
+        for (bst_uint i = tid * step; i < end; ++i) {
+          const bst_uint ridx = col[i].index;
+          const int nid = position[ridx];
+          if (nid < 0) continue;
+          const float fvalue = col[i].fvalue;
+          // get the statistics of nid
+          ThreadEntry &e = temp[nid];
+          if (e.stats.Empty()) {
+            e.stats.Add(gpair, info, ridx);
+            e.first_fvalue = fvalue;
+          } else {
+            // forward default right
+            if (fabsf(fvalue - e.first_fvalue) > rt_2eps) {
+              if (need_forward) {
+                c.SetSubstract(snode[nid].stats, e.stats);
+                if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) {
+                  bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+                  e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, false);
+                }
+              }
+              if (need_backward) {
+                cright.SetSubstract(e.stats_extra, e.stats);
+                c.SetSubstract(snode[nid].stats, cright);
+                if (c.sum_hess >= param.min_child_weight && cright.sum_hess >= param.min_child_weight) {
+                  bst_float loss_chg = static_cast<bst_float>(cright.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+                  e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, true);
+                }
+              }
+            }
+            e.stats.Add(gpair, info, ridx);
+            e.first_fvalue = fvalue;
+          }
+        }
+      }
+    }
     // enumerate the split values of specific feature
     inline void EnumerateSplit(const ColBatch::Entry *begin,
                                const ColBatch::Entry *end,
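Note: the second loop in ParallelFindSplit above rotates each thread's chunk statistics into an exclusive prefix sum. After it runs, `stemp[tid][nid].stats` holds the statistics of all rows scanned by threads 0..tid-1 for that node, and `stats_extra` holds the node total, which is what the rescan pass subtracts against. A stand-alone sketch of that rotation, using plain doubles instead of the gradient statistics type:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Exclusive prefix sum over per-thread chunk sums, mirroring the
    // tmp/sum rotation in ParallelFindSplit; the return value plays the
    // role of stats_extra (the whole-node total).
    double ExclusiveScan(std::vector<double> &partial) {
      double sum = 0.0;
      for (std::size_t tid = 0; tid < partial.size(); ++tid) {
        double tmp = partial[tid];  // this thread's own chunk sum
        partial[tid] = sum;         // sum of everything to its left
        sum += tmp;
      }
      return sum;
    }

    int main(void) {
      std::vector<double> partial;
      partial.push_back(3.0); partial.push_back(1.0); partial.push_back(4.0);
      double total = ExclusiveScan(partial);  // partial becomes {0, 3, 4}, total = 8
      std::printf("%g %g %g | %g\n", partial[0], partial[1], partial[2], total);
      return 0;
    }
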
@@ -272,6 +406,38 @@ class ColMaker: public IUpdater {
         }
       }
     }
+    // update the solution candidate
+    virtual void UpdateSolution(const ColBatch &batch,
+                                const std::vector<bst_gpair> &gpair,
+                                const IFMatrix &fmat,
+                                const BoosterInfo &info) {
+      // start enumeration
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+      #if defined(_OPENMP)
+      const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
+      #endif
+      if (param.parallel_option == 0) {
+        #pragma omp parallel for schedule(dynamic, batch_size)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          const bst_uint fid = batch.col_index[i];
+          const int tid = omp_get_thread_num();
+          const ColBatch::Inst c = batch[i];
+          if (param.need_forward_search(fmat.GetColDensity(fid))) {
+            this->EnumerateSplit(c.data, c.data + c.length, +1,
+                                 fid, gpair, info, stemp[tid]);
+          }
+          if (param.need_backward_search(fmat.GetColDensity(fid))) {
+            this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
+                                 fid, gpair, info, stemp[tid]);
+          }
+        }
+      } else {
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          this->ParallelFindSplit(batch[i], batch.col_index[i],
+                                  fmat, gpair, info);
+        }
+      }
+    }
     // find splits at current level, do split per level
     inline void FindSplit(int depth,
                           const std::vector<int> &qexpand,
@@ -288,26 +454,7 @@ class ColMaker: public IUpdater {
       }
       utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
       while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
-        // start enumeration
-        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-        #if defined(_OPENMP)
-        const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
-        #endif
-        #pragma omp parallel for schedule(dynamic, batch_size)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          const bst_uint fid = batch.col_index[i];
-          const int tid = omp_get_thread_num();
-          const ColBatch::Inst c = batch[i];
-          if (param.need_forward_search(p_fmat->GetColDensity(fid))) {
-            this->EnumerateSplit(c.data, c.data + c.length, +1,
-                                 fid, gpair, info, stemp[tid]);
-          }
-          if (param.need_backward_search(p_fmat->GetColDensity(fid))) {
-            this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
-                                 fid, gpair, info, stemp[tid]);
-          }
-        }
+        this->UpdateSolution(iter->Value(), gpair, *p_fmat, info);
       }
       // after this each thread's stemp will get the best candidates, aggregate results
       for (size_t i = 0; i < qexpand.size(); ++i) {
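Note: UpdateSolution now switches between two parallel layouts. With `parallel_option == 0` it keeps the original scheme of one feature column per OpenMP task (for example, with 1000 columns and 8 threads the dynamic chunk size is max(1000 / 8 / 32, 1) = 3 columns); any other value visits the columns one by one and lets all threads share the rows of each column through ParallelFindSplit. A schematic contrast of the two loops; the two Scan helpers are placeholders standing in for EnumerateSplit and ParallelFindSplit, not functions from this patch:

    #include <cstdio>

    // Placeholder for the sequential forward/backward scan of one column.
    static void ScanColumnOnOneThread(int fid) { std::printf("seq %d\n", fid); }
    // Placeholder for a scan in which all threads split one column's rows.
    static void ScanColumnWithAllThreads(int fid) { std::printf("coop %d\n", fid); }

    void UpdateSolutionSketch(int num_features, int parallel_option) {
      if (parallel_option == 0) {
        // feature-parallel: effective when there are many columns
        #pragma omp parallel for schedule(dynamic, 1)
        for (int fid = 0; fid < num_features; ++fid) {
          ScanColumnOnOneThread(fid);
        }
      } else {
        // row-parallel: columns visited serially, threads cooperate inside each one
        for (int fid = 0; fid < num_features; ++fid) {
          ScanColumnWithAllThreads(fid);  // would contain its own omp parallel region
        }
      }
    }
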
@@ -325,6 +472,7 @@ class ColMaker: public IUpdater {
         }
       }
     }
+    // reset position of each data point after the split is created in the tree
     inline void ResetPosition(const std::vector<int> &qexpand,
                               IFMatrix *p_fmat, const RegTree &tree) {
       const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
diff --git a/src/utils/io.h b/src/utils/io.h
index 23fa0d468..141d83f8c 100644
--- a/src/utils/io.h
+++ b/src/utils/io.h
@@ -100,7 +100,6 @@ class ISeekStream: public IStream {
 /*! \brief implementation of file i/o stream */
 class FileStream : public ISeekStream {
  public:
-  explicit FileStream(void) {}
   explicit FileStream(FILE *fp) {
     this->fp = fp;
   }
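Note on the io.h change: with the default constructor removed, a FileStream has to be constructed from an already opened FILE handle. A minimal sketch of the remaining usage pattern; the file name and payload are made up, and the xgboost::utils namespace and the IStream-style Write(ptr, size) call are assumed from the rest of the codebase rather than shown in this hunk:

    #include <cstdio>
    #include "utils/io.h"   // assumed include path inside the source tree

    int main(void) {
      std::FILE *fp = std::fopen("dump.bin", "wb");  // made-up file name
      if (fp == NULL) return 1;
      xgboost::utils::FileStream fs(fp);    // the only constructor left
      int version = 3;                      // example payload
      fs.Write(&version, sizeof(version));  // assumed IStream interface
      std::fclose(fp);
      return 0;
    }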