diff --git a/Makefile b/Makefile
index 12d2507e2..583f12038 100644
--- a/Makefile
+++ b/Makefile
@@ -10,6 +10,12 @@
 else
 	CFLAGS += -fopenmp
 endif
+# by default use c++11
+ifeq ($(no_cxx11),1)
+else
+	CFLAGS += -std=c++11
+endif
+
 # specify tensor path
 BIN = xgboost
 OBJ = updater.o gbm.o io.o main.o
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index ab1c5ef1c..9f35d5731 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -152,7 +152,7 @@ class HistMaker: public BaseMaker {
                           IFMatrix *p_fmat,
                           const BoosterInfo &info,
                           const std::vector<bst_uint> &fset,
-                          const RegTree &tree) = 0;
+                          const RegTree &tree) = 0;
   // initialize the current working set of features in this round
   virtual void InitWorkSet(IFMatrix *p_fmat,
                            const RegTree &tree,
@@ -306,32 +306,45 @@ class CQHistMaker: public HistMaker<TStats> {
     }
     // start to work
     this->wspace.Init(this->param, 1);
-    thread_hist.resize(this->get_nthread());
-    // start accumulating statistics
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fset);
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const ColBatch &batch = iter->Value();
-      // start enumeration
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(dynamic, 1)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        int offset = feat2workindex[batch.col_index[i]];
-        if (offset >= 0) {
-          this->UpdateHistCol(gpair, batch[i], info, tree,
-                              fset, offset,
-                              &thread_hist[omp_get_thread_num()]);
-        }
-      }
-    }
-    for (size_t i = 0; i < this->qexpand.size(); ++i) {
-      const int nid = this->qexpand[i];
-      const int wid = this->node2workindex[nid];
-      this->wspace.hset[0][fset.size() + wid * (fset.size()+1)]
-          .data[0] = node_stats[nid];
-    }
+    // if it is C++11, use lazy evaluation for Allreduce,
+    // to gain speedup in recovery
+#if __cplusplus >= 201103L
+    auto lazy_get_hist = [&]()
+#endif
+    {
+      thread_hist.resize(this->get_nthread());
+      // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fset);
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        // start enumeration
+        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(dynamic, 1)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          int offset = feat2workindex[batch.col_index[i]];
+          if (offset >= 0) {
+            this->UpdateHistCol(gpair, batch[i], info, tree,
+                                fset, offset,
+                                &thread_hist[omp_get_thread_num()]);
+          }
+        }
+      }
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const int nid = this->qexpand[i];
+        const int wid = this->node2workindex[nid];
+        this->wspace.hset[0][fset.size() + wid * (fset.size()+1)]
+            .data[0] = node_stats[nid];
+      }
+    };
     // sync the histogram
-    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());
+    // if it is C++11, use lazy evaluation for Allreduce
+#if __cplusplus >= 201103L
+    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data),
+                            this->wspace.hset[0].data.size(), lazy_get_hist);
+#else
+    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());
+#endif
   }
   virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
                                        const RegTree &tree) {
@@ -353,49 +366,61 @@ class CQHistMaker: public HistMaker<TStats> {
       } else {
         feat2workindex[fset[i]] = -2;
       }
-    }
-
+    }
     this->GetNodeStats(gpair, *p_fmat, tree, info,
-                       &thread_stats, &node_stats);
+                       &thread_stats, &node_stats);
     sketchs.resize(this->qexpand.size() * freal_set.size());
     for (size_t i = 0; i < sketchs.size(); ++i) {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
     }
-    thread_sketch.resize(this->get_nthread());
-    // number of rows in
-    const size_t nrows = p_fmat->buffered_rowset().size();
-    // start accumulating statistics
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(freal_set);
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const ColBatch &batch = iter->Value();
-      // start enumeration
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(dynamic, 1)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        int offset = feat2workindex[batch.col_index[i]];
-        if (offset >= 0) {
-          this->UpdateSketchCol(gpair, batch[i], tree,
-                                node_stats,
-                                freal_set, offset,
-                                batch[i].length == nrows,
-                                &thread_sketch[omp_get_thread_num()]);
-        }
-      }
-    }
+    // initialize the summary array
+    summary_array.resize(sketchs.size());
     // setup maximum size
     unsigned max_size = this->param.max_sketch_size();
-    // synchronize sketch
-    summary_array.resize(sketchs.size());
     for (size_t i = 0; i < sketchs.size(); ++i) {
-      utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
-      sketchs[i].GetSummary(&out);
       summary_array[i].Reserve(max_size);
-      summary_array[i].SetPrune(out, max_size);
     }
+    // if it is C++11, use lazy evaluation for Allreduce
+#if __cplusplus >= 201103L
+    auto lazy_get_summary = [&]()
+#endif
+    {  // get summary
+      thread_sketch.resize(this->get_nthread());
+      // number of rows in
+      const size_t nrows = p_fmat->buffered_rowset().size();
+      // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(freal_set);
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        // start enumeration
+        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(dynamic, 1)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          int offset = feat2workindex[batch.col_index[i]];
+          if (offset >= 0) {
+            this->UpdateSketchCol(gpair, batch[i], tree,
+                                  node_stats,
+                                  freal_set, offset,
+                                  batch[i].length == nrows,
+                                  &thread_sketch[omp_get_thread_num()]);
+          }
+        }
+      }
+      for (size_t i = 0; i < sketchs.size(); ++i) {
+        utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+        sketchs[i].GetSummary(&out);
+        summary_array[i].SetPrune(out, max_size);
+      }
+      utils::Assert(summary_array.size() == sketchs.size(), "shape mismatch");
+    };
     if (summary_array.size() != 0) {
       size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
+#if __cplusplus >= 201103L
+      sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size(), lazy_get_summary);
+#else
       sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
+#endif
     }
     // now we get the final result of sketch, setup the cut
     this->wspace.cut.clear();
@@ -623,7 +648,8 @@ class QuantileHistMaker: public HistMaker<TStats> {
       summary_array[i].Reserve(max_size);
       summary_array[i].SetPrune(out, max_size);
     }
-    size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
+
+    size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
     sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
     // now we get the final result of sketch, setup the cut
     this->wspace.cut.clear();
diff --git a/src/tree/updater_refresh-inl.hpp b/src/tree/updater_refresh-inl.hpp
index 83a81615c..0285f0771 100644
--- a/src/tree/updater_refresh-inl.hpp
+++ b/src/tree/updater_refresh-inl.hpp
@@ -52,40 +52,50 @@ class TreeRefresher: public IUpdater {
       std::fill(stemp[tid].begin(), stemp[tid].end(), TStats(param));
       fvec_temp[tid].Init(trees[0]->param.num_feature);
     }
-    // start accumulating statistics
-    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
-                   "too large batch size ");
-      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nbatch; ++i) {
-        RowBatch::Inst inst = batch[i];
-        const int tid = omp_get_thread_num();
-        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        RegTree::FVec &feats = fvec_temp[tid];
-        feats.Fill(inst);
-        int offset = 0;
-        for (size_t j = 0; j < trees.size(); ++j) {
-          AddStats(*trees[j], feats, gpair, info, ridx,
-                   BeginPtr(stemp[tid]) + offset);
-          offset += trees[j]->param.num_nodes;
+    // if it is C++11, use lazy evaluation for Allreduce,
+    // to gain speedup in recovery
+#if __cplusplus >= 201103L
+    auto lazy_get_stats = [&]()
+#endif
+    {
+      // start accumulating statistics
+      utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const RowBatch &batch = iter->Value();
+        utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
+                     "too large batch size ");
+        const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint i = 0; i < nbatch; ++i) {
+          RowBatch::Inst inst = batch[i];
+          const int tid = omp_get_thread_num();
+          const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+          RegTree::FVec &feats = fvec_temp[tid];
+          feats.Fill(inst);
+          int offset = 0;
+          for (size_t j = 0; j < trees.size(); ++j) {
+            AddStats(*trees[j], feats, gpair, info, ridx,
+                     BeginPtr(stemp[tid]) + offset);
+            offset += trees[j]->param.num_nodes;
+          }
+          feats.Drop(inst);
         }
-        feats.Drop(inst);
       }
-    }
-    // aggregate the statistics
-    int num_nodes = static_cast<int>(stemp[0].size());
-    #pragma omp parallel for schedule(static)
-    for (int nid = 0; nid < num_nodes; ++nid) {
-      for (int tid = 1; tid < nthread; ++tid) {
-        stemp[0][nid].Add(stemp[tid][nid]);
+      // aggregate the statistics
+      int num_nodes = static_cast<int>(stemp[0].size());
+      #pragma omp parallel for schedule(static)
+      for (int nid = 0; nid < num_nodes; ++nid) {
+        for (int tid = 1; tid < nthread; ++tid) {
+          stemp[0][nid].Add(stemp[tid][nid]);
+        }
       }
-    }
-    // AllReduce, add statistics up
+    };
+#if __cplusplus >= 201103L
+    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
+#else
     reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size());
+#endif
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
    param.learning_rate = lr / trees.size();
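Note on the pattern above: all three call sites wrap the expensive local computation (histogram accumulation, sketch summaries, refresh statistics) in a C++11 lambda and pass it to Allreduce instead of running it eagerly. When the reducer can restore the round's result from a checkpoint (e.g. during failure recovery), the lambda is never invoked and the recomputation is skipped. Below is a minimal, self-contained sketch of that idea, not the project's actual reducer: MockReducer, its can_recover flag, and the std::function-based Allreduce signature are hypothetical stand-ins; only the capture-by-reference lambda mirrors the patch.

#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Hypothetical stand-in for the distributed reducer: if a checkpointed
// result can be restored, the local preparation step is skipped entirely;
// otherwise it runs exactly once before the reduction.
class MockReducer {
 public:
  explicit MockReducer(bool can_recover) : can_recover_(can_recover) {}
  void Allreduce(double *buf, size_t n,
                 std::function<void()> prepare = std::function<void()>()) {
    if (can_recover_) {
      // would restore buf[0..n) from a checkpoint; prepare() never runs
      return;
    }
    if (prepare) prepare();  // lazily fill buf only when actually needed
    // ... the real cross-worker reduction over buf[0..n) would happen here ...
    (void)n;
  }
 private:
  bool can_recover_;
};

int main() {
  std::vector<double> hist(16, 0.0);
  MockReducer red(/*can_recover=*/false);
  // mirrors lazy_get_hist in the patch: capture locals by reference and
  // fill the histogram buffer only on demand
  auto lazy_get_hist = [&]() {
    for (size_t i = 0; i < hist.size(); ++i) hist[i] += 1.0;
  };
  red.Allreduce(&hist[0], hist.size(), lazy_get_hist);
  std::cout << "hist[0] = " << hist[0] << std::endl;
  return 0;
}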
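This also explains why the patch moves summary_array.resize() and Reserve() ahead of the lambda in CQHistMaker: the buffer shape must already be known to Allreduce on the recovery path, where the lambda never executes. On pre-C++11 compilers, or when building with make no_cxx11=1 per the Makefile hunk, the #if __cplusplus >= 201103L guards fall back to the original eager Allreduce calls.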