diff --git a/Makefile b/Makefile
index 12d2507e2..583f12038 100644
--- a/Makefile
+++ b/Makefile
@@ -10,6 +10,12 @@
 else
 	CFLAGS += -fopenmp
 endif
+# by default use c++11
+ifeq ($(no_cxx11),1)
+else
+	CFLAGS += -std=c++11
+endif
+
 # specify tensor path
 BIN = xgboost
 OBJ = updater.o gbm.o io.o main.o
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index ab1c5ef1c..9f35d5731 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -152,7 +152,7 @@ class HistMaker: public BaseMaker {
                           IFMatrix *p_fmat,
                           const BoosterInfo &info,
                           const std::vector<bst_uint> &fset,
-                          const RegTree &tree) = 0;
+                          const RegTree &tree) = 0;
   // initialize the current working set of features in this round
   virtual void InitWorkSet(IFMatrix *p_fmat,
                            const RegTree &tree,
@@ -306,32 +306,45 @@ class CQHistMaker: public HistMaker<TStats> {
     }
     // start to work
     this->wspace.Init(this->param, 1);
-    thread_hist.resize(this->get_nthread());
-    // start accumulating statistics
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fset);
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const ColBatch &batch = iter->Value();
-      // start enumeration
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(dynamic, 1)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        int offset = feat2workindex[batch.col_index[i]];
-        if (offset >= 0) {
-          this->UpdateHistCol(gpair, batch[i], info, tree,
-                              fset, offset,
-                              &thread_hist[omp_get_thread_num()]);
-        }
-      }
-    }
-    for (size_t i = 0; i < this->qexpand.size(); ++i) {
-      const int nid = this->qexpand[i];
-      const int wid = this->node2workindex[nid];
-      this->wspace.hset[0][fset.size() + wid * (fset.size()+1)]
-          .data[0] = node_stats[nid];
-    }
+    // if it is C++11, use lazy evaluation for Allreduce,
+    // to gain speedup in recovery
+#if __cplusplus >= 201103L
+    auto lazy_get_hist = [&]()
+#endif
+    {
+      thread_hist.resize(this->get_nthread());
+      // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fset);
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        // start enumeration
+        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(dynamic, 1)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          int offset = feat2workindex[batch.col_index[i]];
+          if (offset >= 0) {
+            this->UpdateHistCol(gpair, batch[i], info, tree,
+                                fset, offset,
+                                &thread_hist[omp_get_thread_num()]);
+          }
+        }
+      }
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const int nid = this->qexpand[i];
+        const int wid = this->node2workindex[nid];
+        this->wspace.hset[0][fset.size() + wid * (fset.size()+1)]
+            .data[0] = node_stats[nid];
+      }
+    };
     // sync the histogram
-    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());
+    // if it is C++11, use lazy evaluation for Allreduce
+#if __cplusplus >= 201103L
+    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data),
+                            this->wspace.hset[0].data.size(), lazy_get_hist);
+#else
+    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());
+#endif
   }
   virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
                                        const RegTree &tree) {
@@ -353,49 +366,61 @@ class CQHistMaker: public HistMaker<TStats> {
       } else {
         feat2workindex[fset[i]] = -2;
       }
-    }
-
+    }
     this->GetNodeStats(gpair, *p_fmat, tree, info,
-                       &thread_stats, &node_stats);
+                       &thread_stats, &node_stats);
     sketchs.resize(this->qexpand.size() * freal_set.size());
     for (size_t i = 0; i < sketchs.size(); ++i) {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
     }
-    thread_sketch.resize(this->get_nthread());
-    // number of rows in
-    const size_t nrows = p_fmat->buffered_rowset().size();
-    // start accumulating statistics
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(freal_set);
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const ColBatch &batch = iter->Value();
-      // start enumeration
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(dynamic, 1)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        int offset = feat2workindex[batch.col_index[i]];
-        if (offset >= 0) {
-          this->UpdateSketchCol(gpair, batch[i], tree,
-                                node_stats,
-                                freal_set, offset,
-                                batch[i].length == nrows,
-                                &thread_sketch[omp_get_thread_num()]);
-        }
-      }
-    }
+    // initialize the summary array
+    summary_array.resize(sketchs.size());
     // setup maximum size
     unsigned max_size = this->param.max_sketch_size();
-    // synchronize sketch
-    summary_array.resize(sketchs.size());
     for (size_t i = 0; i < sketchs.size(); ++i) {
-      utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
-      sketchs[i].GetSummary(&out);
       summary_array[i].Reserve(max_size);
-      summary_array[i].SetPrune(out, max_size);
     }
+    // if it is C++11, use lazy evaluation for Allreduce
+#if __cplusplus >= 201103L
+    auto lazy_get_summary = [&]()
+#endif
+    {  // get summary
+      thread_sketch.resize(this->get_nthread());
+      // number of rows in
+      const size_t nrows = p_fmat->buffered_rowset().size();
+      // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(freal_set);
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        // start enumeration
+        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(dynamic, 1)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          int offset = feat2workindex[batch.col_index[i]];
+          if (offset >= 0) {
+            this->UpdateSketchCol(gpair, batch[i], tree,
+                                  node_stats,
+                                  freal_set, offset,
+                                  batch[i].length == nrows,
+                                  &thread_sketch[omp_get_thread_num()]);
+          }
+        }
+      }
+      for (size_t i = 0; i < sketchs.size(); ++i) {
+        utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+        sketchs[i].GetSummary(&out);
+        summary_array[i].SetPrune(out, max_size);
+      }
+      utils::Assert(summary_array.size() == sketchs.size(), "shape mismatch");
+    };
     if (summary_array.size() != 0) {
       size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
+#if __cplusplus >= 201103L
+      sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size(), lazy_get_summary);
+#else
       sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
+#endif
     }
     // now we get the final result of sketch, setup the cut
     this->wspace.cut.clear();
@@ -623,7 +648,8 @@ class QuantileHistMaker: public HistMaker<TStats> {
       summary_array[i].Reserve(max_size);
       summary_array[i].SetPrune(out, max_size);
     }
-    size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
+
+    size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
     sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
     // now we get the final result of sketch, setup the cut
     this->wspace.cut.clear();
diff --git a/src/tree/updater_refresh-inl.hpp b/src/tree/updater_refresh-inl.hpp
index 83a81615c..0285f0771 100644
--- a/src/tree/updater_refresh-inl.hpp
+++ b/src/tree/updater_refresh-inl.hpp
@@ -52,40 +52,50 @@ class TreeRefresher: public IUpdater {
       std::fill(stemp[tid].begin(), stemp[tid].end(), TStats(param));
       fvec_temp[tid].Init(trees[0]->param.num_feature);
     }
-    // start accumulating statistics
-    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
-                   "too large batch size ");
-      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nbatch; ++i) {
-        RowBatch::Inst inst = batch[i];
-        const int tid = omp_get_thread_num();
-        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        RegTree::FVec &feats = fvec_temp[tid];
-        feats.Fill(inst);
-        int offset = 0;
-        for (size_t j = 0; j < trees.size(); ++j) {
-          AddStats(*trees[j], feats, gpair, info, ridx,
-                   BeginPtr(stemp[tid]) + offset);
-          offset += trees[j]->param.num_nodes;
+    // if it is C++11, use lazy evaluation for Allreduce,
+    // to gain speedup in recovery
+#if __cplusplus >= 201103L
+    auto lazy_get_stats = [&]()
+#endif
+    {
+      // start accumulating statistics
+      utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const RowBatch &batch = iter->Value();
+        utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
+                     "too large batch size ");
+        const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint i = 0; i < nbatch; ++i) {
+          RowBatch::Inst inst = batch[i];
+          const int tid = omp_get_thread_num();
+          const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+          RegTree::FVec &feats = fvec_temp[tid];
+          feats.Fill(inst);
+          int offset = 0;
+          for (size_t j = 0; j < trees.size(); ++j) {
+            AddStats(*trees[j], feats, gpair, info, ridx,
+                     BeginPtr(stemp[tid]) + offset);
+            offset += trees[j]->param.num_nodes;
+          }
+          feats.Drop(inst);
         }
-        feats.Drop(inst);
       }
-    }
-    // aggregate the statistics
-    int num_nodes = static_cast<int>(stemp[0].size());
-    #pragma omp parallel for schedule(static)
-    for (int nid = 0; nid < num_nodes; ++nid) {
-      for (int tid = 1; tid < nthread; ++tid) {
-        stemp[0][nid].Add(stemp[tid][nid]);
+      // aggregate the statistics
+      int num_nodes = static_cast<int>(stemp[0].size());
+      #pragma omp parallel for schedule(static)
+      for (int nid = 0; nid < num_nodes; ++nid) {
+        for (int tid = 1; tid < nthread; ++tid) {
+          stemp[0][nid].Add(stemp[tid][nid]);
+        }
       }
-    }
-    // AllReduce, add statistics up
+    };
+#if __cplusplus >= 201103L
+    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
+#else
     reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size());
+#endif
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
    param.learning_rate = lr / trees.size();
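Note on the pattern above: all three call sites wrap the expensive local computation (histogram accumulation, sketch summaries, refresh statistics) in a C++11 lambda and pass it to Allreduce instead of running it eagerly. When the reducer can restore the round's result from a checkpoint (e.g. during failure recovery), the lambda is never invoked and the recomputation is skipped. Below is a minimal, self-contained sketch of that idea, not the project's actual reducer: MockReducer, its can_recover flag, and the std::function-based Allreduce signature are hypothetical stand-ins; only the capture-by-reference lambda mirrors the patch.

#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Hypothetical stand-in for the distributed reducer: if a checkpointed
// result can be restored, the local preparation step is skipped entirely;
// otherwise it runs exactly once before the reduction.
class MockReducer {
 public:
  explicit MockReducer(bool can_recover) : can_recover_(can_recover) {}
  void Allreduce(double *buf, size_t n,
                 std::function<void()> prepare = std::function<void()>()) {
    if (can_recover_) {
      // would restore buf[0..n) from a checkpoint; prepare() never runs
      return;
    }
    if (prepare) prepare();  // lazily fill buf only when actually needed
    // ... the real cross-worker reduction over buf[0..n) would happen here ...
    (void)n;
  }
 private:
  bool can_recover_;
};

int main() {
  std::vector<double> hist(16, 0.0);
  MockReducer red(/*can_recover=*/false);
  // mirrors lazy_get_hist in the patch: capture locals by reference and
  // fill the histogram buffer only on demand
  auto lazy_get_hist = [&]() {
    for (size_t i = 0; i < hist.size(); ++i) hist[i] += 1.0;
  };
  red.Allreduce(&hist[0], hist.size(), lazy_get_hist);
  std::cout << "hist[0] = " << hist[0] << std::endl;
  return 0;
}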
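This also explains why the patch moves summary_array.resize() and Reserve() ahead of the lambda in CQHistMaker: the buffer shape must already be known to Allreduce on the recovery path, where the lambda never executes. On pre-C++11 compilers, or when building with make no_cxx11=1 per the Makefile hunk, the #if __cplusplus >= 201103L guards fall back to the original eager Allreduce calls.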