diff --git a/.gitignore b/.gitignore
index ee5928043..789c6b7c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
*.slo
*.lo
*.o
-
+*.page
# Compiled Dynamic libraries
*.so
*.dylib
@@ -45,5 +45,13 @@ Debug
*save
*csv
.Rproj.user
+*.cpage.col
+*.cpage
xgboost
+xgboost.mpi
xgboost.mock
+train*
+rabit
+.Rbuildignore
+R-package.Rproj
+
diff --git a/CHANGES.md b/CHANGES.md
index 027a077c6..d834ce79d 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -20,3 +20,9 @@ xgboost-0.3
* Linear booster is now parallelized, using parallel coordinate descent.
* Add [Code Guide](src/README.md) for customizing objective function and evaluation
* Add R module
+
+in progress version
+=====
+* Distributed version
+* Feature importance visualization in R module, thanks to Michael Benesty
+* Predict leaf index
diff --git a/Makefile b/Makefile
index 3230661d4..a4bbe876f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
export CC = gcc
export CXX = g++
+export MPICXX = mpicxx
export LDFLAGS= -pthread -lm
-
-export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -pedantic
+export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC
ifeq ($(no_omp),1)
CFLAGS += -DDISABLE_OPENMP
@@ -10,56 +10,90 @@ else
CFLAGS += -fopenmp
endif
+# build with C++11 when cxx11=1 is passed to make
+ifeq ($(cxx11),1)
+  CFLAGS += -std=c++11
+endif
+
# specify tensor path
BIN = xgboost
-OBJ = updater.o gbm.o io.o
+MOCKBIN = xgboost.mock
+OBJ = updater.o gbm.o io.o main.o
+MPIBIN = xgboost.mpi
SLIB = wrapper/libxgboostwrapper.so
-.PHONY: clean all python Rpack
+.PHONY: clean all mpi python Rpack
-all: $(BIN) $(OBJ) $(SLIB)
+all: $(BIN) $(OBJ) $(SLIB) $(MOCKBIN)
+mpi: $(MPIBIN)
python: wrapper/libxgboostwrapper.so
# now the wrapper takes in two files. io and wrapper part
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
-updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h
-gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
+updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
+gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
-xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
+main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h
+xgboost.mpi: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit_mpi.a
+xgboost.mock: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit_mock.a
+xgboost: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit.a
+wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o subtree/rabit/lib/librabit.a
+
+# dependency on rabit
+subtree/rabit/lib/librabit.a: subtree/rabit/src/engine.cc
+ cd subtree/rabit;make lib/librabit.a; cd ../..
+subtree/rabit/lib/librabit_empty.a: subtree/rabit/src/engine_empty.cc
+ cd subtree/rabit;make lib/librabit_empty.a; cd ../..
+subtree/rabit/lib/librabit_mock.a: subtree/rabit/src/engine_mock.cc
+ cd subtree/rabit;make lib/librabit_mock.a; cd ../..
+subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc
+ cd subtree/rabit;make lib/librabit_mpi.a; cd ../..
$(BIN) :
- $(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
+ $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
+
+$(MOCKBIN) :
+ $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
$(SLIB) :
- $(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
+ $(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS)
$(OBJ) :
- $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+ $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
+
+$(MPIOBJ) :
+ $(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+
+$(MPIBIN) :
+ $(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
install:
cp -f -r $(BIN) $(INSTALL_PATH)
Rpack:
make clean
+ cd subtree/rabit;make clean;cd ..
rm -rf xgboost xgboost*.tar.gz
cp -r R-package xgboost
rm -rf xgboost/inst/examples/*.buffer
rm -rf xgboost/inst/examples/*.model
rm -rf xgboost/inst/examples/dump*
rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
+ rm -rf subtree/rabit/src/*.o
rm -rf xgboost/demo/*.model xgboost/demo/*.buffer xgboost/demo/*.txt
rm -rf xgboost/demo/runall.R
cp -r src xgboost/src/src
+ cp -r subtree xgboost/src/subtree
mkdir xgboost/src/wrapper
cp wrapper/xgboost_wrapper.h xgboost/src/wrapper
cp wrapper/xgboost_wrapper.cpp xgboost/src/wrapper
cp ./LICENSE xgboost
cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars
- cat R-package/src/Makevars.win|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars.win
+ cp xgboost/src/Makevars xgboost/src/Makevars.win
R CMD build xgboost
rm -rf xgboost
R CMD check --as-cran xgboost*.tar.gz
clean:
- $(RM) $(OBJ) $(BIN) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
+ $(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
+ cd subtree/rabit; make clean; cd ..
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index cc1c22087..63ed9581c 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -22,7 +22,7 @@ Depends:
Imports:
Matrix (>= 1.1-0),
methods,
- data.table (>= 1.9),
+ data.table (>= 1.9.4),
magrittr (>= 1.5),
stringr,
- DiagrammeR
\ No newline at end of file
+ DiagrammeR
diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index d29ad7a18..12225c966 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -1,4 +1,4 @@
-# Generated by roxygen2 (4.1.0): do not edit by hand
+# Generated by roxygen2 (4.0.1): do not edit by hand
export(getinfo)
export(setinfo)
diff --git a/R-package/R/getinfo.xgb.DMatrix.R b/R-package/R/getinfo.xgb.DMatrix.R
index ed61ba654..6e291fe62 100644
--- a/R-package/R/getinfo.xgb.DMatrix.R
+++ b/R-package/R/getinfo.xgb.DMatrix.R
@@ -32,10 +32,15 @@ setMethod("getinfo", signature = "xgb.DMatrix",
if (class(object) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
}
- if (name != "label" && name != "weight" && name != "base_margin") {
+ if (name != "label" && name != "weight" &&
+ name != "base_margin" && name != "nrow") {
stop(paste("xgb.getinfo: unknown info name", name))
}
- ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
+ if (name != "nrow"){
+ ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
+ } else {
+ ret <- xgb.numrow(object)
+ }
return(ret)
})
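A minimal usage sketch of the new `"nrow"` query (not part of the patch; assumes the package is built from this branch):

```r
# Query the number of rows through getinfo(); "nrow" is answered by
# xgb.numrow() rather than the C-level XGDMatrixGetInfo_R getter.
library(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
getinfo(dtrain, "nrow")  # number of rows in the training matrix
```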
diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R
index d57017b65..1e458e708 100644
--- a/R-package/R/predict.xgb.Booster.R
+++ b/R-package/R/predict.xgb.Booster.R
@@ -7,6 +7,8 @@ setClass("xgb.Booster")
#' @param object Object of class "xgb.Booster"
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
+#' @param missing Missing is only used when input is dense matrix, pick a float
+#' value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param outputmargin whether the prediction should be shown in the original
#' value of sum of functions, when outputmargin=TRUE, the prediction is
#' untransformed margin value. In logistic regression, outputmargin=T will
@@ -14,6 +16,7 @@ setClass("xgb.Booster")
#' @param ntreelimit limit number of trees used in prediction, this parameter is
#' only valid for gbtree, but not for gblinear. set it to be value bigger
#' than 0. It will use all trees by default.
+#' @param predleaf whether to predict leaf indices instead. If set to TRUE, the output will be a matrix object.
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
@@ -25,7 +28,8 @@ setClass("xgb.Booster")
#' @export
#'
setMethod("predict", signature = "xgb.Booster",
- definition = function(object, newdata, missing = NULL, outputmargin = FALSE, ntreelimit = NULL) {
+ definition = function(object, newdata, missing = NULL,
+ outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) {
if (class(newdata) != "xgb.DMatrix") {
if (is.null(missing)) {
newdata <- xgb.DMatrix(newdata)
@@ -40,7 +44,24 @@ setMethod("predict", signature = "xgb.Booster",
stop("predict: ntreelimit must be equal to or greater than 1")
}
}
- ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(outputmargin), as.integer(ntreelimit), PACKAGE = "xgboost")
+ option = 0
+ if (outputmargin) {
+ option <- option + 1
+ }
+ if (predleaf) {
+ option <- option + 2
+ }
+ ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(option),
+ as.integer(ntreelimit), PACKAGE = "xgboost")
+ if (predleaf){
+ len <- getinfo(newdata, "nrow")
+ if (length(ret) == len){
+ ret <- matrix(ret,ncol = 1)
+ } else {
+ ret <- matrix(ret, ncol = len)
+ ret <- t(ret)
+ }
+ }
return(ret)
})
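A short sketch (not part of the patch) of how the new option bitmask plays out from the user side: `outputmargin` contributes 1, `predleaf` contributes 2, and with `predleaf = TRUE` the result is reshaped into one row per observation and one column per tree. Assumes a build that includes this change:

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
# leaf indices instead of probabilities: an nrow x ntree matrix
leaves <- predict(bst, agaricus.train$data, predleaf = TRUE)
dim(leaves)
```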
diff --git a/R-package/R/slice.xgb.DMatrix.R b/R-package/R/slice.xgb.DMatrix.R
index 419170a66..b70a8ee92 100644
--- a/R-package/R/slice.xgb.DMatrix.R
+++ b/R-package/R/slice.xgb.DMatrix.R
@@ -28,6 +28,18 @@ setMethod("slice", signature = "xgb.DMatrix",
if (class(object) != "xgb.DMatrix") {
stop("slice: first argument dtrain must be xgb.DMatrix")
}
- ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
+ ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset,
+ PACKAGE = "xgboost")
+
+ attr_list <- attributes(object)
+ nr <- xgb.numrow(object)
+ len <- sapply(attr_list,length)
+ ind <- which(len==nr)
+ if (length(ind)>0) {
+ nms <- names(attr_list)[ind]
+ for (i in 1:length(ind)) {
+ attr(ret,nms[i]) <- attr(object,nms[i])[idxset]
+ }
+ }
return(structure(ret, class = "xgb.DMatrix"))
})
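A sketch of the new attribute propagation (the `row_id` attribute below is hypothetical, purely for illustration): any attribute whose length equals the number of rows is subset along with the rows.

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
# attach a hypothetical row-aligned attribute, e.g. an external observation id
attr(dtrain, "row_id") <- seq_len(getinfo(dtrain, "nrow"))
dsub <- slice(dtrain, 1:100)
attr(dsub, "row_id")  # the first 100 ids, carried through the slice
```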
diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 34ce003db..412132891 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -131,7 +131,7 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
}
# iteratively evaluate one iteration
-xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
+xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = FALSE) {
if (class(booster) != "xgb.Booster") {
stop("xgb.eval: first argument must be type xgb.Booster")
}
@@ -169,18 +169,27 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
} else {
msg <- ""
}
+ if (prediction){
+ preds <- predict(booster,watchlist[[2]])
+ return(list(msg,preds))
+ }
return(msg)
-}
+}
#------------------------------------------
# helper functions for cross validation
#
xgb.cv.mknfold <- function(dall, nfold, param) {
- randidx <- sample(1 : xgb.numrow(dall))
- kstep <- length(randidx) / nfold
- idset <- list()
- for (i in 1:nfold) {
- idset[[i]] <- randidx[ ((i-1) * kstep + 1) : min(i * kstep, length(randidx)) ]
+ if (nfold <= 1) {
+ stop("nfold must be bigger than 1")
}
+ randidx <- sample(1 : xgb.numrow(dall))
+ kstep <- length(randidx) %/% nfold
+ idset <- list()
+  for (i in 1:(nfold-1)) {
+    idset[[i]] <- randidx[1:kstep]
+    randidx <- setdiff(randidx, idset[[i]])
+  }
+  idset[[nfold]] <- randidx
ret <- list()
for (k in 1:nfold) {
dtest <- slice(dall, idset[[k]])
@@ -193,7 +202,7 @@ xgb.cv.mknfold <- function(dall, nfold, param) {
dtrain <- slice(dall, didx)
bst <- xgb.Booster(param, list(dtrain, dtest))
watchlist = list(train=dtrain, test=dtest)
- ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist)
+ ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=idset[[k]])
}
return (ret)
}
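The reworked fold construction uses integer division and hands the remainder to the last fold, so every row lands in exactly one fold (the old `/` arithmetic could produce fractional step sizes). A standalone sketch of the same logic, outside the package:

```r
# Mirror of the new xgb.cv.mknfold index split.
set.seed(1)
n <- 10; nfold <- 3
randidx <- sample(1:n)
kstep <- length(randidx) %/% nfold
idset <- list()
for (i in 1:(nfold - 1)) {
  idset[[i]] <- randidx[1:kstep]          # take the next kstep indices
  randidx <- setdiff(randidx, idset[[i]]) # drop them from the pool
}
idset[[nfold]] <- randidx                 # last fold absorbs the remainder
sapply(idset, length)                     # 3 3 4: all rows covered exactly once
```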
diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R
index b7a5a9897..8c3ea80bc 100644
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -6,7 +6,7 @@
#' indicating the data file.
#' @param info a list of information of the xgb.DMatrix object
#' @param missing Missing is only used when input is dense matrix, pick a float
-# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
+#' value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#
#' @param ... other information to pass to \code{info}.
#'
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index b071f08a7..ed088df52 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -31,6 +31,9 @@
#' @param nrounds the max number of iterations
#' @param nfold number of folds used
#' @param label option field, when data is Matrix
+#' @param missing Missing is only used when input is dense matrix, pick a float
+#' value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.
+#' @param prediction A logical value indicating whether to return the prediction vector.
#' @param showsd \code{boolean}, whether show standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in cross validation,
#' when it is not specified, the evaluation metric is chosen according to objective function.
@@ -47,8 +50,6 @@
#' @param feval customized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain,
-#' @param missing Missing is only used when input is dense matrix, pick a float
-# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param verbose \code{boolean}, print the statistics during the process.
#' @param ... other parameters to pass to \code{params}.
#'
@@ -71,7 +72,8 @@
#' @export
#'
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
- showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, verbose = T,...) {
+ prediction = FALSE, showsd = TRUE, metrics=list(),
+ obj = NULL, feval = NULL, verbose = T,...) {
if (typeof(params) != "list") {
stop("xgb.cv: first argument params must be list")
}
@@ -90,13 +92,20 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
}
folds <- xgb.cv.mknfold(dtrain, nfold, params)
+ predictValues <- rep(0,xgb.numrow(dtrain))
history <- c()
for (i in 1:nrounds) {
msg <- list()
for (k in 1:nfold) {
fd <- folds[[k]]
- succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
- msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
+ succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
+ if (!prediction){
+ msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
+ } else {
+ res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
+ predictValues[fd$index] <- res[[2]]
+ msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
+ }
}
ret <- xgb.cv.aggcv(msg, showsd)
history <- c(history, ret)
@@ -115,5 +124,14 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
split <- str_split(string = history, pattern = "\t")
for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)}
- dt
-}
\ No newline at end of file
+
+ if (prediction) {
+ return(list(dt = dt,pred = predictValues))
+ }
+ return(dt)
+}
+
+# Avoid error messages during CRAN check.
+# These variables are never declared; they are mainly
+# column names inferred by data.table.
+globalVariables(".")
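With `prediction = TRUE`, `xgb.cv` now returns both the evaluation history and the out-of-fold predictions, stitched together through the per-fold `index` saved by `xgb.cv.mknfold`. A hedged usage sketch, assuming a build that includes this change:

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
param <- list(max.depth = 2, eta = 1, silent = 1,
              objective = "binary:logistic")
res <- xgb.cv(param, dtrain, nrounds = 5, nfold = 5, prediction = TRUE)
res$dt            # per-round evaluation history as a data.table
length(res$pred)  # one out-of-fold prediction per training row
```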
diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R
index 3df8c9605..a0938ded1 100644
--- a/R-package/R/xgb.dump.R
+++ b/R-package/R/xgb.dump.R
@@ -29,7 +29,7 @@
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' # save the model in file 'xgb.model.dump'
-#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
+#' xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
#'
#' # print the model without saving it to a file
#' print(xgb.dump(bst))
@@ -37,11 +37,15 @@
#'
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
if (class(model) != "xgb.Booster") {
- stop("xgb.dump: first argument must be type xgb.Booster")
+ stop("model: argument must be type xgb.Booster")
}
- if (!class(fname) %in% c("character", "NULL")) {
- stop("xgb.dump: second argument must be type character when provided")
+ if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) {
+ stop("fname: argument must be type character (when provided)")
}
+  if (!(class(fmap) %in% c("character", "NULL") && length(fmap) <= 1)) {
+ stop("fmap: argument must be type character (when provided)")
+ }
+
result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost")
if(is.null(fname)) {
@@ -50,4 +54,9 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
result %>% str_split("\n") %>% unlist %>% Filter(function(x) x != "", .) %>% writeLines(fname)
return(TRUE)
}
-}
\ No newline at end of file
+}
+
+# Avoid error messages during CRAN check.
+# These variables are never declared; they are mainly
+# column names inferred by data.table.
+globalVariables(".")
\ No newline at end of file
diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 189ee03b4..094171382 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -9,6 +9,7 @@
#' @importFrom magrittr %>%
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
+#' @param model the model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
#'
#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
#'
@@ -31,41 +32,57 @@
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#'
-#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#' #Both dataset are list with two items, a sparse matrix and labels
+#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#' test <- agaricus.test
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
-#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
-#' xgb.importance(agaricus.test$data@@Dimnames[[2]], 'xgb.model.dump')
+#' xgb.importance(agaricus.test$data@@Dimnames[[2]], model = bst)
#'
#' @export
-xgb.importance <- function(feature_names = NULL, filename_dump = NULL){
+xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL){
if (!class(feature_names) %in% c("character", "NULL")) {
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
}
- if (class(filename_dump) != "character" || !file.exists(filename_dump)) {
+
+ if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
stop("filename_dump: Has to be a path to the model dump file.")
}
- text <- readLines(filename_dump)
+
+ if (!class(model) %in% c("xgb.Booster", "NULL")) {
+    stop("model: Has to be an object of class xgb.Booster, generated by the xgb.train function.")
+ }
+
+ if(is.null(model)){
+ text <- readLines(filename_dump)
+ } else {
+    text <- xgb.dump(model = model, with.stats = TRUE)
+ }
+
if(text[2] == "bias:"){
- result <- linearDump(feature_names, text)
+    result <- linearDump(feature_names, text = text)
} else {
- result <- treeDump(feature_names, text)
+ result <- treeDump(feature_names, text = text)
}
result
}
treeDump <- function(feature_names, text){
- result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",][,.(sum(Quality), sum(Cover), .N),by = Feature][,V1:=V1/sum(V1)][,V2:=V2/sum(V2)][,N:=N/sum(N)][order(-rank(V1))]
- setnames(result, c("Feature", "Gain", "Cover", "Frequence"))
+ result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = Feature][,`:=`(Gain=Gain/sum(Gain),Cover=Cover/sum(Cover),Frequence=Frequence/sum(Frequence))][order(-Gain)]
+
result
}
linearDump <- function(feature_names, text){
which(text == "weight:") %>% {a=.+1;text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .)
-}
\ No newline at end of file
+}
+
+# Avoid error messages during CRAN check.
+# These variables are never declared; they are mainly
+# column names inferred by data.table.
+globalVariables(".")
\ No newline at end of file
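A sketch of the new `model` path in `xgb.importance`, which dumps the booster in memory instead of requiring a dump file (assumes a build that includes this change):

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
# no xgb.dump() to disk needed: pass the booster directly
xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst)
```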
diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R
index 3e0723c61..930538c5b 100644
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -16,6 +16,8 @@
#' @importFrom stringr str_trim
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
+#' @param model the model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
+#' @param text the dump generated by the \code{xgb.dump} function. Avoids the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
#'
#' @return A \code{data.table} of the features used in the model with their gain, cover and few other thing.
@@ -40,38 +42,47 @@
#' @examples
#' data(agaricus.train, package='xgboost')
#'
-#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#' #Both dataset are list with two items, a sparse matrix and labels
+#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
-#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
+#' xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
-#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
+#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], filename_dump = 'xgb.model.dump')
#'
#' @export
-xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text = NULL, n_first_tree = NULL){
+xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){
if (!class(feature_names) %in% c("character", "NULL")) {
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
}
- if (!class(filename_dump) %in% c("character", "NULL")) {
- stop("filename_dump: Has to be a character vector representing the path to the model dump file.")
- } else if (class(filename_dump) == "character" && !file.exists(filename_dump)) {
+ if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
+ stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
+ } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
stop("filename_dump: path to the model doesn't exist.")
- } else if(is.null(filename_dump) & is.null(text)){
- stop("filename_dump: no path and no string version of the model dump have been provided.")
+ } else if(is.null(filename_dump) && is.null(model) && is.null(text)){
+    stop("filename_dump & model & text: no path to a model dump, no model, and no text dump have been provided.")
}
- if (!class(text) %in% c("character", "NULL")) {
+
+ if (!class(model) %in% c("xgb.Booster", "NULL")) {
+    stop("model: Has to be an object of class xgb.Booster, generated by the xgb.train function.")
+ }
+
+ if (!class(text) %in% c("character", "NULL")) {
stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.")
}
+
if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
stop("n_first_tree: Has to be a numeric vector of size 1.")
}
- if(is.null(text)){
+ if(!is.null(model)){
+    text <- xgb.dump(model = model, with.stats = TRUE)
+ } else if(!is.null(filename_dump)){
text <- readLines(filename_dump) %>% str_trim(side = "both")
}
@@ -89,6 +100,9 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text =
tree <- text[(position[i]+1):(position[i+1]-1)]
+    # skip trees that consist of a single leaf (no split)
+    if(length(tree) < 2) next
+
treeID <- i-1
notLeaf <- str_match(tree, "leaf") %>% is.na
@@ -119,7 +133,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text =
}
yes <- allTrees[!is.na(Yes),Yes]
-
+
set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
j = "Yes.Feature",
value = allTrees[ID == yes,Feature])
@@ -148,3 +162,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text =
allTrees
}
+
+# Avoid error messages during CRAN check.
+# These variables are never declared; they are mainly
+# column names inferred by data.table.
+globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence"))
\ No newline at end of file
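The same `model` shortcut applies to `xgb.model.dt.tree`; a sketch assuming a build with this change:

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
dt <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
head(dt)  # one row per node, with Quality (gain) and Cover statistics
```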
diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R
index 1a8a04e8a..662ccb21b 100644
--- a/R-package/R/xgb.plot.tree.R
+++ b/R-package/R/xgb.plot.tree.R
@@ -17,9 +17,12 @@
#' @importFrom stringr str_trim
#' @importFrom DiagrammeR DiagrammeR
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
-#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
+#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).
+#' @param model the model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
-#' @param style a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
+#' @param CSSstyle a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
+#' @param width the width of the diagram in pixels.
+#' @param height the height of the diagram in pixels.
#'
#' @return A \code{DiagrammeR} of the model.
#'
@@ -39,39 +42,53 @@
#' @examples
#' data(agaricus.train, package='xgboost')
#'
-#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#' #Both dataset are list with two items, a sparse matrix and labels
+#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
-#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
-#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
+#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
-xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL, styles = NULL){
+#'
+xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL){
- if (!class(styles) %in% c("character", "NULL") | length(styles) > 1) {
+ if (!(class(CSSstyle) %in% c("character", "NULL") && length(CSSstyle) <= 1)) {
stop("style: Has to be a character vector of size 1.")
}
+
+ if (!class(model) %in% c("xgb.Booster", "NULL")) {
+    stop("model: Has to be an object of class xgb.Booster, generated by the xgb.train function.")
+ }
- allTrees <- xgb.model.dt.tree(feature_names, filename_dump, n_first_tree)
+ if(is.null(model)){
+ allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree)
+ } else {
+ allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree)
+ }
allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
allTrees[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]
- if(is.null(styles)){
- styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
+ if(is.null(CSSstyle)){
+ CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
}
yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")
no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
- path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(styles, yes, no, sep = ";")
- DiagrammeR(path)
+ path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";")
+ DiagrammeR(path, width, height)
}
+
+# Avoid error messages during CRAN check.
+# These variables are never declared; they are mainly
+# column names inferred by data.table.
+globalVariables(c("Feature", "yesPath", "ID", "Cover", "Quality", "Split", "Yes", "Yes.Feature", "noPath", "No", "No.Feature", "."))
\ No newline at end of file
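A usage sketch for the renamed `CSSstyle` argument and the new `width`/`height` pass-through to `DiagrammeR` (not part of the patch; assumes DiagrammeR is installed):

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
# plot straight from the model object, with an explicit canvas size
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst,
              width = 750, height = 500)
```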
diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R
index 02a554f68..c72c4d5b0 100644
--- a/R-package/R/xgboost.R
+++ b/R-package/R/xgboost.R
@@ -5,7 +5,7 @@
#' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param label the response variable. User should not set this field,
-# if data is local data file or \code{xgb.DMatrix}.
+#' if data is local data file or \code{xgb.DMatrix}.
#' @param params the list of parameters. Commonly used ones are:
#' \itemize{
#' \item \code{objective} objective function, common ones are
@@ -24,8 +24,8 @@
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
#' performance and construction progress information
-#' @param missing Missing is only used when input is dense matrix, pick a float
-# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
+#' @param missing Missing is only used when input is dense matrix, pick a float
+#' value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
diff --git a/R-package/data/agaricus.test.rda b/R-package/data/agaricus.test.rda
index bffe6de21..ad8d50af7 100644
Binary files a/R-package/data/agaricus.test.rda and b/R-package/data/agaricus.test.rda differ
diff --git a/R-package/data/agaricus.train.rda b/R-package/data/agaricus.train.rda
index c471d0173..3f5f24144 100644
Binary files a/R-package/data/agaricus.train.rda and b/R-package/data/agaricus.train.rda differ
diff --git a/R-package/demo/00Index b/R-package/demo/00Index
index 345d7ca4f..969da0d91 100644
--- a/R-package/demo/00Index
+++ b/R-package/demo/00Index
@@ -4,4 +4,5 @@ boost_from_prediction Boosting from existing prediction
predict_first_ntree Predicting using first n trees
generalized_linear_model Generalized Linear Model
cross_validation Cross validation
-create_sparse_matrix
+create_sparse_matrix Create Sparse Matrix
+predict_leaf_indices Predicting the corresponding leaves
diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R
index 4060d1c48..ac96510a3 100644
--- a/R-package/demo/create_sparse_matrix.R
+++ b/R-package/demo/create_sparse_matrix.R
@@ -1,7 +1,7 @@
require(xgboost)
require(Matrix)
require(data.table)
-require(vcd) #Available in Cran. Used for its dataset with categorical values.
+if (!require(vcd)) { install.packages('vcd'); library(vcd) } # Available on CRAN; used for its dataset with categorical values.
# According to its documentation, Xgboost works only on numbers.
# Sometimes the dataset we have to work on have categorical data.
@@ -86,4 +86,4 @@ print(chisq.test(df$AgeCat, df$Y))
# As you can see, in general destroying information by simplifying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on existing ones which makes the link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not complex enough to show that. Check the Kaggle forum for some challenging datasets.
# However it's almost always worse when you add some arbitrary rules.
-# Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age. Linear model may not be that strong in these scenario.
\ No newline at end of file
+# Moreover, you can notice that even if we have added some not-so-useful new features highly correlated with other features, the boosting tree algorithm has still been able to choose the best one, which in this case is the Age. A linear model may not be that strong in this scenario.
diff --git a/R-package/demo/cross_validation.R b/R-package/demo/cross_validation.R
index c7e7ba537..47a0adea0 100644
--- a/R-package/demo/cross_validation.R
+++ b/R-package/demo/cross_validation.R
@@ -45,3 +45,7 @@ param <- list(max.depth=2,eta=1,silent=1)
xgb.cv(param, dtrain, nround, nfold = 5,
obj = logregobj, feval=evalerror)
+# do cross validation with prediction values for each fold
+res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE)
+res$dt
+length(res$pred)
diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R
new file mode 100644
index 000000000..480578c1d
--- /dev/null
+++ b/R-package/demo/predict_leaf_indices.R
@@ -0,0 +1,21 @@
+require(xgboost)
+# load in the agaricus dataset
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+
+param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
+watchlist <- list(eval = dtest, train = dtrain)
+nround = 5
+
+# train the model for nround rounds
+bst <- xgb.train(param, dtrain, nround, watchlist)
+cat('start testing prediction from first n trees\n')
+
+### predict leaf indices using the first 2 trees
+pred_with_leaf <- predict(bst, dtest, ntreelimit = 2, predleaf = TRUE)
+head(pred_with_leaf)
+# by default, we predict using all the trees
+pred_with_leaf <- predict(bst, dtest, predleaf = TRUE)
+head(pred_with_leaf)
diff --git a/R-package/man/agaricus.test.Rd b/R-package/man/agaricus.test.Rd
index 556425379..c050d3ecd 100644
--- a/R-package/man/agaricus.test.Rd
+++ b/R-package/man/agaricus.test.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgboost.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{data}
\name{agaricus.test}
\alias{agaricus.test}
diff --git a/R-package/man/agaricus.train.Rd b/R-package/man/agaricus.train.Rd
index 879b3d5df..02571cf54 100644
--- a/R-package/man/agaricus.train.Rd
+++ b/R-package/man/agaricus.train.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgboost.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{data}
\name{agaricus.train}
\alias{agaricus.train}
diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd
index 37e0ad0be..23e3adc84 100644
--- a/R-package/man/getinfo.Rd
+++ b/R-package/man/getinfo.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/getinfo.xgb.DMatrix.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{getinfo}
\alias{getinfo}
@@ -13,9 +12,9 @@ getinfo(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
-\item{...}{other parameters}
-
\item{name}{the name of the field to get}
+
+\item{...}{other parameters}
}
\description{
Get information of an xgb.DMatrix object
diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd
index afa0c70a5..204a8167f 100644
--- a/R-package/man/predict-xgb.Booster-method.Rd
+++ b/R-package/man/predict-xgb.Booster-method.Rd
@@ -1,12 +1,11 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/predict.xgb.Booster.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{predict,xgb.Booster-method}
\alias{predict,xgb.Booster-method}
\title{Predict method for eXtreme Gradient Boosting model}
\usage{
\S4method{predict}{xgb.Booster}(object, newdata, missing = NULL,
- outputmargin = FALSE, ntreelimit = NULL)
+ outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE)
}
\arguments{
\item{object}{Object of class "xgb.Booster"}
@@ -14,6 +13,9 @@
\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
+\item{missing}{Missing is only used when input is dense matrix, pick a float
+value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
+
\item{outputmargin}{whether the prediction should be shown in the original
value of sum of functions, when outputmargin=TRUE, the prediction is
untransformed margin value. In logistic regression, outputmargin=T will
@@ -22,6 +24,8 @@ output value before logistic transformation.}
\item{ntreelimit}{limit number of trees used in prediction, this parameter is
only valid for gbtree, but not for gblinear. set it to be value bigger
than 0. It will use all trees by default.}
+
+\item{predleaf}{whether to predict leaf indices instead. If set to TRUE, the output will be a matrix object.}
}
\description{
Predicted values based on xgboost model object.
diff --git a/R-package/man/setinfo.Rd b/R-package/man/setinfo.Rd
index 4ed262b46..7ea992110 100644
--- a/R-package/man/setinfo.Rd
+++ b/R-package/man/setinfo.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/setinfo.xgb.DMatrix.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{setinfo}
\alias{setinfo}
@@ -13,11 +12,11 @@ setinfo(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
-\item{...}{other parameters}
-
\item{name}{the name of the field to get}
\item{info}{the specific field of information to set}
+
+\item{...}{other parameters}
}
\description{
Set information of an xgb.DMatrix object
diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd
index a7812e886..a749aa8ff 100644
--- a/R-package/man/slice.Rd
+++ b/R-package/man/slice.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/slice.xgb.DMatrix.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{slice}
\alias{slice}
@@ -14,9 +13,9 @@ slice(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
-\item{...}{other parameters}
-
\item{idxset}{a integer vector of indices of rows needed}
+
+\item{...}{other parameters}
}
\description{
Get a new DMatrix containing the specified rows of
diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd
index 86000220f..31efde687 100644
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.DMatrix.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix}
\alias{xgb.DMatrix}
\title{Contruct xgb.DMatrix object}
@@ -12,7 +11,8 @@ indicating the data file.}
\item{info}{a list of information of the xgb.DMatrix object}
-\item{missing}{Missing is only used when input is dense matrix, pick a float}
+\item{missing}{Missing is only used when input is dense matrix, pick a float
+value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
\item{...}{other information to pass to \code{info}.}
}
diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd
index 6bbc277b3..803de912b 100644
--- a/R-package/man/xgb.DMatrix.save.Rd
+++ b/R-package/man/xgb.DMatrix.save.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.DMatrix.save.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix.save}
\alias{xgb.DMatrix.save}
\title{Save xgb.DMatrix object to binary file}
diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd
index 7ba5eb727..149ec392f 100644
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -1,12 +1,11 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.cv.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.cv}
\alias{xgb.cv}
\title{Cross Validation}
\usage{
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
- missing = NULL, showsd = TRUE, metrics = list(), obj = NULL,
- feval = NULL, verbose = T, ...)
+ missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
+ obj = NULL, feval = NULL, verbose = T, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:
@@ -32,7 +31,10 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
\item{label}{option field, when data is Matrix}
-\item{missing}{Missing is only used when input is dense matrix, pick a float}
+\item{missing}{Missing is only used when input is dense matrix, pick a float
+value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
+
+\item{prediction}{A logical value indicating whether to return the prediction vector.}
\item{showsd}{\code{boolean}, whether show standard deviation of cross validation}
diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd
index 473227357..d1968217b 100644
--- a/R-package/man/xgb.dump.Rd
+++ b/R-package/man/xgb.dump.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.dump.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.dump}
\alias{xgb.dump}
\title{Save xgboost model to text file}
@@ -37,7 +36,7 @@ test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
# save the model in file 'xgb.model.dump'
-xgb.dump(bst, 'xgb.model.dump', with.stats = T)
+xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
# print the model without saving it to a file
print(xgb.dump(bst))
diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd
index 78be4b91b..1588639b4 100644
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -1,15 +1,16 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.importance.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.importance}
\alias{xgb.importance}
\title{Show importance of features in a model}
\usage{
-xgb.importance(feature_names = NULL, filename_dump = NULL)
+xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}
+
+\item{model}{the model generated by the \code{xgb.train} function. Avoids the creation of a dump file.}
}
\value{
A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
@@ -36,16 +37,16 @@ There are 3 columns :
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
-#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#Both dataset are list with two items, a sparse matrix and labels
+#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
-xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
-xgb.importance(agaricus.test$data@Dimnames[[2]], 'xgb.model.dump')
+xgb.importance(agaricus.test$data@Dimnames[[2]], model = bst)
}
diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd
index 433b38c79..d2c5d94b6 100644
--- a/R-package/man/xgb.load.Rd
+++ b/R-package/man/xgb.load.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.load.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.load}
\alias{xgb.load}
\title{Load xgboost model from binary file}
diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd
index 2bc48c4d0..51c965970 100644
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -1,17 +1,20 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.model.dt.tree.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.model.dt.tree}
\alias{xgb.model.dt.tree}
\title{Convert tree model dump to data.table}
\usage{
-xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL, text = NULL,
- n_first_tree = NULL)
+xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL,
+ model = NULL, text = NULL, n_first_tree = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
+\item{model}{the model generated by the \code{xgb.train} function. Avoids the creation of a dump file.}
+
+\item{text}{the dump generated by the \code{xgb.dump} function. Avoids the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
+
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
}
\value{
@@ -40,15 +43,16 @@ The content of the \code{data.table} is organised that way:
\examples{
data(agaricus.train, package='xgboost')
-#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#Both dataset are list with two items, a sparse matrix and labels
+#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
-xgb.dump(bst, 'xgb.model.dump', with.stats = T)
+xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
-xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
+xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], filename_dump = 'xgb.model.dump')
}
diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd
index ba65cdd7c..dc95dfec0 100644
--- a/R-package/man/xgb.plot.tree.Rd
+++ b/R-package/man/xgb.plot.tree.Rd
@@ -1,20 +1,25 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.plot.tree.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.plot.tree}
\alias{xgb.plot.tree}
\title{Plot a boosted tree model}
\usage{
-xgb.plot.tree(feature_names = NULL, filename_dump = NULL,
- n_first_tree = NULL, styles = NULL)
+xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL,
+ n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
-\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
+\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).}
+
+\item{model}{the model generated by the \code{xgb.train} function. Avoids the creation of a dump file.}
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
-\item{style}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
+\item{CSSstyle}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
+
+\item{width}{the width of the diagram in pixels.}
+
+\item{height}{the height of the diagram in pixels.}
}
\value{
A \code{DiagrammeR} of the model.
@@ -38,15 +43,15 @@ It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpos
\examples{
data(agaricus.train, package='xgboost')
-#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#Both dataset are list with two items, a sparse matrix and labels
+#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
-xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
-xgb.plot.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
+xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)
}
diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd
index ded444446..0ccdf13da 100644
--- a/R-package/man/xgb.save.Rd
+++ b/R-package/man/xgb.save.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.save.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.save}
\alias{xgb.save}
\title{Save xgboost model to binary file}
diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd
index 58ef94135..a05e2eeb9 100644
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.train.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.train}
\alias{xgb.train}
\title{eXtreme Gradient Boosting Training}
diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd
index 21b1ad220..035eec9e7 100644
--- a/R-package/man/xgboost.Rd
+++ b/R-package/man/xgboost.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgboost.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgboost}
\alias{xgboost}
\title{eXtreme Gradient Boosting (Tree) library}
@@ -11,9 +10,8 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
-\item{label}{the response variable. User should not set this field,}
-
-\item{missing}{Missing is only used when input is dense matrix, pick a float}
+\item{label}{the response variable. User should not set this field,
+if data is local data file or \code{xgb.DMatrix}.}
\item{params}{the list of parameters. Commonly used ones are:
\itemize{
@@ -36,6 +34,9 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
information of performance. If 2, xgboost will print information of both
performance and construction progress information}
+\item{missing}{Missing is only used when input is dense matrix, pick a float
+value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
+
\item{...}{other parameters to pass to \code{params}.}
}
\description{
diff --git a/R-package/src/Makevars b/R-package/src/Makevars
index 44dce490e..cc933f099 100644
--- a/R-package/src/Makevars
+++ b/R-package/src/Makevars
@@ -1,9 +1,7 @@
# package root
PKGROOT=../../
# _*_ mode: Makefile; _*_
-PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
+PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
-OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
-
-
+OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index 289f1a15a..0f7bc06ec 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -1,7 +1,19 @@
# package root
-PKGROOT=../../
+PKGROOT=./
# _*_ mode: Makefile; _*_
-PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
+
+# This file is only used for windows compilation from github
+# It will be replaced by Makevars in CRAN version
+.PHONY: all xgblib
+all: $(SHLIB)
+$(SHLIB): xgblib
+xgblib:
+ cp -r ../../src .
+ cp -r ../../wrapper .
+ cp -r ../../subtree .
+
+PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT) -I../..
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
-OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
+OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o
+$(OBJECTS) : xgblib
diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp
index 9320547df..b4757542d 100644
--- a/R-package/src/xgboost_R.cpp
+++ b/R-package/src/xgboost_R.cpp
@@ -3,10 +3,12 @@
#include
#include
#include
-#include "xgboost_R.h"
+#include
#include "wrapper/xgboost_wrapper.h"
#include "src/utils/utils.h"
#include "src/utils/omp.h"
+#include "xgboost_R.h"
+
using namespace std;
using namespace xgboost;
@@ -246,12 +248,12 @@ extern "C" {
asInteger(iter),
BeginPtr(vec_dmats), BeginPtr(vec_sptr), len));
}
- SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit) {
+ SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) {
_WrapperBegin();
bst_ulong olen;
const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dmat),
- asInteger(output_margin),
+ asInteger(option_mask),
asInteger(ntree_limit),
&olen);
SEXP ret = PROTECT(allocVector(REALSXP, olen));
@@ -280,14 +282,13 @@ extern "C" {
asInteger(with_stats),
&olen);
SEXP out = PROTECT(allocVector(STRSXP, olen));
- char buffer [2000];
for (size_t i = 0; i < olen; ++i) {
- memset(buffer, 0, sizeof buffer);
-      sprintf (buffer, "booster[%u]:\n%s", static_cast<unsigned>(i), res[i]);
- SET_STRING_ELT(out, i, mkChar(buffer));
+ stringstream stream;
+ stream << "booster["<```
+ - mushroom-col-rabit.sh starts xgboost job using rabit's allreduce
+* run ```bash mushroom-col-rabit-mock.sh ```
+ - mushroom-col-rabit-mock.sh starts xgboost job using rabit's allreduce, inserts suicide signal at certain point and test recovery
+
+How to Use
+====
+* First split the data by column
+* In the config, specify the data file with a wildcard %d, where %d will be replaced by the rank of the node; each node then loads its own part of the data (see the sketch after the notes below)
+* Enable column split mode by ```dsplit=col```
+
+Notes
+====
+* The code is multi-threaded, so you will want to run one process per node
+* The code will work correctly as long as the union of the column subsets covers all the columns we are interested in.
+  - The column subsets can overlap with each other.
+* It uses exactly the same algorithm as the single-node version and examines all potential split points.
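The %d wildcard rule above can be made concrete with a minimal sketch (ours, not part of the patch). It assumes the process rank comes from rabit's `rabit::GetRank()`, which the tracker scripts used by these examples provide; the buffer size and file naming are illustrative.

```cpp
// Illustrative only: resolve the %d wildcard in the configured data path
// with this process's rank, so e.g. rank 2 opens "train.col2".
#include <cstdio>
#include <rabit.h>

int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);
  char fname[256];
  std::snprintf(fname, sizeof(fname), "train.col%d", rabit::GetRank());
  std::printf("rank %d loads %s\n", rabit::GetRank(), fname);
  rabit::Finalize();
  return 0;
}
```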
diff --git a/multi-node/col-split/mushroom-col-rabit-mock.sh b/multi-node/col-split/mushroom-col-rabit-mock.sh
new file mode 100755
index 000000000..b4208f04c
--- /dev/null
+++ b/multi-node/col-split/mushroom-col-rabit-mock.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: <nprocess>"
+    exit -1
+fi
+
+#
+# This script is the same as mushroom-col except that it uses xgboost instead of xgboost-mpi
+# xgboost uses the built-in TCP-based allreduce module, and can run in more environments, so long as we know how to start the job by modifying ../submit_job_tcp.py
+#
+rm -rf train.col* *.model
+k=$1
+
+# split the libsvm file into k subfiles
+python splitsvm.py ../../demo/data/agaricus.txt.train train $k
+
+# run xgboost with rabit's mock engine to simulate failures
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost.mock mushroom-col.conf dsplit=col mock=0,2,0,0 mock=1,2,0,0 mock=2,2,8,0 mock=2,3,0,0
+
+# the model can be directly loaded by single machine xgboost solver, as usual
+#../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
+
+
+#cat dump.nice.$k.txt
diff --git a/multi-node/col-split/mushroom-col-rabit.sh b/multi-node/col-split/mushroom-col-rabit.sh
new file mode 100755
index 000000000..77e0c904c
--- /dev/null
+++ b/multi-node/col-split/mushroom-col-rabit.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: <nprocess>"
+    exit -1
+fi
+
+#
+# This script is the same as mushroom-col except that it uses xgboost instead of xgboost-mpi
+# xgboost uses the built-in TCP-based allreduce module, and can run in more environments, so long as we know how to start the job by modifying ../submit_job_tcp.py
+#
+rm -rf train.col* *.model
+k=$1
+
+# split the libsvm file into k subfiles
+python splitsvm.py ../../demo/data/agaricus.txt.train train $k
+
+# run xgboost using the rabit tracker
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col
+
+# the model can be directly loaded by single machine xgboost solver, as usual
+../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
+
+# run for one round, and continue training
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col num_round=1
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col model_in=0001.model
+
+cat dump.nice.$k.txt
diff --git a/multi-node/col-split/mushroom-col.conf b/multi-node/col-split/mushroom-col.conf
new file mode 100644
index 000000000..2c779a44d
--- /dev/null
+++ b/multi-node/col-split/mushroom-col.conf
@@ -0,0 +1,35 @@
+# General Parameters, see comment for each definition
+# choose the booster, can be gbtree or gblinear
+booster = gbtree
+# choose logistic regression loss function for binary classification
+objective = binary:logistic
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0
+# minimum loss reduction required to make a further partition
+gamma = 1.0
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1
+# maximum depth of a tree
+max_depth = 3
+
+# Task Parameters
+# the number of rounds to do boosting
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0
+use_buffer = 0
+
+# The path of training data; %d is a wildcard replaced by the rank of the process
+# The idea is that each process takes a feature matrix with a subset of columns
+#
+data = "train.col%d"
+
+# The path of validation data, used to monitor the training process; here [test] sets the name of the validation set
+eval[test] = "../../demo/data/agaricus.txt.test"
+# evaluate on training data as well each round
+eval_train = 1
+
+# The path of test data; prediction needs the full test data, so try not to use it, or keep a subsampled version
+test:data = "../../demo/data/agaricus.txt.test"
diff --git a/multi-node/col-split/splitsvm.py b/multi-node/col-split/splitsvm.py
new file mode 100644
index 000000000..365aef610
--- /dev/null
+++ b/multi-node/col-split/splitsvm.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python
+import sys
+import random
+
+# split libsvm file into different subcolumns
+if len(sys.argv) < 4:
+    print ('Usage: <input> <output> k')
+    exit(0)
+
+random.seed(10)
+fmap = {}
+
+k = int(sys.argv[3])
+fi = open( sys.argv[1], 'r' )
+fos = []
+
+for i in range(k):
+ fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
+
+for l in open(sys.argv[1]):
+ arr = l.split()
+ for f in fos:
+ f.write(arr[0])
+ for it in arr[1:]:
+ fid = int(it.split(':')[0])
+ if fid not in fmap:
+ fmap[fid] = random.randint(0, k-1)
+ fos[fmap[fid]].write(' '+it)
+ for f in fos:
+ f.write('\n')
+for f in fos:
+ f.close()
diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
new file mode 100644
index 000000000..d1dde8ba3
--- /dev/null
+++ b/multi-node/hadoop/README.md
@@ -0,0 +1,43 @@
+Distributed XGBoost: Hadoop Version
+====
+* The scripts in this folder show an example of how to run distributed xgboost on the hadoop platform.
+* It relies on [Rabit Library](https://github.com/tqchen/rabit) (Reliable Allreduce and Broadcast Interface) and Hadoop Streaming. Rabit provides an interface to aggregate gradient values and split statistics, which allows xgboost to run reliably on hadoop. You do not need to care about how the model is updated in each iteration; just use the script ```rabit_hadoop.py```. For those who want to know how it exactly works, please refer to the main page of [Rabit](https://github.com/tqchen/rabit).
+* Quick start: run ```bash run_mushroom.sh <nworkers> <nthreads> <path_in_HDFS>```
+  - This is the hadoop version of the binary classification example in the demo folder.
+  - More information on the usage of xgboost can be found on the [wiki page](https://github.com/tqchen/xgboost/wiki)
+
+Before you run the script
+====
+* Make sure you have set up the hadoop environment.
+* If you want to only use single machine multi-threading, try single machine examples in the [demo folder](../../demo).
+* Build: run ```bash build.sh``` in the root folder; it will automatically download rabit and build xgboost.
+* Check whether the environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, please set up hadoop-streaming.jar path in rabit_hadoop.py.
+
+How to Use
+====
+* Input data format: LIBSVM format. The example here uses generated data in demo/data folder.
+* Put the training data in HDFS (hadoop distributed file system).
+* Use the script ```rabit_hadoop.py``` to submit a training task to hadoop and save the final model file.
+* Get the final model file from HDFS, then do prediction as well as visualization of the model locally.
+
+Single machine vs Hadoop version
+====
+If you have used xgboost (single machine version) before, this section will show you how to run xgboost on hadoop with a slight modification of the conf file.
+* The Hadoop version requires you to specify up front how many slave nodes/machines/workers you would like to use.
+* IO: instead of reading and writing files locally, the hadoop version uses "stdin" to read the training file and "stdout" to store the final model file. Therefore, you should change the parameters "data" and "model_out" in the conf file to ```data=stdin``` and ```model_out=stdout```.
+* File cache: ```rabit_hadoop.py``` also provides several ways to cache necessary files, including the binary file (xgboost), the conf file, small datasets used for evaluation during training, and so on.
+  - Any file used in the config file, excluding stdin, should be cached in the script. ```rabit_hadoop.py``` will automatically cache files in the command line. For example, ```rabit_hadoop.py -n 3 -i $hdfsPath/agaricus.txt.train -o $hdfsPath/mushroom.final.model $localPath/xgboost mushroom.hadoop.conf``` will cache "xgboost" and "mushroom.hadoop.conf".
+  - You could also use "-f" to manually cache one or more files, like ```-f file1 -f file2``` or ```-f file1#file2``` (use "#" to split file names).
+  - The local path of cached files in the command is "./".
+  - Since the cached files will be packaged and delivered to hadoop slave nodes, they should not be large. For instance, trying to cache files of GB size may reduce performance.
+* The hadoop version also supports evaluating each training round. You just need to set the parameter "eval_train".
+* More details of job submission can be found in the usage of ```rabit_hadoop.py```.
+* The model saved by the hadoop version is compatible with the single machine version.
+
+Notes
+====
+* The code has been tested on MapReduce 1 (MRv1) and YARN.
+  - We recommend running it on MapReduce 2 (MRv2, YARN) so that multi-threading can be enabled.
+* The code is optimized with multi-threading, so you will want to run one xgboost per node/worker for best performance.
+  - You will want to set <nthreads> to be the number of cores you have on each machine.
+  - You will need YARN to specify the number of cores for each worker.
diff --git a/multi-node/hadoop/mushroom.hadoop.conf b/multi-node/hadoop/mushroom.hadoop.conf
new file mode 100644
index 000000000..a4e885d54
--- /dev/null
+++ b/multi-node/hadoop/mushroom.hadoop.conf
@@ -0,0 +1,39 @@
+# General Parameters, see comment for each definition
+# choose the booster, can be gbtree or gblinear
+booster = gbtree
+# choose logistic regression loss function for binary classification
+objective = binary:logistic
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0
+# minimum loss reduction required to make a further partition
+gamma = 1.0
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1
+# maximum depth of a tree
+max_depth = 3
+
+# Task Parameters
+# the number of rounds to do boosting
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0
+# evaluate on training data as well each round
+# eval_train = 1
+# The path of validation data, used to monitor the training process; here [test] sets the name of the validation set
+# eval[test] = "agaricus.txt.test"
+
+# Please do not modify the following parameters
+# The path of training data
+data = stdin
+# The path of model file
+model_out = stdout
+# split pattern of xgboost
+dsplit = row
+
+# evaluate on training data as well each round
+eval_train = 1
diff --git a/multi-node/hadoop/run_mushroom.sh b/multi-node/hadoop/run_mushroom.sh
new file mode 100755
index 000000000..9cb73ec25
--- /dev/null
+++ b/multi-node/hadoop/run_mushroom.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+if [ "$#" -lt 3 ];
+then
+ echo "Usage: "
+ exit -1
+fi
+
+# put the local training file to HDFS
+hadoop fs -mkdir $3/data
+hadoop fs -put ../../demo/data/agaricus.txt.train $3/data
+
+../../subtree/rabit/tracker/rabit_hadoop.py -n $1 -nt $2 -i $3/data/agaricus.txt.train -o $3/mushroom.final.model ../../xgboost mushroom.hadoop.conf nthread=$2
+
+# get the final model file
+hadoop fs -get $3/mushroom.final.model/part-00000 ./final.model
+
+# output prediction task=pred
+../../xgboost mushroom.hadoop.conf task=pred model_in=final.model test:data=../../demo/data/agaricus.txt.test
+# print the boosters of final.model in dump.raw.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map in printing for better visualization
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt
diff --git a/multi-node/row-split/README.md b/multi-node/row-split/README.md
new file mode 100644
index 000000000..30e2528d3
--- /dev/null
+++ b/multi-node/row-split/README.md
@@ -0,0 +1,18 @@
+Distributed XGBoost: Row Split Version
+====
+* You might be interested in checking out the [Hadoop example](../hadoop)
+* Machine Rabit: run ```bash machine-row-rabit.sh <nprocess>```
+  - machine-row-rabit.sh starts an xgboost job using rabit
+
+How to Use
+====
+* First split the data by rows
+* In the config, specify the data file with a wildcard %d, where %d will be replaced by the rank of the node; each node then loads its own part of the data
+* Enable row split mode by ```dsplit=row```
+
+Notes
+====
+* The code is multi-threaded, so you will want to run one process per node
+* The row-based solver splits data by row; each node works on a subset of rows. It uses an approximate histogram counting algorithm,
+  and will only examine a subset of the potential split points, as opposed to all split points.
+
diff --git a/multi-node/row-split/machine-row-rabit-mock.sh b/multi-node/row-split/machine-row-rabit-mock.sh
new file mode 100755
index 000000000..ed1178dc9
--- /dev/null
+++ b/multi-node/row-split/machine-row-rabit-mock.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: <nprocess>"
+    exit -1
+fi
+
+rm -rf train-machine.row* *.model
+k=$1
+# make machine data
+cd ../../demo/regression/
+python mapfeat.py
+python mknfold.py machine.txt 1
+cd -
+
+# split the libsvm file into k subfiles
+python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
+
+# run xgboost with rabit's mock engine to simulate failures
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost.mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0 mock=0,0,3,0 mock=2,2,3,0
diff --git a/multi-node/row-split/machine-row-rabit.sh b/multi-node/row-split/machine-row-rabit.sh
new file mode 100755
index 000000000..fb3e3ba60
--- /dev/null
+++ b/multi-node/row-split/machine-row-rabit.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: <nprocess>"
+    exit -1
+fi
+
+rm -rf train-machine.row* *.model
+k=$1
+# make machine data
+cd ../../demo/regression/
+python mapfeat.py
+python mknfold.py machine.txt 1
+cd -
+
+# split the libsvm file into k subfiles
+python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
+
+# run xgboost using the rabit tracker
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=3 eval_train=1
+
+# run xgboost to save model 0001, then continue training from the existing model
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=1
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model
diff --git a/multi-node/row-split/machine-row.conf b/multi-node/row-split/machine-row.conf
new file mode 100644
index 000000000..c0cba3da8
--- /dev/null
+++ b/multi-node/row-split/machine-row.conf
@@ -0,0 +1,30 @@
+# General Parameters, see comment for each definition
+# choose the tree booster, can also change to gblinear
+booster = gbtree
+# this is the only difference with classification, use reg:linear to do linear regression
+# when labels are in [0,1] we can also use reg:logistic
+objective = reg:linear
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0
+# minimum loss reduction required to make a further partition
+gamma = 1.0
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1
+# maximum depth of a tree
+max_depth = 3
+# Task parameters
+# the number of rounds to do boosting
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0
+use_buffer = 0
+
+# The path of training data
+data = "train-machine.row%d"
+# The path of validation data, used to monitor the training process; here [test] sets the name of the validation set
+eval[test] = "../../demo/regression/machine.txt.test"
+# The path of test data
+test:data = "../../demo/regression/machine.txt.test"
+
diff --git a/multi-node/row-split/splitrows.py b/multi-node/row-split/splitrows.py
new file mode 100644
index 000000000..2e9d1184d
--- /dev/null
+++ b/multi-node/row-split/splitrows.py
@@ -0,0 +1,24 @@
+#!/usr/bin/python
+import sys
+import random
+
+# randomly split rows of a libsvm file into k subfiles
+if len(sys.argv) < 4:
+    print ('Usage: <input> <output> k')
+ exit(0)
+
+random.seed(10)
+
+k = int(sys.argv[3])
+fi = open( sys.argv[1], 'r' )
+fos = []
+
+for i in range(k):
+ fos.append(open( sys.argv[2]+'.row%d' % i, 'w' ))
+
+for l in open(sys.argv[1]):
+ i = random.randint(0, k-1)
+ fos[i].write(l)
+
+for f in fos:
+ f.close()
diff --git a/src/data.h b/src/data.h
index 2ea5f222a..162a31bfe 100644
--- a/src/data.h
+++ b/src/data.h
@@ -138,9 +138,10 @@ class IFMatrix {
  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) = 0;
/*!
* \brief check if column access is supported, if not, initialize column access
+ * \param enabled whether certain feature should be included in column access
* \param subsample subsample ratio when generating column access
*/
- virtual void InitColAccess(float subsample) = 0;
+  virtual void InitColAccess(const std::vector<bool> &enabled, float subsample) = 0;
// the following are column meta data, should be able to answer them fast
/*! \return whether column access is enabled */
virtual bool HaveColAccess(void) const = 0;
diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear-inl.hpp
index 473914b6e..de9ee6173 100644
--- a/src/gbm/gblinear-inl.hpp
+++ b/src/gbm/gblinear-inl.hpp
@@ -33,16 +33,17 @@ class GBLinear : public IGradBooster {
model.param.SetParam(name, val);
}
}
- virtual void LoadModel(utils::IStream &fi) {
+ virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) {
model.LoadModel(fi);
}
- virtual void SaveModel(utils::IStream &fo) const {
+ virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
model.SaveModel(fo);
}
virtual void InitModel(void) {
model.InitModel();
}
virtual void DoBoost(IFMatrix *p_fmat,
+ int64_t buffer_offset,
const BoosterInfo &info,
                       std::vector<bst_gpair> *in_gpair) {
    std::vector<bst_gpair> &gpair = *in_gpair;
@@ -135,8 +136,22 @@ class GBLinear : public IGradBooster {
}
}
}
-
-  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
+  virtual void Predict(const SparseBatch::Inst &inst,
+                       std::vector<float> *out_preds,
+                       unsigned ntree_limit,
+                       unsigned root_index) {
+ const int ngroup = model.param.num_output_group;
+ for (int gid = 0; gid < ngroup; ++gid) {
+ this->Pred(inst, BeginPtr(*out_preds));
+ }
+ }
+  virtual void PredictLeaf(IFMatrix *p_fmat,
+                           const BoosterInfo &info,
+                           std::vector<float> *out_preds,
+                           unsigned ntree_limit = 0) {
+ utils::Error("gblinear does not support predict leaf index");
+ }
+  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
std::stringstream fo("");
fo << "bias:\n";
for (int i = 0; i < model.param.num_output_group; ++i) {
@@ -151,8 +166,8 @@ class GBLinear : public IGradBooster {
    std::vector<std::string> v;
v.push_back(fo.str());
return v;
- }
-
+ }
+
protected:
inline void Pred(const RowBatch::Inst &inst, float *preds) {
for (int gid = 0; gid < model.param.num_output_group; ++gid) {
diff --git a/src/gbm/gbm.cpp b/src/gbm/gbm.cpp
index e280fdd4a..fe8d778e4 100644
--- a/src/gbm/gbm.cpp
+++ b/src/gbm/gbm.cpp
@@ -1,5 +1,6 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
 #include <cstring>
#include "./gbm.h"
#include "./gbtree-inl.hpp"
diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h
index 07dade4ac..f07d277ac 100644
--- a/src/gbm/gbm.h
+++ b/src/gbm/gbm.h
@@ -27,25 +27,44 @@ class IGradBooster {
/*!
* \brief load model from stream
* \param fi input stream
+ * \param with_pbuffer whether the incoming data contains pbuffer
*/
- virtual void LoadModel(utils::IStream &fi) = 0;
+ virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) = 0;
/*!
* \brief save model to stream
* \param fo output stream
+ * \param with_pbuffer whether save out pbuffer
*/
- virtual void SaveModel(utils::IStream &fo) const = 0;
+ virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const = 0;
/*!
* \brief initialize the model
*/
virtual void InitModel(void) = 0;
+ /*!
+ * \brief reset the predict buffer
+ * this will invalidate all the previous cached results
+ * and recalculate from scratch
+ */
+ virtual void ResetPredBuffer(size_t num_pbuffer) {}
+ /*!
+ * \brief whether the model allow lazy checkpoint
+ * return true if model is only updated in DoBoost
+ * after all Allreduce calls
+ */
+ virtual bool AllowLazyCheckPoint(void) const {
+ return false;
+ }
/*!
* \brief peform update to the model(boosting)
* \param p_fmat feature matrix that provide access to features
+ * \param buffer_offset buffer index offset of these instances, if equals -1
+ * this means we do not have buffer index allocated to the gbm
* \param info meta information about training
* \param in_gpair address of the gradient pair statistics of the data
* the booster may change content of gpair
*/
virtual void DoBoost(IFMatrix *p_fmat,
+ int64_t buffer_offset,
const BoosterInfo &info,
                       std::vector<bst_gpair> *in_gpair) = 0;
/*!
@@ -64,7 +83,36 @@ class IGradBooster {
int64_t buffer_offset,
const BoosterInfo &info,
                       std::vector<float> *out_preds,
- unsigned ntree_limit = 0) = 0;
+ unsigned ntree_limit = 0) = 0;
+ /*!
+   * \brief online prediction function, predict score for one instance at a time
+ * NOTE: use the batch prediction interface if possible, batch prediction is usually
+ * more efficient than online prediction
+ * This function is NOT threadsafe, make sure you only call from one thread
+ *
+ * \param inst the instance you want to predict
+ * \param out_preds output vector to hold the predictions
+ * \param ntree_limit limit the number of trees used in prediction
+ * \param root_index the root index
+ * \sa Predict
+ */
+  virtual void Predict(const SparseBatch::Inst &inst,
+                       std::vector<float> *out_preds,
+                       unsigned ntree_limit = 0,
+                       unsigned root_index = 0) = 0;
+ /*!
+ * \brief predict the leaf index of each tree, the output will be nsample * ntree vector
+ * this is only valid in gbtree predictor
+ * \param p_fmat feature matrix
+ * \param info extra side information that may be needed for prediction
+ * \param out_preds output vector to hold the predictions
+ * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
+ * we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
+ */
+  virtual void PredictLeaf(IFMatrix *p_fmat,
+                           const BoosterInfo &info,
+                           std::vector<float> *out_preds,
+                           unsigned ntree_limit = 0) = 0;
/*!
* \brief dump the model in text format
* \param fmap feature map that may help give interpretations of feature
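To see how the declarations above fit together, here is a hedged usage sketch; `DemoPredict` and its arguments are hypothetical helpers, and only the interfaces declared in this header are assumed.

```cpp
#include <vector>
#include "src/gbm/gbm.h"

// Hypothetical helper (not in the patch): drive the new prediction APIs
// on an already-constructed booster.
void DemoPredict(xgboost::gbm::IGradBooster *gbm,
                 const xgboost::SparseBatch::Inst &inst,
                 xgboost::IFMatrix *fmat,
                 const xgboost::BoosterInfo &info) {
  std::vector<float> preds;
  gbm->Predict(inst, &preds);                 // online single-instance path
  std::vector<float> leaf_index;
  gbm->PredictLeaf(fmat, info, &leaf_index);  // nsample * ntree leaf ids
}
```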
diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index ed52afa7d..66b03dd87 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -19,6 +19,8 @@ namespace gbm {
*/
class GBTree : public IGradBooster {
public:
+ GBTree(void) {
+ }
virtual ~GBTree(void) {
this->Clear();
}
@@ -37,7 +39,7 @@ class GBTree : public IGradBooster {
tparam.SetParam(name, val);
if (trees.size() == 0) mparam.SetParam(name, val);
}
- virtual void LoadModel(utils::IStream &fi) {
+ virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) {
this->Clear();
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
"GBTree: invalid model file");
@@ -51,7 +53,7 @@ class GBTree : public IGradBooster {
utils::Check(fi.Read(&tree_info[0], sizeof(int) * mparam.num_trees) != 0,
"GBTree: invalid model file");
}
- if (mparam.num_pbuffer != 0) {
+ if (mparam.num_pbuffer != 0 && with_pbuffer) {
pred_buffer.resize(mparam.PredBufferSize());
pred_counter.resize(mparam.PredBufferSize());
utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
@@ -60,7 +62,7 @@ class GBTree : public IGradBooster {
"GBTree: invalid model file");
}
}
- virtual void SaveModel(utils::IStream &fo) const {
+ virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
    utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree");
fo.Write(&mparam, sizeof(ModelParam));
for (size_t i = 0; i < trees.size(); ++i) {
@@ -69,7 +71,7 @@ class GBTree : public IGradBooster {
if (tree_info.size() != 0) {
fo.Write(&tree_info[0], sizeof(int) * tree_info.size());
}
- if (mparam.num_pbuffer != 0) {
+ if (mparam.num_pbuffer != 0 && with_pbuffer) {
fo.Write(&pred_buffer[0], pred_buffer.size() * sizeof(float));
fo.Write(&pred_counter[0], pred_counter.size() * sizeof(unsigned));
}
@@ -82,12 +84,23 @@ class GBTree : public IGradBooster {
utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
utils::Assert(trees.size() == 0, "GBTree: model already initialized");
}
+ virtual void ResetPredBuffer(size_t num_pbuffer) {
+    mparam.num_pbuffer = static_cast<int64_t>(num_pbuffer);
+ pred_buffer.clear(); pred_counter.clear();
+ pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
+ pred_counter.resize(mparam.PredBufferSize(), 0);
+ }
+ virtual bool AllowLazyCheckPoint(void) const {
+ return !(tparam.distcol_mode != 0 && mparam.num_output_group != 1);
+ }
virtual void DoBoost(IFMatrix *p_fmat,
+ int64_t buffer_offset,
const BoosterInfo &info,
                       std::vector<bst_gpair> *in_gpair) {
    const std::vector<bst_gpair> &gpair = *in_gpair;
-    if (mparam.num_output_group == 1) {
-      this->BoostNewTrees(gpair, p_fmat, info, 0);
+    std::vector<std::vector<tree::RegTree*> > new_trees;
+    if (mparam.num_output_group == 1) {
+      new_trees.push_back(BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0));
} else {
const int ngroup = mparam.num_output_group;
utils::Check(gpair.size() % ngroup == 0,
@@ -99,15 +112,18 @@ class GBTree : public IGradBooster {
for (bst_omp_uint i = 0; i < nsize; ++i) {
tmp[i] = gpair[i * ngroup + gid];
}
- this->BoostNewTrees(tmp, p_fmat, info, gid);
+ new_trees.push_back(BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid));
}
}
+ for (int gid = 0; gid < mparam.num_output_group; ++gid) {
+ this->CommitModel(new_trees[gid], gid);
+ }
}
virtual void Predict(IFMatrix *p_fmat,
int64_t buffer_offset,
const BoosterInfo &info,
                       std::vector<float> *out_preds,
- unsigned ntree_limit = 0) {
+ unsigned ntree_limit = 0) {
int nthread;
#pragma omp parallel
{
@@ -117,7 +133,6 @@ class GBTree : public IGradBooster {
for (int i = 0; i < nthread; ++i) {
thread_temp[i].Init(mparam.num_feature);
}
-
    std::vector<float> &preds = *out_preds;
const size_t stride = info.num_row * mparam.num_output_group;
preds.resize(stride * (mparam.size_leaf_vector+1));
@@ -144,6 +159,38 @@ class GBTree : public IGradBooster {
}
}
}
+ }
+  virtual void Predict(const SparseBatch::Inst &inst,
+                       std::vector<float> *out_preds,
+                       unsigned ntree_limit,
+                       unsigned root_index) {
+ if (thread_temp.size() == 0) {
+ thread_temp.resize(1, tree::RegTree::FVec());
+ thread_temp[0].Init(mparam.num_feature);
+ }
+ out_preds->resize(mparam.num_output_group * (mparam.size_leaf_vector+1));
+ // loop over output groups
+ for (int gid = 0; gid < mparam.num_output_group; ++gid) {
+ this->Pred(inst, -1, gid, root_index, &thread_temp[0],
+ &(*out_preds)[gid], mparam.num_output_group,
+ ntree_limit);
+ }
+ }
+  virtual void PredictLeaf(IFMatrix *p_fmat,
+                           const BoosterInfo &info,
+                           std::vector<float> *out_preds,
+                           unsigned ntree_limit) {
+ int nthread;
+ #pragma omp parallel
+ {
+ nthread = omp_get_num_threads();
+ }
+ thread_temp.resize(nthread, tree::RegTree::FVec());
+ for (int i = 0; i < nthread; ++i) {
+ thread_temp[i].Init(mparam.num_feature);
+ }
+ this->PredPath(p_fmat, info, out_preds, ntree_limit);
+
}
  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
    std::vector<std::string> dump;
@@ -184,13 +231,15 @@ class GBTree : public IGradBooster {
tparam.updater_initialized = 1;
}
// do group specific group
-  inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
-                            IFMatrix *p_fmat,
-                            const BoosterInfo &info,
-                            int bst_group) {
+  inline std::vector<tree::RegTree*>
+  BoostNewTrees(const std::vector<bst_gpair> &gpair,
+                IFMatrix *p_fmat,
+                int64_t buffer_offset,
+                const BoosterInfo &info,
+                int bst_group) {
+    std::vector<tree::RegTree*> new_trees;
this->InitUpdater();
// create the trees
-    std::vector<tree::RegTree*> new_trees;
for (int i = 0; i < tparam.num_parallel_tree; ++i) {
new_trees.push_back(new tree::RegTree());
for (size_t j = 0; j < cfg.size(); ++j) {
@@ -201,13 +250,52 @@ class GBTree : public IGradBooster {
// update the trees
for (size_t i = 0; i < updaters.size(); ++i) {
updaters[i]->Update(gpair, p_fmat, info, new_trees);
+ }
+ // optimization, update buffer, if possible
+ // this is only under distributed column mode
+ // for safety check of lazy checkpoint
+ if (
+ buffer_offset >= 0 &&
+ new_trees.size() == 1 && updaters.size() > 0 &&
+ updaters.back()->GetLeafPosition() != NULL) {
+ utils::Check(info.num_row == p_fmat->buffered_rowset().size(),
+ "distributed mode is not compatible with prob_buffer_row");
+ this->UpdateBufferByPosition(p_fmat,
+ buffer_offset, bst_group,
+ *new_trees[0],
+ updaters.back()->GetLeafPosition());
}
- // push back to model
+ return new_trees;
+ }
+ // commit new trees all at once
+  inline void CommitModel(const std::vector<tree::RegTree*> &new_trees, int bst_group) {
for (size_t i = 0; i < new_trees.size(); ++i) {
trees.push_back(new_trees[i]);
tree_info.push_back(bst_group);
}
- mparam.num_trees += tparam.num_parallel_tree;
+    mparam.num_trees += static_cast<int>(new_trees.size());
+ }
+ // update buffer by pre-cached position
+ inline void UpdateBufferByPosition(IFMatrix *p_fmat,
+ int64_t buffer_offset,
+ int bst_group,
+ const tree::RegTree &new_tree,
+ const int* leaf_position) {
+    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+ #pragma omp parallel for schedule(static)
+ for (bst_omp_uint i = 0; i < ndata; ++i) {
+ const bst_uint ridx = rowset[i];
+ const int64_t bid = mparam.BufferOffset(buffer_offset + ridx, bst_group);
+ const int tid = leaf_position[ridx];
+ utils::Assert(pred_counter[bid] == trees.size(), "cached buffer not up to date");
+ utils::Assert(tid >= 0, "invalid leaf position");
+ pred_buffer[bid] += new_tree[tid].leaf_value();
+      for (int j = 0; j < mparam.size_leaf_vector; ++j) {
+        pred_buffer[bid + j + 1] += new_tree.leafvec(tid)[j];
+ }
+ pred_counter[bid] += tparam.num_parallel_tree;
+ }
}
// make a prediction for a single instance
inline void Pred(const RowBatch::Inst &inst,
@@ -215,7 +303,8 @@ class GBTree : public IGradBooster {
int bst_group,
unsigned root_index,
tree::RegTree::FVec *p_feats,
- float *out_pred, size_t stride, unsigned ntree_limit) {
+ float *out_pred, size_t stride,
+ unsigned ntree_limit) {
size_t itop = 0;
float psum = 0.0f;
// sum of leaf vector
@@ -258,6 +347,39 @@ class GBTree : public IGradBooster {
out_pred[stride * (i + 1)] = vec_psum[i];
}
}
+ // predict independent leaf index
+  inline void PredPath(IFMatrix *p_fmat,
+                       const BoosterInfo &info,
+                       std::vector<float> *out_preds,
+                       unsigned ntree_limit) {
+ // number of valid trees
+ if (ntree_limit == 0 || ntree_limit > trees.size()) {
+      ntree_limit = static_cast<unsigned>(trees.size());
+ }
+    std::vector<float> &preds = *out_preds;
+ preds.resize(info.num_row * ntree_limit);
+ // start collecting the prediction
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+ iter->BeforeFirst();
+ while (iter->Next()) {
+ const RowBatch &batch = iter->Value();
+ // parallel over local batch
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+ #pragma omp parallel for schedule(static)
+ for (bst_omp_uint i = 0; i < nsize; ++i) {
+        const int tid = omp_get_thread_num();
+        int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
+        tree::RegTree::FVec &feats = thread_temp[tid];
+        feats.Fill(batch[i]);
+        for (unsigned j = 0; j < ntree_limit; ++j) {
+          int leaf_id = trees[j]->GetLeafIndex(feats, info.GetRoot(ridx));
+          preds[ridx * ntree_limit + j] = static_cast<float>(leaf_id);
+ }
+ feats.Drop(batch[i]);
+ }
+ }
+ }
+
// --- data structure ---
/*! \brief training parameters */
struct TrainParam {
@@ -270,6 +392,8 @@ class GBTree : public IGradBooster {
int num_parallel_tree;
/*! \brief whether updater is already initialized */
int updater_initialized;
+ /*! \brief distributed column mode */
+ int distcol_mode;
/*! \brief tree updater sequence */
std::string updater_seq;
// construction
@@ -278,6 +402,7 @@ class GBTree : public IGradBooster {
updater_seq = "grow_colmaker,prune";
num_parallel_tree = 1;
updater_initialized = 0;
+ distcol_mode = 0;
}
inline void SetParam(const char *name, const char *val){
using namespace std;
@@ -286,6 +411,9 @@ class GBTree : public IGradBooster {
updater_seq = val;
updater_initialized = 0;
}
+ if (!strcmp(name, "dsplit") && !strcmp(val, "col")) {
+ distcol_mode = 1;
+ }
if (!strcmp(name, "nthread")) {
omp_set_num_threads(nthread = atoi(val));
}
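Stepping back from the individual hunks: DoBoost now builds all trees first and commits them afterwards. The block below is our comment-level summary of that flow, not code from the patch, and it is what makes the AllowLazyCheckPoint contract hold.

```cpp
// Our reading of the new DoBoost control flow (illustrative only):
//   for each output group g:
//     new_trees[g] = BoostNewTrees(gpair_g, p_fmat, buffer_offset, info, g);
//     // all updater work, including any Allreduce traffic, happens here
//   for each output group g:
//     CommitModel(new_trees[g], g);  // trees enter the model only now
// Because the model mutates strictly after communication finishes,
// AllowLazyCheckPoint() lets rabit skip eagerly serializing the model.
```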
diff --git a/src/io/io.cpp b/src/io/io.cpp
index d251d7a96..0072618c6 100644
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -1,15 +1,32 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
 #include <string>
#include "./io.h"
#include "../utils/io.h"
#include "../utils/utils.h"
#include "simple_dmatrix-inl.hpp"
+#include "page_dmatrix-inl.hpp"
+#include "page_fmatrix-inl.hpp"
+
// implements data loads using dmatrix simple for now
namespace xgboost {
namespace io {
DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
+ if (!strcmp(fname, "stdin")) {
+ DMatrixSimple *dmat = new DMatrixSimple();
+ dmat->LoadText(fname, silent);
+ return dmat;
+ }
+ std::string tmp_fname;
+ const char *fname_ext = NULL;
+ if (strchr(fname, ';') != NULL) {
+ tmp_fname = fname;
+ char *ptr = strchr(&tmp_fname[0], ';');
+ ptr[0] = '\0'; fname = &tmp_fname[0];
+ fname_ext = ptr + 1;
+ }
int magic;
utils::FileStream fs(utils::FopenCheck(fname, "rb"));
utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
@@ -20,7 +37,27 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
dmat->LoadBinary(fs, silent, fname);
fs.Close();
return dmat;
- }
+ }
+ if (magic == DMatrixPage::kMagic) {
+ if (fname_ext == NULL) {
+ DMatrixPage *dmat = new DMatrixPage();
+ dmat->Load(fs, silent, fname);
+ return dmat;
+ } else {
+ DMatrixColPage *dmat = new DMatrixColPage(fname_ext);
+ dmat->Load(fs, silent, fname, true);
+ return dmat;
+ }
+ }
+ if (magic == DMatrixColPage::kMagic) {
+ std::string sfname = fname;
+ if (fname_ext == NULL) {
+ sfname += ".col"; fname_ext = sfname.c_str();
+ }
+ DMatrixColPage *dmat = new DMatrixColPage(fname_ext);
+ dmat->Load(fs, silent, fname);
+ return dmat;
+ }
fs.Close();
DMatrixSimple *dmat = new DMatrixSimple();
@@ -29,11 +66,21 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
}
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
+ if (!strcmp(fname + strlen(fname) - 5, ".page")) {
+ DMatrixPage::Save(fname, dmat, silent);
+ return;
+ }
+ if (!strcmp(fname + strlen(fname) - 6, ".cpage")) {
+ DMatrixColPage::Save(fname, dmat, silent);
+ return;
+ }
if (dmat.magic == DMatrixSimple::kMagic) {
    const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
p_dmat->SaveBinary(fname, silent);
} else {
- utils::Error("not implemented");
+ DMatrixSimple smat;
+ smat.CopyFrom(dmat);
+ smat.SaveBinary(fname, silent);
}
}
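The dispatch that LoadDataMatrix now performs can be summarized with a small usage sketch; the file names below are hypothetical, and the behaviors in the comments are read directly from the code above.

```cpp
#include "src/io/io.h"

// Hypothetical loader call (not in the patch). Dispatch as implemented above:
//   "stdin"              -> DMatrixSimple streamed from standard input
//   DMatrixPage magic    -> DMatrixPage (row pages on disk)
//   DMatrixColPage magic -> DMatrixColPage (uses a "<name>.col" page file)
//   "name;ext"           -> 'ext' names an external column page file
//   anything else        -> DMatrixSimple (text load with binary buffer)
int main(void) {
  xgboost::DataMatrix *dmat =
      xgboost::io::LoadDataMatrix("train.page;train.col", false, true);
  delete dmat;
  return 0;
}
```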
diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
new file mode 100644
index 000000000..4f70ff2e9
--- /dev/null
+++ b/src/io/page_dmatrix-inl.hpp
@@ -0,0 +1,278 @@
+#ifndef XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
+#define XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
+/*!
+ * \file page_row_iter-inl.hpp
+ * row iterator based on sparse page
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include "../data.h"
+#include "../utils/iterator.h"
+#include "../utils/thread_buffer.h"
+#include "./simple_fmatrix-inl.hpp"
+
+namespace xgboost {
+namespace io {
+/*! \brief page structure that can be used to store a rowbatch */
+struct RowBatchPage {
+ public:
+ explicit RowBatchPage(size_t page_size) : kPageSize(page_size) {
+ data_ = new int[kPageSize];
+ utils::Assert(data_ != NULL, "fail to allocate row batch page");
+ this->Clear();
+ }
+ ~RowBatchPage(void) {
+ if (data_ != NULL) delete [] data_;
+ }
+ /*!
+ * \brief Push one row into page
+ * \param row an instance row
+   * \return true if the row fits and was pushed, false if the page is full
+ */
+ inline bool PushRow(const RowBatch::Inst &row) {
+ const size_t dsize = row.length * sizeof(RowBatch::Entry);
+    if (FreeBytes() < dsize + sizeof(int)) return false;
+    row_ptr(Size() + 1) = row_ptr(Size()) + row.length;
+    memcpy(data_ptr(row_ptr(Size())), row.data, dsize);
+ ++data_[0];
+ return true;
+ }
+ /*!
+ * \brief get a row batch representation from the page
+ * \param p_rptr a temporal space that can be used to provide
+ * ind_ptr storage for RowBatch
+ * \return a new RowBatch object
+ */
+  inline RowBatch GetRowBatch(std::vector<size_t> *p_rptr, size_t base_rowid) {
+ RowBatch batch;
+ batch.base_rowid = base_rowid;
+ batch.data_ptr = this->data_ptr(0);
+    batch.size = static_cast<size_t>(this->Size());
+    std::vector<size_t> &rptr = *p_rptr;
+    rptr.resize(this->Size() + 1);
+    for (size_t i = 0; i < rptr.size(); ++i) {
+      rptr[i] = static_cast<size_t>(this->row_ptr(static_cast<int>(i)));
+ }
+ batch.ind_ptr = &rptr[0];
+ return batch;
+ }
+ /*! \brief get i-th row from the batch */
+ inline RowBatch::Inst operator[](int i) {
+    return RowBatch::Inst(data_ptr(0) + row_ptr(i),
+                          static_cast<bst_uint>(row_ptr(i+1) - row_ptr(i)));
+ }
+ /*!
+ * \brief clear the page, cleanup the content
+ */
+ inline void Clear(void) {
+ memset(&data_[0], 0, sizeof(int) * kPageSize);
+ }
+ /*!
+   * \brief load one page from input stream
+ * \return true if loading is successful
+ */
+ inline bool Load(utils::IStream &fi) {
+ return fi.Read(&data_[0], sizeof(int) * kPageSize) != 0;
+ }
+ /*! \brief save one page into outstream */
+ inline void Save(utils::IStream &fo) {
+ fo.Write(&data_[0], sizeof(int) * kPageSize);
+ }
+ /*! \return number of elements */
+ inline int Size(void) const {
+ return data_[0];
+ }
+
+ protected:
+  /*! \return number of free bytes left in the page */
+  inline size_t FreeBytes(void) {
+ return (kPageSize - (Size() + 2)) * sizeof(int) -
+ row_ptr(Size()) * sizeof(RowBatch::Entry);
+ }
+ /*! \brief equivalent row pointer at i */
+ inline int& row_ptr(int i) {
+ return data_[kPageSize - i - 1];
+ }
+ inline RowBatch::Entry* data_ptr(int i) {
+ return (RowBatch::Entry*)(&data_[1]) + i;
+ }
+ // content of data
+ int *data_;
+ // page size
+ const size_t kPageSize;
+};
+/*! \brief thread buffer iterator */
+class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
+ public:
+ ThreadRowPageIterator(void) {
+ itr.SetParam("buffer_size", "2");
+ page_ = NULL;
+ base_rowid_ = 0;
+ }
+ virtual ~ThreadRowPageIterator(void) {}
+ virtual void Init(void) {
+ }
+ virtual void BeforeFirst(void) {
+ itr.BeforeFirst();
+ base_rowid_ = 0;
+ }
+ virtual bool Next(void) {
+ if (!itr.Next(page_)) return false;
+ out_ = page_->GetRowBatch(&tmp_ptr_, base_rowid_);
+ base_rowid_ += out_.size;
+ return true;
+ }
+ virtual const RowBatch &Value(void) const {
+ return out_;
+ }
+ /*! \brief load and initialize the iterator with fi */
+ inline void Load(const utils::FileStream &fi) {
+ itr.get_factory().SetFile(fi);
+ itr.Init();
+ this->BeforeFirst();
+ }
+ /*!
+ * \brief save a row iterator to output stream, in row iterator format
+ */
+  inline static void Save(utils::IIterator<RowBatch> *iter,
+                          utils::IStream &fo) {
+ RowBatchPage page(kPageSize);
+ iter->BeforeFirst();
+ while (iter->Next()) {
+ const RowBatch &batch = iter->Value();
+ for (size_t i = 0; i < batch.size; ++i) {
+ if (!page.PushRow(batch[i])) {
+ page.Save(fo);
+ page.Clear();
+ utils::Check(page.PushRow(batch[i]), "row is too big");
+ }
+ }
+ }
+ if (page.Size() != 0) page.Save(fo);
+ }
+ /*! \brief page size 64 MB */
+ static const size_t kPageSize = 64 << 18;
+
+ private:
+ // base row id
+ size_t base_rowid_;
+ // temporal ptr
+  std::vector<size_t> tmp_ptr_;
+ // output data
+ RowBatch out_;
+ // page pointer type
+ typedef RowBatchPage* PagePtr;
+ // loader factory for page
+ struct Factory {
+ public:
+ size_t file_begin_;
+ utils::FileStream fi;
+ Factory(void) {}
+ inline void SetFile(const utils::FileStream &fi) {
+ this->fi = fi;
+ file_begin_ = this->fi.Tell();
+ }
+ inline bool Init(void) {
+ return true;
+ }
+ inline void SetParam(const char *name, const char *val) {}
+ inline bool LoadNext(PagePtr &val) {
+ return val->Load(fi);
+ }
+ inline PagePtr Create(void) {
+ PagePtr a = new RowBatchPage(kPageSize);
+ return a;
+ }
+ inline void FreeSpace(PagePtr &a) {
+ delete a;
+ }
+ inline void Destroy(void) {
+ fi.Close();
+ }
+ inline void BeforeFirst(void) {
+ fi.Seek(file_begin_);
+ }
+ };
+
+ protected:
+ PagePtr page_;
+  utils::ThreadBuffer<PagePtr, Factory> itr;
+};
+
+/*! \brief data matrix using page */
+template<int TKMagic>
+class DMatrixPageBase : public DataMatrix {
+ public:
+ DMatrixPageBase(void) : DataMatrix(kMagic) {
+ iter_ = new ThreadRowPageIterator();
+ }
+ // virtual destructor
+ virtual ~DMatrixPageBase(void) {
+ // do not delete row iterator, since it is owned by fmat
+ // to be cleaned up in a more clear way
+ }
+ /*! \brief load and initialize the iterator with fi */
+ inline void Load(utils::FileStream &fi,
+ bool silent = false,
+ const char *fname = NULL,
+ bool skip_magic_check = false) {
+ int tmagic;
+ utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
+ if (!skip_magic_check) {
+ utils::Check(tmagic == magic, "invalid format,magic number mismatch");
+ }
+ this->info.LoadBinary(fi);
+ iter_->Load(fi);
+ if (!silent) {
+ utils::Printf("DMatrixPage: %lux%lu matrix is loaded",
+ static_cast(info.num_row()),
+ static_cast(info.num_col()));
+ if (fname != NULL) {
+ utils::Printf(" from %s\n", fname);
+ } else {
+ utils::Printf("\n");
+ }
+ if (info.group_ptr.size() != 0) {
+ utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size() - 1);
+ }
+ }
+ }
+ /*! \brief save a DataMatrix as DMatrixPage*/
+ inline static void Save(const char* fname, const DataMatrix &mat, bool silent) {
+ utils::FileStream fs(utils::FopenCheck(fname, "wb"));
+ int magic = kMagic;
+ fs.Write(&magic, sizeof(magic));
+ mat.info.SaveBinary(fs);
+ ThreadRowPageIterator::Save(mat.fmat()->RowIterator(), fs);
+ fs.Close();
+ if (!silent) {
+ utils::Printf("DMatrixPage: %lux%lu is saved to %s\n",
+ static_cast(mat.info.num_row()),
+ static_cast(mat.info.num_col()), fname);
+ }
+ }
+ /*! \brief magic number used to identify DMatrix */
+ static const int kMagic = TKMagic;
+
+ protected:
+ /*! \brief row iterator */
+ ThreadRowPageIterator *iter_;
+};
+
+class DMatrixPage : public DMatrixPageBase<0xffffab02> {
+ public:
+ DMatrixPage(void) {
+ fmat_ = new FMatrixS(iter_);
+ }
+ virtual ~DMatrixPage(void) {
+ delete fmat_;
+ }
+ virtual IFMatrix *fmat(void) const {
+ return fmat_;
+ }
+ /*! \brief the real fmatrix */
+ IFMatrix *fmat_;
+};
+} // namespace io
+} // namespace xgboost
+#endif // XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
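The RowBatchPage above packs two regions into a single int buffer; the annotated layout below is our reading of PushRow, row_ptr, and FreeBytes, not text from the patch.

```cpp
// Layout of RowBatchPage::data_ (kPageSize ints), as we read the code:
//
//   data_[0]                number of rows currently stored in the page
//   data_[1 ..]             RowBatch::Entry payload, growing forward
//   data_[kPageSize-1 ..]   row offsets growing backward from the end:
//                           row_ptr(i) lives at data_[kPageSize - i - 1]
//
// PushRow succeeds while the forward payload plus one more backward
// offset still fit between the two regions, which is what FreeBytes()
// computes; a full page is flushed with Save() and reset with Clear().
```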
diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
new file mode 100644
index 000000000..44cb9abdc
--- /dev/null
+++ b/src/io/page_fmatrix-inl.hpp
@@ -0,0 +1,382 @@
+#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
+#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
+/*!
+ * \file page_fmatrix-inl.hpp
+ * sparse page manager for fmatrix
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include <algorithm>
+#include <string>
+#include "../data.h"
+#include "../utils/iterator.h"
+#include "../utils/io.h"
+#include "../utils/matrix_csr.h"
+#include "../utils/thread_buffer.h"
+namespace xgboost {
+namespace io {
+class CSCMatrixManager {
+ public:
+ /*! \brief in memory page */
+ struct Page {
+ public:
+ /*! \brief initialize the page */
+ explicit Page(size_t size) {
+ buffer.resize(size);
+ col_index.reserve(10);
+ col_data.reserve(10);
+ }
+ /*! \brief clear the page */
+ inline void Clear(void) {
+ num_entry = 0;
+ col_index.clear();
+ col_data.clear();
+ }
+ /*! \brief number of used entries */
+ size_t num_entry;
+ /*! \brief column index */
+    std::vector<bst_uint> col_index;
+ /*! \brief column data */
+    std::vector<ColBatch::Inst> col_data;
+ /*! \brief number of free entries */
+ inline size_t NumFreeEntry(void) const {
+ return buffer.size() - num_entry;
+ }
+ inline ColBatch::Entry* AllocEntry(size_t len) {
+ ColBatch::Entry *p_data = &buffer[0] + num_entry;
+ num_entry += len;
+ return p_data;
+ }
+ /*! \brief get underlying batch */
+ inline ColBatch GetBatch(void) const {
+ ColBatch batch;
+ batch.size = col_index.size();
+ batch.col_index = BeginPtr(col_index);
+ batch.col_data = BeginPtr(col_data);
+ return batch;
+ }
+
+ private:
+ /*! \brief buffer space, not to be changed since ready */
+    std::vector<ColBatch::Entry> buffer;
+ };
+ /*! \brief define type of page pointer */
+ typedef Page *PagePtr;
+ // constructor
+ CSCMatrixManager(void) {
+ fi_ = NULL;
+ }
+ /*! \brief get column pointer */
+  inline const std::vector<size_t> &col_ptr(void) const {
+ return col_ptr_;
+ }
+ inline void SetParam(const char *name, const char *val) {
+ }
+ inline PagePtr Create(void) {
+ return new Page(page_size_);
+ }
+ inline void FreeSpace(PagePtr &a) {
+ delete a;
+ }
+ inline void Destroy(void) {
+ }
+ inline void BeforeFirst(void) {
+ col_index_ = col_todo_;
+ read_top_ = 0;
+ }
+ inline bool LoadNext(PagePtr &val) {
+ val->Clear();
+ if (read_top_ >= col_index_.size()) return false;
+ while (read_top_ < col_index_.size()) {
+ if (!this->TryFill(col_index_[read_top_], val)) {
+ return true;
+ }
+ ++read_top_;
+ }
+ return true;
+ }
+ inline bool Init(void) {
+ this->BeforeFirst();
+ return true;
+ }
+ inline void Setup(utils::ISeekStream *fi, double page_ratio) {
+ fi_ = fi;
+    fi_->Read(&begin_meta_, sizeof(begin_meta_));
+    begin_data_ = static_cast<size_t>(fi->Tell());
+ fi_->Seek(begin_meta_);
+ fi_->Read(&col_ptr_);
+ size_t psmax = 0;
+ for (size_t i = 0; i < col_ptr_.size() - 1; ++i) {
+ psmax = std::max(psmax, col_ptr_[i+1] - col_ptr_[i]);
+ }
+ utils::Check(page_ratio >= 1.0f, "col_page_ratio must be at least 1");
+    page_size_ = std::max(static_cast<size_t>(psmax * page_ratio), psmax);
+ }
+  inline void SetColSet(const std::vector<bst_uint> &cset, bool setall) {
+    if (!setall) {
+      col_todo_.resize(0);
+      for (size_t i = 0; i < cset.size(); ++i) {
+        if (cset[i] < static_cast<bst_uint>(col_ptr_.size() - 1)) {
+ col_todo_.push_back(cset[i]);
+ }
+ }
+ std::sort(col_todo_.begin(), col_todo_.end());
+ } else {
+ col_todo_.resize(col_ptr_.size()-1);
+ for (size_t i = 0; i < col_todo_.size(); ++i) {
+      col_todo_[i] = static_cast<bst_uint>(i);
+ }
+ }
+ }
+
+ private:
+  /*! \brief fill a page with the content of column cidx */
+ inline bool TryFill(size_t cidx, Page *p_page) {
+ size_t len = col_ptr_[cidx+1] - col_ptr_[cidx];
+ if (p_page->NumFreeEntry() < len) return false;
+ ColBatch::Entry *p_data = p_page->AllocEntry(len);
+ fi_->Seek(col_ptr_[cidx] * sizeof(ColBatch::Entry) + begin_data_);
+ utils::Check(fi_->Read(p_data, sizeof(ColBatch::Entry) * len) != 0,
+ "invalid column buffer format");
+    p_page->col_data.push_back(ColBatch::Inst(p_data, static_cast<bst_uint>(len)));
+    p_page->col_index.push_back(static_cast<bst_uint>(cidx));
+ return true;
+ }
+ // the following are in memory auxiliary data structure
+ /*! \brief top of reader position */
+ size_t read_top_;
+ /*! \brief size of page */
+ size_t page_size_;
+ /*! \brief column index to be loaded */
+  std::vector<bst_uint> col_index_;
+  /*! \brief column indices to be visited after calling BeforeFirst */
+  std::vector<bst_uint> col_todo_;
+ // the following are input content
+ /*! \brief beginning position of data content */
+ size_t begin_data_;
+  /*! \brief beginning position of meta content */
+  size_t begin_meta_;
+ /*! \brief input stream */
+ utils::ISeekStream *fi_;
+ /*! \brief column pointer of CSC format */
+  std::vector<size_t> col_ptr_;
+};
+
+class ThreadColPageIterator : public utils::IIterator<ColBatch> {
+ public:
+ explicit ThreadColPageIterator(utils::ISeekStream *fi,
+ float page_ratio, bool silent) {
+ itr_.SetParam("buffer_size", "2");
+ itr_.get_factory().Setup(fi, page_ratio);
+ itr_.Init();
+ if (!silent) {
+ utils::Printf("ThreadColPageIterator: finish initialzing, %u columns\n",
+ static_cast(col_ptr().size() - 1));
+ }
+ }
+ virtual ~ThreadColPageIterator(void) {
+ }
+ virtual void BeforeFirst(void) {
+ itr_.BeforeFirst();
+ }
+ virtual bool Next(void) {
+ // page to be loaded
+ CSCMatrixManager::PagePtr page;
+ if (!itr_.Next(page)) return false;
+ out_ = page->GetBatch();
+ return true;
+ }
+ virtual const ColBatch &Value(void) const {
+ return out_;
+ }
+  inline const std::vector<size_t> &col_ptr(void) const {
+ return itr_.get_factory().col_ptr();
+ }
+  inline void SetColSet(const std::vector<bst_uint> &cset,
+                        bool setall = false) {
+ itr_.get_factory().SetColSet(cset, setall);
+ }
+
+ private:
+ // output data
+ ColBatch out_;
+ // internal iterator
+  utils::ThreadBuffer<CSCMatrixManager::PagePtr, CSCMatrixManager> itr_;
+};
+/*!
+ * \brief sparse matrix that support column access
+ */
+class FMatrixPage : public IFMatrix {
+ public:
+ /*! \brief constructor */
+  FMatrixPage(utils::IIterator<RowBatch> *iter, std::string fname_buffer)
+ : fname_cbuffer_(fname_buffer) {
+ this->row_iter_ = iter;
+ this->col_iter_ = NULL;
+ this->fi_ = NULL;
+ }
+ // destructor
+ virtual ~FMatrixPage(void) {
+ if (row_iter_ != NULL) delete row_iter_;
+ if (col_iter_ != NULL) delete col_iter_;
+ if (fi_ != NULL) {
+ fi_->Close(); delete fi_;
+ }
+ }
+ /*! \return whether column access is enabled */
+ virtual bool HaveColAccess(void) const {
+ return col_iter_ != NULL;
+ }
+  /*! \brief get number of columns */
+ virtual size_t NumCol(void) const {
+ utils::Check(this->HaveColAccess(), "NumCol:need column access");
+ return col_iter_->col_ptr().size() - 1;
+ }
+ /*! \brief get number of buffered rows */
+  virtual const std::vector<bst_uint> &buffered_rowset(void) const {
+ return buffered_rowset_;
+ }
+ /*! \brief get column size */
+ virtual size_t GetColSize(size_t cidx) const {
+    const std::vector<size_t> &col_ptr = col_iter_->col_ptr();
+ return col_ptr[cidx+1] - col_ptr[cidx];
+ }
+ /*! \brief get column density */
+ virtual float GetColDensity(size_t cidx) const {
+    const std::vector<size_t> &col_ptr = col_iter_->col_ptr();
+ size_t nmiss = buffered_rowset_.size() - (col_ptr[cidx+1] - col_ptr[cidx]);
+    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
+ }
+  virtual void InitColAccess(const std::vector<bool> &enabled, float pkeep = 1.0f) {
+ if (this->HaveColAccess()) return;
+ utils::Printf("start to initialize page col access\n");
+ if (this->LoadColData()) {
+ utils::Printf("loading previously saved col data\n");
+ return;
+ }
+ this->InitColData(pkeep, fname_cbuffer_.c_str(),
+ 1 << 30, 5);
+ utils::Check(this->LoadColData(), "fail to read in column data");
+ utils::Printf("finish initialize page col access\n");
+ }
+ /*!
+ * \brief get the row iterator associated with FMatrix
+ */
+  virtual utils::IIterator<RowBatch>* RowIterator(void) {
+ row_iter_->BeforeFirst();
+ return row_iter_;
+ }
+ /*!
+ * \brief get the column based iterator
+ */
+  virtual utils::IIterator<ColBatch>* ColIterator(void) {
+    std::vector<bst_uint> cset;
+ col_iter_->SetColSet(cset, true);
+ col_iter_->BeforeFirst();
+ return col_iter_;
+ }
+ /*!
+   * \brief column based iterator
+ */
+  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
+ col_iter_->SetColSet(fset, false);
+ col_iter_->BeforeFirst();
+ return col_iter_;
+ }
+
+ protected:
+ /*!
+ * \brief try load column data from file
+ */
+ inline bool LoadColData(void) {
+ FILE *fp = fopen64(fname_cbuffer_.c_str(), "rb");
+ if (fp == NULL) return false;
+ fi_ = new utils::FileStream(fp);
+    static_cast<utils::IStream*>(fi_)->Read(&buffered_rowset_);
+ col_iter_ = new ThreadColPageIterator(fi_, 2.0f, false);
+ return true;
+ }
+ /*!
+   * \brief initialize column data
+ * \param pkeep probability to keep a row
+ */
+ inline void InitColData(float pkeep, const char *fname,
+ size_t buffer_size, size_t col_step) {
+ buffered_rowset_.clear();
+ utils::FileStream fo(utils::FopenCheck(fname, "wb+"));
+ // use 64M buffer
+ utils::SparseCSRFileBuilder builder(&fo, buffer_size);
+ // start working
+ row_iter_->BeforeFirst();
+ while (row_iter_->Next()) {
+ const RowBatch &batch = row_iter_->Value();
+ for (size_t i = 0; i < batch.size; ++i) {
+ if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
+          buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
+ RowBatch::Inst inst = batch[i];
+ for (bst_uint j = 0; j < inst.length; ++j) {
+ builder.AddBudget(inst[j].index);
+ }
+ }
+ }
+ }
+ // write buffered rowset
+    static_cast<utils::IStream*>(&fo)->Write(buffered_rowset_);
+ builder.InitStorage();
+ row_iter_->BeforeFirst();
+ size_t ktop = 0;
+ while (row_iter_->Next()) {
+ const RowBatch &batch = row_iter_->Value();
+ for (size_t i = 0; i < batch.size; ++i) {
+ if (ktop < buffered_rowset_.size() &&
+ buffered_rowset_[ktop] == batch.base_rowid + i) {
+ ++ktop;
+ RowBatch::Inst inst = batch[i];
+ for (bst_uint j = 0; j < inst.length; ++j) {
+ builder.PushElem(inst[j].index,
+ ColBatch::Entry((bst_uint)(batch.base_rowid+i),
+ inst[j].fvalue));
+ }
+ if (ktop % 100000 == 0) {
+ utils::Printf("\r \r");
+ utils::Printf("InitCol: %lu rows ", static_cast(ktop));
+ }
+ }
+ }
+ }
+ builder.Finalize();
+ builder.SortRows(ColBatch::Entry::CmpValue, col_step);
+ fo.Close();
+ }
+
+ private:
+ // row iterator
+  utils::IIterator<RowBatch> *row_iter_;
+ // column iterator
+ ThreadColPageIterator *col_iter_;
+ // file pointer to data
+ utils::FileStream *fi_;
+ // file name of column buffer
+ std::string fname_cbuffer_;
+ /*! \brief list of row index that are buffered */
+  std::vector<bst_uint> buffered_rowset_;
+};
+
+class DMatrixColPage : public DMatrixPageBase<0xffffab03> {
+ public:
+ explicit DMatrixColPage(const char *fname) {
+ fmat_ = new FMatrixPage(iter_, fname);
+ }
+ virtual ~DMatrixColPage(void) {
+ delete fmat_;
+ }
+ virtual IFMatrix *fmat(void) const {
+ return fmat_;
+ }
+ /*! \brief the real fmatrix */
+ IFMatrix *fmat_;
+};
+
+} // namespace io
+} // namespace xgboost
+#endif // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
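InitColData above builds the column pages in two passes over the row iterator: AddBudget counts the entries per column, then PushElem scatters them into place. Below is a self-contained sketch of the same two-pass idea, with hypothetical names and plain std containers rather than the utils builders.

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Two-pass CSC construction mirroring AddBudget/PushElem (illustrative).
// 'entries' holds (column index, value) pairs in row order.
void BuildCSC(const std::vector<std::pair<size_t, float> > &entries,
              size_t ncol,
              std::vector<size_t> *col_ptr,
              std::vector<float> *col_data) {
  col_ptr->assign(ncol + 1, 0);
  for (size_t i = 0; i < entries.size(); ++i)  // pass 1: budget per column
    ++(*col_ptr)[entries[i].first + 1];
  for (size_t c = 0; c < ncol; ++c)            // prefix sum gives offsets
    (*col_ptr)[c + 1] += (*col_ptr)[c];
  col_data->resize(entries.size());
  std::vector<size_t> fill(*col_ptr);          // per-column write cursors
  for (size_t i = 0; i < entries.size(); ++i)  // pass 2: scatter values
    (*col_data)[fill[entries[i].first]++] = entries[i].second;
}
```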
diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp
index 374d621e9..a793c779f 100644
--- a/src/io/simple_dmatrix-inl.hpp
+++ b/src/io/simple_dmatrix-inl.hpp
@@ -44,8 +44,8 @@ class DMatrixSimple : public DataMatrix {
}
/*! \brief copy content data from source matrix */
inline void CopyFrom(const DataMatrix &src) {
- this->info = src.info;
this->Clear();
+ this->info = src.info;
    // clone data content from the source matrix
    utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
iter->BeforeFirst();
@@ -84,7 +84,12 @@ class DMatrixSimple : public DataMatrix {
inline void LoadText(const char* fname, bool silent = false) {
using namespace std;
this->Clear();
- FILE* file = utils::FopenCheck(fname, "r");
+ FILE* file;
+ if (!strcmp(fname, "stdin")) {
+ file = stdin;
+ } else {
+ file = utils::FopenCheck(fname, "r");
+ }
float label; bool init = true;
char tmp[1024];
    std::vector<RowBatch::Entry> feats;
@@ -112,7 +117,9 @@ class DMatrixSimple : public DataMatrix {
                      static_cast<unsigned long>(info.num_col()),
                      static_cast<unsigned long>(row_data_.size()), fname);
}
- fclose(file);
+ if (file != stdin) {
+ fclose(file);
+ }
// try to load in additional file
std::string name = fname;
std::string gname = name + ".group";
@@ -152,7 +159,7 @@ class DMatrixSimple : public DataMatrix {
inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) {
int tmagic;
utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
- utils::Check(tmagic == kMagic, "invalid format,magic number mismatch");
+ utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", fname == NULL ? "" : fname);
info.LoadBinary(fs);
FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_);
diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp
index 7c8631a29..08e25e28b 100644
--- a/src/io/simple_fmatrix-inl.hpp
+++ b/src/io/simple_fmatrix-inl.hpp
@@ -48,9 +48,10 @@ class FMatrixS : public IFMatrix{
size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
}
- virtual void InitColAccess(float pkeep = 1.0f) {
+  virtual void InitColAccess(const std::vector<bool> &enabled,
+ float pkeep = 1.0f) {
if (this->HaveColAccess()) return;
- this->InitColData(pkeep);
+ this->InitColData(pkeep, enabled);
}
/*!
* \brief get the row iterator associated with FMatrix
@@ -75,7 +76,11 @@ class FMatrixS : public IFMatrix{
* \brief colmun based iterator
*/
  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
- col_iter_.col_index_ = fset;
+ size_t ncol = this->NumCol();
+ col_iter_.col_index_.resize(0);
+ for (size_t i = 0; i < fset.size(); ++i) {
+ if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
+ }
col_iter_.SetBatch(col_ptr_, col_data_);
return &col_iter_;
}
@@ -141,7 +146,7 @@ class FMatrixS : public IFMatrix{
* \brief intialize column data
* \param pkeep probability to keep a row
*/
- inline void InitColData(float pkeep) {
+  inline void InitColData(float pkeep, const std::vector<bool> &enabled) {
buffered_rowset_.clear();
// note: this part of code is serial, todo, parallelize this transformer
    utils::SparseCSRMBuilder<Entry> builder(col_ptr_, col_data_);
@@ -150,12 +155,14 @@ class FMatrixS : public IFMatrix{
iter_->BeforeFirst();
while (iter_->Next()) {
const RowBatch &batch = iter_->Value();
- for (size_t i = 0; i < batch.size; ++i) {
+ for (size_t i = 0; i < batch.size; ++i) {
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
          buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
- builder.AddBudget(inst[j].index);
+            if (enabled[inst[j].index]) {
+ builder.AddBudget(inst[j].index);
+ }
}
}
}
@@ -172,9 +179,11 @@ class FMatrixS : public IFMatrix{
++ktop;
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
- builder.PushElem(inst[j].index,
- Entry((bst_uint)(batch.base_rowid+i),
- inst[j].fvalue));
+ if (enabled[inst[j].index]) {
+ builder.PushElem(inst[j].index,
+ Entry((bst_uint)(batch.base_rowid+i),
+ inst[j].fvalue));
+ }
}
}
}
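The new enabled mask is consulted symmetrically in both passes of InitColData, so a disabled feature neither reserves budget nor receives entries, and column statistics such as GetColDensity then reflect only enabled features. A hedged sketch of the underlying invariant, with hypothetical names:

    #include <cstddef>
    #include <vector>

    // The same predicate must gate both AddBudget and PushElem; filtering
    // only one pass would desynchronize the CSC offsets from the fill.
    struct Triple { unsigned row, findex; float value; };

    inline std::vector<Triple> FilterEnabled(const std::vector<Triple> &in,
                                             const std::vector<bool> &enabled) {
      std::vector<Triple> out;
      for (size_t i = 0; i < in.size(); ++i) {
        if (in[i].findex < enabled.size() && enabled[in[i].findex]) {
          out.push_back(in[i]);
        }
      }
      return out;
    }
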
diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp
index fb0b8953d..60a8da8f1 100644
--- a/src/learner/evaluation-inl.hpp
+++ b/src/learner/evaluation-inl.hpp
@@ -11,6 +11,7 @@
#include
#include
#include
+#include "../sync/sync.h"
#include "./evaluation.h"
#include "./helper_utils.h"
@@ -23,7 +24,8 @@ namespace learner {
template<typename Derived>
struct EvalEWiseBase : public IEvaluator {
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed) const {
utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Check(preds.size() % info.labels.size() == 0,
"label and prediction size not match");
@@ -37,7 +39,11 @@ struct EvalEWiseBase : public IEvaluator {
sum += Derived::EvalRow(info.labels[i], preds[i]) * wt;
wsum += wt;
}
- return Derived::GetFinal(sum, wsum);
+    float dat[2]; dat[0] = sum; dat[1] = wsum;
+    if (distributed) {
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+ }
+ return Derived::GetFinal(dat[0], dat[1]);
}
/*!
* \brief to be implemented by subclass,
@@ -113,7 +119,9 @@ struct EvalCTest: public IEvaluator {
return name_.c_str();
}
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed) const {
+ utils::Check(!distributed, "metric %s do not support distributed evaluation", name_.c_str());
utils::Check(preds.size() % info.labels.size() == 0,
"label and prediction size not match");
size_t ngroup = preds.size() / info.labels.size() - 1;
@@ -150,7 +158,9 @@ struct EvalAMS : public IEvaluator {
utils::Check(std::sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
}
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed) const {
+ utils::Check(!distributed, "metric AMS do not support distributed evaluation");
using namespace std;
    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
@@ -212,7 +222,9 @@ struct EvalPrecisionRatio : public IEvaluator{
}
}
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed) const {
+ utils::Check(!distributed, "metric %s do not support distributed evaluation", Name());
utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Assert(preds.size() % info.labels.size() == 0,
"label size predict size not match");
@@ -252,7 +264,8 @@ struct EvalPrecisionRatio : public IEvaluator{
/*! \brief Area under curve, for both classification and rank */
struct EvalAuc : public IEvaluator {
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed) const {
utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Check(preds.size() % info.labels.size() == 0,
"label size predict size not match");
@@ -299,8 +312,16 @@ struct EvalAuc : public IEvaluator {
sum_auc += sum_pospair / (sum_npos*sum_nneg);
}
}
- // return average AUC over list
-    return static_cast<float>(sum_auc) / ngroup;
+ if (distributed) {
+      float dat[2];
+      dat[0] = static_cast<float>(sum_auc);
+      dat[1] = static_cast<float>(ngroup);
+      // approximately estimate auc using mean
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+      return dat[0] / dat[1];
+    } else {
+      return static_cast<float>(sum_auc) / ngroup;
+ }
}
virtual const char *Name(void) const {
return "auc";
@@ -311,7 +332,8 @@ struct EvalAuc : public IEvaluator {
struct EvalRankList : public IEvaluator {
public:
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed) const {
utils::Check(preds.size() == info.labels.size(),
"label size predict size not match");
// quick consistency when group is not available
@@ -336,7 +358,16 @@ struct EvalRankList : public IEvaluator {
sum_metric += this->EvalMetric(rec);
}
}
-    return static_cast<float>(sum_metric) / ngroup;
+ if (distributed) {
+      float dat[2];
+      dat[0] = static_cast<float>(sum_metric);
+      dat[1] = static_cast<float>(ngroup);
+      // approximately estimate the metric using the mean
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+      return dat[0] / dat[1];
+    } else {
+      return static_cast<float>(sum_metric) / ngroup;
+ }
}
virtual const char *Name(void) const {
return name_.c_str();
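Every metric that supports distribution above follows the same shape: compute a local (numerator, denominator) pair, Allreduce it with a sum, and divide once at the end. For EvalEWiseBase this is exact; the AUC and ranking variants reduce per-group means, which only approximates the global list metric, as their comments note. A reduced sketch of the pattern, assuming only the rabit calls already used above:

    #include <rabit.h>

    // Each worker holds a partial numerator/denominator; a single Sum
    // Allreduce makes the global ratio available on every worker.
    inline float DistributedRatio(float local_sum, float local_weight) {
      float dat[2] = {local_sum, local_weight};
      rabit::Allreduce<rabit::op::Sum>(dat, 2);
      return dat[0] / dat[1];
    }
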
diff --git a/src/learner/evaluation.h b/src/learner/evaluation.h
index 33370e706..4d59e270a 100644
--- a/src/learner/evaluation.h
+++ b/src/learner/evaluation.h
@@ -19,9 +19,13 @@ struct IEvaluator{
* \brief evaluate a specific metric
* \param preds prediction
* \param info information, including label etc.
+   * \param distributed whether a call to Allreduce is needed to gather
+   *        the average statistics across all the nodes;
+   *        this is only supported by some metrics
*/
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const = 0;
+ const MetaInfo &info,
+ bool distributed = false) const = 0;
/*! \return name of metric */
virtual const char *Name(void) const = 0;
/*! \brief virtual destructor */
@@ -70,10 +74,11 @@ class EvalSet{
}
inline std::string Eval(const char *evname,
                          const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed = false) {
std::string result = "";
for (size_t i = 0; i < evals_.size(); ++i) {
- float res = evals_[i]->Eval(preds, info);
+ float res = evals_[i]->Eval(preds, info, distributed);
char tmp[1024];
utils::SPrintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
result += tmp;
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 88026975d..616cf03e9 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -10,6 +10,9 @@
#include
#include
#include
+#include "../sync/sync.h"
+#include "../utils/io.h"
+#include "../utils/base64.h"
#include "./objective.h"
#include "./evaluation.h"
#include "../gbm/gbm.h"
@@ -21,7 +24,7 @@ namespace learner {
 * \brief learner that performs gradient boosting on a specific objective function,
 *  handling both training and prediction
*/
-class BoostLearner {
+class BoostLearner : public rabit::ISerializable {
public:
BoostLearner(void) {
obj_ = NULL;
@@ -30,8 +33,13 @@ class BoostLearner {
name_gbm_ = "gbtree";
silent= 0;
prob_buffer_row = 1.0f;
+ distributed_mode = 0;
+ pred_buffer_size = 0;
+ seed_per_iteration = 0;
+ seed = 0;
+ save_base64 = 0;
}
- ~BoostLearner(void) {
+ virtual ~BoostLearner(void) {
if (obj_ != NULL) delete obj_;
if (gbm_ != NULL) delete gbm_;
}
@@ -44,11 +52,9 @@ class BoostLearner {
* \param mats array of pointers to matrix whose prediction result need to be cached
*/
inline void SetCacheData(const std::vector& mats) {
- // estimate feature bound
- unsigned num_feature = 0;
+ utils::Assert(cache_.size() == 0, "can only call cache data once");
// assign buffer index
size_t buffer_size = 0;
- utils::Assert(cache_.size() == 0, "can only call cache data once");
for (size_t i = 0; i < mats.size(); ++i) {
bool dupilicate = false;
for (size_t j = 0; j < i; ++j) {
@@ -59,19 +65,12 @@ class BoostLearner {
mats[i]->cache_learner_ptr_ = this;
cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->info.num_row()));
buffer_size += mats[i]->info.num_row();
-      num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->info.num_col()));
}
char str_temp[25];
- if (num_feature > mparam.num_feature) {
- utils::SPrintf(str_temp, sizeof(str_temp), "%u", num_feature);
- this->SetParam("bst:num_feature", str_temp);
- }
-    utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
-                   static_cast<unsigned long>(buffer_size));
+    utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
+                   static_cast<unsigned long>(buffer_size));
this->SetParam("num_pbuffer", str_temp);
- if (!silent) {
- utils::Printf("buffer_size=%ld\n", static_cast(buffer_size));
- }
+ this->pred_buffer_size = buffer_size;
}
/*!
* \brief set parameters from outside
@@ -86,9 +85,29 @@ class BoostLearner {
this->SetParam(n.c_str(), val);
}
if (!strcmp(name, "silent")) silent = atoi(val);
- if (!strcmp(name, "prob_buffer_row")) prob_buffer_row = static_cast(atof(val));
+ if (!strcmp(name, "dsplit")) {
+ if (!strcmp(val, "col")) {
+ this->SetParam("updater", "distcol");
+ distributed_mode = 1;
+ } else if (!strcmp(val, "row")) {
+ this->SetParam("updater", "grow_histmaker,prune");
+ distributed_mode = 2;
+ } else {
+ utils::Error("%s is invalid value for dsplit, should be row or col", val);
+ }
+ }
+ if (!strcmp(name, "prob_buffer_row")) {
+      prob_buffer_row = static_cast<float>(atof(val));
+ utils::Check(distributed_mode == 0,
+ "prob_buffer_row can only be used in single node mode so far");
+ this->SetParam("updater", "grow_colmaker,refresh,prune");
+ }
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
- if (!strcmp("seed", name)) random::Seed(atoi(val));
+ if (!strcmp("seed", name)) {
+ this->seed = seed; random::Seed(atoi(val));
+ }
+ if (!strcmp("seed_per_iter", name)) seed_per_iteration = atoi(val);
+ if (!strcmp("save_base64", name)) save_base64 = atoi(val);
if (!strcmp(name, "num_class")) this->SetParam("num_output_group", val);
if (!strcmp(name, "nthread")) {
omp_set_num_threads(atoi(val));
@@ -104,10 +123,29 @@ class BoostLearner {
cfg_.push_back(std::make_pair(std::string(name), std::string(val)));
}
}
+ // this is an internal function
+ // initialize the trainer, called at InitModel and LoadModel
+ inline void InitTrainer(bool calc_num_feature = true) {
+ if (calc_num_feature) {
+ // estimate feature bound
+ unsigned num_feature = 0;
+ for (size_t i = 0; i < cache_.size(); ++i) {
+ num_feature = std::max(num_feature,
+                               static_cast<unsigned>(cache_[i].mat_->info.num_col()));
+ }
+ // run allreduce on num_feature to find the maximum value
+      rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
+ if (num_feature > mparam.num_feature) mparam.num_feature = num_feature;
+ }
+ char str_temp[25];
+ utils::SPrintf(str_temp, sizeof(str_temp), "%d", mparam.num_feature);
+ this->SetParam("bst:num_feature", str_temp);
+ }
/*!
* \brief initialize the model
*/
inline void InitModel(void) {
+ this->InitTrainer();
// initialize model
this->InitObjGBM();
// reset the base score
@@ -118,8 +156,10 @@ class BoostLearner {
/*!
* \brief load model from stream
* \param fi input stream
+ * \param with_pbuffer whether to load with predict buffer
+ * \param calc_num_feature whether call InitTrainer with calc_num_feature
*/
- inline void LoadModel(utils::IStream &fi) {
+ inline void LoadModel(utils::IStream &fi, bool with_pbuffer = true, bool calc_num_feature = true) {
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
"BoostLearner: wrong model format");
utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
@@ -127,32 +167,90 @@ class BoostLearner {
// delete existing gbm if any
if (obj_ != NULL) delete obj_;
if (gbm_ != NULL) delete gbm_;
+ this->InitTrainer(calc_num_feature);
this->InitObjGBM();
- gbm_->LoadModel(fi);
+ gbm_->LoadModel(fi, with_pbuffer);
+ if (!with_pbuffer || distributed_mode == 2) {
+ gbm_->ResetPredBuffer(pred_buffer_size);
+ }
+ }
+ // rabit load model from rabit checkpoint
+ virtual void Load(rabit::IStream &fi) {
+ RabitStreamAdapter fs(fi);
+ // for row split, we should not keep pbuffer
+ this->LoadModel(fs, distributed_mode != 2, false);
+ }
+ // rabit save model to rabit checkpoint
+ virtual void Save(rabit::IStream &fo) const {
+ RabitStreamAdapter fs(fo);
+ // for row split, we should not keep pbuffer
+ this->SaveModel(fs, distributed_mode != 2);
}
/*!
* \brief load model from file
* \param fname file name
*/
inline void LoadModel(const char *fname) {
- utils::FileStream fi(utils::FopenCheck(fname, "rb"));
+ FILE *fp = utils::FopenCheck(fname, "rb");
+ std::string header; header.resize(4);
+ utils::FileStream fi(fp);
+ // check header for different binary encode
+ // can be base64 or binary
+ if (fi.Read(&header[0], 4) != 0) {
+ // base64 format
+ if (header == "bs64") {
+ utils::Base64InStream bsin(fp);
+ bsin.InitPosition();
+ this->LoadModel(bsin);
+ fclose(fp);
+ return;
+ }
+ if (header == "binf") {
+ this->LoadModel(fi);
+ fclose(fp);
+ return;
+ }
+ }
+ fi.Seek(0);
this->LoadModel(fi);
- fi.Close();
+ fclose(fp);
}
- inline void SaveModel(utils::IStream &fo) const {
+ inline void SaveModel(utils::IStream &fo, bool with_pbuffer = true) const {
fo.Write(&mparam, sizeof(ModelParam));
fo.Write(name_obj_);
fo.Write(name_gbm_);
- gbm_->SaveModel(fo);
+ gbm_->SaveModel(fo, with_pbuffer);
}
/*!
* \brief save model into file
* \param fname file name
*/
inline void SaveModel(const char *fname) const {
- utils::FileStream fo(utils::FopenCheck(fname, "wb"));
- this->SaveModel(fo);
- fo.Close();
+ FILE *fp;
+    bool use_stdout = false;
+#ifndef XGBOOST_STRICT_CXX98_
+ if (!strcmp(fname, "stdout")) {
+ fp = stdout;
+ use_stdout = true;
+ } else
+#endif
+ {
+ fp = utils::FopenCheck(fname, "wb");
+ }
+ utils::FileStream fo(fp);
+    if (save_base64 != 0 || use_stdout) {
+ fo.Write("bs64\t", 5);
+ utils::Base64OutStream bout(fp);
+ this->SaveModel(bout);
+ bout.Finish('\n');
+ } else {
+ fo.Write("binf", 4);
+ this->SaveModel(fo);
+ }
+ if (!use_stdout) {
+ fclose(fp);
+ }
}
/*!
* \brief check if data matrix is ready to be used by training,
@@ -160,7 +258,10 @@ class BoostLearner {
* \param p_train pointer to the matrix used by training
*/
inline void CheckInit(DMatrix *p_train) {
- p_train->fmat()->InitColAccess(prob_buffer_row);
+    int ncol = static_cast<int>(p_train->info.info.num_col);
+    std::vector<bool> enabled(ncol, true);
+ // initialize column access
+ p_train->fmat()->InitColAccess(enabled, prob_buffer_row);
}
/*!
* \brief update the model for one iteration
@@ -168,9 +269,18 @@ class BoostLearner {
* \param p_train pointer to the data matrix
*/
inline void UpdateOneIter(int iter, const DMatrix &train) {
+ if (seed_per_iteration || rabit::IsDistributed()) {
+ random::Seed(this->seed * kRandSeedMagic);
+ }
this->PredictRaw(train, &preds_);
obj_->GetGradient(preds_, train.info, iter, &gpair_);
- gbm_->DoBoost(train.fmat(), train.info.info, &gpair_);
+ gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
+ }
+ /*!
+ * \brief whether model allow lazy checkpoint
+ */
+ inline bool AllowLazyCheckPoint(void) const {
+ return gbm_->AllowLazyCheckPoint();
}
/*!
* \brief evaluate the model for specific iteration
@@ -189,7 +299,7 @@ class BoostLearner {
for (size_t i = 0; i < evals.size(); ++i) {
this->PredictRaw(*evals[i], &preds_);
obj_->EvalTransform(&preds_);
- res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info);
+ res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info, distributed_mode == 2);
}
return res;
}
@@ -217,10 +327,41 @@ class BoostLearner {
* predictor, when it equals 0, this means we are using all the trees
*/
inline void Predict(const DMatrix &data,
+ bool output_margin,
+                      std::vector<float> *out_preds,
+ unsigned ntree_limit = 0,
+ bool pred_leaf = false
+ ) const {
+ if (pred_leaf) {
+ gbm_->PredictLeaf(data.fmat(), data.info.info, out_preds, ntree_limit);
+ } else {
+ this->PredictRaw(data, out_preds, ntree_limit);
+ if (!output_margin) {
+ obj_->PredTransform(out_preds);
+ }
+ }
+ }
+ /*!
+   * \brief online prediction function, predicts the score for one instance at a time
+   * NOTE: use the batch prediction interface if possible, batch prediction is usually
+   *       more efficient than online prediction
+   * This function is NOT thread-safe, make sure you only call it from one thread
+ *
+ * \param inst the instance you want to predict
+ * \param output_margin whether to only predict margin value instead of transformed prediction
+ * \param out_preds output vector to hold the predictions
+ * \param ntree_limit limit the number of trees used in prediction
+ * \param root_index the root index
+ * \sa Predict
+ */
+ inline void Predict(const SparseBatch::Inst &inst,
bool output_margin,
                      std::vector<float> *out_preds,
unsigned ntree_limit = 0) const {
- this->PredictRaw(data, out_preds, ntree_limit);
+ gbm_->Predict(inst, out_preds, ntree_limit);
+ if (out_preds->size() == 1) {
+ (*out_preds)[0] += mparam.base_score;
+ }
if (!output_margin) {
obj_->PredTransform(out_preds);
}
@@ -240,6 +381,7 @@ class BoostLearner {
utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
obj_ = CreateObjFunction(name_obj_.c_str());
gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
+
for (size_t i = 0; i < cfg_.size(); ++i) {
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
@@ -287,7 +429,7 @@ class BoostLearner {
/* \brief number of class, if it is multi-class classification */
int num_class;
/*! \brief reserved field */
- int reserved[32];
+ int reserved[31];
/*! \brief constructor */
ModelParam(void) {
base_score = 0.5f;
@@ -308,14 +450,26 @@ class BoostLearner {
}
};
// data fields
+ // stored random seed
+ int seed;
+  // whether to seed the PRNG each iteration
+  // this is important when restarting from existing iterations
+  // off by default, but automatically enabled in distributed mode
+ int seed_per_iteration;
+ // save model in base64 encoding
+ int save_base64;
// silent during training
int silent;
+ // distributed learning mode, if any, 0:none, 1:col, 2:row
+ int distributed_mode;
+ // cached size of predict buffer
+ size_t pred_buffer_size;
  // probability of buffering a row for column access
float prob_buffer_row;
// evaluation set
EvalSet evaluator_;
// model parameter
- ModelParam mparam;
+ ModelParam mparam;
// gbm model that back everything
gbm::IGradBooster *gbm_;
// name of gbm model used for training
@@ -331,7 +485,9 @@ class BoostLearner {
// gradient pairs
  std::vector<bst_gpair> gpair_;
- private:
+ protected:
+ // magic number to transform random seed
+ const static int kRandSeedMagic = 127;
// cache entry object that helps handle feature caching
struct CacheEntry {
const DMatrix *mat_;
@@ -354,6 +510,23 @@ class BoostLearner {
// data structure field
/*! \brief the entries indicates that we have internal prediction cache */
std::vector cache_;
+
+ private:
+ // adapt rabit stream to utils stream
+ struct RabitStreamAdapter : public utils::IStream {
+ // rabit stream
+ rabit::IStream &fs;
+    // constructor
+ RabitStreamAdapter(rabit::IStream &fs) : fs(fs) {}
+ // destructor
+ virtual ~RabitStreamAdapter(void){}
+ virtual size_t Read(void *ptr, size_t size) {
+ return fs.Read(ptr, size);
+ }
+ virtual void Write(const void *ptr, size_t size) {
+ fs.Write(ptr, size);
+ }
+ };
};
} // namespace learner
} // namespace xgboost
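The save path above now writes a 4-byte magic in front of every model ("binf" for raw binary, "bs64" for base64; base64 is forced for stdout so the stream stays printable), and the load path sniffs those bytes, rewinding for legacy headerless files. A compact sketch of the sniff-then-rewind idea, with hypothetical names rather than the xgboost classes:

    #include <cstdio>
    #include <cstring>

    enum ModelEncoding { kBinary, kBase64, kLegacy };

    // Peek at the 4-byte magic; rewind when the file predates the header.
    inline ModelEncoding SniffModel(FILE *fp) {
      char magic[4];
      if (std::fread(magic, 1, 4, fp) == 4) {
        if (std::memcmp(magic, "bs64", 4) == 0) return kBase64;
        if (std::memcmp(magic, "binf", 4) == 0) return kBinary;
      }
      std::rewind(fp);  // legacy model: no header, parse from byte 0
      return kLegacy;
    }
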
diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp
index 96aacf12d..9887e7a05 100644
--- a/src/learner/objective-inl.hpp
+++ b/src/learner/objective-inl.hpp
@@ -41,6 +41,25 @@ struct LossType {
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
+ /*!
+ * \brief check if label range is valid
+ */
+ inline bool CheckLabel(float x) const {
+ if (loss_type != kLinearSquare) {
+ return x >= 0.0f && x <= 1.0f;
+ }
+ return true;
+ }
+ /*!
+ * \brief error message displayed when check label fail
+ */
+ inline const char * CheckLabelErrorMsg(void) const {
+ if (loss_type != kLinearSquare) {
+ return "label must be in [0,1] for logistic regression";
+ } else {
+ return "";
+ }
+ }
/*!
* \brief calculate first order gradient of loss, given transformed prediction
* \param predt transformed prediction
@@ -115,6 +134,8 @@ class RegLossObj : public IObjFunction{
"labels are not correctly provided");
    std::vector<bst_gpair> &gpair = *out_gpair;
gpair.resize(preds.size());
+ // check if label in range
+ bool label_correct = true;
// start calculating gradient
    const unsigned nstep = static_cast<unsigned>(info.labels.size());
    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
@@ -124,9 +145,11 @@ class RegLossObj : public IObjFunction{
float p = loss.PredTransform(preds[i]);
float w = info.GetWeight(j);
if (info.labels[j] == 1.0f) w *= scale_pos_weight;
+ if (!loss.CheckLabel(info.labels[j])) label_correct = false;
gpair[i] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
loss.SecondOrderGradient(p, info.labels[j]) * w);
}
+ utils::Check(label_correct, loss.CheckLabelErrorMsg());
}
virtual const char* DefaultEvalMetric(void) const {
return loss.DefaultEvalMetric();
@@ -183,7 +206,8 @@ class SoftmaxMultiClassObj : public IObjFunction {
Softmax(&rec);
const unsigned j = i % nstep;
      int label = static_cast<int>(info.labels[j]);
- utils::Check(label < nclass, "SoftmaxMultiClassObj: label exceed num_class");
+ utils::Check(label >= 0 && label < nclass,
+ "SoftmaxMultiClassObj: label must be in [0, num_class)");
const float wt = info.GetWeight(j);
for (int k = 0; k < nclass; ++k) {
float p = rec[k];
@@ -325,9 +349,9 @@ class LambdaRankObj : public IObjFunction {
float h = loss.SecondOrderGradient(p, 1.0f);
// accumulate gradient and hessian in both pid, and nid
gpair[pos.rindex].grad += g * w;
- gpair[pos.rindex].hess += 2.0f * h;
+ gpair[pos.rindex].hess += 2.0f * w * h;
gpair[neg.rindex].grad -= g * w;
- gpair[neg.rindex].hess += 2.0f * h;
+ gpair[neg.rindex].hess += 2.0f * w * h;
}
}
}
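Note the shape of the new label check in RegLossObj above: aborting (utils::Check) inside an omp parallel for would be unsafe, so the loop only lowers a flag and the single Check fires after the parallel region. A reduced, self-contained sketch of that pattern:

    #include <cassert>
    #include <vector>

    // Validate inside a parallel loop by accumulating a flag, then fail
    // once outside the region. All racing writes store the same value
    // (false), the same benign pattern the code above relies on.
    inline void CheckLabels01(const std::vector<float> &labels) {
      bool ok = true;
      #pragma omp parallel for schedule(static)
      for (long i = 0; i < static_cast<long>(labels.size()); ++i) {
        if (labels[i] < 0.0f || labels[i] > 1.0f) ok = false;
      }
      assert(ok && "label must be in [0,1] for logistic regression");
    }
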
diff --git a/src/sync/sync.h b/src/sync/sync.h
new file mode 100644
index 000000000..aec5e2abd
--- /dev/null
+++ b/src/sync/sync.h
@@ -0,0 +1,12 @@
+#ifndef XGBOOST_SYNC_H_
+#define XGBOOST_SYNC_H_
+/*!
+ * \file sync.h
+ * \brief the synchronization module of rabit
+ * redirects to subtree rabit header
+ * \author Tianqi Chen
+ */
+#include "../../subtree/rabit/include/rabit.h"
+#endif // XGBOOST_SYNC_H_
+
+
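For readers new to rabit, the header re-exported here is the whole synchronization surface this patch uses: Init/Finalize, rank queries, Allreduce, Broadcast, and checkpointing. A minimal, hedged usage sketch built only from the standard rabit entry points:

    #include <cstdio>
    #include <rabit.h>

    int main(int argc, char *argv[]) {
      rabit::Init(argc, argv);                  // join the job set up by the tracker
      float x = static_cast<float>(rabit::GetRank()) + 1.0f;
      rabit::Allreduce<rabit::op::Sum>(&x, 1);  // every node now holds the global sum
      if (rabit::GetRank() == 0) {
        std::printf("sum over %d nodes = %f\n", rabit::GetWorldSize(), x);
      }
      rabit::Finalize();
      return 0;
    }
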
diff --git a/src/tree/model.h b/src/tree/model.h
index aa9ad2794..f3575488a 100644
--- a/src/tree/model.h
+++ b/src/tree/model.h
@@ -68,8 +68,9 @@ class TreeModel {
}
};
/*! \brief tree node */
- class Node{
+ class Node {
public:
+ Node(void) : sindex_(0) {}
/*! \brief index of left child */
inline int cleft(void) const {
return this->cleft_;
@@ -110,6 +111,10 @@ class TreeModel {
inline bool is_left_child(void) const {
return (parent_ & (1U << 31)) != 0;
}
+ /*! \brief whether this node is deleted */
+ inline bool is_deleted(void) const {
+      return sindex_ == std::numeric_limits<unsigned>::max();
+ }
/*! \brief whether current node is root */
inline bool is_root(void) const {
return parent_ == -1;
@@ -144,7 +149,11 @@ class TreeModel {
this->cleft_ = -1;
this->cright_ = right;
}
-
+ /*! \brief mark that this node is deleted */
+ inline void mark_delete(void) {
+      this->sindex_ = std::numeric_limits<unsigned>::max();
+ }
+
private:
friend class TreeModel;
/*!
@@ -197,11 +206,11 @@ class TreeModel {
leaf_vector.resize(param.num_nodes * param.size_leaf_vector);
return nd;
}
- // delete a tree node
+ // delete a tree node, keep the parent field to allow trace back
inline void DeleteNode(int nid) {
utils::Assert(nid >= param.num_roots, "can not delete root");
deleted_nodes.push_back(nid);
- nodes[nid].set_parent(-1);
+ nodes[nid].mark_delete();
++param.num_deleted;
}
@@ -296,11 +305,12 @@ class TreeModel {
}
// chg deleted nodes
deleted_nodes.resize(0);
- for (int i = param.num_roots; i < param.num_nodes; i ++) {
- if (nodes[i].is_root()) deleted_nodes.push_back(i);
+ for (int i = param.num_roots; i < param.num_nodes; ++i) {
+ if (nodes[i].is_deleted()) deleted_nodes.push_back(i);
}
utils::Assert(static_cast(deleted_nodes.size()) == param.num_deleted,
- "number of deleted nodes do not match");
+ "number of deleted nodes do not match, num_deleted=%d, dnsize=%lu, num_nodes=%d",
+ param.num_deleted, deleted_nodes.size(), param.num_nodes);
}
/*!
* \brief save model to stream
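The switch from set_parent(-1) to mark_delete above is deliberate: a deleted node now keeps its parent link so positions can still be traced back during distributed construction, and deletion is encoded in sindex_ instead. The sentinel is safe because UINT_MAX can never be a live value: the top bit of sindex_ stores the default direction and the low 31 bits the split feature index. A reduced mock of the bookkeeping:

    #include <limits>

    // Deletion sentinel lives in the split index rather than the parent
    // link, so a deleted child remains traceable to its parent.
    struct MiniNode {
      int parent;
      unsigned sindex;  // default-direction bit + 31-bit split feature index
      void MarkDelete() { sindex = std::numeric_limits<unsigned>::max(); }
      bool IsDeleted() const {
        return sindex == std::numeric_limits<unsigned>::max();
      }
    };
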
diff --git a/src/tree/param.h b/src/tree/param.h
index 04ea5277f..2c2362095 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -36,8 +36,14 @@ struct TrainParam{
float colsample_bytree;
// speed optimization for dense column
float opt_dense_col;
+ // accuracy of sketch
+ float sketch_eps;
+  // budget multiplier on the sketch size
+ float sketch_ratio;
// leaf vector size
- int size_leaf_vector;
+ int size_leaf_vector;
+ // option for parallelization
+ int parallel_option;
// number of threads to be used for tree construction,
// if OpenMP is enabled, if equals 0, use system default
int nthread;
@@ -55,6 +61,9 @@ struct TrainParam{
opt_dense_col = 1.0f;
nthread = 0;
size_leaf_vector = 0;
+ parallel_option = 2;
+ sketch_eps = 0.1f;
+ sketch_ratio = 2.0f;
}
/*!
* \brief set parameters from outside
@@ -76,10 +85,13 @@ struct TrainParam{
if (!strcmp(name, "subsample")) subsample = static_cast(atof(val));
if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast(atof(val));
if (!strcmp(name, "colsample_bytree")) colsample_bytree = static_cast(atof(val));
+ if (!strcmp(name, "sketch_eps")) sketch_eps = static_cast(atof(val));
+ if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast(atof(val));
if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast(atof(val));
if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
if (!strcmp(name, "max_depth")) max_depth = atoi(val);
if (!strcmp(name, "nthread")) nthread = atoi(val);
+ if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
if (!strcmp(name, "default_direction")) {
if (!strcmp(val, "learn")) default_direction = 0;
if (!strcmp(val, "left")) default_direction = 1;
@@ -132,6 +144,12 @@ struct TrainParam{
inline bool cannot_split(double sum_hess, int depth) const {
return sum_hess < this->min_child_weight * 2.0;
}
+ /*! \brief maximum sketch size */
+ inline unsigned max_sketch_size(void) const {
+    unsigned ret = static_cast<unsigned>(sketch_ratio / sketch_eps);
+ utils::Check(ret > 0, "sketch_ratio/sketch_eps must be bigger than 1");
+ return ret;
+ }
protected:
// functions for L1 cost
@@ -186,6 +204,10 @@ struct GradStats {
inline void Add(const GradStats &b) {
this->Add(b.sum_grad, b.sum_hess);
}
+  /*! \brief same as Add; Reduce is used by Allreduce */
+ inline static void Reduce(GradStats &a, const GradStats &b) {
+ a.Add(b);
+ }
/*! \brief set current value to a - b */
inline void SetSubstract(const GradStats &a, const GradStats &b) {
sum_grad = a.sum_grad - b.sum_grad;
@@ -262,6 +284,10 @@ struct CVGradStats : public GradStats {
valid[i].Add(b.valid[i]);
}
}
+  /*! \brief same as Add; Reduce is used by Allreduce */
+ inline static void Reduce(CVGradStats &a, const CVGradStats &b) {
+ a.Add(b);
+ }
/*! \brief set current value to a - b */
inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
GradStats::SetSubstract(a, b);
@@ -341,6 +367,10 @@ struct SplitEntry{
return false;
}
}
+  /*! \brief same as Update; Reduce is used by Allreduce */
+ inline static void Reduce(SplitEntry &dst, const SplitEntry &src) {
+ dst.Update(src);
+ }
/*!\return feature index to split on */
inline unsigned split_index(void) const {
return sindex & ((1U << 31) - 1U);
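max_sketch_size ties the two new knobs together: with the defaults sketch_eps=0.1 and sketch_ratio=2.0, each per-node, per-feature quantile sketch is capped at 2.0 / 0.1 = 20 summary entries. The static Reduce(dst, src) helpers added to GradStats, CVGradStats, and SplitEntry exist so these plain structs can be combined by rabit's custom reducers; a hedged sketch of that wiring (rabit::Reducer as provided by the subtree, include path assumed):

    #include <vector>
    #include <rabit.h>
    #include "param.h"  // assumed path to the SplitEntry definition above

    // Combine per-node best splits across workers: afterwards every worker
    // holds, for each node, the split candidate with the largest loss_chg.
    inline void SyncBestSplits(std::vector<xgboost::tree::SplitEntry> *splits) {
      if (splits->empty()) return;
      rabit::Reducer<xgboost::tree::SplitEntry,
                     xgboost::tree::SplitEntry::Reduce> reducer;
      reducer.Allreduce(&(*splits)[0], splits->size());
    }
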
diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index 2cb6552fe..53b3d6aa1 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -1,10 +1,16 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
#include
#include "./updater.h"
#include "./updater_prune-inl.hpp"
#include "./updater_refresh-inl.hpp"
#include "./updater_colmaker-inl.hpp"
+#ifndef XGBOOST_STRICT_CXX98_
+#include "./updater_sync-inl.hpp"
+#include "./updater_distcol-inl.hpp"
+#include "./updater_histmaker-inl.hpp"
+#endif
namespace xgboost {
namespace tree {
@@ -13,6 +19,11 @@ IUpdater* CreateUpdater(const char *name) {
if (!strcmp(name, "prune")) return new TreePruner();
if (!strcmp(name, "refresh")) return new TreeRefresher();
if (!strcmp(name, "grow_colmaker")) return new ColMaker();
+#ifndef XGBOOST_STRICT_CXX98_
+ if (!strcmp(name, "sync")) return new TreeSyncher();
+ if (!strcmp(name, "grow_histmaker")) return new CQHistMaker();
+ if (!strcmp(name, "distcol")) return new DistColMaker();
+#endif
utils::Error("unknown updater:%s", name);
return NULL;
}
diff --git a/src/tree/updater.h b/src/tree/updater.h
index e3a05c84f..49adc8dca 100644
--- a/src/tree/updater.h
+++ b/src/tree/updater.h
@@ -37,6 +37,16 @@ class IUpdater {
IFMatrix *p_fmat,
const BoosterInfo &info,
                      const std::vector<RegTree*> &trees) = 0;
+
+ /*!
+ * \brief this is simply a function for optimizing performance
+ * this function asks the updater to return the leaf position of each instance in the p_fmat,
+ * if it is cached in the updater, if it is not available, return NULL
+ * \return array of leaf position of each instance in the last updated tree
+ */
+ virtual const int* GetLeafPosition(void) const {
+ return NULL;
+ }
// destructor
virtual ~IUpdater(void) {}
};
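GetLeafPosition lets the booster reuse leaf assignments the updater already computed while growing the tree, instead of pushing every cached row through the new tree again; NULL means "not cached, re-predict". A hedged sketch of the intended consumer, with hypothetical names independent of the gbm classes:

    #include <cstddef>
    #include <vector>

    // Update a prediction cache from cached leaf positions when available.
    // leaf_position may be NULL, in which case the caller must fall back
    // to walking the tree for each row (omitted here).
    inline void UpdateCacheFromLeaves(const int *leaf_position,
                                      const std::vector<unsigned> &rowset,
                                      const std::vector<float> &leaf_value,
                                      std::vector<float> *pred_cache) {
      if (leaf_position == NULL) return;  // fallback path not shown
      for (size_t i = 0; i < rowset.size(); ++i) {
        (*pred_cache)[rowset[i]] += leaf_value[leaf_position[rowset[i]]];
      }
    }
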
diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp
new file mode 100644
index 000000000..f8816dd6e
--- /dev/null
+++ b/src/tree/updater_basemaker-inl.hpp
@@ -0,0 +1,409 @@
+#ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
+#define XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
+/*!
+ * \file updater_basemaker-inl.hpp
+ * \brief implement a common tree constructor
+ * \author Tianqi Chen
+ */
+#include
+#include
+#include
+#include "../sync/sync.h"
+#include "../utils/random.h"
+#include "../utils/quantile.h"
+
+namespace xgboost {
+namespace tree {
+/*!
+ * \brief base tree maker class that defines common operations
+ *  needed in tree making
+ */
+class BaseMaker: public IUpdater {
+ public:
+ // destructor
+ virtual ~BaseMaker(void) {}
+ // set training parameter
+ virtual void SetParam(const char *name, const char *val) {
+ param.SetParam(name, val);
+ }
+
+ protected:
+ // helper to collect and query feature meta information
+ struct FMetaHelper {
+ public:
+ /*! \brief find type of each feature, use column format */
+ inline void InitByCol(IFMatrix *p_fmat,
+ const RegTree &tree) {
+ fminmax.resize(tree.param.num_feature * 2);
+ std::fill(fminmax.begin(), fminmax.end(),
+                -std::numeric_limits<bst_float>::max());
+ // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+ iter->BeforeFirst();
+ while (iter->Next()) {
+ const ColBatch &batch = iter->Value();
+ for (bst_uint i = 0; i < batch.size; ++i) {
+ const bst_uint fid = batch.col_index[i];
+ const ColBatch::Inst &c = batch[i];
+ if (c.length != 0) {
+ fminmax[fid * 2 + 0] = std::max(-c[0].fvalue, fminmax[fid * 2 + 0]);
+ fminmax[fid * 2 + 1] = std::max(c[c.length - 1].fvalue, fminmax[fid * 2 + 1]);
+ }
+ }
+ }
+      rabit::Allreduce<rabit::op::Max>(BeginPtr(fminmax), fminmax.size());
+ }
+ // get feature type, 0:empty 1:binary 2:real
+ inline int Type(bst_uint fid) const {
+ utils::Assert(fid * 2 + 1 < fminmax.size(),
+ "FeatHelper fid exceed query bound ");
+ bst_float a = fminmax[fid * 2];
+ bst_float b = fminmax[fid * 2 + 1];
+      if (a == -std::numeric_limits<bst_float>::max()) return 0;
+ if (-a == b) return 1;
+ else return 2;
+ }
+ inline bst_float MaxValue(bst_uint fid) const {
+      return fminmax[fid * 2 + 1];
+ }
+    inline void SampleCol(float p, std::vector<bst_uint> *p_findex) const {
+      std::vector<bst_uint> &findex = *p_findex;
+ findex.clear();
+ for (size_t i = 0; i < fminmax.size(); i += 2) {
+        const bst_uint fid = static_cast<bst_uint>(i / 2);
+ if (this->Type(fid) != 0) findex.push_back(fid);
+ }
+      unsigned n = static_cast<unsigned>(p * findex.size());
+ random::Shuffle(findex);
+ findex.resize(n);
+ // sync the findex if it is subsample
+ std::string s_cache;
+ utils::MemoryBufferStream fc(&s_cache);
+ utils::IStream &fs = fc;
+ if (rabit::GetRank() == 0) {
+ fs.Write(findex);
+ }
+ rabit::Broadcast(&s_cache, 0);
+ fs.Read(&findex);
+ }
+
+ private:
+    std::vector<bst_float> fminmax;
+ };
+ // ------static helper functions ------
+ // helper function to get to next level of the tree
+  /*! \brief helper function for row-based data */
+ inline static int NextLevel(const RowBatch::Inst &inst, const RegTree &tree, int nid) {
+ const RegTree::Node &n = tree[nid];
+ bst_uint findex = n.split_index();
+ for (unsigned i = 0; i < inst.length; ++i) {
+ if (findex == inst[i].index) {
+ if (inst[i].fvalue < n.split_cond()) {
+ return n.cleft();
+ } else {
+ return n.cright();
+ }
+ }
+ }
+ return n.cdefault();
+ }
+ /*! \brief get number of omp thread in current context */
+ inline static int get_nthread(void) {
+ int nthread;
+ #pragma omp parallel
+ {
+ nthread = omp_get_num_threads();
+ }
+ return nthread;
+ }
+ // ------class member helpers---------
+ /*! \brief initialize temp data structure */
+  inline void InitData(const std::vector<bst_gpair> &gpair,
+                       const IFMatrix &fmat,
+                       const std::vector<unsigned> &root_index,
+ const RegTree &tree) {
+ utils::Assert(tree.param.num_nodes == tree.param.num_roots,
+ "TreeMaker: can only grow new tree");
+ {// setup position
+ position.resize(gpair.size());
+ if (root_index.size() == 0) {
+ std::fill(position.begin(), position.end(), 0);
+ } else {
+ for (size_t i = 0; i < position.size(); ++i) {
+ position[i] = root_index[i];
+ utils::Assert(root_index[i] < (unsigned)tree.param.num_roots,
+ "root index exceed setting");
+ }
+ }
+      // mark the deleted rows (negative hessian)
+ for (size_t i = 0; i < position.size(); ++i) {
+ if (gpair[i].hess < 0.0f) position[i] = ~position[i];
+ }
+ // mark subsample
+ if (param.subsample < 1.0f) {
+ for (size_t i = 0; i < position.size(); ++i) {
+ if (gpair[i].hess < 0.0f) continue;
+ if (random::SampleBinary(param.subsample) == 0) position[i] = ~position[i];
+ }
+ }
+ }
+ {// expand query
+ qexpand.reserve(256); qexpand.clear();
+ for (int i = 0; i < tree.param.num_roots; ++i) {
+ qexpand.push_back(i);
+ }
+ this->UpdateNode2WorkIndex(tree);
+ }
+ }
+ /*! \brief update queue expand add in new leaves */
+ inline void UpdateQueueExpand(const RegTree &tree) {
+    std::vector<int> newnodes;
+ for (size_t i = 0; i < qexpand.size(); ++i) {
+ const int nid = qexpand[i];
+ if (!tree[nid].is_leaf()) {
+ newnodes.push_back(tree[nid].cleft());
+ newnodes.push_back(tree[nid].cright());
+ }
+ }
+ // use new nodes for qexpand
+ qexpand = newnodes;
+ this->UpdateNode2WorkIndex(tree);
+ }
+ // return decoded position
+  inline int DecodePosition(bst_uint ridx) const {
+ const int pid = position[ridx];
+ return pid < 0 ? ~pid : pid;
+ }
+ // encode the encoded position value for ridx
+ inline void SetEncodePosition(bst_uint ridx, int nid) {
+ if (position[ridx] < 0) {
+ position[ridx] = ~nid;
+ } else {
+ position[ridx] = nid;
+ }
+ }
+ /*!
+   * \brief helper function that uses the column-based data structure
+   *  to reset the positions to the latest ones
+ * \param nodes the set of nodes that contains the split to be used
+ * \param p_fmat feature matrix needed for tree construction
+ * \param tree the regression tree structure
+ */
+  inline void ResetPositionCol(const std::vector<int> &nodes, IFMatrix *p_fmat, const RegTree &tree) {
+ // set the positions in the nondefault
+ this->SetNonDefaultPositionCol(nodes, p_fmat, tree);
+ // set rest of instances to default position
+    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+ // set default direct nodes to default
+    // for leaf nodes that are not fresh, mark them as ~nid,
+ // so that they are ignored in future statistics collection
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+
+ #pragma omp parallel for schedule(static)
+ for (bst_omp_uint i = 0; i < ndata; ++i) {
+ const bst_uint ridx = rowset[i];
+ const int nid = this->DecodePosition(ridx);
+ if (tree[nid].is_leaf()) {
+ // mark finish when it is not a fresh leaf
+ if (tree[nid].cright() == -1) {
+ position[ridx] = ~nid;
+ }
+ } else {
+ // push to default branch
+ if (tree[nid].default_left()) {
+ this->SetEncodePosition(ridx, tree[nid].cleft());
+ } else {
+ this->SetEncodePosition(ridx, tree[nid].cright());
+ }
+ }
+ }
+ }
+ /*!
+   * \brief helper function that uses the column-based data structure
+   *  to update all positions into the non-default branch, if any, ignoring the default branch
+ * \param nodes the set of nodes that contains the split to be used
+ * \param p_fmat feature matrix needed for tree construction
+ * \param tree the regression tree structure
+ */
+  virtual void SetNonDefaultPositionCol(const std::vector<int> &nodes,
+ IFMatrix *p_fmat, const RegTree &tree) {
+ // step 1, classify the non-default data into right places
+    std::vector<bst_uint> fsplits;
+ for (size_t i = 0; i < nodes.size(); ++i) {
+ const int nid = nodes[i];
+ if (!tree[nid].is_leaf()) {
+ fsplits.push_back(tree[nid].split_index());
+ }
+ }
+ std::sort(fsplits.begin(), fsplits.end());
+ fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
+
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+ while (iter->Next()) {
+ const ColBatch &batch = iter->Value();
+ for (size_t i = 0; i < batch.size; ++i) {
+ ColBatch::Inst col = batch[i];
+ const bst_uint fid = batch.col_index[i];
+        const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
+ #pragma omp parallel for schedule(static)
+ for (bst_omp_uint j = 0; j < ndata; ++j) {
+ const bst_uint ridx = col[j].index;
+ const float fvalue = col[j].fvalue;
+ const int nid = this->DecodePosition(ridx);
+ // go back to parent, correct those who are not default
+ if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
+            if (fvalue < tree[nid].split_cond()) {
+ this->SetEncodePosition(ridx, tree[nid].cleft());
+ } else {
+ this->SetEncodePosition(ridx, tree[nid].cright());
+ }
+ }
+ }
+ }
+ }
+ }
+ /*! \brief helper function to get statistics from a tree */
+  template<typename TStats>
+  inline void GetNodeStats(const std::vector<bst_gpair> &gpair,
+ const IFMatrix &fmat,
+ const RegTree &tree,
+ const BoosterInfo &info,
+                           std::vector< std::vector<TStats> > *p_thread_temp,
+                           std::vector<TStats> *p_node_stats) {
+    std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
+ thread_temp.resize(this->get_nthread());
+ p_node_stats->resize(tree.param.num_nodes);
+ #pragma omp parallel
+ {
+ const int tid = omp_get_thread_num();
+ thread_temp[tid].resize(tree.param.num_nodes, TStats(param));
+ for (size_t i = 0; i < qexpand.size(); ++i) {
+ const unsigned nid = qexpand[i];
+ thread_temp[tid][nid].Clear();
+ }
+ }
+    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+ // setup position
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+ #pragma omp parallel for schedule(static)
+ for (bst_omp_uint i = 0; i < ndata; ++i) {
+ const bst_uint ridx = rowset[i];
+ const int nid = position[ridx];
+ const int tid = omp_get_thread_num();
+ if (nid >= 0) {
+ thread_temp[tid][nid].Add(gpair, info, ridx);
+ }
+ }
+ // sum the per thread statistics together
+ for (size_t j = 0; j < qexpand.size(); ++j) {
+ const int nid = qexpand[j];
+ TStats &s = (*p_node_stats)[nid];
+ s.Clear();
+ for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
+ s.Add(thread_temp[tid][nid]);
+ }
+ }
+ }
+ /*! \brief common helper data structure to build sketch*/
+ struct SketchEntry {
+ /*! \brief total sum of amount to be met */
+ bst_float sum_total;
+ /*! \brief statistics used in the sketch */
+ bst_float rmin, wmin;
+ /*! \brief last seen feature value */
+ bst_float last_fvalue;
+ /*! \brief current size of sketch */
+ bst_float next_goal;
+ // pointer to the sketch to put things in
+    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
+ // initialize the space
+ inline void Init(unsigned max_size) {
+ next_goal = -1.0f;
+ rmin = wmin = 0.0f;
+ sketch->temp.Reserve(max_size + 1);
+ sketch->temp.size = 0;
+ }
+ /*!
+ * \brief push a new element to sketch
+ * \param fvalue feature value, comes in sorted ascending order
+ * \param w weight
+     * \param max_size maximum number of summary entries kept in the sketch
+ */
+ inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
+ if (next_goal == -1.0f) {
+ next_goal = 0.0f;
+ last_fvalue = fvalue;
+ wmin = w;
+ return;
+ }
+ if (last_fvalue != fvalue) {
+ bst_float rmax = rmin + wmin;
+ if (rmax >= next_goal) {
+ if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+ // push to sketch
+          sketch->temp.data[sketch->temp.size] =
+              utils::WXQuantileSketch<bst_float, bst_float>::
+              Entry(rmin, rmax, wmin, last_fvalue);
+ utils::Assert(sketch->temp.size < max_size,
+ "invalid maximum size max_size=%u, stemp.size=%lu\n",
+ max_size, sketch->temp.size);
+ ++sketch->temp.size;
+ }
+ if (sketch->temp.size == max_size) {
+ next_goal = sum_total * 2.0f + 1e-5f;
+          } else {
+            next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
+ }
+ }
+ rmin = rmax;
+ wmin = w;
+ last_fvalue = fvalue;
+ } else {
+ wmin += w;
+ }
+ }
+ /*! \brief push final unfinished value to the sketch */
+ inline void Finalize(unsigned max_size) {
+ bst_float rmax = rmin + wmin;
+ if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+        utils::Assert(sketch->temp.size <= max_size,
+                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
+                      max_size, sketch->temp.size);
+ // push to sketch
+        sketch->temp.data[sketch->temp.size] =
+            utils::WXQuantileSketch<bst_float, bst_float>::
+            Entry(rmin, rmax, wmin, last_fvalue);
+ ++sketch->temp.size;
+ }
+ sketch->PushTemp();
+ }
+ };
+ /*! \brief training parameter of tree grower */
+ TrainParam param;
+ /*! \brief queue of nodes to be expanded */
+  std::vector<int> qexpand;
+ /*!
+   * \brief map an active node to its working index offset in qexpand;
+   *  can be -1, which means the node is not actively expanding
+ */
+  std::vector<int> node2workindex;
+ /*!
+ * \brief position of each instance in the tree
+ * can be negative, which means this position is no longer expanding
+ * see also Decode/EncodePosition
+ */
+  std::vector<int> position;
+
+ private:
+ inline void UpdateNode2WorkIndex(const RegTree &tree) {
+ // update the node2workindex
+ std::fill(node2workindex.begin(), node2workindex.end(), -1);
+ node2workindex.resize(tree.param.num_nodes);
+ for (size_t i = 0; i < qexpand.size(); ++i) {
+ node2workindex[qexpand[i]] = static_cast(i);
+ }
+ }
+};
+} // namespace tree
+} // namespace xgboost
+#endif // XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
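The position array above packs two facts into one int: which node a row sits at, and whether the row is still active. Inactive rows (negative hessian, subsampled out, or settled in a stale leaf) store the bitwise complement ~nid, so the node id survives and the sign bit marks activity. A self-contained sketch of the encoding:

    #include <cassert>

    // Active row: position == nid (>= 0). Inactive row: position == ~nid (< 0).
    // Since ~nid == -(nid + 1), the node id is always recoverable.
    inline int DecodePos(int position) { return position < 0 ? ~position : position; }
    inline int Deactivate(int position) { return position < 0 ? position : ~position; }

    int main() {
      int p = 5;          // row currently at node 5, active
      p = Deactivate(p);  // now ~5 == -6, flagged inactive
      assert(DecodePos(p) == 5);
      return 0;
    }
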
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index 2d7c5311e..bbf6242c5 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -14,7 +14,7 @@
namespace xgboost {
namespace tree {
-/*! \brief pruner that prunes a tree after growing finishs */
+/*! \brief column-wise updater to construct a tree */
template<typename TStats>
class ColMaker: public IUpdater {
public:
@@ -36,24 +36,29 @@ class ColMaker: public IUpdater {
Builder builder(param);
builder.Update(gpair, p_fmat, info, trees[i]);
}
+
param.learning_rate = lr;
}
- private:
+ protected:
// training parameter
TrainParam param;
// data structure
/*! \brief per thread x per node entry to store tmp data */
struct ThreadEntry {
- /*! \brief statistics of data*/
+ /*! \brief statistics of data */
TStats stats;
+ /*! \brief extra statistics of data */
+ TStats stats_extra;
/*! \brief last feature value scanned */
float last_fvalue;
+ /*! \brief first feature value scanned */
+ float first_fvalue;
/*! \brief current best solution */
SplitEntry best;
// constructor
    explicit ThreadEntry(const TrainParam &param)
- : stats(param) {
+ : stats(param), stats_extra(param) {
}
};
struct NodeEntry {
@@ -104,7 +109,7 @@ class ColMaker: public IUpdater {
}
}
- private:
+ protected:
// initialize temp data structure
    inline void InitData(const std::vector<bst_gpair> &gpair,
const IFMatrix &fmat,
@@ -127,17 +132,17 @@ class ColMaker: public IUpdater {
      // mark the deleted rows (negative hessian)
for (size_t i = 0; i < rowset.size(); ++i) {
const bst_uint ridx = rowset[i];
- if (gpair[ridx].hess < 0.0f) position[ridx] = -1;
+ if (gpair[ridx].hess < 0.0f) position[ridx] = ~position[ridx];
}
// mark subsample
if (param.subsample < 1.0f) {
for (size_t i = 0; i < rowset.size(); ++i) {
const bst_uint ridx = rowset[i];
if (gpair[ridx].hess < 0.0f) continue;
- if (random::SampleBinary(param.subsample) == 0) position[ridx] = -1;
+ if (random::SampleBinary(param.subsample) == 0) position[ridx] = ~position[ridx];
}
}
- }
+ }
{
// initialize feature index
      unsigned ncol = static_cast<unsigned>(fmat.NumCol());
@@ -148,7 +153,8 @@ class ColMaker: public IUpdater {
}
      unsigned n = static_cast<unsigned>(param.colsample_bytree * feat_index.size());
random::Shuffle(feat_index);
- utils::Check(n > 0, "colsample_bytree is too small that no feature can be included");
+      utils::Check(n > 0, "colsample_bytree=%g is too small, no feature can be included", param.colsample_bytree);
feat_index.resize(n);
}
{// setup temp space for each thread
@@ -219,7 +225,138 @@ class ColMaker: public IUpdater {
}
// use new nodes for qexpand
qexpand = newnodes;
- }
+ }
+ // parallel find the best split of current fid
+ // this function does not support nested functions
+ inline void ParallelFindSplit(const ColBatch::Inst &col,
+ bst_uint fid,
+ const IFMatrix &fmat,
+                                 const std::vector<bst_gpair> &gpair,
+ const BoosterInfo &info) {
+ bool need_forward = param.need_forward_search(fmat.GetColDensity(fid));
+ bool need_backward = param.need_backward_search(fmat.GetColDensity(fid));
+      const std::vector<int> &qexpand = qexpand_;
+ int nthread;
+ #pragma omp parallel
+ {
+ const int tid = omp_get_thread_num();
+        std::vector<ThreadEntry> &temp = stemp[tid];
+ // cleanup temp statistics
+ for (size_t j = 0; j < qexpand.size(); ++j) {
+ temp[qexpand[j]].stats.Clear();
+ }
+ nthread = omp_get_num_threads();
+ bst_uint step = (col.length + nthread - 1) / nthread;
+ bst_uint end = std::min(col.length, step * (tid + 1));
+ for (bst_uint i = tid * step; i < end; ++i) {
+ const bst_uint ridx = col[i].index;
+ const int nid = position[ridx];
+ if (nid < 0) continue;
+ const float fvalue = col[i].fvalue;
+ if (temp[nid].stats.Empty()) {
+ temp[nid].first_fvalue = fvalue;
+ }
+ temp[nid].stats.Add(gpair, info, ridx);
+ temp[nid].last_fvalue = fvalue;
+ }
+ }
+ // start collecting the partial sum statistics
+      bst_omp_uint nnode = static_cast<bst_omp_uint>(qexpand.size());
+ #pragma omp parallel for schedule(static)
+ for (bst_omp_uint j = 0; j < nnode; ++j) {
+ const int nid = qexpand[j];
+ TStats sum(param), tmp(param), c(param);
+ for (int tid = 0; tid < nthread; ++tid) {
+ tmp = stemp[tid][nid].stats;
+ stemp[tid][nid].stats = sum;
+ sum.Add(tmp);
+ if (tid != 0) {
+ std::swap(stemp[tid - 1][nid].last_fvalue, stemp[tid][nid].first_fvalue);
+ }
+ }
+ for (int tid = 0; tid < nthread; ++tid) {
+ stemp[tid][nid].stats_extra = sum;
+ ThreadEntry &e = stemp[tid][nid];
+ float fsplit;
+ if (tid != 0) {
+            if (fabsf(stemp[tid - 1][nid].last_fvalue - e.first_fvalue) > rt_2eps) {
+              fsplit = (stemp[tid - 1][nid].last_fvalue + e.first_fvalue) * 0.5f;
+ } else {
+ continue;
+ }
+ } else {
+ fsplit = e.first_fvalue - rt_eps;
+ }
+ if (need_forward && tid != 0) {
+ c.SetSubstract(snode[nid].stats, e.stats);
+ if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) {
+              bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+ e.best.Update(loss_chg, fid, fsplit, false);
+ }
+ }
+ if (need_backward) {
+ tmp.SetSubstract(sum, e.stats);
+ c.SetSubstract(snode[nid].stats, tmp);
+ if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) {
+            bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+ e.best.Update(loss_chg, fid, fsplit, true);
+ }
+ }
+ }
+ if (need_backward) {
+ tmp = sum;
+ ThreadEntry &e = stemp[nthread-1][nid];
+ c.SetSubstract(snode[nid].stats, tmp);
+ if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) {
+          bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+ e.best.Update(loss_chg, fid, e.last_fvalue + rt_eps, true);
+ }
+ }
+ }
+ // rescan, generate candidate split
+ #pragma omp parallel
+ {
+ TStats c(param), cright(param);
+ const int tid = omp_get_thread_num();
+        std::vector<ThreadEntry> &temp = stemp[tid];
+        nthread = static_cast<int>(omp_get_num_threads());
+ bst_uint step = (col.length + nthread - 1) / nthread;
+ bst_uint end = std::min(col.length, step * (tid + 1));
+ for (bst_uint i = tid * step; i < end; ++i) {
+ const bst_uint ridx = col[i].index;
+ const int nid = position[ridx];
+ if (nid < 0) continue;
+ const float fvalue = col[i].fvalue;
+ // get the statistics of nid
+ ThreadEntry &e = temp[nid];
+ if (e.stats.Empty()) {
+ e.stats.Add(gpair, info, ridx);
+ e.first_fvalue = fvalue;
+ } else {
+ // forward default right
+            if (fabsf(fvalue - e.first_fvalue) > rt_2eps) {
+ if (need_forward) {
+ c.SetSubstract(snode[nid].stats, e.stats);
+ if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) {
+                  bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+ e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, false);
+ }
+ }
+ if (need_backward) {
+ cright.SetSubstract(e.stats_extra, e.stats);
+ c.SetSubstract(snode[nid].stats, cright);
+ if (c.sum_hess >= param.min_child_weight && cright.sum_hess >= param.min_child_weight) {
+                  bst_float loss_chg = static_cast<bst_float>(cright.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+ e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, true);
+ }
+ }
+ }
+ e.stats.Add(gpair, info, ridx);
+ e.first_fvalue = fvalue;
+ }
+ }
+ }
+ }
// enumerate the split values of specific feature
inline void EnumerateSplit(const ColBatch::Entry *begin,
const ColBatch::Entry *end,
@@ -273,6 +410,42 @@ class ColMaker: public IUpdater {
}
}
}
+ // update the solution candidate
+ virtual void UpdateSolution(const ColBatch &batch,
+                                const std::vector<bst_gpair> &gpair,
+ const IFMatrix &fmat,
+ const BoosterInfo &info) {
+ // start enumeration
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+ #if defined(_OPENMP)
+      const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
+ #endif
+ int poption = param.parallel_option;
+ if (poption == 2) {
+ poption = nsize * 2 < nthread ? 1 : 0;
+ }
+ if (poption == 0) {
+ #pragma omp parallel for schedule(dynamic, batch_size)
+ for (bst_omp_uint i = 0; i < nsize; ++i) {
+ const bst_uint fid = batch.col_index[i];
+ const int tid = omp_get_thread_num();
+ const ColBatch::Inst c = batch[i];
+ if (param.need_forward_search(fmat.GetColDensity(fid))) {
+ this->EnumerateSplit(c.data, c.data + c.length, +1,
+ fid, gpair, info, stemp[tid]);
+ }
+ if (param.need_backward_search(fmat.GetColDensity(fid))) {
+ this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
+ fid, gpair, info, stemp[tid]);
+ }
+ }
+ } else {
+ for (bst_omp_uint i = 0; i < nsize; ++i) {
+ this->ParallelFindSplit(batch[i], batch.col_index[i],
+ fmat, gpair, info);
+ }
+ }
+ }
// find splits at current level, do split per level
inline void FindSplit(int depth,
                          const std::vector<int> &qexpand,
@@ -289,66 +462,76 @@ class ColMaker: public IUpdater {
}
      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
while (iter->Next()) {
- const ColBatch &batch = iter->Value();
- // start enumeration
-        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
- #if defined(_OPENMP)
-        const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
- #endif
- #pragma omp parallel for schedule(dynamic, batch_size)
- for (bst_omp_uint i = 0; i < nsize; ++i) {
- const bst_uint fid = batch.col_index[i];
- const int tid = omp_get_thread_num();
- const ColBatch::Inst c = batch[i];
- if (param.need_forward_search(p_fmat->GetColDensity(fid))) {
- this->EnumerateSplit(c.data, c.data + c.length, +1,
- fid, gpair, info, stemp[tid]);
+ this->UpdateSolution(iter->Value(), gpair, *p_fmat, info);
+ }
+ // after this each thread's stemp will get the best candidates, aggregate results
+ this->SyncBestSolution(qexpand);
+ // get the best result, we can synchronize the solution
+ for (size_t i = 0; i < qexpand.size(); ++i) {
+ const int nid = qexpand[i];
+ NodeEntry &e = snode[nid];
+ // now we know the solution in snode[nid], set split
+ if (e.best.loss_chg > rt_eps) {
+ p_tree->AddChilds(nid);
+ (*p_tree)[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
+ // mark right child as 0, to indicate fresh leaf
+ (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
+ (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
+ } else {
+ (*p_tree)[nid].set_leaf(e.weight * param.learning_rate);
+ }
+ }
+ }
+ // reset position of each data points after split is created in the tree
+    inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
+ // set the positions in the nondefault
+ this->SetNonDefaultPosition(qexpand, p_fmat, tree);
+ // set rest of instances to default position
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+ // set default direct nodes to default
+      // for leaf nodes that are not fresh, mark them as ~nid,
+ // so that they are ignored in future statistics collection
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>