diff --git a/CHANGES.md b/CHANGES.md index d834ce79d..90fd77ebb 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -21,8 +21,16 @@ xgboost-0.3 * Add [Code Guide](src/README.md) for customizing objective function and evaluation * Add R module -in progress version +xgboost-0.4 ===== -* Distributed version -* Feature importance visualization in R module, thanks to Michael Benesty -* Predict leaf inde +* Distributed version of xgboost that runs on YARN, scales to billions of examples +* Direct save/load data and model from/to S3 and HDFS +* Feature importance visualization in R module, by Michael Benesty +* Predict leaf index +* Poisson regression for counts data +* Early stopping option in training +* Native save load support in R and python + - xgboost models now can be saved using save/load in R + - xgboost python model is now pickable +* sklearn wrapper is supported in python module +* Experimental External memory version diff --git a/LICENSE b/LICENSE index b9f38c38a..3be067aed 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2014 by Tianqi Chen and Contributors +Copyright (c) 2014 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/Makefile b/Makefile index e426b797d..e568222c2 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ export CC = gcc export CXX = g++ export MPICXX = mpicxx export LDFLAGS= -pthread -lm -export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC +export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops ifeq ($(OS), Windows_NT) export CXX = g++ -m64 @@ -18,7 +18,6 @@ endif # by default use c++11 ifeq ($(cxx11),1) CFLAGS += -std=c++11 -else endif # handling dmlc @@ -38,6 +37,14 @@ else LIBDMLC=dmlc_simple.o endif +ifndef WITH_FPIC + WITH_FPIC = 1 +endif +ifeq ($(WITH_FPIC), 1) + CFLAGS += -fPIC +endif + + ifeq ($(OS), Windows_NT) LIBRABIT = subtree/rabit/lib/librabit_empty.a SLIB = wrapper/xgboost_wrapper.dll @@ -51,11 +58,15 @@ BIN = xgboost MOCKBIN = xgboost.mock OBJ = updater.o gbm.o io.o main.o dmlc_simple.o MPIBIN = -TARGET = $(BIN) $(OBJ) $(SLIB) +ifeq ($(WITH_FPIC), 1) + TARGET = $(BIN) $(OBJ) $(SLIB) +else + TARGET = $(BIN) +endif .PHONY: clean all mpi python Rpack -all: $(BIN) $(OBJ) $(SLIB) +all: $(TARGET) mpi: $(MPIBIN) python: wrapper/libxgboostwrapper.so @@ -79,7 +90,7 @@ subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc + cd subtree/rabit;make lib/librabit_mpi.a; cd ../.. $(BIN) : - $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) + $(CXX) $(CFLAGS) -fPIC -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) $(MOCKBIN) : $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index e1383d3ad..c6975af5e 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,8 +1,8 @@ Package: xgboost Type: Package Title: eXtreme Gradient Boosting -Version: 0.3-4 -Date: 2014-12-28 +Version: 0.4-0 +Date: 2015-05-11 Author: Tianqi Chen , Tong He , Michael Benesty Maintainer: Tong He Description: Xgboost is short for eXtreme Gradient Boosting, which is an diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index e5f5c7b72..df7fd5648 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -54,6 +54,13 @@ #' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices). 
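A quick illustration of the "native save/load support in R" item from the 0.4 change list above. This is a minimal sketch using the agaricus data shipped with the package, not code from this patch; the file name is arbitrary.

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')

# fit a tiny model
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nrounds = 2,
               objective = "binary:logistic")

# per the changelog, the booster now survives R's own serialization
save(bst, file = "xgb.RData")   # arbitrary file name
rm(bst)
load("xgb.RData")
pred <- predict(bst, agaricus.test$data)
```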
#' If folds are supplied, the nfold and stratified parameters would be ignored. #' @param verbose \code{boolean}, print the statistics during the process +#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed. +#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered. +#' If set to an integer \code{k}, training with a validation set will stop if the performance +#' keeps getting worse consecutively for \code{k} rounds. +#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. +#' \code{maximize=TRUE} means the larger the evaluation score the better. +#' #' @param ... other parameters to pass to \code{params}. #' #' @return @@ -86,7 +93,8 @@ #' xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics=list(), - obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,...) { + obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, print.every.n=1L, + early.stop.round = NULL, maximize = NULL, ...) { if (typeof(params) != "list") { stop("xgb.cv: first argument params must be list") } @@ -109,7 +117,50 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = for (mc in metrics) { params <- append(params, list("eval_metric"=mc)) } - + + # customized objective and evaluation metric interface + if (!is.null(params$objective) && !is.null(obj)) + stop("xgb.cv: cannot assign two different objectives") + if (!is.null(params$objective)) + if (class(params$objective)=='function') { + obj = params$objective + params$objective = NULL + } + if (!is.null(params$eval_metric) && !is.null(feval)) + stop("xgb.cv: cannot assign two different evaluation metrics") + if (!is.null(params$eval_metric)) + if (class(params$eval_metric)=='function') { + feval = params$eval_metric + params$eval_metric = NULL + } + + # Early Stopping + if (!is.null(early.stop.round)){ + if (!is.null(feval) && is.null(maximize)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (is.null(maximize) && is.null(params$eval_metric)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (is.null(maximize)) + { + if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) { + maximize = FALSE + } else { + maximize = TRUE + } + } + + if (maximize) { + bestScore = 0 + } else { + bestScore = Inf + } + bestInd = 0 + earlyStopflag = FALSE + + if (length(metrics)>1) + warning('Only the first metric is used for early stopping process.') + } + xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds) obj_type = params[['objective']] mat_pred = FALSE @@ -124,6 +175,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = else predictValues <- rep(0,xgb.numrow(dtrain)) history <- c() + print.every.n = max(as.integer(print.every.n), 1L) for (i in 1:nrounds) { msg <- list() for (k in 1:nfold) { @@ -148,7 +200,27 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = } ret <- xgb.cv.aggcv(msg, showsd) history <- c(history, ret) - if(verbose) paste(ret, "\n", sep="") %>% cat + if(verbose) + if (0==(i-1L)%%print.every.n) + cat(ret, "\n", sep="") + + # early_Stopping + if (!is.null(early.stop.round)){ + score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+2] + score = strsplit(score,'\\+|:')[[1]][[2]] + 
score = as.numeric(score) + if ((maximize && score>bestScore) || (!maximize && score<bestScore)) { + bestScore = score + bestInd = i + } else { + if (i-bestInd>=early.stop.round) { + earlyStopflag = TRUE + cat('Stopping. Best iteration:',bestInd) + break + } + } + } + } colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".") diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 23accef3a..3acd2b174 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -36,7 +36,7 @@ #' 3. Task Parameters #' #' \itemize{ -#' \item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below: +#' \item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below: #' \itemize{ #' \item \code{reg:linear} linear regression (Default). #' \item \code{reg:logistic} logistic regression. @@ -48,7 +48,7 @@ #' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. #' } #' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 -#' \item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. +#' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. #' } #' #' @param data takes an \code{xgb.DMatrix} as the input. @@ -66,7 +66,12 @@ #' prediction and dtrain, #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print #' information of performance. If 2, xgboost will print information of both -#' +#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed. +#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered. +#' If set to an integer \code{k}, training with a validation set will stop if the performance +#' keeps getting worse consecutively for \code{k} rounds. +#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. +#' \code{maximize=TRUE} means the larger the evaluation score the better. #' @param ... other parameters to pass to \code{params}. #' #' @details #' @@ -98,7 +103,6 @@ #' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) #' dtest <- dtrain #' watchlist <- list(eval = dtest, train = dtrain) -#' param <- list(max.depth = 2, eta = 1, silent = 1) #' logregobj <- function(preds, dtrain) { #' labels <- getinfo(dtrain, "label") #' preds <- 1/(1 + exp(-preds)) @@ -111,11 +115,13 @@ #' err <- as.numeric(sum(labels != (preds > 0)))/length(labels) #' return(list(metric = "error", value = err)) #' } -#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror) +#' param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror) +#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist) #' @export #' xgb.train <- function(params=list(), data, nrounds, watchlist = list(), - obj = NULL, feval = NULL, verbose = 1, ...) 
{ + obj = NULL, feval = NULL, verbose = 1, print.every.n=1L, + early.stop.round = NULL, maximize = NULL, ...) { dtrain <- data if (typeof(params) != "list") { stop("xgb.train: first argument params must be list") @@ -130,19 +136,85 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), } if (length(watchlist) != 0 && verbose == 0) { warning('watchlist is provided but verbose=0, no evaluation information will be printed') - watchlist <- list() } params = append(params, list(...)) + # customized objective and evaluation metric interface + if (!is.null(params$objective) && !is.null(obj)) + stop("xgb.train: cannot assign two different objectives") + if (!is.null(params$objective)) + if (class(params$objective)=='function') { + obj = params$objective + params$objective = NULL + } + if (!is.null(params$eval_metric) && !is.null(feval)) + stop("xgb.train: cannot assign two different evaluation metrics") + if (!is.null(params$eval_metric)) + if (class(params$eval_metric)=='function') { + feval = params$eval_metric + params$eval_metric = NULL + } + + # Early stopping + if (!is.null(early.stop.round)){ + if (!is.null(feval) && is.null(maximize)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (length(watchlist) == 0) + stop('For early stopping you need at least one set in watchlist.') + if (is.null(maximize) && is.null(params$eval_metric)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (is.null(maximize)) + { + if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) { + maximize = FALSE + } else { + maximize = TRUE + } + } + + if (maximize) { + bestScore = 0 + } else { + bestScore = Inf + } + bestInd = 0 + earlyStopflag = FALSE + + if (length(watchlist)>1) + warning('Only the first data set in watchlist is used for early stopping process.') + } + + handle <- xgb.Booster(params, append(watchlist, dtrain)) bst <- xgb.handleToBooster(handle) + print.every.n=max( as.integer(print.every.n), 1L) for (i in 1:nrounds) { succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj) if (length(watchlist) != 0) { msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval) - cat(paste(msg, "\n", sep="")) + if (0== ( (i-1) %% print.every.n)) + cat(paste(msg, "\n", sep="")) + if (!is.null(early.stop.round)) + { + score = strsplit(msg,':|\\s+')[[1]][3] + score = as.numeric(score) + if ((maximize && score>bestScore) || (!maximize && score<bestScore)) { + bestScore = score + bestInd = i + } else { + if (i-bestInd>=early.stop.round) { + earlyStopflag = TRUE + cat('Stopping. Best iteration:',bestInd) + break + } + } + } } } bst <- xgb.Booster.check(bst) + if (!is.null(early.stop.round)) { + bst$bestScore = bestScore + bst$bestInd = bestInd + } return(bst) } diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index ede53b116..63077f866 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -28,8 +28,14 @@ #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print #' information of performance. If 2, xgboost will print information of both #' performance and construction progress information +#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed. #' @param missing Missing is only used when input is dense matrix, pick a float #' value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values. +#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered. 
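A usage sketch for the early-stopping arguments wired in above (early.stop.round, maximize, print.every.n); the data set and parameter values are illustrative only. Classification error is a "smaller is better" metric, hence maximize = FALSE.

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest  <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

param <- list(max.depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
watchlist <- list(eval = dtest)

# stop once the eval error has not improved for 3 consecutive rounds,
# printing only every 5th evaluation message
bst <- xgb.train(param, dtrain, nrounds = 20, watchlist = watchlist,
                 early.stop.round = 3, maximize = FALSE, print.every.n = 5)
bst$bestInd    # round that achieved the best score
bst$bestScore  # the best score itself
```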
+#' If set to an integer \code{k}, training with a validation set will stop if the performance +#' keeps getting worse consecutively for \code{k} rounds. +#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. +#' \code{maximize=TRUE} means the larger the evaluation score the better. #' @param ... other parameters to pass to \code{params}. #' #' @details @@ -51,7 +57,8 @@ #' @export #' xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds, - verbose = 1, ...) { + verbose = 1, print.every.n = 1L, early.stop.round = NULL, + maximize = NULL, ...) { if (is.null(missing)) { dtrain <- xgb.get.DMatrix(data, label) } else { @@ -66,7 +73,8 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), watchlist <- list() } - bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose=verbose) + bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n, + early.stop.round = early.stop.round) return(bst) } diff --git a/R-package/README.md b/R-package/README.md index 7d2be411d..e974e3554 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -8,11 +8,6 @@ For up-to-date version (which is recommended), please install from github. Windo devtools::install_github('dmlc/xgboost',subdir='R-package') ``` -For stable version on CRAN, please run - -```r -install.packages('xgboost') -``` ## Examples diff --git a/R-package/demo/00Index b/R-package/demo/00Index index 969da0d91..0112eb9e1 100644 --- a/R-package/demo/00Index +++ b/R-package/demo/00Index @@ -6,3 +6,5 @@ generalized_linear_model Generalized Linear Model cross_validation Cross validation create_sparse_matrix Create Sparse Matrix predict_leaf_indices Predicting the corresponding leaves +early_stopping Early Stop in training +poisson_regression Poisson Regression on count data diff --git a/R-package/demo/cross_validation.R b/R-package/demo/cross_validation.R index fbb38f6d8..c3148ae21 100644 --- a/R-package/demo/cross_validation.R +++ b/R-package/demo/cross_validation.R @@ -40,10 +40,10 @@ evalerror <- function(preds, dtrain) { return(list(metric = "error", value = err)) } -param <- list(max.depth=2,eta=1,silent=1) +param <- list(max.depth=2,eta=1,silent=1, + objective = logregobj, eval_metric = evalerror) # train with customized objective -xgb.cv(param, dtrain, nround, nfold = 5, - obj = logregobj, feval=evalerror) +xgb.cv(param, dtrain, nround, nfold = 5) # do cross validation with prediction values for each fold res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE) diff --git a/R-package/demo/custom_objective.R b/R-package/demo/custom_objective.R index b0a0a02ca..201f23d98 100644 --- a/R-package/demo/custom_objective.R +++ b/R-package/demo/custom_objective.R @@ -8,7 +8,6 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) # note: for customized objective function, we leave objective as default # note: what we are getting is margin value in prediction # you must know what you are doing -param <- list(max.depth=2,eta=1,nthread = 2, silent=1) watchlist <- list(eval = dtest, train = dtrain) num_round <- 2 @@ -33,10 +32,13 @@ evalerror <- function(preds, dtrain) { err <- as.numeric(sum(labels != (preds > 0)))/length(labels) return(list(metric = "error", value = err)) } + +param <- list(max.depth=2,eta=1,nthread = 2, silent=1, + objective=logregobj, eval_metric=evalerror) print ('start training with user customized objective') # training with customized objective, 
we can also do step by step training # simply look at xgboost.py's implementation of train -bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror) +bst <- xgb.train(param, dtrain, num_round, watchlist) # # there can be cases where you want additional information @@ -59,4 +61,5 @@ logregobjattr <- function(preds, dtrain) { print ('start training with user customized objective, with additional attributes in DMatrix') # training with customized objective, we can also do step by step training # simply look at xgboost.py's implementation of train -bst <- xgb.train(param, dtrain, num_round, watchlist, logregobjattr, evalerror) +bst <- xgb.train(param, dtrain, num_round, watchlist, + objective=logregobj, eval_metric=evalerror) diff --git a/R-package/demo/early_stopping.R b/R-package/demo/early_stopping.R new file mode 100644 index 000000000..aa74aa2ee --- /dev/null +++ b/R-package/demo/early_stopping.R @@ -0,0 +1,40 @@ +require(xgboost) +# load in the agaricus dataset +data(agaricus.train, package='xgboost') +data(agaricus.test, package='xgboost') +dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) +dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) +# note: for customized objective function, we leave objective as default +# note: what we are getting is margin value in prediction +# you must know what you are doing +param <- list(max.depth=2,eta=1,nthread = 2, silent=1) +watchlist <- list(eval = dtest) +num_round <- 20 +# user define objective function, given prediction, return gradient and second order gradient +# this is loglikelihood loss +logregobj <- function(preds, dtrain) { + labels <- getinfo(dtrain, "label") + preds <- 1/(1 + exp(-preds)) + grad <- preds - labels + hess <- preds * (1 - preds) + return(list(grad = grad, hess = hess)) +} +# user defined evaluation function, return a pair metric_name, result +# NOTE: when you do customized loss function, the default prediction value is margin +# this may make buildin evalution metric not function properly +# for example, we are doing logistic loss, the prediction is score before logistic transformation +# the buildin evaluation error assumes input is after logistic transformation +# Take this in mind when you use the customization, and maybe you need write customized evaluation function +evalerror <- function(preds, dtrain) { + labels <- getinfo(dtrain, "label") + err <- as.numeric(sum(labels != (preds > 0)))/length(labels) + return(list(metric = "error", value = err)) +} +print ('start training with early Stopping setting') + +bst <- xgb.train(param, dtrain, num_round, watchlist, + objective = logregobj, eval_metric = evalerror, maximize = FALSE, + early.stop.round = 3) +bst <- xgb.cv(param, dtrain, num_round, nfold = 5, + objective = logregobj, eval_metric = evalerror, + maximize = FALSE, early.stop.round = 3) diff --git a/R-package/demo/poisson_regression.R b/R-package/demo/poisson_regression.R new file mode 100644 index 000000000..f9dc4ac62 --- /dev/null +++ b/R-package/demo/poisson_regression.R @@ -0,0 +1,7 @@ +data(mtcars) +head(mtcars) +bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11], + objective='count:poisson',nrounds=5) +pred = predict(bst,as.matrix(mtcars[,-11])) +sqrt(mean((pred-mtcars[,11])^2)) + diff --git a/R-package/demo/runall.R b/R-package/demo/runall.R index 2d0384156..7311ec95e 100644 --- a/R-package/demo/runall.R +++ b/R-package/demo/runall.R @@ -7,3 +7,5 @@ demo(generalized_linear_model) demo(cross_validation) demo(create_sparse_matrix) 
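The demo edits above switch from passing the custom objective and metric as positional arguments to carrying them inside params. Roughly, the two call styles relate as in this self-contained sketch, which uses simplified versions of logregobj and evalerror:

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
watchlist <- list(train = dtrain)

logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  list(grad = preds - labels, hess = preds * (1 - preds))
}
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  list(metric = "error", value = mean(labels != (preds > 0)))
}

# old style: objective and metric passed positionally as obj / feval
param <- list(max.depth = 2, eta = 1, silent = 1)
bst1 <- xgb.train(param, dtrain, 2, watchlist, logregobj, evalerror)

# new style: the functions travel inside params; xgb.train detects that
# objective / eval_metric are functions and moves them to obj / feval
param2 <- list(max.depth = 2, eta = 1, silent = 1,
               objective = logregobj, eval_metric = evalerror)
bst2 <- xgb.train(param2, dtrain, 2, watchlist)
```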
demo(predict_leaf_indices) +demo(early_stopping) +demo(poisson_regression) diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 19ab788f9..bb23992a2 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -7,7 +7,8 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, - verbose = T, ...) + verbose = T, print.every.n = 1L, early.stop.round = NULL, + maximize = NULL, ...) } \arguments{ \item{params}{the list of parameters. Commonly used ones are: @@ -65,6 +66,15 @@ If folds are supplied, the nfold and stratified parameters would be ignored.} \item{verbose}{\code{boolean}, print the statistics during the process} +\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} + +\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance +keeps getting worse consecutively for \code{k} rounds.} + +\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. + \code{maximize=TRUE} means the larger the evaluation score the better.} + \item{...}{other parameters to pass to \code{params}.} } \value{ diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index a24f337f9..7b1893ba7 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -5,7 +5,8 @@ \title{eXtreme Gradient Boosting Training} \usage{ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, - feval = NULL, verbose = 1, ...) + feval = NULL, verbose = 1, print.every.n = 1L, + early.stop.round = NULL, maximize = NULL, ...) } \arguments{ \item{params}{the list of parameters. @@ -42,7 +43,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, 3. Task Parameters \itemize{ -\item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below: +\item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below: \itemize{ \item \code{reg:linear} linear regression (Default). \item \code{reg:logistic} logistic regression. @@ -54,7 +55,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. } \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 - \item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. + \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. }} \item{data}{takes an \code{xgb.DMatrix} as the input.} @@ -75,7 +76,16 @@ gradient with given prediction and dtrain,} prediction and dtrain,} \item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print - information of performance. 
If 2, xgboost will print information of both} +information of performance. If 2, xgboost will print information of both} + +\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} + +\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance +keeps getting worse consecutively for \code{k} rounds.} + +\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. +\code{maximize=TRUE} means the larger the evaluation score the better.} \item{...}{other parameters to pass to \code{params}.} } @@ -110,7 +120,6 @@ data(agaricus.train, package='xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- dtrain watchlist <- list(eval = dtest, train = dtrain) -param <- list(max.depth = 2, eta = 1, silent = 1) logregobj <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") preds <- 1/(1 + exp(-preds)) @@ -123,6 +132,7 @@ evalerror <- function(preds, dtrain) { err <- as.numeric(sum(labels != (preds > 0)))/length(labels) return(list(metric = "error", value = err)) } -bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror) +param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror) +bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist) } diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 79cff207a..64bd00369 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -5,7 +5,8 @@ \title{eXtreme Gradient Boosting (Tree) library} \usage{ xgboost(data = NULL, label = NULL, missing = NULL, params = list(), - nrounds, verbose = 1, ...) + nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL, + maximize = NULL, ...) } \arguments{ \item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or @@ -41,6 +42,15 @@ Commonly used ones are: information of performance. If 2, xgboost will print information of both performance and construction progress information} +\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} + +\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance +keeps getting worse consecutively for \code{k} rounds.} + +\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. 
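The same early-stopping arguments are available in xgb.cv, as documented above. A hedged sketch with illustrative values (only the first metric is used for the stopping decision):

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

param <- list(max.depth = 2, eta = 1, silent = 1,
              objective = "binary:logistic", eval_metric = "error")

# 5-fold CV; print every 5th round and stop once the held-out error has not
# improved for 3 consecutive rounds (error: smaller is better)
res <- xgb.cv(param, dtrain, nrounds = 50, nfold = 5,
              print.every.n = 5, early.stop.round = 3, maximize = FALSE)
```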
+\code{maximize=TRUE} means the larger the evaluation score the better.} + \item{...}{other parameters to pass to \code{params}.} } \description{ diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index 15957480c..de6ed339f 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -70,10 +70,10 @@ extern "C" { SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) { _WrapperBegin(); void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent)); + _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); - _WrapperEnd(); return ret; } SEXP XGDMatrixCreateFromMat_R(SEXP mat, @@ -91,10 +91,10 @@ extern "C" { } } void *handle = XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing)); + _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); - _WrapperEnd(); return ret; } SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, @@ -120,10 +120,10 @@ extern "C" { } void *handle = XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_), BeginPtr(data_), nindptr, ndata); + _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); - _WrapperEnd(); return ret; } SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) { @@ -134,10 +134,10 @@ extern "C" { idxvec[i] = INTEGER(idxset)[i] - 1; } void *res = XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len); + _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); UNPROTECT(1); - _WrapperEnd(); return ret; } void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) { @@ -157,10 +157,7 @@ extern "C" { vec[i] = static_cast(INTEGER(array)[i]); } XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len); - _WrapperEnd(); - return; - } - { + } else { std::vector vec(len); #pragma omp parallel for schedule(static) for (int i = 0; i < len; ++i) { @@ -177,12 +174,12 @@ extern "C" { bst_ulong olen; const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle), CHAR(asChar(field)), &olen); + _WrapperEnd(); SEXP ret = PROTECT(allocVector(REALSXP, olen)); for (size_t i = 0; i < olen; ++i) { REAL(ret)[i] = res[i]; } UNPROTECT(1); - _WrapperEnd(); return ret; } SEXP XGDMatrixNumRow_R(SEXP handle) { @@ -203,10 +200,10 @@ extern "C" { dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i))); } void *handle = XGBoosterCreate(BeginPtr(dvec), dvec.size()); + _WrapperEnd(); SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE); UNPROTECT(1); - _WrapperEnd(); return ret; } void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) { @@ -252,10 +249,12 @@ extern "C" { for (int i = 0; i < len; ++i) { vec_sptr.push_back(vec_names[i].c_str()); } + const char *ret = + XGBoosterEvalOneIter(R_ExternalPtrAddr(handle), + asInteger(iter), + BeginPtr(vec_dmats), BeginPtr(vec_sptr), len); _WrapperEnd(); - return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle), - asInteger(iter), - BeginPtr(vec_dmats), BeginPtr(vec_sptr), len)); + return mkString(ret); } SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) { _WrapperBegin(); @@ -265,12 +264,12 @@ extern "C" { asInteger(option_mask), asInteger(ntree_limit), &olen); + 
_WrapperEnd(); SEXP ret = PROTECT(allocVector(REALSXP, olen)); for (size_t i = 0; i < olen; ++i) { REAL(ret)[i] = res[i]; } UNPROTECT(1); - _WrapperEnd(); return ret; } void XGBoosterLoadModel_R(SEXP handle, SEXP fname) { @@ -305,17 +304,18 @@ extern "C" { SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) { _WrapperBegin(); bst_ulong olen; - const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle), - CHAR(asChar(fmap)), - asInteger(with_stats), - &olen); + const char **res = + XGBoosterDumpModel(R_ExternalPtrAddr(handle), + CHAR(asChar(fmap)), + asInteger(with_stats), + &olen); + _WrapperEnd(); SEXP out = PROTECT(allocVector(STRSXP, olen)); for (size_t i = 0; i < olen; ++i) { stringstream stream; stream << "booster["<0.5)!=labels[i]) /float(len(preds)))) +print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds)))) bst.save_model('0001.model') # dump model bst.dump_model('dump.raw.txt') @@ -28,6 +29,7 @@ bst.dump_model('dump.nice.txt','../data/featmap.txt') # save dmatrix into binary buffer dtest.save_binary('dtest.buffer') +# save model bst.save_model('xgb.model') # load model and data in bst2 = xgb.Booster(model_file='xgb.model') @@ -36,6 +38,14 @@ preds2 = bst2.predict(dtest2) # assert they are the same assert np.sum(np.abs(preds2-preds)) == 0 +# alternatively, you can pickle the booster +pks = pickle.dumps(bst2) +# load model and data in +bst3 = pickle.loads(pks) +preds3 = bst2.predict(dtest2) +# assert they are the same +assert np.sum(np.abs(preds3-preds)) == 0 + ### # build dmatrix from scipy.sparse print ('start running example of build DMatrix from scipy.sparse CSR Matrix') @@ -44,22 +54,22 @@ row = []; col = []; dat = [] i = 0 for l in open('../data/agaricus.txt.train'): arr = l.split() - labels.append( int(arr[0])) + labels.append(int(arr[0])) for it in arr[1:]: k,v = it.split(':') row.append(i); col.append(int(k)); dat.append(float(v)) i += 1 -csr = scipy.sparse.csr_matrix( (dat, (row,col)) ) -dtrain = xgb.DMatrix( csr, label = labels ) +csr = scipy.sparse.csr_matrix((dat, (row,col))) +dtrain = xgb.DMatrix(csr, label = labels) watchlist = [(dtest,'eval'), (dtrain,'train')] -bst = xgb.train( param, dtrain, num_round, watchlist ) +bst = xgb.train(param, dtrain, num_round, watchlist) print ('start running example of build DMatrix from scipy.sparse CSC Matrix') # we can also construct from csc matrix -csc = scipy.sparse.csc_matrix( (dat, (row,col)) ) +csc = scipy.sparse.csc_matrix((dat, (row,col))) dtrain = xgb.DMatrix(csc, label=labels) watchlist = [(dtest,'eval'), (dtrain,'train')] -bst = xgb.train( param, dtrain, num_round, watchlist ) +bst = xgb.train(param, dtrain, num_round, watchlist) print ('start running example of build DMatrix from numpy array') # NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation @@ -67,6 +77,6 @@ print ('start running example of build DMatrix from numpy array') npymat = csr.todense() dtrain = xgb.DMatrix(npymat, label = labels) watchlist = [(dtest,'eval'), (dtrain,'train')] -bst = xgb.train( param, dtrain, num_round, watchlist ) +bst = xgb.train(param, dtrain, num_round, watchlist) diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py index dd0620a7c..ce8c8d01e 100755 --- a/demo/guide-python/sklearn_examples.py +++ b/demo/guide-python/sklearn_examples.py @@ -4,18 +4,17 @@ Created on 1 Apr 2015 @author: Jamie Hall ''' - +import pickle import xgboost as xgb import numpy as np from 
sklearn.cross_validation import KFold -from sklearn.grid_search import GridSearchCV from sklearn.metrics import confusion_matrix, mean_squared_error +from sklearn.grid_search import GridSearchCV from sklearn.datasets import load_iris, load_digits, load_boston rng = np.random.RandomState(31337) - print("Zeros and Ones from the Digits dataset: binary classification") digits = load_digits(2) y = digits['target'] @@ -60,4 +59,9 @@ clf.fit(X,y) print(clf.best_score_) print(clf.best_params_) - +# The sklearn API models are picklable +print("Pickling sklearn API models") +# must open in binary format to pickle +pickle.dump(clf, open("best_boston.pkl", "wb")) +clf2 = pickle.load(open("best_boston.pkl", "rb")) +print(np.allclose(clf.predict(X), clf2.predict(X))) diff --git a/demo/guide-python/sklearn_parallel.py b/demo/guide-python/sklearn_parallel.py new file mode 100644 index 000000000..803f3fac8 --- /dev/null +++ b/demo/guide-python/sklearn_parallel.py @@ -0,0 +1,35 @@ +import os + +if __name__ == "__main__": + # NOTE: on posix systems, this *has* to be here and in the + # `__name__ == "__main__"` clause to run XGBoost in parallel processes + # using fork, if XGBoost was built with OpenMP support. Otherwise, if you + # build XGBoost without OpenMP support, you can use fork, which is the + # default backend for joblib, and omit this. + try: + from multiprocessing import set_start_method + except ImportError: + raise ImportError("Unable to import multiprocessing.set_start_method." + " This example only runs on Python 3.4") + set_start_method("forkserver") + + import numpy as np + from sklearn.grid_search import GridSearchCV + from sklearn.datasets import load_boston + import xgboost as xgb + + rng = np.random.RandomState(31337) + + print("Parallel Parameter optimization") + boston = load_boston() + + os.environ["OMP_NUM_THREADS"] = "2" # or to whatever you want + y = boston['target'] + X = boston['data'] + xgb_model = xgb.XGBRegressor() + clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6], + 'n_estimators': [50, 100, 200]}, verbose=1, + n_jobs=2) + clf.fit(X, y) + print(clf.best_score_) + print(clf.best_params_) diff --git a/doc/parameter.md b/doc/parameter.md index 6b47b4bf9..13eefa0fe 100644 --- a/doc/parameter.md +++ b/doc/parameter.md @@ -26,19 +26,26 @@ From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parame #### Parameter for Tree Booster * eta [default=0.3] - - step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinkage the feature weights to make the boosting process more conservative. + - step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinks the feature weights to make the boosting process more conservative. + - range: [0,1] * gamma [default=0] - minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. + - range: [0,∞] * max_depth [default=6] - maximum depth of a tree + - range: [1,∞] * min_child_weight [default=1] - minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. 
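For reference, an illustrative R parameter list that stays inside the tree-booster ranges documented in this section (the values shown are simply the listed defaults, not recommendations):

```r
param <- list(
  eta              = 0.3,  # range [0,1]
  gamma            = 0,    # range [0,Inf)
  max_depth        = 6,    # range [1,Inf)
  min_child_weight = 1,    # range [0,Inf)
  max_delta_step   = 0,    # range [0,Inf); 0 means no constraint
  subsample        = 1,    # range (0,1]
  colsample_bytree = 1     # range (0,1]
)
```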
The larger, the more conservative the algorithm will be. + - range: [0,∞] * max_delta_step [default=0] - Maximum delta step we allow each tree's weight estimation to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help making the update step more conservative. Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced. Set it to value of 1-10 might help control the update + - range: [0,∞] * subsample [default=1] - subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting. + - range: (0,1] * colsample_bytree [default=1] - subsample ratio of columns when constructing each tree. + - range: (0,1] #### Parameter for Linear Booster * lambda [default=0] diff --git a/src/data.h b/src/data.h index d1f5eb427..63dd2d78f 100644 --- a/src/data.h +++ b/src/data.h @@ -140,8 +140,12 @@ class IFMatrix { * \brief check if column access is supported, if not, initialize column access * \param enabled whether certain feature should be included in column access * \param subsample subsample ratio when generating column access + * \param max_row_perbatch auxilary information, maximum row used in each column batch + * this is a hint information that can be ignored by the implementation */ - virtual void InitColAccess(const std::vector &enabled, float subsample) = 0; + virtual void InitColAccess(const std::vector &enabled, + float subsample, + size_t max_row_perbatch) = 0; // the following are column meta data, should be able to answer them fast /*! \return whether column access is enabled */ virtual bool HaveColAccess(void) const = 0; diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp index 0a1ee4f98..c868c302a 100644 --- a/src/gbm/gbtree-inl.hpp +++ b/src/gbm/gbtree-inl.hpp @@ -64,7 +64,13 @@ class GBTree : public IGradBooster { } virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const { utils::Assert(mparam.num_trees == static_cast(trees.size()), "GBTree"); - fo.Write(&mparam, sizeof(ModelParam)); + if (with_pbuffer) { + fo.Write(&mparam, sizeof(ModelParam)); + } else { + ModelParam p = mparam; + p.num_pbuffer = 0; + fo.Write(&p, sizeof(ModelParam)); + } for (size_t i = 0; i < trees.size(); ++i) { trees[i]->SaveModel(fo); } diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp index 8fb02e18e..79455d130 100644 --- a/src/io/page_dmatrix-inl.hpp +++ b/src/io/page_dmatrix-inl.hpp @@ -33,10 +33,7 @@ class ThreadRowPageIterator: public utils::IIterator { } virtual bool Next(void) { if (!itr.Next(page_)) return false; - out_.base_rowid = base_rowid_; - out_.ind_ptr = BeginPtr(page_->offset); - out_.data_ptr = BeginPtr(page_->data); - out_.size = page_->offset.size() - 1; + out_ = page_->GetRowBatch(base_rowid_); base_rowid_ += out_.size; return true; } @@ -198,8 +195,8 @@ class DMatrixPageBase : public DataMatrix { } /*! \brief magic number used to identify DMatrix */ static const int kMagic = TKMagic; - /*! \brief page size 64 MB */ - static const size_t kPageSize = 64UL << 20UL; + /*! 
\brief page size 32 MB */ + static const size_t kPageSize = 32UL << 20UL; protected: virtual void set_cache_file(const std::string &cache_file) = 0; @@ -236,7 +233,7 @@ class DMatrixPage : public DMatrixPageBase<0xffffab02> { class DMatrixHalfRAM : public DMatrixPageBase<0xffffab03> { public: DMatrixHalfRAM(void) { - fmat_ = new FMatrixS(iter_); + fmat_ = new FMatrixS(iter_, this->info); } virtual ~DMatrixHalfRAM(void) { delete fmat_; diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp index 7d4cdb9cf..18f4c6dee 100644 --- a/src/io/page_fmatrix-inl.hpp +++ b/src/io/page_fmatrix-inl.hpp @@ -58,11 +58,13 @@ struct ColConvertFactory { return true; } inline void Setup(float pkeep, + size_t max_row_perbatch, size_t num_col, utils::IIterator *iter, std::vector *buffered_rowset, const std::vector *enabled) { pkeep_ = pkeep; + max_row_perbatch_ = max_row_perbatch; num_col_ = num_col; iter_ = iter; buffered_rowset_ = buffered_rowset; @@ -87,7 +89,8 @@ struct ColConvertFactory { tmp_.Push(batch[i]); } } - if (tmp_.MemCostBytes() >= kPageSize) { + if (tmp_.MemCostBytes() >= kPageSize || + tmp_.Size() >= max_row_perbatch_) { this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop, *enabled_, val); return true; @@ -157,6 +160,8 @@ struct ColConvertFactory { } // probability of keep float pkeep_; + // maximum number of rows per batch + size_t max_row_perbatch_; // number of columns size_t num_col_; // row batch iterator @@ -208,10 +213,10 @@ class FMatrixPage : public IFMatrix { return 1.0f - (static_cast(nmiss)) / num_buffered_row_; } virtual void InitColAccess(const std::vector &enabled, - float pkeep = 1.0f) { + float pkeep, size_t max_row_perbatch) { if (this->HaveColAccess()) return; if (TryLoadColData()) return; - this->InitColData(enabled, pkeep); + this->InitColData(enabled, pkeep, max_row_perbatch); utils::Check(TryLoadColData(), "failed on creating col.blob"); } /*! @@ -282,7 +287,8 @@ class FMatrixPage : public IFMatrix { * \brief intialize column data * \param pkeep probability to keep a row */ - inline void InitColData(const std::vector &enabled, float pkeep) { + inline void InitColData(const std::vector &enabled, + float pkeep, size_t max_row_perbatch) { // clear rowset buffered_rowset_.clear(); col_size_.resize(info.num_col()); @@ -294,7 +300,7 @@ class FMatrixPage : public IFMatrix { size_t bytes_write = 0; utils::ThreadBuffer citer; citer.SetParam("buffer_size", "2"); - citer.get_factory().Setup(pkeep, info.num_col(), + citer.get_factory().Setup(pkeep, max_row_perbatch, info.num_col(), iter_, &buffered_rowset_, &enabled); citer.Init(); SparsePage *pcol; diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp index 9b0addc1c..3876c21ad 100644 --- a/src/io/simple_dmatrix-inl.hpp +++ b/src/io/simple_dmatrix-inl.hpp @@ -28,7 +28,7 @@ class DMatrixSimple : public DataMatrix { public: // constructor DMatrixSimple(void) : DataMatrix(kMagic) { - fmat_ = new FMatrixS(new OneBatchIter(this)); + fmat_ = new FMatrixS(new OneBatchIter(this), this->info); this->Clear(); } // virtual destructor @@ -171,7 +171,7 @@ class DMatrixSimple : public DataMatrix { utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", fname == NULL ? 
"" : fname); info.LoadBinary(fs); - FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_); + LoadBinary(fs, &row_ptr_, &row_data_); fmat_->LoadColAccess(fs); if (!silent) { @@ -198,9 +198,8 @@ class DMatrixSimple : public DataMatrix { utils::FileStream fs(utils::FopenCheck(fname, "wb")); int tmagic = kMagic; fs.Write(&tmagic, sizeof(tmagic)); - info.SaveBinary(fs); - FMatrixS::SaveBinary(fs, row_ptr_, row_data_); + SaveBinary(fs, row_ptr_, row_data_); fmat_->SaveColAccess(fs); fs.Close(); @@ -251,6 +250,42 @@ class DMatrixSimple : public DataMatrix { static const int kMagic = 0xffffab01; protected: + /*! + * \brief save data to binary stream + * \param fo output stream + * \param ptr pointer data + * \param data data content + */ + inline static void SaveBinary(utils::IStream &fo, + const std::vector &ptr, + const std::vector &data) { + size_t nrow = ptr.size() - 1; + fo.Write(&nrow, sizeof(size_t)); + fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t)); + if (data.size() != 0) { + fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry)); + } + } + /*! + * \brief load data from binary stream + * \param fi input stream + * \param out_ptr pointer data + * \param out_data data content + */ + inline static void LoadBinary(utils::IStream &fi, + std::vector *out_ptr, + std::vector *out_data) { + size_t nrow; + utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format"); + out_ptr->resize(nrow + 1); + utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0, + "invalid input file format"); + out_data->resize(out_ptr->back()); + if (out_data->size() != 0) { + utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0, + "invalid input file format"); + } + } // one batch iterator that return content in the matrix struct OneBatchIter: utils::IIterator { explicit OneBatchIter(DMatrixSimple *parent) diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp index acf85297f..fc6aab8f9 100644 --- a/src/io/simple_fmatrix-inl.hpp +++ b/src/io/simple_fmatrix-inl.hpp @@ -1,15 +1,18 @@ -#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP -#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP +#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_ +#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_ /*! * \file simple_fmatrix-inl.hpp * \brief the input data structure for gradient boosting * \author Tianqi Chen */ +#include #include "../data.h" #include "../utils/utils.h" #include "../utils/random.h" #include "../utils/omp.h" +#include "../learner/dmatrix.h" #include "../utils/group_data.h" +#include "./sparse_batch_page.h" namespace xgboost { namespace io { @@ -20,21 +23,23 @@ class FMatrixS : public IFMatrix { public: typedef SparseBatch::Entry Entry; /*! \brief constructor */ - FMatrixS(utils::IIterator *iter) { + FMatrixS(utils::IIterator *iter, + const learner::MetaInfo &info) + : info_(info) { this->iter_ = iter; } // destructor virtual ~FMatrixS(void) { - if (iter_ != NULL) delete iter_; + if (iter_ != NULL) delete iter_; } /*! \return whether column access is enabled */ virtual bool HaveColAccess(void) const { - return col_ptr_.size() != 0; + return col_size_.size() != 0; } /*! \brief get number of colmuns */ virtual size_t NumCol(void) const { utils::Check(this->HaveColAccess(), "NumCol:need column access"); - return col_ptr_.size() - 1; + return col_size_.size() - 1; } /*! \brief get number of buffered rows */ virtual const std::vector &buffered_rowset(void) const { @@ -42,17 +47,17 @@ class FMatrixS : public IFMatrix { } /*! 
\brief get column size */ virtual size_t GetColSize(size_t cidx) const { - return col_ptr_[cidx+1] - col_ptr_[cidx]; + return col_size_[cidx]; } /*! \brief get column density */ virtual float GetColDensity(size_t cidx) const { - size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]); + size_t nmiss = buffered_rowset_.size() - col_size_[cidx]; return 1.0f - (static_cast(nmiss)) / buffered_rowset_.size(); } virtual void InitColAccess(const std::vector &enabled, - float pkeep = 1.0f) { + float pkeep, size_t max_row_perbatch) { if (this->HaveColAccess()) return; - this->InitColData(pkeep, enabled); + this->InitColData(enabled, pkeep, max_row_perbatch); } /*! * \brief get the row iterator associated with FMatrix @@ -70,7 +75,7 @@ class FMatrixS : public IFMatrix { for (size_t i = 0; i < ncol; ++i) { col_iter_.col_index_[i] = static_cast(i); } - col_iter_.SetBatch(col_ptr_, col_data_); + col_iter_.BeforeFirst(); return &col_iter_; } /*! @@ -82,7 +87,7 @@ class FMatrixS : public IFMatrix { for (size_t i = 0; i < fset.size(); ++i) { if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]); } - col_iter_.SetBatch(col_ptr_, col_data_); + col_iter_.BeforeFirst(); return &col_iter_; } /*! @@ -90,64 +95,52 @@ class FMatrixS : public IFMatrix { * \param fo output stream to save to */ inline void SaveColAccess(utils::IStream &fo) const { - fo.Write(buffered_rowset_); - if (buffered_rowset_.size() != 0) { - SaveBinary(fo, col_ptr_, col_data_); - } + size_t n = 0; + fo.Write(&n, sizeof(n)); } /*! * \brief load column access data from stream * \param fo output stream to load from */ inline void LoadColAccess(utils::IStream &fi) { - utils::Check(fi.Read(&buffered_rowset_), "invalid input file format"); - if (buffered_rowset_.size() != 0) { - LoadBinary(fi, &col_ptr_, &col_data_); - } + // do nothing in load col access } - /*! - * \brief save data to binary stream - * \param fo output stream - * \param ptr pointer data - * \param data data content - */ - inline static void SaveBinary(utils::IStream &fo, - const std::vector &ptr, - const std::vector &data) { - size_t nrow = ptr.size() - 1; - fo.Write(&nrow, sizeof(size_t)); - fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t)); - if (data.size() != 0) { - fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry)); - } - } - /*! - * \brief load data from binary stream - * \param fi input stream - * \param out_ptr pointer data - * \param out_data data content - */ - inline static void LoadBinary(utils::IStream &fi, - std::vector *out_ptr, - std::vector *out_data) { - size_t nrow; - utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format"); - out_ptr->resize(nrow + 1); - utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0, - "invalid input file format"); - out_data->resize(out_ptr->back()); - if (out_data->size() != 0) { - utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0, - "invalid input file format"); - } - } - + protected: /*! 
* \brief intialize column data + * \param enabled the list of enabled columns * \param pkeep probability to keep a row + * \param max_row_perbatch maximum row per batch */ - inline void InitColData(float pkeep, const std::vector &enabled) { + inline void InitColData(const std::vector &enabled, + float pkeep, size_t max_row_perbatch) { + col_iter_.Clear(); + if (info_.num_row() < max_row_perbatch) { + SparsePage *page = new SparsePage(); + this->MakeOneBatch(enabled, pkeep, page); + col_iter_.cpages_.push_back(page); + } else { + this->MakeManyBatch(enabled, pkeep, max_row_perbatch); + } + // setup col-size + col_size_.resize(info_.num_col()); + std::fill(col_size_.begin(), col_size_.end(), 0); + for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) { + SparsePage *pcol = col_iter_.cpages_[i]; + for (size_t j = 0; j < pcol->Size(); ++j) { + col_size_[j] += pcol->offset[j + 1] - pcol->offset[j]; + } + } + } + /*! + * \brief make column page from iterator + * \param pkeep probability to keep a row + * \param pcol the target column + */ + inline void MakeOneBatch(const std::vector &enabled, + float pkeep, + SparsePage *pcol) { // clear rowset buffered_rowset_.clear(); // bit map @@ -157,8 +150,9 @@ class FMatrixS : public IFMatrix { { nthread = omp_get_num_threads(); } - // build the column matrix in parallel - utils::ParallelGroupBuilder builder(&col_ptr_, &col_data_); + pcol->Clear(); + utils::ParallelGroupBuilder + builder(&pcol->offset, &pcol->data); builder.InitBudget(0, nthread); // start working iter_->BeforeFirst(); @@ -189,7 +183,7 @@ class FMatrixS : public IFMatrix { } } builder.InitStorage(); - + iter_->BeforeFirst(); while (iter_->Next()) { const RowBatch &batch = iter_->Value(); @@ -209,66 +203,167 @@ class FMatrixS : public IFMatrix { } } } + + utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data"); // sort columns - bst_omp_uint ncol = static_cast(this->NumCol()); - #pragma omp parallel for schedule(static) + bst_omp_uint ncol = static_cast(pcol->Size()); + #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread) for (bst_omp_uint i = 0; i < ncol; ++i) { - if (col_ptr_[i] < col_ptr_[i + 1]) { - std::sort(BeginPtr(col_data_) + col_ptr_[i], - BeginPtr(col_data_) + col_ptr_[i + 1], Entry::CmpValue); + if (pcol->offset[i] < pcol->offset[i + 1]) { + std::sort(BeginPtr(pcol->data) + pcol->offset[i], + BeginPtr(pcol->data) + pcol->offset[i + 1], + SparseBatch::Entry::CmpValue); + } + } + } + + inline void MakeManyBatch(const std::vector &enabled, + float pkeep, size_t max_row_perbatch) { + size_t btop = 0; + buffered_rowset_.clear(); + // internal temp cache + SparsePage tmp; tmp.Clear(); + iter_->BeforeFirst(); + while (iter_->Next()) { + const RowBatch &batch = iter_->Value(); + for (size_t i = 0; i < batch.size; ++i) { + bst_uint ridx = static_cast(batch.base_rowid + i); + if (pkeep == 1.0f || random::SampleBinary(pkeep)) { + buffered_rowset_.push_back(ridx); + tmp.Push(batch[i]); + } + if (tmp.Size() >= max_row_perbatch) { + SparsePage *page = new SparsePage(); + this->MakeColPage(tmp.GetRowBatch(0), + BeginPtr(buffered_rowset_) + btop, + enabled, page); + col_iter_.cpages_.push_back(page); + btop = buffered_rowset_.size(); + tmp.Clear(); + } + } + } + if (tmp.Size() != 0) { + SparsePage *page = new SparsePage(); + this->MakeColPage(tmp.GetRowBatch(0), + BeginPtr(buffered_rowset_) + btop, + enabled, page); + col_iter_.cpages_.push_back(page); + } + } + // make column page from subset of rowbatchs + inline void MakeColPage(const RowBatch &batch, + const 
bst_uint *ridx, + const std::vector &enabled, + SparsePage *pcol) { + int nthread; + #pragma omp parallel + { + nthread = omp_get_num_threads(); + int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1); + if (nthread > max_nthread) { + nthread = max_nthread; + } + } + pcol->Clear(); + utils::ParallelGroupBuilder + builder(&pcol->offset, &pcol->data); + builder.InitBudget(info_.num_col(), nthread); + bst_omp_uint ndata = static_cast(batch.size); + #pragma omp parallel for schedule(static) num_threads(nthread) + for (bst_omp_uint i = 0; i < ndata; ++i) { + int tid = omp_get_thread_num(); + RowBatch::Inst inst = batch[i]; + for (bst_uint j = 0; j < inst.length; ++j) { + const SparseBatch::Entry &e = inst[j]; + if (enabled[e.index]) { + builder.AddBudget(e.index, tid); + } + } + } + builder.InitStorage(); + #pragma omp parallel for schedule(static) num_threads(nthread) + for (bst_omp_uint i = 0; i < ndata; ++i) { + int tid = omp_get_thread_num(); + RowBatch::Inst inst = batch[i]; + for (bst_uint j = 0; j < inst.length; ++j) { + const SparseBatch::Entry &e = inst[j]; + builder.Push(e.index, + SparseBatch::Entry(ridx[i], e.fvalue), + tid); + } + } + utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data"); + // sort columns + bst_omp_uint ncol = static_cast(pcol->Size()); + #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread) + for (bst_omp_uint i = 0; i < ncol; ++i) { + if (pcol->offset[i] < pcol->offset[i + 1]) { + std::sort(BeginPtr(pcol->data) + pcol->offset[i], + BeginPtr(pcol->data) + pcol->offset[i + 1], + SparseBatch::Entry::CmpValue); } } } private: // one batch iterator that return content in the matrix - struct OneBatchIter: utils::IIterator { - OneBatchIter(void) : at_first_(true){} - virtual ~OneBatchIter(void) {} + struct ColBatchIter: utils::IIterator { + ColBatchIter(void) : data_ptr_(0) {} + virtual ~ColBatchIter(void) { + this->Clear(); + } virtual void BeforeFirst(void) { - at_first_ = true; + data_ptr_ = 0; } virtual bool Next(void) { - if (!at_first_) return false; - at_first_ = false; - return true; - } - virtual const ColBatch &Value(void) const { - return batch_; - } - inline void SetBatch(const std::vector &ptr, - const std::vector &data) { + if (data_ptr_ >= cpages_.size()) return false; + data_ptr_ += 1; + SparsePage *pcol = cpages_[data_ptr_ - 1]; batch_.size = col_index_.size(); col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0)); for (size_t i = 0; i < col_data_.size(); ++i) { const bst_uint ridx = col_index_[i]; - col_data_[i] = SparseBatch::Inst(&data[0] + ptr[ridx], - static_cast(ptr[ridx+1] - ptr[ridx])); + col_data_[i] = SparseBatch::Inst + (BeginPtr(pcol->data) + pcol->offset[ridx], + static_cast(pcol->offset[ridx + 1] - pcol->offset[ridx])); } batch_.col_index = BeginPtr(col_index_); - batch_.col_data = BeginPtr(col_data_); - this->BeforeFirst(); + batch_.col_data = BeginPtr(col_data_); + return true; + } + virtual const ColBatch &Value(void) const { + return batch_; + } + inline void Clear(void) { + for (size_t i = 0; i < cpages_.size(); ++i) { + delete cpages_[i]; + } + cpages_.clear(); } // data content std::vector col_index_; + // column content std::vector col_data_; - // whether is at first - bool at_first_; + // column sparse pages + std::vector cpages_; + // data pointer + size_t data_ptr_; // temporal space for batch ColBatch batch_; - }; + }; // --- data structure used to support InitColAccess -- // column iterator - OneBatchIter col_iter_; + ColBatchIter col_iter_; + // shared meta info with DMatrix + 
const learner::MetaInfo &info_; // row iterator utils::IIterator *iter_; /*! \brief list of row index that are buffered */ std::vector buffered_rowset_; - /*! \brief column pointer of CSC format */ - std::vector col_ptr_; - /*! \brief column datas in CSC format */ - std::vector col_data_; + // count for column data + std::vector col_size_; }; } // namespace io } // namespace xgboost -#endif // XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP +#endif // XGBOOST_IO_SLICE_FMATRIX_INL_HPP diff --git a/src/io/sparse_batch_page.h b/src/io/sparse_batch_page.h index 319f9da5c..d94141a6e 100644 --- a/src/io/sparse_batch_page.h +++ b/src/io/sparse_batch_page.h @@ -178,8 +178,22 @@ class SparsePage { offset.push_back(offset.back() + inst.length); size_t begin = data.size(); data.resize(begin + inst.length); - std::memcpy(BeginPtr(data) + begin, inst.data, - sizeof(SparseBatch::Entry) * inst.length); + if (inst.length != 0) { + std::memcpy(BeginPtr(data) + begin, inst.data, + sizeof(SparseBatch::Entry) * inst.length); + } + } + /*! + * \param base_rowid base_rowid of the data + * \return row batch representation of the page + */ + inline RowBatch GetRowBatch(size_t base_rowid) const { + RowBatch out; + out.base_rowid = base_rowid; + out.ind_ptr = BeginPtr(offset); + out.data_ptr = BeginPtr(data); + out.size = offset.size() - 1; + return out; } private: diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp index 9ceec969e..45e312aa7 100644 --- a/src/learner/learner-inl.hpp +++ b/src/learner/learner-inl.hpp @@ -33,6 +33,7 @@ class BoostLearner : public rabit::Serializable { silent= 0; prob_buffer_row = 1.0f; distributed_mode = 0; + updater_mode = 0; pred_buffer_size = 0; seed_per_iteration = 0; seed = 0; @@ -95,6 +96,7 @@ class BoostLearner : public rabit::Serializable { utils::Error("%s is invalid value for dsplit, should be row or col", val); } } + if (!strcmp(name, "updater_mode")) updater_mode = atoi(val); if (!strcmp(name, "prob_buffer_row")) { prob_buffer_row = static_cast(atof(val)); utils::Check(distributed_mode == 0, @@ -157,11 +159,9 @@ class BoostLearner : public rabit::Serializable { /*! 
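SparsePage::GetRowBatch hands out a non-owning view over the page's offset and data arrays, so MakeManyBatch can re-read a buffered chunk of rows without copying it. Below is a minimal sketch of that kind of view, assuming a simplified page layout (value-only CSR storage, invented field names rather than the real RowBatch members).

```cpp
// A non-owning "batch view" over CSR-style storage, in the spirit of
// SparsePage::GetRowBatch: no copy, just pointers into the page.
#include <cstddef>
#include <iostream>
#include <vector>

struct Page {                       // simplified stand-in for SparsePage
  std::vector<size_t> offset{0};    // row offsets (CSR)
  std::vector<float>  data;         // feature values only, for brevity

  struct RowView {
    size_t base_rowid;              // id of the first row in the view
    const size_t *ind_ptr;          // borrowed pointer into offset
    const float  *data_ptr;         // borrowed pointer into data
    size_t size;                    // number of rows in the view
  };

  void Push(const std::vector<float> &row) {
    offset.push_back(offset.back() + row.size());
    data.insert(data.end(), row.begin(), row.end());
  }
  RowView GetRowBatch(size_t base_rowid) const {
    return RowView{base_rowid, offset.data(), data.data(), offset.size() - 1};
  }
};

int main() {
  Page page;
  page.Push({1.0f, 2.0f});
  page.Push({3.0f});
  Page::RowView v = page.GetRowBatch(100);   // rows 100..101 of the full matrix
  for (size_t i = 0; i < v.size; ++i) {
    std::cout << "row " << v.base_rowid + i << ":";
    for (size_t j = v.ind_ptr[i]; j < v.ind_ptr[i + 1]; ++j)
      std::cout << " " << v.data_ptr[j];
    std::cout << "\n";
  }
  return 0;
}
```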
* \brief load model from stream * \param fi input stream - * \param with_pbuffer whether to load with predict buffer * \param calc_num_feature whether call InitTrainer with calc_num_feature */ inline void LoadModel(utils::IStream &fi, - bool with_pbuffer = true, bool calc_num_feature = true) { utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0, "BoostLearner: wrong model format"); @@ -189,15 +189,15 @@ class BoostLearner : public rabit::Serializable { char tmp[32]; utils::SPrintf(tmp, sizeof(tmp), "%u", mparam.num_class); obj_->SetParam("num_class", tmp); - gbm_->LoadModel(fi, with_pbuffer); - if (!with_pbuffer || distributed_mode == 2) { + gbm_->LoadModel(fi, mparam.saved_with_pbuffer != 0); + if (mparam.saved_with_pbuffer == 0) { gbm_->ResetPredBuffer(pred_buffer_size); } } // rabit load model from rabit checkpoint virtual void Load(rabit::Stream *fi) { // for row split, we should not keep pbuffer - this->LoadModel(*fi, distributed_mode != 2, false); + this->LoadModel(*fi, false); } // rabit save model to rabit checkpoint virtual void Save(rabit::Stream *fo) const { @@ -218,18 +218,20 @@ class BoostLearner : public rabit::Serializable { if (header == "bs64") { utils::Base64InStream bsin(fi); bsin.InitPosition(); - this->LoadModel(bsin); + this->LoadModel(bsin, true); } else if (header == "binf") { - this->LoadModel(*fi); + this->LoadModel(*fi, true); } else { delete fi; fi = utils::IStream::Create(fname, "r"); - this->LoadModel(*fi); + this->LoadModel(*fi, true); } delete fi; } - inline void SaveModel(utils::IStream &fo, bool with_pbuffer = true) const { - fo.Write(&mparam, sizeof(ModelParam)); + inline void SaveModel(utils::IStream &fo, bool with_pbuffer) const { + ModelParam p = mparam; + p.saved_with_pbuffer = static_cast(with_pbuffer); + fo.Write(&p, sizeof(ModelParam)); fo.Write(name_obj_); fo.Write(name_gbm_); gbm_->SaveModel(fo, with_pbuffer); @@ -237,17 +239,18 @@ class BoostLearner : public rabit::Serializable { /*! * \brief save model into file * \param fname file name + * \param with_pbuffer whether save pbuffer together */ - inline void SaveModel(const char *fname) const { + inline void SaveModel(const char *fname, bool with_pbuffer) const { utils::IStream *fo = utils::IStream::Create(fname, "w"); if (save_base64 != 0 || !strcmp(fname, "stdout")) { fo->Write("bs64\t", 5); utils::Base64OutStream bout(fo); - this->SaveModel(bout); + this->SaveModel(bout, with_pbuffer); bout.Finish('\n'); } else { fo->Write("binf", 4); - this->SaveModel(*fo); + this->SaveModel(*fo, with_pbuffer); } delete fo; } @@ -258,9 +261,17 @@ class BoostLearner : public rabit::Serializable { */ inline void CheckInit(DMatrix *p_train) { int ncol = static_cast(p_train->info.info.num_col); - std::vector enabled(ncol, true); + std::vector enabled(ncol, true); + // set max row per batch to limited value + // in distributed mode, use safe choice otherwise + size_t max_row_perbatch = std::numeric_limits::max(); + if (updater_mode != 0 || distributed_mode == 2) { + max_row_perbatch = 32UL << 10UL; + } // initialize column access - p_train->fmat()->InitColAccess(enabled, prob_buffer_row); + p_train->fmat()->InitColAccess(enabled, + prob_buffer_row, + max_row_perbatch); const int kMagicPage = 0xffffab02; // check, if it is DMatrixPage, then use hist maker if (p_train->magic == kMagicPage) { @@ -442,14 +453,17 @@ class BoostLearner : public rabit::Serializable { unsigned num_feature; /* \brief number of class, if it is multi-class classification */ int num_class; + /*! 
\brief whether the model itself is saved with pbuffer */ + int saved_with_pbuffer; /*! \brief reserved field */ - int reserved[31]; + int reserved[30]; /*! \brief constructor */ ModelParam(void) { + std::memset(this, 0, sizeof(ModelParam)); base_score = 0.5f; num_feature = 0; num_class = 0; - std::memset(reserved, 0, sizeof(reserved)); + saved_with_pbuffer = 0; } /*! * \brief set parameters from outside @@ -476,6 +490,8 @@ class BoostLearner : public rabit::Serializable { int silent; // distributed learning mode, if any, 0:none, 1:col, 2:row int distributed_mode; + // updater mode, 0:normal, reserved for internal test + int updater_mode; // cached size of predict buffer size_t pred_buffer_size; // maximum buffred row value diff --git a/src/tree/param.h b/src/tree/param.h index 3458a93a4..20ba1e6c0 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -48,6 +48,8 @@ struct TrainParam{ int size_leaf_vector; // option for parallelization int parallel_option; + // option to open cacheline optimizaton + int cache_opt; // number of threads to be used for tree construction, // if OpenMP is enabled, if equals 0, use system default int nthread; @@ -70,6 +72,7 @@ struct TrainParam{ parallel_option = 2; sketch_eps = 0.1f; sketch_ratio = 2.0f; + cache_opt = 1; } /*! * \brief set parameters from outside @@ -96,6 +99,7 @@ struct TrainParam{ if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast(atof(val)); if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast(atof(val)); if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val); + if (!strcmp(name, "cache_opt")) cache_opt = atoi(val); if (!strcmp(name, "max_depth")) max_depth = atoi(val); if (!strcmp(name, "nthread")) nthread = atoi(val); if (!strcmp(name, "parallel_option")) parallel_option = atoi(val); @@ -151,12 +155,12 @@ struct TrainParam{ return dw; } /*! \brief whether need forward small to big search: default right */ - inline bool need_forward_search(float col_density = 0.0f) const { + inline bool need_forward_search(float col_density, bool indicator) const { return this->default_direction == 2 || - (default_direction == 0 && (col_density < opt_dense_col)); + (default_direction == 0 && (col_density < opt_dense_col) && !indicator); } /*! \brief whether need backward big to small search: default left */ - inline bool need_backward_search(float col_density = 0.0f) const { + inline bool need_backward_search(float col_density, bool indicator) const { return this->default_direction != 2; } /*! \brief given the loss change, whether we need to invode prunning */ @@ -192,6 +196,11 @@ struct GradStats { double sum_grad; /*! \brief sum hessian statistics */ double sum_hess; + /*! + * \brief whether this is simply statistics and we only need to call + * Add(gpair), instead of Add(gpair, info, ridx) + */ + static const int kSimpleStats = 1; /*! \brief constructor, the object must be cleared during construction */ explicit GradStats(const TrainParam ¶m) { this->Clear(); @@ -204,7 +213,14 @@ struct GradStats { inline static void CheckInfo(const BoosterInfo &info) { } /*! - * \brief accumulate statistics, + * \brief accumulate statistics + * \param p the gradient pair + */ + inline void Add(bst_gpair p) { + this->Add(p.grad, p.hess); + } + /*! 
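Recording saved_with_pbuffer inside ModelParam, while shrinking reserved[31] to reserved[30] and zero-initialising the whole struct, keeps the binary header the same size: models written by older code read back with the flag as 0, and LoadModel can branch on the flag instead of being told by the caller. The following is a minimal sketch of that fixed-size-header pattern with invented field values, not the actual learner serialisation.

```cpp
// Fixed-size, zero-initialised binary header with reserved padding:
// new flags are carved out of the reserved area without changing the
// struct size, so files written by older code remain readable.
#include <cstring>
#include <fstream>
#include <iostream>

struct Header {
  float base_score;
  unsigned num_feature;
  int num_class;
  int saved_with_pbuffer;   // new flag, taken from the reserved area
  int reserved[30];         // was reserved[31] before the flag existed
  Header() {
    std::memset(this, 0, sizeof(Header));   // unknown/old fields read as 0
    base_score = 0.5f;
  }
};

int main() {
  Header out;
  out.num_feature = 42;
  out.saved_with_pbuffer = 1;
  {
    std::ofstream fo("model.bin", std::ios::binary);
    fo.write(reinterpret_cast<const char*>(&out), sizeof(out));
  }
  Header in;
  std::ifstream fi("model.bin", std::ios::binary);
  fi.read(reinterpret_cast<char*>(&in), sizeof(in));
  // a loader can now branch on the flag, as BoostLearner::LoadModel does
  std::cout << "pbuffer saved: " << (in.saved_with_pbuffer != 0) << "\n";
  return 0;
}
```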
+ * \brief accumulate statistics, more complicated version * \param gpair the vector storing the gradient statistics * \param info the additional information * \param ridx instance index of this instance diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp index f657c0335..db3581aac 100644 --- a/src/tree/updater_colmaker-inl.hpp +++ b/src/tree/updater_colmaker-inl.hpp @@ -234,8 +234,9 @@ class ColMaker: public IUpdater { const IFMatrix &fmat, const std::vector &gpair, const BoosterInfo &info) { - bool need_forward = param.need_forward_search(fmat.GetColDensity(fid)); - bool need_backward = param.need_backward_search(fmat.GetColDensity(fid)); + const bool ind = col.length != 0 && col.data[0].fvalue == col.data[col.length - 1].fvalue; + bool need_forward = param.need_forward_search(fmat.GetColDensity(fid), ind); + bool need_backward = param.need_backward_search(fmat.GetColDensity(fid), ind); const std::vector &qexpand = qexpand_; #pragma omp parallel { @@ -356,7 +357,100 @@ class ColMaker: public IUpdater { } } } - } + } + // update enumeration solution + inline void UpdateEnumeration(int nid, bst_gpair gstats, + float fvalue, int d_step, bst_uint fid, + TStats &c, std::vector &temp) { + // get the statistics of nid + ThreadEntry &e = temp[nid]; + // test if first hit, this is fine, because we set 0 during init + if (e.stats.Empty()) { + e.stats.Add(gstats); + e.last_fvalue = fvalue; + } else { + // try to find a split + if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) { + c.SetSubstract(snode[nid].stats, e.stats); + if (c.sum_hess >= param.min_child_weight) { + bst_float loss_chg = static_cast(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1); + } + } + // update the statistics + e.stats.Add(gstats); + e.last_fvalue = fvalue; + } + } + // same as EnumerateSplit, with cacheline prefetch optimization + inline void EnumerateSplitCacheOpt(const ColBatch::Entry *begin, + const ColBatch::Entry *end, + int d_step, + bst_uint fid, + const std::vector &gpair, + std::vector &temp) { + const std::vector &qexpand = qexpand_; + // clear all the temp statistics + for (size_t j = 0; j < qexpand.size(); ++j) { + temp[qexpand[j]].stats.Clear(); + } + // left statistics + TStats c(param); + // local cache buffer for position and gradient pair + const int kBuffer = 32; + int buf_position[kBuffer]; + bst_gpair buf_gpair[kBuffer]; + // aligned ending position + const ColBatch::Entry *align_end; + if (d_step > 0) { + align_end = begin + (end - begin) / kBuffer * kBuffer; + } else { + align_end = begin - (begin - end) / kBuffer * kBuffer; + } + int i; + const ColBatch::Entry *it; + const int align_step = d_step * kBuffer; + // internal cached loop + for (it = begin; it != align_end; it += align_step) { + const ColBatch::Entry *p; + for (i = 0, p = it; i < kBuffer; ++i, p += d_step) { + buf_position[i] = position[p->index]; + buf_gpair[i] = gpair[p->index]; + } + for (i = 0, p = it; i < kBuffer; ++i, p += d_step) { + const int nid = buf_position[i]; + if (nid < 0) continue; + this->UpdateEnumeration(nid, buf_gpair[i], + p->fvalue, d_step, + fid, c, temp); + } + } + // finish up the ending piece + for (it = align_end, i = 0; it != end; ++i, it += d_step) { + buf_position[i] = position[it->index]; + buf_gpair[i] = gpair[it->index]; + } + for (it = align_end, i = 0; it != end; ++i, it += d_step) { + const int nid = buf_position[i]; + if (nid < 0) 
continue; + this->UpdateEnumeration(nid, buf_gpair[i], + it->fvalue, d_step, + fid, c, temp); + } + // finish updating all statistics, check if it is possible to include all sum statistics + for (size_t i = 0; i < qexpand.size(); ++i) { + const int nid = qexpand[i]; + ThreadEntry &e = temp[nid]; + c.SetSubstract(snode[nid].stats, e.stats); + if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) { + bst_float loss_chg = static_cast(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + const float gap = std::abs(e.last_fvalue) + rt_eps; + const float delta = d_step == +1 ? gap: -gap; + e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1); + } + } + } + // enumerate the split values of specific feature inline void EnumerateSplit(const ColBatch::Entry *begin, const ColBatch::Entry *end, @@ -365,6 +459,11 @@ class ColMaker: public IUpdater { const std::vector &gpair, const BoosterInfo &info, std::vector &temp) { + // use cacheline aware optimization + if (TStats::kSimpleStats != 0 && param.cache_opt != 0) { + EnumerateSplitCacheOpt(begin, end, d_step, fid, gpair, temp); + return; + } const std::vector &qexpand = qexpand_; // clear all the temp statistics for (size_t j = 0; j < qexpand.size(); ++j) { @@ -411,6 +510,7 @@ class ColMaker: public IUpdater { } } } + // update the solution candidate virtual void UpdateSolution(const ColBatch &batch, const std::vector &gpair, @@ -431,11 +531,12 @@ class ColMaker: public IUpdater { const bst_uint fid = batch.col_index[i]; const int tid = omp_get_thread_num(); const ColBatch::Inst c = batch[i]; - if (param.need_forward_search(fmat.GetColDensity(fid))) { + const bool ind = c.length != 0 && c.data[0].fvalue == c.data[c.length - 1].fvalue; + if (param.need_forward_search(fmat.GetColDensity(fid), ind)) { this->EnumerateSplit(c.data, c.data + c.length, +1, fid, gpair, info, stemp[tid]); } - if (param.need_backward_search(fmat.GetColDensity(fid))) { + if (param.need_backward_search(fmat.GetColDensity(fid), ind)) { this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1, fid, gpair, info, stemp[tid]); } @@ -550,8 +651,8 @@ class ColMaker: public IUpdater { #pragma omp parallel for schedule(static) for (bst_omp_uint j = 0; j < ndata; ++j) { const bst_uint ridx = col[j].index; - const float fvalue = col[j].fvalue; const int nid = this->DecodePosition(ridx); + const float fvalue = col[j].fvalue; // go back to parent, correct those who are not default if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) { if(fvalue < tree[nid].split_cond()) { diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp index d6279592f..f739f23f3 100644 --- a/src/tree/updater_histmaker-inl.hpp +++ b/src/tree/updater_histmaker-inl.hpp @@ -282,6 +282,16 @@ class CQHistMaker: public HistMaker { utils::Assert(istart != hist.size, "the bound variable must be max"); hist.data[istart].Add(gpair, info, ridx); } + /*! 
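EnumerateSplitCacheOpt and the cache_opt path in the histogram maker share one trick: rather than doing a dependent random access into position[] and gpair[] for every column entry, they walk the column in blocks of 32, gather those indirect loads into small local buffers first, then run the per-node update from the buffers, finishing with a plain scalar loop for the unaligned tail. The sketch below shows only the shape of that block-gather loop on toy data with a placeholder per-node accumulation; it is not the real split search.

```cpp
// Block-gather processing: batch the indirect loads through small local
// buffers (kBuffer entries at a time), then consume them, then handle
// the unaligned tail -- the shape of EnumerateSplitCacheOpt's main loop.
#include <cstdio>
#include <vector>

struct GradPair { float grad, hess; };

int main() {
  const int kBuffer = 32;
  // toy data: a "column" of row indices, plus per-row position and gradients
  std::vector<unsigned> col_rows;
  std::vector<int> position;
  std::vector<GradPair> gpair;
  for (unsigned r = 0; r < 1000; ++r) {
    col_rows.push_back((r * 7919u) % 1000u);          // scrambled row order
    position.push_back(r % 3 == 0 ? -1 : int(r % 4)); // -1 = discarded row
    gpair.push_back({0.1f * r, 1.0f});
  }

  double sum_grad[4] = {0, 0, 0, 0};                  // per-node accumulator
  int buf_position[kBuffer];
  GradPair buf_gpair[kBuffer];

  const size_t n = col_rows.size();
  const size_t aligned = n / kBuffer * kBuffer;
  for (size_t base = 0; base < aligned; base += kBuffer) {
    for (int i = 0; i < kBuffer; ++i) {               // gather phase
      unsigned ridx = col_rows[base + i];
      buf_position[i] = position[ridx];
      buf_gpair[i] = gpair[ridx];
    }
    for (int i = 0; i < kBuffer; ++i) {               // consume phase
      if (buf_position[i] < 0) continue;
      sum_grad[buf_position[i]] += buf_gpair[i].grad;
    }
  }
  for (size_t j = aligned; j < n; ++j) {              // scalar tail
    unsigned ridx = col_rows[j];
    if (position[ridx] < 0) continue;
    sum_grad[position[ridx]] += gpair[ridx].grad;
  }
  for (int nid = 0; nid < 4; ++nid)
    std::printf("node %d: sum_grad=%.1f\n", nid, sum_grad[nid]);
  return 0;
}
```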
+ * \brief add a histogram to data, + * do linear scan, start from istart + */ + inline void Add(bst_float fv, + bst_gpair gstats) { + while (istart < hist.size && !(fv < hist.cut[istart])) ++istart; + utils::Assert(istart != hist.size, "the bound variable must be max"); + hist.data[istart].Add(gstats); + } }; // sketch type used for this typedef utils::WXQuantileSketch WXQSketch; @@ -479,11 +489,38 @@ class CQHistMaker: public HistMaker { hbuilder[nid].istart = 0; hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)]; } - for (bst_uint j = 0; j < c.length; ++j) { - const bst_uint ridx = c[j].index; - const int nid = this->position[ridx]; - if (nid >= 0) { - hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx); + if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) { + const bst_uint kBuffer = 32; + bst_uint align_length = c.length / kBuffer * kBuffer; + int buf_position[kBuffer]; + bst_gpair buf_gpair[kBuffer]; + for (bst_uint j = 0; j < align_length; j += kBuffer) { + for (bst_uint i = 0; i < kBuffer; ++i) { + bst_uint ridx = c[j + i].index; + buf_position[i] = this->position[ridx]; + buf_gpair[i] = gpair[ridx]; + } + for (bst_uint i = 0; i < kBuffer; ++i) { + const int nid = buf_position[i]; + if (nid >= 0) { + hbuilder[nid].Add(c[j + i].fvalue, buf_gpair[i]); + } + } + } + for (bst_uint j = align_length; j < c.length; ++j) { + const bst_uint ridx = c[j].index; + const int nid = this->position[ridx]; + if (nid >= 0) { + hbuilder[nid].Add(c[j].fvalue, gpair[ridx]); + } + } + } else { + for (bst_uint j = 0; j < c.length; ++j) { + const bst_uint ridx = c[j].index; + const int nid = this->position[ridx]; + if (nid >= 0) { + hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx); + } } } } @@ -536,11 +573,38 @@ class CQHistMaker: public HistMaker { sbuilder[nid].Init(max_size); } // second pass, build the sketch - for (bst_uint j = 0; j < c.length; ++j) { - const bst_uint ridx = c[j].index; - const int nid = this->position[ridx]; - if (nid >= 0) { - sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size); + if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) { + const bst_uint kBuffer = 32; + bst_uint align_length = c.length / kBuffer * kBuffer; + int buf_position[kBuffer]; + bst_float buf_hess[kBuffer]; + for (bst_uint j = 0; j < align_length; j += kBuffer) { + for (bst_uint i = 0; i < kBuffer; ++i) { + bst_uint ridx = c[j + i].index; + buf_position[i] = this->position[ridx]; + buf_hess[i] = gpair[ridx].hess; + } + for (bst_uint i = 0; i < kBuffer; ++i) { + const int nid = buf_position[i]; + if (nid >= 0) { + sbuilder[nid].Push(c[j + i].fvalue, buf_hess[i], max_size); + } + } + } + for (bst_uint j = align_length; j < c.length; ++j) { + const bst_uint ridx = c[j].index; + const int nid = this->position[ridx]; + if (nid >= 0) { + sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size); + } + } + } else { + for (bst_uint j = 0; j < c.length; ++j) { + const bst_uint ridx = c[j].index; + const int nid = this->position[ridx]; + if (nid >= 0) { + sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size); + } } } for (size_t i = 0; i < this->qexpand.size(); ++i) { diff --git a/src/utils/quantile.h b/src/utils/quantile.h index 2c0f7f000..4e885e254 100644 --- a/src/utils/quantile.h +++ b/src/utils/quantile.h @@ -328,14 +328,14 @@ struct WXQSummary : public WQSummary { } if (nbig >= n - 1) { // see what was the case - fprintf(stderr, "LOG: check quantile stats, nbig=%lu, n=%lu\n", nbig, n); - fprintf(stderr, "LOG: srcsize=%lu, maxsize=%lu, range=%g, 
chunk=%g\n", - src.size, maxsize, static_cast(range), - static_cast(chunk)); + utils::Printf("LOG: check quantile stats, nbig=%lu, n=%lu\n", nbig, n); + utils::Printf("LOG: srcsize=%lu, maxsize=%lu, range=%g, chunk=%g\n", + src.size, maxsize, static_cast(range), + static_cast(chunk)); for (size_t i = 0; i < src.size; ++i) { - printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g, isbig=%d\n", i, - src.data[i].rmin, src.data[i].rmax, src.data[i].wmin, - src.data[i].value, CheckLarge(src.data[i], chunk)); + utils::Printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g, isbig=%d\n", i, + src.data[i].rmin, src.data[i].rmax, src.data[i].wmin, + src.data[i].value, CheckLarge(src.data[i], chunk)); } utils::Assert(nbig < n - 1, "quantile: too many large chunk"); } diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp index ad87f8879..769e3be3b 100644 --- a/src/xgboost_main.cpp +++ b/src/xgboost_main.cpp @@ -87,6 +87,7 @@ class BoostLearnTask { if (!strcmp("name_pred", name)) name_pred = val; if (!strcmp("dsplit", name)) data_split = val; if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val); + if (!strcmp("save_pbuffer", name)) save_with_pbuffer = atoi(val); if (!strncmp("eval[", name, 5)) { char evname[256]; utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display"); @@ -115,6 +116,7 @@ class BoostLearnTask { model_dir_path = "./"; data_split = "NONE"; load_part = 0; + save_with_pbuffer = 0; data = NULL; } ~BoostLearnTask(void){ @@ -241,7 +243,7 @@ class BoostLearnTask { } inline void SaveModel(const char *fname) const { if (rabit::GetRank() != 0) return; - learner.SaveModel(fname); + learner.SaveModel(fname, save_with_pbuffer != 0); } inline void SaveModel(int i) const { char fname[256]; @@ -297,6 +299,8 @@ class BoostLearnTask { int pred_margin; /*! \brief whether dump statistics along with model */ int dump_model_stats; + /*! \brief whether save prediction buffer */ + int save_with_pbuffer; /*! \brief name of feature map */ std::string name_fmap; /*! \brief name of dump file */ diff --git a/wrapper/README.md b/wrapper/README.md index 9c0399693..ab013faf6 100644 --- a/wrapper/README.md +++ b/wrapper/README.md @@ -7,6 +7,8 @@ Python * To make the python module, type ```./build.sh``` in the root directory of project * Install with `python setup.py install` from this directory. * Refer also to the walk through example in [demo folder](../demo/guide-python) +* **NOTE**: if you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the sklearn_parallel.py demo. + R ===== diff --git a/wrapper/setup.py b/wrapper/setup.py index 14f9e9b55..52bf1cf82 100644 --- a/wrapper/setup.py +++ b/wrapper/setup.py @@ -28,7 +28,7 @@ if len(lib_path) == 0: raise XGBoostLibraryNotFound("XGBoost library not found. 
Did you run " "../make?") setup(name="xgboost", - version="0.32", + version="0.40", description="Python wrappers for XGBoost: eXtreme Gradient Boosting", zip_safe=False, py_modules=['xgboost'], diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 8ef82b2c7..0280d87b3 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -1,8 +1,8 @@ # coding: utf-8 - """ xgboost: eXtreme Gradient Boosting library +Version: 0.40 Authors: Tianqi Chen, Bing Xu Early stopping by Zygmunt Zając """ @@ -30,6 +30,9 @@ except ImportError: class XGBoostLibraryNotFound(Exception): pass +class XGBoostError(Exception): + pass + __all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train'] if sys.version_info[0] == 3: @@ -70,6 +73,8 @@ def load_xglib(): lib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float) lib.XGBoosterEvalOneIter.restype = ctypes.c_char_p lib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p) + lib.XGBoosterGetModelRaw.restype = ctypes.POINTER(ctypes.c_char) + lib.XGBoosterLoadModelFromBuffer.restype = ctypes.c_void_p return lib @@ -89,6 +94,16 @@ def ctypes2numpy(cptr, length, dtype): return res +def ctypes2buffer(cptr, length): + if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)): + raise RuntimeError('expected char pointer') + res = bytearray(length) + rptr = (ctypes.c_char * length).from_buffer(res) + if not ctypes.memmove(rptr, cptr, length): + raise RuntimeError('memmove failed') + return res + + def c_str(string): return ctypes.c_char_p(string.encode('utf-8')) @@ -98,7 +113,7 @@ def c_array(ctype, values): class DMatrix(object): - def __init__(self, data, label=None, missing=0.0, weight=None): + def __init__(self, data, label=None, missing=0.0, weight=None, silent=False): """ Data matrix used in XGBoost. @@ -113,14 +128,15 @@ class DMatrix(object): Value in the data which needs to be present as a missing value. weight : list or numpy 1-D array (optional) Weight for each instance. 
+ silent: boolean + Whether print messages during construction """ - # force into void_p, mac need to pass things in as void_p if data is None: self.handle = None return if isinstance(data, string_types): - self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), 0)) + self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), int(silent))) elif isinstance(data, scipy.sparse.csr_matrix): self._init_from_csr(data) elif isinstance(data, scipy.sparse.csc_matrix): @@ -335,6 +351,46 @@ class Booster(object): def __del__(self): xglib.XGBoosterFree(self.handle) + def __getstate__(self): + # can't pickle ctypes pointers + # put model content in bytearray + this = self.__dict__.copy() + handle = this['handle'] + if handle is not None: + raw = self.save_raw() + this["handle"] = raw + return this + + def __setstate__(self, state): + # reconstruct handle from raw data + handle = state['handle'] + if handle is not None: + buf = handle + dmats = c_array(ctypes.c_void_p, []) + handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, 0)) + length = ctypes.c_ulong(len(buf)) + ptr = (ctypes.c_char * len(buf)).from_buffer(buf) + xglib.XGBoosterLoadModelFromBuffer(handle, ptr, length) + state['handle'] = handle + self.__dict__.update(state) + self.set_param({'seed': 0}) + + def __copy__(self): + return self.__deepcopy__() + + def __deepcopy__(self): + return Booster(model_file = self.save_raw()) + + def copy(self): + """ + Copy the booster object + + Returns + -------- + a copied booster model + """ + return self.__copy__() + def set_param(self, params, pv=None): if isinstance(params, collections.Mapping): params = params.items() @@ -427,6 +483,11 @@ class Booster(object): """ Predict with data. + NOTE: This function is not thread safe. + For each booster object, predict can only be called from one thread. + If you want to run prediction using multiple thread, call bst.copy() to make copies + of model object and then call predict + Parameters ---------- data : DMatrix @@ -468,9 +529,25 @@ class Booster(object): Parameters ---------- fname : string - Output file name. + Output file name """ - xglib.XGBoosterSaveModel(self.handle, c_str(fname)) + if isinstance(fname, string_types): # assume file name + xglib.XGBoosterSaveModel(self.handle, c_str(fname)) + else: + raise TypeError("fname must be a string") + + def save_raw(self): + """ + Save the model to a in memory buffer represetation + + Returns + ------- + a in memory buffer represetation of the model + """ + length = ctypes.c_ulong() + cptr = xglib.XGBoosterGetModelRaw(self.handle, + ctypes.byref(length)) + return ctypes2buffer(cptr, length.value) def load_model(self, fname): """ @@ -478,10 +555,16 @@ class Booster(object): Parameters ---------- - fname : string - Input file name. 
+ fname : string or a memory buffer + Input file name or memory buffer(see also save_raw) """ - xglib.XGBoosterLoadModel(self.handle, c_str(fname)) + if isinstance(fname, str): # assume file name + xglib.XGBoosterLoadModel(self.handle, c_str(fname)) + else: + buf = fname + length = ctypes.c_ulong(len(buf)) + ptr = (ctypes.c_char * len(buf)).from_buffer(buf) + xglib.XGBoosterLoadModelFromBuffer(self.handle, ptr, length) def dump_model(self, fo, fmap='', with_stats=False): """ @@ -622,7 +705,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, maximize_score = False if 'eval_metric' in params: maximize_metrics = ('auc', 'map', 'ndcg') - if filter(lambda x: params['eval_metric'].startswith(x), maximize_metrics): + if list(filter(lambda x: params['eval_metric'].startswith(x), maximize_metrics)): maximize_score = True if maximize_score: @@ -659,11 +742,11 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg)) bst.best_score = best_score bst.best_iteration = best_score_i - return bst - + break + bst.best_score = best_score + bst.best_iteration = best_score_i return bst - class CVPack(object): def __init__(self, dtrain, dtest, param): self.dtrain = dtrain @@ -815,12 +898,15 @@ class XGBModel(XGBModelBase): The initial prediction score of all instances, global bias. seed : int Random number seed. + missing : float, optional + Value in the data which needs to be present as a missing value. If + None, defaults to np.nan. """ def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="reg:linear", nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, - base_score=0.5, seed=0): + base_score=0.5, seed=0, missing=None): if not SKLEARN_INSTALLED: - raise Exception('sklearn needs to be installed in order to use this module') + raise XGBoostError('sklearn needs to be installed in order to use this module') self.max_depth = max_depth self.learning_rate = learning_rate self.n_estimators = n_estimators @@ -836,8 +922,37 @@ class XGBModel(XGBModelBase): self.base_score = base_score self.seed = seed + self.missing = missing if missing is not None else np.nan - self._Booster = Booster() + self._Booster = None + + def __setstate__(self, state): + # backward compatiblity code + # load booster from raw if it is raw + # the booster now support pickle + bst = state["_Booster"] + if bst is not None and not isinstance(bst, Booster): + state["_Booster"] = Booster(model_file=bst) + self.__dict__.update(state) + + def booster(self): + """ + get the underlying xgboost Booster of this model + will raise an exception when fit was not called + + Returns + ------- + booster : a xgboost booster of underlying model + """ + if self._Booster is None: + raise XGBoostError('need to call fit beforehand') + return self._Booster + + def get_params(self, deep=False): + params = super(XGBModel, self).get_params(deep=deep) + if params['missing'] is np.nan: + params['missing'] = None # sklearn doesn't handle nan. 
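The train() changes above make early stopping fall through to the common return path, so best_score and best_iteration are always attached to the returned booster. The bookkeeping behind the feature is just a best-score tracker plus a patience counter; here is a minimal standalone sketch of that logic (not the actual train() implementation), with a toy validation curve.

```cpp
// Generic early-stopping loop: keep the best validation score seen so far
// and stop after `patience` consecutive rounds without improvement --
// the logic behind the early_stop options in train()/xgb.cv.
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> val_score = {0.62, 0.58, 0.55, 0.54, 0.545, 0.546, 0.547};
  const int patience = 2;          // allowed non-improving rounds
  const bool maximize = false;     // false for rmse/logloss, true for auc/map
  double best = maximize ? -1e300 : 1e300;
  int best_iter = -1, since_best = 0;

  for (int i = 0; i < (int)val_score.size(); ++i) {
    double s = val_score[i];
    bool improved = maximize ? (s > best) : (s < best);
    if (improved) { best = s; best_iter = i; since_best = 0; }
    else if (++since_best >= patience) {
      std::printf("stopping at round %d\n", i);
      break;
    }
  }
  std::printf("best score %.3f at round %d\n", best, best_iter);
  return 0;
}
```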
see #4725 + return params def get_xgb_params(self): xgb_params = self.get_params() @@ -849,13 +964,13 @@ class XGBModel(XGBModelBase): return xgb_params def fit(self, X, y): - trainDmatrix = DMatrix(X, label=y) + trainDmatrix = DMatrix(X, label=y, missing=self.missing) self._Booster = train(self.get_xgb_params(), trainDmatrix, self.n_estimators) return self def predict(self, X): - testDmatrix = DMatrix(X) - return self._Booster.predict(testDmatrix) + testDmatrix = DMatrix(X, missing=self.missing) + return self.booster().predict(testDmatrix) class XGBClassifier(XGBModel, XGBClassifier): @@ -865,15 +980,15 @@ class XGBClassifier(XGBModel, XGBClassifier): def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="binary:logistic", nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, - base_score=0.5, seed=0): + base_score=0.5, seed=0, missing=None): super(XGBClassifier, self).__init__(max_depth, learning_rate, n_estimators, silent, objective, nthread, gamma, min_child_weight, max_delta_step, subsample, colsample_bytree, - base_score, seed) + base_score, seed, missing) def fit(self, X, y, sample_weight=None): - y_values = list(np.unique(y)) - self.n_classes_ = len(y_values) + self.classes_ = list(np.unique(y)) + self.n_classes_ = len(self.classes_) if self.n_classes_ > 2: # Switch to using a multiclass objective in the underlying XGB instance self.objective = "multi:softprob" @@ -886,17 +1001,19 @@ class XGBClassifier(XGBModel, XGBClassifier): training_labels = self._le.transform(y) if sample_weight is not None: - trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight) + trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight, + missing=self.missing) else: - trainDmatrix = DMatrix(X, label=training_labels) + trainDmatrix = DMatrix(X, label=training_labels, + missing=self.missing) self._Booster = train(xgb_options, trainDmatrix, self.n_estimators) return self def predict(self, X): - testDmatrix = DMatrix(X) - class_probs = self._Booster.predict(testDmatrix) + testDmatrix = DMatrix(X, missing=self.missing) + class_probs = self.booster().predict(testDmatrix) if len(class_probs.shape) > 1: column_indexes = np.argmax(class_probs, axis=1) else: @@ -905,8 +1022,8 @@ class XGBClassifier(XGBModel, XGBClassifier): return self._le.inverse_transform(column_indexes) def predict_proba(self, X): - testDmatrix = DMatrix(X) - class_probs = self._Booster.predict(testDmatrix) + testDmatrix = DMatrix(X, missing=self.missing) + class_probs = self.booster().predict(testDmatrix) if self.objective == "multi:softprob": return class_probs else: @@ -914,7 +1031,6 @@ class XGBClassifier(XGBModel, XGBClassifier): classzero_probs = 1.0 - classone_probs return np.vstack((classzero_probs, classone_probs)).transpose() - class XGBRegressor(XGBModel, XGBRegressor): __doc__ = """ Implementation of the scikit-learn API for XGBoost regression diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index 8ec3aa3f4..4d7828faf 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -58,13 +58,14 @@ class Booster: public learner::BoostLearner { } inline void LoadModelFromBuffer(const void *buf, size_t size) { utils::MemoryFixSizeBuffer fs((void*)buf, size); - learner::BoostLearner::LoadModel(fs); + learner::BoostLearner::LoadModel(fs, true); this->init_model = true; } inline const char *GetModelRaw(bst_ulong *out_len) { + this->CheckInitModel(); model_str.resize(0); utils::MemoryBufferStream 
      fs(&model_str);
-    learner::BoostLearner::SaveModel(fs);
+    learner::BoostLearner::SaveModel(fs, false);
     *out_len = static_cast<bst_ulong>(model_str.length());
     if (*out_len == 0) {
       return NULL;
@@ -322,8 +323,10 @@ extern "C"{
   void XGBoosterLoadModel(void *handle, const char *fname) {
     static_cast<Booster*>(handle)->LoadModel(fname);
   }
-  void XGBoosterSaveModel(const void *handle, const char *fname) {
-    static_cast<const Booster*>(handle)->SaveModel(fname);
+  void XGBoosterSaveModel(void *handle, const char *fname) {
+    Booster *bst = static_cast<Booster*>(handle);
+    bst->CheckInitModel();
+    bst->SaveModel(fname, false);
   }
   void XGBoosterLoadModelFromBuffer(void *handle, const void *buf, bst_ulong len) {
     static_cast<Booster*>(handle)->LoadModelFromBuffer(buf, len);
diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h
index f1d2cc92a..88a327d0d 100644
--- a/wrapper/xgboost_wrapper.h
+++ b/wrapper/xgboost_wrapper.h
@@ -203,7 +203,7 @@ extern "C" {
    * \param handle handle
    * \param fname file name
    */
-  XGB_DLL void XGBoosterSaveModel(const void *handle, const char *fname);
+  XGB_DLL void XGBoosterSaveModel(void *handle, const char *fname);
   /*!
    * \brief load model from in memory buffer
    * \param handle handle
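XGBoosterGetModelRaw and XGBoosterLoadModelFromBuffer serialise the learner into an in-memory buffer rather than a file, which is what save_raw() and the new pickling support build on; the pointer returned by GetModelRaw refers to a buffer owned by the Booster, so the caller copies it out (as ctypes2buffer does) before a later call reuses it. Below is a stripped-down sketch of the same save-to-buffer / load-from-buffer round trip, using a toy model in place of the real learner.

```cpp
// Save/load through an in-memory buffer instead of a file: the pattern
// behind XGBoosterGetModelRaw / XGBoosterLoadModelFromBuffer.
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

struct ToyModel {                       // placeholder, not the real learner
  std::vector<float> weights;

  std::string SaveRaw() const {         // serialise to a byte buffer
    size_t n = weights.size();
    std::string buf(sizeof(n) + n * sizeof(float), '\0');
    std::memcpy(&buf[0], &n, sizeof(n));
    if (n != 0)
      std::memcpy(&buf[sizeof(n)], weights.data(), n * sizeof(float));
    return buf;
  }
  void LoadFromBuffer(const void *data, size_t size) {
    size_t n = 0;
    std::memcpy(&n, data, sizeof(n));
    weights.resize(n);
    if (n != 0)
      std::memcpy(weights.data(),
                  static_cast<const char*>(data) + sizeof(n), n * sizeof(float));
    (void)size;                         // a real loader would validate size
  }
};

int main() {
  ToyModel a;
  a.weights = {0.5f, -1.25f, 3.0f};
  std::string raw = a.SaveRaw();        // what save_raw()/pickling would keep
  ToyModel b;
  b.LoadFromBuffer(raw.data(), raw.size());
  std::cout << "restored " << b.weights.size() << " weights, first = "
            << b.weights[0] << "\n";
  return 0;
}
```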