pommedeterresautee 2015-06-16 21:40:09 +02:00
commit 37714eb331
42 changed files with 1072 additions and 298 deletions

View File

@@ -21,8 +21,16 @@ xgboost-0.3
* Add [Code Guide](src/README.md) for customizing objective function and evaluation
* Add R module
-in progress version
+xgboost-0.4
=====
-* Distributed version
-* Feature importance visualization in R module, thanks to Michael Benesty
-* Predict leaf inde
+* Distributed version of xgboost that runs on YARN, scales to billions of examples
+* Direct save/load data and model from/to S3 and HDFS
+* Feature importance visualization in R module, by Michael Benesty
+* Predict leaf index
+* Poisson regression for counts data
+* Early stopping option in training
+* Native save load support in R and python
+  - xgboost models now can be saved using save/load in R
+  - xgboost python model is now pickable
+* sklearn wrapper is supported in python module
+* Experimental External memory version
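As a quick illustration of the new native save/load support listed above (a minimal sketch, not taken from this commit; it assumes the agaricus demo data that ships with the R package):

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nrounds = 2, objective = 'binary:logistic')
# the booster can now be stored and restored with base R save()/load()
save(bst, file = 'xgb.model.RData')
load('xgb.model.RData')
pred <- predict(bst, agaricus.train$data)
```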

View File

@@ -1,4 +1,4 @@
-Copyright (c) 2014 by Tianqi Chen and Contributors
+Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

View File

@@ -2,7 +2,7 @@ export CC = gcc
export CXX = g++
export MPICXX = mpicxx
export LDFLAGS= -pthread -lm
-export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC
+export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops
ifeq ($(OS), Windows_NT)
export CXX = g++ -m64
@@ -18,7 +18,6 @@ endif
# by default use c++11
ifeq ($(cxx11),1)
CFLAGS += -std=c++11
-else
endif
# handling dmlc
@@ -38,6 +37,14 @@ else
LIBDMLC=dmlc_simple.o
endif
+ifndef WITH_FPIC
+WITH_FPIC = 1
+endif
+ifeq ($(WITH_FPIC), 1)
+CFLAGS += -fPIC
+endif
ifeq ($(OS), Windows_NT)
LIBRABIT = subtree/rabit/lib/librabit_empty.a
SLIB = wrapper/xgboost_wrapper.dll
@@ -51,11 +58,15 @@ BIN = xgboost
MOCKBIN = xgboost.mock
OBJ = updater.o gbm.o io.o main.o dmlc_simple.o
MPIBIN =
+ifeq ($(WITH_FPIC), 1)
TARGET = $(BIN) $(OBJ) $(SLIB)
+else
+TARGET = $(BIN)
+endif
.PHONY: clean all mpi python Rpack
-all: $(BIN) $(OBJ) $(SLIB)
+all: $(TARGET)
mpi: $(MPIBIN)
python: wrapper/libxgboostwrapper.so
@@ -79,7 +90,7 @@ subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc
+ cd subtree/rabit;make lib/librabit_mpi.a; cd ../..
$(BIN) :
-$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
+$(CXX) $(CFLAGS) -fPIC -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
$(MOCKBIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)

View File

@@ -1,8 +1,8 @@
Package: xgboost
Type: Package
Title: eXtreme Gradient Boosting
-Version: 0.3-4
+Version: 0.4-0
-Date: 2014-12-28
+Date: 2015-05-11
Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>, Michael Benesty <michael@benesty.fr>
Maintainer: Tong He <hetong007@gmail.com>
Description: Xgboost is short for eXtreme Gradient Boosting, which is an

View File

@@ -54,6 +54,13 @@
#' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
#' If folds are supplied, the nfold and stratified parameters would be ignored.
#' @param verbose \code{boolean}, print the statistics during the process
#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
#' If set to an integer \code{k}, training with a validation set will stop if the performance
#' keeps getting worse consecutively for \code{k} rounds.
#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
#' \code{maximize=TRUE} means the larger the evaluation score the better.
#'
#' @param ... other parameters to pass to \code{params}.
#'
#' @return
@@ -86,7 +93,8 @@
#'
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
prediction = FALSE, showsd = TRUE, metrics=list(),
-obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,...) {
+obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, print.every.n=1L,
+early.stop.round = NULL, maximize = NULL, ...) {
if (typeof(params) != "list") {
stop("xgb.cv: first argument params must be list")
}
@@ -110,6 +118,49 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
params <- append(params, list("eval_metric"=mc))
}
# customized objective and evaluation metric interface
if (!is.null(params$objective) && !is.null(obj))
stop("xgb.cv: cannot assign two different objectives")
if (!is.null(params$objective))
if (class(params$objective)=='function') {
obj = params$objective
params$objective = NULL
}
if (!is.null(params$eval_metric) && !is.null(feval))
stop("xgb.cv: cannot assign two different evaluation metrics")
if (!is.null(params$eval_metric))
if (class(params$eval_metric)=='function') {
feval = params$eval_metric
params$eval_metric = NULL
}
# Early Stopping
if (!is.null(early.stop.round)){
if (!is.null(feval) && is.null(maximize))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (is.null(maximize) && is.null(params$eval_metric))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (is.null(maximize))
{
if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
maximize = FALSE
} else {
maximize = TRUE
}
}
if (maximize) {
bestScore = 0
} else {
bestScore = Inf
}
bestInd = 0
earlyStopflag = FALSE
if (length(metrics)>1)
warning('Only the first metric is used for early stopping process.')
}
xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
obj_type = params[['objective']]
mat_pred = FALSE
@@ -124,6 +175,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
else
predictValues <- rep(0,xgb.numrow(dtrain))
history <- c()
+print.every.n = max(as.integer(print.every.n), 1L)
for (i in 1:nrounds) {
msg <- list()
for (k in 1:nfold) {
@@ -148,7 +200,27 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
}
ret <- xgb.cv.aggcv(msg, showsd)
history <- c(history, ret)
-if(verbose) paste(ret, "\n", sep="") %>% cat
+if(verbose)
+if (0==(i-1L)%%print.every.n)
+cat(ret, "\n", sep="")
# early_Stopping
if (!is.null(early.stop.round)){
score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+2]
score = strsplit(score,'\\+|:')[[1]][[2]]
score = as.numeric(score)
if ((maximize && score>bestScore) || (!maximize && score<bestScore)) {
bestScore = score
bestInd = i
} else {
if (i-bestInd>=early.stop.round) {
earlyStopflag = TRUE
cat('Stopping. Best iteration:',bestInd)
break
}
}
}
}
colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".") colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".")
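For reference, a minimal sketch of how the new xgb.cv arguments fit together (illustrative values only, assuming the agaricus demo data that ships with the R package):

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
param <- list(max.depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
# print every 5th message and stop once the error has not improved for 3 rounds
res <- xgb.cv(param, dtrain, nrounds = 50, nfold = 5, metrics = 'error',
              print.every.n = 5L, early.stop.round = 3, maximize = FALSE)
```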

View File

@@ -36,7 +36,7 @@
#' 3. Task Parameters
#'
#' \itemize{
-#' \item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below:
+#' \item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below:
#' \itemize{
#' \item \code{reg:linear} linear regression (Default).
#' \item \code{reg:logistic} logistic regression.
@@ -48,7 +48,7 @@
#' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
#' }
#' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
-#' \item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
+#' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
#' }
#'
#' @param data takes an \code{xgb.DMatrix} as the input.
@@ -66,7 +66,12 @@
#' prediction and dtrain,
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
-#'
+#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
+#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
+#' If set to an integer \code{k}, training with a validation set will stop if the performance
+#' keeps getting worse consecutively for \code{k} rounds.
+#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
+#' \code{maximize=TRUE} means the larger the evaluation score the better.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
@@ -98,7 +103,6 @@
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
#' dtest <- dtrain
#' watchlist <- list(eval = dtest, train = dtrain)
-#' param <- list(max.depth = 2, eta = 1, silent = 1)
#' logregobj <- function(preds, dtrain) {
#' labels <- getinfo(dtrain, "label")
#' preds <- 1/(1 + exp(-preds))
@@ -111,11 +115,13 @@
#' err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
#' return(list(metric = "error", value = err))
#' }
-#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror)
+#' param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror)
+#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
#' @export
#'
xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
-obj = NULL, feval = NULL, verbose = 1, ...) {
+obj = NULL, feval = NULL, verbose = 1, print.every.n=1L,
+early.stop.round = NULL, maximize = NULL, ...) {
dtrain <- data
if (typeof(params) != "list") {
stop("xgb.train: first argument params must be list")
@@ -130,19 +136,85 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
}
if (length(watchlist) != 0 && verbose == 0) {
warning('watchlist is provided but verbose=0, no evaluation information will be printed')
+watchlist <- list()
}
params = append(params, list(...))
# customized objective and evaluation metric interface
if (!is.null(params$objective) && !is.null(obj))
stop("xgb.train: cannot assign two different objectives")
if (!is.null(params$objective))
if (class(params$objective)=='function') {
obj = params$objective
params$objective = NULL
}
if (!is.null(params$eval_metric) && !is.null(feval))
stop("xgb.train: cannot assign two different evaluation metrics")
if (!is.null(params$eval_metric))
if (class(params$eval_metric)=='function') {
feval = params$eval_metric
params$eval_metric = NULL
}
# Early stopping
if (!is.null(early.stop.round)){
if (!is.null(feval) && is.null(maximize))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (length(watchlist) == 0)
stop('For early stopping you need at least one set in watchlist.')
if (is.null(maximize) && is.null(params$eval_metric))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (is.null(maximize))
{
if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
maximize = FALSE
} else {
maximize = TRUE
}
}
if (maximize) {
bestScore = 0
} else {
bestScore = Inf
}
bestInd = 0
earlyStopflag = FALSE
if (length(watchlist)>1)
warning('Only the first data set in watchlist is used for early stopping process.')
}
handle <- xgb.Booster(params, append(watchlist, dtrain))
bst <- xgb.handleToBooster(handle)
+print.every.n=max( as.integer(print.every.n), 1L)
for (i in 1:nrounds) {
succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj)
if (length(watchlist) != 0) {
msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval)
+if (0== ( (i-1) %% print.every.n))
cat(paste(msg, "\n", sep=""))
if (!is.null(early.stop.round))
{
score = strsplit(msg,':|\\s+')[[1]][3]
score = as.numeric(score)
if ((maximize && score>bestScore) || (!maximize && score<bestScore)) {
bestScore = score
bestInd = i
} else {
if (i-bestInd>=early.stop.round) {
earlyStopflag = TRUE
cat('Stopping. Best iteration:',bestInd)
break
}
}
}
}
}
bst <- xgb.Booster.check(bst)
if (!is.null(early.stop.round)) {
bst$bestScore = bestScore
bst$bestInd = bestInd
}
return(bst)
}

View File

@@ -28,8 +28,14 @@
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
#' performance and construction progress information
+#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values.
+#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
+#' If set to an integer \code{k}, training with a validation set will stop if the performance
+#' keeps getting worse consecutively for \code{k} rounds.
+#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
+#' \code{maximize=TRUE} means the larger the evaluation score the better.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
@@ -51,7 +57,8 @@
#' @export
#'
xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds,
-verbose = 1, ...) {
+verbose = 1, print.every.n = 1L, early.stop.round = NULL,
+maximize = NULL, ...) {
if (is.null(missing)) {
dtrain <- xgb.get.DMatrix(data, label)
} else {
@@ -66,7 +73,8 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(),
watchlist <- list()
}
-bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose=verbose)
+bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n,
+early.stop.round = early.stop.round)
return(bst)
}

View File

@@ -8,11 +8,6 @@ For up-to-date version (which is recommended), please install from github. Windo
devtools::install_github('dmlc/xgboost',subdir='R-package')
```
-For stable version on CRAN, please run
-```r
-install.packages('xgboost')
-```
## Examples

View File

@@ -6,3 +6,5 @@ generalized_linear_model Generalized Linear Model
cross_validation Cross validation
create_sparse_matrix Create Sparse Matrix
predict_leaf_indices Predicting the corresponding leaves
+early_stopping Early Stop in training
+poisson_regression Poisson Regression on count data

View File

@@ -40,10 +40,10 @@ evalerror <- function(preds, dtrain) {
return(list(metric = "error", value = err))
}
-param <- list(max.depth=2,eta=1,silent=1)
+param <- list(max.depth=2,eta=1,silent=1,
+objective = logregobj, eval_metric = evalerror)
# train with customized objective
-xgb.cv(param, dtrain, nround, nfold = 5,
-obj = logregobj, feval=evalerror)
+xgb.cv(param, dtrain, nround, nfold = 5)
# do cross validation with prediction values for each fold
res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE)

View File

@@ -8,7 +8,6 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
-param <- list(max.depth=2,eta=1,nthread = 2, silent=1)
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
@@ -33,10 +32,13 @@ evalerror <- function(preds, dtrain) {
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
+param <- list(max.depth=2,eta=1,nthread = 2, silent=1,
+objective=logregobj, eval_metric=evalerror)
print ('start training with user customized objective')
# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
-bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
+bst <- xgb.train(param, dtrain, num_round, watchlist)
#
# there can be cases where you want additional information
@@ -59,4 +61,5 @@ logregobjattr <- function(preds, dtrain) {
print ('start training with user customized objective, with additional attributes in DMatrix')
# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
-bst <- xgb.train(param, dtrain, num_round, watchlist, logregobjattr, evalerror)
+bst <- xgb.train(param, dtrain, num_round, watchlist,
+objective=logregobj, eval_metric=evalerror)

View File

@@ -0,0 +1,40 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param <- list(max.depth=2,eta=1,nthread = 2, silent=1)
watchlist <- list(eval = dtest)
num_round <- 20
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
# user defined evaluation function, return a pair metric_name, result
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make buildin evalution metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
print ('start training with early Stopping setting')
bst <- xgb.train(param, dtrain, num_round, watchlist,
objective = logregobj, eval_metric = evalerror, maximize = FALSE,
early.stop.round = 3)
bst <- xgb.cv(param, dtrain, num_round, nfold = 5,
objective = logregobj, eval_metric = evalerror,
maximize = FALSE, early.stop.round = 3)

View File

@@ -0,0 +1,7 @@
data(mtcars)
head(mtcars)
bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11],
objective='count:poisson',nrounds=5)
pred = predict(bst,as.matrix(mtcars[,-11]))
sqrt(mean((pred-mtcars[,11])^2))

View File

@@ -7,3 +7,5 @@ demo(generalized_linear_model)
demo(cross_validation)
demo(create_sparse_matrix)
demo(predict_leaf_indices)
+demo(early_stopping)
+demo(poisson_regression)

View File

@@ -7,7 +7,8 @@
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
obj = NULL, feval = NULL, stratified = TRUE, folds = NULL,
-verbose = T, ...)
+verbose = T, print.every.n = 1L, early.stop.round = NULL,
+maximize = NULL, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:
@@ -65,6 +66,15 @@ If folds are supplied, the nfold and stratified parameters would be ignored.}
\item{verbose}{\code{boolean}, print the statistics during the process}
+\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
+\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered.
+If set to an integer \code{k}, training with a validation set will stop if the performance
+keeps getting worse consecutively for \code{k} rounds.}
+\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
+\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{...}{other parameters to pass to \code{params}.}
}
\value{

View File

@@ -5,7 +5,8 @@
\title{eXtreme Gradient Boosting Training}
\usage{
xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
-feval = NULL, verbose = 1, ...)
+feval = NULL, verbose = 1, print.every.n = 1L,
+early.stop.round = NULL, maximize = NULL, ...)
}
\arguments{
\item{params}{the list of parameters.
@@ -42,7 +43,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
3. Task Parameters
\itemize{
-\item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below:
+\item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below:
\itemize{
\item \code{reg:linear} linear regression (Default).
\item \code{reg:logistic} logistic regression.
@@ -54,7 +55,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
\item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
}
\item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
-\item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
+\item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
}}
\item{data}{takes an \code{xgb.DMatrix} as the input.}
@@ -77,6 +78,15 @@ prediction and dtrain,}
\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
information of performance. If 2, xgboost will print information of both}
+\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
+\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered.
+If set to an integer \code{k}, training with a validation set will stop if the performance
+keeps getting worse consecutively for \code{k} rounds.}
+\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
+\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{
@@ -110,7 +120,6 @@ data(agaricus.train, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- dtrain
watchlist <- list(eval = dtest, train = dtrain)
-param <- list(max.depth = 2, eta = 1, silent = 1)
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
@@ -123,6 +132,7 @@ evalerror <- function(preds, dtrain) {
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
-bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror)
+param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror)
+bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
}

View File

@@ -5,7 +5,8 @@
\title{eXtreme Gradient Boosting (Tree) library}
\usage{
xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
-nrounds, verbose = 1, ...)
+nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL,
+maximize = NULL, ...)
}
\arguments{
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
@@ -41,6 +42,15 @@ Commonly used ones are:
information of performance. If 2, xgboost will print information of both
performance and construction progress information}
+\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
+\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered.
+If set to an integer \code{k}, training with a validation set will stop if the performance
+keeps getting worse consecutively for \code{k} rounds.}
+\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
+\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{

View File

@@ -70,10 +70,10 @@ extern "C" {
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
_WrapperBegin();
void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent));
-_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
@@ -91,10 +91,10 @@ extern "C" {
}
}
void *handle = XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing));
-_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
@@ -120,10 +120,10 @@ extern "C" {
}
void *handle = XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
BeginPtr(data_), nindptr, ndata);
-_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
@@ -134,10 +134,10 @@ extern "C" {
idxvec[i] = INTEGER(idxset)[i] - 1;
}
void *res = XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len);
-_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
@@ -157,10 +157,7 @@ extern "C" {
vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
}
XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len);
-_WrapperEnd();
-return;
-}
-{
+} else {
std::vector<float> vec(len);
#pragma omp parallel for schedule(static)
for (int i = 0; i < len; ++i) {
@@ -177,12 +174,12 @@ extern "C" {
bst_ulong olen;
const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)), &olen);
-_WrapperEnd();
SEXP ret = PROTECT(allocVector(REALSXP, olen));
for (size_t i = 0; i < olen; ++i) {
REAL(ret)[i] = res[i];
}
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
SEXP XGDMatrixNumRow_R(SEXP handle) {
@@ -203,10 +200,10 @@ extern "C" {
dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
}
void *handle = XGBoosterCreate(BeginPtr(dvec), dvec.size());
-_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
@@ -252,10 +249,12 @@ extern "C" {
for (int i = 0; i < len; ++i) {
vec_sptr.push_back(vec_names[i].c_str());
}
-_WrapperEnd();
-return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
+const char *ret =
+XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
asInteger(iter),
-BeginPtr(vec_dmats), BeginPtr(vec_sptr), len));
+BeginPtr(vec_dmats), BeginPtr(vec_sptr), len);
+_WrapperEnd();
+return mkString(ret);
}
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) {
_WrapperBegin();
@@ -265,12 +264,12 @@ extern "C" {
asInteger(option_mask),
asInteger(ntree_limit),
&olen);
-_WrapperEnd();
SEXP ret = PROTECT(allocVector(REALSXP, olen));
for (size_t i = 0; i < olen; ++i) {
REAL(ret)[i] = res[i];
}
UNPROTECT(1);
+_WrapperEnd();
return ret;
}
void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
@@ -305,17 +304,18 @@ extern "C" {
SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
_WrapperBegin();
bst_ulong olen;
-const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
+const char **res =
+XGBoosterDumpModel(R_ExternalPtrAddr(handle),
CHAR(asChar(fmap)),
asInteger(with_stats),
&olen);
-_WrapperEnd();
SEXP out = PROTECT(allocVector(STRSXP, olen));
for (size_t i = 0; i < olen; ++i) {
stringstream stream;
stream << "booster["<<i<<"]\n" << res[i];
SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
}
+_WrapperEnd();
UNPROTECT(1);
return out;
}

View File

@@ -57,11 +57,9 @@ devtools::install_github('dmlc/xgboost', subdir='R-package')
Cran version
------------
-For stable version on *CRAN*, run:
-```{r installCran, eval=FALSE}
-install.packages('xgboost')
-```
+As of 2015-03-13, xgboost was removed from the CRAN repository.
+Formerly available versions can be obtained from the CRAN [archive](http://cran.r-project.org/src/contrib/Archive/xgboost)
Learning
========

View File

@@ -2,7 +2,7 @@ XGBoost: eXtreme Gradient Boosting
==================================
An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
-It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also also distributed and scale to Terascale data
+It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also be distributed and scale to Terascale data
Contributors: https://github.com/dmlc/xgboost/graphs/contributors
@@ -20,33 +20,26 @@ Distributed Version: [Distributed XGBoost](multi-node)
Highlights of Usecases: [Highlight Links](doc/README.md#highlight-links)
+XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) projects
What's New
==========
+* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
* XGBoost wins [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
- Checkout the winning solution at [Highlight links](doc/README.md#highlight-links)
* [External Memory Version](doc/external_memory.md)
-* XGBoost now support HDFS and S3
-* [Distributed XGBoost now runs on YARN](https://github.com/dmlc/wormhole/tree/master/learn/xgboost)
-* [xgboost user group](https://groups.google.com/forum/#!forum/xgboost-user/) for tracking changes, sharing your experience on xgboost
-* New features in the lastest changes :)
-- Distributed version that scale xgboost to even larger problems with cluster
-- Feature importance visualization in R module, thanks to Michael Benesty
-- Predict leaf index, see [demo/guide-python/predict_leaf_indices.py](demo/guide-python/predict_leaf_indices.py)
-* XGBoost wins [Tradeshift Text Classification](https://kaggle2.blob.core.windows.net/forum-message-attachments/60041/1813/TradeshiftTextClassification.pdf?sv=2012-02-12&se=2015-01-02T13%3A55%3A16Z&sr=b&sp=r&sig=5MHvyjCLESLexYcvbSRFumGQXCS7MVmfdBIY3y01tMk%3D)
-* XGBoost wins [HEP meets ML Award in Higgs Boson Challenge](http://atlas.ch/news/2014/machine-learning-wins-the-higgs-challenge.html)
Features
========
-* Sparse feature format:
-- Sparse feature format allows easy handling of missing values, and improve computation efficiency.
-* Push the limit on single machine:
-- Efficient implementation that optimizes memory and computation.
-* Speed: XGBoost is very fast
-- IN [demo/higgs/speedtest.py](demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier
-* Layout of gradient boosting algorithm to support user defined objective
-* Distributed and portable
-- The distributed version of xgboost is highly portable and can be used in different platforms
-- It inheritates all the optimizations made in single machine mode, maximumly utilize the resources using both multi-threading and distributed computing.
+* Easily accessible in python, R, Julia, CLI
+* Fast speed and memory efficient
+- Can be more than 10 times faster than GBM in sklearn and R
+- Handles sparse matrices, support external memory
+* Accurate prediction, and used extensively by data scientists and kagglers
+- See [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links)
+* Distributed and Portable
+- The distributed version runs on Hadoop (YARN), MPI, SGE etc.
+- Scales to billions of examples and beyond
Build
=======
@@ -56,11 +49,9 @@ Build
Version
=======
-* This version xgboost-0.3, the code has been refactored from 0.2x to be cleaner and more flexibility
-* This version of xgboost is not compatible with 0.2x, due to huge amount of changes in code structure
-- This means the model and buffer file of previous version can not be loaded in xgboost-3.0
-* For legacy 0.2x code, refer to [Here](https://github.com/tqchen/xgboost/releases/tag/v0.22)
-* Change log in [CHANGES.md](CHANGES.md)
+* Current version xgboost-0.4, a lot improvment has been made since 0.3
+- Change log in [CHANGES.md](CHANGES.md)
+- This version is compatible with 0.3x versions
XGBoost in Graphlab Create
==========================

View File

@@ -1,6 +1,7 @@
#!/usr/bin/python
import numpy as np
import scipy.sparse
+import pickle
import xgboost as xgb
### simple example
@@ -28,6 +29,7 @@ bst.dump_model('dump.nice.txt','../data/featmap.txt')
# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
+# save model
bst.save_model('xgb.model')
# load model and data in
bst2 = xgb.Booster(model_file='xgb.model')
@@ -36,6 +38,14 @@ preds2 = bst2.predict(dtest2)
# assert they are the same
assert np.sum(np.abs(preds2-preds)) == 0
+# alternatively, you can pickle the booster
+pks = pickle.dumps(bst2)
+# load model and data in
+bst3 = pickle.loads(pks)
+preds3 = bst3.predict(dtest2)
+# assert they are the same
+assert np.sum(np.abs(preds3-preds)) == 0
###
# build dmatrix from scipy.sparse
print ('start running example of build DMatrix from scipy.sparse CSR Matrix')

View File

@@ -4,18 +4,17 @@ Created on 1 Apr 2015
@author: Jamie Hall
'''
+import pickle
import xgboost as xgb
import numpy as np
from sklearn.cross_validation import KFold
-from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
+from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_iris, load_digits, load_boston
rng = np.random.RandomState(31337)
print("Zeros and Ones from the Digits dataset: binary classification")
digits = load_digits(2)
y = digits['target']
@@ -60,4 +59,9 @@ clf.fit(X,y)
print(clf.best_score_)
print(clf.best_params_)
# The sklearn API models are picklable
print("Pickling sklearn API models")
# must open in binary format to pickle
pickle.dump(clf, open("best_boston.pkl", "wb"))
clf2 = pickle.load(open("best_boston.pkl", "rb"))
print(np.allclose(clf.predict(X), clf2.predict(X)))

View File

@@ -0,0 +1,35 @@
import os
if __name__ == "__main__":
# NOTE: on posix systems, this *has* to be here and in the
# `__name__ == "__main__"` clause to run XGBoost in parallel processes
# using fork, if XGBoost was built with OpenMP support. Otherwise, if you
# build XGBoost without OpenMP support, you can use fork, which is the
# default backend for joblib, and omit this.
try:
from multiprocessing import set_start_method
except ImportError:
raise ImportError("Unable to import multiprocessing.set_start_method."
" This example only runs on Python 3.4")
set_start_method("forkserver")
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_boston
import xgboost as xgb
rng = np.random.RandomState(31337)
print("Parallel Parameter optimization")
boston = load_boston()
os.environ["OMP_NUM_THREADS"] = "2" # or to whatever you want
y = boston['target']
X = boston['data']
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
'n_estimators': [50, 100, 200]}, verbose=1,
n_jobs=2)
clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)

View File

@@ -26,19 +26,26 @@ From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parame
#### Parameter for Tree Booster
* eta [default=0.3]
-- step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinkage the feature weights to make the boosting process more conservative.
+- step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinks the feature weights to make the boosting process more conservative.
+- range: [0,1]
* gamma [default=0]
- minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
+- range: [0,∞]
* max_depth [default=6]
- maximum depth of a tree
+- range: [1,∞]
* min_child_weight [default=1]
- minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.
+- range: [0,∞]
* max_delta_step [default=0]
- Maximum delta step we allow each tree's weight estimation to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help making the update step more conservative. Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced. Set it to value of 1-10 might help control the update
+- range: [0,∞]
* subsample [default=1]
- subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting.
+- range: (0,1]
* colsample_bytree [default=1]
- subsample ratio of columns when constructing each tree.
+- range: (0,1]
#### Parameter for Linear Booster
* lambda [default=0]
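Illustrative only: a small R sketch passing the tree booster parameters documented above (values are arbitrary but stay inside the documented ranges; assumes the agaricus demo data that ships with the R package):

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
param <- list(eta = 0.1, gamma = 1, max_depth = 6, min_child_weight = 1,
              max_delta_step = 0, subsample = 0.8, colsample_bytree = 0.8,
              objective = 'binary:logistic')
bst <- xgboost(params = param, data = agaricus.train$data,
               label = agaricus.train$label, nrounds = 10)
```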

View File

@@ -140,8 +140,12 @@ class IFMatrix {
* \brief check if column access is supported, if not, initialize column access
* \param enabled whether certain feature should be included in column access
* \param subsample subsample ratio when generating column access
+* \param max_row_perbatch auxilary information, maximum row used in each column batch
+* this is a hint information that can be ignored by the implementation
*/
-virtual void InitColAccess(const std::vector<bool> &enabled, float subsample) = 0;
+virtual void InitColAccess(const std::vector<bool> &enabled,
+float subsample,
+size_t max_row_perbatch) = 0;
// the following are column meta data, should be able to answer them fast
/*! \return whether column access is enabled */
virtual bool HaveColAccess(void) const = 0;

View File

@@ -64,7 +64,13 @@ class GBTree : public IGradBooster {
}
virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree");
+if (with_pbuffer) {
fo.Write(&mparam, sizeof(ModelParam));
+} else {
+ModelParam p = mparam;
+p.num_pbuffer = 0;
+fo.Write(&p, sizeof(ModelParam));
+}
for (size_t i = 0; i < trees.size(); ++i) {
trees[i]->SaveModel(fo);
}

View File

@@ -33,10 +33,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
}
virtual bool Next(void) {
if (!itr.Next(page_)) return false;
-out_.base_rowid = base_rowid_;
-out_.ind_ptr = BeginPtr(page_->offset);
-out_.data_ptr = BeginPtr(page_->data);
-out_.size = page_->offset.size() - 1;
+out_ = page_->GetRowBatch(base_rowid_);
base_rowid_ += out_.size;
return true;
}
@@ -198,8 +195,8 @@ class DMatrixPageBase : public DataMatrix {
}
/*! \brief magic number used to identify DMatrix */
static const int kMagic = TKMagic;
-/*! \brief page size 64 MB */
-static const size_t kPageSize = 64UL << 20UL;
+/*! \brief page size 32 MB */
+static const size_t kPageSize = 32UL << 20UL;
protected:
virtual void set_cache_file(const std::string &cache_file) = 0;
@@ -236,7 +233,7 @@ class DMatrixPage : public DMatrixPageBase<0xffffab02> {
class DMatrixHalfRAM : public DMatrixPageBase<0xffffab03> {
public:
DMatrixHalfRAM(void) {
-fmat_ = new FMatrixS(iter_);
+fmat_ = new FMatrixS(iter_, this->info);
}
virtual ~DMatrixHalfRAM(void) {
delete fmat_;

View File

@ -58,11 +58,13 @@ struct ColConvertFactory {
return true; return true;
} }
inline void Setup(float pkeep, inline void Setup(float pkeep,
size_t max_row_perbatch,
size_t num_col, size_t num_col,
utils::IIterator<RowBatch> *iter, utils::IIterator<RowBatch> *iter,
std::vector<bst_uint> *buffered_rowset, std::vector<bst_uint> *buffered_rowset,
const std::vector<bool> *enabled) { const std::vector<bool> *enabled) {
pkeep_ = pkeep; pkeep_ = pkeep;
max_row_perbatch_ = max_row_perbatch;
num_col_ = num_col; num_col_ = num_col;
iter_ = iter; iter_ = iter;
buffered_rowset_ = buffered_rowset; buffered_rowset_ = buffered_rowset;
@ -87,7 +89,8 @@ struct ColConvertFactory {
tmp_.Push(batch[i]); tmp_.Push(batch[i]);
} }
} }
if (tmp_.MemCostBytes() >= kPageSize) { if (tmp_.MemCostBytes() >= kPageSize ||
tmp_.Size() >= max_row_perbatch_) {
this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop, this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop,
*enabled_, val); *enabled_, val);
return true; return true;
@ -157,6 +160,8 @@ struct ColConvertFactory {
} }
// probability of keep // probability of keep
float pkeep_; float pkeep_;
// maximum number of rows per batch
size_t max_row_perbatch_;
// number of columns // number of columns
size_t num_col_; size_t num_col_;
// row batch iterator // row batch iterator
@ -208,10 +213,10 @@ class FMatrixPage : public IFMatrix {
return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_; return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_;
} }
virtual void InitColAccess(const std::vector<bool> &enabled, virtual void InitColAccess(const std::vector<bool> &enabled,
float pkeep = 1.0f) { float pkeep, size_t max_row_perbatch) {
if (this->HaveColAccess()) return; if (this->HaveColAccess()) return;
if (TryLoadColData()) return; if (TryLoadColData()) return;
this->InitColData(enabled, pkeep); this->InitColData(enabled, pkeep, max_row_perbatch);
utils::Check(TryLoadColData(), "failed on creating col.blob"); utils::Check(TryLoadColData(), "failed on creating col.blob");
} }
/*! /*!
@ -282,7 +287,8 @@ class FMatrixPage : public IFMatrix {
* \brief initialize column data * \brief initialize column data
* \param pkeep probability to keep a row * \param pkeep probability to keep a row
*/ */
inline void InitColData(const std::vector<bool> &enabled, float pkeep) { inline void InitColData(const std::vector<bool> &enabled,
float pkeep, size_t max_row_perbatch) {
// clear rowset // clear rowset
buffered_rowset_.clear(); buffered_rowset_.clear();
col_size_.resize(info.num_col()); col_size_.resize(info.num_col());
@ -294,7 +300,7 @@ class FMatrixPage : public IFMatrix {
size_t bytes_write = 0; size_t bytes_write = 0;
utils::ThreadBuffer<SparsePage*, ColConvertFactory> citer; utils::ThreadBuffer<SparsePage*, ColConvertFactory> citer;
citer.SetParam("buffer_size", "2"); citer.SetParam("buffer_size", "2");
citer.get_factory().Setup(pkeep, info.num_col(), citer.get_factory().Setup(pkeep, max_row_perbatch, info.num_col(),
iter_, &buffered_rowset_, &enabled); iter_, &buffered_rowset_, &enabled);
citer.Init(); citer.Init();
SparsePage *pcol; SparsePage *pcol;

View File

@ -28,7 +28,7 @@ class DMatrixSimple : public DataMatrix {
public: public:
// constructor // constructor
DMatrixSimple(void) : DataMatrix(kMagic) { DMatrixSimple(void) : DataMatrix(kMagic) {
fmat_ = new FMatrixS(new OneBatchIter(this)); fmat_ = new FMatrixS(new OneBatchIter(this), this->info);
this->Clear(); this->Clear();
} }
// virtual destructor // virtual destructor
@ -171,7 +171,7 @@ class DMatrixSimple : public DataMatrix {
utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", fname == NULL ? "" : fname); utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", fname == NULL ? "" : fname);
info.LoadBinary(fs); info.LoadBinary(fs);
FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_); LoadBinary(fs, &row_ptr_, &row_data_);
fmat_->LoadColAccess(fs); fmat_->LoadColAccess(fs);
if (!silent) { if (!silent) {
@ -198,9 +198,8 @@ class DMatrixSimple : public DataMatrix {
utils::FileStream fs(utils::FopenCheck(fname, "wb")); utils::FileStream fs(utils::FopenCheck(fname, "wb"));
int tmagic = kMagic; int tmagic = kMagic;
fs.Write(&tmagic, sizeof(tmagic)); fs.Write(&tmagic, sizeof(tmagic));
info.SaveBinary(fs); info.SaveBinary(fs);
FMatrixS::SaveBinary(fs, row_ptr_, row_data_); SaveBinary(fs, row_ptr_, row_data_);
fmat_->SaveColAccess(fs); fmat_->SaveColAccess(fs);
fs.Close(); fs.Close();
@ -251,6 +250,42 @@ class DMatrixSimple : public DataMatrix {
static const int kMagic = 0xffffab01; static const int kMagic = 0xffffab01;
protected: protected:
/*!
* \brief save data to binary stream
* \param fo output stream
* \param ptr pointer data
* \param data data content
*/
inline static void SaveBinary(utils::IStream &fo,
const std::vector<size_t> &ptr,
const std::vector<RowBatch::Entry> &data) {
size_t nrow = ptr.size() - 1;
fo.Write(&nrow, sizeof(size_t));
fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t));
if (data.size() != 0) {
fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry));
}
}
/*!
* \brief load data from binary stream
* \param fi input stream
* \param out_ptr pointer data
* \param out_data data content
*/
inline static void LoadBinary(utils::IStream &fi,
std::vector<size_t> *out_ptr,
std::vector<RowBatch::Entry> *out_data) {
size_t nrow;
utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
out_ptr->resize(nrow + 1);
utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0,
"invalid input file format");
out_data->resize(out_ptr->back());
if (out_data->size() != 0) {
utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0,
"invalid input file format");
}
}
// one batch iterator that return content in the matrix // one batch iterator that return content in the matrix
struct OneBatchIter: utils::IIterator<RowBatch> { struct OneBatchIter: utils::IIterator<RowBatch> {
explicit OneBatchIter(DMatrixSimple *parent) explicit OneBatchIter(DMatrixSimple *parent)

View File

@ -1,15 +1,18 @@
#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP #ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP #define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
/*! /*!
* \file simple_fmatrix-inl.hpp * \file simple_fmatrix-inl.hpp
* \brief the input data structure for gradient boosting * \brief the input data structure for gradient boosting
* \author Tianqi Chen * \author Tianqi Chen
*/ */
#include <limits>
#include "../data.h" #include "../data.h"
#include "../utils/utils.h" #include "../utils/utils.h"
#include "../utils/random.h" #include "../utils/random.h"
#include "../utils/omp.h" #include "../utils/omp.h"
#include "../learner/dmatrix.h"
#include "../utils/group_data.h" #include "../utils/group_data.h"
#include "./sparse_batch_page.h"
namespace xgboost { namespace xgboost {
namespace io { namespace io {
@ -20,7 +23,9 @@ class FMatrixS : public IFMatrix {
public: public:
typedef SparseBatch::Entry Entry; typedef SparseBatch::Entry Entry;
/*! \brief constructor */ /*! \brief constructor */
FMatrixS(utils::IIterator<RowBatch> *iter) { FMatrixS(utils::IIterator<RowBatch> *iter,
const learner::MetaInfo &info)
: info_(info) {
this->iter_ = iter; this->iter_ = iter;
} }
// destructor // destructor
@ -29,12 +34,12 @@ class FMatrixS : public IFMatrix {
} }
/*! \return whether column access is enabled */ /*! \return whether column access is enabled */
virtual bool HaveColAccess(void) const { virtual bool HaveColAccess(void) const {
return col_ptr_.size() != 0; return col_size_.size() != 0;
} }
/*! \brief get number of columns */ /*! \brief get number of columns */
virtual size_t NumCol(void) const { virtual size_t NumCol(void) const {
utils::Check(this->HaveColAccess(), "NumCol:need column access"); utils::Check(this->HaveColAccess(), "NumCol:need column access");
return col_ptr_.size() - 1; return col_size_.size() - 1;
} }
/*! \brief get number of buffered rows */ /*! \brief get number of buffered rows */
virtual const std::vector<bst_uint> &buffered_rowset(void) const { virtual const std::vector<bst_uint> &buffered_rowset(void) const {
@ -42,17 +47,17 @@ class FMatrixS : public IFMatrix {
} }
/*! \brief get column size */ /*! \brief get column size */
virtual size_t GetColSize(size_t cidx) const { virtual size_t GetColSize(size_t cidx) const {
return col_ptr_[cidx+1] - col_ptr_[cidx]; return col_size_[cidx];
} }
/*! \brief get column density */ /*! \brief get column density */
virtual float GetColDensity(size_t cidx) const { virtual float GetColDensity(size_t cidx) const {
size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]); size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size(); return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
} }
virtual void InitColAccess(const std::vector<bool> &enabled, virtual void InitColAccess(const std::vector<bool> &enabled,
float pkeep = 1.0f) { float pkeep, size_t max_row_perbatch) {
if (this->HaveColAccess()) return; if (this->HaveColAccess()) return;
this->InitColData(pkeep, enabled); this->InitColData(enabled, pkeep, max_row_perbatch);
} }
/*! /*!
* \brief get the row iterator associated with FMatrix * \brief get the row iterator associated with FMatrix
@ -70,7 +75,7 @@ class FMatrixS : public IFMatrix {
for (size_t i = 0; i < ncol; ++i) { for (size_t i = 0; i < ncol; ++i) {
col_iter_.col_index_[i] = static_cast<bst_uint>(i); col_iter_.col_index_[i] = static_cast<bst_uint>(i);
} }
col_iter_.SetBatch(col_ptr_, col_data_); col_iter_.BeforeFirst();
return &col_iter_; return &col_iter_;
} }
/*! /*!
@ -82,7 +87,7 @@ class FMatrixS : public IFMatrix {
for (size_t i = 0; i < fset.size(); ++i) { for (size_t i = 0; i < fset.size(); ++i) {
if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]); if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
} }
col_iter_.SetBatch(col_ptr_, col_data_); col_iter_.BeforeFirst();
return &col_iter_; return &col_iter_;
} }
/*! /*!
@ -90,64 +95,52 @@ class FMatrixS : public IFMatrix {
* \param fo output stream to save to * \param fo output stream to save to
*/ */
inline void SaveColAccess(utils::IStream &fo) const { inline void SaveColAccess(utils::IStream &fo) const {
fo.Write(buffered_rowset_); size_t n = 0;
if (buffered_rowset_.size() != 0) { fo.Write(&n, sizeof(n));
SaveBinary(fo, col_ptr_, col_data_);
}
} }
/*! /*!
* \brief load column access data from stream * \brief load column access data from stream
* \param fo output stream to load from * \param fo output stream to load from
*/ */
inline void LoadColAccess(utils::IStream &fi) { inline void LoadColAccess(utils::IStream &fi) {
utils::Check(fi.Read(&buffered_rowset_), "invalid input file format"); // do nothing in load col access
if (buffered_rowset_.size() != 0) {
LoadBinary(fi, &col_ptr_, &col_data_);
}
}
/*!
* \brief save data to binary stream
* \param fo output stream
* \param ptr pointer data
* \param data data content
*/
inline static void SaveBinary(utils::IStream &fo,
const std::vector<size_t> &ptr,
const std::vector<RowBatch::Entry> &data) {
size_t nrow = ptr.size() - 1;
fo.Write(&nrow, sizeof(size_t));
fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t));
if (data.size() != 0) {
fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry));
}
}
/*!
* \brief load data from binary stream
* \param fi input stream
* \param out_ptr pointer data
* \param out_data data content
*/
inline static void LoadBinary(utils::IStream &fi,
std::vector<size_t> *out_ptr,
std::vector<RowBatch::Entry> *out_data) {
size_t nrow;
utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
out_ptr->resize(nrow + 1);
utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0,
"invalid input file format");
out_data->resize(out_ptr->back());
if (out_data->size() != 0) {
utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0,
"invalid input file format");
}
} }
protected: protected:
/*! /*!
* \brief initialize column data * \brief initialize column data
* \param enabled the list of enabled columns
* \param pkeep probability to keep a row * \param pkeep probability to keep a row
* \param max_row_perbatch maximum row per batch
*/ */
inline void InitColData(float pkeep, const std::vector<bool> &enabled) { inline void InitColData(const std::vector<bool> &enabled,
float pkeep, size_t max_row_perbatch) {
col_iter_.Clear();
if (info_.num_row() < max_row_perbatch) {
SparsePage *page = new SparsePage();
this->MakeOneBatch(enabled, pkeep, page);
col_iter_.cpages_.push_back(page);
} else {
this->MakeManyBatch(enabled, pkeep, max_row_perbatch);
}
// setup col-size
col_size_.resize(info_.num_col());
std::fill(col_size_.begin(), col_size_.end(), 0);
for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) {
SparsePage *pcol = col_iter_.cpages_[i];
for (size_t j = 0; j < pcol->Size(); ++j) {
col_size_[j] += pcol->offset[j + 1] - pcol->offset[j];
}
}
}
/*!
* \brief make column page from iterator
* \param pkeep probability to keep a row
* \param pcol the target column
*/
inline void MakeOneBatch(const std::vector<bool> &enabled,
float pkeep,
SparsePage *pcol) {
// clear rowset // clear rowset
buffered_rowset_.clear(); buffered_rowset_.clear();
// bit map // bit map
@ -157,8 +150,9 @@ class FMatrixS : public IFMatrix {
{ {
nthread = omp_get_num_threads(); nthread = omp_get_num_threads();
} }
// build the column matrix in parallel pcol->Clear();
utils::ParallelGroupBuilder<RowBatch::Entry> builder(&col_ptr_, &col_data_); utils::ParallelGroupBuilder<SparseBatch::Entry>
builder(&pcol->offset, &pcol->data);
builder.InitBudget(0, nthread); builder.InitBudget(0, nthread);
// start working // start working
iter_->BeforeFirst(); iter_->BeforeFirst();
@ -209,66 +203,167 @@ class FMatrixS : public IFMatrix {
} }
} }
} }
utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data");
// sort columns // sort columns
bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol()); bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
for (bst_omp_uint i = 0; i < ncol; ++i) { for (bst_omp_uint i = 0; i < ncol; ++i) {
if (col_ptr_[i] < col_ptr_[i + 1]) { if (pcol->offset[i] < pcol->offset[i + 1]) {
std::sort(BeginPtr(col_data_) + col_ptr_[i], std::sort(BeginPtr(pcol->data) + pcol->offset[i],
BeginPtr(col_data_) + col_ptr_[i + 1], Entry::CmpValue); BeginPtr(pcol->data) + pcol->offset[i + 1],
SparseBatch::Entry::CmpValue);
}
}
}
inline void MakeManyBatch(const std::vector<bool> &enabled,
float pkeep, size_t max_row_perbatch) {
size_t btop = 0;
buffered_rowset_.clear();
// internal temp cache
SparsePage tmp; tmp.Clear();
iter_->BeforeFirst();
while (iter_->Next()) {
const RowBatch &batch = iter_->Value();
for (size_t i = 0; i < batch.size; ++i) {
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
buffered_rowset_.push_back(ridx);
tmp.Push(batch[i]);
}
if (tmp.Size() >= max_row_perbatch) {
SparsePage *page = new SparsePage();
this->MakeColPage(tmp.GetRowBatch(0),
BeginPtr(buffered_rowset_) + btop,
enabled, page);
col_iter_.cpages_.push_back(page);
btop = buffered_rowset_.size();
tmp.Clear();
}
}
}
if (tmp.Size() != 0) {
SparsePage *page = new SparsePage();
this->MakeColPage(tmp.GetRowBatch(0),
BeginPtr(buffered_rowset_) + btop,
enabled, page);
col_iter_.cpages_.push_back(page);
}
}
// make column page from subset of rowbatchs
inline void MakeColPage(const RowBatch &batch,
const bst_uint *ridx,
const std::vector<bool> &enabled,
SparsePage *pcol) {
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1);
if (nthread > max_nthread) {
nthread = max_nthread;
}
}
pcol->Clear();
utils::ParallelGroupBuilder<SparseBatch::Entry>
builder(&pcol->offset, &pcol->data);
builder.InitBudget(info_.num_col(), nthread);
bst_omp_uint ndata = static_cast<bst_uint>(batch.size);
#pragma omp parallel for schedule(static) num_threads(nthread)
for (bst_omp_uint i = 0; i < ndata; ++i) {
int tid = omp_get_thread_num();
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
const SparseBatch::Entry &e = inst[j];
if (enabled[e.index]) {
builder.AddBudget(e.index, tid);
}
}
}
builder.InitStorage();
#pragma omp parallel for schedule(static) num_threads(nthread)
for (bst_omp_uint i = 0; i < ndata; ++i) {
int tid = omp_get_thread_num();
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
const SparseBatch::Entry &e = inst[j];
builder.Push(e.index,
SparseBatch::Entry(ridx[i], e.fvalue),
tid);
}
}
utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data");
// sort columns
bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
for (bst_omp_uint i = 0; i < ncol; ++i) {
if (pcol->offset[i] < pcol->offset[i + 1]) {
std::sort(BeginPtr(pcol->data) + pcol->offset[i],
BeginPtr(pcol->data) + pcol->offset[i + 1],
SparseBatch::Entry::CmpValue);
} }
} }
} }
private: private:
// one batch iterator that return content in the matrix // one batch iterator that return content in the matrix
struct OneBatchIter: utils::IIterator<ColBatch> { struct ColBatchIter: utils::IIterator<ColBatch> {
OneBatchIter(void) : at_first_(true){} ColBatchIter(void) : data_ptr_(0) {}
virtual ~OneBatchIter(void) {} virtual ~ColBatchIter(void) {
this->Clear();
}
virtual void BeforeFirst(void) { virtual void BeforeFirst(void) {
at_first_ = true; data_ptr_ = 0;
} }
virtual bool Next(void) { virtual bool Next(void) {
if (!at_first_) return false; if (data_ptr_ >= cpages_.size()) return false;
at_first_ = false; data_ptr_ += 1;
SparsePage *pcol = cpages_[data_ptr_ - 1];
batch_.size = col_index_.size();
col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0));
for (size_t i = 0; i < col_data_.size(); ++i) {
const bst_uint ridx = col_index_[i];
col_data_[i] = SparseBatch::Inst
(BeginPtr(pcol->data) + pcol->offset[ridx],
static_cast<bst_uint>(pcol->offset[ridx + 1] - pcol->offset[ridx]));
}
batch_.col_index = BeginPtr(col_index_);
batch_.col_data = BeginPtr(col_data_);
return true; return true;
} }
virtual const ColBatch &Value(void) const { virtual const ColBatch &Value(void) const {
return batch_; return batch_;
} }
inline void SetBatch(const std::vector<size_t> &ptr, inline void Clear(void) {
const std::vector<ColBatch::Entry> &data) { for (size_t i = 0; i < cpages_.size(); ++i) {
batch_.size = col_index_.size(); delete cpages_[i];
col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0));
for (size_t i = 0; i < col_data_.size(); ++i) {
const bst_uint ridx = col_index_[i];
col_data_[i] = SparseBatch::Inst(&data[0] + ptr[ridx],
static_cast<bst_uint>(ptr[ridx+1] - ptr[ridx]));
} }
batch_.col_index = BeginPtr(col_index_); cpages_.clear();
batch_.col_data = BeginPtr(col_data_);
this->BeforeFirst();
} }
// data content // data content
std::vector<bst_uint> col_index_; std::vector<bst_uint> col_index_;
// column content
std::vector<ColBatch::Inst> col_data_; std::vector<ColBatch::Inst> col_data_;
// whether is at first // column sparse pages
bool at_first_; std::vector<SparsePage*> cpages_;
// data pointer
size_t data_ptr_;
// temporary space for batch // temporary space for batch
ColBatch batch_; ColBatch batch_;
}; };
// --- data structure used to support InitColAccess -- // --- data structure used to support InitColAccess --
// column iterator // column iterator
OneBatchIter col_iter_; ColBatchIter col_iter_;
// shared meta info with DMatrix
const learner::MetaInfo &info_;
// row iterator // row iterator
utils::IIterator<RowBatch> *iter_; utils::IIterator<RowBatch> *iter_;
/*! \brief list of row index that are buffered */ /*! \brief list of row index that are buffered */
std::vector<bst_uint> buffered_rowset_; std::vector<bst_uint> buffered_rowset_;
/*! \brief column pointer of CSC format */ // count for column data
std::vector<size_t> col_ptr_; std::vector<size_t> col_size_;
/*! \brief column datas in CSC format */
std::vector<ColBatch::Entry> col_data_;
}; };
} // namespace io } // namespace io
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP #endif // XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_

View File

@ -178,9 +178,23 @@ class SparsePage {
offset.push_back(offset.back() + inst.length); offset.push_back(offset.back() + inst.length);
size_t begin = data.size(); size_t begin = data.size();
data.resize(begin + inst.length); data.resize(begin + inst.length);
if (inst.length != 0) {
std::memcpy(BeginPtr(data) + begin, inst.data, std::memcpy(BeginPtr(data) + begin, inst.data,
sizeof(SparseBatch::Entry) * inst.length); sizeof(SparseBatch::Entry) * inst.length);
} }
}
/*!
* \param base_rowid base_rowid of the data
* \return row batch representation of the page
*/
inline RowBatch GetRowBatch(size_t base_rowid) const {
RowBatch out;
out.base_rowid = base_rowid;
out.ind_ptr = BeginPtr(offset);
out.data_ptr = BeginPtr(data);
out.size = offset.size() - 1;
return out;
}
private: private:
/*! \brief external memory column offset */ /*! \brief external memory column offset */

View File

@ -33,6 +33,7 @@ class BoostLearner : public rabit::Serializable {
silent= 0; silent= 0;
prob_buffer_row = 1.0f; prob_buffer_row = 1.0f;
distributed_mode = 0; distributed_mode = 0;
updater_mode = 0;
pred_buffer_size = 0; pred_buffer_size = 0;
seed_per_iteration = 0; seed_per_iteration = 0;
seed = 0; seed = 0;
@ -95,6 +96,7 @@ class BoostLearner : public rabit::Serializable {
utils::Error("%s is invalid value for dsplit, should be row or col", val); utils::Error("%s is invalid value for dsplit, should be row or col", val);
} }
} }
if (!strcmp(name, "updater_mode")) updater_mode = atoi(val);
if (!strcmp(name, "prob_buffer_row")) { if (!strcmp(name, "prob_buffer_row")) {
prob_buffer_row = static_cast<float>(atof(val)); prob_buffer_row = static_cast<float>(atof(val));
utils::Check(distributed_mode == 0, utils::Check(distributed_mode == 0,
@ -157,11 +159,9 @@ class BoostLearner : public rabit::Serializable {
/*! /*!
* \brief load model from stream * \brief load model from stream
* \param fi input stream * \param fi input stream
* \param with_pbuffer whether to load with predict buffer
* \param calc_num_feature whether call InitTrainer with calc_num_feature * \param calc_num_feature whether call InitTrainer with calc_num_feature
*/ */
inline void LoadModel(utils::IStream &fi, inline void LoadModel(utils::IStream &fi,
bool with_pbuffer = true,
bool calc_num_feature = true) { bool calc_num_feature = true) {
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0, utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
"BoostLearner: wrong model format"); "BoostLearner: wrong model format");
@ -189,15 +189,15 @@ class BoostLearner : public rabit::Serializable {
char tmp[32]; char tmp[32];
utils::SPrintf(tmp, sizeof(tmp), "%u", mparam.num_class); utils::SPrintf(tmp, sizeof(tmp), "%u", mparam.num_class);
obj_->SetParam("num_class", tmp); obj_->SetParam("num_class", tmp);
gbm_->LoadModel(fi, with_pbuffer); gbm_->LoadModel(fi, mparam.saved_with_pbuffer != 0);
if (!with_pbuffer || distributed_mode == 2) { if (mparam.saved_with_pbuffer == 0) {
gbm_->ResetPredBuffer(pred_buffer_size); gbm_->ResetPredBuffer(pred_buffer_size);
} }
} }
// rabit load model from rabit checkpoint // rabit load model from rabit checkpoint
virtual void Load(rabit::Stream *fi) { virtual void Load(rabit::Stream *fi) {
// for row split, we should not keep pbuffer // for row split, we should not keep pbuffer
this->LoadModel(*fi, distributed_mode != 2, false); this->LoadModel(*fi, false);
} }
// rabit save model to rabit checkpoint // rabit save model to rabit checkpoint
virtual void Save(rabit::Stream *fo) const { virtual void Save(rabit::Stream *fo) const {
@ -218,18 +218,20 @@ class BoostLearner : public rabit::Serializable {
if (header == "bs64") { if (header == "bs64") {
utils::Base64InStream bsin(fi); utils::Base64InStream bsin(fi);
bsin.InitPosition(); bsin.InitPosition();
this->LoadModel(bsin); this->LoadModel(bsin, true);
} else if (header == "binf") { } else if (header == "binf") {
this->LoadModel(*fi); this->LoadModel(*fi, true);
} else { } else {
delete fi; delete fi;
fi = utils::IStream::Create(fname, "r"); fi = utils::IStream::Create(fname, "r");
this->LoadModel(*fi); this->LoadModel(*fi, true);
} }
delete fi; delete fi;
} }
inline void SaveModel(utils::IStream &fo, bool with_pbuffer = true) const { inline void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
fo.Write(&mparam, sizeof(ModelParam)); ModelParam p = mparam;
p.saved_with_pbuffer = static_cast<int>(with_pbuffer);
fo.Write(&p, sizeof(ModelParam));
fo.Write(name_obj_); fo.Write(name_obj_);
fo.Write(name_gbm_); fo.Write(name_gbm_);
gbm_->SaveModel(fo, with_pbuffer); gbm_->SaveModel(fo, with_pbuffer);
@ -237,17 +239,18 @@ class BoostLearner : public rabit::Serializable {
/*! /*!
* \brief save model into file * \brief save model into file
* \param fname file name * \param fname file name
* \param with_pbuffer whether save pbuffer together
*/ */
inline void SaveModel(const char *fname) const { inline void SaveModel(const char *fname, bool with_pbuffer) const {
utils::IStream *fo = utils::IStream::Create(fname, "w"); utils::IStream *fo = utils::IStream::Create(fname, "w");
if (save_base64 != 0 || !strcmp(fname, "stdout")) { if (save_base64 != 0 || !strcmp(fname, "stdout")) {
fo->Write("bs64\t", 5); fo->Write("bs64\t", 5);
utils::Base64OutStream bout(fo); utils::Base64OutStream bout(fo);
this->SaveModel(bout); this->SaveModel(bout, with_pbuffer);
bout.Finish('\n'); bout.Finish('\n');
} else { } else {
fo->Write("binf", 4); fo->Write("binf", 4);
this->SaveModel(*fo); this->SaveModel(*fo, with_pbuffer);
} }
delete fo; delete fo;
} }
@ -259,8 +262,16 @@ class BoostLearner : public rabit::Serializable {
inline void CheckInit(DMatrix *p_train) { inline void CheckInit(DMatrix *p_train) {
int ncol = static_cast<int>(p_train->info.info.num_col); int ncol = static_cast<int>(p_train->info.info.num_col);
std::vector<bool> enabled(ncol, true); std::vector<bool> enabled(ncol, true);
// set max row per batch to limited value
// in distributed mode, use safe choice otherwise
size_t max_row_perbatch = std::numeric_limits<size_t>::max();
if (updater_mode != 0 || distributed_mode == 2) {
max_row_perbatch = 32UL << 10UL;
}
// initialize column access // initialize column access
p_train->fmat()->InitColAccess(enabled, prob_buffer_row); p_train->fmat()->InitColAccess(enabled,
prob_buffer_row,
max_row_perbatch);
const int kMagicPage = 0xffffab02; const int kMagicPage = 0xffffab02;
// check, if it is DMatrixPage, then use hist maker // check, if it is DMatrixPage, then use hist maker
if (p_train->magic == kMagicPage) { if (p_train->magic == kMagicPage) {
@ -442,14 +453,17 @@ class BoostLearner : public rabit::Serializable {
unsigned num_feature; unsigned num_feature;
/* \brief number of class, if it is multi-class classification */ /* \brief number of class, if it is multi-class classification */
int num_class; int num_class;
/*! \brief whether the model itself is saved with pbuffer */
int saved_with_pbuffer;
/*! \brief reserved field */ /*! \brief reserved field */
int reserved[31]; int reserved[30];
/*! \brief constructor */ /*! \brief constructor */
ModelParam(void) { ModelParam(void) {
std::memset(this, 0, sizeof(ModelParam));
base_score = 0.5f; base_score = 0.5f;
num_feature = 0; num_feature = 0;
num_class = 0; num_class = 0;
std::memset(reserved, 0, sizeof(reserved)); saved_with_pbuffer = 0;
} }
/*! /*!
* \brief set parameters from outside * \brief set parameters from outside
@ -476,6 +490,8 @@ class BoostLearner : public rabit::Serializable {
int silent; int silent;
// distributed learning mode, if any, 0:none, 1:col, 2:row // distributed learning mode, if any, 0:none, 1:col, 2:row
int distributed_mode; int distributed_mode;
// updater mode, 0:normal, reserved for internal test
int updater_mode;
// cached size of predict buffer // cached size of predict buffer
size_t pred_buffer_size; size_t pred_buffer_size;
// maximum buffered row value

View File

@ -48,6 +48,8 @@ struct TrainParam{
int size_leaf_vector; int size_leaf_vector;
// option for parallelization // option for parallelization
int parallel_option; int parallel_option;
// option to enable cacheline optimization
int cache_opt;
// number of threads to be used for tree construction, // number of threads to be used for tree construction,
// if OpenMP is enabled, if equals 0, use system default // if OpenMP is enabled, if equals 0, use system default
int nthread; int nthread;
@ -70,6 +72,7 @@ struct TrainParam{
parallel_option = 2; parallel_option = 2;
sketch_eps = 0.1f; sketch_eps = 0.1f;
sketch_ratio = 2.0f; sketch_ratio = 2.0f;
cache_opt = 1;
} }
/*! /*!
* \brief set parameters from outside * \brief set parameters from outside
@ -96,6 +99,7 @@ struct TrainParam{
if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast<float>(atof(val)); if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast<float>(atof(val));
if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val)); if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val); if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
if (!strcmp(name, "cache_opt")) cache_opt = atoi(val);
if (!strcmp(name, "max_depth")) max_depth = atoi(val); if (!strcmp(name, "max_depth")) max_depth = atoi(val);
if (!strcmp(name, "nthread")) nthread = atoi(val); if (!strcmp(name, "nthread")) nthread = atoi(val);
if (!strcmp(name, "parallel_option")) parallel_option = atoi(val); if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
@ -151,12 +155,12 @@ struct TrainParam{
return dw; return dw;
} }
/*! \brief whether need forward small to big search: default right */ /*! \brief whether need forward small to big search: default right */
inline bool need_forward_search(float col_density = 0.0f) const { inline bool need_forward_search(float col_density, bool indicator) const {
return this->default_direction == 2 || return this->default_direction == 2 ||
(default_direction == 0 && (col_density < opt_dense_col)); (default_direction == 0 && (col_density < opt_dense_col) && !indicator);
} }
/*! \brief whether need backward big to small search: default left */ /*! \brief whether need backward big to small search: default left */
inline bool need_backward_search(float col_density = 0.0f) const { inline bool need_backward_search(float col_density, bool indicator) const {
return this->default_direction != 2; return this->default_direction != 2;
} }
/*! \brief given the loss change, whether we need to invoke pruning */ /*! \brief given the loss change, whether we need to invoke pruning */
@ -192,6 +196,11 @@ struct GradStats {
double sum_grad; double sum_grad;
/*! \brief sum hessian statistics */ /*! \brief sum hessian statistics */
double sum_hess; double sum_hess;
/*!
* \brief whether this is simply statistics and we only need to call
* Add(gpair), instead of Add(gpair, info, ridx)
*/
static const int kSimpleStats = 1;
/*! \brief constructor, the object must be cleared during construction */ /*! \brief constructor, the object must be cleared during construction */
explicit GradStats(const TrainParam &param) { explicit GradStats(const TrainParam &param) {
this->Clear(); this->Clear();
@ -204,7 +213,14 @@ struct GradStats {
inline static void CheckInfo(const BoosterInfo &info) { inline static void CheckInfo(const BoosterInfo &info) {
} }
/*! /*!
* \brief accumulate statistics, * \brief accumulate statistics
* \param p the gradient pair
*/
inline void Add(bst_gpair p) {
this->Add(p.grad, p.hess);
}
/*!
* \brief accumulate statistics, more complicated version
* \param gpair the vector storing the gradient statistics * \param gpair the vector storing the gradient statistics
* \param info the additional information * \param info the additional information
* \param ridx instance index of this instance * \param ridx instance index of this instance

View File

@ -234,8 +234,9 @@ class ColMaker: public IUpdater {
const IFMatrix &fmat, const IFMatrix &fmat,
const std::vector<bst_gpair> &gpair, const std::vector<bst_gpair> &gpair,
const BoosterInfo &info) { const BoosterInfo &info) {
bool need_forward = param.need_forward_search(fmat.GetColDensity(fid)); const bool ind = col.length != 0 && col.data[0].fvalue == col.data[col.length - 1].fvalue;
bool need_backward = param.need_backward_search(fmat.GetColDensity(fid)); bool need_forward = param.need_forward_search(fmat.GetColDensity(fid), ind);
bool need_backward = param.need_backward_search(fmat.GetColDensity(fid), ind);
const std::vector<int> &qexpand = qexpand_; const std::vector<int> &qexpand = qexpand_;
#pragma omp parallel #pragma omp parallel
{ {
@ -357,6 +358,99 @@ class ColMaker: public IUpdater {
} }
} }
} }
// update enumeration solution
inline void UpdateEnumeration(int nid, bst_gpair gstats,
float fvalue, int d_step, bst_uint fid,
TStats &c, std::vector<ThreadEntry> &temp) {
// get the statistics of nid
ThreadEntry &e = temp[nid];
// test if first hit, this is fine, because we set 0 during init
if (e.stats.Empty()) {
e.stats.Add(gstats);
e.last_fvalue = fvalue;
} else {
// try to find a split
if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) {
c.SetSubstract(snode[nid].stats, e.stats);
if (c.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
}
}
// update the statistics
e.stats.Add(gstats);
e.last_fvalue = fvalue;
}
}
// same as EnumerateSplit, with cacheline prefetch optimization
inline void EnumerateSplitCacheOpt(const ColBatch::Entry *begin,
const ColBatch::Entry *end,
int d_step,
bst_uint fid,
const std::vector<bst_gpair> &gpair,
std::vector<ThreadEntry> &temp) {
const std::vector<int> &qexpand = qexpand_;
// clear all the temp statistics
for (size_t j = 0; j < qexpand.size(); ++j) {
temp[qexpand[j]].stats.Clear();
}
// left statistics
TStats c(param);
// local cache buffer for position and gradient pair
const int kBuffer = 32;
int buf_position[kBuffer];
bst_gpair buf_gpair[kBuffer];
// aligned ending position
const ColBatch::Entry *align_end;
if (d_step > 0) {
align_end = begin + (end - begin) / kBuffer * kBuffer;
} else {
align_end = begin - (begin - end) / kBuffer * kBuffer;
}
int i;
const ColBatch::Entry *it;
const int align_step = d_step * kBuffer;
// internal cached loop
for (it = begin; it != align_end; it += align_step) {
const ColBatch::Entry *p;
for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
buf_position[i] = position[p->index];
buf_gpair[i] = gpair[p->index];
}
for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
const int nid = buf_position[i];
if (nid < 0) continue;
this->UpdateEnumeration(nid, buf_gpair[i],
p->fvalue, d_step,
fid, c, temp);
}
}
// finish up the ending piece
for (it = align_end, i = 0; it != end; ++i, it += d_step) {
buf_position[i] = position[it->index];
buf_gpair[i] = gpair[it->index];
}
for (it = align_end, i = 0; it != end; ++i, it += d_step) {
const int nid = buf_position[i];
if (nid < 0) continue;
this->UpdateEnumeration(nid, buf_gpair[i],
it->fvalue, d_step,
fid, c, temp);
}
// finish updating all statistics, check if it is possible to include all sum statistics
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
ThreadEntry &e = temp[nid];
c.SetSubstract(snode[nid].stats, e.stats);
if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
const float gap = std::abs(e.last_fvalue) + rt_eps;
const float delta = d_step == +1 ? gap: -gap;
e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
}
}
}
// enumerate the split values of specific feature // enumerate the split values of specific feature
inline void EnumerateSplit(const ColBatch::Entry *begin, inline void EnumerateSplit(const ColBatch::Entry *begin,
const ColBatch::Entry *end, const ColBatch::Entry *end,
@ -365,6 +459,11 @@ class ColMaker: public IUpdater {
const std::vector<bst_gpair> &gpair, const std::vector<bst_gpair> &gpair,
const BoosterInfo &info, const BoosterInfo &info,
std::vector<ThreadEntry> &temp) { std::vector<ThreadEntry> &temp) {
// use cacheline aware optimization
if (TStats::kSimpleStats != 0 && param.cache_opt != 0) {
EnumerateSplitCacheOpt(begin, end, d_step, fid, gpair, temp);
return;
}
const std::vector<int> &qexpand = qexpand_; const std::vector<int> &qexpand = qexpand_;
// clear all the temp statistics // clear all the temp statistics
for (size_t j = 0; j < qexpand.size(); ++j) { for (size_t j = 0; j < qexpand.size(); ++j) {
@ -411,6 +510,7 @@ class ColMaker: public IUpdater {
} }
} }
} }
// update the solution candidate // update the solution candidate
virtual void UpdateSolution(const ColBatch &batch, virtual void UpdateSolution(const ColBatch &batch,
const std::vector<bst_gpair> &gpair, const std::vector<bst_gpair> &gpair,
@ -431,11 +531,12 @@ class ColMaker: public IUpdater {
const bst_uint fid = batch.col_index[i]; const bst_uint fid = batch.col_index[i];
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
const ColBatch::Inst c = batch[i]; const ColBatch::Inst c = batch[i];
if (param.need_forward_search(fmat.GetColDensity(fid))) { const bool ind = c.length != 0 && c.data[0].fvalue == c.data[c.length - 1].fvalue;
if (param.need_forward_search(fmat.GetColDensity(fid), ind)) {
this->EnumerateSplit(c.data, c.data + c.length, +1, this->EnumerateSplit(c.data, c.data + c.length, +1,
fid, gpair, info, stemp[tid]); fid, gpair, info, stemp[tid]);
} }
if (param.need_backward_search(fmat.GetColDensity(fid))) { if (param.need_backward_search(fmat.GetColDensity(fid), ind)) {
this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1, this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
fid, gpair, info, stemp[tid]); fid, gpair, info, stemp[tid]);
} }
@ -550,8 +651,8 @@ class ColMaker: public IUpdater {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) { for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_uint ridx = col[j].index; const bst_uint ridx = col[j].index;
const float fvalue = col[j].fvalue;
const int nid = this->DecodePosition(ridx); const int nid = this->DecodePosition(ridx);
const float fvalue = col[j].fvalue;
// go back to parent, correct those who are not default // go back to parent, correct those who are not default
if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) { if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
if(fvalue < tree[nid].split_cond()) { if(fvalue < tree[nid].split_cond()) {

View File

@ -282,6 +282,16 @@ class CQHistMaker: public HistMaker<TStats> {
utils::Assert(istart != hist.size, "the bound variable must be max"); utils::Assert(istart != hist.size, "the bound variable must be max");
hist.data[istart].Add(gpair, info, ridx); hist.data[istart].Add(gpair, info, ridx);
} }
/*!
* \brief add a histogram to data,
* do linear scan, start from istart
*/
inline void Add(bst_float fv,
bst_gpair gstats) {
while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
utils::Assert(istart != hist.size, "the bound variable must be max");
hist.data[istart].Add(gstats);
}
}; };
// sketch type used for this // sketch type used for this
typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch; typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
@ -479,6 +489,32 @@ class CQHistMaker: public HistMaker<TStats> {
hbuilder[nid].istart = 0; hbuilder[nid].istart = 0;
hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)]; hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)];
} }
if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
const bst_uint kBuffer = 32;
bst_uint align_length = c.length / kBuffer * kBuffer;
int buf_position[kBuffer];
bst_gpair buf_gpair[kBuffer];
for (bst_uint j = 0; j < align_length; j += kBuffer) {
for (bst_uint i = 0; i < kBuffer; ++i) {
bst_uint ridx = c[j + i].index;
buf_position[i] = this->position[ridx];
buf_gpair[i] = gpair[ridx];
}
for (bst_uint i = 0; i < kBuffer; ++i) {
const int nid = buf_position[i];
if (nid >= 0) {
hbuilder[nid].Add(c[j + i].fvalue, buf_gpair[i]);
}
}
}
for (bst_uint j = align_length; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
hbuilder[nid].Add(c[j].fvalue, gpair[ridx]);
}
}
} else {
for (bst_uint j = 0; j < c.length; ++j) { for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index; const bst_uint ridx = c[j].index;
const int nid = this->position[ridx]; const int nid = this->position[ridx];
@ -487,6 +523,7 @@ class CQHistMaker: public HistMaker<TStats> {
} }
} }
} }
}
inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair, inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
const ColBatch::Inst &c, const ColBatch::Inst &c,
const RegTree &tree, const RegTree &tree,
@ -536,6 +573,32 @@ class CQHistMaker: public HistMaker<TStats> {
sbuilder[nid].Init(max_size); sbuilder[nid].Init(max_size);
} }
// second pass, build the sketch // second pass, build the sketch
if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
const bst_uint kBuffer = 32;
bst_uint align_length = c.length / kBuffer * kBuffer;
int buf_position[kBuffer];
bst_float buf_hess[kBuffer];
for (bst_uint j = 0; j < align_length; j += kBuffer) {
for (bst_uint i = 0; i < kBuffer; ++i) {
bst_uint ridx = c[j + i].index;
buf_position[i] = this->position[ridx];
buf_hess[i] = gpair[ridx].hess;
}
for (bst_uint i = 0; i < kBuffer; ++i) {
const int nid = buf_position[i];
if (nid >= 0) {
sbuilder[nid].Push(c[j + i].fvalue, buf_hess[i], max_size);
}
}
}
for (bst_uint j = align_length; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
}
}
} else {
for (bst_uint j = 0; j < c.length; ++j) { for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index; const bst_uint ridx = c[j].index;
const int nid = this->position[ridx]; const int nid = this->position[ridx];
@ -543,6 +606,7 @@ class CQHistMaker: public HistMaker<TStats> {
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size); sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
} }
} }
}
for (size_t i = 0; i < this->qexpand.size(); ++i) { for (size_t i = 0; i < this->qexpand.size(); ++i) {
const int nid = this->qexpand[i]; const int nid = this->qexpand[i];
sbuilder[nid].Finalize(max_size); sbuilder[nid].Finalize(max_size);

View File

@ -328,12 +328,12 @@ struct WXQSummary : public WQSummary<DType, RType> {
} }
if (nbig >= n - 1) { if (nbig >= n - 1) {
// see what was the case // see what was the case
fprintf(stderr, "LOG: check quantile stats, nbig=%lu, n=%lu\n", nbig, n); utils::Printf("LOG: check quantile stats, nbig=%lu, n=%lu\n", nbig, n);
fprintf(stderr, "LOG: srcsize=%lu, maxsize=%lu, range=%g, chunk=%g\n", utils::Printf("LOG: srcsize=%lu, maxsize=%lu, range=%g, chunk=%g\n",
src.size, maxsize, static_cast<double>(range), src.size, maxsize, static_cast<double>(range),
static_cast<double>(chunk)); static_cast<double>(chunk));
for (size_t i = 0; i < src.size; ++i) { for (size_t i = 0; i < src.size; ++i) {
printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g, isbig=%d\n", i, utils::Printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g, isbig=%d\n", i,
src.data[i].rmin, src.data[i].rmax, src.data[i].wmin, src.data[i].rmin, src.data[i].rmax, src.data[i].wmin,
src.data[i].value, CheckLarge(src.data[i], chunk)); src.data[i].value, CheckLarge(src.data[i], chunk));
} }

View File

@ -87,6 +87,7 @@ class BoostLearnTask {
if (!strcmp("name_pred", name)) name_pred = val; if (!strcmp("name_pred", name)) name_pred = val;
if (!strcmp("dsplit", name)) data_split = val; if (!strcmp("dsplit", name)) data_split = val;
if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val); if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
if (!strcmp("save_pbuffer", name)) save_with_pbuffer = atoi(val);
if (!strncmp("eval[", name, 5)) { if (!strncmp("eval[", name, 5)) {
char evname[256]; char evname[256];
utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display"); utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display");
@ -115,6 +116,7 @@ class BoostLearnTask {
model_dir_path = "./"; model_dir_path = "./";
data_split = "NONE"; data_split = "NONE";
load_part = 0; load_part = 0;
save_with_pbuffer = 0;
data = NULL; data = NULL;
} }
~BoostLearnTask(void){ ~BoostLearnTask(void){
@ -241,7 +243,7 @@ class BoostLearnTask {
} }
inline void SaveModel(const char *fname) const { inline void SaveModel(const char *fname) const {
if (rabit::GetRank() != 0) return; if (rabit::GetRank() != 0) return;
learner.SaveModel(fname); learner.SaveModel(fname, save_with_pbuffer != 0);
} }
inline void SaveModel(int i) const { inline void SaveModel(int i) const {
char fname[256]; char fname[256];
@ -297,6 +299,8 @@ class BoostLearnTask {
int pred_margin; int pred_margin;
/*! \brief whether dump statistics along with model */ /*! \brief whether dump statistics along with model */
int dump_model_stats; int dump_model_stats;
/*! \brief whether save prediction buffer */
int save_with_pbuffer;
/*! \brief name of feature map */ /*! \brief name of feature map */
std::string name_fmap; std::string name_fmap;
/*! \brief name of dump file */ /*! \brief name of dump file */

View File

@ -7,6 +7,8 @@ Python
* To make the python module, type ```./build.sh``` in the root directory of project * To make the python module, type ```./build.sh``` in the root directory of project
* Install with `python setup.py install` from this directory. * Install with `python setup.py install` from this directory.
* Refer also to the walk through example in [demo folder](../demo/guide-python) * Refer also to the walk through example in [demo folder](../demo/guide-python)
* **NOTE**: if you want to run XGBoost in parallel processes using the fork backend of joblib/multiprocessing, you must build XGBoost without OpenMP support (`make no_omp=1`). Otherwise, use the forkserver (Python 3.4+) or spawn backend, as in the sketch below; see also the sklearn_parallel.py demo.
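
A minimal sketch of the forkserver approach (the data path and parameters are placeholders), training two boosters in separate processes so an OpenMP-enabled build stays safe:

```python
# minimal sketch, assuming a LibSVM training file at a placeholder path
import multiprocessing as mp
import xgboost as xgb

def fit_one(seed):
    # each worker process builds its own DMatrix and trains independently
    dtrain = xgb.DMatrix('train.libsvm')
    params = {'max_depth': 3, 'eta': 0.1, 'objective': 'binary:logistic', 'seed': seed}
    bst = xgb.train(params, dtrain, num_boost_round=10)
    bst.save_model('model_%d.bin' % seed)

if __name__ == '__main__':
    mp.set_start_method('forkserver')   # or 'spawn'; avoids mixing fork with OpenMP
    procs = [mp.Process(target=fit_one, args=(s,)) for s in (0, 1)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
```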
R R
===== =====

View File

@ -28,7 +28,7 @@ if len(lib_path) == 0:
raise XGBoostLibraryNotFound("XGBoost library not found. Did you run " raise XGBoostLibraryNotFound("XGBoost library not found. Did you run "
"../make?") "../make?")
setup(name="xgboost", setup(name="xgboost",
version="0.32", version="0.40",
description="Python wrappers for XGBoost: eXtreme Gradient Boosting", description="Python wrappers for XGBoost: eXtreme Gradient Boosting",
zip_safe=False, zip_safe=False,
py_modules=['xgboost'], py_modules=['xgboost'],

View File

@ -1,8 +1,8 @@
# coding: utf-8 # coding: utf-8
""" """
xgboost: eXtreme Gradient Boosting library xgboost: eXtreme Gradient Boosting library
Version: 0.40
Authors: Tianqi Chen, Bing Xu Authors: Tianqi Chen, Bing Xu
Early stopping by Zygmunt Zając Early stopping by Zygmunt Zając
""" """
@ -30,6 +30,9 @@ except ImportError:
class XGBoostLibraryNotFound(Exception): class XGBoostLibraryNotFound(Exception):
pass pass
class XGBoostError(Exception):
pass
__all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train'] __all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train']
if sys.version_info[0] == 3: if sys.version_info[0] == 3:
@ -70,6 +73,8 @@ def load_xglib():
lib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float) lib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float)
lib.XGBoosterEvalOneIter.restype = ctypes.c_char_p lib.XGBoosterEvalOneIter.restype = ctypes.c_char_p
lib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p) lib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p)
lib.XGBoosterGetModelRaw.restype = ctypes.POINTER(ctypes.c_char)
lib.XGBoosterLoadModelFromBuffer.restype = ctypes.c_void_p
return lib return lib
@ -89,6 +94,16 @@ def ctypes2numpy(cptr, length, dtype):
return res return res
def ctypes2buffer(cptr, length):
if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)):
raise RuntimeError('expected char pointer')
res = bytearray(length)
rptr = (ctypes.c_char * length).from_buffer(res)
if not ctypes.memmove(rptr, cptr, length):
raise RuntimeError('memmove failed')
return res
def c_str(string): def c_str(string):
return ctypes.c_char_p(string.encode('utf-8')) return ctypes.c_char_p(string.encode('utf-8'))
@ -98,7 +113,7 @@ def c_array(ctype, values):
class DMatrix(object): class DMatrix(object):
def __init__(self, data, label=None, missing=0.0, weight=None): def __init__(self, data, label=None, missing=0.0, weight=None, silent=False):
""" """
Data matrix used in XGBoost. Data matrix used in XGBoost.
@ -113,14 +128,15 @@ class DMatrix(object):
Value in the data which needs to be present as a missing value. Value in the data which needs to be present as a missing value.
weight : list or numpy 1-D array (optional) weight : list or numpy 1-D array (optional)
Weight for each instance. Weight for each instance.
silent: boolean
Whether to print messages during construction
""" """
# force into void_p, mac need to pass things in as void_p # force into void_p, mac need to pass things in as void_p
if data is None: if data is None:
self.handle = None self.handle = None
return return
if isinstance(data, string_types): if isinstance(data, string_types):
self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), 0)) self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), int(silent)))
elif isinstance(data, scipy.sparse.csr_matrix): elif isinstance(data, scipy.sparse.csr_matrix):
self._init_from_csr(data) self._init_from_csr(data)
elif isinstance(data, scipy.sparse.csc_matrix): elif isinstance(data, scipy.sparse.csc_matrix):
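For illustration, a one-line sketch of the new flag when loading from a file (placeholder path):

```python
import xgboost as xgb

dtrain = xgb.DMatrix('train.libsvm', silent=True)  # suppresses the construction message
```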
@ -335,6 +351,46 @@ class Booster(object):
def __del__(self): def __del__(self):
xglib.XGBoosterFree(self.handle) xglib.XGBoosterFree(self.handle)
def __getstate__(self):
# can't pickle ctypes pointers
# put model content in bytearray
this = self.__dict__.copy()
handle = this['handle']
if handle is not None:
raw = self.save_raw()
this["handle"] = raw
return this
def __setstate__(self, state):
# reconstruct handle from raw data
handle = state['handle']
if handle is not None:
buf = handle
dmats = c_array(ctypes.c_void_p, [])
handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, 0))
length = ctypes.c_ulong(len(buf))
ptr = (ctypes.c_char * len(buf)).from_buffer(buf)
xglib.XGBoosterLoadModelFromBuffer(handle, ptr, length)
state['handle'] = handle
self.__dict__.update(state)
self.set_param({'seed': 0})
def __copy__(self):
return self.__deepcopy__(None)
def __deepcopy__(self, _memo):
return Booster(model_file=self.save_raw())
def copy(self):
"""
Copy the booster object
Returns
--------
a copied booster model
"""
return self.__copy__()
def set_param(self, params, pv=None): def set_param(self, params, pv=None):
if isinstance(params, collections.Mapping): if isinstance(params, collections.Mapping):
params = params.items() params = params.items()
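With the __getstate__/__setstate__ pair above, a trained Booster survives a pickle round trip; a minimal sketch, assuming a LibSVM file at a placeholder path:

```python
import pickle
import xgboost as xgb

dtrain = xgb.DMatrix('train.libsvm')
bst = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=5)

blob = pickle.dumps(bst)      # the ctypes handle is replaced by the raw model bytes
bst2 = pickle.loads(blob)     # rebuilt through XGBoosterLoadModelFromBuffer
preds = bst2.predict(dtrain)  # same trees, hence same predictions as bst.predict(dtrain)
```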
@ -427,6 +483,11 @@ class Booster(object):
""" """
Predict with data. Predict with data.
NOTE: This function is not thread safe.
For each booster object, predict can only be called from one thread.
If you want to run prediction from multiple threads, call bst.copy() to make copies
of the model object and then call predict on each copy
Parameters Parameters
---------- ----------
data : DMatrix data : DMatrix
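A minimal sketch of that advice, assuming a trained `bst` and two DMatrices `d0` and `d1` already exist:

```python
import threading

copies = [bst.copy() for _ in range(2)]   # one independent Booster per thread
results = [None, None]

def worker(i, dmat):
    results[i] = copies[i].predict(dmat)  # each thread predicts with its own copy

threads = [threading.Thread(target=worker, args=(i, d)) for i, d in enumerate((d0, d1))]
for t in threads:
    t.start()
for t in threads:
    t.join()
```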
@ -468,9 +529,25 @@ class Booster(object):
Parameters Parameters
---------- ----------
fname : string fname : string
Output file name. Output file name
""" """
if isinstance(fname, string_types): # assume file name
xglib.XGBoosterSaveModel(self.handle, c_str(fname)) xglib.XGBoosterSaveModel(self.handle, c_str(fname))
else:
raise TypeError("fname must be a string")
def save_raw(self):
"""
Save the model to an in-memory buffer representation
Returns
-------
an in-memory buffer representation of the model
"""
length = ctypes.c_ulong()
cptr = xglib.XGBoosterGetModelRaw(self.handle,
ctypes.byref(length))
return ctypes2buffer(cptr, length.value)
def load_model(self, fname): def load_model(self, fname):
""" """
@ -478,10 +555,16 @@ class Booster(object):
Parameters Parameters
---------- ----------
fname : string fname : string or a memory buffer
Input file name. Input file name or memory buffer (see also save_raw)
""" """
if isinstance(fname, str): # assume file name
xglib.XGBoosterLoadModel(self.handle, c_str(fname)) xglib.XGBoosterLoadModel(self.handle, c_str(fname))
else:
buf = fname
length = ctypes.c_ulong(len(buf))
ptr = (ctypes.c_char * len(buf)).from_buffer(buf)
xglib.XGBoosterLoadModelFromBuffer(self.handle, ptr, length)
def dump_model(self, fo, fmap='', with_stats=False): def dump_model(self, fo, fmap='', with_stats=False):
""" """
@ -622,7 +705,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
maximize_score = False maximize_score = False
if 'eval_metric' in params: if 'eval_metric' in params:
maximize_metrics = ('auc', 'map', 'ndcg') maximize_metrics = ('auc', 'map', 'ndcg')
if filter(lambda x: params['eval_metric'].startswith(x), maximize_metrics): if list(filter(lambda x: params['eval_metric'].startswith(x), maximize_metrics)):
maximize_score = True maximize_score = True
if maximize_score: if maximize_score:
@ -659,11 +742,11 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg)) sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg))
bst.best_score = best_score bst.best_score = best_score
bst.best_iteration = best_score_i bst.best_iteration = best_score_i
break
bst.best_score = best_score
bst.best_iteration = best_score_i
return bst return bst
return bst
class CVPack(object): class CVPack(object):
def __init__(self, dtrain, dtest, param): def __init__(self, dtrain, dtest, param):
self.dtrain = dtrain self.dtrain = dtrain
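A sketch of how these attributes are consumed, assuming `params`, `dtrain` and `dvalid` are already defined and that the keyword argument is named `early_stopping_rounds` in this version:

```python
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
bst = xgb.train(params, dtrain, num_boost_round=200, evals=watchlist,
                early_stopping_rounds=10)
# after stopping, the booster records the best round seen on the last evaluation set
print(bst.best_score, bst.best_iteration)
```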
@ -815,12 +898,15 @@ class XGBModel(XGBModelBase):
The initial prediction score of all instances, global bias. The initial prediction score of all instances, global bias.
seed : int seed : int
Random number seed. Random number seed.
missing : float, optional
Value in the data which needs to be present as a missing value. If
None, defaults to np.nan.
""" """
def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="reg:linear", def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="reg:linear",
nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
base_score=0.5, seed=0): base_score=0.5, seed=0, missing=None):
if not SKLEARN_INSTALLED: if not SKLEARN_INSTALLED:
raise Exception('sklearn needs to be installed in order to use this module') raise XGBoostError('sklearn needs to be installed in order to use this module')
self.max_depth = max_depth self.max_depth = max_depth
self.learning_rate = learning_rate self.learning_rate = learning_rate
self.n_estimators = n_estimators self.n_estimators = n_estimators
@ -836,8 +922,37 @@ class XGBModel(XGBModelBase):
self.base_score = base_score self.base_score = base_score
self.seed = seed self.seed = seed
self.missing = missing if missing is not None else np.nan
self._Booster = Booster() self._Booster = None
def __setstate__(self, state):
# backward compatibility code
# load booster from raw data if it is raw
# the booster now supports pickle
bst = state["_Booster"]
if bst is not None and not isinstance(bst, Booster):
state["_Booster"] = Booster(model_file=bst)
self.__dict__.update(state)
def booster(self):
"""
Get the underlying xgboost Booster of this model.
Will raise an exception when fit was not called.
Returns
-------
booster : a xgboost booster of underlying model
"""
if self._Booster is None:
raise XGBoostError('need to call fit beforehand')
return self._Booster
def get_params(self, deep=False):
params = super(XGBModel, self).get_params(deep=deep)
if params['missing'] is np.nan:
params['missing'] = None # sklearn doesn't handle nan. see #4725
return params
def get_xgb_params(self): def get_xgb_params(self):
xgb_params = self.get_params() xgb_params = self.get_params()
@ -849,13 +964,13 @@ class XGBModel(XGBModelBase):
return xgb_params return xgb_params
def fit(self, X, y): def fit(self, X, y):
trainDmatrix = DMatrix(X, label=y) trainDmatrix = DMatrix(X, label=y, missing=self.missing)
self._Booster = train(self.get_xgb_params(), trainDmatrix, self.n_estimators) self._Booster = train(self.get_xgb_params(), trainDmatrix, self.n_estimators)
return self return self
def predict(self, X): def predict(self, X):
testDmatrix = DMatrix(X) testDmatrix = DMatrix(X, missing=self.missing)
return self._Booster.predict(testDmatrix) return self.booster().predict(testDmatrix)
class XGBClassifier(XGBModel, XGBClassifier): class XGBClassifier(XGBModel, XGBClassifier):
@@ -865,15 +980,15 @@ class XGBClassifier(XGBModel, XGBClassifier):
     def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="binary:logistic",
                  nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
-                 base_score=0.5, seed=0):
+                 base_score=0.5, seed=0, missing=None):
         super(XGBClassifier, self).__init__(max_depth, learning_rate, n_estimators, silent, objective,
                                             nthread, gamma, min_child_weight, max_delta_step, subsample,
                                             colsample_bytree,
-                                            base_score, seed)
+                                            base_score, seed, missing)

     def fit(self, X, y, sample_weight=None):
-        y_values = list(np.unique(y))
-        self.n_classes_ = len(y_values)
+        self.classes_ = list(np.unique(y))
+        self.n_classes_ = len(self.classes_)
         if self.n_classes_ > 2:
             # Switch to using a multiclass objective in the underlying XGB instance
             self.objective = "multi:softprob"
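Storing `classes_` makes the wrapper behave like other sklearn classifiers, and a label set with more than two classes silently switches the booster to a multiclass objective. A quick sketch of the intended behaviour (random toy data; the printed values are illustrative):

```python
import numpy as np
import xgboost as xgb

X = np.random.rand(30, 4)
y = np.array([0, 1, 2] * 10)           # three classes

clf = xgb.XGBClassifier(n_estimators=5).fit(X, y)
print(clf.classes_)                     # [0, 1, 2]
print(clf.n_classes_)                   # 3
print(clf.objective)                    # switched to "multi:softprob"
print(clf.predict_proba(X).shape)       # (30, 3)
```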
@@ -886,17 +1001,19 @@ class XGBClassifier(XGBModel, XGBClassifier):
         training_labels = self._le.transform(y)

         if sample_weight is not None:
-            trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight)
+            trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
+                                   missing=self.missing)
         else:
-            trainDmatrix = DMatrix(X, label=training_labels)
+            trainDmatrix = DMatrix(X, label=training_labels,
+                                   missing=self.missing)

         self._Booster = train(xgb_options, trainDmatrix, self.n_estimators)

         return self

     def predict(self, X):
-        testDmatrix = DMatrix(X)
-        class_probs = self._Booster.predict(testDmatrix)
+        testDmatrix = DMatrix(X, missing=self.missing)
+        class_probs = self.booster().predict(testDmatrix)
         if len(class_probs.shape) > 1:
             column_indexes = np.argmax(class_probs, axis=1)
         else:
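`sample_weight` is simply forwarded to the `DMatrix`, so per-row weights behave the same as in the core API. For illustration (the weights below are arbitrary):

```python
import numpy as np
import xgboost as xgb

X = np.random.rand(40, 3)
y = np.random.randint(0, 2, size=40)

# Up-weight the second half of the training rows.
weights = np.where(np.arange(40) < 20, 1.0, 5.0)

clf = xgb.XGBClassifier(n_estimators=10)
clf.fit(X, y, sample_weight=weights)
print(clf.predict(X[:5]))
```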
@@ -905,8 +1022,8 @@ class XGBClassifier(XGBModel, XGBClassifier):
         return self._le.inverse_transform(column_indexes)

     def predict_proba(self, X):
-        testDmatrix = DMatrix(X)
-        class_probs = self._Booster.predict(testDmatrix)
+        testDmatrix = DMatrix(X, missing=self.missing)
+        class_probs = self.booster().predict(testDmatrix)
         if self.objective == "multi:softprob":
             return class_probs
         else:
@@ -914,7 +1031,6 @@ class XGBClassifier(XGBModel, XGBClassifier):
             classzero_probs = 1.0 - classone_probs
             return np.vstack((classzero_probs, classone_probs)).transpose()

 class XGBRegressor(XGBModel, XGBRegressor):
     __doc__ = """
     Implementation of the scikit-learn API for XGBoost regression
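For the binary objective the booster returns only the positive-class probability, so `predict_proba` stacks `1 - p` and `p` into the `(n_samples, 2)` layout sklearn expects. The branch above is equivalent to roughly this (standalone helper written for illustration):

```python
import numpy as np

def stack_binary_probs(classone_probs):
    """Mimic the binary branch of XGBClassifier.predict_proba above."""
    classone_probs = np.asarray(classone_probs)
    classzero_probs = 1.0 - classone_probs
    return np.vstack((classzero_probs, classone_probs)).transpose()

print(stack_binary_probs([0.1, 0.8, 0.5]))
# [[0.9 0.1]
#  [0.2 0.8]
#  [0.5 0.5]]
```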
View File
@@ -58,13 +58,14 @@ class Booster: public learner::BoostLearner {
   }
   inline void LoadModelFromBuffer(const void *buf, size_t size) {
     utils::MemoryFixSizeBuffer fs((void*)buf, size);
-    learner::BoostLearner::LoadModel(fs);
+    learner::BoostLearner::LoadModel(fs, true);
     this->init_model = true;
   }
   inline const char *GetModelRaw(bst_ulong *out_len) {
+    this->CheckInitModel();
     model_str.resize(0);
     utils::MemoryBufferStream fs(&model_str);
-    learner::BoostLearner::SaveModel(fs);
+    learner::BoostLearner::SaveModel(fs, false);
     *out_len = static_cast<bst_ulong>(model_str.length());
     if (*out_len == 0) {
       return NULL;
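`GetModelRaw` and `LoadModelFromBuffer` are what back the in-memory save/load that pickling relies on from Python. A hedged sketch, assuming `Booster.save_raw()` and the `Booster(model_file=<raw buffer>)` path that `__setstate__` above uses:

```python
import numpy as np
import xgboost as xgb

dtrain = xgb.DMatrix(np.random.rand(10, 3), label=np.random.rand(10))
bst = xgb.train({'objective': 'reg:linear'}, dtrain, num_boost_round=3)

raw = bst.save_raw()                 # serialized via GetModelRaw
bst2 = xgb.Booster(model_file=raw)   # restored via LoadModelFromBuffer
print(bst2.predict(dtrain)[:3])
```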
@@ -322,8 +323,10 @@ extern "C"{
   void XGBoosterLoadModel(void *handle, const char *fname) {
     static_cast<Booster*>(handle)->LoadModel(fname);
   }
-  void XGBoosterSaveModel(const void *handle, const char *fname) {
-    static_cast<const Booster*>(handle)->SaveModel(fname);
+  void XGBoosterSaveModel(void *handle, const char *fname) {
+    Booster *bst = static_cast<Booster*>(handle);
+    bst->CheckInitModel();
+    bst->SaveModel(fname, false);
   }
   void XGBoosterLoadModelFromBuffer(void *handle, const void *buf, bst_ulong len) {
     static_cast<Booster*>(handle)->LoadModelFromBuffer(buf, len);
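`XGBoosterSaveModel` now runs `CheckInitModel` before serializing, presumably because that check may have to initialize internal state, which is also why the handle is no longer `const` (see the header change below). From Python the same entry points are reached through `save_model` and the `model_file` constructor argument; a file-based round trip might look like this (file name chosen for illustration):

```python
import numpy as np
import xgboost as xgb

dtrain = xgb.DMatrix(np.random.rand(10, 3), label=np.random.rand(10))
bst = xgb.train({'objective': 'reg:linear'}, dtrain, num_boost_round=3)

# Booster.save_model / Booster(model_file=...) call into
# XGBoosterSaveModel / XGBoosterLoadModel shown above.
bst.save_model('xgb.model')
bst2 = xgb.Booster(model_file='xgb.model')
print(bst2.predict(dtrain)[:3])
```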
View File
@@ -203,7 +203,7 @@ extern "C" {
    * \param handle handle
    * \param fname file name
    */
-  XGB_DLL void XGBoosterSaveModel(const void *handle, const char *fname);
+  XGB_DLL void XGBoosterSaveModel(void *handle, const char *fname);
   /*!
    * \brief load model from in memory buffer
    * \param handle handle