Update README.md

0.4
update date
2015-05-11 23:44:02 -07:00 · 2015-05-11 23:42:49 -07:00 · 2015-05-11 20:58:41 -07:00 · 2015-05-11 20:55:09 -07:00 · 2015-05-11 16:59:18 -07:00 · 2015-05-11 16:55:14 -07:00
255 changed files with 22671 additions and 1688 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
 *.slo
 *.lo
 *.o
-
+*.page
 # Compiled Dynamic libraries
 *.so
 *.dylib
@@ -44,3 +44,17 @@ Debug
 *dump
 *save
 *csv
 .Rproj.user
 *.cpage.col
 *.cpage
 *.Rproj
 xgboost
 xgboost.mpi
 xgboost.mock
 train*
 rabit
 #.Rbuildignore
 R-package.Rproj
 *.cache*
 R-package/inst
 R-package/src
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -20,3 +20,17 @@ xgboost-0.3
 * Linear booster is now parallelized, using parallel coordinated descent.
 * Add [Code Guide](src/README.md) for customizing objective function and evaluation
 * Add R module
 xgboost-0.4
 =====
 * Distributed version of xgboost that runs on YARN, scales to billions of examples
 * Direct save/load data and model from/to S3 and HDFS
 * Feature importance visualization in R module, by Michael Benesty
 * Predict leaf index
 * Poisson regression for counts data
 * Early stopping option in training
 * Native save load support in R and python
  - xgboost models now can be saved using save/load in R
  - xgboost python model is now picklable
 * sklearn wrapper is supported in python module
 * Experimental External memory version
--- a/102
+++ b/102
@@ -1,8 +1,13 @@
 export CC  = gcc
 export CXX = g++
 export MPICXX = mpicxx
 export LDFLAGS= -pthread -lm 
 export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fPIC
-export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fPIC -pedantic 
+ifeq ($(OS), Windows_NT)
 	export CXX = g++ -m64
 	export CC = gcc -m64
 endif
 ifeq ($(no_omp),1)
 	CFLAGS += -DDISABLE_OPENMP 
@@ -10,56 +15,117 @@ else
 	CFLAGS += -fopenmp
 endif
 # by default use c++11
 ifeq ($(cxx11),1)
 	CFLAGS += -std=c++11
 else 
 endif
 # handling dmlc
 ifdef dmlc
 	ifndef config
 		ifneq ("$(wildcard $(dmlc)/config.mk)","")
 			config = $(dmlc)/config.mk
 		else
 			config = $(dmlc)/make/config.mk
 		endif	
 	endif
 	include $(config)
 	include $(dmlc)/make/dmlc.mk
 	LDFLAGS+= $(DMLC_LDFLAGS)
 	LIBDMLC=$(dmlc)/libdmlc.a
 else
 	LIBDMLC=dmlc_simple.o
 endif
 ifeq ($(OS), Windows_NT)
 	LIBRABIT = subtree/rabit/lib/librabit_empty.a
 	SLIB = wrapper/xgboost_wrapper.dll
 else
 	LIBRABIT = subtree/rabit/lib/librabit.a
 	SLIB = wrapper/libxgboostwrapper.so
 endif
 # specify tensor path
 BIN = xgboost
-OBJ = updater.o gbm.o io.o
+MOCKBIN = xgboost.mock
-SLIB = wrapper/libxgboostwrapper.so 
+OBJ = updater.o gbm.o io.o main.o dmlc_simple.o
 MPIBIN =
 TARGET = $(BIN) $(OBJ) $(SLIB)
-.PHONY: clean all python Rpack
+.PHONY: clean all mpi python Rpack
-all: $(BIN) $(OBJ) $(SLIB) 
+all: $(BIN) $(OBJ) $(SLIB)
 mpi: $(MPIBIN)
 python: wrapper/libxgboostwrapper.so
 # now the wrapper takes in two files. io and wrapper part
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
+updater.o: src/tree/updater.cpp  src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
-updater.o: src/tree/updater.cpp  src/tree/*.hpp src/*.h src/tree/*.h
+dmlc_simple.o: src/io/dmlc_simple.cpp src/utils/*.h
-gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
+gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h 
 io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
-xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
+main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h 
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
+xgboost:  updater.o gbm.o io.o main.o $(LIBRABIT) $(LIBDMLC)
 wrapper/xgboost_wrapper.dll wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h  updater.o gbm.o io.o $(LIBRABIT) $(LIBDMLC)
 # dependency on rabit
 # Each rabit library variant is built by rabit's own Makefile.
 # $(MAKE) -C is used instead of "cd dir; make ...; cd back" so that a failed
 # directory change aborts the rule (instead of running make in the wrong
 # directory) and so that -j / -n are handed down to the sub-make.
 subtree/rabit/lib/librabit.a: subtree/rabit/src/engine.cc
 	+$(MAKE) -C subtree/rabit lib/librabit.a
 subtree/rabit/lib/librabit_empty.a: subtree/rabit/src/engine_empty.cc
 	+$(MAKE) -C subtree/rabit lib/librabit_empty.a
 subtree/rabit/lib/librabit_mock.a: subtree/rabit/src/engine_mock.cc
 	+$(MAKE) -C subtree/rabit lib/librabit_mock.a
 subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc
 	+$(MAKE) -C subtree/rabit lib/librabit_mpi.a
 $(BIN) : 
-	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
+	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) 
 $(MOCKBIN) : 
 	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) 
 $(SLIB) :
-	$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
+	$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) $(DLLFLAGS)
 $(OBJ) : 
-	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
 $(MPIOBJ) : 
 	$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) 
 $(MPIBIN) : 
 	$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) 
 install:
 	cp -f -r $(BIN)  $(INSTALL_PATH)
 # Rpack: assemble a self-contained copy of the R package in ./xgboost
 # (R-package plus the C++ sources, rabit and the wrapper), build the source
 # tarball with R CMD build, remove the staging directory and run
 # R CMD check --as-cran on the tarball.
 # Sub-makes go through $(MAKE) so flags propagate and a missing
 # subtree/rabit directory fails the rule instead of cleaning the wrong tree.
 Rpack:
 	$(MAKE) clean
 	$(MAKE) -C subtree/rabit clean
 	rm -rf xgboost xgboost*.tar.gz
 	cp -r R-package xgboost
 	rm -rf xgboost/inst/examples/*.buffer
 	rm -rf xgboost/inst/examples/*.model
 	rm -rf xgboost/inst/examples/dump*
 	rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
 	rm -rf xgboost/src/*/*.o
 	rm -rf subtree/rabit/src/*.o
 	rm -rf xgboost/demo/*.model xgboost/demo/*.buffer xgboost/demo/*.txt
 	rm -rf xgboost/demo/runall.R
 	cp -r src xgboost/src/src
 	mkdir xgboost/src/subtree
 	mkdir xgboost/src/subtree/rabit
 	cp -r subtree/rabit/include xgboost/src/subtree/rabit/include
 	cp -r subtree/rabit/src xgboost/src/subtree/rabit/src
 	rm -rf xgboost/src/subtree/rabit/src/*.o
 	mkdir xgboost/src/wrapper
 	cp  wrapper/xgboost_wrapper.h xgboost/src/wrapper
 	cp  wrapper/xgboost_wrapper.cpp xgboost/src/wrapper
 	cp ./LICENSE xgboost
 	cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars
-	cat R-package/src/Makevars.win|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars.win
+	cp xgboost/src/Makevars xgboost/src/Makevars.win
 	# R CMD build --no-build-vignettes xgboost
 	R CMD build xgboost
 	rm -rf xgboost
 	R CMD check --as-cran xgboost*.tar.gz
 # clean: remove objects, binaries, the wrapper library and editor backups,
 # then clean the rabit subtree through its own Makefile.
 # $(MAKE) -C replaces "cd dir; make clean; cd .." so a failed cd cannot
 # re-run "make clean" in this directory.
 clean:
-	$(RM) $(OBJ) $(BIN) $(SLIB) *.o  */*.o */*/*.o *~ */*~ */*/*~
+	$(RM) -rf $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o  */*.o */*/*.o *~ */*~ */*/*~
 	$(MAKE) -C subtree/rabit clean
--- a/R-package/.Rbuildignore
+++ b/R-package/.Rbuildignore
@@ -0,0 +1,5 @@
 \.o$
 \.so$
 \.dll$
 ^.*\.Rproj$
 ^\.Rproj\.user$
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,24 +1,34 @@
 Package: xgboost
 Type: Package
 Title: eXtreme Gradient Boosting
-Version: 0.3-2
+Version: 0.4-0
-Date: 2014-08-23
+Date: 2015-05-11
-Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>
+Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>, Michael Benesty <michael@benesty.fr>
 Maintainer: Tong He <hetong007@gmail.com>
-Description: This package is a R wrapper of xgboost, which is short for eXtreme
+Description: Xgboost is short for eXtreme Gradient Boosting, which is an 
-    Gradient Boosting. It is an efficient and scalable implementation of
+    efficient and scalable implementation of gradient boosting framework. 
-    gradient boosting framework. The package includes efficient linear model
+    This package is an R wrapper of xgboost. The package includes efficient 
-    solver and tree learning algorithms. The package can automatically do
+    linear model solver and tree learning algorithms. The package can automatically 
-    parallel computation with OpenMP, and it can be more than 10 times faster
+    do parallel computation with OpenMP, and it can be more than 10 times faster
    than existing gradient boosting packages such as gbm. It supports various
    objective functions, including regression, classification and ranking. The
    package is made to be extensible, so that users are also allowed to define
    their own objectives easily.
 License: Apache License (== 2.0) | file LICENSE
-URL: https://github.com/tqchen/xgboost
+URL: https://github.com/dmlc/xgboost
-BugReports: https://github.com/tqchen/xgboost/issues
+BugReports: https://github.com/dmlc/xgboost/issues
 VignetteBuilder: knitr
 Suggests:
    knitr,
    ggplot2 (>= 1.0.0),
    DiagrammeR (>= 0.6),
    Ckmeans.1d.dp (>= 3.3.1),
    vcd (>= 1.3)
 Depends:
    R (>= 2.10)
 Imports:
    Matrix (>= 1.1-0),
-    methods
+    methods,
    data.table (>= 1.9.4),
    magrittr (>= 1.5),
    stringr (>= 0.6.2)
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -1,4 +1,4 @@
-# Generated by roxygen2 (4.0.1): do not edit by hand
+# Generated by roxygen2 (4.1.1): do not edit by hand
 export(getinfo)
 export(setinfo)
@@ -7,11 +7,37 @@ export(xgb.DMatrix)
 export(xgb.DMatrix.save)
 export(xgb.cv)
 export(xgb.dump)
 export(xgb.importance)
 export(xgb.load)
 export(xgb.model.dt.tree)
 export(xgb.plot.importance)
 export(xgb.plot.tree)
 export(xgb.save)
 export(xgb.save.raw)
 export(xgb.train)
 export(xgboost)
 exportMethods(nrow)
 exportMethods(predict)
 import(methods)
 importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgeMatrix)
 importFrom(Matrix,cBind)
 importFrom(Matrix,colSums)
 importFrom(Matrix,sparseVector)
 importFrom(data.table,":=")
 importFrom(data.table,as.data.table)
 importFrom(data.table,copy)
 importFrom(data.table,data.table)
 importFrom(data.table,fread)
 importFrom(data.table,rbindlist)
 importFrom(data.table,set)
 importFrom(data.table,setnames)
 importFrom(magrittr,"%>%")
 importFrom(magrittr,add)
 importFrom(magrittr,not)
 importFrom(stringr,str_extract)
 importFrom(stringr,str_extract_all)
 importFrom(stringr,str_match)
 importFrom(stringr,str_replace)
 importFrom(stringr,str_split)
 importFrom(stringr,str_trim)
--- a/R-package/R/getinfo.xgb.DMatrix.R
+++ b/R-package/R/getinfo.xgb.DMatrix.R
@@ -4,6 +4,15 @@ setClass('xgb.DMatrix')
 #' 
 #' Get information of an xgb.DMatrix object
 #' 
 #' The information can be one of the following:
 #' 
 #' \itemize{
 #'     \item \code{label}: label Xgboost learn from ;
 #'     \item \code{weight}: to do a weight rescale ;
 #'     \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
 #'     \item \code{nrow}: number of rows of the \code{xgb.DMatrix}.
 #' }
 #' 
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' train <- agaricus.train
@@ -19,7 +28,9 @@ getinfo <- function(object, ...){
    UseMethod("getinfo")
 }
-#' @param object Object of class "xgb.DMatrix"
+
 #' @param object Object of class \code{xgb.DMatrix}
 #' @param name the name of the field to get
 #' @param ... other parameters
 #' @rdname getinfo
@@ -32,10 +43,15 @@ setMethod("getinfo", signature = "xgb.DMatrix",
              if (class(object) != "xgb.DMatrix") {
                  stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
              }
-              if (name != "label" && name != "weight" && name != "base_margin") {
+              if (name != "label" && name != "weight" && 
                      name != "base_margin" && name != "nrow") {
                  stop(paste("xgb.getinfo: unknown info name", name))
              }
-              ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
+              if (name != "nrow"){
                  ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
              } else {
                  ret <- xgb.numrow(object)
              }
              return(ret)
          })
--- a/R-package/R/nrow.xgb.DMatrix.R
+++ b/R-package/R/nrow.xgb.DMatrix.R
@@ -0,0 +1,19 @@
 setGeneric("nrow")
 #' @title Number of xgb.DMatrix rows
 #' @description \code{nrow} returns the number of rows held by an \code{xgb.DMatrix},
 #'   as reported by the internal helper \code{xgb.numrow}.
 #' @param x Object of class \code{xgb.DMatrix}
 #' 
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' train <- agaricus.train
 #' dtrain <- xgb.DMatrix(train$data, label=train$label)
 #' stopifnot(nrow(dtrain) == nrow(train$data))
 #' 
 #' @export
 setMethod("nrow", signature = "xgb.DMatrix",
           definition = function(x) xgb.numrow(x))
--- a/R-package/R/predict.xgb.Booster.R
+++ b/R-package/R/predict.xgb.Booster.R
@@ -1,4 +1,7 @@
-setClass("xgb.Booster")
+setClass("xgb.Booster.handle")
 setClass("xgb.Booster",
         slots = c(handle = "xgb.Booster.handle",
                   raw = "raw"))
 #' Predict method for eXtreme Gradient Boosting model
 #' 
@@ -7,6 +10,8 @@ setClass("xgb.Booster")
 #' @param object Object of class "xgb.Boost"
 #' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or 
 #'   \code{xgb.DMatrix}. 
 #' @param missing Missing is only used when input is dense matrix, pick a float 
 #'     value that represents missing value. Sometimes data use 0 or another extreme value to represent missing values.
 #' @param outputmargin whether the prediction should be shown in the original
 #'   value of sum of functions, when outputmargin=TRUE, the prediction is 
 #'   untransformed margin value. In logistic regression, outputmargin=T will
@@ -14,20 +19,31 @@ setClass("xgb.Booster")
 #' @param ntreelimit limit number of trees used in prediction, this parameter is
 #'  only valid for gbtree, but not for gblinear. set it to be value bigger 
 #'  than 0. It will use all trees by default.
 #' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object.
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #' train <- agaricus.train
 #' test <- agaricus.test
 #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, 
-#'                eta = 1, nround = 2,objective = "binary:logistic")
+#'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #' pred <- predict(bst, test$data)
 #' @export
 #' 
 setMethod("predict", signature = "xgb.Booster", 
-          definition = function(object, newdata, outputmargin = FALSE, ntreelimit = NULL) {
+          definition = function(object, newdata, missing = NULL, 
                                outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) {
  if (class(object) != "xgb.Booster"){
    stop("predict: model in prediction must be of class xgb.Booster")
  } else {
    object <- xgb.Booster.check(object, saveraw = FALSE)
  }
  if (class(newdata) != "xgb.DMatrix") {
-    newdata <- xgb.DMatrix(newdata)
+    if (is.null(missing)) {
      newdata <- xgb.DMatrix(newdata)
    } else {
      newdata <- xgb.DMatrix(newdata, missing = missing)
    }
  }
  if (is.null(ntreelimit)) {
    ntreelimit <- 0
@@ -36,7 +52,24 @@ setMethod("predict", signature = "xgb.Booster",
      stop("predict: ntreelimit must be equal to or greater than 1")
    }
  }
-  ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(outputmargin), as.integer(ntreelimit), PACKAGE = "xgboost")
+  option = 0
  if (outputmargin) {
    option <- option + 1
  }
  if (predleaf) {
    option <- option + 2
  }
  ret <- .Call("XGBoosterPredict_R", object$handle, newdata, as.integer(option), 
               as.integer(ntreelimit), PACKAGE = "xgboost")
  if (predleaf){
      len <- getinfo(newdata, "nrow")
      if (length(ret) == len){
          ret <- matrix(ret,ncol = 1)
      } else {
          ret <- matrix(ret, ncol = len)
          ret <- t(ret)
      }
  }
  return(ret)
 })
- 
+
--- a/R-package/R/predict.xgb.Booster.handle.R
+++ b/R-package/R/predict.xgb.Booster.handle.R
@@ -0,0 +1,19 @@
 #' Predict method for an eXtreme Gradient Boosting model handle
 #' 
 #' Wraps the \code{xgb.Booster.handle} into an \code{xgb.Booster} and forwards
 #' the call to the \code{xgb.Booster} predict method.
 #' 
 #' @param object Object of class \code{xgb.Booster.handle}
 #' @param ... Parameters passed on to \code{predict.xgb.Booster}
 #' 
 setMethod("predict", signature = "xgb.Booster.handle", 
           definition = function(object, ...) {
   # defensive check in addition to S4 dispatch on the signature
   if (!(class(object) == "xgb.Booster.handle")) {
     stop("predict: model in prediction must be of class xgb.Booster.handle")
   }
   predict(xgb.handleToBooster(object), ...)
 })
--- a/R-package/R/setinfo.xgb.DMatrix.R
+++ b/R-package/R/setinfo.xgb.DMatrix.R
@@ -2,6 +2,15 @@
 #' 
 #' Set information of an xgb.DMatrix object
 #' 
 #' It can be one of the following:
 #' 
 #' \itemize{
 #'     \item \code{label}: label Xgboost learn from ;
 #'     \item \code{weight}: to do a weight rescale ;
 #'     \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
 #'     \item \code{group}.
 #' }
 #' 
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' train <- agaricus.train
--- a/R-package/R/slice.xgb.DMatrix.R
+++ b/R-package/R/slice.xgb.DMatrix.R
@@ -28,6 +28,18 @@ setMethod("slice", signature = "xgb.DMatrix",
              if (class(object) != "xgb.DMatrix") {
                  stop("slice: first argument dtrain must be xgb.DMatrix")
              }
-              ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
+              ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, 
                           PACKAGE = "xgboost")
              attr_list <- attributes(object)
              nr <- xgb.numrow(object)
              len <- sapply(attr_list,length)
              ind <- which(len==nr)
              if (length(ind)>0) {
                  nms <- names(attr_list)[ind]
                  for (i in 1:length(ind)) {
                    attr(ret,nms[i]) <- attr(object,nms[i])[idxset]
                  }
              }
              return(structure(ret, class = "xgb.DMatrix"))
          })
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -15,21 +15,29 @@ xgb.setinfo <- function(dmat, name, info) {
    stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
  }
  if (name == "label") {
    if (length(info)!=xgb.numrow(dmat))
      stop("The length of labels must equal to the number of rows in the input data")
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), 
          PACKAGE = "xgboost")
    return(TRUE)
  }
  if (name == "weight") {
    if (length(info)!=xgb.numrow(dmat))
      stop("The length of weights must equal to the number of rows in the input data")
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), 
          PACKAGE = "xgboost")
    return(TRUE)
  }
  if (name == "base_margin") {
    # if (length(info)!=xgb.numrow(dmat))
    #   stop("The length of base margin must equal to the number of rows in the input data")
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), 
          PACKAGE = "xgboost")
    return(TRUE)
  }
  if (name == "group") {
    if (sum(info)!=xgb.numrow(dmat))
      stop("The sum of groups must equal to the number of rows in the input data")
    .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), 
          PACKAGE = "xgboost")
    return(TRUE)
@@ -57,24 +65,55 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
    }
  }
  if (!is.null(modelfile)) {
-    if (typeof(modelfile) != "character") {
+    if (typeof(modelfile) == "character") {
-      stop("xgb.Booster: modelfile must be character")
+      .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost")
    } else if (typeof(modelfile) == "raw") {
      .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost")      
    } else {
      stop("xgb.Booster: modelfile must be character or raw vector")
    }
    .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost")
  }
-  return(structure(handle, class = "xgb.Booster"))
+  return(structure(handle, class = "xgb.Booster.handle"))
 }
 # Wrap an xgb.Booster.handle into an xgb.Booster: a list holding the handle
 # and an optional raw model (NULL when not supplied), with class "xgb.Booster".
 xgb.handleToBooster <- function(handle, raw = NULL)
 {
   structure(list(handle = handle, raw = raw), class = "xgb.Booster")
 }
 # Check whether an xgb.Booster object is complete and repair it if not.
 #  - handle absent, or flagged by XGCheckNullPtr_R: recreate the handle
 #    from bst$raw via xgb.Booster(modelfile = ...).
 #  - handle usable: when saveraw is TRUE and no raw copy is stored yet,
 #    store one obtained from xgb.save.raw.
 # Returns the (possibly updated) bst.
 xgb.Booster.check <- function(bst, saveraw = TRUE)
 {
   handle_is_null <- is.null(bst$handle)
   if (!handle_is_null) {
     handle_is_null <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost")
   }
   if (!handle_is_null) {
     if (is.null(bst$raw) && saveraw) {
       bst$raw <- xgb.save.raw(bst$handle)
     }
     return(bst)
   }
   bst$handle <- xgb.Booster(modelfile = bst$raw)
   bst
 }
 ## ----the following are low level iteratively function, not needed if
 ## you do not want to use them ---------------------------------------
 # get dmatrix from data, label
-xgb.get.DMatrix <- function(data, label = NULL) {
+xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) {
  inClass <- class(data)
  if (inClass == "dgCMatrix" || inClass == "matrix") {
    if (is.null(label)) {
      stop("xgboost: need label when data is a matrix")
    }
-    dtrain <- xgb.DMatrix(data, label = label)
+    if (is.null(missing)){
      dtrain <- xgb.DMatrix(data, label = label)
    } else {
      dtrain <- xgb.DMatrix(data, label = label, missing = missing)
    }
  } else {
    if (!is.null(label)) {
      warning("xgboost: label will be ignored.")
@@ -95,8 +134,8 @@ xgb.numrow <- function(dmat) {
 }
 # iteratively update booster with customized statistics
 xgb.iter.boost <- function(booster, dtrain, gpair) {
-  if (class(booster) != "xgb.Booster") {
+  if (class(booster) != "xgb.Booster.handle") {
-    stop("xgb.iter.update: first argument must be type xgb.Booster")
+    stop("xgb.iter.update: first argument must be type xgb.Booster.handle")
  }
  if (class(dtrain) != "xgb.DMatrix") {
    stop("xgb.iter.update: second argument must be type xgb.DMatrix")
@@ -108,8 +147,8 @@ xgb.iter.boost <- function(booster, dtrain, gpair) {
 # iteratively update booster with dtrain
 xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
-  if (class(booster) != "xgb.Booster") {
+  if (class(booster) != "xgb.Booster.handle") {
-    stop("xgb.iter.update: first argument must be type xgb.Booster")
+    stop("xgb.iter.update: first argument must be type xgb.Booster.handle")
  }
  if (class(dtrain) != "xgb.DMatrix") {
    stop("xgb.iter.update: second argument must be type xgb.DMatrix")
@@ -127,8 +166,8 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
 }
 # iteratively evaluate one iteration
-xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
+xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = FALSE) {
-  if (class(booster) != "xgb.Booster") {
+  if (class(booster) != "xgb.Booster.handle") {
    stop("xgb.eval: first argument must be type xgb.Booster")
  }
  if (typeof(watchlist) != "list") {
@@ -158,41 +197,82 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
        if (length(names(w)) == 0) {
          stop("xgb.eval: name tag must be presented for every elements in watchlist")
        }
-        ret <- feval(predict(booster, w[[1]]), w[[1]])
+        preds <- predict(booster, w[[1]])
        ret <- feval(preds, w[[1]])
        msg <- paste(msg, "\t", names(w), "-", ret$metric, ":", ret$value, sep="")
      }
    }
  } else {
    msg <- ""
  }
  if (prediction){
    preds <- predict(booster,watchlist[[2]])
    return(list(msg,preds))
  }
  return(msg)
-} 
+}
 #------------------------------------------
 # helper functions for cross validation
 #
-xgb.cv.mknfold <- function(dall, nfold, param) {
+xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
-  randidx <- sample(1 : xgb.numrow(dall))
+  if (nfold <= 1) {
-  kstep <- length(randidx) / nfold
+    stop("nfold must be bigger than 1")
-  idset <- list()
+  }
-  for (i in 1:nfold) {
+  if(is.null(folds)) {
-    idset[[i]] <- randidx[ ((i-1) * kstep + 1) : min(i * kstep, length(randidx)) ]
+    if (exists('objective', where=param) && strtrim(param[['objective']], 5) == 'rank:') {
      stop("\tAutomatic creation of CV-folds is not implemented for ranking!\n",
           "\tConsider providing pre-computed CV-folds through the folds parameter.")
    }
    y <- getinfo(dall, 'label')
    randidx <- sample(1 : xgb.numrow(dall))
    if (stratified & length(y) == length(randidx)) {
      y <- y[randidx]
      #
      # WARNING: some heuristic logic is employed to identify classification setting!
      #
      # For classification, need to convert y labels to factor before making the folds,
      # and then do stratification by factor levels.
      # For regression, leave y numeric and do stratification by quantiles.
      if (exists('objective', where=param)) {
        # If 'objective' provided in params, assume that y is a classification label
        # unless objective is reg:linear
        if (param[['objective']] != 'reg:linear') y <- factor(y)
      } else {
        # If no 'objective' given in params, it means that user either wants to use
        # the default 'reg:linear' objective or has provided a custom obj function.
        # Here, assume classification setting when y has 5 or less unique values:
        if (length(unique(y)) <= 5) y <- factor(y)
      }
      folds <- xgb.createFolds(y, nfold)
    } else { 
      # make simple non-stratified folds
      kstep <- length(randidx) %/% nfold
      folds <- list()
      for (i in 1:(nfold-1)) {
        folds[[i]] = randidx[1:kstep]
        randidx = setdiff(randidx, folds[[i]])
      }
      folds[[nfold]] = randidx
    }
  }
  ret <- list()
  for (k in 1:nfold) {
-    dtest <- slice(dall, idset[[k]])
+    dtest <- slice(dall, folds[[k]])
    didx = c()
    for (i in 1:nfold) {
      if (i != k) {
-        didx <- append(didx, idset[[i]])
+        didx <- append(didx, folds[[i]])
      }
    }
    dtrain <- slice(dall, didx)
    bst <- xgb.Booster(param, list(dtrain, dtest))
    watchlist = list(train=dtrain, test=dtest)
-    ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist)
+    ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=folds[[k]])
  }
  return (ret)
 }
 xgb.cv.aggcv <- function(res, showsd = TRUE) {
  header <- res[[1]]
  ret <- header[1]
@@ -212,3 +292,53 @@ xgb.cv.aggcv <- function(res, showsd = TRUE) {
  }
  return (ret)
 }
 # Shamelessly copied from caret::createFolds
 # and simplified by always returning an unnamed list of test indices
 #
 # Arguments:
 #   y  vector of outcomes; numeric y is binned by quantiles before
 #      stratification, anything else is stratified by its distinct values
 #   k  number of folds
 # Value:
 #   unnamed list; each element holds the indices (into y) of one fold
 #
 # Random draws are made with sample(), so results depend on the RNG state.
 xgb.createFolds <- function(y, k = 10)
 {
   if(is.numeric(y)) {
     ## Group the numeric data based on their magnitudes
     ## and sample within those groups.
     ## When the number of samples is low, we may have
     ## issues further slicing the numeric data into
     ## groups. The number of groups will depend on the
     ## ratio of the number of folds to the sample size.
     ## At most, we will use quantiles. If the sample
     ## is too small, we just do regular unstratified
     ## CV
     cuts <- floor(length(y)/k)
     if(cuts < 2) cuts <- 2
     if(cuts > 5) cuts <- 5
     ## full argument name (length.out) instead of partial matching
     y <- cut(y,
              unique(quantile(y, probs = seq(0, 1, length.out = cuts))),
              include.lowest = TRUE)
   }
   if(k < length(y)) {
     ## reset levels so that the possible levels and
     ## the levels in the vector are the same
     y <- factor(as.character(y))
     numInClass <- table(y)
     foldVector <- vector(mode = "integer", length(y))
     ## For each class, balance the fold allocation as far
     ## as possible, then resample the remainder.
     ## The final assignment of folds is also randomized.
     ## seq_along() (rather than 1:length()) so an empty table is not iterated.
     for(i in seq_along(numInClass)) {
       ## create a vector of integers from 1:k as many times as possible without
       ## going over the number of samples in the class. Note that if the number
       ## of samples in a class is less than k, nothing is produced here.
       seqVector <- rep(1:k, numInClass[i] %/% k)
       ## add enough random integers to get  length(seqVector) == numInClass[i]
       if(numInClass[i] %% k > 0) seqVector <- c(seqVector, sample(1:k, numInClass[i] %% k))
       ## shuffle the integers for fold assignment and assign to this classes's data
       foldVector[which(y == dimnames(numInClass)$y[i])] <- sample(seqVector)
     }
   } else foldVector <- seq_along(y)
   out <- split(seq_along(y), foldVector)
   names(out) <- NULL
   out
 }
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -6,7 +6,7 @@
 #'   indicating the data file.
 #' @param info a list of information of the xgb.DMatrix object
 #' @param missing Missing is only used when input is dense matrix, pick a float
-#     value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
+#'     value that represents missing value. Sometimes data use 0 or another extreme value to represent missing values.
 #
 #' @param ... other information to pass to \code{info}.
 #' 
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -1,7 +1,18 @@
 #' Cross Validation
 #' 
 #' The cross validation function of xgboost
-#'
+#' 
 #' @importFrom data.table data.table
 #' @importFrom data.table as.data.table
 #' @importFrom magrittr %>%
 #' @importFrom data.table :=
 #' @importFrom data.table rbindlist
 #' @importFrom stringr str_extract_all
 #' @importFrom stringr str_extract
 #' @importFrom stringr str_split
 #' @importFrom stringr str_replace
 #' @importFrom stringr str_match
 #' 
 #' @param params the list of parameters. Commonly used ones are:
 #' \itemize{
 #'   \item \code{objective} objective function, common ones are
@@ -14,13 +25,16 @@
 #'   \item \code{nthread} number of thread used in training, if not set, all threads are used
 #' }
 #'
-#'   See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for 
+#'   See \link{xgb.train} for further details.
-#'   further details. See also demo/ for walkthrough example in R.
+#'   See also demo/ for walkthrough example in R.
-#' @param data takes an \code{xgb.DMatrix} as the input.
+#' @param data takes an \code{xgb.DMatrix} or \code{Matrix} as the input.
 #' @param nrounds the max number of iterations
-#' @param nfold number of folds used
+#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. 
-#' @param label option field, when data is Matrix
+#' @param label option field, when data is \code{Matrix}
-#' @param showsd boolean, whether show standard deviation of cross validation
+#' @param missing Missing is only used when input is dense matrix, pick a float
 #'     value that represents missing value. Sometimes data use 0 or another extreme value to represent missing values.
 #' @param prediction A logical value indicating whether to return the prediction vector.
 #' @param showsd \code{boolean}, whether show standard deviation of cross validation
 #' @param metrics, list of evaluation metrics to be used in cross validation,
 #'   when it is not specified, the evaluation metric is chosen according to objective function.
 #'   Possible options are:
@@ -32,55 +46,187 @@
 #'   \item \code{merror} Exact matching error, used to evaluate multi-class classification
 #' }
 #' @param obj customized objective function. Returns gradient and second order 
-#'   gradient with given prediction and dtrain, 
+#'   gradient with given prediction and dtrain.
 #' @param feval customized evaluation function. Returns 
 #'   \code{list(metric='metric-name', value='metric-value')} with given 
-#'   prediction and dtrain,
+#'   prediction and dtrain.
 #' @param stratified \code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}
 #' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
 #'   If folds are supplied, the nfold and stratified parameters would be ignored.
 #' @param verbose \code{boolean}, print the statistics during the process
 #' @param early_stop_round If \code{NULL}, the early stopping function is not triggered. 
 #'     If set to an integer \code{k}, training with a validation set will stop if the performance 
 #'     keeps getting worse consecutively for \code{k} rounds.
 #' @param early.stop.round An alternative of \code{early_stop_round}.
 #' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
 #'     \code{maximize=TRUE} means the larger the evaluation score the better.
 #'     
 #' @param ... other parameters to pass to \code{params}.
 #' 
-#' @details 
+#' @return
-#' This is the cross validation function for xgboost
+#' If \code{prediction = TRUE}, a list with the following elements is returned:
 #' \itemize{
 #'   \item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set
 #'   \item \code{pred} an array or matrix (for multiclass classification) with predictions for each CV-fold for the model having been trained on the data in all other folds.
 #' }
 #'
-#' Parallelization is automatically enabled if OpenMP is present.
+#' If \code{prediction = FALSE}, just a \code{data.table} with each mean and standard deviation stat for training set and test set is returned.
-#' Number of threads can also be manually specified via "nthread" parameter.
+#'
 #' @details 
 #' The original sample is randomly partitioned into \code{nfold} equal size subsamples. 
 #' 
-#' This function only accepts an \code{xgb.DMatrix} object as the input.
+#' Of the \code{nfold} subsamples, a single subsample is retained as the validation data for testing the model, and the remaining \code{nfold - 1} subsamples are used as training data. 
 #' 
 #' The cross-validation process is then repeated \code{nrounds} times, with each of the \code{nfold} subsamples used exactly once as the validation data.
 #' 
 #' All observations are used for both training and validation.
 #' 
 #' Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-#' history <- xgb.cv(data = dtrain, nround=3, nfold = 5, metrics=list("rmse","auc"),
+#' history <- xgb.cv(data = dtrain, nround=3, nthread = 2, nfold = 5, metrics=list("rmse","auc"),
-#'                   "max.depth"=3, "eta"=1, "objective"="binary:logistic")
+#'                   max.depth =3, eta = 1, objective = "binary:logistic")
 #' print(history)
 #' @export
 #'
-xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL,
+xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, 
-                   showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, ...) {
+                   prediction = FALSE, showsd = TRUE, metrics=list(), 
                   obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,
                   early_stop_round = NULL, early.stop.round = NULL, maximize = NULL, ...) {
  # Runs nrounds boosting iterations on each of nfold fold-specific boosters,
  # aggregates the per-fold evaluation lines after every round, and finally
  # parses the accumulated history into a data.table (optionally returned
  # together with the out-of-fold predictions).
  if (typeof(params) != "list") {
    stop("xgb.cv: first argument params must be list")
  }
  # User-supplied folds take precedence: nfold is overwritten by their count.
  if(!is.null(folds)) {
    if(class(folds)!="list" | length(folds) < 2) {
      stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold")
    }
    nfold <- length(folds)
  }
  if (nfold <= 1) {
    stop("nfold must be bigger than 1")
  }
  # Build the training DMatrix; `missing` is forwarded only when it was given.
-  dtrain <- xgb.get.DMatrix(data, label)
+  if (is.null(missing)) {
    dtrain <- xgb.get.DMatrix(data, label)
  } else {
    dtrain <- xgb.get.DMatrix(data, label, missing)
  }
  # Extra arguments passed through ... are merged into params, silent=1 is
  # always added, and every requested metric becomes its own eval_metric entry.
  params <- append(params, list(...))
  params <- append(params, list(silent=1))
  for (mc in metrics) {
    params <- append(params, list("eval_metric"=mc))
  }
-
+  
-  folds <- xgb.cv.mknfold(dtrain, nfold, params)
+  # Early Stopping
-  history <- list()
+  if (is.null(early_stop_round) && !is.null(early.stop.round))
    early_stop_round = early.stop.round
  if (!is.null(early_stop_round)){
    # The optimisation direction must be known: it is either given explicitly
    # through `maximize` or derived below from the eval_metric name.
    if (!is.null(feval) && is.null(maximize))
      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
    if (is.null(maximize) && is.null(params$eval_metric))
      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
    if (is.null(maximize))
    {
      # Metrics listed here are minimised; any other metric name is maximised.
      if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
        maximize = FALSE
      } else {
        maximize = TRUE
      }
    }
    # NOTE(review): starting from 0 when maximizing assumes the score is
    # always positive; a metric that can be <= 0 would never update bestScore.
    if (maximize) {
      bestScore = 0
    } else {
      bestScore = Inf
    }
    bestInd = 0
    earlyStopflag = FALSE
    if (length(metrics)>1)
      warning('Only the first metric is used for early stopping process.')
  }
  # One entry per fold; the loop below reads $booster, $dtrain, $watchlist and
  # $index from each entry.
  xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
  obj_type = params[['objective']]
  mat_pred = FALSE
  # multi:softprob predictions are stored as a (rows x num_class) matrix,
  # every other objective as a plain vector with one value per row.
  if (!is.null(obj_type) && obj_type=='multi:softprob')
  {
    num_class = params[['num_class']]
    if (is.null(num_class))
      stop('must set num_class to use softmax')
    predictValues <- matrix(0,xgb.numrow(dtrain),num_class)
    mat_pred = TRUE
  }
  else
    predictValues <- rep(0,xgb.numrow(dtrain))
  history <- c()
  for (i in 1:nrounds) {
    msg <- list()
    for (k in 1:nfold) {
-      fd <- folds[[k]]
+      fd <- xgb_folds[[k]]
-      succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)      
+      succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
-      msg[[k]] <- strsplit(xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval), 
+      if (i<nrounds) {
-                           "\t")[[1]]
+          msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
      } else {
        # Last round: when predictions are requested, the evaluation call also
        # returns them (second element) and they are written at fd$index.
        # NOTE(review): predictions are only collected when i == nrounds, so an
        # early stop (break below) leaves predictValues at its initial zeros.
        if (!prediction) {
          msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
        } else {
          res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
          if (mat_pred) {
            # res[[2]] is reshaped to num_class rows and transposed so that
            # each fold row receives its num_class values.
            pred_mat = matrix(res[[2]],num_class,length(fd$index))
            predictValues[fd$index,] <- t(pred_mat)
          } else {
            predictValues[fd$index] <- res[[2]]
          }
          msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
        }
      }
    }
    # Combine the per-fold messages of this round into one summary line.
    ret <- xgb.cv.aggcv(msg, showsd)
-    history <- append(history, ret)
+    history <- c(history, ret)
-    cat(paste(ret, "\n", sep=""))
+    if(verbose) paste(ret, "\n", sep="") %>% cat
    # early_Stopping
    if (!is.null(early_stop_round)){
      # Take the whitespace-separated field at position length(metrics)+2 of the
      # summary line, then the second piece after splitting on '+' or ':'.
      # NOTE(review): presumably this is the mean of the first test metric -
      # confirm against the line format produced by xgb.cv.aggcv.
      score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+1]
      score = strsplit(score,'\\+|:')[[1]][[2]]
      score = as.numeric(score)
      if ((maximize && score>bestScore) || (!maximize && score<bestScore)) {
        bestScore = score
        bestInd = i
      } else {
        # No improvement: stop once early_stop_round rounds have passed since
        # the best one.
        if (i-bestInd>=early_stop_round) {
          earlyStopflag = TRUE
          cat('Stopping. Best iteration:',bestInd)
          break
        }
      }
    }
  }
-  return (TRUE)
+  
  # Column names come from the first history line: drop the first tab-separated
  # field, keep the text before ':' and replace the first '-' with '.'.
  colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".")
  colnamesMean <- paste(colnames, "mean")
  if(showsd) colnamesStd <- paste(colnames, "std")
  colnames <- c()
  # With showsd the mean and std columns are interleaved per metric.
  if(showsd) for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
  else colnames <- colnamesMean
  type <- rep(x = "numeric", times = length(colnames))
  # Empty numeric table with the right columns; one row is appended per round.
  dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
  split <- str_split(string = history, pattern = "\t")
  # NOTE(review): the pattern below only matches numbers that contain a '.';
  # integers or scientific notation in the summary line would not be captured
  # in full - confirm the number format written by xgb.cv.aggcv.
  for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)}
  if (prediction) {
    return(list(dt = dt,pred = predictValues))
  }
  return(dt)
 }
 # Avoid error messages during CRAN check.
 # The magrittr placeholder "." is used without ever being declared as a
 # variable, so it is registered here as a known global name.
 globalVariables(".")
--- a/R-package/R/xgb.dump.R
+++ b/R-package/R/xgb.dump.R
@@ -2,14 +2,26 @@
 #' 
 #' Save a xgboost model to text file. Could be parsed later.
 #' 
 #' @importFrom magrittr %>%
 #' @importFrom stringr str_replace
 #' @importFrom data.table fread
 #' @importFrom data.table :=
 #' @importFrom data.table setnames
 #' @param model the model object.
-#' @param fname the name of the binary file.
+#' @param fname the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.
 #' @param fmap feature map file representing the type of feature. 
 #'        Detailed description could be found at 
-#'        \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
+#'        \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
 #'        See demo/ for walkthrough example in R, and
-#'        \url{https://github.com/tqchen/xgboost/blob/master/demo/data/featmap.txt} 
+#'        \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} 
 #'        for example Format.
 #' @param with.stats whether dump statistics of splits 
 #'        When this option is on, the model dump comes with two additional statistics:
 #'        gain is the approximate loss function gain we get in each split;
 #'        cover is the sum of second order gradient in each node.
 #'
 #' @return
 #' if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -17,17 +29,43 @@
 #' train <- agaricus.train
 #' test <- agaricus.test
 #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, 
-#'                eta = 1, nround = 2,objective = "binary:logistic")
+#'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
-#' xgb.dump(bst, 'xgb.model.dump')
+#' # save the model in file 'xgb.model.dump'
 #' xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
 #' 
 #' # print the model without saving it to a file
 #' print(xgb.dump(bst))
 #' @export
 #' 
-xgb.dump <- function(model, fname, fmap = "") {
+xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
  # Dumps a booster as text: returns the dump as a character vector when fname
  # is NULL, otherwise writes it to fname and returns TRUE.
  if (class(model) != "xgb.Booster") {
-    stop("xgb.dump: first argument must be type xgb.Booster")
+    stop("model: argument must be type xgb.Booster")
  } else {
    model <- xgb.Booster.check(model)
  }
-  if (typeof(fname) != "character") {
+  if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) {
-    stop("xgb.dump: second argument must be type character")
+    stop("fname: argument must be type character (when provided)")
  }
  # Validate fmap on its own length (the check previously tested fname's
  # length, so a multi-element fmap slipped through).
-  .Call("XGBoosterDumpModel_R", model, fname, fmap, PACKAGE = "xgboost")
+  if (!(class(fmap) %in% c("character", "NULL") && length(fmap) <= 1)) {
-  return(TRUE)
+    stop("fmap: argument must be type character (when provided)")
-} 
+  }
  longString <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with.stats), PACKAGE = "xgboost")
  # Re-read the dump text line by line into a one-column table named "Lines".
  dt <- fread(paste(longString, collapse = ""), sep = "\n", header = F)
  setnames(dt, "Lines")
  if(is.null(fname)) {
    # In-memory result: drop "0" lines, strip leading tabs, drop empty lines.
    result <- dt[Lines != "0"][, Lines := str_replace(Lines, "^\t+", "")][Lines != ""][, paste(Lines)]
    return(result)
  } else {
    # File output keeps the leading tabs of the dump.
    result <- dt[Lines != "0"][Lines != ""][, paste(Lines)] %>% writeLines(fname)
    return(TRUE)
  }
 }
 # Avoid error messages during CRAN check.
 # "Lines" is a data.table column name and "." the magrittr placeholder; both
 # are used without being declared as variables, so they are registered here.
 globalVariables(c("Lines", "."))
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -0,0 +1,134 @@
 #' Show importance of features in a model
 #' 
 #' Read a xgboost model text dump. 
 #' Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
 #' 
 #' @importFrom data.table data.table
 #' @importFrom data.table setnames
 #' @importFrom data.table :=
 #' @importFrom magrittr %>%
 #' @importFrom Matrix colSums
 #' @importFrom Matrix cBind
 #' @importFrom Matrix sparseVector
 #' 
 #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
 #' 
 #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
 #' 
 #' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
 #' 
 #' @param data the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
 #' 
 #' @param label the label vector used for the training step. Will be used with \code{data} parameter for co-occurrence computation. More information in \code{Detail} part. This parameter is optional.
 #' 
 #' @param target a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.
 #'
 #' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
 #'
 #' @details 
 #' This is the function to understand the model trained (and through your model, your data).
 #' 
 #' Results are returned for both linear and tree models.
 #' 
 #' \code{data.table} is returned by the function. 
 #' It contains the following columns:
 #' \itemize{
 #'   \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
 #'   \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
 #'   \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
 #'   \item \code{Weight} percentage representing the relative number of times a feature has been taken into trees. \code{Gain} should be preferred to search the most important feature. For boosted linear model, this column has no meaning.
 #' }
 #' 
 #' Co-occurence count
 #' ------------------
 #' 
 #' The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. However, with this information only, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
 #' 
 #' Co-occurence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
 #' 
 #' If you need to remember one thing only: unless you want to leave us early, don't eat a mushroom which has no odor :-)
 #' 
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' 
 #' # Both dataset are list with two items, a sparse matrix and labels 
 #' # (labels = outcome column which will be learned). 
 #' # Each column of the sparse Matrix is a feature in one hot encoding format.
 #' train <- agaricus.train
 #' 
 #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, 
 #'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #' 
 #' # train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
 #' xgb.importance(train$data@@Dimnames[[2]], model = bst)
 #' 
 #' # Same thing with co-occurence computation this time
 #' xgb.importance(train$data@@Dimnames[[2]], model = bst, data = train$data, label = train$label)
 #' 
 #' @export
 xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ((x + label) == 2)){
  # Computes feature importance from a model dump (read from filename_dump) or
  # directly from a model. For tree models, when data and label are both given,
  # a co-occurrence count (RealCover) is added.
  if (!class(feature_names) %in% c("character", "NULL")) {
    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
  }
  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
    stop("filename_dump: Has to be a path to the model dump file.")
  }
  if (!class(model) %in% c("xgb.Booster", "NULL")) {
    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
  }
  # Without this check readLines(NULL) would fail below with an obscure message.
  if (is.null(model) && is.null(filename_dump)) {
    stop("filename_dump & model: provide either a path to a model dump file or a model.")
  }
  if((is.null(data) & !is.null(label)) |(!is.null(data) & is.null(label))) {
    stop("data/label: Provide the two arguments if you want co-occurence computation or none of them if you are not interested but not one of them only.")
  }
  # A mostly-zero numeric label is stored as a sparse vector.
  if(class(label) == "numeric"){
    if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector")
  }
  if(is.null(model)){
    text <- readLines(filename_dump)
  } else {
    text <- xgb.dump(model = model, with.stats = T)
  }
  # A second line equal to "bias:" identifies a linear model dump.
  if(text[2] == "bias:"){
    # Use the dump already loaded in `text`: re-reading filename_dump here
    # failed whenever only `model` was supplied (filename_dump is then NULL).
    result <- linearDump(feature_names, text)
    if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.")
  }  else {
    result <- treeDump(feature_names, text = text, keepDetail = !is.null(data))
    # Co-occurence computation
    if(!is.null(data) & !is.null(label) & nrow(result) > 0) {
      # Take care of missing column 
      a <- data[, result[MissingNo == T,Feature], drop=FALSE] != 0
      # Bind the two Matrix and reorder columns
      c <- data[, result[MissingNo == F,Feature], drop=FALSE] %>% cBind(a,.) %>% .[,result[,Feature]]
      rm(a)
      # Apply split
      d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split])
      # Per feature column: apply `target` to the combined indicator and count
      # the observations it flags.
      apply(c & d, 2, . %>% target %>% sum) -> vec
      result <- result[, "RealCover":= as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo:=NULL]
    }
  }
  result
 }
 treeDump <- function(feature_names, text, keepDetail){
  # Aggregates the per-node table of a tree dump into per-feature statistics
  # (Gain, Cover, Frequence), each normalised to sum to 1, sorted by Gain.
  # With keepDetail the grouping also keeps Split and MissingNo.
  groupBy <- if(keepDetail) c("Feature", "Split", "MissingNo") else "Feature"
  nodes <- xgb.model.dt.tree(feature_names = feature_names, text = text)
  # Flag nodes whose "missing" branch is the same as their "no" branch.
  nodes[,"MissingNo":= Missing == No ]
  # Leaves carry no split, so only branch nodes are summed up.
  stats <- nodes[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = groupBy, with = T]
  stats[,`:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))]
  stats[order(Gain, decreasing = T)]
 }
 linearDump <- function(feature_names, text){
  # Builds the Feature/Weight table of a linear model dump: every line after
  # the "weight:" marker is read as one numeric weight.
  firstWeight <- which(text == "weight:") + 1
  weights <- as.numeric(text[firstWeight:length(text)])
  data.table(Feature = feature_names, Weight = weights)
 }
 # Avoid error messages during CRAN check.
 # These names are data.table column names (or the magrittr placeholder ".")
 # that are used without ever being declared as variables, so they are
 # registered here.
 globalVariables(c(".", "Feature", "Split", "No", "Missing", "MissingNo", "RealCover"))
--- a/R-package/R/xgb.load.R
+++ b/R-package/R/xgb.load.R
@@ -10,7 +10,7 @@
 #' train <- agaricus.train
 #' test <- agaricus.test
 #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, 
-#'                eta = 1, nround = 2,objective = "binary:logistic")
+#'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #' xgb.save(bst, 'xgb.model')
 #' bst <- xgb.load('xgb.model')
 #' pred <- predict(bst, test$data)
@@ -19,5 +19,14 @@
 xgb.load <- function(modelfile) {
  # Loads a model from `modelfile` and returns it as a checked booster object.
  if (is.null(modelfile)) 
    stop("xgb.load: modelfile cannot be NULL")
-  xgb.Booster(modelfile = modelfile)
+  
  handle <- xgb.Booster(modelfile = modelfile)
  # When modelfile is already a raw vector it is passed along with the handle,
  # so the model does not need to be serialized again.
  if (typeof(modelfile) == "raw") {
    bst <- xgb.handleToBooster(handle, modelfile)
  } else {
    bst <- xgb.handleToBooster(handle, NULL)
  }
  bst <- xgb.Booster.check(bst)
  return(bst)
 } 
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -0,0 +1,170 @@
 #' Convert tree model dump to data.table
 #' 
 #' Read a tree model text dump and return a data.table.
 #' 
 #' @importFrom data.table data.table
 #' @importFrom data.table set
 #' @importFrom data.table rbindlist
 #' @importFrom data.table copy
 #' @importFrom data.table :=
 #' @importFrom magrittr %>%
 #' @importFrom magrittr not
 #' @importFrom magrittr add
 #' @importFrom stringr str_extract
 #' @importFrom stringr str_split
 #' @importFrom stringr str_extract
 #' @importFrom stringr str_trim
 #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
 #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
 #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
 #' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
 #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
 #'
 #' @return A \code{data.table} of the features used in the model with their gain, cover and few other thing.
 #'
 #' @details 
 #' General function to convert a text dump of tree model to a \code{data.table}. The purpose is to help user to explore the model and get a better understanding of it.
 #' 
 #' The content of the \code{data.table} is organised that way:
 #' 
 #' \itemize{
 #' \item \code{ID}: unique identifier of a node ;
 #'  \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
 #'  \item \code{Split}: value of the chosen feature where is operated the split ;
 #'  \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
 #'  \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
 #'  \item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ;
 #'  \item \code{Quality}: it's the gain related to the split in this specific node ;
 #'  \item \code{Cover}: metric to measure the number of observation affected by the split ;
 #'  \item \code{Tree}: ID of the tree. It is included in the main ID ;
 #'  \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ;
 #' } 
 #'   
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' 
 #' #Both dataset are list with two items, a sparse matrix and labels 
 #' #(labels = outcome column which will be learned). 
 #' #Each column of the sparse Matrix is a feature in one hot encoding format.
 #' train <- agaricus.train
 #' 
 #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, 
 #'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #' 
 #' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
 #' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
 #' 
 #' @export
 xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){
  # Parses a tree model text dump (taken from `model`, `filename_dump` or
  # `text`) into a data.table with one row per node, then adds the Feature,
  # Cover and Quality of each branch's "yes" and "no" children.
  if (!class(feature_names) %in% c("character", "NULL")) {
    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
  }
  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
    stop("filename_dump: path to the model doesn't exist.")
  } else if(is.null(filename_dump) && is.null(model) && is.null(text)){
    stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
  }
  if (!class(model) %in% c("xgb.Booster", "NULL")) {
    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
  }
  if (!class(text) %in% c("character", "NULL")) {
    stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.")
  }
  if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
    stop("n_first_tree: Has to be a numeric vector of size 1.")
  }
  # Source priority: model first, then dump file, otherwise `text` as given.
  if(!is.null(model)){
    text = xgb.dump(model = model, with.stats = T)
  } else if(!is.null(filename_dump)){
    text <- readLines(filename_dump) %>% str_trim(side = "both")
  }
  # Indices of the "booster" header lines, plus a sentinel one past the end so
  # that tree i spans position[i]+1 .. position[i+1]-1.
  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1)
  # Extract "key=value" with the given pattern and return the value as numeric.
  extract <- function(x, pattern)  str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
  # min() ignores a NULL n_first_tree, so all trees are used by default.
  n_round <- min(length(position) - 1, n_first_tree)
  addTreeId <- function(x, i) paste(i,x,sep = "-")
  allTrees <- data.table()
  # Signed decimal number with optional exponent.
  anynumber_regex<-"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
  for(i in 1:n_round){
    tree <- text[(position[i]+1):(position[i+1]-1)]
    # avoid tree made of a leaf only (no split)
    if(length(tree) <2) next
    treeID <- i-1
    notLeaf <- str_match(tree, "leaf") %>% is.na
    leaf <- notLeaf %>% not %>% tree[.]
    branch <- notLeaf %>% tree[.]
    # Node IDs are prefixed with the tree ID ("<tree>-<node>").
    idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
    idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
    featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric
    if(!is.null(feature_names)){
      # Feature indices in the dump are 0-based.
      featureBranch <- feature_names[featureBranch + 1]
    }
    featureLeaf <- rep("Leaf", length(leaf))
    splitBranch <- str_extract(branch, paste0("<",anynumber_regex,"\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "")
    splitLeaf <- rep(NA, length(leaf))
    yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID)
    yesLeaf <- rep(NA, length(leaf))
    noBranch <- extract(branch, "no=\\d*") %>% addTreeId(treeID)
    noLeaf <- rep(NA, length(leaf))
    missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID)
    missingLeaf <- rep(NA, length(leaf))
    qualityBranch <- extract(branch, paste0("gain=",anynumber_regex))
    qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex))
    # Cover is parsed with the same number pattern as gain/leaf: the former
    # "cover=\\d*\\.*\\d*" cut values written in scientific notation
    # (e.g. "cover=1e+06" was read as 1).
    coverBranch <- extract(branch, paste0("cover=",anynumber_regex))
    coverLeaf <- extract(leaf, paste0("cover=",anynumber_regex))
    dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID]
    allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
  }
  # Attach child information to every branch row.
  # NOTE(review): `ID == yes` / `ID == no` compare the ID column with a vector
  # that has one entry per branch; whether this acts as an ordered lookup
  # depends on how data.table evaluates `==` in i - verify against the
  # data.table version in use before changing it.
  yes <- allTrees[!is.na(Yes),Yes]
  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
      j = "Yes.Feature",
      value = allTrees[ID == yes,Feature])
  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
      j = "Yes.Cover",
      value = allTrees[ID == yes,Cover])
  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
    j = "Yes.Quality",
    value = allTrees[ID == yes,Quality])
  no <- allTrees[!is.na(No),No]
  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
      j = "No.Feature",
      value = allTrees[ID == no,Feature])
  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
      j = "No.Cover",
      value = allTrees[ID == no,Cover])
  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
      j = "No.Quality",
      value = allTrees[ID == no,Quality])
  allTrees
 }
 # Avoid error messages during CRAN check.
 # These names are data.table column names and special symbols (".", ".N")
 # that are used without ever being declared as variables, so they are
 # registered here.
 globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence"))
--- a/R-package/R/xgb.plot.importance.R
+++ b/R-package/R/xgb.plot.importance.R
@@ -0,0 +1,57 @@
 #' Plot feature importance bar graph
 #' 
 #' Read a data.table containing feature importance details and plot it.
 #' 
 #' @importFrom magrittr %>%
 #' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function.
 #' @param numberOfClusters a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.
 #'
 #' @return A \code{ggplot2} bar graph representing each feature by a horizontal bar. The longer the bar, the more important the feature. Features are sorted by importance and clustered by importance. The group is represented through the color of the bar.
 #'
 #' @details 
 #' The purpose of this function is to easily represent the importance of each feature of a model.
 #' The function returns a ggplot graph, therefore each of its characteristics can be overridden (to customize it).
 #' In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function. 
 #'   
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' 
 #' #Both dataset are list with two items, a sparse matrix and labels 
 #' #(labels = outcome column which will be learned). 
 #' #Each column of the sparse Matrix is a feature in one hot encoding format.
 #' train <- agaricus.train
 #' 
 #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, 
 #'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #' 
 #' #train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
 #' importance_matrix <- xgb.importance(train$data@@Dimnames[[2]], model = bst)
 #' xgb.plot.importance(importance_matrix)
 #' 
 #' @export
 xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1:10)){
  # Draws a horizontal bar chart of the gain of each feature; bars are colored
  # by the cluster their gain falls into.
  if (!is.element("data.table", class(importance_matrix)))  {
    stop("importance_matrix: Should be a data.table.")
  }
  # Both plotting dependencies are optional for the package, so they are
  # checked at call time.
  if (!require(ggplot2, quietly = TRUE)) {
    stop("ggplot2 package is required for plotting the importance", call. = FALSE)
  }
  if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
    stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE)
  }
  # To avoid issues in clustering when co-occurences are used
  gainByFeature <- importance_matrix[, .(Gain = sum(Gain)), by = Feature]
  # One-dimensional clustering of the gains; its warnings are silenced.
  clustering <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(gainByFeature[,Gain], numberOfClusters))
  gainByFeature[,"Cluster":=as.character(clustering$cluster)]
  barChart <- ggplot(gainByFeature, aes(x=reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment()) +
    geom_bar(aes(fill=Cluster), stat="identity", position="identity") +
    coord_flip() +
    xlab("Features") +
    ylab("Gain") +
    ggtitle("Feature importance") +
    theme(plot.title = element_text(lineheight=.9, face="bold"), panel.grid.major.y = element_blank() )
  return(barChart)
 }
 # Avoid error messages during CRAN check.
 # These names are data.table column names and ggplot2 functions that are used
 # without being declared or imported, so they are registered here.
 globalVariables(c("Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"))
--- a/R-package/R/xgb.plot.tree.R
+++ b/R-package/R/xgb.plot.tree.R
@@ -0,0 +1,97 @@
 #' Plot a boosted tree model
 #' 
 #' Read a tree model text dump. 
 #' Plotting only works for boosted tree model (not linear model).
 #' 
 #' @importFrom data.table data.table
 #' @importFrom data.table set
 #' @importFrom data.table rbindlist
 #' @importFrom data.table :=
 #' @importFrom data.table copy
 #' @importFrom magrittr %>%
 #' @importFrom magrittr not
 #' @importFrom magrittr add
 #' @importFrom stringr str_extract
 #' @importFrom stringr str_split
 #' @importFrom stringr str_extract
 #' @importFrom stringr str_trim
 #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
 #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).
 #' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
 #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
 #' @param CSSstyle a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
 #' @param  width  the width of the diagram in pixels.
 #' @param height	the height of the diagram in pixels.
 #'
 #' @return A \code{DiagrammeR} of the model.
 #'
 #' @details 
 #' 
 #' The content of each node is organised as follows:
 #' 
 #' \itemize{
 #'  \item \code{feature} value ;
 #'  \item \code{cover}: the sum of second order gradient of training data classified to the leaf. If the loss is the square loss, this simply corresponds to the number of instances in that branch. The deeper in the tree a node is, the lower this metric will be ;
 #'  \item \code{gain}: a metric of the importance of the node in the model.
 #' } 
 #' 
 #' Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
 #' It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose.
 #'  
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' 
 #' #Both datasets are lists with two items, a sparse matrix and labels 
 #' #(labels = outcome column which will be learned). 
 #' #Each column of the sparse Matrix is a feature in one hot encoding format.
 #' train <- agaricus.train
 #' 
 #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, 
 #'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #' 
 #' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
 #' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
 #' 
 #' @export
 #' 
 xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL){
  # Build a Mermaid flow chart of the boosted trees and render it with DiagrammeR.
  # The trees are read either from a text dump (filename_dump) or from a model.
  #
  # CSSstyle is optional: NULL selects the default style defined below,
  # otherwise it has to be a single character string.
  if (!(is.null(CSSstyle) || (is.character(CSSstyle) && length(CSSstyle) <= 1))) {
    stop("CSSstyle: Has to be a character vector of size 1.")
  }
  # inherits() copes with objects carrying several classes, where
  # class(model) %in% ... would hand a vector condition to if().
  if (!(is.null(model) || inherits(model, "xgb.Booster"))) {
    stop("model: Has to be an object of class xgb.Booster model generated by the xgb.train function.")
  }
  if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
    stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE)
  }
  # Parse the trees into a data.table, from the dump file when no model is given.
  if(is.null(model)){
    allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree)
  } else {
    allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree)
  }
  # For every non-leaf node, write the two outgoing edges in Mermaid syntax:
  # the "yes" edge (value < Split) also carries the node label with Cover and Gain,
  # the "no" edge (value >= Split) only repeats the feature name.
  allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
  allTrees[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]
  if(is.null(CSSstyle)){
    CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
  }
  # Children reached through the "yes" branch are styled green, the others red.
  yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")
  no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
  # Assemble the whole diagram: header, sorted edges, then the style statements.
  path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>%
    .[order(.)] %>%
    paste(sep = "", collapse = ";") %>%
    paste("graph LR", .,collapse = "", sep = ";") %>%
    paste(CSSstyle, yes, no, sep = ";")
  # width and height are passed by name: mermaid() takes `...` right after the
  # diagram, so positional values would not reach its size arguments.
  DiagrammeR::mermaid(path, width = width, height = height)
 }
 # Declare the names below as global variables so that the CRAN check does not
 # report them as undefined: they are never assigned in the package code.
 # They are data.table column names referenced without quotes, plus the "."
 # placeholder used in the pipe expressions.
 globalVariables(c("Feature", "yesPath", "ID", "Cover", "Quality", "Split", "Yes", "Yes.Feature", "noPath", "No", "No.Feature", "."))
--- a/R-package/R/xgb.save.R
+++ b/R-package/R/xgb.save.R
@@ -11,7 +11,7 @@
 #' train <- agaricus.train
 #' test <- agaricus.test
 #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, 
-#'                eta = 1, nround = 2,objective = "binary:logistic")
+#'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #' xgb.save(bst, 'xgb.model')
 #' bst <- xgb.load('xgb.model')
 #' pred <- predict(bst, test$data)
@@ -22,7 +22,8 @@ xgb.save <- function(model, fname) {
    stop("xgb.save: fname must be character")
  }
  if (class(model) == "xgb.Booster") {
-    .Call("XGBoosterSaveModel_R", model, fname, PACKAGE = "xgboost")
+    model <- xgb.Booster.check(model)
    .Call("XGBoosterSaveModel_R", model$handle, fname, PACKAGE = "xgboost")
    return(TRUE)
  }
  stop("xgb.save: the input must be xgb.Booster. Use xgb.DMatrix.save to save
--- a/R-package/R/xgb.save.raw.R
+++ b/R-package/R/xgb.save.raw.R
@@ -0,0 +1,30 @@
 #' Save xgboost model to R's raw vector,
 #' user can call xgb.load to load the model back from raw vector
 #' 
 #' Save xgboost model from xgboost or xgb.train
 #' 
 #' @param model the model object.
 #' 
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #' train <- agaricus.train
 #' test <- agaricus.test
 #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, 
 #'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #' raw <- xgb.save.raw(bst)
 #' bst <- xgb.load(raw)
 #' pred <- predict(bst, test$data)
 #' @export
 #' 
 xgb.save.raw <- function(model) {
  # Serialise a trained model into an R raw vector.
  # model: an xgb.Booster object or an xgb.Booster.handle.
  # Returns the raw vector produced by the native XGBoosterModelToRaw_R routine.
  #
  # inherits() is used instead of class(model) == "..." so that an object
  # carrying more than one class does not hand a vector condition to if().
  if (inherits(model, "xgb.Booster")) {
    # The native routine works on the handle, not on the wrapping object.
    model <- model$handle
  }
  if (inherits(model, "xgb.Booster.handle")) {
    return(.Call("XGBoosterModelToRaw_R", model, PACKAGE = "xgboost"))
  }
  stop("xgb.save.raw: the input must be xgb.Booster or xgb.Booster.handle. ",
       "Use xgb.DMatrix.save to save xgb.DMatrix object.")
 }
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -1,21 +1,56 @@
 #' eXtreme Gradient Boosting Training
 #' 
-#' The training function of xgboost
+#' An advanced interface for training xgboost model. Look at \code{\link{xgboost}} function for a simpler interface.
 #'
-#' @param params the list of parameters. Commonly used ones are:
+#' @param params the list of parameters. 
 #' 
 #' 1. General Parameters
 #' 
 #' \itemize{
-#'   \item \code{objective} objective function, common ones are
+#'   \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}
-#'   \itemize{
+#'   \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0
 #'     \item \code{reg:linear} linear regression
 #'     \item \code{binary:logistic} logistic regression for classification
 #'   }
 #'   \item \code{eta} step size of each boosting step
 #'   \item \code{max.depth} maximum depth of the tree
 #'   \item \code{nthread} number of thread used in training, if not set, all threads are used
 #' }
-#'
+#'  
-#'   See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for 
+#' 2. Booster Parameters
-#'   further details. See also demo/ for walkthrough example in R.
+#' 
 #' 2.1. Parameter for Tree Booster
 #' 
 #' \itemize{
 #'   \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
 #'   \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. 
 #'   \item \code{max_depth} maximum depth of a tree. Default: 6
 #'   \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
 #'   \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1 
 #'   \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
 #'   \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample  < 1}  and \code{round = 1}) accordingly. Default: 1
 #' }
 #' 
 #' 2.2. Parameter for Linear Booster
 #'  
 #' \itemize{
 #'   \item \code{lambda} L2 regularization term on weights. Default: 0
 #'   \item \code{lambda_bias} L2 regularization term on bias. Default: 0
 #'   \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
 #' }
 #' 
 #' 3. Task Parameters 
 #' 
 #' \itemize{
 #' \item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below:
 #'   \itemize{
 #'     \item \code{reg:linear} linear regression (Default).
 #'     \item \code{reg:logistic} logistic regression.
 #'     \item \code{binary:logistic} logistic regression for binary classification. Output probability.
 #'     \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
 #'     \item \code{num_class} set the number of classes. To use only with multiclass objectives.
 #'     \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
 #'     \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
 #'     \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
 #'   }
 #'   \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
 #'   \item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
 #' }
 #' 
 #' @param data takes an \code{xgb.DMatrix} as the input.
 #' @param nrounds the max number of iterations
 #' @param watchlist what information should be printed when \code{verbose=1} or
@@ -31,19 +66,37 @@
 #'   prediction and dtrain,
 #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print 
 #'   information of performance. If 2, xgboost will print information of both
-#'
+#' @param printEveryN Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
 #' @param early_stop_round If \code{NULL}, the early stopping function is not triggered. 
 #'     If set to an integer \code{k}, training with a validation set will stop if the performance 
 #'     keeps getting worse consecutively for \code{k} rounds.
 #' @param early.stop.round An alternative of \code{early_stop_round}.
 #' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
 #'     \code{maximize=TRUE} means the larger the evaluation score the better.
 #' @param ... other parameters to pass to \code{params}.
 #' 
 #' @details 
-#' This is the training function for xgboost.
+#' This is the training function for \code{xgboost}. 
 #' 
 #' It supports advanced features such as \code{watchlist}, customized objective function (\code{obj})
 #' and customized evaluation function (\code{feval}), therefore it is more flexible than the \code{\link{xgboost}} function.
 #'
-#' Parallelization is automatically enabled if OpenMP is present.
+#' Parallelization is automatically enabled if \code{OpenMP} is present. 
-#' Number of threads can also be manually specified via "nthread" parameter.
+#' Number of threads can also be manually specified via \code{nthread} parameter.
 #' 
-#' This function only accepts an \code{xgb.DMatrix} object as the input.
+#' \code{eval_metric} parameter (not listed above) is set automatically by Xgboost but can be overridden by parameter. Below is provided the list of different metrics optimized by Xgboost to help you to understand how it works inside or to use them with the \code{watchlist} parameter.
-#' It supports advanced features such as watchlist, customized objective function,
+#'   \itemize{
-#' therefore it is more flexible than \code{\link{xgboost}}.
+#'      \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
 #'      \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
 #'      \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
 #'      \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
 #'      \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve} for ranking evaluation.
 #'      \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
 #'   }
 #'   
 #' Full list of parameters is available in the Wiki \url{https://github.com/dmlc/xgboost/wiki/Parameters}.
 #' 
 #' This function only accepts an \code{\link{xgb.DMatrix}} object as the input.
 #' 
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -63,11 +116,13 @@
 #'   err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
 #'   return(list(metric = "error", value = err))
 #' }
-#' bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
+#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror)
 #' @export
 #' 
 xgb.train <- function(params=list(), data, nrounds, watchlist = list(), 
-                      obj = NULL, feval = NULL, verbose = 1, ...) {
+                      obj = NULL, feval = NULL, verbose = 1, printEveryN=1L,
                      early_stop_round = NULL, early.stop.round = NULL,
                      maximize = NULL, ...) {
  dtrain <- data
  if (typeof(params) != "list") {
    stop("xgb.train: first argument params must be list")
@@ -86,13 +141,68 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
  }
  params = append(params, list(...))
-  bst <- xgb.Booster(params, append(watchlist, dtrain))
+  # Early stopping
-  for (i in 1:nrounds) {
+  if (is.null(early_stop_round) && !is.null(early.stop.round))
-    succ <- xgb.iter.update(bst, dtrain, i - 1, obj)
+    early_stop_round = early.stop.round
-    if (length(watchlist) != 0) {
+  if (!is.null(early_stop_round)){
-      msg <- xgb.iter.eval(bst, watchlist, i - 1, feval)
+    if (!is.null(feval) && is.null(maximize))
-      cat(paste(msg, "\n", sep=""))
+      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
    if (length(watchlist) == 0)
      stop('For early stopping you need at least one set in watchlist.')
    if (is.null(maximize) && is.null(params$eval_metric))
      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
    if (is.null(maximize))
    {
      if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
        maximize = FALSE
      } else {
        maximize = TRUE
      }
    }
    if (maximize) {
      bestScore = 0
    } else {
      bestScore = Inf
    }
    bestInd = 0
    earlyStopflag = FALSE
    if (length(watchlist)>1)
      warning('Only the first data set in watchlist is used for early stopping process.')
  }
  handle <- xgb.Booster(params, append(watchlist, dtrain))
  bst <- xgb.handleToBooster(handle)
  printEveryN=max( as.integer(printEveryN), 1L)
  for (i in 1:nrounds) {
    succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj)
    if (length(watchlist) != 0) {
      msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval)
      if (0== ( (i-1) %% printEveryN))
 	    cat(paste(msg, "\n", sep=""))
      if (!is.null(early_stop_round))
      {
        score = strsplit(msg,':|\\s+')[[1]][3]
        score = as.numeric(score)
        if ((maximize && score>bestScore) || (!maximize && score<bestScore)) {
          bestScore = score
          bestInd = i
        } else {
          if (i-bestInd>=early_stop_round) {
            earlyStopflag = TRUE
            cat('Stopping. Best iteration:',bestInd)
            break
          }
        }
      }
    }
  }
  bst <- xgb.Booster.check(bst)
  if (!is.null(early_stop_round)) {
    bst$bestScore = bestScore
    bst$bestInd = bestInd
  }
  return(bst)
 } 
--- a/R-package/R/xgboost.R
+++ b/R-package/R/xgboost.R
@@ -1,12 +1,14 @@
 #' eXtreme Gradient Boosting (Tree) library
 #' 
-#' A simple interface for xgboost in R
+#' A simple interface for training xgboost model. Look at \code{\link{xgb.train}} function for a more advanced interface.
 #' 
 #' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or 
 #'   \code{xgb.DMatrix}. 
 #' @param label the response variable. User should not set this field,
-#    if data is local data file or  \code{xgb.DMatrix}. 
+#'    if data is local data file or  \code{xgb.DMatrix}. 
-#' @param params the list of parameters. Commonly used ones are:
+#' @param params the list of parameters.
 #' 
 #' Commonly used ones are:
 #' \itemize{
 #'   \item \code{objective} objective function, common ones are
 #'   \itemize{
@@ -17,20 +19,32 @@
 #'   \item \code{max.depth} maximum depth of the tree
 #'   \item \code{nthread} number of thread used in training, if not set, all threads are used
 #' }
-#'
+#'   
-#'   See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for 
+#'   Look at \code{\link{xgb.train}} for a more complete list of parameters or \url{https://github.com/dmlc/xgboost/wiki/Parameters} for the full list.
-#'   further details. See also demo/ for walkthrough example in R.
+#'   
 #'   See also \code{demo/} for walkthrough example in R.
 #' 
 #' @param nrounds the max number of iterations
 #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print 
 #'   information of performance. If 2, xgboost will print information of both
 #'   performance and construction progress information
 #' @param printEveryN Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
 #' @param missing Missing is only used when input is dense matrix, pick a float 
 #'     value that represents missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
 #' @param early_stop_round If \code{NULL}, the early stopping function is not triggered. 
 #'     If set to an integer \code{k}, training with a validation set will stop if the performance 
 #'     keeps getting worse consecutively for \code{k} rounds.
 #' @param early.stop.round An alternative of \code{early_stop_round}.
 #' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
 #'     \code{maximize=TRUE} means the larger the evaluation score the better.
 #' @param ... other parameters to pass to \code{params}.
 #' 
 #' @details 
-#' This is the modeling function for xgboost.
+#' This is the modeling function for Xgboost.
 #' 
-#' Parallelization is automatically enabled if OpenMP is present.
+#' Parallelization is automatically enabled if \code{OpenMP} is present.
-#' Number of threads can also be manually specified via "nthread" parameter
+#' 
 #' Number of threads can also be manually specified via \code{nthread} parameter.
 #' 
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -38,14 +52,20 @@
 #' train <- agaricus.train
 #' test <- agaricus.test
 #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, 
-#'                eta = 1, nround = 2,objective = "binary:logistic")
+#'                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
 #' pred <- predict(bst, test$data)
 #' 
 #' @export
 #' 
-xgboost <- function(data = NULL, label = NULL, params = list(), nrounds, 
+xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds, 
-                    verbose = 1, ...) {
+                    verbose = 1, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL,
-  dtrain <- xgb.get.DMatrix(data, label)  
+                    maximize = NULL, ...) {
  if (is.null(missing)) {
    dtrain <- xgb.get.DMatrix(data, label)
  } else {
    dtrain <- xgb.get.DMatrix(data, label, missing)
  }
  params <- append(params, list(...))
  if (verbose > 0) {
@@ -54,7 +74,9 @@ xgboost <- function(data = NULL, label = NULL, params = list(), nrounds,
    watchlist <- list()
  }
-  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose=verbose)
+  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, printEveryN=printEveryN,
                   early_stop_round = early_stop_round,
                   early.stop.round = early.stop.round)
  return(bst)
 } 
@@ -69,7 +91,7 @@ xgboost <- function(data = NULL, label = NULL, params = list(), nrounds,
 #' 
 #' \itemize{
 #'  \item \code{label} the label for each record
-#'  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
+#'  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
 #' }
 #'
 #' @references
@@ -96,7 +118,7 @@ NULL
 #' 
 #' \itemize{
 #'  \item \code{label} the label for each record
-#'  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
+#'  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
 #' }
 #'
 #' @references
@@ -111,5 +133,5 @@ NULL
 #' @name agaricus.test
 #' @usage data(agaricus.test)
 #' @format A list containing a label vector, and a dgCMatrix object with 1611 
-#' rows and 127 variables
+#' rows and 126 variables
 NULL
--- a/R-package/README.md
+++ b/R-package/README.md
@@ -2,11 +2,10 @@
 ## Installation
-For up-to-date version(which is recommended), please install from github. Windows user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
+For up-to-date version (which is recommended), please install from github. Windows user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
 ```r
-require(devtools)
+devtools::install_github('dmlc/xgboost',subdir='R-package')
 install_github('xgboost','tqchen',subdir='R-package')
 ```
 For stable version on CRAN, please run
@@ -17,5 +16,5 @@ install.packages('xgboost')
 ## Examples
-* Please visit [walk through example](https://github.com/tqchen/xgboost/blob/master/R-package/demo).
+* Please visit [walk through example](demo).
-* See also the [example scripts](https://github.com/tqchen/xgboost/tree/master/demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R) on this dataset.
+* See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.R) on this dataset and the one related to [Otto challenge](../demo/kaggle-otto), including a [RMarkdown documentation](../demo/kaggle-otto/understandingXGBoostModel.Rmd).
--- a/R-package/data/agaricus.test.rda
+++ b/R-package/data/agaricus.test.rda
--- a/R-package/data/agaricus.train.rda
+++ b/R-package/data/agaricus.train.rda
--- a/R-package/demo/00Index
+++ b/R-package/demo/00Index
@@ -4,3 +4,7 @@ boost_from_prediction           Boosting from existing prediction
 predict_first_ntree             Predicting using first n trees
 generalized_linear_model        Generalized Linear Model
 cross_validation                Cross validation
 create_sparse_matrix            Create Sparse Matrix
 predict_leaf_indices            Predicting the corresponding leaves
 early_stopping                  Early Stop in training
 poisson_regression              Poisson Regression on count data
--- a/R-package/demo/README.md
+++ b/R-package/demo/README.md
@@ -6,6 +6,7 @@ XGBoost R Feature Walkthrough
 * [Predicting using first n trees](predict_first_ntree.R)
 * [Generalized Linear Model](generalized_linear_model.R)
 * [Cross validation](cross_validation.R)
 * [Create a sparse matrix from a dense one](create_sparse_matrix.R)
 Benchmarks
 ====
@@ -13,5 +14,5 @@ Benchmarks
 Notes
 ====
-* Contribution of exampls, benchmarks is more than welcomed!
+* Contribution of examples, benchmarks is more than welcomed!
 * If you like to share how you use xgboost to solve your problem, send a pull request:)
--- a/R-package/demo/basic_walkthrough.R
+++ b/R-package/demo/basic_walkthrough.R
@@ -16,27 +16,28 @@ class(train$data)
 # use sparse matrix when your feature is sparse(e.g. when you using one-hot encoding vector)
 print("training xgboost with sparseMatrix")
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
-               objective = "binary:logistic")
+               nthread = 2, objective = "binary:logistic")
 # alternatively, you can put in dense matrix, i.e. basic R-matrix
 print("training xgboost with Matrix")
 bst <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nround = 2,
-               objective = "binary:logistic")
+               nthread = 2, objective = "binary:logistic")
 # you can also put in xgb.DMatrix object, stores label, data and other meta datas needed for advanced features
 print("training xgboost with xgb.DMatrix")
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
-bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
+bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, nthread = 2, 
               objective = "binary:logistic")
 # Verbose = 0,1,2
 print ('train xgboost with verbose 0, no message')
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
-               objective = "binary:logistic", verbose = 0)
+               nthread = 2, objective = "binary:logistic", verbose = 0)
 print ('train xgboost with verbose 1, print evaluation metric')
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
-               objective = "binary:logistic", verbose = 1)
+               nthread = 2, objective = "binary:logistic", verbose = 1)
 print ('train xgboost with verbose 2, also print information about tree')
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
-               objective = "binary:logistic", verbose = 2)
+               nthread = 2, objective = "binary:logistic", verbose = 2)
 # you can also specify data as file path to a LibSVM format input
 # since we do not have this file with us, the following line is just for illustration
@@ -58,6 +59,14 @@ pred2 <- predict(bst2, test$data)
 # pred2 should be identical to pred
 print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
 # save model to R's raw vector
 raw = xgb.save.raw(bst)
 # load the model back from the raw vector
 bst3 <- xgb.load(raw)
 pred3 <- predict(bst3, test$data)
 # pred3 should be identical to pred
 print(paste("sum(abs(pred3-pred))=", sum(abs(pred3-pred))))
 #----------------Advanced features --------------
 # to use advanced features, we need to put data in xgb.DMatrix
 dtrain <- xgb.DMatrix(data = train$data, label=train$label)
@@ -69,25 +78,28 @@ watchlist <- list(train=dtrain, test=dtest)
 # watchlist allows us to monitor the evaluation result on all data in the list 
 print ('train xgboost using xgb.train with watchlist')
 bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
-                 objective = "binary:logistic")
+                 nthread = 2, objective = "binary:logistic")
 # we can change evaluation metrics, or use multiple evaluation metrics
 print ('train xgboost using xgb.train with watchlist, watch logloss and error')
 bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
                 eval.metric = "error", eval.metric = "logloss",
-                 objective = "binary:logistic")
+                 nthread = 2, objective = "binary:logistic")
 # xgb.DMatrix can also be saved using xgb.DMatrix.save
 xgb.DMatrix.save(dtrain, "dtrain.buffer")
 # to load it in, simply call xgb.DMatrix
 dtrain2 <- xgb.DMatrix("dtrain.buffer")
 bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nround=2, watchlist=watchlist,
-                 objective = "binary:logistic")
+                 nthread = 2, objective = "binary:logistic")
 # information can be extracted from xgb.DMatrix using getinfo
 label = getinfo(dtest, "label")
 pred <- predict(bst, dtest)
 err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
 print(paste("test-error=", err))
-# Finally, you can dump the tree you learned using xgb.dump into a text file
+# You can dump the tree you learned using xgb.dump into a text file
-xgb.dump(bst, "dump.raw.txt")
+xgb.dump(bst, "dump.raw.txt", with.stats = T)
 # Finally, you can check which features are the most important.
 print("Most important features (look at column Gain):")
 print(xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt"))
--- a/R-package/demo/boost_from_prediction.R
+++ b/R-package/demo/boost_from_prediction.R
@@ -11,7 +11,7 @@ watchlist <- list(eval = dtest, train = dtrain)
 #
 print('start running example to start from a initial prediction')
 # train xgboost for 1 round
-param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
+param <- list(max.depth=2,eta=1,nthread = 2, silent=1,objective='binary:logistic')
 bst <- xgb.train( param, dtrain, 1, watchlist )
 # Note: we need the margin value instead of transformed prediction in set_base_margin
 # do predict with output_margin=TRUE, will always give you margin values before logistic transformation
--- a/R-package/demo/create_sparse_matrix.R
+++ b/R-package/demo/create_sparse_matrix.R
@@ -0,0 +1,89 @@
 require(xgboost)
 require(Matrix)
 require(data.table)
 if (!require(vcd)) install.packages('vcd') #Available in Cran. Used for its dataset with categorical values.
 # According to its documentation, Xgboost works only on numbers.
 # Sometimes the dataset we have to work on has categorical data. 
 # A categorical variable is one which has a fixed number of values. For example, if for each observation a variable called "Colour" can only take the values "red", "blue" or "green", it is a categorical variable.
 #
 # In R, categorical variable is called Factor. 
 # Type ?factor in console for more information.
 #
 # In this demo we will see how to transform a dense dataframe with categorical variables to a sparse matrix before analyzing it in Xgboost.
 # The method we are going to see is usually called "one hot encoding".
 #load Arthritis dataset in memory.
 data(Arthritis)
 # create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance are really good).
 df <- data.table(Arthritis, keep.rownames = F)
 # Let's have a look to the data.table
 cat("Print the dataset\n")
 print(df)
 # 2 columns have factor type, one has ordinal type (an ordinal variable is a categorical variable whose values can be ordered, here: None > Some > Marked).
 cat("Structure of the dataset\n")
 str(df)
 # Let's add some new categorical features to see if it helps. Of course these features are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features.
 # For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treats them as independent values.
 df[,AgeDiscret:= as.factor(round(Age/10,0))]
 # Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
 df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
 # We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
 df[,ID:=NULL]
 # List the different values for the column Treatment: Placebo, Treated.
 cat("Values of the categorical feature Treatment\n")
 print(levels(df[,Treatment]))
 # Next step, we will transform the categorical data to dummy variables.
 # This method is also called one hot encoding.
 # The purpose is to transform each value of each categorical feature in one binary feature.
 #
 # Let's take, the column Treatment will be replaced by two columns, Placebo, and Treated. Each of them will be binary. For example an observation which had the value Placebo in column Treatment before the transformation will have, after the transformation, the value 1 in the new column Placebo and the value 0 in the new column  Treated.
 #
 # Formulae Improved~.-1 used below means transform all categorical features but column Improved to binary values.
 # Column Improved is excluded because it will be our output column, the one we want to predict.
 sparse_matrix = sparse.model.matrix(Improved~.-1, data = df)
 cat("Encoding of the sparse Matrix\n")
 print(sparse_matrix)
 # Create the output vector (not sparse)
 # 1. Set, for all rows, field in Y column to 0; 
 # 2. set Y to 1 when Improved == Marked; 
 # 3. Return Y column
 output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
 # Following is the same process as other demo
 cat("Learning...\n")
 bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
               eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
 xgb.dump(bst, 'xgb.model.dump', with.stats = T)
 # sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
 importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
 print(importance)
 # According to the importance table printed above, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).
 # Do these results make sense?
 # Let's check some Chi2 between each of these features and the outcome.
 print(chisq.test(df$Age, df$Y))
 # Pearson's Chi-squared statistic between Age and illness disappearing is 35
 print(chisq.test(df$AgeDiscret, df$Y))
 # Our first simplification of Age gives a Pearson's Chi-squared statistic of 8.
 print(chisq.test(df$AgeCat, df$Y))
 # The perfectly random split I did between young and old at 30 years old has a low Chi-squared statistic of 2. It's a result we may expect: maybe in my mind being over 30 years is being old (I am 32 and starting to feel old, this may explain that), but for the illness we are studying, the age to be vulnerable is not the same. Don't let your "gut" lower the quality of your model. In "data science", there is science :-)
 # As you can see, in general destroying information by simplifying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on an existing one, which makes the link with the outcome more obvious, may help the algorithm and improve the model. The case studied here is not complex enough to show that. Check Kaggle forum for some challenging datasets.
 # However it's almost always worse when you add some arbitrary rules.
 # Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm has been able to choose the best one, which in this case is the Age. A linear model may not be that strong in such scenarios.
--- a/R-package/demo/cross_validation.R
+++ b/R-package/demo/cross_validation.R
@@ -6,7 +6,7 @@ dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 nround <- 2
-param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
+param <- list(max.depth=2,eta=1,silent=1,nthread = 2, objective='binary:logistic')
 cat('running cross validation\n')
 # do cross validation, this will print result out as
@@ -19,7 +19,7 @@ cat('running cross validation, disable standard deviation display\n')
 # [iteration]  metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
 xgb.cv(param, dtrain, nround, nfold=5,
-       metrics={'error'}, , showsd = FALSE)
+       metrics={'error'}, showsd = FALSE)
 ###
 # you can also do cross validation with customized loss function
@@ -45,3 +45,7 @@ param <- list(max.depth=2,eta=1,silent=1)
 xgb.cv(param, dtrain, nround, nfold = 5,
       obj = logregobj, feval=evalerror)
 # do cross validation with prediction values for each fold
 res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE)
 res$dt
 length(res$pred)
--- a/R-package/demo/custom_objective.R
+++ b/R-package/demo/custom_objective.R
@@ -8,7 +8,7 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 # note: for customized objective function, we leave objective as default
 # note: what we are getting is margin value in prediction
 # you must know what you are doing
-param <- list(max.depth=2,eta=1,silent=1)
+param <- list(max.depth=2,eta=1,nthread = 2, silent=1)
 watchlist <- list(eval = dtest, train = dtrain)
 num_round <- 2
@@ -37,3 +37,26 @@ print ('start training with user customized objective')
 # training with customized objective, we can also do step by step training
 # simply look at xgboost.py's implementation of train
 bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
 #
 # there can be cases where you want additional information 
 # being considered besides the property of DMatrix you can get by getinfo
 # you can set additional information as attributes of DMatrix
 # set label attribute of dtrain to be label, we use label as an example, it can be anything 
 attr(dtrain, 'label') <- getinfo(dtrain, 'label')
 # this is new customized objective, where you can access things you set
 # same thing applies to customized evaluation function
 logregobjattr <- function(preds, dtrain) {
   # the labels are read from the 'label' attribute attached to dtrain
   y <- attr(dtrain, 'label')
   # logistic transformation of the raw margin predictions
   p <- 1/(1 + exp(-preds))
   # gradient and second order gradient of the logistic loss
   list(grad = p - y, hess = p * (1 - p))
 }
 print ('start training with user customized objective, with additional attributes in DMatrix')
 # training with customized objective, we can also do step by step training
 # simply look at xgboost.py's implementation of train
 bst <- xgb.train(param, dtrain, num_round, watchlist, logregobjattr, evalerror)
--- a/R-package/demo/early_stopping.R
+++ b/R-package/demo/early_stopping.R
@@ -0,0 +1,39 @@
 require(xgboost)
 # load in the agaricus dataset
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 # note: for customized objective function, we leave objective as default
 # note: what we are getting is margin value in prediction
 # you must know what you are doing
 param <- list(max.depth=2,eta=1,nthread = 2, silent=1)
 watchlist <- list(eval = dtest)
 num_round <- 20
 # user define objective function, given prediction, return gradient and second order gradient
 # this is loglikelihood loss
 logregobj <- function(preds, dtrain) {
   # labels stored in the DMatrix
   y <- getinfo(dtrain, "label")
   # logistic transformation of the raw margin predictions
   p <- 1/(1 + exp(-preds))
   # gradient and second order gradient of the logistic loss
   list(grad = p - y, hess = p * (1 - p))
 }
 # user defined evaluation function, return a pair metric_name, result
 # NOTE: when you use a customized loss function, the default prediction value is the margin
 # this may make the built-in evaluation metrics not function properly
 # for example, we are doing logistic loss, the prediction is the score before logistic transformation
 # the built-in evaluation error assumes the input is after logistic transformation
 # Keep this in mind when you use the customization, and maybe you need to write a customized evaluation function
 evalerror <- function(preds, dtrain) {
   y <- getinfo(dtrain, "label")
   # preds are margins here, so a positive margin is compared against the label
   mismatch <- y != (preds > 0)
   rate <- as.numeric(sum(mismatch))/length(y)
   list(metric = "error", value = rate)
 }
 print ('start training with early Stopping setting')
 # training with customized objective, we can also do step by step training
 # simply look at xgboost.py's implementation of train
 bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror, maximize = FALSE,
                 early.stop.round = 3)
 bst <- xgb.cv(param, dtrain, num_round, nfold=5, obj=logregobj, feval = evalerror,
              maximize = FALSE, early.stop.round = 3)
--- a/R-package/demo/generalized_linear_model.R
+++ b/R-package/demo/generalized_linear_model.R
@@ -15,7 +15,7 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 # lambda is the L2 regularizer
 # you can also set lambda_bias which is L2 regularizer on the bias term
 param <- list(objective = "binary:logistic", booster = "gblinear",
-              alpha = 0.0001, lambda = 1)
+              nthread = 2, alpha = 0.0001, lambda = 1)
 # normally, you do not need to set eta (step_size)
 # XGBoost uses a parallel coordinate descent algorithm (shotgun), 
--- a/R-package/demo/poisson_regression.R
+++ b/R-package/demo/poisson_regression.R
@@ -0,0 +1,7 @@
 data(mtcars)
 head(mtcars)
 bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11],
              objective='count:poisson',nrounds=5)
 pred = predict(bst,as.matrix(mtcars[,-11]))
 sqrt(mean((pred-mtcars[,11])^2))
--- a/R-package/demo/predict_first_ntree.R
+++ b/R-package/demo/predict_first_ntree.R
@@ -10,7 +10,7 @@ watchlist <- list(eval = dtest, train = dtrain)
 nround = 2
 # training the model for two rounds
-bst = xgb.train(param, dtrain, nround, watchlist)
+bst = xgb.train(param, dtrain, nround, nthread = 2, watchlist)
 cat('start testing prediction from first n trees\n')
 labels <- getinfo(dtest,'label')
--- a/R-package/demo/predict_leaf_indices.R
+++ b/R-package/demo/predict_leaf_indices.R
@@ -0,0 +1,21 @@
 require(xgboost)
 # load in the agaricus dataset
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
 watchlist <- list(eval = dtest, train = dtrain)
 nround = 5
 # training the model for nround (5) rounds
 bst = xgb.train(param, dtrain, nround, nthread = 2, watchlist)
 cat('start testing prediction from first n trees\n')
 ### predict using first 2 trees
 pred_with_leaf = predict(bst, dtest, ntreelimit = 2, predleaf = TRUE)
 head(pred_with_leaf)
 # by default, we predict using all the trees
 pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
 head(pred_with_leaf)
--- a/R-package/demo/runall.R
+++ b/R-package/demo/runall.R
@@ -5,4 +5,7 @@ demo(boost_from_prediction)
 demo(predict_first_ntree)
 demo(generalized_linear_model)
 demo(cross_validation)
-
+demo(create_sparse_matrix)
 demo(predict_leaf_indices)
 demo(early_stopping)
 demo(poisson_regression)
--- a/R-package/man/agaricus.test.Rd
+++ b/R-package/man/agaricus.test.Rd
@@ -1,10 +1,11 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgboost.R
 \docType{data}
 \name{agaricus.test}
 \alias{agaricus.test}
 \title{Test part from Mushroom Data Set}
 \format{A list containing a label vector, and a dgCMatrix object with 1611
-rows and 127 variables}
+rows and 126 variables}
 \usage{
 data(agaricus.test)
 }
@@ -17,7 +18,7 @@ This data set includes the following fields:
 \itemize{
 \item \code{label} the label for each record
- \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
+ \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
 }
 }
 \references{
--- a/R-package/man/agaricus.train.Rd
+++ b/R-package/man/agaricus.train.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgboost.R
 \docType{data}
 \name{agaricus.train}
 \alias{agaricus.train}
@@ -17,7 +18,7 @@ This data set includes the following fields:
 \itemize{
 \item \code{label} the label for each record
- \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
+ \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
 }
 }
 \references{
--- a/R-package/man/getinfo.Rd
+++ b/R-package/man/getinfo.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/getinfo.xgb.DMatrix.R
 \docType{methods}
 \name{getinfo}
 \alias{getinfo}
@@ -10,15 +11,25 @@ getinfo(object, ...)
 \S4method{getinfo}{xgb.DMatrix}(object, name)
 }
 \arguments{
-\item{object}{Object of class "xgb.DMatrix"}
+\item{object}{Object of class \code{xgb.DMatrix}}
 \item{name}{the name of the field to get}
 \item{...}{other parameters}
 \item{name}{the name of the field to get}
 }
 \description{
 Get information of an xgb.DMatrix object
 }
 \details{
 The information can be one of the following:
 \itemize{
    \item \code{label}: label Xgboost learn from ;
    \item \code{weight}: to do a weight rescale ;
    \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
    \item \code{nrow}: number of rows of the \code{xgb.DMatrix}.
 }
 }
 \examples{
 data(agaricus.train, package='xgboost')
 train <- agaricus.train
--- a/R-package/man/nrow-xgb.DMatrix-method.Rd
+++ b/R-package/man/nrow-xgb.DMatrix-method.Rd
@@ -0,0 +1,22 @@
 % Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/nrow.xgb.DMatrix.R
 \docType{methods}
 \name{nrow,xgb.DMatrix-method}
 \alias{nrow,xgb.DMatrix-method}
 \title{Number of xgb.DMatrix rows}
 \usage{
 \S4method{nrow}{xgb.DMatrix}(x)
 }
 \arguments{
 \item{x}{Object of class \code{xgb.DMatrix}}
 }
 \description{
 \code{nrow} returns the number of rows present in the \code{xgb.DMatrix}.
 }
 \examples{
 data(agaricus.train, package='xgboost')
 train <- agaricus.train
 dtrain <- xgb.DMatrix(train$data, label=train$label)
 stopifnot(nrow(dtrain) == nrow(train$data))
 }
--- a/R-package/man/predict-xgb.Booster-method.Rd
+++ b/R-package/man/predict-xgb.Booster-method.Rd
@@ -1,11 +1,12 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/predict.xgb.Booster.R
 \docType{methods}
 \name{predict,xgb.Booster-method}
 \alias{predict,xgb.Booster-method}
 \title{Predict method for eXtreme Gradient Boosting model}
 \usage{
-\S4method{predict}{xgb.Booster}(object, newdata, outputmargin = FALSE,
+\S4method{predict}{xgb.Booster}(object, newdata, missing = NULL,
-  ntreelimit = NULL)
+  outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE)
 }
 \arguments{
 \item{object}{Object of class "xgb.Booster"}
@@ -13,6 +14,9 @@
 \item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or
 \code{xgb.DMatrix}.}
 \item{missing}{Missing is only used when input is dense matrix, pick a float
 value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
 \item{outputmargin}{whether the prediction should be shown in the original
 value of sum of functions, when outputmargin=TRUE, the prediction is
 untransformed margin value. In logistic regression, outputmargin=T will
@@ -21,6 +25,8 @@ output value before logistic transformation.}
 \item{ntreelimit}{limit number of trees used in prediction, this parameter is
 only valid for gbtree, but not for gblinear. set it to be value bigger
 than 0. It will use all trees by default.}
 \item{predleaf}{whether to predict leaf indices instead. If set to TRUE, the output will be a matrix object.}
 }
 \description{
 Predicted values based on xgboost model object.
@@ -31,7 +37,7 @@ data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
-               eta = 1, nround = 2,objective = "binary:logistic")
+               eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 pred <- predict(bst, test$data)
 }
--- a/R-package/man/predict-xgb.Booster.handle-method.Rd
+++ b/R-package/man/predict-xgb.Booster.handle-method.Rd
@@ -0,0 +1,18 @@
 % Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/predict.xgb.Booster.handle.R
 \docType{methods}
 \name{predict,xgb.Booster.handle-method}
 \alias{predict,xgb.Booster.handle-method}
 \title{Predict method for eXtreme Gradient Boosting model handle}
 \usage{
 \S4method{predict}{xgb.Booster.handle}(object, ...)
 }
 \arguments{
 \item{object}{Object of class "xgb.Booster.handle"}
 \item{...}{Parameters pass to \code{predict.xgb.Booster}}
 }
 \description{
 Predicted values based on xgb.Booster.handle object.
 }
--- a/R-package/man/setinfo.Rd
+++ b/R-package/man/setinfo.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/setinfo.xgb.DMatrix.R
 \docType{methods}
 \name{setinfo}
 \alias{setinfo}
@@ -12,15 +13,25 @@ setinfo(object, ...)
 \arguments{
 \item{object}{Object of class "xgb.DMatrix"}
 \item{...}{other parameters}
 \item{name}{the name of the field to get}
 \item{info}{the specific field of information to set}
 \item{...}{other parameters}
 }
 \description{
 Set information of an xgb.DMatrix object
 }
 \details{
 It can be one of the following:
 \itemize{
    \item \code{label}: label Xgboost learn from ;
    \item \code{weight}: to do a weight rescale ;
    \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
    \item \code{group}.
 }
 }
 \examples{
 data(agaricus.train, package='xgboost')
 train <- agaricus.train
--- a/R-package/man/slice.Rd
+++ b/R-package/man/slice.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/slice.xgb.DMatrix.R
 \docType{methods}
 \name{slice}
 \alias{slice}
@@ -13,9 +14,9 @@ slice(object, ...)
 \arguments{
 \item{object}{Object of class "xgb.DMatrix"}
 \item{idxset}{a integer vector of indices of rows needed}
 \item{...}{other parameters}
 \item{idxset}{a integer vector of indices of rows needed}
 }
 \description{
 Get a new DMatrix containing the specified rows of
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgb.DMatrix.R
 \name{xgb.DMatrix}
 \alias{xgb.DMatrix}
 \title{Construct xgb.DMatrix object}
@@ -11,7 +12,8 @@ indicating the data file.}
 \item{info}{a list of information of the xgb.DMatrix object}
-\item{missing}{Missing is only used when input is dense matrix, pick a float}
+\item{missing}{Missing is only used when input is dense matrix, pick a float
 value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
 \item{...}{other information to pass to \code{info}.}
 }
--- a/R-package/man/xgb.DMatrix.save.Rd
+++ b/R-package/man/xgb.DMatrix.save.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgb.DMatrix.save.R
 \name{xgb.DMatrix.save}
 \alias{xgb.DMatrix.save}
 \title{Save xgb.DMatrix object to binary file}
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -1,10 +1,14 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgb.cv.R
 \name{xgb.cv}
 \alias{xgb.cv}
 \title{Cross Validation}
 \usage{
-xgb.cv(params = list(), data, nrounds, nfold, label = NULL, showsd = TRUE,
+xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
-  metrics = list(), obj = NULL, feval = NULL, ...)
+  missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
  obj = NULL, feval = NULL, stratified = TRUE, folds = NULL,
  verbose = T, early_stop_round = NULL, early.stop.round = NULL,
  maximize = NULL, ...)
 }
 \arguments{
 \item{params}{the list of parameters. Commonly used ones are:
@@ -19,18 +23,23 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, showsd = TRUE,
  \item \code{nthread} number of thread used in training, if not set, all threads are used
 }
-  See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
+  See \link{xgb.train} for further details.
-  further details. See also demo/ for walkthrough example in R.}
+  See also demo/ for walkthrough example in R.}
-\item{data}{takes an \code{xgb.DMatrix} as the input.}
+\item{data}{takes an \code{xgb.DMatrix} or \code{Matrix} as the input.}
 \item{nrounds}{the max number of iterations}
-\item{nfold}{number of folds used}
+\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
-\item{label}{option field, when data is Matrix}
+\item{label}{option field, when data is \code{Matrix}}
-\item{showsd}{boolean, whether show standard deviation of cross validation}
+\item{missing}{Missing is only used when input is dense matrix, pick a float
 value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
 \item{prediction}{A logical value indicating whether to return the prediction vector.}
 \item{showsd}{\code{boolean}, whether show standard deviation of cross validation}
 \item{metrics,}{list of evaluation metrics to be used in cross validation,
  when it is not specified, the evaluation metric is chosen according to objective function.
@@ -44,29 +53,58 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, showsd = TRUE,
 }}
 \item{obj}{customized objective function. Returns gradient and second order
-gradient with given prediction and dtrain,}
+gradient with given prediction and dtrain.}
 \item{feval}{customized evaluation function. Returns
 \code{list(metric='metric-name', value='metric-value')} with given
-prediction and dtrain,}
+prediction and dtrain.}
 \item{stratified}{\code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}}
 \item{folds}{\code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
 If folds are supplied, the nfold and stratified parameters would be ignored.}
 \item{verbose}{\code{boolean}, print the statistics during the process}
 \item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered.
 If set to an integer \code{k}, training with a validation set will stop if the performance
 keeps getting worse consecutively for \code{k} rounds.}
 \item{early.stop.round}{An alternative of \code{early_stop_round}.}
 \item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
    \code{maximize=TRUE} means the larger the evaluation score the better.}
 \item{...}{other parameters to pass to \code{params}.}
 }
 \value{
 If \code{prediction = TRUE}, a list with the following elements is returned:
 \itemize{
  \item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set
  \item \code{pred} an array or matrix (for multiclass classification) with predictions for each CV-fold for the model having been trained on the data in all other folds.
 }
 If \code{prediction = FALSE}, just a \code{data.table} with each mean and standard deviation stat for training set and test set is returned.
 }
 \description{
 The cross validation function of xgboost
 }
 \details{
-This is the cross validation function for xgboost
+The original sample is randomly partitioned into \code{nfold} equal size subsamples.
-Parallelization is automatically enabled if OpenMP is present.
+Of the \code{nfold} subsamples, a single subsample is retained as the validation data for testing the model, and the remaining \code{nfold - 1} subsamples are used as training data.
 Number of threads can also be manually specified via "nthread" parameter.
-This function only accepts an \code{xgb.DMatrix} object as the input.
+The cross-validation process is then repeated \code{nfold} times, with each of the \code{nfold} subsamples used exactly once as the validation data.
 All observations are used for both training and validation.
 Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
 }
 \examples{
 data(agaricus.train, package='xgboost')
 dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-history <- xgb.cv(data = dtrain, nround=3, nfold = 5, metrics=list("rmse","auc"),
+history <- xgb.cv(data = dtrain, nround=3, nthread = 2, nfold = 5, metrics=list("rmse","auc"),
-                  "max.depth"=3, "eta"=1, "objective"="binary:logistic")
+                  max.depth =3, eta = 1, objective = "binary:logistic")
 print(history)
 }
--- a/R-package/man/xgb.dump.Rd
+++ b/R-package/man/xgb.dump.Rd
@@ -1,21 +1,30 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgb.dump.R
 \name{xgb.dump}
 \alias{xgb.dump}
 \title{Save xgboost model to text file}
 \usage{
-xgb.dump(model, fname, fmap = "")
+xgb.dump(model = NULL, fname = NULL, fmap = "", with.stats = FALSE)
 }
 \arguments{
 \item{model}{the model object.}
-\item{fname}{the name of the binary file.}
+\item{fname}{the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.}
 \item{fmap}{feature map file representing the type of feature.
-       Detailed description could be found at
+Detailed description could be found at
-       \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
+\url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
-       See demo/ for walkthrough example in R, and
+See demo/ for walkthrough example in R, and
-       \url{https://github.com/tqchen/xgboost/blob/master/demo/data/featmap.txt}
+\url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
-       for example Format.}
+for example Format.}
 \item{with.stats}{whether dump statistics of splits
       When this option is on, the model dump comes with two additional statistics:
       gain is the approximate loss function gain we get in each split;
       cover is the sum of second order gradient in each node.}
 }
 \value{
 if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
 }
 \description{
 Save a xgboost model to text file. Could be parsed later.
@@ -26,7 +35,11 @@ data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
-               eta = 1, nround = 2,objective = "binary:logistic")
+               eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
-xgb.dump(bst, 'xgb.model.dump')
+# save the model in file 'xgb.model.dump'
 xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
 # print the model without saving it to a file
 print(xgb.dump(bst))
 }
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -0,0 +1,70 @@
 % Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgb.importance.R
 \name{xgb.importance}
 \alias{xgb.importance}
 \title{Show importance of features in a model}
 \usage{
 xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL,
  data = NULL, label = NULL, target = function(x) ((x + label) == 2))
 }
 \arguments{
 \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
 \item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}
 \item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
 \item{data}{the dataset used for the training step. Will be used with \code{label} parameter for co-occurrence computation. More information in \code{Detail} part. This parameter is optional.}
 \item{label}{the label vector used for the training step. Will be used with \code{data} parameter for co-occurrence computation. More information in \code{Detail} part. This parameter is optional.}
 \item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be counted as a co-occurrence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurrences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vectors will be made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.}
 }
 \value{
 A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
 }
 \description{
 Read a xgboost model text dump.
 Can be tree or linear model (text dumps of linear models are only supported in dev version of \code{Xgboost} for now).
 }
 \details{
 This is the function to understand the model trained (and through your model, your data).
 Results are returned for both linear and tree models.
 \code{data.table} is returned by the function.
 It contains the following columns:
 \itemize{
  \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
  \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then averaged per feature to give a view of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
  \item \code{Cover} metric of the number of observations related to this feature (only available for tree models) ;
  \item \code{Weight} percentage representing the relative number of times a feature has been taken into trees. \code{Gain} should be preferred to search the most important feature. For boosted linear model, this column has no meaning.
 }
 Co-occurrence count
 -------------------
 The gain gives you an indication of how important a feature is in making a branch of a decision tree more pure. However, with this information only, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
 Co-occurrence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are only 92 out of the 3140 observations of the train dataset where a mushroom has no odor and can be eaten safely.
 If you need to remember one thing only: unless you want to leave us early, don't eat a mushroom which has no odor :-)
 }
 \examples{
 data(agaricus.train, package='xgboost')
 # Both datasets are lists with two items, a sparse matrix and labels
 # (labels = outcome column which will be learned).
 # Each column of the sparse Matrix is a feature in one hot encoding format.
 train <- agaricus.train
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 # train$data@Dimnames[[2]] represents the column names of the sparse matrix.
 xgb.importance(train$data@Dimnames[[2]], model = bst)
 # Same thing with co-occurrence computation this time
 xgb.importance(train$data@Dimnames[[2]], model = bst, data = train$data, label = train$label)
 }
--- a/R-package/man/xgb.load.Rd
+++ b/R-package/man/xgb.load.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgb.load.R
 \name{xgb.load}
 \alias{xgb.load}
 \title{Load xgboost model from binary file}
@@ -17,7 +18,7 @@ data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
-               eta = 1, nround = 2,objective = "binary:logistic")
+               eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 xgb.save(bst, 'xgb.model')
 bst <- xgb.load('xgb.model')
 pred <- predict(bst, test$data)
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -0,0 +1,59 @@
 % Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgb.model.dt.tree.R
 \name{xgb.model.dt.tree}
 \alias{xgb.model.dt.tree}
 \title{Convert tree model dump to data.table}
 \usage{
 xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL,
  model = NULL, text = NULL, n_first_tree = NULL)
 }
 \arguments{
 \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
 \item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
 \item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
 \item{text}{dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
 \item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
 }
 \value{
 A \code{data.table} of the features used in the model with their gain, cover and a few other things.
 }
 \description{
 Read a tree model text dump and return a data.table.
 }
 \details{
 General function to convert a text dump of tree model to a \code{data.table}. The purpose is to help user to explore the model and get a better understanding of it.
 The content of the \code{data.table} is organised this way:
 \itemize{
 \item \code{ID}: unique identifier of a node ;
 \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
 \item \code{Split}: value of the chosen feature where is operated the split ;
 \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
 \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
 \item \code{Missing}: ID of the feature for the next node in the branch for observations where the feature used for the split is not provided ;
 \item \code{Quality}: it's the gain related to the split in this specific node ;
 \item \code{Cover}: metric to measure the number of observations affected by the split ;
 \item \code{Tree}: ID of the tree. It is included in the main ID ;
 \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ;
 }
 }
 \examples{
 data(agaricus.train, package='xgboost')
 #Both datasets are lists with two items, a sparse matrix and labels
 #(labels = outcome column which will be learned).
 #Each column of the sparse Matrix is a feature in one hot encoding format.
 train <- agaricus.train
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
 xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
 }
--- a/R-package/man/xgb.plot.importance.Rd
+++ b/R-package/man/xgb.plot.importance.Rd
@@ -0,0 +1,40 @@
 % Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgb.plot.importance.R
 \name{xgb.plot.importance}
 \alias{xgb.plot.importance}
 \title{Plot feature importance bar graph}
 \usage{
 xgb.plot.importance(importance_matrix = NULL, numberOfClusters = c(1:10))
 }
 \arguments{
 \item{importance_matrix}{a \code{data.table} returned by the \code{xgb.importance} function.}
 \item{numberOfClusters}{a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.}
 }
 \value{
 A \code{ggplot2} bar graph representing each feature by a horizontal bar. The longer the bar, the more important the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar.
 }
 \description{
 Read a data.table containing feature importance details and plot it.
 }
 \details{
 The purpose of this function is to easily represent the importance of each feature of a model.
 The function returns a ggplot graph, therefore each of its characteristics can be overridden (to customize it).
 In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
 }
 \examples{
 data(agaricus.train, package='xgboost')
 #Both datasets are lists with two items, a sparse matrix and labels
 #(labels = outcome column which will be learned).
 #Each column of the sparse Matrix is a feature in one hot encoding format.
 train <- agaricus.train
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #train$data@Dimnames[[2]] represents the column names of the sparse matrix.
 importance_matrix <- xgb.importance(train$data@Dimnames[[2]], model = bst)
 xgb.plot.importance(importance_matrix)
 }
--- a/R-package/man/xgb.plot.tree.Rd
+++ b/R-package/man/xgb.plot.tree.Rd
@@ -0,0 +1,58 @@
 % Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgb.plot.tree.R
 \name{xgb.plot.tree}
 \alias{xgb.plot.tree}
 \title{Plot a boosted tree model}
 \usage{
 xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL,
  n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL)
 }
 \arguments{
 \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
 \item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).}
 \item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
 \item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
 \item{CSSstyle}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
 \item{width}{the width of the diagram in pixels.}
 \item{height}{the height of the diagram in pixels.}
 }
 \value{
 A \code{DiagrammeR} of the model.
 }
 \description{
 Read a tree model text dump.
 Plotting only works for boosted tree model (not linear model).
 }
 \details{
 The content of each node is organised this way:
 \itemize{
 \item \code{feature} value ;
 \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. The deeper a node is in the tree, the lower this metric will be ;
 \item \code{gain}: metric of the importance of the node in the model.
 }
 Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
 It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose.
 }
 \examples{
 data(agaricus.train, package='xgboost')
 #Both datasets are lists with two items, a sparse matrix and labels
 #(labels = outcome column which will be learned).
 #Each column of the sparse Matrix is a feature in one hot encoding format.
 train <- agaricus.train
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
 xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)
 }
--- a/R-package/man/xgb.save.Rd
+++ b/R-package/man/xgb.save.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgb.save.R
 \name{xgb.save}
 \alias{xgb.save}
 \title{Save xgboost model to binary file}
@@ -19,7 +20,7 @@ data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
-               eta = 1, nround = 2,objective = "binary:logistic")
+               eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 xgb.save(bst, 'xgb.model')
 bst <- xgb.load('xgb.model')
 pred <- predict(bst, test$data)
--- a/R-package/man/xgb.save.raw.Rd
+++ b/R-package/man/xgb.save.raw.Rd
@@ -0,0 +1,27 @@
 % Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgb.save.raw.R
 \name{xgb.save.raw}
 \alias{xgb.save.raw}
 \title{Save xgboost model to R's raw vector,
 user can call xgb.load to load the model back from raw vector}
 \usage{
 xgb.save.raw(model)
 }
 \arguments{
 \item{model}{the model object.}
 }
 \description{
 Save xgboost model from xgboost or xgb.train
 }
 \examples{
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 raw <- xgb.save.raw(bst)
 bst <- xgb.load(raw)
 pred <- predict(bst, test$data)
 }
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -1,26 +1,62 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgb.train.R
 \name{xgb.train}
 \alias{xgb.train}
 \title{eXtreme Gradient Boosting Training}
 \usage{
 xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
-  feval = NULL, verbose = 1, ...)
+  feval = NULL, verbose = 1, printEveryN=1L, early_stop_round = NULL,
  early.stop.round = NULL, maximize = NULL, ...)
 }
 \arguments{
-\item{params}{the list of parameters. Commonly used ones are:
+\item{params}{the list of parameters.
 1. General Parameters
 \itemize{
-  \item \code{objective} objective function, common ones are
+  \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}
-  \itemize{
+  \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0
    \item \code{reg:linear} linear regression
    \item \code{binary:logistic} logistic regression for classification
  }
  \item \code{eta} step size of each boosting step
  \item \code{max.depth} maximum depth of the tree
  \item \code{nthread} number of thread used in training, if not set, all threads are used
 }
-  See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
+2. Booster Parameters
-  further details. See also demo/ for walkthrough example in R.}
+
 2.1. Parameter for Tree Booster
 \itemize{
  \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
  \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
  \item \code{max_depth} maximum depth of a tree. Default: 6
  \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
  \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
  \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
  \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample  < 1}  and \code{round = 1}) accordingly. Default: 1
 }
 2.2. Parameter for Linear Booster
 \itemize{
  \item \code{lambda} L2 regularization term on weights. Default: 0
  \item \code{lambda_bias} L2 regularization term on bias. Default: 0
  \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
 }
 3. Task Parameters
 \itemize{
 \item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below:
  \itemize{
    \item \code{reg:linear} linear regression (Default).
    \item \code{reg:logistic} logistic regression.
    \item \code{binary:logistic} logistic regression for binary classification. Output probability.
    \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
    \item \code{num_class} set the number of classes. To use only with multiclass objectives.
    \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
    \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
    \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
  }
  \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
  \item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
 }}
 \item{data}{takes an \code{xgb.DMatrix} as the input.}
@@ -40,22 +76,46 @@ gradient with given prediction and dtrain,}
 prediction and dtrain,}
 \item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
-  information of performance. If 2, xgboost will print information of both}
+information of performance. If 2, xgboost will print information of both}
 \item{printEveryN}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
 \item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered.
 If set to an integer \code{k}, training with a validation set will stop if the performance
 keeps getting worse consecutively for \code{k} rounds.}
 \item{early.stop.round}{An alternative of \code{early_stop_round}.}
 \item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
 \code{maximize=TRUE} means the larger the evaluation score the better.}
 \item{...}{other parameters to pass to \code{params}.}
 }
 \description{
-The training function of xgboost
+An advanced interface for training xgboost model. Look at \code{\link{xgboost}} function for a simpler interface.
 }
 \details{
-This is the training function for xgboost.
+This is the training function for \code{xgboost}.
-Parallelization is automatically enabled if OpenMP is present.
+It supports advanced features such as \code{watchlist}, customized objective (\code{obj}) and evaluation (\code{feval}) functions,
-Number of threads can also be manually specified via "nthread" parameter.
+therefore it is more flexible than \code{\link{xgboost}} function.
-This function only accepts an \code{xgb.DMatrix} object as the input.
+Parallelization is automatically enabled if \code{OpenMP} is present.
-It supports advanced features such as watchlist, customized objective function,
+Number of threads can also be manually specified via \code{nthread} parameter.
-therefore it is more flexible than \code{\link{xgboost}}.
+
 \code{eval_metric} parameter (not listed above) is set automatically by Xgboost but can be overridden by parameter. Below is provided the list of different metrics optimized by Xgboost to help you to understand how it works inside or to use them with the \code{watchlist} parameter.
  \itemize{
     \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
     \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
     \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
     \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
     \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
     \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
  }
 Full list of parameters is available in the Wiki \url{https://github.com/dmlc/xgboost/wiki/Parameters}.
 This function only accepts an \code{\link{xgb.DMatrix}} object as the input.
 }
 \examples{
 data(agaricus.train, package='xgboost')
@@ -75,6 +135,6 @@ evalerror <- function(preds, dtrain) {
  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
  return(list(metric = "error", value = err))
 }
-bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
+bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror)
 }
--- a/R-package/man/xgboost.Rd
+++ b/R-package/man/xgboost.Rd
@@ -1,18 +1,26 @@
-% Generated by roxygen2 (4.0.1): do not edit by hand
+% Generated by roxygen2 (4.1.1): do not edit by hand
 % Please edit documentation in R/xgboost.R
 \name{xgboost}
 \alias{xgboost}
 \title{eXtreme Gradient Boosting (Tree) library}
 \usage{
-xgboost(data = NULL, label = NULL, params = list(), nrounds,
+xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
-  verbose = 1, ...)
+  nrounds, verbose = 1, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL,
  maximize = NULL, ...)
 }
 \arguments{
 \item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
 \code{xgb.DMatrix}.}
-\item{label}{the response variable. User should not set this field,}
+\item{label}{the response variable. User should not set this field,
 if data is local data file or  \code{xgb.DMatrix}.}
-\item{params}{the list of parameters. Commonly used ones are:
+\item{missing}{Missing is only used when input is dense matrix, pick a float
 value that represents missing value. Sometimes data use 0 or another extreme value to represent missing values.}
 \item{params}{the list of parameters.
 Commonly used ones are:
 \itemize{
  \item \code{objective} objective function, common ones are
  \itemize{
@@ -24,8 +32,9 @@ xgboost(data = NULL, label = NULL, params = list(), nrounds,
  \item \code{nthread} number of thread used in training, if not set, all threads are used
 }
-  See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
+  Look at \code{\link{xgb.train}} for a more complete list of parameters or \url{https://github.com/dmlc/xgboost/wiki/Parameters} for the full list.
-  further details. See also demo/ for walkthrough example in R.}
+
  See also \code{demo/} for walkthrough example in R.}
 \item{nrounds}{the max number of iterations}
@@ -33,16 +42,28 @@ xgboost(data = NULL, label = NULL, params = list(), nrounds,
 information of performance. If 2, xgboost will print information of both
 performance and construction progress information}
 \item{printEveryN}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
 \item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered.
 If set to an integer \code{k}, training with a validation set will stop if the performance
 keeps getting worse consecutively for \code{k} rounds.}
 \item{early.stop.round}{An alternative of \code{early_stop_round}.}
 \item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
 \code{maximize=TRUE} means the larger the evaluation score the better.}
 \item{...}{other parameters to pass to \code{params}.}
 }
 \description{
-A simple interface for xgboost in R
+A simple interface for training xgboost model. Look at \code{\link{xgb.train}} function for a more advanced interface.
 }
 \details{
-This is the modeling function for xgboost.
+This is the modeling function for Xgboost.
-Parallelization is automatically enabled if OpenMP is present.
+Parallelization is automatically enabled if \code{OpenMP} is present.
-Number of threads can also be manually specified via "nthread" parameter
+
 Number of threads can also be manually specified via \code{nthread} parameter.
 }
 \examples{
 data(agaricus.train, package='xgboost')
@@ -50,7 +71,7 @@ data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
-               eta = 1, nround = 2,objective = "binary:logistic")
+               eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
 pred <- predict(bst, test$data)
 }
--- a/R-package/src/Makevars
+++ b/R-package/src/Makevars
@@ -1,9 +1,8 @@
 # package root
 PKGROOT=../../
 # _*_ mode: Makefile; _*_
-PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
+PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT)
-PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
+PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
-PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
+PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
-OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
+OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -1,7 +1,19 @@
 # package root
-PKGROOT=../../
+PKGROOT=./
 # _*_ mode: Makefile; _*_
-PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
+
-PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
+# This file is only used for windows compilation from github
-PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
+# It will be replaced by Makevars in CRAN version
-OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
+.PHONY: all xgblib
 all: $(SHLIB)
 $(SHLIB): xgblib
 xgblib:
 	cp -r ../../src .
 	cp -r ../../wrapper .
 	cp -r ../../subtree .
 PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT) -I../..
 PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
 PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
 OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o
 $(OBJECTS) : xgblib
--- a/R-package/src/xgboost_R.cpp
+++ b/R-package/src/xgboost_R.cpp
@@ -3,10 +3,12 @@
 #include <utility>
 #include <cstring>
 #include <cstdio>
-#include "xgboost_R.h"
+#include <sstream> 
 #include "wrapper/xgboost_wrapper.h"
 #include "src/utils/utils.h"
 #include "src/utils/omp.h"
 #include "xgboost_R.h"
 using namespace std;
 using namespace xgboost;
@@ -26,7 +28,13 @@ extern "C" {
  void (*Check)(int exp, const char *fmt, ...) = XGBoostCheck_R;
  void (*Error)(const char *fmt, ...) = error;
 }
-}  // namespace utils
+bool CheckNAN(double v) {
  return ISNAN(v);
 }
 // Log-gamma of v, delegated to lgammafn (presumably R's math library
 // routine -- confirm against the R headers pulled in by xgboost_R.h).
 // Returns double: the previous `bool` return type collapsed the real-valued
 // result to 0/1, which made the value useless to any numeric caller.
 // NOTE(review): make sure the declaration callers see also returns double.
 double LogGamma(double v) {
  return lgammafn(v);
 }
 } // namespace utils
 namespace random {
 void Seed(unsigned seed) {
@@ -51,6 +59,9 @@ inline void _WrapperEnd(void) {
 }
 extern "C" {
  // Report whether the external pointer `handle` currently holds a NULL
  // address. Returns a logical scalar that is TRUE when
  // R_ExternalPtrAddr(handle) is NULL and FALSE otherwise.
  SEXP XGCheckNullPtr_R(SEXP handle) {
    return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
  }
  void _DMatrixFinalizer(SEXP ext) {    
    if (R_ExternalPtrAddr(ext) == NULL) return;
    XGDMatrixFree(R_ExternalPtrAddr(ext));
@@ -59,31 +70,31 @@ extern "C" {
  SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
    _WrapperBegin();
    void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent));
    _WrapperEnd();
    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  SEXP XGDMatrixCreateFromMat_R(SEXP mat, 
                                SEXP missing) {
    _WrapperBegin();
    SEXP dim = getAttrib(mat, R_DimSymbol);
-    int nrow = INTEGER(dim)[0];
+    size_t nrow = static_cast<size_t>(INTEGER(dim)[0]);
-    int ncol = INTEGER(dim)[1];    
+    size_t ncol = static_cast<size_t>(INTEGER(dim)[1]);
    double *din = REAL(mat);
    std::vector<float> data(nrow * ncol);
    #pragma omp parallel for schedule(static)
-    for (int i = 0; i < nrow; ++i) {
+    for (bst_omp_uint i = 0; i < nrow; ++i) {
-      for (int j = 0; j < ncol; ++j) {
+      for (size_t j = 0; j < ncol; ++j) {
        data[i * ncol +j] = din[i + nrow * j];
      }
    }
    void *handle = XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing));
    _WrapperEnd();
    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;    
  }
  SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
@@ -109,10 +120,10 @@ extern "C" {
    }
    void *handle = XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
                                          BeginPtr(data_), nindptr, ndata);
    _WrapperEnd();
    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
@@ -123,10 +134,10 @@ extern "C" {
      idxvec[i] = INTEGER(idxset)[i] - 1;
    }
    void *res = XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len);
    _WrapperEnd();
    SEXP ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;        
  }
  void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
@@ -146,10 +157,7 @@ extern "C" {
        vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
      }
      XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len);
-      _WrapperEnd();
+    } else {
      return;
    }
    {
      std::vector<float> vec(len);
      #pragma omp parallel for schedule(static)
      for (int i = 0; i < len; ++i) {
@@ -166,12 +174,12 @@ extern "C" {
    bst_ulong olen;
    const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
                                             CHAR(asChar(field)), &olen);
    _WrapperEnd();
    SEXP ret = PROTECT(allocVector(REALSXP, olen));
    for (size_t i = 0; i < olen; ++i) {
      REAL(ret)[i] = res[i];
    }
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  SEXP XGDMatrixNumRow_R(SEXP handle) {
@@ -192,10 +200,10 @@ extern "C" {
      dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
    }
    void *handle = XGBoosterCreate(BeginPtr(dvec), dvec.size());
    _WrapperEnd();
    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
@@ -241,25 +249,27 @@ extern "C" {
    for (int i = 0; i < len; ++i) {
      vec_sptr.push_back(vec_names[i].c_str());
    }
-    return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
+    const char *ret =
-                                         asInteger(iter),
+        XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
-                                         BeginPtr(vec_dmats), BeginPtr(vec_sptr), len));
+                             asInteger(iter),
                             BeginPtr(vec_dmats), BeginPtr(vec_sptr), len);  
    _WrapperEnd();
    return mkString(ret);
  }
-  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit) {
+  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) {
    _WrapperBegin();
    // olen is filled in by XGBoosterPredict with the number of returned values.
    bst_ulong olen;
    const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
                                        R_ExternalPtrAddr(dmat),
-                                        asInteger(output_margin),
+                                        asInteger(option_mask),
                                        asInteger(ntree_limit),
                                        &olen);
    _WrapperEnd();
    // Copy the float results element by element into a newly allocated R numeric vector.
    SEXP ret = PROTECT(allocVector(REALSXP, olen));
    for (size_t i = 0; i < olen; ++i) {
      REAL(ret)[i] = res[i];
    }
    UNPROTECT(1);
    // NOTE(review): _WrapperEnd() is also called before the allocation above; likely one of
    // the two is the removed line of the diff - confirm.
    _WrapperEnd();
    return ret;
  }
  void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
@@ -272,18 +282,41 @@ extern "C" {
    XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
    _WrapperEnd();
  }
-  void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) {
+  void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) {    
    _WrapperBegin();
-    bst_ulong olen;
+    XGBoosterLoadModelFromBuffer(R_ExternalPtrAddr(handle),
-    const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
+                                 RAW(raw),
-                                          CHAR(asChar(fmap)),
+                                 length(raw));
                                          &olen);
    FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w");
    for (size_t i = 0; i < olen; ++i) {
      fprintf(fo, "booster[%u]:\n", static_cast<unsigned>(i));
      fprintf(fo, "%s", res[i]);
    }
    fclose(fo);
    _WrapperEnd();
  }
  SEXP XGBoosterModelToRaw_R(SEXP handle) {
    // Returns the booster's model bytes as an R raw vector.
    bst_ulong olen;
    _WrapperBegin();
    // raw points at the model buffer; olen receives its length in bytes.
    const char *raw = XGBoosterGetModelRaw(R_ExternalPtrAddr(handle), &olen);
    _WrapperEnd();
    SEXP ret = PROTECT(allocVector(RAWSXP, olen));
    // Nothing to copy when the buffer is empty.
    if (olen != 0) {
      memcpy(RAW(ret), raw, olen);
    }
    UNPROTECT(1);    
    return ret;
  }
  SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
    // Returns the model dump as an R character vector instead of writing a file.
    _WrapperBegin();
    // olen receives the number of dump strings returned in res.
    bst_ulong olen;
    const char **res =
        XGBoosterDumpModel(R_ExternalPtrAddr(handle),
                           CHAR(asChar(fmap)),
                           asInteger(with_stats),
                           &olen);
    _WrapperEnd();
    // One output string per dump entry; out stays PROTECTed while elements are created.
    SEXP out = PROTECT(allocVector(STRSXP, olen));    
    for (size_t i = 0; i < olen; ++i) {     
      // Prefix each entry with a "booster[i]" header line.
      stringstream stream;
      stream <<  "booster["<<i<<"]\n" << res[i];
      SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
    }
    UNPROTECT(1);
    return out;
  }
 }
--- a/R-package/src/xgboost_R.h
+++ b/R-package/src/xgboost_R.h
@@ -8,9 +8,16 @@
 extern "C" {
 #include <Rinternals.h>
 #include <R_ext/Random.h>
 #include <Rmath.h>
 }
 extern "C" {
  /*!
   * \brief check whether a handle is NULL
   * \param handle
   * \return whether it is null ptr
   */
  SEXP XGCheckNullPtr_R(SEXP handle);
  /*!
   * \brief load a data matrix 
   * \param fname name of the content
@@ -111,10 +118,10 @@ extern "C" {
   * \brief make prediction based on dmat
   * \param handle handle
   * \param dmat data matrix
-   * \param output_margin whether only output raw margin value
+   * \param option_mask output_margin:1 predict_leaf:2
   * \param ntree_limit limit number of trees used in prediction
   */
-  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit);
+  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit);
  /*!
   * \brief load model from existing file
   * \param handle handle
@@ -128,11 +135,22 @@ extern "C" {
   */    
  void XGBoosterSaveModel_R(SEXP handle, SEXP fname);
  /*!
-   * \brief dump model into text file 
+   * \brief load model from raw array
   * \param handle handle
-   * \param fname file name of model that can be dumped into
+   */    
-   * \param fmap  name to fmap can be empty string
+  void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw);
  /*!
   * \brief save model into R's raw array
   * \param handle handle
   * \return raw array
   */
-  void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap);
+  SEXP XGBoosterModelToRaw_R(SEXP handle);
  /*!
   * \brief dump model into a string
   * \param handle handle
   * \param fmap  name to fmap can be empty string
   * \param with_stats whether dump statistics of splits
   */
  SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats);
 }
 #endif  // XGBOOST_WRAPPER_R_H_
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -0,0 +1,337 @@
 ---
 title: "Understand your dataset with Xgboost"
 output: 
  rmarkdown::html_vignette:
    css: vignette.css
    number_sections: yes
    toc: yes
 author: Tianqi Chen, Tong He, Michaël Benesty
 vignette: >
  %\VignetteIndexEntry{Discover your data}
  %\VignetteEngine{knitr::rmarkdown}
  \usepackage[utf8]{inputenc}
 ---
 Introduction
 ============
 The purpose of this Vignette is to show you how to use **Xgboost** to discover and understand your own dataset better.
 This Vignette is not about predicting anything (see [Xgboost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). We will explain how to use **Xgboost** to highlight the *link* between the *features* of your data and the *outcome*.
 Package loading:
 ```{r libLoading, results='hold', message=F, warning=F}
 require(xgboost)
 require(Matrix)
 require(data.table)
 if (!require('vcd')) install.packages('vcd') 
 ```
 > **VCD** package is used for one of its embedded datasets only.
 Preparation of the dataset
 ==========================
 Numeric VS categorical variables
 --------------------------------
 **Xgboost** manages only `numeric` vectors.
 What to do when you have *categorical* data?
 A *categorical* variable has a fixed number of different values. For instance, if a variable called *Colour* can have only one of these three values, *red*, *blue* or *green*, then *Colour* is a *categorical* variable.
 > In **R**, a *categorical* variable is called `factor`.
 >
 > Type `?factor` in the console for more information.
 To answer the question above we will convert *categorical* variables to `numeric` ones.
 Conversion from categorical to numeric variables
 ------------------------------------------------
 ### Looking at the raw data
 In this Vignette we will see how to transform a *dense* `data.frame` (*dense* = few zeroes in the matrix) with *categorical* variables to a very *sparse* matrix (*sparse* = lots of zeroes in the matrix) of `numeric` features.
 The method we are going to see is usually called [one-hot encoding](http://en.wikipedia.org/wiki/One-hot).
 The first step is to load `Arthritis` dataset in memory and wrap it with `data.table` package.
 ```{r, results='hide'}
 data(Arthritis)
 df <- data.table(Arthritis, keep.rownames = F)
 ```
 > `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large datasets is [best in class](http://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **Xgboost** **R** package use `data.table`.
 The first thing we want to do is to have a look at the first lines of the `data.table`:
 ```{r}
 head(df)
 ```
 Now we will check the format of each column.
 ```{r}
 str(df)
 ```
 2 columns have `factor` type, one has `ordinal` type.
 > `ordinal` variable :
 >
 > * can take a limited number of values (like `factor`) ;
 > * these values are ordered (unlike `factor`). Here these ordered values are: `Marked > Some > None`
 ### Creation of new features based on old ones
 We will add some new *categorical* features to see if it helps.
 #### Grouping per 10 years
 For the first feature we create groups of age by rounding the real age.
 Note that we transform it to `factor` so the algorithm treats these age groups as independent values.
 Therefore, 20 is not closer to 30 than 60. To make it short, the distance between ages is lost in this transformation.
 ```{r}
 head(df[,AgeDiscret := as.factor(round(Age/10,0))])
 ```
 #### Random split in two groups
 Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. I chose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...).
 ```{r}
 head(df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))])
 ```
 #### Risks in adding correlated features
 These new features are highly correlated to the `Age` feature because they are simple transformations of this feature. 
 For many machine learning algorithms, using correlated features is not a good idea. It may sometimes make prediction less accurate, and most of the time make interpretation of the model almost impossible. GLM, for instance, assumes that the features are uncorrelated.
 Fortunately, decision tree algorithms (including boosted trees) are very robust to these features. Therefore we have nothing to do to manage this situation.
 #### Cleaning data
 We remove ID as there is nothing to learn from this feature (it would just add some noise).
 ```{r, results='hide'}
 df[,ID:=NULL]
 ```
 We will list the different values for the column `Treatment`:
 ```{r}
 levels(df[,Treatment])
 ```
 ### One-hot encoding
 Next step, we will transform the categorical data to dummy variables.
 This is the [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) step.
 The purpose is to transform each value of each *categorical* feature in a *binary* feature `{0, 1}`.
 For example, the column `Treatment` will be replaced by two columns, `Placebo`, and `Treated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have after the transformation the value `1` in the new column `Placebo` and the value `0` in the new column `Treated`. The column `Treatment` will disappear during the one-hot encoding.
 Column `Improved` is excluded because it will be our `label` column, the one we want to predict.
 ```{r, warning=FALSE,message=FALSE}
 sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
 head(sparse_matrix)
 ```
 > The formula `Improved~.-1` used above means: transform all *categorical* features but column `Improved` to binary values. The `-1` is here to remove the first column which is full of `1` (this column is generated by the conversion). For more information, you can type `?sparse.model.matrix` in the console.
 Create the output `numeric` vector (not as a sparse `Matrix`):
 ```{r}
 output_vector = df[,Improved] == "Marked"
 ```
 1. set `Y` vector to `0`; 
 2. set `Y` to `1` for rows where `Improved == Marked` is `TRUE` ; 
 3. return `Y` vector.
 Build the model
 ===============
 The code below is very usual. For more information, you can look at the documentation of `xgboost` function (or at the vignette [Xgboost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)).
 ```{r}
 bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 4,
               eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
 ```
 You can see some `train-error: 0.XXXXX` lines. The number on each line decreases from one round to the next. Each line shows how well the model explains your data. Lower is better.
 A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitting) (meaning it copies the past too closely, and won't be that good at predicting the future). 
 > Here you can see the numbers decrease until line 7 and then increase. 
 >
 > It probably means we are overfitting. To fix that I should reduce the number of rounds to `nround = 4`. I will leave things as they are because I don't really care for the purpose of this example :-)
 Feature importance
 ==================
 Measure feature importance
 --------------------------
 ### Build the feature importance data.table
 In the code below, `sparse_matrix@Dimnames[[2]]` represents the column names of the sparse matrix. These names are the original values of the features (remember, each binary column == one value of one *categorical* feature).
 ```{r}
 importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst)
 head(importance)
 ```
 > The column `Gain` provides the information we are looking for.
 >
 > As you can see, features are classified by `Gain`.
 `Gain` is the improvement in accuracy brought by a feature to the branches it is on. The idea is that before adding a new split on a feature X to the branch there were some wrongly classified elements, after adding the split on this feature, there are two new branches, and each of these branches is more accurate (one branch saying if your observation is on this branch then it should be classified as `1`, and the other branch saying the exact opposite).
 `Cover` measures the relative quantity of observations concerned by a feature.
 `Frequence` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it).
 ### Improvement in the interpretability of feature importance data.table
 We can go deeper in the analysis of the model. In the `data.table` above, we have discovered which features count when predicting whether the illness will go or not. But we don't yet know the role of these features. For instance, one of the questions we may want to answer would be: does receiving a placebo treatment help to recover from the illness?
 One simple solution is to count the co-occurrences of a feature and a class of the classification.
 For that purpose we will execute the same function as above but using two more parameters, `data` and `label`.
 ```{r}
 importanceRaw <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
 # Cleaning for better display
 importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequence=NULL)]
 head(importanceClean)
 ```
 > In the table above we have removed two unneeded columns and selected only the first lines.
 The first thing you notice is the new column `Split`. It is the split applied to the feature on a branch of one of the trees. Each split is present, therefore a feature can appear several times in this table. Here we can see the feature `Age` is used several times with different splits.
 How is the split applied to count the co-occurrences? It is always `<`. For instance, in the second line, we measure the number of persons under 61.5 years with the illness gone after the treatment.
 The two other new columns are `RealCover` and `RealCover %`. In the first column it measures the number of observations in the dataset where the split is respected and the label marked as `1`. The second column is the percentage of the whole population that `RealCover` represents.
 Therefore, according to our findings, getting a placebo doesn't seem to help but being younger than 61 years may help (seems logical).
 > You may wonder how to interpret the `< 1.00001` on the first line. Basically, in a sparse `Matrix`, there is no `0`, therefore, looking for one hot-encoded categorical observations validating the rule `< 1.00001` is like just looking for `1` for this feature.
 Plotting the feature importance
 -------------------------------
 All these things are nice, but it would be even better to plot the results.
 ```{r, fig.width=8, fig.height=5, fig.align='center'}
 xgb.plot.importance(importance_matrix = importanceRaw)
 ```
 Features have automatically been divided in 2 clusters: the interesting features... and the others.
 > Depending on the dataset and the learning parameters you may have more than two clusters. Default value is to limit them to `10`, but you can increase this limit. Look at the function documentation for more information.
 According to the plot above, the most important features in this dataset to predict if the treatment will work are :
 * the Age ;
 * having received a placebo or not ;
 * the sex is third but already included in the not interesting features group ; 
 * then we see our generated features (AgeDiscret). We can see that their contribution is very low.
 Do these results make sense?
 ------------------------------
 Let's check some **Chi2** between each of these features and the label.
 Higher **Chi2** means better correlation.
 ```{r, warning=FALSE, message=FALSE}
 c2 <- chisq.test(df$Age, output_vector)
 print(c2)
 ```
 Pearson correlation between Age and illness disappearing is **`r round(c2$statistic, 2 )`**.
 ```{r, warning=FALSE, message=FALSE}
 c2 <- chisq.test(df$AgeDiscret, output_vector)
 print(c2)
 ```
 Our first simplification of Age gives a Pearson correlation of **`r round(c2$statistic, 2)`**.
 ```{r, warning=FALSE, message=FALSE}
 c2 <- chisq.test(df$AgeCat, output_vector)
 print(c2)
 ```
 The perfectly random split I did between young and old at 30 years old has a low correlation of **`r round(c2$statistic, 2)`**. It's a result we may expect: maybe in my mind being over 30 years means being old (I am 32 and starting to feel old, this may explain that), but for the illness we are studying, the age to be vulnerable is not the same. 
 Moral: don't let your *gut* lower the quality of your model. 
 In *data science* expression, there is the word *science* :-)
 Conclusion
 ==========
 As you can see, in general *destroying information by simplifying it won't improve your model*. **Chi2** just demonstrates that. 
 But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. 
 The case studied here is not complex enough to show that. Check [Kaggle website](http://www.kaggle.com/) for some challenging datasets. However it's almost always worse when you add some arbitrary rules.
 Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm has been able to choose the best one, which in this case is the Age.
 A linear model may not be that smart in this scenario.
 Special Note: What about Random Forests™?
 ==========================================
 As you may know, the [Random Forests™](http://en.wikipedia.org/wiki/Random_forest) algorithm is a cousin of boosting and both are part of the [ensemble learning](http://en.wikipedia.org/wiki/Ensemble_learning) family.
 Both train several decision trees for one dataset. The *main* difference is that in Random Forests™, trees are independent and in boosting, the tree `N+1` focuses its learning on the loss (<=> what has not been well modeled by the tree `N`).
 This difference has an impact on a corner case in feature importance analysis: the *correlated features*.
 Imagine two features perfectly correlated, feature `A` and feature `B`. For one specific tree, if the algorithm needs one of them, it will choose randomly (true in both boosting and Random Forests™).
 However, in Random Forests™ this random choice will be done for each tree, because each tree is independent from the others. Therefore, approximately, depending on your parameters, 50% of the trees will choose feature `A` and the other 50% will choose feature `B`. So the *importance* of the information contained in `A` and `B` (which is the same, because they are perfectly correlated) is diluted in `A` and `B`. So you won't easily know this information is important to predict what you want to predict! It is even worse when you have 10 correlated features...
 In boosting, when a specific link between feature and outcome has been learned by the algorithm, it will try to not refocus on it (in theory it is what happens, reality is not always that simple). Therefore, all the importance will be on feature `A` or on feature `B` (but not both). You will know that one feature has an important role in the link between the observations and the label. It is still up to you to search for the correlated features to the one detected as important if you need to know all of them.
 If you want to try Random Forests™ algorithm, you can tweak Xgboost parameters! 
 **Warning**: this is still an experimental parameter.
 For instance, to compute a model with 1000 trees, with a 0.5 factor on sampling rows and columns:
 ```{r, warning=FALSE, message=FALSE}
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
 #Random Forest™ - 1000 trees
 bst <- xgboost(data = train$data, label = train$label, max.depth = 4, num_parallel_tree = 1000, subsample = 0.5, colsample_bytree =0.5, nround = 1, objective = "binary:logistic")
 #Boosting - 3 rounds
 bst <- xgboost(data = train$data, label = train$label, max.depth = 4, nround = 3, objective = "binary:logistic")
 ```
 > Note that, in the Random Forests™ call, the parameter `nround` is set to `1`.
 > [**Random Forests™**](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_papers.htm) is a trademark of Leo Breiman and Adele Cutler and is licensed exclusively to Salford Systems for the commercial release of the software.
--- a/R-package/vignettes/vignette.css
+++ b/R-package/vignettes/vignette.css
@@ -0,0 +1,225 @@
 body {
    margin: 0 auto;
    background-color: white;
 /*  --------- FONT FAMILY --------
   following are some optional font families. Usually a family 
 	is safer to choose than a specific font, 
 	which may not be on the users computer		*/
 /    font-family:Georgia, Palatino, serif;
    font-family: "Open Sans", "Book Antiqua", Palatino, serif;
 /    font-family:Arial, Helvetica, sans-serif;
 /    font-family:Tahoma, Verdana, Geneva, sans-serif;
 /    font-family:Courier, monospace;
 /    font-family:"Times New Roman", Times, serif;
 /*	-------------- COLOR OPTIONS ------------
 	following are additional color options for base font
 	you could uncomment another one to easily change the base color 
 	or add one to a specific element style below         */		
    color: #333333; /* dark gray not black */
 /    color: #000000; /* black */
 /    color: #666666; /* medium gray  black */	
 /    color: #E3E3E3; /* very light gray */
 /    color: white; 
    line-height: 100%;
    max-width: 800px;
    padding: 10px;
    font-size: 17px;
    text-align: justify;
    text-justify: inter-word;
 }
 p {
    line-height: 150%;
 /    max-width: 540px;
    max-width: 960px;
    margin-bottom: 5px;
    font-weight: 400;    
 /    color: #333333
 }
 h1, h2, h3, h4, h5, h6 {
    font-weight: 400;
    margin-top: 35px;
    margin-bottom: 15px;
    padding-top: 10px;
 }
 h1 {
    margin-top: 70px;
    color: #606AAA;
    font-size:230%;
    font-variant:small-caps;
    padding-bottom:20px;
    width:100%;
    border-bottom:1px solid #606AAA;
 }
 h2 {
    font-size:160%;
 }
 h3 {
    font-size:130%;
 }
 h4 {
    font-size:120%;
    font-variant:small-caps;
 }
 h5 {
    font-size:120%;
 }
 h6 {
    font-size:120%;
    font-variant:small-caps;
 }
 a {
    color: #606AAA;
    margin: 0;
    padding: 0;
    vertical-align: baseline;
 }
 a:hover {
    text-decoration: blink;
    color: green;
 }
 a:visited {
    color: gray;
 }
 ul, ol {
    padding: 0;
    margin: 0px 0px 0px 50px;
 }
 ul {
    list-style-type: square;
    list-style-position: inside;
 }
 li {
     line-height:150%    
 }
 /* nested lists, bulleted or numbered, use a 24px left margin instead of the 50px of top-level lists */
 li ul, li ol {
    margin-left: 24px;
 }
 pre {
    padding: 0px 10px;
    max-width: 800px;
    white-space: pre-wrap;
 }
 code {
    font-family: Consolas, Monaco, Andale Mono, monospace, courrier new;
    line-height: 1.5;
    font-size: 15px;
    background: #F8F8F8;
    border-radius: 4px;
    padding: 5px;
    display: inline-block;
    max-width: 800px;
    white-space: pre-wrap;
 }
 li code, p code {
  background: #CDCDCD;
  color: #606AAA;
  padding: 0px 5px 0px 5px;
 }
 code.r, code.cpp {
    display: block;
    word-wrap: break-word;
    border: 1px solid #606AAA;        
 }
 aside {
    display: block;
    float: right;
    width: 390px;
 }
 blockquote {
    border-left:.5em solid #606AAA;
    background: #F8F8F8;
    padding: 0em 1em 0em 1em;
    margin-left:10px;
    max-width: 500px;
 }
 blockquote cite {
    line-height:10px;
    color:#bfbfbf;
 }
 blockquote cite:before {
    /content: '\2014 \00A0';
 }
 blockquote p, blockquote li {  
    color: #666;
 }
 hr {
 /   width: 540px;
    text-align: left;
    margin: 0 auto 0 0;
    color: #999;
 }
 /* table */
 table {
  width: 100%;
  border-top: 1px solid #919699;
 	border-left: 1px solid #919699;
 	border-spacing: 0;
 }
 table th {
 	padding: 4px 8px 4px 8px;
  text-align: center;
  color: white;
 	background: #606AAA;
 	border-bottom: 1px solid #919699;
 	border-right: 1px solid #919699;
 }
 table th p {
 	font-weight: bold;
 	margin-bottom: 0px; 
 }
 table td {
 	padding: 8px;	
 	vertical-align: top;
 	border-bottom: 1px solid #919699;
 	border-right: 1px solid #919699;
 }
 table td:last-child {
  /background: lightgray;
  text-align: right;
 }
 table td p {
 	margin-bottom: 0px; 
 }
 table td p + p  {
 	margin-top: 5px; 
 }
 table td p + p + p {
 	margin-top: 5px; 
 }
--- a/R-package/vignettes/xgboost.Rnw
+++ b/R-package/vignettes/xgboost.Rnw
@@ -49,7 +49,7 @@ xgboost.version = '0.3-0'
 This is an introductory document of using the \verb@xgboost@ package in R. 
 \verb@xgboost@ is short for eXtreme Gradient Boosting package. It is an efficient
- and scalable implementation of gradient boosting framework by \citep{friedman2001greedy}. 
+ and scalable implementation of gradient boosting framework by \citep{friedman2001greedy} \citep{friedman2000additive}. 
 The package includes efficient linear model solver and tree learning algorithm.
 It supports various objective functions, including regression, classification
 and ranking. The package is made to be extendible, so that users are also allowed to define their own objectives easily. It has several features:
@@ -214,3 +214,8 @@ competition.
 \end{document}
 <<Temp file cleaning, include=FALSE>>=
 file.remove("xgb.DMatrix")
 file.remove("model.dump")
 file.remove("model.save")
@
--- a/R-package/vignettes/xgboostPresentation.Rmd
+++ b/R-package/vignettes/xgboostPresentation.Rmd
@@ -0,0 +1,405 @@
 ---
 title: "Xgboost presentation"
 output: 
  rmarkdown::html_vignette:
    css: vignette.css
    number_sections: yes
    toc: yes
 bibliography: xgboost.bib
 author: Tianqi Chen, Tong He, Michaël Benesty
 vignette: >
  %\VignetteIndexEntry{Xgboost presentation}
  %\VignetteEngine{knitr::rmarkdown}
  \usepackage[utf8]{inputenc}
 ---
 Introduction
 ============
 **Xgboost** is short for e**X**treme **G**radient **Boost**ing package. 
 The purpose of this Vignette is to show you how to use **Xgboost** to build a model and make predictions.
 It is an efficient and scalable implementation of gradient boosting framework by @friedman2000additive and @friedman2001greedy. Two solvers are included:
 - *linear* model ;
 - *tree learning* algorithm.
 It supports various objective functions, including *regression*, *classification* and *ranking*. The package is made to be extendible, so that users are also allowed to define their own objective functions easily. 
 It has been [used](https://github.com/dmlc/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions. 
 It has several features:
 * Speed: it can automatically do parallel computation on *Windows* and *Linux*, with *OpenMP*. It is generally over 10 times faster than the classical `gbm`.
 * Input Type: it takes several types of input data:
    * *Dense* Matrix: *R*'s *dense* matrix, i.e. `matrix` ;
    * *Sparse* Matrix: *R*'s *sparse* matrix, i.e. `Matrix::dgCMatrix` ;
    * Data File: local data files ;
    * `xgb.DMatrix`: its own class (recommended).
 * Sparsity: it accepts *sparse* input for both *tree booster*  and *linear booster*, and is optimized for *sparse* input ;
 * Customization: it supports customized objective functions and evaluation functions.
 Installation
 ============
 Github version
 --------------
 For up-to-date version (highly recommended), install from *Github*:
 ```{r installGithub, eval=FALSE}
 devtools::install_github('dmlc/xgboost', subdir='R-package')
 ```
 > *Windows* user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
 Cran version
 ------------
 For stable version on *CRAN*, run:
 ```{r installCran, eval=FALSE}
 install.packages('xgboost')
 ```
 Learning
 ========
 For the purpose of this tutorial we will load **Xgboost** package.
 ```{r libLoading, results='hold', message=F, warning=F}
 require(xgboost)
 ```
 Dataset presentation
 --------------------
 In this example, we are aiming to predict whether a mushroom can be eaten or not (like in many tutorials, example data are the same as you will use in your everyday life :-).
 Mushroom data is cited from UCI Machine Learning Repository. @Bache+Lichman:2013.
 Dataset loading
 ---------------
 We will load the `agaricus` datasets embedded with the package and will link them to variables.
 The datasets are already split in:
 * `train`: will be used to build the model ;
 * `test`: will be used to assess the quality of our model. 
 Why *split* the dataset in two parts?
 In the first part we will build our model. In the second part we will want to test it and assess its quality. Without dividing the dataset we would test the model on the data which the algorithm has already seen.
 ```{r datasetLoading, results='hold', message=F, warning=F}
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
 ```
 > In the real world, it would be up to you to make this division between `train` and `test` data. The way to do it is out of the purpose of this article, however `caret` package may [help](http://topepo.github.io/caret/splitting.html).
 Each variable is a `list` containing two things, `label` and `data`:
 ```{r dataList, message=F, warning=F}
 str(train)
 ```
 `label` is the outcome of our dataset meaning it is the binary *classification* we will try to predict.
 Let's discover the dimensionality of our datasets.
 ```{r dataSize, message=F, warning=F}
 dim(train$data)
 dim(test$data)
 ```
 This dataset is kept very small so as not to make the **R** package too heavy; however, **Xgboost** is built to manage huge datasets very efficiently.
 As seen below, the `data` are stored in a `dgCMatrix` which is a *sparse* matrix and `label` vector is a `numeric` vector (`{0,1}`):
 ```{r dataClass, message=F, warning=F}
 class(train$data)[1]
 class(train$label)
 ```
 Basic Training using Xgboost
 ----------------------------
 This step is the most critical part of the process for the quality of our model.
 ### Basic training
 We are using the `train` data. As explained above, both `data` and `label` are stored in a `list`.
 In a *sparse* matrix, cells containing `0` are not stored in memory. Therefore, in a dataset mainly made of `0`, memory size is reduced. It is very usual to have such dataset.
 We will train decision tree model using the following parameters:
 * `objective = "binary:logistic"`: we will train a binary classification model ;
 * `max.depth = 2`: the trees won't be deep, because our case is very simple ;
 * `nthread = 2`: the number of cpu threads we are going to use;
 * `nround = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction.
 ```{r trainingSparse, message=F, warning=F}
 bstSparse <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
 ```
 > The more complex the relationship between your features and your `label` is, the more passes you need.
 ### Parameter variations
 #### Dense matrix
 Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R** matrix.
 ```{r trainingDense, message=F, warning=F}
 bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
 ```
 #### xgb.DMatrix
 **Xgboost** offers a way to group the data and the label in a `xgb.DMatrix`. You can even add other meta data in it. It will be useful for the most advanced features we will discover later.
 ```{r trainingDmatrix, message=F, warning=F}
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
 bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
 ```
 #### Verbose option
 **Xgboost** has several features to help you view how the learning progresses internally. The purpose is to help you set the best parameters, which is the key to your model quality.
 One of the simplest ways to see the training progress is to set the `verbose` option (see below for more advanced techniques).
 ```{r trainingVerbose0, message=T, warning=F}
 # verbose = 0, no message
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 0)
 ```
 ```{r trainingVerbose1, message=T, warning=F}
 # verbose = 1, print evaluation metric
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 1)
 ```
 ```{r trainingVerbose2, message=T, warning=F}
 # verbose = 2, also print information about tree
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 2)
 ```
 Basic prediction using Xgboost
 ==============================
 Perform the prediction
 ----------------------
 The purpose of the model we have built is to classify new data. As explained before, we will use the `test` dataset for this step.
 ```{r predicting, message=F, warning=F}
 pred <- predict(bst, test$data)
 # size of the prediction vector
 print(length(pred))
 # limit display of predictions to the first 6 (the default of head())
 print(head(pred))
 ```
 These numbers don't look like *binary classification* `{0,1}`. We need to perform a simple transformation before being able to use these results.
 Transform the regression in a binary classification
 ---------------------------------------------------
 The only thing that **Xgboost** does is a *regression*. **Xgboost** is using `label` vector to build its *regression* model.
 How can we use a *regression* model to perform a binary classification?
 If we think about the meaning of a regression applied to our data, the numbers we get are probabilities that a datum will be classified as `1`. Therefore, we will set the rule that if this probability for a specific datum is `> 0.5` then the observation is classified as `1` (or `0` otherwise).
 ```{r predictingTest, message=F, warning=F}
 prediction <- as.numeric(pred > 0.5)
 print(head(prediction))
 ```
 Measuring model performance
 ---------------------------
 To measure the model performance, we will compute a simple metric, the *average error*.
 ```{r predictingAverageError, message=F, warning=F}
 err <- mean(as.numeric(pred > 0.5) != test$label)
 print(paste("test-error=", err))
 ```
 > Note that the algorithm has not seen the `test` data during the model construction.
 Steps explanation:
 1. `as.numeric(pred > 0.5)` applies our rule that when the probability (<=> regression <=> prediction) is `> 0.5` the observation is classified as `1` and `0` otherwise ;
 2. `probabilityVectorPreviouslyComputed != test$label` computes the vector of error between true data and computed probabilities ;
 3. `mean(vectorOfErrors)` computes the *average error* itself.
 The most important thing to remember is that **to do a classification, you just do a regression to the** `label` **and then apply a threshold**. 
 *Multiclass* classification works in a similar way.
 This metric is **`r round(err, 2)`** and is pretty low: our yummly mushroom model works well!
 Advanced features
 =================
 Most of the features below have been implemented to help you to improve your model by offering a better understanding of its content.
 Dataset preparation
 -------------------
 For the following advanced features, we need to put data in `xgb.DMatrix` as explained above.
 ```{r DMatrix, message=F, warning=F}
 dtrain <- xgb.DMatrix(data = train$data, label=train$label)
 dtest <- xgb.DMatrix(data = test$data, label=test$label)
 ```
 Measure learning progress with xgb.train
 ----------------------------------------
 Both `xgboost` (simple) and `xgb.train` (advanced) functions train models.
 One of the special features of `xgb.train` is the capacity to follow the progress of the learning after each round. Because of the way boosting works, there is a point at which having too many rounds leads to overfitting. You can see this feature as a cousin of the cross-validation method. The following techniques will help you to avoid overfitting or to optimize the learning time by stopping it as soon as possible.
 One way to measure progress in learning of a model is to provide to **Xgboost** a second dataset already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.
 > In some way it is similar to what we have done above with the average error. The main difference is that above it was after building the model, and now it is during the construction that we measure errors.
 For the purpose of this example, we use `watchlist` parameter. It is a list of `xgb.DMatrix`, each of them tagged with a name.
 ```{r watchlist, message=F, warning=F}
 watchlist <- list(train=dtrain, test=dtest)
 bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
 ```
 **Xgboost** has computed at each round the same average error metric as seen above (we set `nround` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.
 Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.
 If you do not get such results with your own dataset, you should think about how you divided your dataset into training and test sets. Maybe there is something to fix. Again, `caret` package may [help](http://topepo.github.io/caret/splitting.html).
 For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics.
 ```{r watchlist2, message=F, warning=F}
 bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
 ```
 > `eval.metric` allows us to monitor two new metrics for each round, `logloss` and `error`.
 Linear boosting
 ---------------
 Until now, all the training we have performed was based on boosting trees. **Xgboost** implements a second algorithm, based on linear boosting. The only difference with the previous command is the `booster = "gblinear"` parameter (and removing the `eta` parameter).
 ```{r linearBoosting, message=F, warning=F}
 bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
 ```
 In this specific case, *linear boosting* gets slightly better performance metrics than the decision trees based algorithm.
 In simple cases, this will happen because there is nothing better than a linear algorithm to catch a linear link. However, decision trees are much better at catching a non-linear link between predictors and outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to have an idea of what to use.
 Manipulating xgb.DMatrix
 ------------------------
 ### Save / Load
 Like saving models, `xgb.DMatrix` object (which groups both dataset and outcome) can also be saved using `xgb.DMatrix.save` function.
 ```{r DMatrixSave, message=F, warning=F}
 xgb.DMatrix.save(dtrain, "dtrain.buffer")
 # to load it in, simply call xgb.DMatrix
 dtrain2 <- xgb.DMatrix("dtrain.buffer")
 bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
 ```
 ```{r DMatrixDel, include=FALSE}
 file.remove("dtrain.buffer")
 ```
 ### Information extraction
 Information can be extracted from `xgb.DMatrix` using `getinfo` function. Hereafter we will extract `label` data.
 ```{r getinfo, message=F, warning=F}
 label = getinfo(dtest, "label")
 pred <- predict(bst, dtest)
 err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
 print(paste("test-error=", err))
 ```
 View the trees from a model
 ---------------------------
 You can dump the tree you learned using `xgb.dump` into a text file.
 ```{r dump, message=T, warning=F}
 xgb.dump(bst, with.stats = T)
 ```
 > if you provide a path to `fname` parameter you can save the trees to your hard drive.
 Save and load models
 --------------------
 Maybe your dataset is big, and it takes time to train a model on it? Maybe you are not a big fan of losing time redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required.
 Fortunately for you, **Xgboost** implements such functions.
 ```{r saveModel, message=F, warning=F}
 # save model to binary local file
 xgb.save(bst, "xgboost.model")
 ```
 > `xgb.save` function should return `r TRUE` if everything goes well and crashes otherwise.
 An interesting test to see how identical our saved model is to the original one would be to compare the two predictions.
 ```{r loadModel, message=F, warning=F}
 # load binary model to R
 bst2 <- xgb.load("xgboost.model")
 pred2 <- predict(bst2, test$data)
 # And now the test
 print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
 ```
 ```{r clean, include=FALSE}
 # delete the created model
 file.remove("./xgboost.model")
 ```
 > result is `0`? We are good!
 In some very specific cases, like when you want to pilot **Xgboost** from `caret` package, you will want to save the model as an *R* binary vector. See below how to do it.
 ```{r saveLoadRBinVectorModel, message=F, warning=F}
 # save model to R's raw vector
 rawVec <- xgb.save.raw(bst)
 # print class
 print(class(rawVec))
 # load the model back into R from the raw vector
 bst3 <- xgb.load(rawVec)
 pred3 <- predict(bst3, test$data)
 # pred3 should be identical to pred; compare pred3 (not pred2) so that the
 # raw-vector round trip is what is actually being verified
 print(paste("sum(abs(pred3-pred))=", sum(abs(pred3-pred))))
 ``` 
 > Again `0`? It seems that `Xgboost` works pretty well!
 References
 ==========
--- a/README.md
+++ b/README.md
@@ -1,52 +1,57 @@
-xgboost: eXtreme Gradient Boosting 
+XGBoost: eXtreme Gradient Boosting 
-======
+==================================
 An optimized general purpose gradient boosting library. The library is parallelized using OpenMP. It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree. 
-Contributors: https://github.com/tqchen/xgboost/graphs/contributors
+An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
 It implements machine learning algorithms under the gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also be distributed and scales to terascale data
-Turorial and Documentation: https://github.com/tqchen/xgboost/wiki
+Contributors: https://github.com/dmlc/xgboost/graphs/contributors
-Questions and Issues: [https://github.com/tqchen/xgboost/issues](https://github.com/tqchen/xgboost/issues?q=is%3Aissue+label%3Aquestion)
+Documentations: [Documentation of xgboost](doc/README.md)
-Examples Code: [Learning to use xgboost by examples](demo)
+Issues Tracker: [https://github.com/dmlc/xgboost/issues](https://github.com/dmlc/xgboost/issues?q=is%3Aissue+label%3Aquestion)
-Notes on the Code: [Code Guide](src)
+Please join [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/) to ask questions and share your experience on xgboost.
  - Use issue tracker for bug reports, feature requests etc.
  - Use the user group to post your experience, ask questions about general usages.
 Gitter for developers [![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 Distributed Version: [Distributed XGBoost](multi-node)
 Highlights of Usecases: [Highlight Links](doc/README.md#highlight-links)
 What's New
-=====
+==========
-* See the updated [demo folder](demo) for feature walkthrough
+* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
-* Thanks to Tong He, the new [R package](R-package) is available
+* XGBoost wins [WWW2015  Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
  - Checkout the winning solution at [Highlight links](doc/README.md#highlight-links)
 * [External Memory Version](doc/external_memory.md)
 Features
-======
+========
-* Sparse feature format:
+* Easily accessible in python, R, Julia, CLI
-  - Sparse feature format allows easy handling of missing values, and improve computation efficiency.
+* Fast speed and memory efficient
-* Push the limit on single machine:
+  - Can be more than 10 times faster than GBM in sklearn and R
-  - Efficient implementation that optimizes memory and computation.
+  - Handles sparse matrices, support external memory
-* Speed: XGBoost is very fast
+* Accurate prediction, and used extensively by data scientists and kagglers
-  - IN [demo/higgs/speedtest.py](demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier
+  - See [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links)
-* Layout of gradient boosting algorithm to support user defined objective
+* Distributed and Portable
-* Python interface, works with numpy and scipy.sparse matrix
+  - The distributed version runs on Hadoop (YARN), MPI, SGE etc.
  - Scales to billions of examples and beyond
 Build
-=====
+=======
 * Run ```bash build.sh``` (you can also type make)
-* If your compiler does not come with OpenMP support, it will fire an warning telling you that the code will compile into single thread mode, and you will get single thread xgboost
+  - Normally it gives what you want
-* You may get a error: -lgomp is not found
+  - See [Build Instruction](doc/build.md) for more information
  - You can type ```make no_omp=1```, this will get you single thread xgboost
  - Alternatively, you can upgrade your compiler to compile multi-thread version
 * Windows(VS 2010): see [windows](windows) folder
  - In principle, you put all the cpp files in the Makefile to the project, and build
 Version
-======
+=======
-* This version xgboost-0.3, the code has been refactored from 0.2x to be cleaner and more flexibility
+* Current version xgboost-0.4, a lot of improvements have been made since 0.3
-* This version of xgboost is not compatible with 0.2x, due to huge amount of changes in code structure
+  - Change log in [CHANGES.md](CHANGES.md)
-  - This means the model and buffer file of previous version can not be loaded in xgboost-3.0
+  - This version is compatible with 0.3x versions
 * For legacy 0.2x code, refer to [Here](https://github.com/tqchen/xgboost/releases/tag/v0.22)
 * Change log in [CHANGES.md](CHANGES.md)
 XGBoost in Graphlab Create
-======
+==========================
 * XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the Graphlab Create in http://graphlab.com/products/create/quick-start-guide.html
 * Nice blogpost by Jay Gu using GLC boosted tree to solve kaggle bike sharing challenge: http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand
--- a/build.sh
+++ b/build.sh
@@ -1,8 +1,12 @@
 #!/bin/bash
-# this is a simple script to make xgboost in MAC nad Linux
+# This is a simple script to make xgboost in MAC and Linux
-# basically, it first try to make with OpenMP, if fails, disable OpenMP and make again
+# Basically, it first tries to make with OpenMP; if that fails, it disables OpenMP and makes it again.
-# This will automatically make xgboost for MAC users who do not have openmp support
+# This will automatically make xgboost for MAC users who don't have OpenMP support.
-# In most cases, type make will give what you want
+# In most cases, type make will give what you want.
 # See additional instruction in doc/build.md
 if make; then
    echo "Successfully build multi-thread xgboost"
 else
@@ -12,4 +16,6 @@ else
    make clean
    make no_omp=1
    echo "Successfully build single-thread xgboost"
    echo "If you want multi-threaded version"
    echo "See additional instructions in doc/build.md"
 fi
--- a/demo/.gitignore
+++ b/demo/.gitignore
@@ -0,0 +1 @@
 *.libsvm
--- a/demo/README.md
+++ b/demo/README.md
@@ -1,22 +1,45 @@
 XGBoost Examples
 ====
-This folder contains the all example codes using xgboost. 
+This folder contains all the code examples using xgboost. 
-* Contribution of exampls, benchmarks is more than welcomed!
+* Contribution of examples, benchmarks is more than welcome!
 * If you like to share how you use xgboost to solve your problem, send a pull request:)
 Features Walkthrough
 ====
 This is a list of short codes introducing different functionalities of xgboost and its wrapper.
-* Basic walkthrough of wrappers [python](guide-python/basic_walkthrough.py)
+* Basic walkthrough of wrappers 
-* Cutomize loss function, and evaluation metric [python](guide-python/custom_objective.py)
+  [python](guide-python/basic_walkthrough.py)
-* Boosting from existing prediction [python](guide-python/boost_from_prediction.py)
+  [R](../R-package/demo/basic_walkthrough.R)
-* Predicting using first n trees [python](guide-python/predict_first_ntree.py)
+  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl)
-* Generalized Linear Model [python](guide-python/generalized_linear_model.py)
+* Customize loss function, and evaluation metric
-* Cross validation [python](guide-python/cross_validation.py)
+  [python](guide-python/custom_objective.py)
  [R](../R-package/demo/custom_objective.R)
  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/custom_objective.jl)
 * Boosting from existing prediction
  [python](guide-python/boost_from_prediction.py)
  [R](../R-package/demo/boost_from_prediction.R)
  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl)
 * Predicting using first n trees 
  [python](guide-python/predict_first_ntree.py)
  [R](../R-package/demo/boost_from_prediction.R)
  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl)  
 * Generalized Linear Model
  [python](guide-python/generalized_linear_model.py)
  [R](../R-package/demo/generalized_linear_model.R)
  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/generalized_linear_model.jl)  
 * Cross validation
  [python](guide-python/cross_validation.py)
  [R](../R-package/demo/cross_validation.R)
  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl)  
 * Predicting leaf indices
  [python](guide-python/predict_leaf_indices.py)
  [R](../R-package/demo/predict_leaf_indices.R)
 Basic Examples by Tasks
 ====
 Most of examples in this section are based on CLI or python version.
 However, the parameter settings can be applied to all versions
 * [Binary classification](binary_classification)
 * [Multiclass classification](multiclass_classification)
 * [Regression](regression)
@@ -25,3 +48,5 @@ Basic Examples by Tasks
 Benchmarks
 ====
 * [Starter script for Kaggle Higgs Boson](kaggle-higgs)
 * [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution) 
--- a/demo/binary_classification/README
+++ b/demo/binary_classification/README
@@ -1,14 +0,0 @@
 Demonstrating how to use XGBoost accomplish binary classification tasks  on UCI mushroom dataset  http://archive.ics.uci.edu/ml/datasets/Mushroom
 Run: ./runexp.sh
 Format of input: LIBSVM format
 Format of ```featmap.txt: <featureid> <featurename> <q or i or int>\n ```:
  - Feature id must be from 0 to number of features, in sorted order.
  - i means this feature is binary indicator feature
  - q means this feature is a quantitative value, such as age, time, can be missing
  - int means this feature is integer value (when int is hinted, the decision boundary will be integer)
 Explainations: https://github.com/tqchen/xgboost/wiki/Binary-Classification
--- a/demo/binary_classification/README.md
+++ b/demo/binary_classification/README.md
@@ -0,0 +1,172 @@
 Binary Classification
 ====
 This is the quick start tutorial for xgboost CLI version. You can also checkout [../../doc/README.md](../../doc/README.md) for links to tutorial in python or R.
 Here we demonstrate how to use XGBoost for a binary classification task. Before getting started, make sure you compile xgboost in the root directory of the project by typing ```make```
 The script runexp.sh can be used to run the demo. Here we use [mushroom dataset](https://archive.ics.uci.edu/ml/datasets/Mushroom) from UCI machine learning repository. 
 ### Tutorial
 #### Generate Input Data
 XGBoost takes LibSVM format. An example of faked input data is below:
 ```
 1 101:1.2 102:0.03
 0 1:2.1 10001:300 10002:400 
 ...
 ```
 Each line represents a single instance, and in the first line '1' is the instance label, '101' and '102' are feature indices, '1.2' and '0.03' are feature values. In the binary classification case, '1' is used to indicate positive samples, and '0' is used to indicate negative samples. We also support probability values in [0,1] as label, to indicate the probability of the instance being positive.
 First we will transform the dataset into classic LibSVM format and split the data into training set and test set by running:
 ```
 python mapfeat.py
 python mknfold.py agaricus.txt 1
 ```
 The two files, 'agaricus.txt.train' and 'agaricus.txt.test' will be used as training set and test set.
 #### Training 
 Then we can run the training process:
 ```
 ../../xgboost mushroom.conf
 ```
 mushroom.conf is the configuration for both training and testing. Each line contains an [attribute]=[value] configuration:
 ```conf
 # General Parameters, see comment for each definition
 # can be gbtree or gblinear
 booster = gbtree 
 # choose logistic regression loss function for binary classification
 objective = binary:logistic
 # Tree Booster Parameters
 # step size shrinkage
 eta = 1.0 
 # minimum loss reduction required to make a further partition
 gamma = 1.0 
 # minimum sum of instance weight(hessian) needed in a child
 min_child_weight = 1 
 # maximum depth of a tree
 max_depth = 3 
 # Task Parameters
 # the number of round to do boosting
 num_round = 2
 # 0 means do not save any model except the final round model
 save_period = 0 
 # The path of training data
 data = "agaricus.txt.train" 
 # The path of validation data, used to monitor training process, here [test] sets name of the validation set
 eval[test] = "agaricus.txt.test" 
 # The path of test data 
 test:data = "agaricus.txt.test"      
 ```
 We use the tree booster and logistic regression objective in our setting. This indicates that we accomplish our task using classic gradient boosting regression tree(GBRT), which is a promising method for binary classification.
 The parameters shown in the example are the most common ones needed to use xgboost.
 If you are interested in more parameter settings, the complete parameter settings and detailed descriptions are [here](../../doc/parameter.md). Besides putting the parameters in the configuration file, we can set them by passing them as arguments as below:
 ```
 ../../xgboost mushroom.conf max_depth=6
 ```
 This means that the parameter max_depth will be set as 6 rather than 3 in the conf file. When you use command line, make sure max_depth=6 is passed in as single argument, i.e. do not contain space in the argument. When a parameter setting is provided in both command line input and  the config file, the command line setting will override the setting in config file.
 In this example, we use tree booster for gradient boosting. If you would like to use linear booster for regression, you can keep all the parameters except booster and the tree booster parameters as below:
 ```conf 
 # General Parameters
 # choose the linear booster
 booster = gblinear
 ...
 # Change Tree Booster Parameters into Linear Booster Parameters
 # L2 regularization term on weights, default 0
 lambda = 0.01
 # L1 regularization term on weights, default 0
 f ```agaricus.txt.test.buffer``` exists, and automatically loads from binary buffer if possible, this can speedup training process when you do training many times. You can disable it by setting ```use_buffer=0```.
  - Buffer file can also be used as standalone input, i.e if buffer file exists, but original agaricus.txt.test was removed, xgboost will still run
 * Deviation from LibSVM input format: xgboost is compatible with LibSVM format, with the following minor differences:
  - xgboost allows feature index starts from 0
  - for binary classification, the label is 1 for positive, 0 for negative, instead of +1,-1
  - the feature indices in each line *do not* need to be sorted
 alpha = 0.01 
 # L2 regularization term on bias, default 0
 lambda_bias = 0.01 
 # Regression Parameters
 ...
 ```
 #### Get Predictions 
 After training, we can use the output model to get the prediction of the test data:
 ```
 ../../xgboost mushroom.conf task=pred model_in=0003.model
 ```
 For binary classification, the output predictions are probability confidence scores in [0,1], corresponding to the probability that the label is positive.
 #### Dump Model
 This is a preliminary feature; so far only tree models support text dump. XGBoost can write the tree models to text files so that we can inspect the model easily:
 ```
 ../../xgboost mushroom.conf task=dump model_in=0003.model name_dump=dump.raw.txt 
 ../../xgboost mushroom.conf task=dump model_in=0003.model fmap=featmap.txt name_dump=dump.nice.txt
 ```
 In this demo, the tree boosters obtained will be printed in dump.raw.txt and dump.nice.txt; the latter is easier to understand because it uses the feature mapping in featmap.txt.
 Format of ```featmap.txt: <featureid> <featurename> <q or i or int>\n ```:
  - Feature id must be from 0 to number of features, in sorted order.
  - i means this feature is binary indicator feature
  - q means this feature is a quantitative value, such as age, time, can be missing
  - int means this feature is integer value (when int is hinted, the decision boundary will be integer)
 #### Monitoring Progress
 When you run training, messages like the following are displayed on screen:
 ```
 tree train end, 1 roots, 12 extra nodes, 0 pruned nodes ,max_depth=3
 [0]  test-error:0.016139
 boosting round 1, 0 sec elapsed
 tree train end, 1 roots, 10 extra nodes, 0 pruned nodes ,max_depth=3
 [1]  test-error:0.000000
 ```
 The messages for evaluation are printed into stderr, so if you want only to log the evaluation progress, simply type
 ```
 ../../xgboost mushroom.conf 2>log.txt
 ```
 Then you can find the following content in log.txt
 ```
 [0]     test-error:0.016139
 [1]     test-error:0.000000
 ```
 We can also monitor both training and test statistics by adding the following lines to the configuration file:
 ```conf
 eval[test] = "agaricus.txt.test" 
 eval[trainname] = "agaricus.txt.train" 
 ```
 Run the command again, we can find the log file becomes
 ```
 [0]     test-error:0.016139     trainname-error:0.014433
 [1]     test-error:0.000000     trainname-error:0.001228
 ```
 The rule is eval[name-printed-in-log] = filename, then the file will be added to monitoring process, and evaluated each round.
 xgboost also supports monitoring multiple metrics. Suppose we also want to monitor the average log-likelihood of each prediction during training: simply add ```eval_metric=logloss``` to the configuration. Run again, and the log file becomes
 ```
 [0]     test-error:0.016139     test-negllik:0.029795   trainname-error:0.014433        trainname-negllik:0.027023
 [1]     test-error:0.000000     test-negllik:0.000000   trainname-error:0.001228        trainname-negllik:0.002457
 ```
 ### Saving Progress Models
 If you want to save the model every two rounds, simply set save_period=2. You will find 0002.model in the current folder. If you want to change the output folder of models, add model_dir=foldername. By default xgboost saves the model of the last round.
 #### Continue from Existing Model
 If you want to continue boosting from existing model, say 0002.model, use
 ```
 ../../xgboost mushroom.conf model_in=0002.model num_round=2 model_out=continue.model
 ```
 xgboost will load 0002.model, continue boosting for 2 rounds, and save the output to continue.model. However, beware that the training and evaluation data specified in mushroom.conf should not change when you use this function.
 #### Use Multi-Threading
 When you are working with a large dataset, you may want to take advantage of parallelism. If your compiler supports OpenMP, xgboost is naturally multi-threaded. To set the number of parallel running threads to 10, add ```nthread=10``` to your configuration.
 #### Additional Notes
 * What are ```agaricus.txt.test.buffer``` and ```agaricus.txt.train.buffer``` generated during runexp.sh? 
  - By default xgboost will automatically generate a binary format buffer of input data, with suffix ```buffer```. When next time you run xgboost, it detects i
 Demonstrating how to use XGBoost to accomplish binary classification tasks on the UCI mushroom dataset http://archive.ics.uci.edu/ml/datasets/Mushroom
--- a/demo/binary_classification/mapfeat.py
+++ b/demo/binary_classification/mapfeat.py
@@ -1,17 +1,16 @@
 #!/usr/bin/python
 import sys
 def loadfmap( fname ):
    fmap = {}
    nmap = {}
-    
+
    for l in open( fname ):
        arr = l.split()
-        if arr[0].find('.') != -1:            
+        if arr[0].find('.') != -1:
            idx = int( arr[0].strip('.') )
-            assert idx not in fmap        
+            assert idx not in fmap
            fmap[ idx ] = {}
-            ftype = arr[1].strip(':')        
+            ftype = arr[1].strip(':')
            content = arr[2]
        else:
            content = arr[0]
@@ -23,7 +22,7 @@ def loadfmap( fname ):
            nmap[ len(nmap) ] = ftype+'='+k
    return fmap, nmap
-def write_nmap( fo, nmap ):    
+def write_nmap( fo, nmap ):
    for i in range( len(nmap) ):
        fo.write('%d\t%s\ti\n' % (i, nmap[i]) )
@@ -33,7 +32,7 @@ fo = open( 'featmap.txt', 'w' )
 write_nmap( fo, nmap )
 fo.close()
-fo = open( 'agaricus.txt', 'w' ) 
+fo = open( 'agaricus.txt', 'w' )
 for l in open( 'agaricus-lepiota.data' ):
    arr = l.split(',')
    if arr[0] == 'p':
@@ -47,4 +46,4 @@ for l in open( 'agaricus-lepiota.data' ):
 fo.close()
- 
+
--- a/demo/guide-python/README.md
+++ b/demo/guide-python/README.md
@@ -6,3 +6,6 @@ XGBoost Python Feature Walkthrough
 * [Predicting using first n trees](predict_first_ntree.py)
 * [Generalized Linear Model](generalized_linear_model.py)
 * [Cross validation](cross_validation.py)
 * [Predicting leaf indices](predict_leaf_indices.py)
 * [Sklearn Wrapper](sklearn_examples.py)
 * [External Memory](external_memory.py)
--- a/demo/guide-python/basic_walkthrough.py
+++ b/demo/guide-python/basic_walkthrough.py
@@ -1,10 +1,6 @@
 #!/usr/bin/python
 import sys
 import numpy as np
 import scipy.sparse
 # append the path to xgboost, you may need to change the following line
 # alternatively, you can add the path to PYTHONPATH environment variable
 sys.path.append('../../wrapper')
 import xgboost as xgb
 ### simple example
@@ -33,7 +29,7 @@ bst.dump_model('dump.nice.txt','../data/featmap.txt')
 # save dmatrix into binary buffer
 dtest.save_binary('dtest.buffer')
 bst.save_model('xgb.model')
-# load model and data in 
+# load model and data in
 bst2 = xgb.Booster(model_file='xgb.model')
 dtest2 = xgb.DMatrix('dtest.buffer')
 preds2 = bst2.predict(dtest2)
--- a/demo/guide-python/boost_from_prediction.py
+++ b/demo/guide-python/boost_from_prediction.py
@@ -1,7 +1,5 @@
 #!/usr/bin/python
 import sys
 import numpy as np
 sys.path.append('../../wrapper')
 import xgboost as xgb
 dtrain = xgb.DMatrix('../data/agaricus.txt.train')
--- a/demo/guide-python/cross_validation.py
+++ b/demo/guide-python/cross_validation.py
@@ -1,7 +1,5 @@
 #!/usr/bin/python
 import sys
 import numpy as np
 sys.path.append('../../wrapper')
 import xgboost as xgb
 ### load data in do training
@@ -56,7 +54,7 @@ def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
-param = {'max_depth':2, 'eta':1, 'silent':1} 
+param = {'max_depth':2, 'eta':1, 'silent':1}
 # train with customized objective
 xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
       obj = logregobj, feval=evalerror)
--- a/demo/guide-python/custom_objective.py
+++ b/demo/guide-python/custom_objective.py
@@ -1,11 +1,9 @@
 #!/usr/bin/python
 import sys
 import numpy as np
 sys.path.append('../../wrapper')
 import xgboost as xgb
 ###
 # advanced: cutomsized loss function
-# 
+#
 print ('start running example to used cutomized objective function')
 dtrain = xgb.DMatrix('../data/agaricus.txt.train')
--- a/demo/guide-python/external_memory.py
+++ b/demo/guide-python/external_memory.py
@@ -0,0 +1,25 @@
 #!/usr/bin/python
 import numpy as np
 import scipy.sparse
 import xgboost as xgb
 ### simple example for using the external memory version
 # the only difference from the in-memory examples: append a '#' followed by a
 # cache prefix name to the data path
 # several cache files with that prefix will be generated
 # currently only conversion from a libsvm file is supported
 dtrain = xgb.DMatrix('../data/agaricus.txt.train#dtrain.cache')
 dtest = xgb.DMatrix('../data/agaricus.txt.test#dtest.cache')
 # training parameters passed to xgb.train
 param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
 # performance notice: set nthread to the number of real cpu cores
 # some cpus offer two threads per core, for example, a 4 core cpu with 8 threads; in such a case set nthread=4
 #param['nthread']=num_real_cpu
 # specify the validation sets to watch performance
 watchlist  = [(dtest,'eval'), (dtrain,'train')]
 # number of boosting rounds
 num_round = 2
 bst = xgb.train(param, dtrain, num_round, watchlist)
--- a/demo/guide-python/generalized_linear_model.py
+++ b/demo/guide-python/generalized_linear_model.py
@@ -1,6 +1,4 @@
 #!/usr/bin/python
 import sys
 sys.path.append('../../wrapper')
 import xgboost as xgb
 ##
 #  this script demonstrate how to fit generalized linear model in xgboost
@@ -9,17 +7,17 @@ import xgboost as xgb
 dtrain = xgb.DMatrix('../data/agaricus.txt.train')
 dtest = xgb.DMatrix('../data/agaricus.txt.test')
 # change booster to gblinear, so that we are fitting a linear model
-# alpha is the L1 regularizer 
+# alpha is the L1 regularizer
 # lambda is the L2 regularizer
 # you can also set lambda_bias which is L2 regularizer on the bias term
 param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
         'alpha': 0.0001, 'lambda': 1 }
 # normally, you do not need to set eta (step_size)
-# XGBoost uses a parallel coordinate descent algorithm (shotgun), 
+# XGBoost uses a parallel coordinate descent algorithm (shotgun),
 # there could be affection on convergence with parallelization on certain cases
 # setting eta to be smaller value, e.g 0.5 can make the optimization more stable
-# param['eta'] = 1 
+# param['eta'] = 1
 ##
 # the rest of settings are the same
--- a/demo/guide-python/predict_first_ntree.py
+++ b/demo/guide-python/predict_first_ntree.py
@@ -1,7 +1,5 @@
 #!/usr/bin/python
 import sys
 import numpy as np
 sys.path.append('../../wrapper')
 import xgboost as xgb
 ### load data in do training
--- a/demo/guide-python/predict_leaf_indices.py
+++ b/demo/guide-python/predict_leaf_indices.py
@@ -0,0 +1,20 @@
 #!/usr/bin/python
 # Demo: train a small booster, then call predict() with pred_leaf=True
 # and print the shape and content of the returned leaf-index array.
 import numpy as np
 import xgboost as xgb
 ### load data in do training
 dtrain = xgb.DMatrix('../data/agaricus.txt.train')
 dtest = xgb.DMatrix('../data/agaricus.txt.test')
 param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
 # datasets evaluated during training
 watchlist  = [(dtest,'eval'), (dtrain,'train')]
 num_round = 3
 bst = xgb.train(param, dtrain, num_round, watchlist)
 print ('start testing predict the leaf indices')
 ### predict using first 2 tree
 leafindex = bst.predict(dtest, ntree_limit=2, pred_leaf = True)
 # print is called as a function (single argument) so the script runs
 # unchanged on both Python 2 and Python 3
 print(leafindex.shape)
 print(leafindex)
 ### predict all trees
 leafindex = bst.predict(dtest, pred_leaf = True)
 print(leafindex.shape)
--- a/demo/guide-python/runall.sh
+++ b/demo/guide-python/runall.sh
@@ -4,4 +4,5 @@ python custom_objective.py
 python boost_from_prediction.py
 python generalized_linear_model.py
 python cross_validation.py
-rm -rf *~ *.model *.buffer 
+python predict_leaf_indices.py
 rm -rf *~ *.model *.buffer 
--- a/demo/guide-python/sklearn_examples.py
+++ b/demo/guide-python/sklearn_examples.py
@@ -0,0 +1,67 @@
 #!/usr/bin/python
 '''
 Created on 1 Apr 2015
@author: Jamie Hall
 '''
 import pickle
 import xgboost as xgb
 import numpy as np
 from sklearn.cross_validation import KFold
 from sklearn.metrics import confusion_matrix, mean_squared_error
 from sklearn.grid_search import GridSearchCV
 from sklearn.datasets import load_iris, load_digits, load_boston
 rng = np.random.RandomState(31337)
 print("Zeros and Ones from the Digits dataset: binary classification")
 digits = load_digits(2)
 y = digits['target']
 X = digits['data']
 kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
 for train_index, test_index in kf:
    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(confusion_matrix(actuals, predictions))
 print("Iris: multiclass classification")
 iris = load_iris()
 y = iris['target']
 X = iris['data']
 kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
 for train_index, test_index in kf:
    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(confusion_matrix(actuals, predictions))
 print("Boston Housing: regression")
 boston = load_boston()
 y = boston['target']
 X = boston['data']
 kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
 for train_index, test_index in kf:
    xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(mean_squared_error(actuals, predictions))
 print("Parameter optimization")
 y = boston['target']
 X = boston['data']
 xgb_model = xgb.XGBRegressor()
 clf = GridSearchCV(xgb_model,
                   {'max_depth': [2,4,6],
                    'n_estimators': [50,100,200]}, verbose=1)
 clf.fit(X,y)
 print(clf.best_score_)
 print(clf.best_params_)
 # The sklearn API models are picklable
 print("Pickling sklearn API models")
 # must open in binary format to pickle; the with-blocks make sure each
 # file handle is flushed and closed before the file is used again
 with open("best_boston.pkl", "wb") as fout:
     pickle.dump(clf, fout)
 with open("best_boston.pkl", "rb") as fin:
     clf2 = pickle.load(fin)
 # the reloaded model should give the same predictions as the original
 print(np.allclose(clf.predict(X), clf2.predict(X)))
--- a/demo/guide-python/sklearn_parallel.py
+++ b/demo/guide-python/sklearn_parallel.py
@@ -0,0 +1,35 @@
 import os
 if __name__ == "__main__":
    # NOTE: on posix systems, this *has* to be here and in the
    # `__name__ == "__main__"` clause to run XGBoost in parallel processes
    # using fork, if XGBoost was built with OpenMP support. Otherwise, if you
    # build XGBoost without OpenMP support, you can use fork, which is the
    # default backend for joblib, and omit this.
    try:
        from multiprocessing import set_start_method
    except ImportError:
        raise ImportError("Unable to import multiprocessing.set_start_method."
                          " This example only runs on Python 3.4")
    set_start_method("forkserver")
    import numpy as np
    from sklearn.grid_search import GridSearchCV
    from sklearn.datasets import load_boston
    import xgboost as xgb
    rng = np.random.RandomState(31337)
    print("Parallel Parameter optimization")
    boston = load_boston()
    os.environ["OMP_NUM_THREADS"] = "2"  # or to whatever you want
    y = boston['target']
    X = boston['data']
    xgb_model = xgb.XGBRegressor()
    clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                                   'n_estimators': [50, 100, 200]}, verbose=1,
                       n_jobs=2)
    clf.fit(X, y)
    print(clf.best_score_)
    print(clf.best_params_)
--- a/demo/kaggle-higgs/README.md
+++ b/demo/kaggle-higgs/README.md
@@ -1,3 +1,9 @@
 Highlights
 =====
 The Higgs challenge ended recently, and xgboost was used by many participants. This list highlights the xgboost solutions of players
 * Blogpost by phunther: [Winning solution of Kaggle Higgs competition: what a single model can do](http://no2147483647.wordpress.com/2014/09/17/winning-solution-of-kaggle-higgs-competition-what-a-single-model-can-do/) 
 * The solution by Tianqi Chen and Tong He [Link](https://github.com/hetong007/higgsml)
 Guide for Kaggle Higgs Challenge
 =====
--- a/demo/kaggle-higgs/higgs-cv.py
+++ b/demo/kaggle-higgs/higgs-cv.py
@@ -1,7 +1,5 @@
 #!/usr/bin/python
 import sys
 import numpy as np
 sys.path.append('../../wrapper')
 import xgboost as xgb
 ### load data in do training
--- a/demo/kaggle-higgs/higgs-numpy.py
+++ b/demo/kaggle-higgs/higgs-numpy.py
@@ -1,14 +1,6 @@
 #!/usr/bin/python
-# this is the example script to use xgboost to train 
+# this is the example script to use xgboost to train
 import inspect
 import os
 import sys
 import numpy as np
 # add path of xgboost python module
 code_path = os.path.join(
    os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../wrapper")
 sys.path.append(code_path)
 import xgboost as xgb
@@ -29,7 +21,7 @@ weight = dtrain[:,31] * float(test_size) / len(label)
 sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0  )
 sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0  )
-# print weight statistics 
+# print weight statistics
 print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))
 # construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
@@ -42,13 +34,13 @@ param = {}
 param['objective'] = 'binary:logitraw'
 # scale weight of positive examples
 param['scale_pos_weight'] = sum_wneg/sum_wpos
-param['eta'] = 0.1 
+param['eta'] = 0.1
 param['max_depth'] = 6
 param['eval_metric'] = 'auc'
 param['silent'] = 1
 param['nthread'] = 16
-# you can directly throw param in, though we want to watch multiple metrics here 
+# you can directly throw param in, though we want to watch multiple metrics here
 plst = list(param.items())+[('eval_metric', 'ams@0.15')]
 watchlist = [ (xgmat,'train') ]
--- a/demo/kaggle-higgs/higgs-pred.py
+++ b/demo/kaggle-higgs/higgs-pred.py
@@ -1,9 +1,6 @@
 #!/usr/bin/python
-# make prediction 
+# make prediction
 import sys
 import numpy as np
 # add path of xgboost python module
 sys.path.append('../../wrapper/')
 import xgboost as xgb
 # path to where the data lies
@@ -11,7 +8,7 @@ dpath = 'data'
 modelfile = 'higgs.model'
 outfile = 'higgs.pred.csv'
-# make top 15% as positive 
+# make top 15% as positive
 threshold_ratio = 0.15
 # load in training data, directly use numpy
@@ -24,7 +21,7 @@ xgmat = xgb.DMatrix( data, missing = -999.0 )
 bst = xgb.Booster({'nthread':16}, model_file = modelfile)
 ypred = bst.predict( xgmat )
-res  = [ ( int(idx[i]), ypred[i] ) for i in range(len(ypred)) ] 
+res  = [ ( int(idx[i]), ypred[i] ) for i in range(len(ypred)) ]
 rorder = {}
 for k, v in sorted( res, key = lambda x:-x[1] ):
@@ -36,12 +33,12 @@ fo = open(outfile, 'w')
 nhit = 0
 ntot = 0
 fo.write('EventId,RankOrder,Class\n')
-for k, v in res:        
+for k, v in res:
    if rorder[k] <= ntop:
        lb = 's'
        nhit += 1
    else:
-        lb = 'b'        
+        lb = 'b'
    # change output rank order to follow Kaggle convention
    fo.write('%s,%d,%s\n' % ( k,  len(rorder)+1-rorder[k], lb ) )
    ntot += 1
--- a/demo/kaggle-higgs/speedtest.R
+++ b/demo/kaggle-higgs/speedtest.R
@@ -6,7 +6,7 @@ require(methods)
 testsize <- 550000
 dtrain <- read.csv("data/training.csv", header=TRUE, nrows=350001)
-
+dtrain$Label = as.numeric(dtrain$Label=='s')
 # gbm.time = system.time({
 #   gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120, 
 #                    interaction.depth = 6, shrinkage = 0.1, bag.fraction = 1,
@@ -15,8 +15,8 @@ dtrain <- read.csv("data/training.csv", header=TRUE, nrows=350001)
 # print(gbm.time)
 # Test result: 761.48 secs
-dtrain[33] <- dtrain[33] == "s"
+# dtrain[33] <- dtrain[33] == "s"
-label <- as.numeric(dtrain[[33]])
+# label <- as.numeric(dtrain[[33]])
 data <- as.matrix(dtrain[2:31])
 weight <- as.numeric(dtrain[[32]]) * testsize / length(label)
@@ -51,21 +51,21 @@ for (i in 1:length(threads)){
 xgboost.time
 # [[1]]
 # user  system elapsed 
-# 444.98    1.96  450.22 
+# 99.015   0.051  98.982 
 # 
 # [[2]]
 # user  system elapsed 
-# 188.15    0.82  102.41 
+# 100.268   0.317  55.473 
 # 
 # [[3]]
 # user  system elapsed 
-# 143.29    0.79   44.18 
+# 111.682   0.777  35.963 
 # 
 # [[4]]
 # user  system elapsed 
-# 176.60    1.45   34.04 
+# 149.396   1.851  32.661 
 # 
 # [[5]]
 # user  system elapsed 
-# 180.15    2.85   35.26 
+# 157.390   5.988  40.949 
--- a/demo/kaggle-higgs/speedtest.py
+++ b/demo/kaggle-higgs/speedtest.py
@@ -1,9 +1,6 @@
 #!/usr/bin/python
 # this is the example script to use xgboost to train
 import sys
 import numpy as np
 # add path of xgboost python module
 sys.path.append('../../wrapper/')
 import xgboost as xgb
 from sklearn.ensemble import GradientBoostingClassifier
 import time
--- a/demo/kaggle-otto/README.MD
+++ b/demo/kaggle-otto/README.MD
@@ -0,0 +1,24 @@
 Benchmark for Otto Group Competition
 =========
 This is a folder containing the benchmark for the [Otto Group Competition on Kaggle](http://www.kaggle.com/c/otto-group-product-classification-challenge).
 ## Getting started
 1. Put `train.csv` and `test.csv` under the `data` folder
 2. Run the script
 3. Submit the `submission.csv`
 The parameter `nthread` controls the number of cores to run on, please set it to suit your machine.
 ## R-package
 To install the R-package of xgboost, please run
 ```r
 devtools::install_github('tqchen/xgboost',subdir='R-package')
 ```
 Windows users may need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
--- a/demo/kaggle-otto/otto_train_pred.R
+++ b/demo/kaggle-otto/otto_train_pred.R
@@ -0,0 +1,43 @@
 require(xgboost)
 require(methods)
 train = read.csv('data/train.csv',header=TRUE,stringsAsFactors = F)
 test = read.csv('data/test.csv',header=TRUE,stringsAsFactors = F)
 train = train[,-1]
 test = test[,-1]
 # Take the last column of the training data as the class labels
 y = train[,ncol(train)]
 # Strip the 'Class_' prefix so only the class number remains
 y = gsub('Class_','',y)
 y = as.integer(y)-1 # xgboost takes labels in [0,numOfClass)
 x = rbind(train[,-ncol(train)],test)
 x = as.matrix(x)
 x = matrix(as.numeric(x),nrow(x),ncol(x))
 trind = 1:length(y)
 teind = (nrow(train)+1):nrow(x)
 # Set necessary parameter
 param <- list("objective" = "multi:softprob",
              "eval_metric" = "mlogloss",
              "num_class" = 9,
              "nthread" = 8)
 # Run Cross Validation
 cv.nround = 50
 bst.cv = xgb.cv(param=param, data = x[trind,], label = y, 
                nfold = 3, nrounds=cv.nround)
 # Train the model
 nround = 50
 bst = xgboost(param=param, data = x[trind,], label = y, nrounds=nround)
 # Make prediction
 pred = predict(bst,x[teind,])
 pred = matrix(pred,9,length(pred)/9)
 pred = t(pred)
 # Output submission
 pred = format(pred, digits=2,scientific=F) # shrink the size of submission
 pred = data.frame(1:nrow(pred),pred)
 names(pred) = c('id', paste0('Class_',1:9))
 write.csv(pred,file='submission.csv', quote=FALSE,row.names=FALSE)
--- a/demo/kaggle-otto/understandingXGBoostModel.Rmd
+++ b/demo/kaggle-otto/understandingXGBoostModel.Rmd
@@ -0,0 +1,231 @@
 ---
 title: "Understanding XGBoost Model on Otto Dataset"
 author: "Michaël Benesty"
 output: 
  rmarkdown::html_vignette:
    css: ../../R-package/vignettes/vignette.css
    number_sections: yes
    toc: yes
 ---
 Introduction
 ============
 **XGBoost** is an implementation of the famous gradient boosting algorithm. This model is often described as a *blackbox*, meaning it works well but it is not trivial to understand how. Indeed, the model is made of hundreds (thousands?) of decision trees. You may wonder how a human could possibly get a general view of such a model.
 While XGBoost is known for its fast speed and accurate predictive power, it also comes with various functions to help you understand the model.
 The purpose of this RMarkdown document is to demonstrate how easily we can leverage the functions already implemented in the **XGBoost R** package. Of course, everything shown below can be applied to the dataset you may have to manipulate at work or wherever!
 First we will prepare the **Otto** dataset and train a model, then we will generate two visualizations to get a clue of what is important to the model, and finally we will see how we can leverage this information.
 Preparation of the data
 =======================
 This part is based on the **R** tutorial example by [Tong He](https://github.com/dmlc/xgboost/blob/master/demo/kaggle-otto/otto_train_pred.R)
 First, let's load the packages and the dataset.
 ```{r loading}
 require(xgboost)
 require(methods)
 require(data.table)
 require(magrittr)
 train <- fread('data/train.csv', header = T, stringsAsFactors = F)
 test <- fread('data/test.csv', header=TRUE, stringsAsFactors = F)
 ```
 > `magrittr` and `data.table` are here to make the code cleaner and much more rapid.
 Let's explore the dataset.
 ```{r explore}
 # Train dataset dimensions
 dim(train)
 # Training content
 train[1:6,1:5, with =F]
 # Test dataset dimensions (inspect `test`, not `train`)
 dim(test)
 # Test content
 test[1:6,1:5, with =F]
 ```
 > We only display the 6 first rows and 5 first columns for convenience
 Each *column* represents a feature measured by an `integer`. Each *row* is an **Otto** product.
 Obviously the first column (`ID`) doesn't contain any useful information. 
 To let the algorithm focus on real stuff, we will delete it.
 ```{r clean, results='hide'}
 # Delete ID column in training dataset
 train[, id := NULL]
 # Delete ID column in testing dataset
 test[, id := NULL]
 ```
 According to its description, the **Otto** challenge is a multi class classification challenge. We need to extract the labels (here the name of the different classes) from the dataset. We only have two files (test and training); it seems logical that the training file contains the class we are looking for. Usually the labels are in the first or the last column. We already know what is in the first column, so let's check the content of the last one.
 ```{r searchLabel}
 # Check the content of the last column
 train[1:6, ncol(train), with  = F]
 # Save the name of the last column
 nameLastCol <- names(train)[ncol(train)]
 ```
 The classes are provided as character strings in the `r ncol(train)`th column called `r nameLastCol`. As you may know, **XGBoost** doesn't support anything other than numbers. So we will convert classes to `integer`. Moreover, according to the documentation, it should start at `0`.
 For that purpose, we will:
 * extract the target column
 * remove `Class_` from each class name
 * convert to `integer`
 * subtract `1` from the new value
 ```{r classToIntegers}
 # Convert from classes to numbers
 y <- train[, nameLastCol, with = F][[1]] %>% gsub('Class_','',.) %>% {as.integer(.) -1}
 # Display the first 5 levels
 y[1:5]
 ```
 We remove label column from training dataset, otherwise **XGBoost** would use it to guess the labels!
 ```{r deleteCols, results='hide'}
 train[, nameLastCol:=NULL, with = F]
 ```
 `data.table` is an awesome implementation of data.frame, unfortunately it is not a format supported natively by **XGBoost**. We need to convert both datasets (training and test) in `numeric` Matrix format.
 ```{r convertToNumericMatrix}
 trainMatrix <- train[,lapply(.SD,as.numeric)] %>% as.matrix
 testMatrix <- test[,lapply(.SD,as.numeric)] %>% as.matrix
 ```
 Model training
 ==============
 Before training, we will use cross validation to evaluate our error rate.
 Basically **XGBoost** will divide the training data in `nfold` parts, then **XGBoost** will retain the first part to use it as the test data and perform a training. Then it will reintegrate the first part and retain the second part, do a training and so on...
 You can look at the function documentation for more information.
 ```{r crossValidation}
 numberOfClasses <- max(y) + 1
 param <- list("objective" = "multi:softprob",
              "eval_metric" = "mlogloss",
              "num_class" = numberOfClasses)
 cv.nround <- 5
 cv.nfold <- 3
 bst.cv = xgb.cv(param=param, data = trainMatrix, label = y, 
                nfold = cv.nfold, nrounds = cv.nround)
 ```
 > As we can see the error rate is low on the test dataset (for a 5mn trained model).
 Finally, we are ready to train the real model!!!
 ```{r modelTraining}
 nround = 50
 bst = xgboost(param=param, data = trainMatrix, label = y, nrounds=nround)
 ```
 Model understanding
 ===================
 Feature importance
 ------------------
 So far, we have built a model made of **`r nround`** trees.
 To build a tree, the dataset is divided recursively several times. At the end of the process, you get groups of observations (here, these observations are properties regarding **Otto** products). 
 Each division operation is called a *split*.
 Each group at each division level is called a branch and the deepest level is called a *leaf*.
 In the final model, these *leafs* are supposed to be as pure as possible for each tree, meaning in our case that each *leaf* should be made of one class of **Otto** product only (of course it is not true, but that's what we try to achieve in a minimum of splits).
 **Not all *splits* are equally important**. Basically the first *split* of a tree will have more impact on the purity than, for instance, the deepest *split*. Intuitively, we understand that the first *split* does most of the work, and the following *splits* focus on smaller parts of the dataset which have been misclassified by the first *tree*.
 In the same way, in Boosting we try to optimize the misclassification at each round (it is called the *loss*). So the first *tree* will do the big work and the following trees will focus on the remainder, i.e. the parts not correctly learned by the previous *trees*.
 The improvement brought by each *split* can be measured, it is the *gain*.
 Each *split* is done on one feature only at one value. 
 Let's see what the model looks like.
 ```{r modelDump}
 model <- xgb.dump(bst, with.stats = T)
 model[1:10]
 ```
 > For convenience, we are displaying the first 10 lines of the model only.
 Clearly, it is not easy to understand what it means. 
 Basically each line represents a *branch*, there is the *tree* ID, the feature ID, the point where it *splits*, and information regarding the next *branches* (left, right, when the row for this feature is N/A).
 Fortunately, **XGBoost** offers a better representation: **feature importance**.
 Feature importance is about averaging the *gain* of each feature over all *splits* and all *trees*.
 Then we can use the function `xgb.plot.importance`.
 ```{r importanceFeature, fig.align='center', fig.height=5, fig.width=10}
 # Get the feature real names
 names <- dimnames(trainMatrix)[[2]]
 # Compute feature importance matrix
 importance_matrix <- xgb.importance(names, model = bst)
 # Nice graph
 xgb.plot.importance(importance_matrix[1:10,])
 ```
 > To make it understandable we first extract the column names from the `Matrix`.
 Interpretation
 --------------
 In the feature importance above, we can see the first 10 most important features.
 This function gives a color to each bar. These colors represent groups of features. Basically a K-means clustering is  applied to group each feature by importance.
 From here you can take several actions. For instance you can remove the less important feature (feature selection process), or go deeper in the interaction between the most important features and labels.
 Or you can just reason about why these features are so important (in the **Otto** challenge we can't go this way because there is not enough information).
 Tree graph
 ----------
 Feature importance gives you feature weight information but not interaction between features.
 The **XGBoost R** package has another useful function for that.
 Please, scroll on the right to see the tree.
 ```{r treeGraph, dpi=1500, fig.align='left'}
 xgb.plot.tree(feature_names = names, model = bst, n_first_tree = 2)
 ```
 We are just displaying the first two trees here.
 On simple models the first two trees may be enough. Here, it might not be the case. We can see from the size of the trees that the interaction between features is complicated. 
 Besides, **XGBoost** generates `k` trees at each round for a `k`-classification problem. Therefore the two trees illustrated here are trying to classify data into different classes.
 Going deeper
 ============
 There are 4 documents you may also be interested in:
 * [xgboostPresentation.Rmd](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd): general presentation
 * [discoverYourData.Rmd](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/discoverYourData.Rmd): explaining feature analysis
 * [Feature Importance Analysis with XGBoost in Tax audit](http://fr.slideshare.net/MichaelBENESTY/feature-importance-analysis-with-xgboost-in-tax-audit): use case
 * [The Elements of Statistical Learning](http://statweb.stanford.edu/~tibs/ElemStatLearn/): very good book to have a good understanding of the model
--- a/demo/multiclass_classification/README.md
+++ b/demo/multiclass_classification/README.md
@@ -7,4 +7,4 @@ Make sure you make make xgboost python module in ../../python
 ./runexp.sh
 ```
-Explainations can be found in [wiki](https://github.com/tqchen/xgboost/wiki)
+
--- a/demo/multiclass_classification/train.py
+++ b/demo/multiclass_classification/train.py
@@ -1,7 +1,5 @@
 #! /usr/bin/python
 import sys
 import numpy as np
 sys.path.append('../../wrapper/')
 import xgboost as xgb
 # label need to be 0 to num_class -1
--- a/Show More
+++ b/Show More