diff --git a/.gitignore b/.gitignore
index 276ed2d54..27ff1a764 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,7 +25,6 @@
 *group
 *rar
 *vali
-*data
 *sdf
 Release
 *exe*
@@ -36,7 +35,6 @@ ipch
 *log
 Debug
 *suo
-*test*
 .Rhistory
 *.dll
 *i386
@@ -51,12 +49,9 @@ Debug
 ./xgboost
 ./xgboost.mpi
 ./xgboost.mock
-rabit
 #.Rbuildignore
 R-package.Rproj
 *.cache*
-R-package/inst
-R-package/src
 #java
 java/xgboost4j/target
 java/xgboost4j/tmp
@@ -65,9 +60,13 @@ java/xgboost4j-demo/data/
 java/xgboost4j-demo/tmp/
 java/xgboost4j-demo/model/
 nb-configuration*
-dmlc-core
 # Eclipse
 .project
 .cproject
 .pydevproject
 .settings/
+build
+config.mk
+xgboost
+*.data
+build_plugin
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..b2321b41f
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "dmlc-core"]
+	path = dmlc-core
+	url = https://github.com/dmlc/dmlc-core
+[submodule "rabit"]
+	path = rabit
+	url = https://github.com/dmlc/rabit
diff --git a/.travis.yml b/.travis.yml
index c7049be94..4f09eb083 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,5 @@
-sudo: true
+# disable sudo for container build.
+sudo: false
 
 # Enabling test on Linux and OS X
 os:
@@ -8,51 +9,60 @@ os:
 # Use Build Matrix to do lint and build seperately
 env:
   matrix:
-    - TASK=lint LINT_LANG=cpp
-    - TASK=lint LINT_LANG=python
-    - TASK=R-package CXX=g++
-    - TASK=python-package CXX=g++
-    - TASK=python-package3 CXX=g++
-    - TASK=java-package CXX=g++
-    - TASK=build CXX=g++
-    - TASK=build-with-dmlc CXX=g++
+    # code lint
+    - TASK=lint
+    # r package test
+    - TASK=r_test
+    # python package test
+    - TASK=python_test
+    # java package test
+    - TASK=java_test
 
 os:
   - linux
   - osx
 
+matrix:
+  exclude:
+    - os: osx
+      env: TASK=lint
+    - os: linux
+      env: TASK=r_test
+    - os: osx
+      env: TASK=java_test
+
 # dependent apt packages
 addons:
   apt:
     packages:
       - doxygen
-      - libopenmpi-dev
       - wget
       - libcurl4-openssl-dev
       - unzip
-      - python-numpy
-      - python-scipy
+      - graphviz
 
 before_install:
-  - scripts/travis_osx_install.sh
-  - git clone https://github.com/dmlc/dmlc-core
-  - export TRAVIS=dmlc-core/scripts/travis/
+  - source dmlc-core/scripts/travis/travis_setup_env.sh
   - export PYTHONPATH=${PYTHONPATH}:${PWD}/python-package
-  - source ${TRAVIS}/travis_setup_env.sh
 
 install:
-  - pip install cpplint pylint --user `whoami`
+  - source tests/travis/setup.sh
 
+script:
+  - tests/travis/run_test.sh
 
-script: scripts/travis_script.sh
+cache:
+  directories:
+    - ${HOME}/.cache/usr
+    - ${HOME}/.cache/pip
 
+before_cache:
+  - dmlc-core/scripts/travis/travis_before_cache.sh
 
 after_failure:
-  - scripts/travis_after_failure.sh
-
+  - tests/travis/travis_after_failure.sh
 
 notifications:
   email:
     on_success: change
     on_failure: always
-
diff --git a/Makefile b/Makefile
index 84636bd71..a9ed1f96f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,18 +1,60 @@
-export CC  = $(if $(shell which gcc-5 2>/dev/null),gcc-5,gcc)
-export CXX = $(if $(shell which g++-5 2>/dev/null),g++-5,g++)
-
-export MPICXX = mpicxx
-export LDFLAGS= -pthread -lm
-export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -funroll-loops
-# java include path
-export JAVAINCFLAGS = -I${JAVA_HOME}/include -I./java
-
-ifeq ($(OS), Windows_NT)
-	export CXX = g++ -m64
-	export CC = gcc -m64
+ifndef config
+ifneq ("$(wildcard ./config.mk)","")
+	config = config.mk
+else
+	config = make/config.mk
+endif
 endif
 
-UNAME= $(shell uname)
+ifndef DMLC_CORE
+	DMLC_CORE = dmlc-core
+endif
+
+ifndef RABIT
+	RABIT = rabit
+endif
+
+ROOTDIR = $(CURDIR)
+
+ifeq ($(OS), Windows_NT)
+	UNAME="Windows"
+else
+	UNAME=$(shell uname)
+endif
+
+include $(config)
+ifeq ($(USE_OPENMP), 0)
+	export NO_OPENMP = 1
+endif
+include $(DMLC_CORE)/make/dmlc.mk
+
+# include the plugins
+include $(XGB_PLUGINS)
+
+# prefer gcc-5/g++-5 when installed, but only if neither config.mk, the environment nor the command line chose a compiler
+ifneq ($(filter default undefined,$(origin CC)),)
+export CC  := $(if $(shell which gcc-5 2>/dev/null),gcc-5,gcc)
+endif
+ifneq ($(filter default undefined,$(origin CXX)),)
+export CXX := $(if $(shell which g++-5 2>/dev/null),g++-5,g++)
+endif
+
+export LDFLAGS= -pthread -lm $(ADD_LDFLAGS) $(DMLC_LDFLAGS) $(PLUGIN_LDFLAGS)
+export CFLAGS=  -std=c++0x -Wall -O3 -msse2  -Wno-unknown-pragmas -funroll-loops -Iinclude $(ADD_CFLAGS) $(PLUGIN_CFLAGS)
+CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include
+#java include path
+export JAVAINCFLAGS = -I${JAVA_HOME}/include -I./java
+
+ifndef LINT_LANG
+	LINT_LANG= "all"
+endif
+
+ifneq ($(UNAME), Windows)
+	CFLAGS += -fPIC
+	XGBOOST_DYLIB = lib/libxgboost.so
+else
+	XGBOOST_DYLIB = lib/libxgboost.dll
+endif
 
 ifeq ($(UNAME), Linux)
 	LDFLAGS += -lrt
@@ -23,192 +65,115 @@ ifeq ($(UNAME), Darwin)
 	JAVAINCFLAGS += -I${JAVA_HOME}/include/darwin
 endif
 
-ifeq ($(no_omp),1)
+ifeq ($(USE_OPENMP), 1)
+	CFLAGS += -fopenmp
+else
 	CFLAGS += -DDISABLE_OPENMP
-else
-	#CFLAGS += -fopenmp
-	ifeq ($(omp_mac_static),1)
-		#CFLAGS += -fopenmp -Bstatic
-		CFLAGS += -static-libgcc -static-libstdc++ -L. -fopenmp
-		#LDFLAGS += -Wl,--whole-archive -lpthread -Wl --no-whole-archive
-	else
-		CFLAGS += -fopenmp
-	endif
 endif
 
 
-# by default use c++11
-ifeq ($(cxx11),1)
-	CFLAGS += -std=c++11
-endif
-
-# handling dmlc
-ifdef dmlc
-	ifndef config
-		ifneq ("$(wildcard $(dmlc)/config.mk)","")
-			config = $(dmlc)/config.mk
-		else
-			config = $(dmlc)/make/config.mk
-		endif
-	endif
-	include $(config)
-	include $(dmlc)/make/dmlc.mk
-	LDFLAGS+= $(DMLC_LDFLAGS)
-	LIBDMLC=$(dmlc)/libdmlc.a
-else
-	LIBDMLC=dmlc_simple.o
-endif
-
-ifndef WITH_FPIC
-	WITH_FPIC = 1
-endif
-ifeq ($(WITH_FPIC), 1)
-	CFLAGS += -fPIC
-endif
-
-
-ifeq ($(OS), Windows_NT)
-	LIBRABIT = subtree/rabit/lib/librabit_empty.a
-	SLIB = wrapper/xgboost_wrapper.dll
-else
-	LIBRABIT = subtree/rabit/lib/librabit.a
-	SLIB = wrapper/libxgboostwrapper.so
-endif
-
-# java lib
-JLIB = java/libxgboost4j.so
-
 # specify tensor path
-BIN = xgboost
-MOCKBIN = xgboost.mock
-OBJ = updater.o gbm.o io.o main.o dmlc_simple.o
-MPIBIN =
-ifeq ($(WITH_FPIC), 1)
-	TARGET = $(BIN) $(OBJ) $(SLIB)
-else
-	TARGET = $(BIN)
-endif
+.PHONY: clean all lint clean_all doxygen rcpplint Rpack Rbuild Rcheck java
 
-ifndef LINT_LANG
-	LINT_LANG= "all"
-endif
 
-.PHONY: clean all mpi python Rpack lint
+all: lib/libxgboost.a $(XGBOOST_DYLIB) xgboost
 
-all: $(TARGET)
-mpi: $(MPIBIN)
+$(DMLC_CORE)/libdmlc.a:
+	+ cd $(DMLC_CORE) && $(MAKE) libdmlc.a config=$(ROOTDIR)/$(config)
 
-python: wrapper/libxgboostwrapper.so
-# now the wrapper takes in two files. io and wrapper part
-updater.o: src/tree/updater.cpp  src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
-dmlc_simple.o: src/io/dmlc_simple.cpp src/utils/*.h
-gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
-io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
-main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h
-xgboost:  updater.o gbm.o io.o main.o $(LIBRABIT) $(LIBDMLC)
-wrapper/xgboost_wrapper.dll wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h  updater.o gbm.o io.o $(LIBRABIT) $(LIBDMLC)
+$(RABIT)/lib/$(LIB_RABIT):
+	+ cd $(RABIT) && $(MAKE) lib/$(LIB_RABIT)
 
 java: java/libxgboost4j.so
-java/libxgboost4j.so: java/xgboost4j_wrapper.cpp wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h  updater.o gbm.o io.o $(LIBRABIT) $(LIBDMLC)
 
-# dependency on rabit
-subtree/rabit/lib/librabit.a: subtree/rabit/src/engine.cc
-	+	cd subtree/rabit;make lib/librabit.a; cd ../..
-subtree/rabit/lib/librabit_empty.a: subtree/rabit/src/engine_empty.cc
-	+	cd subtree/rabit;make lib/librabit_empty.a; cd ../..
-subtree/rabit/lib/librabit_mock.a: subtree/rabit/src/engine_mock.cc
-	+	cd subtree/rabit;make lib/librabit_mock.a; cd ../..
-subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc
-	+	cd subtree/rabit;make lib/librabit_mpi.a; cd ../..
+SRC = $(wildcard src/*.cc src/*/*.cc)
+ALL_OBJ = $(patsubst src/%.cc, build/%.o, $(SRC)) $(PLUGIN_OBJS)
+AMALGA_OBJ = amalgamation/xgboost-all0.o
+LIB_DEP = $(DMLC_CORE)/libdmlc.a $(RABIT)/lib/$(LIB_RABIT)
+ALL_DEP = $(filter-out build/cli_main.o, $(ALL_OBJ)) $(LIB_DEP)
+CLI_OBJ = build/cli_main.o
 
-$(BIN) :
-	$(CXX) $(CFLAGS) -fPIC -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
+build/%.o: src/%.cc
+	@mkdir -p $(@D)
+	$(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
+	$(CXX) -c $(CFLAGS) -c $< -o $@
 
-$(MOCKBIN) :
-	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
+build_plugin/%.o: plugin/%.cc
+	@mkdir -p $(@D)
+	$(CXX) $(CFLAGS) -MM -MT build_plugin/$*.o $< >build_plugin/$*.d
+	$(CXX) -c $(CFLAGS) -c $< -o $@
 
-$(SLIB) :
-	$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) $(DLLFLAGS)
+# This should be equivalent to $(ALL_OBJ) except for build/cli_main.o
+amalgamation/xgboost-all0.o: amalgamation/xgboost-all0.cc
+	$(CXX) -c $(CFLAGS) -c $< -o $@
 
-$(JLIB) :
-	$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS)  $(JAVAINCFLAGS)
+# Shared library built from the amalgamation; should be equivalent to lib/libxgboost.so
+lib/libxgboost_all.so: $(AMALGA_OBJ) $(LIB_DEP)
+	@mkdir -p $(@D)
+	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
 
-$(OBJ) :
-	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
+lib/libxgboost.a: $(ALL_DEP)
+	@mkdir -p $(@D)
+	ar crv $@ $(filter %.o, $?)
 
-$(MPIOBJ) :
-	$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+lib/libxgboost.dll lib/libxgboost.so: $(ALL_DEP)
+	@mkdir -p $(@D)
+	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
 
-$(MPIBIN) :
-	$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
+java/libxgboost4j.so: java/xgboost4j_wrapper.cpp $(ALL_DEP)
+	$(CXX) $(CFLAGS) $(JAVAINCFLAGS) -shared -o $@ $(filter %.cpp %.o %.a, $^) $(LDFLAGS)
 
-install:
-	cp -f -r $(BIN)  $(INSTALL_PATH)
+xgboost: $(CLI_OBJ) $(ALL_DEP)
+	$(CXX) $(CFLAGS) -o $@  $(filter %.o %.a, $^)  $(LDFLAGS)
 
+rcpplint:
+	python2 $(DMLC_CORE)/scripts/lint.py xgboost ${LINT_LANG} R-package/src
+
+lint: rcpplint
+	python2 $(DMLC_CORE)/scripts/lint.py xgboost ${LINT_LANG} include src plugin
+
+clean:
+	$(RM) -rf build build_plugin lib bin *~ */*~ */*/*~ */*/*/*~ */*.o */*/*.o */*/*/*.o xgboost
+
+clean_all: clean
+	cd $(DMLC_CORE) && $(MAKE) clean
+	cd $(RABIT) && $(MAKE) clean
+
+doxygen:
+	doxygen doc/Doxyfile
+
+# Script to make a clean installable R package.
 Rpack:
-	make clean
-	cd subtree/rabit;make clean;cd ..
+	make clean_all
 	rm -rf xgboost xgboost*.tar.gz
 	cp -r R-package xgboost
 	rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
 	rm -rf xgboost/src/*/*.o
-	rm -rf subtree/rabit/src/*.o
 	rm -rf xgboost/demo/*.model xgboost/demo/*.buffer xgboost/demo/*.txt
 	rm -rf xgboost/demo/runall.R
 	cp -r src xgboost/src/src
-	mkdir xgboost/src/subtree
-	mkdir xgboost/src/subtree/rabit
-	cp -r subtree/rabit/include xgboost/src/subtree/rabit/include
-	cp -r subtree/rabit/src xgboost/src/subtree/rabit/src
-	rm -rf xgboost/src/subtree/rabit/src/*.o
-	mkdir xgboost/src/wrapper
-	cp  wrapper/xgboost_wrapper.h xgboost/src/wrapper
-	cp  wrapper/xgboost_wrapper.cpp xgboost/src/wrapper
+	cp -r include xgboost/src/include
+	cp -r amalgamation xgboost/src/amalgamation
+	mkdir -p xgboost/src/rabit
+	cp -r rabit/include xgboost/src/rabit/include
+	cp -r rabit/src xgboost/src/rabit/src
+	rm -rf xgboost/src/rabit/src/*.o
+	mkdir -p xgboost/src/dmlc-core
+	cp -r dmlc-core/include xgboost/src/dmlc-core/include
+	cp -r dmlc-core/src xgboost/src/dmlc-core/src
 	cp ./LICENSE xgboost
-	cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars
+	cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' | sed '3s/.*/ENABLE_STD_THREAD=0/' > xgboost/src/Makevars
 	cp xgboost/src/Makevars xgboost/src/Makevars.win
-	# R CMD build --no-build-vignettes xgboost
-	# R CMD build xgboost
-	# rm -rf xgboost
-	# R CMD check --as-cran xgboost*.tar.gz
 
 Rbuild:
 	make Rpack
-	R CMD build xgboost
+	R CMD build --no-build-vignettes xgboost
 	rm -rf xgboost
 
 Rcheck:
 	make Rbuild
-	R CMD check --as-cran xgboost*.tar.gz
+	R CMD check  xgboost*.tar.gz
 
-pythonpack:
-	#for pip maintainer only
-	cd subtree/rabit;make clean;cd ..
-	rm -rf xgboost-deploy xgboost*.tar.gz
-	cp -r python-package xgboost-deploy
-	#cp *.md xgboost-deploy/
-	cp LICENSE xgboost-deploy/
-	cp Makefile xgboost-deploy/xgboost
-	cp -r wrapper xgboost-deploy/xgboost
-	cp -r subtree xgboost-deploy/xgboost
-	cp -r multi-node xgboost-deploy/xgboost
-	cp -r windows xgboost-deploy/xgboost
-	cp -r src xgboost-deploy/xgboost
-	cp python-package/setup_pip.py xgboost-deploy/setup.py
-	#make python
-
-pythonbuild:
-	make pythonpack
-	python setup.py install
-
-pythoncheck:
-	make pythonbuild
-	python -c 'import xgboost;print xgboost.core.find_lib_path()'
-
-# lint requires dmlc to be in current folder
-lint:
-	dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package python-package
-
-clean:
-	$(RM) -rf $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o  */*.o */*/*.o *~ */*~ */*/*~
-	cd subtree/rabit; make clean; cd ..
+-include build/*.d
+-include build/*/*.d
+-include build_plugin/*/*.d
diff --git a/CHANGES.md b/NEWS.md
similarity index 55%
rename from CHANGES.md
rename to NEWS.md
index 1a10f04e7..e9c89da00 100644
--- a/CHANGES.md
+++ b/NEWS.md
@@ -1,42 +1,30 @@
-Change Log
-==========
+XGBoost Change Log
+==================
 
-xgboost-0.1
------------
-* Initial release
+This file records the changes in xgboost library in reverse chronological order.
 
-xgboost-0.2x
-------------
-* Python module
-* Weighted samples instances
-* Initial version of pairwise rank
+## brick: next release candidate
+* Major refactor of core library.
+  - Goal: more flexible and modular code as a portable library.
+  - Switch to use of c++11 standard code.
+  - Random number generator defaults to ```std::mt19937```.
+  - Share the data loading pipeline and logging module from dmlc-core.
+  - Enable registry pattern to allow optional plugins for objectives, metrics, tree constructors and data loaders.
+    - Future plugin modules can be put into xgboost/plugin and register back to the library.
+  - Remove most of the raw pointers to smart ptrs, for RAII safety.
+* Change library name to libxgboost.so
+* Backward compatibility
+  - The binary buffer file is not backward compatible with previous version.
+  - The model file is backward compatible on 64 bit platforms.
+* The model file is compatible between 64/32 bit platforms(not yet tested).
+* External memory version and other advanced features will be exposed to R library as well on linux.
+  - Previously some of the features were blocked due to C++11 and threading limits.
+  - The Windows version is still blocked because Rtools does not support ```std::thread```.
+* rabit and dmlc-core are maintained through git submodule
+  - Anyone can open PR to update these dependencies now.
 
-xgboost-0.3
------------
-* Faster tree construction module
-  - Allows subsample columns during tree construction via ```bst:col_samplebytree=ratio```
-* Support for boosting from initial predictions
-* Experimental version of LambdaRank
-* Linear booster is now parallelized, using parallel coordinated descent.
-* Add [Code Guide](src/README.md) for customizing objective function and evaluation
-* Add R module
+## v0.47 (2016.01.14)
 
-xgboost-0.4
------------
-* Distributed version of xgboost that runs on YARN, scales to billions of examples
-* Direct save/load data and model from/to S3 and HDFS
-* Feature importance visualization in R module, by Michael Benesty
-* Predict leaf index
-* Poisson regression for counts data
-* Early stopping option in training
-* Native save load support in R and python
-  - xgboost models now can be saved using save/load in R
-  - xgboost python model is now pickable
-* sklearn wrapper is supported in python module
-* Experimental External memory version
-
-xgboost-0.47
-------------
 * Changes in R library
   - fixed possible problem of poisson regression.
   - switched from 0 to NA for missing values.
@@ -52,10 +40,44 @@ xgboost-0.47
   - improved compatibility in sklearn module.
   - additional parameters added for sklearn wrapper.
   - added pip installation functionality.
-  - supports more Pandas DataFrame dtypes. 
+  - supports more Pandas DataFrame dtypes.
   - added best_ntree_limit attribute, in addition to best_score and best_iteration.
 * Java api is ready for use
 * Added more test cases and continuous integration to make each build more robust.
 
-on going at master
-------------------
+## v0.4 (2015.05.11)
+
+* Distributed version of xgboost that runs on YARN, scales to billions of examples
+* Direct save/load data and model from/to S3 and HDFS
+* Feature importance visualization in R module, by Michael Benesty
+* Predict leaf index
+* Poisson regression for counts data
+* Early stopping option in training
+* Native save load support in R and python
+  - xgboost models now can be saved using save/load in R
+  - xgboost python model is now pickable
+* sklearn wrapper is supported in python module
+* Experimental External memory version
+
+
+## v0.3 (2014.09.07)
+
+* Faster tree construction module
+  - Allows subsample columns during tree construction via ```bst:col_samplebytree=ratio```
+* Support for boosting from initial predictions
+* Experimental version of LambdaRank
+* Linear booster is now parallelized, using parallel coordinated descent.
+* Add [Code Guide](src/README.md) for customizing objective function and evaluation
+* Add R module
+
+
+## v0.2x (2014.05.20)
+
+* Python module
+* Weighted samples instances
+* Initial version of pairwise rank
+
+
+## v0.1 (2014.03.26)
+
+* Initial release
\ No newline at end of file
diff --git a/R-package/README.md b/R-package/README.md
index c92bc9b96..e7d45426f 100644
--- a/R-package/README.md
+++ b/R-package/README.md
@@ -3,6 +3,12 @@ R package for xgboost
 
 [![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost)
 [![CRAN Downloads](http://cranlogs.r-pkg.org/badges/xgboost)](http://cran.rstudio.com/web/packages/xgboost/index.html)
+[![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](http://xgboost.readthedocs.org/en/latest/R-package/index.html)
+
+Resources
+---------
+* [XGBoost R Package Online Documentation](http://xgboost.readthedocs.org/en/latest/R-package/index.html)
+  - Check this out for detailed documents, examples and tutorials.
 
 Installation
 ------------
@@ -16,7 +22,7 @@ install.packages('xgboost')
 For up-to-date version, please install from github. Windows user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
 
 ```r
-devtools::install_github('dmlc/xgboost',subdir='R-package')
+devtools::install_git('git://github.com/dmlc/xgboost',subdir='R-package')
 ```
 
 Examples
@@ -24,21 +30,3 @@ Examples
 
 * Please visit [walk through example](demo).
 * See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.R) on this dataset and the one related to [Otto challenge](../demo/kaggle-otto), including a [RMarkdown documentation](../demo/kaggle-otto/understandingXGBoostModel.Rmd).
-
-Notes
------
-
-If you face an issue installing the package using  ```devtools::install_github```, something like this (even after updating libxml and RCurl as lot of forums say) -
-
-```
-devtools::install_github('dmlc/xgboost',subdir='R-package')
-Downloading github repo dmlc/xgboost@master
-Error in function (type, msg, asError = TRUE)  :
-  Peer certificate cannot be authenticated with given CA certificates
-```
-To get around this you can build the package locally as mentioned [here](https://github.com/dmlc/xgboost/issues/347) -
-```
-1. Clone the current repository and set your workspace to xgboost/R-package/
-2. Run R CMD INSTALL --build . in terminal to get the tarball.
-3. Run install.packages('path_to_the_tarball',repo=NULL) in R to install.
-```
diff --git a/R-package/src/Makevars b/R-package/src/Makevars
index d0eb23b25..14472acc8 100644
--- a/R-package/src/Makevars
+++ b/R-package/src/Makevars
@@ -1,8 +1,17 @@
 # package root
 PKGROOT=../../
+ENABLE_STD_THREAD=1
 # _*_ mode: Makefile; _*_
-PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT)
+
+CXX_STD = CXX11
+
+XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
+           -DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
+           -DDMLC_LOG_CUSTOMIZE=1 -DXGBOOST_CUSTOMIZE_LOGGER=1\
+           -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_
+
+PKG_CPPFLAGS=  -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS)
 PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
 PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
-OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o
-
+OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o\
+         $(PKGROOT)/amalgamation/xgboost-all0.o $(PKGROOT)/amalgamation/dmlc-minimum0.o $(PKGROOT)/rabit/src/engine_empty.o
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index 56b550e7f..4134487fa 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -1,5 +1,6 @@
 # package root
 PKGROOT=./
+ENABLE_STD_THREAD=0
 # _*_ mode: Makefile; _*_
 
 # This file is only used for windows compilation from github
@@ -9,11 +10,22 @@ all: $(SHLIB)
 $(SHLIB): xgblib
 xgblib:
 	cp -r ../../src .
-	cp -r ../../wrapper .
-	cp -r ../../subtree .
+	cp -r ../../rabit .
+	cp -r ../../dmlc-core .
+	cp -r ../../include .
+	cp -r ../../amalgamation .
 
-PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT) -I../..
+CXX_STD = CXX11
+
+XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
+           -DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
+           -DDMLC_LOG_CUSTOMIZE=1 -DXGBOOST_CUSTOMIZE_LOGGER=1\
+           -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_
+
+PKG_CPPFLAGS=  -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS)
 PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
 PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
-OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o
+OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o\
+         $(PKGROOT)/amalgamation/xgboost-all0.o $(PKGROOT)/amalgamation/dmlc-minimum0.o $(PKGROOT)/rabit/src/engine_empty.o
+
 $(OBJECTS) : xgblib
diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc
new file mode 100644
index 000000000..665fb5faa
--- /dev/null
+++ b/R-package/src/xgboost_R.cc
@@ -0,0 +1,354 @@
+// Copyright (c) 2014 by Contributors
+#include <dmlc/logging.h>
+#include <dmlc/omp.h>
+#include <xgboost/c_api.h>
+#include <vector>
+#include <string>
+#include <utility>
+#include <cstring>
+#include <cstdio>
+#include <sstream>
+#include "./xgboost_R.h"
+
+/*!
+ * \brief macro to annotate begin of api
+ */
+#define R_API_BEGIN()                           \
+  GetRNGstate();                                \
+  try {
+/*!
+ * \brief macro to annotate end of api; the error text is passed as "%s" data, not as a format
+ */
+#define R_API_END()                             \
+  } catch(dmlc::Error& e) {                     \
+    PutRNGstate();                              \
+    error("%s", e.what());                      \
+  }                                             \
+  PutRNGstate();
+
+/*!
+ * \brief macro to check the call; on non-zero return raises an R error with XGBGetLastError() as "%s" data.
+ */
+#define CHECK_CALL(x)                           \
+  if ((x) != 0) {                               \
+    error("%s", XGBGetLastError());             \
+  }
+
+
+using namespace dmlc;
+
+SEXP XGCheckNullPtr_R(SEXP handle) {
+  return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
+}
+
+void _DMatrixFinalizer(SEXP ext) {
+  if (R_ExternalPtrAddr(ext) == NULL) return;  // nothing to free; bail out before GetRNGstate()
+  R_API_BEGIN();
+  CHECK_CALL(XGDMatrixFree(R_ExternalPtrAddr(ext)));
+  R_ClearExternalPtr(ext);
+  R_API_END();
+}
+
+SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
+  SEXP ret;
+  R_API_BEGIN();
+  DMatrixHandle handle;
+  CHECK_CALL(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle));
+  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
+  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
+  UNPROTECT(1);
+  R_API_END();
+  return ret;
+}
+
+SEXP XGDMatrixCreateFromMat_R(SEXP mat,
+                              SEXP missing) {
+  SEXP ret;
+  R_API_BEGIN();
+  SEXP dim = getAttrib(mat, R_DimSymbol);
+  size_t nrow = static_cast<size_t>(INTEGER(dim)[0]);
+  size_t ncol = static_cast<size_t>(INTEGER(dim)[1]);
+  double *din = REAL(mat);
+  std::vector<float> data(nrow * ncol);
+  #pragma omp parallel for schedule(static)
+  for (omp_ulong i = 0; i < nrow; ++i) {
+    for (size_t j = 0; j < ncol; ++j) {
+      data[i * ncol +j] = din[i + nrow * j];
+    }
+  }
+  DMatrixHandle handle;
+  CHECK_CALL(XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing), &handle));
+  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
+  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
+  UNPROTECT(1);
+  R_API_END();
+  return ret;
+}
+
+SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
+                              SEXP indices,
+                              SEXP data) {
+  SEXP ret;
+  R_API_BEGIN();
+  const int *p_indptr = INTEGER(indptr);
+  const int *p_indices = INTEGER(indices);
+  const double *p_data = REAL(data);
+  int nindptr = length(indptr);
+  int ndata = length(data);
+  std::vector<bst_ulong> col_ptr_(nindptr);
+  std::vector<unsigned> indices_(ndata);
+  std::vector<float> data_(ndata);
+
+  for (int i = 0; i < nindptr; ++i) {
+    col_ptr_[i] = static_cast<bst_ulong>(p_indptr[i]);
+  }
+  #pragma omp parallel for schedule(static)
+  for (int i = 0; i < ndata; ++i) {
+    indices_[i] = static_cast<unsigned>(p_indices[i]);
+    data_[i] = static_cast<float>(p_data[i]);
+  }
+  DMatrixHandle handle;
+  CHECK_CALL(XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
+                                    BeginPtr(data_), nindptr, ndata,
+                                    &handle));
+  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
+  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
+  UNPROTECT(1);
+  R_API_END();
+  return ret;
+}
+
+SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
+  SEXP ret;
+  R_API_BEGIN();
+  int len = length(idxset);
+  std::vector<int> idxvec(len);
+  for (int i = 0; i < len; ++i) {
+    idxvec[i] = INTEGER(idxset)[i] - 1;
+  }
+  DMatrixHandle res;
+  CHECK_CALL(XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle),
+                                   BeginPtr(idxvec), len,
+                                   &res));
+  ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue));
+  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
+  UNPROTECT(1);
+  R_API_END();
+  return ret;
+}
+
+void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
+  R_API_BEGIN();
+  CHECK_CALL(XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
+                                 CHAR(asChar(fname)),
+                                 asInteger(silent)));
+  R_API_END();
+}
+
+void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
+  R_API_BEGIN();
+  int len = length(array);
+  const char *name = CHAR(asChar(field));
+  if (!strcmp("group", name)) {
+    std::vector<unsigned> vec(len);
+    #pragma omp parallel for schedule(static)
+    for (int i = 0; i < len; ++i) {
+      vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
+    }
+    CHECK_CALL(XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len));
+  } else {
+    std::vector<float> vec(len);
+    #pragma omp parallel for schedule(static)
+    for (int i = 0; i < len; ++i) {
+      vec[i] = REAL(array)[i];
+    }
+    CHECK_CALL(XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
+                                   CHAR(asChar(field)),
+                                   BeginPtr(vec), len));
+  }
+  R_API_END();
+}
+
+SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
+  SEXP ret;
+  R_API_BEGIN();
+  bst_ulong olen;
+  const float *res;
+  CHECK_CALL(XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
+                                   CHAR(asChar(field)),
+                                 &olen,
+                                 &res));
+  ret = PROTECT(allocVector(REALSXP, olen));
+  for (size_t i = 0; i < olen; ++i) {
+    REAL(ret)[i] = res[i];
+  }
+  UNPROTECT(1);
+  R_API_END();
+  return ret;
+}
+
+SEXP XGDMatrixNumRow_R(SEXP handle) {
+  bst_ulong nrow;
+  R_API_BEGIN();
+  CHECK_CALL(XGDMatrixNumRow(R_ExternalPtrAddr(handle), &nrow));
+  R_API_END();
+  return ScalarInteger(static_cast<int>(nrow));
+}
+
+// functions related to booster
+void _BoosterFinalizer(SEXP ext) {
+  if (R_ExternalPtrAddr(ext) == NULL) return;
+  CHECK_CALL(XGBoosterFree(R_ExternalPtrAddr(ext)));
+  R_ClearExternalPtr(ext);
+}
+
+SEXP XGBoosterCreate_R(SEXP dmats) {
+  SEXP ret;
+  R_API_BEGIN();
+  int len = length(dmats);
+  std::vector<void*> dvec;
+  for (int i = 0; i < len; ++i) {
+    dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
+  }
+  BoosterHandle handle;
+  CHECK_CALL(XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle));
+  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
+  R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
+  UNPROTECT(1);
+  R_API_END();
+  return ret;
+}
+
+void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
+  R_API_BEGIN();
+  CHECK_CALL(XGBoosterSetParam(R_ExternalPtrAddr(handle),
+                             CHAR(asChar(name)),
+                             CHAR(asChar(val))));
+  R_API_END();
+}
+
+void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
+  R_API_BEGIN();
+  CHECK_CALL(XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
+                                  asInteger(iter),
+                                  R_ExternalPtrAddr(dtrain)));
+  R_API_END();
+}
+
+void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
+  R_API_BEGIN();
+  CHECK_EQ(length(grad), length(hess))
+      << "gradient and hess must have same length";
+  int len = length(grad);
+  std::vector<float> tgrad(len), thess(len);
+  #pragma omp parallel for schedule(static)
+  for (int j = 0; j < len; ++j) {
+    tgrad[j] = REAL(grad)[j];
+    thess[j] = REAL(hess)[j];
+  }
+  CHECK_CALL(XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
+                                 R_ExternalPtrAddr(dtrain),
+                                 BeginPtr(tgrad), BeginPtr(thess),
+                                 len));
+  R_API_END();
+}
+
+SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
+  const char *ret;
+  R_API_BEGIN();
+  CHECK_EQ(length(dmats), length(evnames))
+      << "dmats and evnams must have same length";
+  int len = length(dmats);
+  std::vector<void*> vec_dmats;
+  std::vector<std::string> vec_names;
+  std::vector<const char*> vec_sptr;
+  for (int i = 0; i < len; ++i) {
+    vec_dmats.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
+    vec_names.push_back(std::string(CHAR(asChar(VECTOR_ELT(evnames, i)))));
+  }
+  for (int i = 0; i < len; ++i) {
+    vec_sptr.push_back(vec_names[i].c_str());
+  }
+  CHECK_CALL(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
+                                asInteger(iter),
+                                BeginPtr(vec_dmats),
+                                BeginPtr(vec_sptr),
+                                len, &ret));
+  R_API_END();
+  return mkString(ret);
+}
+
+SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) {
+  SEXP ret;
+  R_API_BEGIN();
+  bst_ulong olen;
+  const float *res;
+  CHECK_CALL(XGBoosterPredict(R_ExternalPtrAddr(handle),
+                            R_ExternalPtrAddr(dmat),
+                            asInteger(option_mask),
+                            asInteger(ntree_limit),
+                            &olen, &res));
+  ret = PROTECT(allocVector(REALSXP, olen));
+  for (size_t i = 0; i < olen; ++i) {
+    REAL(ret)[i] = res[i];
+  }
+  UNPROTECT(1);
+  R_API_END();
+  return ret;
+}
+
+void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
+  R_API_BEGIN();
+  CHECK_CALL(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));
+  R_API_END();
+}
+
+void XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
+  R_API_BEGIN();
+  CHECK_CALL(XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));
+  R_API_END();
+}
+
+void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) {
+  R_API_BEGIN();
+  CHECK_CALL(XGBoosterLoadModelFromBuffer(R_ExternalPtrAddr(handle),
+                                          RAW(raw),
+                                          length(raw)));
+  R_API_END();
+}
+
+SEXP XGBoosterModelToRaw_R(SEXP handle) {
+  SEXP ret;
+  R_API_BEGIN();
+  bst_ulong olen;
+  const char *raw;
+  CHECK_CALL(XGBoosterGetModelRaw(R_ExternalPtrAddr(handle), &olen, &raw));
+  ret = PROTECT(allocVector(RAWSXP, olen));
+  if (olen != 0) {
+    memcpy(RAW(ret), raw, olen);
+  }
+  UNPROTECT(1);
+  R_API_END();
+  return ret;
+}
+
+SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
+  SEXP out;
+  R_API_BEGIN();
+  bst_ulong olen;
+  const char **res;
+  CHECK_CALL(XGBoosterDumpModel(R_ExternalPtrAddr(handle),
+                                CHAR(asChar(fmap)),
+                                asInteger(with_stats),
+                                &olen, &res));
+  out = PROTECT(allocVector(STRSXP, olen));
+  for (size_t i = 0; i < olen; ++i) {
+    std::stringstream stream;
+    stream <<  "booster[" << i <<"]\n" << res[i];
+    SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
+  }
+  UNPROTECT(1);
+  R_API_END();
+  return out;
+}
+
diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp
deleted file mode 100644
index 1d426c496..000000000
--- a/R-package/src/xgboost_R.cpp
+++ /dev/null
@@ -1,344 +0,0 @@
-// Copyright (c) 2014 by Contributors
-#include <vector>
-#include <string>
-#include <utility>
-#include <cstring>
-#include <cstdio>
-#include <sstream>
-#include "wrapper/xgboost_wrapper.h"
-#include "src/utils/utils.h"
-#include "src/utils/omp.h"
-#include "xgboost_R.h"
-
-using namespace std;
-using namespace xgboost;
-
-extern "C" {
-  void XGBoostAssert_R(int exp, const char *fmt, ...);
-  void XGBoostCheck_R(int exp, const char *fmt, ...);
-  int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...);
-}
-
-// implements error handling
-namespace xgboost {
-namespace utils {
-extern "C" {
-  void (*Printf)(const char *fmt, ...) = Rprintf;
-  int (*SPrintf)(char *buf, size_t size, const char *fmt, ...) = XGBoostSPrintf_R;
-  void (*Assert)(int exp, const char *fmt, ...) = XGBoostAssert_R;
-  void (*Check)(int exp, const char *fmt, ...) = XGBoostCheck_R;
-  void (*Error)(const char *fmt, ...) = error;
-}
-bool CheckNAN(double v) {
-  return ISNAN(v);
-}
-double LogGamma(double v) {
-  return lgammafn(v);
-}
-}  // namespace utils
-
-namespace random {
-void Seed(unsigned seed) {
-  //  warning("parameter seed is ignored, please set random seed using set.seed");
-}
-double Uniform(void) {
-  return unif_rand();
-}
-double Normal(void) {
-  return norm_rand();
-}
-}  // namespace random
-}  // namespace xgboost
-
-// call before wrapper starts
-inline void _WrapperBegin(void) {
-  GetRNGstate();
-}
-// call after wrapper starts
-inline void _WrapperEnd(void) {
-  PutRNGstate();
-}
-
-// do nothing, check error
-inline void CheckErr(int ret) {
-}
-
-extern "C" {
-  SEXP XGCheckNullPtr_R(SEXP handle) {
-    return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
-  }
-  void _DMatrixFinalizer(SEXP ext) {
-    if (R_ExternalPtrAddr(ext) == NULL) return;
-    XGDMatrixFree(R_ExternalPtrAddr(ext));
-    R_ClearExternalPtr(ext);
-  }
-  SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
-    _WrapperBegin();
-    DMatrixHandle handle;
-    CheckErr(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle));
-    _WrapperEnd();
-    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
-    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
-    UNPROTECT(1);
-    return ret;
-  }
-  SEXP XGDMatrixCreateFromMat_R(SEXP mat,
-                                SEXP missing) {
-    _WrapperBegin();
-    SEXP dim = getAttrib(mat, R_DimSymbol);
-    size_t nrow = static_cast<size_t>(INTEGER(dim)[0]);
-    size_t ncol = static_cast<size_t>(INTEGER(dim)[1]);
-    double *din = REAL(mat);
-    std::vector<float> data(nrow * ncol);
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < nrow; ++i) {
-      for (size_t j = 0; j < ncol; ++j) {
-        data[i * ncol +j] = din[i + nrow * j];
-      }
-    }
-    DMatrixHandle handle;
-    CheckErr(XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing), &handle));
-    _WrapperEnd();
-    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
-    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
-    UNPROTECT(1);
-    return ret;
-  }
-  SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
-                                SEXP indices,
-                                SEXP data) {
-    _WrapperBegin();
-    const int *p_indptr = INTEGER(indptr);
-    const int *p_indices = INTEGER(indices);
-    const double *p_data = REAL(data);
-    int nindptr = length(indptr);
-    int ndata = length(data);
-    std::vector<bst_ulong> col_ptr_(nindptr);
-    std::vector<unsigned> indices_(ndata);
-    std::vector<float> data_(ndata);
-
-    for (int i = 0; i < nindptr; ++i) {
-      col_ptr_[i] = static_cast<bst_ulong>(p_indptr[i]);
-    }
-    #pragma omp parallel for schedule(static)
-    for (int i = 0; i < ndata; ++i) {
-      indices_[i] = static_cast<unsigned>(p_indices[i]);
-      data_[i] = static_cast<float>(p_data[i]);
-    }
-    DMatrixHandle handle;
-    CheckErr(XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
-                                    BeginPtr(data_), nindptr, ndata,
-                                    &handle));
-    _WrapperEnd();
-    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
-    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
-    UNPROTECT(1);
-    return ret;
-  }
-  SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
-    _WrapperBegin();
-    int len = length(idxset);
-    std::vector<int> idxvec(len);
-    for (int i = 0; i < len; ++i) {
-      idxvec[i] = INTEGER(idxset)[i] - 1;
-    }
-    DMatrixHandle res;
-    CheckErr(XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle),
-                                   BeginPtr(idxvec), len,
-                                   &res));
-    _WrapperEnd();
-    SEXP ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue));
-    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
-    UNPROTECT(1);
-    return ret;
-  }
-  void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
-    _WrapperBegin();
-    CheckErr(XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
-                                 CHAR(asChar(fname)), asInteger(silent)));
-    _WrapperEnd();
-  }
-  void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
-    _WrapperBegin();
-    int len = length(array);
-    const char *name = CHAR(asChar(field));
-    if (!strcmp("group", name)) {
-      std::vector<unsigned> vec(len);
-      #pragma omp parallel for schedule(static)
-      for (int i = 0; i < len; ++i) {
-        vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
-      }
-      CheckErr(XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len));
-    } else {
-      std::vector<float> vec(len);
-      #pragma omp parallel for schedule(static)
-      for (int i = 0; i < len; ++i) {
-        vec[i] = REAL(array)[i];
-      }
-      CheckErr(XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
-                                     CHAR(asChar(field)),
-                                     BeginPtr(vec), len));
-    }
-    _WrapperEnd();
-  }
-  SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
-    _WrapperBegin();
-    bst_ulong olen;
-    const float *res;
-    CheckErr(XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
-                                   CHAR(asChar(field)),
-                                   &olen,
-                                   &res));
-    _WrapperEnd();
-    SEXP ret = PROTECT(allocVector(REALSXP, olen));
-    for (size_t i = 0; i < olen; ++i) {
-      REAL(ret)[i] = res[i];
-    }
-    UNPROTECT(1);
-    return ret;
-  }
-  SEXP XGDMatrixNumRow_R(SEXP handle) {
-    bst_ulong nrow;
-    CheckErr(XGDMatrixNumRow(R_ExternalPtrAddr(handle), &nrow));
-    return ScalarInteger(static_cast<int>(nrow));
-  }
-  // functions related to booster
-  void _BoosterFinalizer(SEXP ext) {
-    if (R_ExternalPtrAddr(ext) == NULL) return;
-    CheckErr(XGBoosterFree(R_ExternalPtrAddr(ext)));
-    R_ClearExternalPtr(ext);
-  }
-  SEXP XGBoosterCreate_R(SEXP dmats) {
-    _WrapperBegin();
-    int len = length(dmats);
-    std::vector<void*> dvec;
-    for (int i = 0; i < len; ++i) {
-      dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
-    }
-    BoosterHandle handle;
-    CheckErr(XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle));
-    _WrapperEnd();
-    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
-    R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
-    UNPROTECT(1);
-    return ret;
-  }
-  void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
-    _WrapperBegin();
-    CheckErr(XGBoosterSetParam(R_ExternalPtrAddr(handle),
-                               CHAR(asChar(name)),
-                               CHAR(asChar(val))));
-    _WrapperEnd();
-  }
-  void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
-    _WrapperBegin();
-    CheckErr(XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
-                                    asInteger(iter),
-                                    R_ExternalPtrAddr(dtrain)));
-    _WrapperEnd();
-  }
-  void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
-    _WrapperBegin();
-    utils::Check(length(grad) == length(hess), "gradient and hess must have same length");
-    int len = length(grad);
-    std::vector<float> tgrad(len), thess(len);
-    #pragma omp parallel for schedule(static)
-    for (int j = 0; j < len; ++j) {
-      tgrad[j] = REAL(grad)[j];
-      thess[j] = REAL(hess)[j];
-    }
-    CheckErr(XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
-                                   R_ExternalPtrAddr(dtrain),
-                                   BeginPtr(tgrad), BeginPtr(thess),
-                                   len));
-    _WrapperEnd();
-  }
-  SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
-    _WrapperBegin();
-    utils::Check(length(dmats) == length(evnames), "dmats and evnams must have same length");
-    int len = length(dmats);
-    std::vector<void*> vec_dmats;
-    std::vector<std::string> vec_names;
-    std::vector<const char*> vec_sptr;
-    for (int i = 0; i < len; ++i) {
-      vec_dmats.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
-      vec_names.push_back(std::string(CHAR(asChar(VECTOR_ELT(evnames, i)))));
-    }
-    for (int i = 0; i < len; ++i) {
-      vec_sptr.push_back(vec_names[i].c_str());
-    }
-    const char *ret;
-    CheckErr(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
-                                  asInteger(iter),
-                                  BeginPtr(vec_dmats),
-                                  BeginPtr(vec_sptr),
-                                  len, &ret));
-    _WrapperEnd();
-    return mkString(ret);
-  }
-  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) {
-    _WrapperBegin();
-    bst_ulong olen;
-    const float *res;
-    CheckErr(XGBoosterPredict(R_ExternalPtrAddr(handle),
-                              R_ExternalPtrAddr(dmat),
-                              asInteger(option_mask),
-                              asInteger(ntree_limit),
-                              &olen, &res));
-    _WrapperEnd();
-    SEXP ret = PROTECT(allocVector(REALSXP, olen));
-    for (size_t i = 0; i < olen; ++i) {
-      REAL(ret)[i] = res[i];
-    }
-    UNPROTECT(1);
-    return ret;
-  }
-  void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
-    _WrapperBegin();
-    CheckErr(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));
-    _WrapperEnd();
-  }
-  void XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
-    _WrapperBegin();
-    CheckErr(XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));
-    _WrapperEnd();
-  }
-  void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) {
-    _WrapperBegin();
-    XGBoosterLoadModelFromBuffer(R_ExternalPtrAddr(handle),
-                                 RAW(raw),
-                                 length(raw));
-    _WrapperEnd();
-  }
-  SEXP XGBoosterModelToRaw_R(SEXP handle) {
-    bst_ulong olen;
-    _WrapperBegin();
-    const char *raw;
-    CheckErr(XGBoosterGetModelRaw(R_ExternalPtrAddr(handle), &olen, &raw));
-    _WrapperEnd();
-    SEXP ret = PROTECT(allocVector(RAWSXP, olen));
-    if (olen != 0) {
-      memcpy(RAW(ret), raw, olen);
-    }
-    UNPROTECT(1);
-    return ret;
-  }
-  SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
-    _WrapperBegin();
-    bst_ulong olen;
-    const char **res;
-    CheckErr(XGBoosterDumpModel(R_ExternalPtrAddr(handle),
-                                CHAR(asChar(fmap)),
-                                asInteger(with_stats),
-                                &olen, &res));
-    _WrapperEnd();
-    SEXP out = PROTECT(allocVector(STRSXP, olen));
-    for (size_t i = 0; i < olen; ++i) {
-      stringstream stream;
-      stream <<  "booster[" << i <<"]\n" << res[i];
-      SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
-    }
-    UNPROTECT(1);
-    return out;
-  }
-}
diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h
index 768b2ced7..66dd1f1cf 100644
--- a/R-package/src/xgboost_R.h
+++ b/R-package/src/xgboost_R.h
@@ -4,155 +4,171 @@
  * \author Tianqi Chen
  * \brief R wrapper of xgboost
  */
-#ifndef XGBOOST_WRAPPER_R_H_ // NOLINT(*)
-#define XGBOOST_WRAPPER_R_H_ // NOLINT(*)
+#ifndef XGBOOST_R_H_ // NOLINT(*)
+#define XGBOOST_R_H_ // NOLINT(*)
 
 extern "C" {
 #include <Rinternals.h>
 #include <R_ext/Random.h>
 #include <Rmath.h>
 }
+#include <xgboost/c_api.h>
 
-extern "C" {
-  /*!
-   * \brief check whether a handle is NULL
-   * \param handle
-   * \return whether it is null ptr
+/*!
+ * \brief check whether a handle is NULL
+ * \param handle
+ * \return whether it is null ptr
+ */
+XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle);
+
+/*!
+ * \brief load a data matrix
+ * \param fname name of the content
+ * \param silent whether print messages
+ * \return a loaded data matrix
+ */
+XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
+
+/*!
+ * \brief create matrix content from dense matrix
+ * This assumes the matrix is stored in column major format
+ * \param mat R Matrix object
+ * \param missing which value to represent missing value
+ * \return created dmatrix
+ */
+XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat,
+                                      SEXP missing);
+/*!
+ * \brief create a matrix content from CSC format
+ * \param indptr pointer to column headers
+ * \param indices row indices
+ * \param data content of the data
+ * \return created dmatrix
+ */
+XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
+                                      SEXP indices,
+                                      SEXP data);
+
+/*!
+ * \brief create a new dmatrix from sliced content of existing matrix
+ * \param handle instance of data matrix to be sliced
+ * \param idxset index set
+ * \return a sliced new matrix
+ */
+XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset);
+
+/*!
+ * \brief save a data matrix into binary file
+ * \param handle an instance of data matrix
+ * \param fname file name
+ * \param silent print statistics when saving
+ */
+XGB_DLL void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent);
+
+/*!
+ * \brief set information to dmatrix
+ * \param handle an instance of data matrix
+ * \param field field name, can be label, weight
+ * \param array pointer to float vector
+ */
+XGB_DLL void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array);
+
+/*!
+ * \brief get info vector from matrix
+ * \param handle an instance of data matrix
+ * \param field field name
+ * \return info vector
+ */
+XGB_DLL SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
+
+/*!
+ * \brief return number of rows
+ * \param handle an instance of data matrix
+ */
+XGB_DLL SEXP XGDMatrixNumRow_R(SEXP handle);
+
+/*!
+ * \brief create xgboost learner
+ * \param dmats a list of dmatrix handles that will be cached
+ */
+XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats);
+
+/*!
+ * \brief set parameters
+ * \param handle handle
+ * \param name  parameter name
+ * \param val value of parameter
+ */
+XGB_DLL void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val);
+
+/*!
+ * \brief update the model in one round using dtrain
+ * \param ext handle
+ * \param iter current iteration rounds
+ * \param dtrain training data
+ */
+XGB_DLL void XGBoosterUpdateOneIter_R(SEXP ext, SEXP iter, SEXP dtrain);
+
+/*!
+ * \brief update the model, by directly specify gradient and second order gradient,
+ *        this can be used to replace UpdateOneIter, to support customized loss function
+ * \param handle handle
+ * \param dtrain training data
+ * \param grad gradient statistics
+ * \param hess second order gradient statistics
+ */
+XGB_DLL void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess);
+
+/*!
+ * \brief get evaluation statistics for xgboost
+ * \param handle handle
+ * \param iter current iteration rounds
+ * \param dmats list of handles to dmatrices
+ * \param evnames names of evaluation
+ * \return the string containing evaluation statistics
+ */
+XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames);
+
+/*!
+ * \brief make prediction based on dmat
+ * \param handle handle
+ * \param dmat data matrix
+ * \param option_mask output_margin:1 predict_leaf:2
+ * \param ntree_limit limit number of trees used in prediction
+ */
+XGB_DLL SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit);
+/*!
+ * \brief load model from existing file
+ * \param handle handle
+ * \param fname file name
+ */
+XGB_DLL void XGBoosterLoadModel_R(SEXP handle, SEXP fname);
+
+/*!
+ * \brief save model into existing file
+ * \param handle handle
+ * \param fname file name
+ */
+XGB_DLL void XGBoosterSaveModel_R(SEXP handle, SEXP fname);
+
+/*!
+ * \brief load model from raw array
+ * \param handle handle
+ */
+XGB_DLL void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw);
+
+/*!
+ * \brief save model into R's raw array
+ * \param handle handle
+ * \return raw array
    */
-  SEXP XGCheckNullPtr_R(SEXP handle);
-  /*!
-   * \brief load a data matrix
-   * \param fname name of the content
-   * \param silent whether print messages
-   * \return a loaded data matrix
-   */
-  SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
-  /*!
-   * \brief create matrix content from dense matrix
-   * This assumes the matrix is stored in column major format
-   * \param data R Matrix object
-   * \param missing which value to represent missing value
-   * \return created dmatrix
-   */
-  SEXP XGDMatrixCreateFromMat_R(SEXP mat,
-                                SEXP missing);
-  /*!
-   * \brief create a matrix content from CSC format
-   * \param indptr pointer to column headers
-   * \param indices row indices
-   * \param data content of the data
-   * \return created dmatrix
-   */
-  SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
-                                SEXP indices,
-                                SEXP data);
-  /*!
-   * \brief create a new dmatrix from sliced content of existing matrix
-   * \param handle instance of data matrix to be sliced
-   * \param idxset index set
-   * \return a sliced new matrix
-   */
-  SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset);
-  /*!
-   * \brief load a data matrix into binary file
-   * \param handle a instance of data matrix
-   * \param fname file name
-   * \param silent print statistics when saving
-   */
-  void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent);
-  /*!
-   * \brief set information to dmatrix
-   * \param handle a instance of data matrix
-   * \param field field name, can be label, weight
-   * \param array pointer to float vector
-   */
-  void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array);
-  /*!
-   * \brief get info vector from matrix
-   * \param handle a instance of data matrix
-   * \param field field name
-   * \return info vector
-   */
-  SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
-  /*!
-   * \brief return number of rows
-   * \param handle a instance of data matrix
-   */
-  SEXP XGDMatrixNumRow_R(SEXP handle);
-  /*!
-   * \brief create xgboost learner
-   * \param dmats a list of dmatrix handles that will be cached
-   */
-  SEXP XGBoosterCreate_R(SEXP dmats);
-  /*!
-   * \brief set parameters
-   * \param handle handle
-   * \param name  parameter name
-   * \param val value of parameter
-   */
-  void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val);
-  /*!
-   * \brief update the model in one round using dtrain
-   * \param handle handle
-   * \param iter current iteration rounds
-   * \param dtrain training data
-   */
-  void XGBoosterUpdateOneIter_R(SEXP ext, SEXP iter, SEXP dtrain);
-  /*!
-   * \brief update the model, by directly specify gradient and second order gradient,
-   *        this can be used to replace UpdateOneIter, to support customized loss function
-   * \param handle handle
-   * \param dtrain training data
-   * \param grad gradient statistics
-   * \param hess second order gradient statistics
-   */
-  void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess);
-  /*!
-   * \brief get evaluation statistics for xgboost
-   * \param handle handle
-   * \param iter current iteration rounds
-   * \param dmats list of handles to dmatrices
-   * \param evname name of evaluation
-   * \return the string containing evaluation stati
-   */
-  SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames);
-  /*!
-   * \brief make prediction based on dmat
-   * \param handle handle
-   * \param dmat data matrix
-   * \param option_mask output_margin:1 predict_leaf:2
-   * \param ntree_limit limit number of trees used in prediction
-   */
-  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit);
-  /*!
-   * \brief load model from existing file
-   * \param handle handle
-   * \param fname file name
-   */
-  void XGBoosterLoadModel_R(SEXP handle, SEXP fname);
-  /*!
-   * \brief save model into existing file
-   * \param handle handle
-   * \param fname file name
-   */
-  void XGBoosterSaveModel_R(SEXP handle, SEXP fname);
-  /*!
-   * \brief load model from raw array
-   * \param handle handle
-   */
-  void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw);
-  /*!
-   * \brief save model into R's raw array
-   * \param handle handle
-   * \return raw array
-   */
-  SEXP XGBoosterModelToRaw_R(SEXP handle);
-  /*!
-   * \brief dump model into a string
-   * \param handle handle
-   * \param fmap  name to fmap can be empty string
-   * \param with_stats whether dump statistics of splits
-   */
-  SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats);
-}
+XGB_DLL SEXP XGBoosterModelToRaw_R(SEXP handle);
+
+/*!
+ * \brief dump model into a string
+ * \param handle handle
+ * \param fmap  name to fmap can be empty string
+ * \param with_stats whether dump statistics of splits
+ */
+XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats);
 #endif  // XGBOOST_WRAPPER_R_H_ // NOLINT(*)
diff --git a/R-package/src/xgboost_assert.c b/R-package/src/xgboost_assert.c
index 072074243..4706a039e 100644
--- a/R-package/src/xgboost_assert.c
+++ b/R-package/src/xgboost_assert.c
@@ -24,11 +24,3 @@ void XGBoostCheck_R(int exp, const char *fmt, ...) {
     error("%s\n", buf);
   }
 }
-int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...) {
-  int ret;
-  va_list args;
-  va_start(args, fmt);
-  ret = vsnprintf(buf, size, fmt, args);
-  va_end(args);
-  return ret;
-}
diff --git a/R-package/src/xgboost_custom.cc b/R-package/src/xgboost_custom.cc
new file mode 100644
index 000000000..9d0de76c4
--- /dev/null
+++ b/R-package/src/xgboost_custom.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2015 by Contributors
+// This file contains the customization implementations of R module
+// to change behavior of libxgboost
+
+#include <xgboost/logging.h>
+#include "src/common/random.h"
+#include "./xgboost_R.h"
+
+// redirect the messages to R's console.
+namespace dmlc {
+void CustomLogMessage::Log(const std::string& msg) {
+  Rprintf("%s\n", msg.c_str());
+}
+}  // namespace dmlc
+
+// implements rabit error handling.
+extern "C" {
+  void XGBoostAssert_R(int exp, const char *fmt, ...);
+  void XGBoostCheck_R(int exp, const char *fmt, ...);
+}
+
+namespace rabit {
+namespace utils {
+extern "C" {
+  void (*Printf)(const char *fmt, ...) = Rprintf;
+  void (*Assert)(int exp, const char *fmt, ...) = XGBoostAssert_R;
+  void (*Check)(int exp, const char *fmt, ...) = XGBoostCheck_R;
+  void (*Error)(const char *fmt, ...) = error;
+}
+}
+}
+
+namespace xgboost {
+ConsoleLogger::~ConsoleLogger() {
+  dmlc::CustomLogMessage::Log(log_stream_.str());
+}
+TrackerLogger::~TrackerLogger() {
+  dmlc::CustomLogMessage::Log(log_stream_.str());
+}
+}  // namespace xgboost
+
+namespace xgboost {
+namespace common {
+
+// redirect the math functions.
+bool CheckNAN(double v) {
+  return ISNAN(v);
+}
+double LogGamma(double v) {
+  return lgammafn(v);
+}
+
+// customize random engine.
+void CustomGlobalRandomEngine::seed(CustomGlobalRandomEngine::result_type val) {
+  // ignore the seed
+}
+
+// use R's PRNG to replace the engine's random number generation
+CustomGlobalRandomEngine::result_type
+CustomGlobalRandomEngine::operator()() {
+  return static_cast<result_type>(
+      std::floor(unif_rand() * CustomGlobalRandomEngine::max()));
+}
+}  // namespace common
+}  // namespace xgboost
diff --git a/R-package/tests/testthat/test_poisson_regression.R b/R-package/tests/testthat/test_poisson_regression.R
index c5389dd0f..5473d930f 100644
--- a/R-package/tests/testthat/test_poisson_regression.R
+++ b/R-package/tests/testthat/test_poisson_regression.R
@@ -10,5 +10,5 @@ test_that("poisson regression works", {
   expect_equal(class(bst), "xgb.Booster")
   pred <- predict(bst,as.matrix(mtcars[, -11]))
   expect_equal(length(pred), 32)
-  expect_equal(sqrt(mean( (pred - mtcars[,11]) ^ 2)), 1.16, tolerance = 0.01)
+  expect_less_than(sqrt(mean( (pred - mtcars[,11]) ^ 2)), 2.5)
 })
diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd
index 08d6bfdf5..e981df0ed 100644
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "Understand your dataset with Xgboost"
-output: 
+output:
   rmarkdown::html_vignette:
     css: vignette.css
     number_sections: yes
@@ -12,8 +12,11 @@ vignette: >
   \usepackage[utf8]{inputenc}
 ---
 
+Understand your dataset with XGBoost
+====================================
+
 Introduction
-============
+------------
 
 The purpose of this Vignette is to show you how to use **Xgboost** to discover and understand your own dataset better.
 
@@ -25,16 +28,16 @@ Pacakge loading:
 require(xgboost)
 require(Matrix)
 require(data.table)
-if (!require('vcd')) install.packages('vcd') 
+if (!require('vcd')) install.packages('vcd')
 ```
 
 > **VCD** package is used for one of its embedded dataset only.
 
 Preparation of the dataset
-==========================
+--------------------------
+
+### Numeric VS categorical variables
 
-Numeric VS categorical variables
---------------------------------
 
 **Xgboost** manages only `numeric` vectors.
 
@@ -48,10 +51,9 @@ A *categorical* variable has a fixed number of different values. For instance, i
 
 To answer the question above we will convert *categorical* variables to `numeric` one.
 
-Conversion from categorical to numeric variables
-------------------------------------------------
+### Conversion from categorical to numeric variables
 
-### Looking at the raw data
+#### Looking at the raw data
 
 In this Vignette we will see how to transform a *dense* `data.frame` (*dense* = few zeroes in the matrix) with *categorical* variables to a very *sparse* matrix (*sparse* = lots of zero in the matrix) of `numeric` features.
 
@@ -85,11 +87,11 @@ str(df)
 > * can take a limited number of values (like `factor`) ;
 > * these values are ordered (unlike `factor`). Here these ordered values are: `Marked > Some > None`
 
-### Creation of new features based on old ones
+#### Creation of new features based on old ones
 
 We will add some new *categorical* features to see if it helps.
 
-#### Grouping per 10 years
+##### Grouping per 10 years
 
 For the first feature we create groups of age by rounding the real age.
 
@@ -101,7 +103,7 @@ Therefore, 20 is not closer to 30 than 60. To make it short, the distance betwee
 head(df[,AgeDiscret := as.factor(round(Age/10,0))])
 ```
 
-#### Random split in two groups
+##### Random split in two groups
 
 Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...).
 
@@ -109,15 +111,15 @@ Following is an even stronger simplification of the real age with an arbitrary s
 head(df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))])
 ```
 
-#### Risks in adding correlated features
+##### Risks in adding correlated features
 
-These new features are highly correlated to the `Age` feature because they are simple transformations of this feature. 
+These new features are highly correlated to the `Age` feature because they are simple transformations of this feature.
 
 For many machine learning algorithms, using correlated features is not a good idea. It may sometimes make prediction less accurate, and most of the time make interpretation of the model almost impossible. GLM, for instance, assumes that the features are uncorrelated.
 
 Fortunately, decision tree algorithms (including boosted trees) are very robust to these features. Therefore we have nothing to do to manage this situation.
 
-#### Cleaning data
+##### Cleaning data
 
 We remove ID as there is nothing to learn from this feature (it would just add some noise).
 
@@ -132,7 +134,7 @@ levels(df[,Treatment])
 ```
 
 
-### One-hot encoding
+#### One-hot encoding
 
 Next step, we will transform the categorical data to dummy variables.
 This is the [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) step.
@@ -156,12 +158,12 @@ Create the output `numeric` vector (not as a sparse `Matrix`):
 output_vector = df[,Improved] == "Marked"
 ```
 
-1. set `Y` vector to `0`; 
-2. set `Y` to `1` for rows where `Improved == Marked` is `TRUE` ; 
+1. set `Y` vector to `0`;
+2. set `Y` to `1` for rows where `Improved == Marked` is `TRUE` ;
 3. return `Y` vector.
 
 Build the model
-===============
+---------------
 
 The code below is very usual. For more information, you can look at the documentation of `xgboost` function (or at the vignette [Xgboost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)).
 
@@ -173,17 +175,17 @@ bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 4,
 
 You can see some `train-error: 0.XXXXX` lines followed by a number. It decreases. Each line shows how well the model explains your data. Lower is better.
 
-A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitting) (meaning it copy/paste too much the past, and won't be that good to predict the future). 
+A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitting) (meaning it copy/paste too much the past, and won't be that good to predict the future).
 
-> Here you can see the numbers decrease until line 7 and then increase. 
+> Here you can see the numbers decrease until line 7 and then increase.
 >
 > It probably means we are overfitting. To fix that I should reduce the number of rounds to `nround = 4`. I will let things like that because I don't really care for the purpose of this example :-)
 
 Feature importance
-==================
+------------------
+
+### Measure feature importance
 
-Measure feature importance
---------------------------
 
 ### Build the feature importance data.table
 
@@ -204,7 +206,7 @@ head(importance)
 
 `Frequency` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it).
 
-### Improvement in the interpretability of feature importance data.table
+#### Improvement in the interpretability of feature importance data.table
 
 We can go deeper in the analysis of the model. In the `data.table` above, we have discovered which features counts to predict if the illness will go or not. But we don't yet know the role of these features. For instance, one of the question we may want to answer would be: does receiving a placebo treatment helps to recover from the illness?
 
@@ -233,8 +235,8 @@ Therefore, according to our findings, getting a placebo doesn't seem to help but
 
 > You may wonder how to interpret the `< 1.00001` on the first line. Basically, in a sparse `Matrix`, there is no `0`, therefore, looking for one hot-encoded categorical observations validating the rule `< 1.00001` is like just looking for `1` for this feature.
 
-Plotting the feature importance
--------------------------------
+### Plotting the feature importance
+
 
 All these things are nice, but it would be even better to plot the results.
 
@@ -250,11 +252,11 @@ According to the plot above, the most important features in this dataset to pred
 
 * the Age ;
 * having received a placebo or not ;
-* the sex is third but already included in the not interesting features group ; 
+* the sex is third but already included in the not interesting features group ;
 * then we see our generated features (AgeDiscret). We can see that their contribution is very low.
 
-Do these results make sense?
-------------------------------
+### Do these results make sense?
+
 
 Let's check some **Chi2** between each of these features and the label.
 
@@ -279,18 +281,18 @@ c2 <- chisq.test(df$AgeCat, output_vector)
 print(c2)
 ```
 
-The perfectly random split I did between young and old at 30 years old have a low correlation of **`r round(c2$statistic, 2)`**. It's a result we may expect as may be in my mind > 30 years is being old (I am 32 and starting feeling old, this may explain that), but for the illness we are studying, the age to be vulnerable is not the same. 
+The perfectly random split I did between young and old at 30 years old has a low correlation of **`r round(c2$statistic, 2)`**. It's a result we may expect: maybe in my mind > 30 years is being old (I am 32 and starting to feel old, this may explain that), but for the illness we are studying, the age to be vulnerable is not the same.
 
-Morality: don't let your *gut* lower the quality of your model. 
+Morality: don't let your *gut* lower the quality of your model.
 
 In *data science* expression, there is the word *science* :-)
 
 Conclusion
-==========
+----------
 
-As you can see, in general *destroying information by simplifying it won't improve your model*. **Chi2** just demonstrates that. 
+As you can see, in general *destroying information by simplifying it won't improve your model*. **Chi2** just demonstrates that.
 
-But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. 
+But in more complex cases, creating a new feature based on an existing one, which makes the link with the outcome more obvious, may help the algorithm and improve the model.
 
 The case studied here is not enough complex to show that. Check [Kaggle website](http://www.kaggle.com/) for some challenging datasets. However it's almost always worse when you add some arbitrary rules.
 
@@ -299,7 +301,7 @@ Moreover, you can notice that even if we have added some not useful new features
 Linear model may not be that smart in this scenario.
 
 Special Note: What about Random Forests™?
-==========================================
+-----------------------------------------
 
 As you may know, [Random Forests™](http://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](http://en.wikipedia.org/wiki/Ensemble_learning) family.
 
@@ -313,7 +315,7 @@ However, in Random Forests™ this random choice will be done for each tree, bec
 
 In boosting, when a specific link between feature and outcome have been learned by the algorithm, it will try to not refocus on it (in theory it is what happens, reality is not always that simple). Therefore, all the importance will be on feature `A` or on feature `B` (but not both). You will know that one feature have an important role in the link between the observations and the label. It is still up to you to search for the correlated features to the one detected as important if you need to know all of them.
 
-If you want to try Random Forests™ algorithm, you can tweak Xgboost parameters! 
+If you want to try Random Forests™ algorithm, you can tweak Xgboost parameters!
 
 **Warning**: this is still an experimental parameter.
 
diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd
index 1e6060eb1..aa33073ad 100644
--- a/R-package/vignettes/xgboostPresentation.Rmd
+++ b/R-package/vignettes/xgboostPresentation.Rmd
@@ -13,8 +13,11 @@ vignette: >
   \usepackage[utf8]{inputenc}
 ---
 
-Introduction
-============
+XGBoost R Tutorial
+==================
+
+## Introduction
+
 
 **Xgboost** is short for e**X**treme **G**radient **Boost**ing package.
 
@@ -40,16 +43,16 @@ It has several features:
 * Sparsity: it accepts *sparse* input for both *tree booster*  and *linear booster*, and is optimized for *sparse* input ;
 * Customization: it supports customized objective functions and evaluation functions.
 
-Installation
-============
+## Installation
+
+
+### Github version
 
-Github version
---------------
 
 For up-to-date version (highly recommended), install from *Github*:
 
 ```{r installGithub, eval=FALSE}
-devtools::install_github('dmlc/xgboost', subdir='R-package')
+devtools::install_git('git://github.com/dmlc/xgboost', subdir='R-package')
 ```
 
 > *Windows* user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
@@ -61,8 +64,8 @@ As of 2015-03-13, ‘xgboost’ was removed from the CRAN repository.
 
 Formerly available versions can be obtained from the CRAN [archive](http://cran.r-project.org/src/contrib/Archive/xgboost)
 
-Learning
-========
+## Learning
+
 
 For the purpose of this tutorial we will load **XGBoost** package.
 
@@ -70,15 +73,15 @@ For the purpose of this tutorial we will load **XGBoost** package.
 require(xgboost)
 ```
 
-Dataset presentation
---------------------
+### Dataset presentation
+
 
 In this example, we are aiming to predict whether a mushroom can be eaten or not (like in many tutorials, example data are the the same as you will use on in your every day life :-).
 
 Mushroom data is cited from UCI Machine Learning Repository. @Bache+Lichman:2013.
 
-Dataset loading
----------------
+### Dataset loading
+
 
 We will load the `agaricus` datasets embedded with the package and will link them to variables.
 
@@ -124,12 +127,12 @@ class(train$data)[1]
 class(train$label)
 ```
 
-Basic Training using XGBoost
-----------------------------
+### Basic Training using XGBoost
+
 
 This step is the most critical part of the process for the quality of our model.
 
-### Basic training
+#### Basic training
 
 We are using the `train` data. As explained above, both `data` and `label` are stored in a `list`.
 
@@ -148,9 +151,9 @@ bstSparse <- xgboost(data = train$data, label = train$label, max.depth = 2, eta
 
 > More complex the relationship between your features and your `label` is, more passes you need.
 
-### Parameter variations
+#### Parameter variations
 
-#### Dense matrix
+##### Dense matrix
 
 Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R** matrix.
 
@@ -158,7 +161,7 @@ Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R**
 bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
 ```
 
-#### xgb.DMatrix
+##### xgb.DMatrix
 
 **XGBoost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be useful for the most advanced features we will discover later.
 
@@ -167,7 +170,7 @@ dtrain <- xgb.DMatrix(data = train$data, label = train$label)
 bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
 ```
 
-#### Verbose option
+##### Verbose option
 
 **XGBoost** has several features to help you to view how the learning progress internally. The purpose is to help you to set the best parameters, which is the key of your model quality.
 
@@ -188,11 +191,11 @@ bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, o
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 2)
 ```
 
-Basic prediction using XGBoost
-==============================
+## Basic prediction using XGBoost
+
+
+### Perform the prediction
 
-Perform the prediction
-----------------------
 
 The purpose of the model we have built is to classify new data. As explained before, we will use the `test` dataset for this step.
 
@@ -208,8 +211,8 @@ print(head(pred))
 
 These numbers doesn't look like *binary classification* `{0,1}`. We need to perform a simple transformation before being able to use these results.
 
-Transform the regression in a binary classification
----------------------------------------------------
+### Transform the regression in a binary classification
+
 
 The only thing that **XGBoost** does is a *regression*. **XGBoost** is using `label` vector to build its *regression* model.
 
@@ -222,8 +225,8 @@ prediction <- as.numeric(pred > 0.5)
 print(head(prediction))
 ```
 
-Measuring model performance
----------------------------
+### Measuring model performance
+
 
 To measure the model performance, we will compute a simple metric, the *average error*.
 
@@ -246,14 +249,14 @@ The most important thing to remember is that **to do a classification, you just
 
 This metric is **`r round(err, 2)`** and is pretty low: our yummly mushroom model works well!
 
-Advanced features
-=================
+## Advanced features
+
 
 Most of the features below have been implemented to help you to improve your model by offering a better understanding of its content.
 
 
-Dataset preparation
--------------------
+### Dataset preparation
+
 
 For the following advanced features, we need to put data in `xgb.DMatrix` as explained above.
 
@@ -262,8 +265,8 @@ dtrain <- xgb.DMatrix(data = train$data, label=train$label)
 dtest <- xgb.DMatrix(data = test$data, label=test$label)
 ```
 
-Measure learning progress with xgb.train
-----------------------------------------
+### Measure learning progress with xgb.train
+
 
 Both `xgboost` (simple) and `xgb.train` (advanced) functions train models.
 
@@ -295,8 +298,8 @@ bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchli
 
 > `eval.metric` allows us to monitor two new metrics for each round, `logloss` and `error`.
 
-Linear boosting
----------------
+### Linear boosting
+
 
 Until now, all the learnings we have performed were based on boosting trees. **XGBoost** implements a second algorithm, based on linear boosting. The only difference with previous command is `booster = "gblinear"` parameter (and removing `eta` parameter).
 
@@ -308,10 +311,10 @@ In this specific case, *linear boosting* gets sligtly better performance metrics
 
 In simple cases, it will happen because there is nothing better than a linear algorithm to catch a linear link. However, decision trees are much better to catch a non linear link between predictors and outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to have an idea of what to use.
 
-Manipulating xgb.DMatrix
-------------------------
+### Manipulating xgb.DMatrix
 
-### Save / Load
+
+#### Save / Load
 
 Like saving models, `xgb.DMatrix` object (which groups both dataset and outcome) can also be saved using `xgb.DMatrix.save` function.
 
@@ -326,7 +329,7 @@ bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nthread = 2, nround=2, watchl
 file.remove("dtrain.buffer")
 ```
 
-### Information extraction
+#### Information extraction
 
 Information can be extracted from `xgb.DMatrix` using `getinfo` function. Hereafter we will extract `label` data.
 
@@ -337,8 +340,8 @@ err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
 print(paste("test-error=", err))
 ```
 
-View feature importance/influence from the learnt model
--------------------------------------------------------
+### View feature importance/influence from the learnt model
+
 
 Feature importance is similar to R gbm package's relative influence (rel.inf).
 
@@ -348,8 +351,8 @@ print(importance_matrix)
 xgb.plot.importance(importance_matrix = importance_matrix)
 ```
 
-View the trees from a model
----------------------------
+### View the trees from a model
+
 
 You can dump the tree you learned using `xgb.dump` into a text file.
 
@@ -365,8 +368,8 @@ xgb.plot.tree(model = bst)
 
 > if you provide a path to `fname` parameter you can save the trees to your hard drive.
 
-Save and load models
---------------------
+### Save and load models
+
 
 Maybe your dataset is big, and it takes time to train a model on it? May be you are not a big fan of losing time in redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required.
 
@@ -416,5 +419,4 @@ print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))
 
 > Again `0`? It seems that `XGBoost` works pretty well!
 
-References
-==========
+## References
diff --git a/README.md b/README.md
index f33394d40..092e7abe3 100644
--- a/README.md
+++ b/README.md
@@ -7,47 +7,22 @@
 [![PyPI version](https://badge.fury.io/py/xgboost.svg)](https://pypi.python.org/pypi/xgboost/)
 [![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 
-An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
-
-It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework, including [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLM) and [Gradient Boosted Decision Trees](https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting) (GBDT). XGBoost can also be [distributed](#features) and scale to Terascale data
-
-XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) <img src=https://avatars2.githubusercontent.com/u/11508361?v=3&s=20> projects
+XGBoost is an optimized distributed gradient boosting library designed to be highly *efficient*, *flexible* and *portable*.
+It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework.
+XGBoost provides parallel tree boosting (also known as GBDT, GBM) that solves many data science problems in a fast and accurate way.
+The same code runs on major distributed environments (Hadoop, SGE, MPI) and can solve problems beyond billions of examples.
+XGBoost is part of [DMLC](http://dmlc.github.io/) projects.
 
 Contents
 --------
-* [What's New](#whats-new)
-* [Version](#version)
-* [Documentation](doc/index.md)
-* [Build Instruction](doc/build.md)
-* [Features](#features)
-* [Distributed XGBoost](multi-node)
-* [Usecases](doc/index.md#highlight-links)
-* [Bug Reporting](#bug-reporting)
-* [Contributing to XGBoost](#contributing-to-xgboost)
-* [Committers and Contributors](CONTRIBUTORS.md)
-* [License](#license)
-* [XGBoost in Graphlab Create](#xgboost-in-graphlab-create)
+* [Documentation and Tutorials](https://xgboost.readthedocs.org)
+* [Code Examples](demo)
+* [Installation](doc/build.md)
+* [Contribute to XGBoost](http://xgboost.readthedocs.org/en/latest/dev-guide/contribute.html)
 
 What's New
 ----------
-
-* XGBoost helps Vlad Mironov, Alexander Guschin to win the [CERN LHCb experiment Flavour of Physics competition](https://www.kaggle.com/c/flavours-of-physics). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/11/30/flavour-of-physics-technical-write-up-1st-place-go-polar-bears/).
-* XGBoost helps Mario Filho, Josef Feigl, Lucas, Gilberto to win the [Caterpillar Tube Pricing competition](https://www.kaggle.com/c/caterpillar-tube-pricing). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/09/22/caterpillar-winners-interview-1st-place-gilberto-josef-leustagos-mario/).
-* XGBoost helps Halla Yang to win the [Recruit Coupon Purchase Prediction Challenge](https://www.kaggle.com/c/coupon-purchase-prediction). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/10/21/recruit-coupon-purchase-winners-interview-2nd-place-halla-yang/).
-* XGBoost helps Owen Zhang to win the [Avito Context Ad Click competition](https://www.kaggle.com/c/avito-context-ad-clicks). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/08/26/avito-winners-interview-1st-place-owen-zhang/).
-* XGBoost helps Chenglong Chen to win [Kaggle CrowdFlower Competition](https://www.kaggle.com/c/crowdflower-search-relevance)
-  Check out the [winning solution](https://github.com/ChenglongChen/Kaggle_CrowdFlower)
-* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
-* XGBoost helps three champion teams to win [WWW2015  Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
-  Check out the [winning solution](doc/README.md#highlight-links)
-* [External Memory Version](doc/external_memory.md)
-
-Version
--------
-
-* Current version xgboost-0.4
-  - [Change log](CHANGES.md)
-  - This version is compatible with 0.3x versions
+* [XGBoost brick](NEWS.md) Release
 
 Features
 --------
@@ -61,24 +36,17 @@ Features
 
 Bug Reporting
 -------------
-
 * For reporting bugs please use the [xgboost/issues](https://github.com/dmlc/xgboost/issues) page.
 * For generic questions or to share your experience using xgboost please use the [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/)
 
-
 Contributing to XGBoost
 -----------------------
-
 XGBoost has been developed and used by a group of active community members. Everyone is more than welcome to contribute. It is a way to make the project better and more accessible to more users.
 * Check out [Feature Wish List](https://github.com/dmlc/xgboost/labels/Wish-List) to see what can be improved, or open an issue if you want something.
 * Contribute to the [documents and examples](https://github.com/dmlc/xgboost/blob/master/doc/) to share your experience with other users.
-* Please add your name to [CONTRIBUTORS.md](CONTRIBUTORS.md) after your patch has been merged.
+* Please add your name to [CONTRIBUTORS.md](CONTRIBUTORS.md) after your patch has been merged.
+  - Please also update [NEWS.md](NEWS.md) on changes and improvements in API and docs.
 
 License
 -------
 © Contributors, 2015. Licensed under an [Apache-2](https://github.com/dmlc/xgboost/blob/master/LICENSE) license.
-
-XGBoost in Graphlab Create
---------------------------
-* XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to do data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the [Graphlab Create](http://graphlab.com/products/create/quick-start-guide.html)
-* Nice [blogpost](http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand) by Jay Gu about using GLC boosted tree to solve kaggle bike sharing challenge:
diff --git a/amalgamation/dmlc-minimum0.cc b/amalgamation/dmlc-minimum0.cc
new file mode 100644
index 000000000..bce61129e
--- /dev/null
+++ b/amalgamation/dmlc-minimum0.cc
@@ -0,0 +1,14 @@
+/*!
+ * Copyright 2015 by Contributors.
+ * \brief Minimum DMLC library Amalgamation, used for easy plugin of dmlc lib.
+ *  Normally this is not needed.
+ */
+#include "../dmlc-core/src/io/line_split.cc"
+#include "../dmlc-core/src/io/recordio_split.cc"
+#include "../dmlc-core/src/io/input_split_base.cc"
+#include "../dmlc-core/src/io/local_filesys.cc"
+#include "../dmlc-core/src/data.cc"
+#include "../dmlc-core/src/io.cc"
+#include "../dmlc-core/src/recordio.cc"
+
+
diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc
new file mode 100644
index 000000000..7cc36c16b
--- /dev/null
+++ b/amalgamation/xgboost-all0.cc
@@ -0,0 +1,57 @@
+/*!
+ * Copyright 2015 by Contributors.
+ * \brief XGBoost Amalgamation.
+ *  This offers an alternative way to compile the entire library from this single file.
+ *
+ *  Example usage command.
+ *  - $(CXX) -std=c++0x -fopenmp -shared -o libxgboost.so xgboost-all0.cc -ldmlc -lrabit
+ *
+ * \author Tianqi Chen.
+ */
+
+// metrics
+#include "../src/metric/metric.cc"
+#include "../src/metric/elementwise_metric.cc"
+#include "../src/metric/multiclass_metric.cc"
+#include "../src/metric/rank_metric.cc"
+
+// objectives
+#include "../src/objective/objective.cc"
+#include "../src/objective/regression_obj.cc"
+#include "../src/objective/multiclass_obj.cc"
+#include "../src/objective/rank_obj.cc"
+
+// gbms
+#include "../src/gbm/gbm.cc"
+#include "../src/gbm/gbtree.cc"
+#include "../src/gbm/gblinear.cc"
+
+// data
+#include "../src/data/data.cc"
+#include "../src/data/simple_csr_source.cc"
+#include "../src/data/simple_dmatrix.cc"
+#include "../src/data/sparse_page_raw_format.cc"
+
+#if DMLC_ENABLE_STD_THREAD
+#include "../src/data/sparse_page_source.cc"
+#include "../src/data/sparse_page_dmatrix.cc"
+#endif
+
+// trees
+#include "../src/tree/tree_model.cc"
+#include "../src/tree/tree_updater.cc"
+#include "../src/tree/updater_colmaker.cc"
+#include "../src/tree/updater_prune.cc"
+#include "../src/tree/updater_refresh.cc"
+#include "../src/tree/updater_sync.cc"
+#include "../src/tree/updater_histmaker.cc"
+#include "../src/tree/updater_skmaker.cc"
+
+// global
+#include "../src/learner.cc"
+#include "../src/logging.cc"
+#include "../src/common/common.cc"
+
+// c_api
+#include "../src/c_api/c_api.cc"
+#include "../src/c_api/c_api_error.cc"
diff --git a/appveyor.yml b/appveyor.yml
deleted file mode 100644
index c1367d52e..000000000
--- a/appveyor.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-environment:
-  global:
-   CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\python-appveyor-demo\\appveyor\\run_with_env.cmd"
-   DISABLE_OPENMP: 1
-   VisualStudioVersion: 12.0
-   
-  matrix:
-    - PYTHON: "C:\\Python27-x64"
-      PYTHON_VERSION: "2.7.x" # currently 2.7.9
-      PYTHON_ARCH: "64"
-
-    - PYTHON: "C:\\Python33-x64"
-      PYTHON_VERSION: "3.3.x" # currently 3.3.5
-      PYTHON_ARCH: "64"
-
-platform:
-  - x64
-
-configuration:
-  - Release
-
-install:
-  - cmd: git clone https://github.com/ogrisel/python-appveyor-demo
-  - ECHO "Filesystem root:"
-  - ps: "ls \"C:/\""
-
-  - ECHO "Installed SDKs:"
-  - ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\""
-
-  - ps: python-appveyor-demo\appveyor\install.ps1
-  - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
-  - "python --version"
-  - "python -c \"import struct; print(struct.calcsize('P') * 8)\""
-
-build: off
-  #project: windows\xgboost.sln
\ No newline at end of file
diff --git a/build.sh b/build.sh
index 2a285597a..8480cd4f9 100755
--- a/build.sh
+++ b/build.sh
@@ -6,27 +6,14 @@
 
 # See additional instruction in doc/build.md
 
-#for building static OpenMP lib in MAC for easier installation in MAC
-#doesn't work with XCode clang/LLVM since Apple doesn't support, 
-#needs brew install gcc 4.9+ with OpenMP. By default the static link is OFF
-static_omp=0
-if ((${static_omp}==1)); then
-    rm libgomp.a
-    ln -s `g++ -print-file-name=libgomp.a`
-    make clean
-    make omp_mac_static=1
-    echo "Successfully build multi-thread static link xgboost"
-    exit 0
-fi
-
 if make; then
     echo "Successfully build multi-thread xgboost"
 else
     echo "-----------------------------"
     echo "Building multi-thread xgboost failed"
     echo "Start to build single-thread xgboost"
-    make clean
-    make no_omp=1
+    make clean_all
+    make config=config/mininum.mk
     echo "Successfully build single-thread xgboost"
     echo "If you want multi-threaded version"
     echo "See additional instructions in doc/build.md"
diff --git a/demo/README.md b/demo/README.md
index 5a7a25f76..229ffc6ff 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -44,8 +44,15 @@ However, the parameter settings can be applied to all versions
 * [Multiclass classification](multiclass_classification)
 * [Regression](regression)
 * [Learning to Rank](rank)
+* [Distributed Training](distributed-training)
 
 Benchmarks
 ----------
 * [Starter script for Kaggle Higgs Boson](kaggle-higgs)
 * [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution)
+
+Machine Learning Challenge Winning Solutions
+--------------------------------------------
+* XGBoost helps Vlad Mironov, Alexander Guschin to win the [CERN LHCb experiment Flavour of Physics competition](https://www.kaggle.com/c/flavours-of-physics). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/11/30/flavour-of-physics-technical-write-up-1st-place-go-polar-bears/).
+* XGBoost helps Mario Filho, Josef Feigl, Lucas, Gilberto to win the [Caterpillar Tube Pricing competition](https://www.kaggle.com/c/caterpillar-tube-pricing). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/09/22/caterpillar-winners-interview-1st-place-gilberto-josef-leustagos-mario/).
+* XGBoost helps Halla Yang to win the [Recruit Coupon Purchase Prediction Challenge](https://www.kaggle.com/c/coupon-purchase-prediction). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/10/21/recruit-coupon-purchase-winners-interview-2nd-place-halla-yang/).
diff --git a/demo/distributed-training/README.md b/demo/distributed-training/README.md
new file mode 100644
index 000000000..3926612cc
--- /dev/null
+++ b/demo/distributed-training/README.md
@@ -0,0 +1,52 @@
+Distributed XGBoost Training
+============================
+This is a tutorial on distributed XGBoost training.
+Currently xgboost supports distributed training via the CLI program with a configuration file.
+There is also a plan to add distributed Python and other language bindings; please open an issue
+if you are interested in contributing.
+
+Build XGBoost with Distributed Filesystem Support
+-------------------------------------------------
+To use distributed xgboost, you only need to turn the options on to build
+with distributed filesystems(HDFS or S3) in ```xgboost/make/config.mk```.
+
+How to Use
+----------
+* Input data format: LIBSVM format. The example here uses generated data in ../data folder.
+* Put the data into a distributed filesystem (S3 or HDFS)
+* Use the tracker script in dmlc-core/tracker to submit the jobs
+* Like all other DMLC tools, xgboost supports taking a path to a folder as input argument
+  - All the files in the folder will be used as input
+* Quick start in Hadoop YARN: run ```bash run_yarn.sh <n_hadoop_workers> <n_thread_per_worker> <path_in_HDFS>```
+
+Example
+-------
+* [run_yarn.sh](run_yarn.sh) shows how to submit job to Hadoop via YARN.
+
+Single machine vs Distributed Version
+-------------------------------------
+If you have used xgboost (single machine version) before, this section will show you how to run xgboost on hadoop with a slight modification on conf file.
+* IO: instead of reading and writing file locally, we now use HDFS, put ```hdfs://``` prefix to the address of file you like to access
+* File cache: ```dmlc_yarn.py``` also provides several ways to cache necessary files, including binary file (xgboost), conf file
+  - ```dmlc_yarn.py``` will automatically cache files in the command line. For example, ```dmlc_yarn.py -n 3 $localPath/xgboost.dmlc mushroom.hadoop.conf``` will cache "xgboost.dmlc" and "mushroom.hadoop.conf".
+  - You could also use "-f" to manually cache one or more files, like ```-f file1 -f file2```
+  - The local path of cached files in command is "./".
+* More details of submission can be referred to the usage of ```dmlc_yarn.py```.
+* The model saved by hadoop version is compatible with single machine version.
+
+Notes
+-----
+* The code is optimized with multi-threading, so you will want to run xgboost with more vcores for best performance.
+  - You will want to set <n_thread_per_worker> to be number of cores you have on each machine.
+
+
+External Memory Version
+-----------------------
+XGBoost supports external memory: each process caches data on local disk during computation, without taking up all the memory for storing the data.
+See [external memory](https://github.com/dmlc/xgboost/tree/master/doc/external_memory.md) for syntax using external memory.
+
+You only need to add cacheprefix to the input file to enable external memory mode. For example set training data as
+```
+data=hdfs:///path-to-my-data/#dtrain.cache
+```
+This makes xgboost more memory efficient and allows you to run it on larger-scale datasets.
diff --git a/demo/distributed-training/run_yarn.sh b/demo/distributed-training/run_yarn.sh
new file mode 100755
index 000000000..3d7c6bf05
--- /dev/null
+++ b/demo/distributed-training/run_yarn.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+if [ "$#" -lt 3 ];
+then
+	echo "Usage: <nworkers> <nthreads> <path_in_HDFS>"
+	exit -1
+fi
+
+# put the local training and test files into HDFS (arguments are quoted so paths are not word-split or globbed)
+hadoop fs -mkdir "$3/data"
+hadoop fs -put ../data/agaricus.txt.train "$3/data"
+hadoop fs -put ../data/agaricus.txt.test "$3/data"
+
+# submit the training job through the dmlc YARN tracker, passing the HDFS addresses of the data and the output model
+../../dmlc-core/tracker/dmlc_yarn.py  -n "$1" --vcores "$2" ../../xgboost mushroom.hadoop.conf "nthread=$2"\
+    "data=hdfs://$3/data/agaricus.txt.train"\
+    "eval[test]=hdfs://$3/data/agaricus.txt.test"\
+    "model_out=hdfs://$3/mushroom.final.model"
+
+# get the final model file
+hadoop fs -get "$3/mushroom.final.model" final.model
+
+# use dmlc-core/yarn/run_hdfs_prog.py to set up the appropriate env
+
+# output prediction task=pred
+#../../xgboost.dmlc mushroom.hadoop.conf task=pred model_in=final.model test:data=../data/agaricus.txt.test
+../../dmlc-core/yarn/run_hdfs_prog.py ../../xgboost mushroom.hadoop.conf task=pred model_in=final.model test:data=../data/agaricus.txt.test
+# print the boosters of final.model in dump.raw.txt
+#../../xgboost.dmlc mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+../../dmlc-core/yarn/run_hdfs_prog.py ../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map in printing for better visualization
+#../../xgboost.dmlc mushroom.hadoop.conf task=dump model_in=final.model fmap=../data/featmap.txt name_dump=dump.nice.txt
+../../dmlc-core/yarn/run_hdfs_prog.py ../../xgboost mushroom.hadoop.conf task=dump model_in=final.model fmap=../data/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt
diff --git a/demo/guide-python/runall.sh b/demo/guide-python/runall.sh
index 5c8ddf93c..21fa59de2 100755
--- a/demo/guide-python/runall.sh
+++ b/demo/guide-python/runall.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}../../python-package"  # prepend any existing PYTHONPATH, then add the in-tree python package
 python basic_walkthrough.py
 python custom_objective.py
 python boost_from_prediction.py
@@ -9,4 +10,4 @@ python predict_leaf_indices.py
 python sklearn_examples.py
 python sklearn_parallel.py
 python external_memory.py
-rm -rf *~ *.model *.buffer 
+rm -rf *~ *.model *.buffer
diff --git a/dmlc-core b/dmlc-core
new file mode 160000
index 000000000..ad2ddde8b
--- /dev/null
+++ b/dmlc-core
@@ -0,0 +1 @@
+Subproject commit ad2ddde8b6624abf3007a71b2923c3925530cc81
diff --git a/doc/.gitignore b/doc/.gitignore
index 382c3419f..61e15164c 100644
--- a/doc/.gitignore
+++ b/doc/.gitignore
@@ -5,3 +5,4 @@ _*
 doxygen
 parser.py
 *.pyc
+web-data
diff --git a/doc/Doxyfile b/doc/Doxyfile
new file mode 100644
index 000000000..7ec79dace
--- /dev/null
+++ b/doc/Doxyfile
@@ -0,0 +1,2353 @@
+# Doxyfile 1.8.8
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all text
+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
+# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
+# for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = "xgboost"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is included in
+# the documentation. The maximum height of the logo should not exceed 55 pixels
+# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo
+# to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = doc/doxygen
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise cause
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS         = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+#ALLOW_UNICODE_NAMES    = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path
+# before file names in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES        = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a
+# new page for each member. If set to NO, the documentation of a member will be
+# part of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines.
+
+ALIASES                =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
+# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
+# Fortran. In the latter case the parser tries to guess whether the code is fixed
+# or free formatted code, this is the default for Fortran type files), VHDL. For
+# instance to make doxygen treat .inc files as Fortran files (default is PHP),
+# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+#MARKDOWN_SUPPORT       = YES
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word
+# or globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+#AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL            = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+#EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespaces
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO these classes will be included in the various overviews. This option has
+# no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+#SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the
+# todo list. This list is created by putting \todo commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the
+# test list. This list is created by putting \test commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES the list
+# will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS               = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO doxygen will only warn about wrong or incomplete parameter
+# documentation, but not about the absence of documentation.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC       = YES
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces.
+# Note: If this tag is empty the current directory is searched.
+
+INPUT                  = include src/common
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: http://www.gnu.org/software/libiconv) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank the
+# following patterns are tested: *.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
+# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
+# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
+# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
+# *.qsf, *.as and *.js.
+
+FILE_PATTERNS          = *.h
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = */test/* \
+                         logging.h
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# ANamespace::AClass, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+#USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES, then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+#SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS       = YES
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES, then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# compiled with the --with-libclang option.
+# The default value is: NO.
+
+#CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+#CLANG_OPTIONS          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX     = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML, the header file must include any scripts and style sheets
+# that doxygen needs, which depend on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra stylesheet files is of importance (e.g. the last
+# stylesheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+#HTML_EXTRA_STYLESHEET  =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the stylesheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+#HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: http://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE               =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION           =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI           = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using prerendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+#MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+#MATHJAX_CODEFILE       =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+#EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+#SEARCHENGINE_URL       =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+#SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+#EXTERNAL_SEARCH_ID     =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+#EXTRA_SEARCH_MAPPINGS  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when enabling USE_PDFLATEX this option is only used for generating
+# bitmaps for formulas in the HTML output, but not in the Makefile that is
+# written to the output directory.
+# The default file is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. To get the times font for
+# instance you can specify
+# EXTRA_PACKAGES=times
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty string,
+# for the replacement values of the other commands the user is referred to
+# HTML_HEADER.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+#LATEX_EXTRA_FILES      =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE        = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES     = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's config
+# file, i.e. a series of assignments. You only have to provide replacements,
+# missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's config file. A template extensions file can be generated
+# using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION          = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+#MAN_SUBDIR             =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML           = YES
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT             = xml
+
+# If the XML_PROGRAMLISTING tag is set to YES doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+#GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+#DOCBOOK_OUTPUT         = docbook
+
+# If the DOCBOOK_PROGRAMLISTING tag is set to YES doxygen will include the
+# program listings (including syntax highlighting and cross-referencing
+# information) to the DOCBOOK output. Note that enabling this will significantly
+# increase the size of the DOCBOOK output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+#DOCBOOK_PROGRAMLISTING = NO
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES doxygen will generate an AutoGen
+# Definitions (see http://autogen.sf.net) file that captures the structure of
+# the code including all documentation. Note that this feature is still
+# experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES doxygen will expand all macro names
+# in the source code. If set to NO only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED             = DMLC_USE_CXX11
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed in
+# the class index. If set to NO only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed in
+# the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+#EXTERNAL_PAGES         = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of 'which perl').
+# The default file (with absolute path) is: /usr/bin/perl.
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see:
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+#DIA_PATH               =
+
+# If set to YES, the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO.
+# The default value is: YES.
+
+HAVE_DOT               = YES
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK               = YES
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+#UML_LIMIT_NUM_FIELDS   = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH          = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will generate a
+# graphical hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot.
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
+# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
+# gif:cairo:gd, gif:gd, gif:gd:gd and svg.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+#DIAFILE_DIRS           =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file. If left blank, it is assumed
+# PlantUML is not used or called during a preprocessing step. Doxygen will
+# generate a warning when it encounters a \startuml command in this case and
+# will not generate output for the diagram.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+#PLANTUML_JAR_PATH      =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS      = YES
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP            = YES
diff --git a/doc/R-package/.gitignore b/doc/R-package/.gitignore
new file mode 100644
index 000000000..b25c15b81
--- /dev/null
+++ b/doc/R-package/.gitignore
@@ -0,0 +1 @@
+*~
diff --git a/doc/R-package/Makefile b/doc/R-package/Makefile
new file mode 100644
index 000000000..ae2c7ff4b
--- /dev/null
+++ b/doc/R-package/Makefile
@@ -0,0 +1,27 @@
+# Makefile for knitting the R-package vignettes (Rmarkdown) into Markdown
+# files that include the evaluated results. Requires Rscript and knitr.
+PKGROOT := ../../R-package
+
+# ADD the Markdown to be built here, with suffix md. Each entry is knitted
+# from the vignette of the same name in $(PKGROOT)/vignettes.
+DOCS := discoverYourData.md xgboostPresentation.md
+
+# Default goal: build every listed document. Without an aggregate target a
+# bare `make` only builds the first target that appears in the file.
+.PHONY: all
+all: $(DOCS)
+
+# Remove a partially written .md when knitting fails, so that it is not
+# mistaken for an up-to-date result on the next run.
+.DELETE_ON_ERROR:
+
+# General rule for building the rmarkdowns, needs knitr.
+# \$$ reaches R as a literal `$`: make collapses $$ to $, and the backslash
+# stops the shell from expanding it inside the double quotes.
+# Figures go to ../web-data/xgboost/knitr/ with the document name as prefix.
+$(DOCS): %.md: $(PKGROOT)/vignettes/%.Rmd
+	Rscript -e \
+	"require(knitr);"\
+	"knitr::opts_knit\$$set(root.dir=\".\");"\
+	"knitr::opts_chunk\$$set(fig.path=\"../web-data/xgboost/knitr/$*-\");"\
+	"knitr::knit(\"$<\")"
diff --git a/doc/R-package/discoverYourData.md b/doc/R-package/discoverYourData.md
new file mode 100644
index 000000000..e5327b8fc
--- /dev/null
+++ b/doc/R-package/discoverYourData.md
@@ -0,0 +1,484 @@
+---
+title: "Understand your dataset with Xgboost"
+output:
+  rmarkdown::html_vignette:
+    css: vignette.css
+    number_sections: yes
+    toc: yes
+author: Tianqi Chen, Tong He, Michaël Benesty
+vignette: >
+  %\VignetteIndexEntry{Discover your data}
+  %\VignetteEngine{knitr::rmarkdown}
+  \usepackage[utf8]{inputenc}
+---
+
+Understand your dataset with XGBoost
+====================================
+
+Introduction
+------------
+
+The purpose of this Vignette is to show you how to use **Xgboost** to discover and understand your own dataset better.
+
+This Vignette is not about predicting anything (see [Xgboost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). We will explain how to use **Xgboost** to highlight the *link* between the *features* of your data and the *outcome*.
+
+Package loading:
+
+
+```r
+require(xgboost)
+require(Matrix)
+require(data.table)
+if (!require('vcd')) install.packages('vcd')
+```
+
+> **VCD** package is used for one of its embedded datasets only.
+
+Preparation of the dataset
+--------------------------
+
+### Numeric VS categorical variables
+
+
+**Xgboost** manages only `numeric` vectors.
+
+What to do when you have *categorical* data?
+
+A *categorical* variable has a fixed number of different values. For instance, if a variable called *Colour* can have only one of these three values, *red*, *blue* or *green*, then *Colour* is a *categorical* variable.
+
+> In **R**, a *categorical* variable is called `factor`.
+>
+> Type `?factor` in the console for more information.
+
+To answer the question above we will convert *categorical* variables to `numeric` ones.
+
+### Conversion from categorical to numeric variables
+
+#### Looking at the raw data
+
+In this Vignette we will see how to transform a *dense* `data.frame` (*dense* = few zeroes in the matrix) with *categorical* variables to a very *sparse* matrix (*sparse* = lots of zero in the matrix) of `numeric` features.
+
+The method we are going to see is usually called [one-hot encoding](http://en.wikipedia.org/wiki/One-hot).
+
+The first step is to load `Arthritis` dataset in memory and wrap it with `data.table` package.
+
+
+```r
+data(Arthritis)
+df <- data.table(Arthritis, keep.rownames = F)
+```
+
+> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large datasets is [best in class](http://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of the **Xgboost** **R** package use `data.table`.
+
+The first thing we want to do is to have a look at the first lines of the `data.table`:
+
+
+```r
+head(df)
+```
+
+```
+##    ID Treatment  Sex Age Improved
+## 1: 57   Treated Male  27     Some
+## 2: 46   Treated Male  29     None
+## 3: 77   Treated Male  30     None
+## 4: 17   Treated Male  32   Marked
+## 5: 36   Treated Male  46   Marked
+## 6: 23   Treated Male  58   Marked
+```
+
+Now we will check the format of each column.
+
+
+```r
+str(df)
+```
+
+```
+## Classes 'data.table' and 'data.frame':	84 obs. of  5 variables:
+##  $ ID       : int  57 46 77 17 36 23 75 39 33 55 ...
+##  $ Treatment: Factor w/ 2 levels "Placebo","Treated": 2 2 2 2 2 2 2 2 2 2 ...
+##  $ Sex      : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
+##  $ Age      : int  27 29 30 32 46 58 59 59 63 63 ...
+##  $ Improved : Ord.factor w/ 3 levels "None"<"Some"<..: 2 1 1 3 3 3 1 3 1 1 ...
+##  - attr(*, ".internal.selfref")=<externalptr>
+```
+
+2 columns have `factor` type, one has `ordinal` type.
+
+> `ordinal` variable :
+>
+> * can take a limited number of values (like `factor`) ;
+> * these values are ordered (unlike `factor`). Here these ordered values are: `Marked > Some > None`
+
+#### Creation of new features based on old ones
+
+We will add some new *categorical* features to see if it helps.
+
+##### Grouping per 10 years
+
+For the first feature we create groups of age by rounding the real age.
+
+Note that we transform it to `factor` so the algorithm treats these age groups as independent values.
+
+Therefore, 20 is not closer to 30 than 60. To make it short, the distance between ages is lost in this transformation.
+
+
+```r
+head(df[,AgeDiscret := as.factor(round(Age/10,0))])
+```
+
+```
+##    ID Treatment  Sex Age Improved AgeDiscret
+## 1: 57   Treated Male  27     Some          3
+## 2: 46   Treated Male  29     None          3
+## 3: 77   Treated Male  30     None          3
+## 4: 17   Treated Male  32   Marked          3
+## 5: 36   Treated Male  46   Marked          5
+## 6: 23   Treated Male  58   Marked          6
+```
+
+##### Random split in two groups
+
+Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. I chose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...).
+
+
+```r
+head(df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))])
+```
+
+```
+##    ID Treatment  Sex Age Improved AgeDiscret AgeCat
+## 1: 57   Treated Male  27     Some          3  Young
+## 2: 46   Treated Male  29     None          3  Young
+## 3: 77   Treated Male  30     None          3  Young
+## 4: 17   Treated Male  32   Marked          3    Old
+## 5: 36   Treated Male  46   Marked          5    Old
+## 6: 23   Treated Male  58   Marked          6    Old
+```
+
+##### Risks in adding correlated features
+
+These new features are highly correlated to the `Age` feature because they are simple transformations of this feature.
+
+For many machine learning algorithms, using correlated features is not a good idea. It may sometimes make prediction less accurate, and most of the time make interpretation of the model almost impossible. GLM, for instance, assumes that the features are uncorrelated.
+
+Fortunately, decision tree algorithms (including boosted trees) are very robust to these features. Therefore we have nothing to do to manage this situation.
+
+##### Cleaning data
+
+We remove ID as there is nothing to learn from this feature (it would just add some noise).
+
+
+```r
+df[,ID:=NULL]
+```
+
+We will list the different values for the column `Treatment`:
+
+
+```r
+levels(df[,Treatment])
+```
+
+```
+## [1] "Placebo" "Treated"
+```
+
+
+#### One-hot encoding
+
+Next step, we will transform the categorical data to dummy variables.
+This is the [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) step.
+
+The purpose is to transform each value of each *categorical* feature into a *binary* feature `{0, 1}`.
+
+For example, the column `Treatment` will be replaced by two columns, `Placebo`, and `Treated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have after the transformation the value `1` in the new column `Placebo` and the value `0` in the new column `Treated`. The column `Treatment` will disappear during the one-hot encoding.
+
+Column `Improved` is excluded because it will be our `label` column, the one we want to predict.
+
+
+```r
+sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
+head(sparse_matrix)
+```
+
+```
+## 6 x 10 sparse Matrix of class "dgCMatrix"
+##                       
+## 1 . 1 1 27 1 . . . . 1
+## 2 . 1 1 29 1 . . . . 1
+## 3 . 1 1 30 1 . . . . 1
+## 4 . 1 1 32 1 . . . . .
+## 5 . 1 1 46 . . 1 . . .
+## 6 . 1 1 58 . . . 1 . .
+```
+
+> The formula `Improved~.-1` used above means: transform all *categorical* features but column `Improved` to binary values. The `-1` is here to remove the first column which is full of `1` (this column is generated by the conversion). For more information, you can type `?sparse.model.matrix` in the console.
+
+Create the output `numeric` vector (not as a sparse `Matrix`):
+
+
+```r
+output_vector = df[,Improved] == "Marked"
+```
+
+1. set `Y` vector to `0`;
+2. set `Y` to `1` for rows where `Improved == Marked` is `TRUE` ;
+3. return `Y` vector.
+
+Build the model
+---------------
+
+The code below is very usual. For more information, you can look at the documentation of `xgboost` function (or at the vignette [Xgboost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)).
+
+
+```r
+bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 4,
+               eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
+```
+
+```
+## [0]	train-error:0.202381
+## [1]	train-error:0.166667
+## [2]	train-error:0.166667
+## [3]	train-error:0.166667
+## [4]	train-error:0.154762
+## [5]	train-error:0.154762
+## [6]	train-error:0.154762
+## [7]	train-error:0.166667
+## [8]	train-error:0.166667
+## [9]	train-error:0.166667
+```
+
+You can see several `train-error:` lines, each followed by a number. This number generally decreases from one round to the next. Each line shows how well the model explains your data. Lower is better.
+
+A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitting) (meaning it copies the past too closely, and won't be that good at predicting the future).
+
+> Here you can see the numbers decrease until line 7 and then increase.
+>
+> It probably means we are overfitting. To fix that I should reduce the number of rounds to `nround = 4`. I will let things like that because I don't really care for the purpose of this example :-)
+
+Feature importance
+------------------
+
+## Measure feature importance
+
+
+### Build the feature importance data.table
+
+In the code below, `sparse_matrix@Dimnames[[2]]` represents the column names of the sparse matrix. These names are the original values of the features (remember, each binary column == one value of one *categorical* feature).
+
+
+```r
+importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
+head(importance)
+```
+
+```
+##             Feature        Gain      Cover  Frequency
+## 1:              Age 0.622031651 0.67251706 0.67241379
+## 2: TreatmentPlacebo 0.285750607 0.11916656 0.10344828
+## 3:          SexMale 0.048744054 0.04522027 0.08620690
+## 4:      AgeDiscret6 0.016604647 0.04784637 0.05172414
+## 5:      AgeDiscret3 0.016373791 0.08028939 0.05172414
+## 6:      AgeDiscret4 0.009270558 0.02858801 0.01724138
+```
+
+> The column `Gain` provides the information we are looking for.
+>
+> As you can see, features are classified by `Gain`.
+
+`Gain` is the improvement in accuracy brought by a feature to the branches it is on. The idea is that before adding a new split on a feature X to the branch there were some wrongly classified elements; after adding the split on this feature, there are two new branches, and each of these branches is more accurate (one branch saying if your observation is on this branch then it should be classified as `1`, and the other branch saying the exact opposite).
+
+`Cover` measures the relative quantity of observations concerned by a feature.
+
+`Frequency` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it).
+
+#### Improvement in the interpretability of feature importance data.table
+
+We can go deeper in the analysis of the model. In the `data.table` above, we have discovered which features count to predict if the illness will go or not. But we don't yet know the role of these features. For instance, one of the questions we may want to answer would be: does receiving a placebo treatment help to recover from the illness?
+
+One simple solution is to count the co-occurrences of a feature and a class of the classification.
+
+For that purpose we will execute the same function as above but using two more parameters, `data` and `label`.
+
+
+```r
+importanceRaw <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
+
+# Cleaning for better display
+importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
+
+head(importanceClean)
+```
+
+```
+##             Feature        Split       Gain RealCover RealCover %
+## 1: TreatmentPlacebo -1.00136e-05 0.28575061         7   0.2500000
+## 2:              Age         61.5 0.16374034        12   0.4285714
+## 3:              Age           39 0.08705750         8   0.2857143
+## 4:              Age         57.5 0.06947553        11   0.3928571
+## 5:          SexMale -1.00136e-05 0.04874405         4   0.1428571
+## 6:              Age         53.5 0.04620627        10   0.3571429
+```
+
+> In the table above we have removed two unneeded columns and selected only the first lines.
+
+The first thing you notice is the new column `Split`. It is the split applied to the feature on a branch of one of the trees. Each split is present, therefore a feature can appear several times in this table. Here we can see the feature `Age` is used several times with different splits.
+
+How is the split applied to count the co-occurrences? It is always `<`. For instance, in the second line, we measure the number of persons under 61.5 years with the illness gone after the treatment.
+
+The two other new columns are `RealCover` and `RealCover %`. In the first column it measures the number of observations in the dataset where the split is respected and the label marked as `1`. The second column is the percentage of the whole population that `RealCover` represents.
+
+Therefore, according to our findings, getting a placebo doesn't seem to help but being younger than 61 years may help (seems logical).
+
+> You may wonder how to interpret the `< 1.00001` on the first line. Basically, in a sparse `Matrix`, there is no `0`, therefore, looking for one hot-encoded categorical observations validating the rule `< 1.00001` is like just looking for `1` for this feature.
+
+### Plotting the feature importance
+
+
+All these things are nice, but it would be even better to plot the results.
+
+
+```r
+xgb.plot.importance(importance_matrix = importanceRaw)
+```
+
+```
+## Error in xgb.plot.importance(importance_matrix = importanceRaw): Importance matrix is not correct (column names issue)
+```
+
+Features have automatically been divided in 2 clusters: the interesting features... and the others.
+
+> Depending on the dataset and the learning parameters you may have more than two clusters. Default value is to limit them to `10`, but you can increase this limit. Look at the function documentation for more information.
+
+According to the plot above, the most important features in this dataset to predict if the treatment will work are :
+
+* the Age ;
+* having received a placebo or not ;
+* the sex is third but already included in the not interesting features group ;
+* then we see our generated features (AgeDiscret). We can see that their contribution is very low.
+
+### Do these results make sense?
+
+
+Let's check some **Chi2** between each of these features and the label.
+
+Higher **Chi2** means better correlation.
+
+
+```r
+c2 <- chisq.test(df$Age, output_vector)
+print(c2)
+```
+
+```
+## 
+## 	Pearson's Chi-squared test
+## 
+## data:  df$Age and output_vector
+## X-squared = 35.475, df = 35, p-value = 0.4458
+```
+
+The Pearson Chi-squared statistic between Age and the illness disappearing is **35.48**.
+
+
+```r
+c2 <- chisq.test(df$AgeDiscret, output_vector)
+print(c2)
+```
+
+```
+## 
+## 	Pearson's Chi-squared test
+## 
+## data:  df$AgeDiscret and output_vector
+## X-squared = 8.2554, df = 5, p-value = 0.1427
+```
+
+Our first simplification of Age gives a Pearson Chi-squared statistic of **8.26**.
+
+
+```r
+c2 <- chisq.test(df$AgeCat, output_vector)
+print(c2)
+```
+
+```
+## 
+## 	Pearson's Chi-squared test with Yates' continuity correction
+## 
+## data:  df$AgeCat and output_vector
+## X-squared = 2.3571, df = 1, p-value = 0.1247
+```
+
+The perfectly random split I did between young and old at 30 years old has a low correlation of **2.36**. It's a result we may expect: maybe in my mind being over 30 years old means being old (I am 32 and starting to feel old, this may explain that), but for the illness we are studying, the age at which one becomes vulnerable is not the same.
+
+Morality: don't let your *gut* lower the quality of your model.
+
+In *data science* expression, there is the word *science* :-)
+
+Conclusion
+----------
+
+As you can see, in general *destroying information by simplifying it won't improve your model*. **Chi2** just demonstrates that.
+
+But in more complex cases, creating a new feature based on an existing one, which makes the link with the outcome more obvious, may help the algorithm and improve the model.
+
+The case studied here is not complex enough to show that. Check the [Kaggle website](http://www.kaggle.com/) for some challenging datasets. However it's almost always worse when you add some arbitrary rules.
+
+Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm has been able to choose the best one, which in this case is the Age.
+
+A linear model may not be that smart in this scenario.
+
+Special Note: What about Random Forests™?
+-----------------------------------------
+
+As you may know, the [Random Forests™](http://en.wikipedia.org/wiki/Random_forest) algorithm is a cousin of boosting and both are part of the [ensemble learning](http://en.wikipedia.org/wiki/Ensemble_learning) family.
+
+Both train several decision trees for one dataset. The *main* difference is that in Random Forests™, trees are independent and in boosting, the tree `N+1` focuses its learning on the loss (<=> what has not been well modeled by the tree `N`).
+
+This difference has an impact on a corner case in feature importance analysis: the *correlated features*.
+
+Imagine two features perfectly correlated, feature `A` and feature `B`. For one specific tree, if the algorithm needs one of them, it will choose randomly (true in both boosting and Random Forests™).
+
+However, in Random Forests™ this random choice will be done for each tree, because each tree is independent from the others. Therefore, approximately, depending on your parameters, 50% of the trees will choose feature `A` and the other 50% will choose feature `B`. So the *importance* of the information contained in `A` and `B` (which is the same, because they are perfectly correlated) is diluted in `A` and `B`. So you won't easily know this information is important to predict what you want to predict! It is even worse when you have 10 correlated features...
+
+In boosting, when a specific link between feature and outcome has been learned by the algorithm, it will try to not refocus on it (in theory it is what happens, reality is not always that simple). Therefore, all the importance will be on feature `A` or on feature `B` (but not both). You will know that one feature has an important role in the link between the observations and the label. It is still up to you to search for the correlated features to the one detected as important if you need to know all of them.
+
+If you want to try Random Forests™ algorithm, you can tweak Xgboost parameters!
+
+**Warning**: this is still an experimental parameter.
+
+For instance, to compute a model with 1000 trees, with a 0.5 factor on sampling rows and columns:
+
+
+```r
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+train <- agaricus.train
+test <- agaricus.test
+
+#Random Forest™ - 1000 trees
+bst <- xgboost(data = train$data, label = train$label, max.depth = 4, num_parallel_tree = 1000, subsample = 0.5, colsample_bytree =0.5, nround = 1, objective = "binary:logistic")
+```
+
+```
+## [0]	train-error:0.002150
+```
+
+```r
+#Boosting - 3 rounds
+bst <- xgboost(data = train$data, label = train$label, max.depth = 4, nround = 3, objective = "binary:logistic")
+```
+
+```
+## [0]	train-error:0.006142
+## [1]	train-error:0.006756
+## [2]	train-error:0.001228
+```
+
+> Note that for the Random Forests™ model the parameter `nround` is set to `1`.
+
+> [**Random Forests™**](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_papers.htm) is a trademark of Leo Breiman and Adele Cutler and is licensed exclusively to Salford Systems for the commercial release of the software.
diff --git a/doc/R-package/index.md b/doc/R-package/index.md
new file mode 100644
index 000000000..92df95e9f
--- /dev/null
+++ b/doc/R-package/index.md
@@ -0,0 +1,17 @@
+XGBoost R Package
+=================
+[![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost)
+[![CRAN Downloads](http://cranlogs.r-pkg.org/badges/xgboost)](http://cran.rstudio.com/web/packages/xgboost/index.html)
+
+
+You have found the XGBoost R Package!
+
+Get Started
+-----------
+* Check out the [Installation Guide](../build.md), which contains instructions to install xgboost, and the [Tutorials](#tutorials) for examples on how to use xgboost for various tasks.
+* Please visit [walk through example](demo).
+
+Tutorials
+---------
+- [Introduction to XGBoost in R](xgboostPresentation.md)
+- [Discover your data with XGBoost in R](discoverYourData.md)
diff --git a/doc/R-package/xgboostPresentation.md b/doc/R-package/xgboostPresentation.md
new file mode 100644
index 000000000..05526ede9
--- /dev/null
+++ b/doc/R-package/xgboostPresentation.md
@@ -0,0 +1,590 @@
+---
+title: "Xgboost presentation"
+output:
+  rmarkdown::html_vignette:
+    css: vignette.css
+    number_sections: yes
+    toc: yes
+bibliography: xgboost.bib
+author: Tianqi Chen, Tong He, Michaël Benesty
+vignette: >
+  %\VignetteIndexEntry{Xgboost presentation}
+  %\VignetteEngine{knitr::rmarkdown}
+  \usepackage[utf8]{inputenc}
+---
+
+XGBoost R Tutorial
+==================
+
+## Introduction
+
+
+**Xgboost** is short for e**X**treme **G**radient **Boost**ing package.
+
+The purpose of this Vignette is to show you how to use **Xgboost** to build a model and make predictions.
+
+It is an efficient and scalable implementation of gradient boosting framework by @friedman2000additive and @friedman2001greedy. Two solvers are included:
+
+- *linear* model ;
+- *tree learning* algorithm.
+
+It supports various objective functions, including *regression*, *classification* and *ranking*. The package is made to be extendible, so that users are also allowed to define their own objective functions easily.
+
+It has been [used](https://github.com/dmlc/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions.
+
+It has several features:
+
+* Speed: it can automatically do parallel computation on *Windows* and *Linux*, with *OpenMP*. It is generally over 10 times faster than the classical `gbm`.
+* Input Type: it takes several types of input data:
+    * *Dense* Matrix: *R*'s *dense* matrix, i.e. `matrix` ;
+    * *Sparse* Matrix: *R*'s *sparse* matrix, i.e. `Matrix::dgCMatrix` ;
+    * Data File: local data files ;
+    * `xgb.DMatrix`: its own class (recommended).
+* Sparsity: it accepts *sparse* input for both *tree booster*  and *linear booster*, and is optimized for *sparse* input ;
+* Customization: it supports customized objective functions and evaluation functions.
+
+## Installation
+
+
+### Github version
+
+
+For up-to-date version (highly recommended), install from *Github*:
+
+
+```r
+devtools::install_git('git://github.com/dmlc/xgboost', subdir='R-package')
+```
+
+> *Windows* user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
+
+Cran version
+------------
+
+As of 2015-03-13, ‘xgboost’ was removed from the CRAN repository.
+
+Formerly available versions can be obtained from the CRAN [archive](http://cran.r-project.org/src/contrib/Archive/xgboost)
+
+## Learning
+
+
+For the purpose of this tutorial we will load **XGBoost** package.
+
+
+```r
+require(xgboost)
+```
+
+### Dataset presentation
+
+
+In this example, we are aiming to predict whether a mushroom can be eaten or not (like in many tutorials, example data are the same as you will use in your everyday life :-).
+
+Mushroom data is cited from UCI Machine Learning Repository. @Bache+Lichman:2013.
+
+### Dataset loading
+
+
+We will load the `agaricus` datasets embedded with the package and will link them to variables.
+
+The datasets are already split in:
+
+* `train`: will be used to build the model ;
+* `test`: will be used to assess the quality of our model.
+
+Why *split* the dataset in two parts?
+
+In the first part we will build our model. In the second part we will want to test it and assess its quality. Without dividing the dataset we would test the model on data which the algorithm has already seen.
+
+
+```r
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+train <- agaricus.train
+test <- agaricus.test
+```
+
+> In the real world, it would be up to you to make this division between `train` and `test` data. The way to do it is out of the purpose of this article, however `caret` package may [help](http://topepo.github.io/caret/splitting.html).
+
+Each variable is a `list` containing two things, `label` and `data`:
+
+
+```r
+str(train)
+```
+
+```
+## List of 2
+##  $ data :
+```
+
+```
+## Error in str.default(obj, ...): could not find function "is"
+```
+
+`label` is the outcome of our dataset meaning it is the binary *classification* we will try to predict.
+
+Let's discover the dimensionality of our datasets.
+
+
+```r
+dim(train$data)
+```
+
+```
+## [1] 6513  126
+```
+
+```r
+dim(test$data)
+```
+
+```
+## [1] 1611  126
+```
+
+This dataset is very small to not make the **R** package too heavy, however **XGBoost** is built to manage huge dataset very efficiently.
+
+As seen below, the `data` are stored in a `dgCMatrix` which is a *sparse* matrix and `label` vector is a `numeric` vector (`{0,1}`):
+
+
+```r
+class(train$data)[1]
+```
+
+```
+## [1] "dgCMatrix"
+```
+
+```r
+class(train$label)
+```
+
+```
+## [1] "numeric"
+```
+
+### Basic Training using XGBoost
+
+
+This step is the most critical part of the process for the quality of our model.
+
+#### Basic training
+
+We are using the `train` data. As explained above, both `data` and `label` are stored in a `list`.
+
+In a *sparse* matrix, cells containing `0` are not stored in memory. Therefore, in a dataset mainly made of `0`, memory size is reduced. It is very usual to have such dataset.
+
+We will train a decision tree model using the following parameters:
+
+* `objective = "binary:logistic"`: we will train a binary classification model ;
+* `max.depth = 2`: the trees won't be deep, because our case is very simple ;
+* `nthread = 2`: the number of cpu threads we are going to use;
+* `nround = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction.
+
+
+```r
+bstSparse <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+```
+
+```
+## [0]	train-error:0.046522
+## [1]	train-error:0.022263
+```
+
+> The more complex the relationship between your features and your `label` is, the more passes you need.
+
+#### Parameter variations
+
+##### Dense matrix
+
+Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R** matrix.
+
+
+```r
+bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+```
+
+```
+## Error in as.vector(data): no method for coercing this S4 class to a vector
+```
+
+##### xgb.DMatrix
+
+**XGBoost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be useful for the most advanced features we will discover later.
+
+
+```r
+dtrain <- xgb.DMatrix(data = train$data, label = train$label)
+bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+```
+
+```
+## [0]	train-error:0.046522
+## [1]	train-error:0.022263
+```
+
+##### Verbose option
+
+**XGBoost** has several features to help you view how the learning progresses internally. The purpose is to help you set the best parameters, which is the key to your model quality.
+
+One of the simplest ways to see the training progress is to set the `verbose` option (see below for more advanced techniques).
+
+
+```r
+# verbose = 0, no message
+bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 0)
+```
+
+
+```r
+# verbose = 1, print evaluation metric
+bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 1)
+```
+
+```
+## [0]	train-error:0.046522
+## [1]	train-error:0.022263
+```
+
+
+```r
+# verbose = 2, also print information about tree
+bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 2)
+```
+
+```
+## [11:43:20] ../..//amalgamation/../src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
+## [0]	train-error:0.046522
+## [11:43:20] ../..//amalgamation/../src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
+## [1]	train-error:0.022263
+```
+
+## Basic prediction using XGBoost
+
+
+## Perform the prediction
+
+
+The purpose of the model we have built is to classify new data. As explained before, we will use the `test` dataset for this step.
+
+
+```r
+pred <- predict(bst, test$data)
+
+# size of the prediction vector
+print(length(pred))
+```
+
+```
+## [1] 1611
+```
+
+```r
+# limit display of predictions to the first 6
+print(head(pred))
+```
+
+```
+## [1] 0.28583017 0.92392391 0.28583017 0.28583017 0.05169873 0.92392391
+```
+
+These numbers don't look like *binary classification* `{0,1}`. We need to perform a simple transformation before being able to use these results.
+
+## Transform the regression in a binary classification
+
+
+The only thing that **XGBoost** does is a *regression*. **XGBoost** is using `label` vector to build its *regression* model.
+
+How can we use a *regression* model to perform a binary classification?
+
+If we think about the meaning of a regression applied to our data, the numbers we get are probabilities that a datum will be classified as `1`. Therefore, we will set the rule that if this probability for a specific datum is `> 0.5` then the observation is classified as `1` (or `0` otherwise).
+
+
+```r
+prediction <- as.numeric(pred > 0.5)
+print(head(prediction))
+```
+
+```
+## [1] 0 1 0 0 0 1
+```
+
+## Measuring model performance
+
+
+To measure the model performance, we will compute a simple metric, the *average error*.
+
+
+```r
+err <- mean(as.numeric(pred > 0.5) != test$label)
+print(paste("test-error=", err))
+```
+
+```
+## [1] "test-error= 0.0217256362507759"
+```
+
+> Note that the algorithm has not seen the `test` data during the model construction.
+
+Steps explanation:
+
+1. `as.numeric(pred > 0.5)` applies our rule that when the probability (<=> regression <=> prediction) is `> 0.5` the observation is classified as `1` and `0` otherwise ;
+2. `probabilityVectorPreviouslyComputed != test$label` computes the vector of error between true data and computed probabilities ;
+3. `mean(vectorOfErrors)` computes the *average error* itself.
+
+The most important thing to remember is that **to do a classification, you just do a regression to the** `label` **and then apply a threshold**.
+
+*Multiclass* classification works in a similar way.
+
+This metric is **0.02** and is pretty low: our yummy mushroom model works well!
+
+## Advanced features
+
+
+Most of the features below have been implemented to help you to improve your model by offering a better understanding of its content.
+
+
+### Dataset preparation
+
+
+For the following advanced features, we need to put data in `xgb.DMatrix` as explained above.
+
+
+```r
+dtrain <- xgb.DMatrix(data = train$data, label=train$label)
+dtest <- xgb.DMatrix(data = test$data, label=test$label)
+```
+
+### Measure learning progress with xgb.train
+
+
+Both `xgboost` (simple) and `xgb.train` (advanced) functions train models.
+
+One of the special features of `xgb.train` is the capacity to follow the progress of the learning after each round. Because of the way boosting works, there is a time when having too many rounds leads to overfitting. You can see this feature as a cousin of the cross-validation method. The following techniques will help you to avoid overfitting or to optimize the learning time by stopping it as soon as possible.
+
+One way to measure progress in learning of a model is to provide to **XGBoost** a second dataset already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.
+
+> In some way it is similar to what we have done above with the average error. The main difference is that above it was after building the model, and now it is during the construction that we measure errors.
+
+For the purpose of this example, we use `watchlist` parameter. It is a list of `xgb.DMatrix`, each of them tagged with a name.
+
+
+```r
+watchlist <- list(train=dtrain, test=dtest)
+
+bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
+```
+
+```
+## [0]	train-error:0.046522	test-error:0.042831
+## [1]	train-error:0.022263	test-error:0.021726
+```
+
+**XGBoost** has computed at each round the same average error metric as seen above (we set `nround` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.
+
+Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.
+
+If you do not get such results with your own dataset, you should think about how you divided your dataset into training and test sets. Maybe there is something to fix. Again, the `caret` package may [help](http://topepo.github.io/caret/splitting.html).
+
+For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics.
+
+
+```r
+bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
+```
+
+```
+## [0]	train-error:0.046522	train-logloss:0.233376	test-error:0.042831	test-logloss:0.226686
+## [1]	train-error:0.022263	train-logloss:0.136658	test-error:0.021726	test-logloss:0.137874
+```
+
+> `eval.metric` allows us to monitor two new metrics for each round, `logloss` and `error`.
+
+### Linear boosting
+
+
+Until now, all the learnings we have performed were based on boosting trees. **XGBoost** implements a second algorithm, based on linear boosting. The only difference with previous command is `booster = "gblinear"` parameter (and removing `eta` parameter).
+
+
+```r
+bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
+```
+
+```
+## [0]	train-error:0.019499	train-logloss:0.176561	test-error:0.018001	test-logloss:0.173835
+## [1]	train-error:0.004760	train-logloss:0.068214	test-error:0.003104	test-logloss:0.065493
+```
+
+In this specific case, *linear boosting* gets slightly better performance metrics than the decision trees based algorithm.
+
+In simple cases, it will happen because there is nothing better than a linear algorithm to catch a linear link. However, decision trees are much better to catch a non linear link between predictors and outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to have an idea of what to use.
+
+### Manipulating xgb.DMatrix
+
+
+#### Save / Load
+
+Like saving models, `xgb.DMatrix` object (which groups both dataset and outcome) can also be saved using `xgb.DMatrix.save` function.
+
+
+```r
+xgb.DMatrix.save(dtrain, "dtrain.buffer")
+```
+
+```
+## [1] TRUE
+```
+
+```r
+# to load it in, simply call xgb.DMatrix
+dtrain2 <- xgb.DMatrix("dtrain.buffer")
+```
+
+```
+## [11:43:20] 6513x126 matrix with 143286 entries loaded from dtrain.buffer
+```
+
+```r
+bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
+```
+
+```
+## [0]	train-error:0.046522	test-error:0.042831
+## [1]	train-error:0.022263	test-error:0.021726
+```
+
+
+
+#### Information extraction
+
+Information can be extracted from `xgb.DMatrix` using `getinfo` function. Hereafter we will extract `label` data.
+
+
+```r
+label = getinfo(dtest, "label")
+pred <- predict(bst, dtest)
+err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
+print(paste("test-error=", err))
+```
+
+```
+## [1] "test-error= 0.0217256362507759"
+```
+
+### View feature importance/influence from the learnt model
+
+
+Feature importance is similar to R gbm package's relative influence (rel.inf).
+
+```
+importance_matrix <- xgb.importance(model = bst)
+print(importance_matrix)
+xgb.plot.importance(importance_matrix = importance_matrix)
+```
+
+#### View the trees from a model
+
+
+You can dump the tree you learned using `xgb.dump` into a text file.
+
+
+```r
+xgb.dump(bst, with.stats = T)
+```
+
+```
+##  [1] "booster[0]"                                                          
+##  [2] "0:[f28<-1.00136e-05] yes=1,no=2,missing=1,gain=4000.53,cover=1628.25"
+##  [3] "1:[f55<-1.00136e-05] yes=3,no=4,missing=3,gain=1158.21,cover=924.5"  
+##  [4] "3:leaf=1.71218,cover=812"                                            
+##  [5] "4:leaf=-1.70044,cover=112.5"                                         
+##  [6] "2:[f108<-1.00136e-05] yes=5,no=6,missing=5,gain=198.174,cover=703.75"
+##  [7] "5:leaf=-1.94071,cover=690.5"                                         
+##  [8] "6:leaf=1.85965,cover=13.25"                                          
+##  [9] "booster[1]"                                                          
+## [10] "0:[f59<-1.00136e-05] yes=1,no=2,missing=1,gain=832.545,cover=788.852"
+## [11] "1:[f28<-1.00136e-05] yes=3,no=4,missing=3,gain=569.725,cover=768.39" 
+## [12] "3:leaf=0.784718,cover=458.937"                                       
+## [13] "4:leaf=-0.96853,cover=309.453"                                       
+## [14] "2:leaf=-6.23624,cover=20.4624"
+```
+
+You can plot the trees from your model using `xgb.plot.tree`:
+
+```
+xgb.plot.tree(model = bst)
+```
+
+> if you provide a path to `fname` parameter you can save the trees to your hard drive.
+
+#### Save and load models
+
+
+Maybe your dataset is big, and it takes time to train a model on it? Maybe you are not a big fan of losing time in redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required.
+
+Fortunately for you, **XGBoost** implements such functions.
+
+
+```r
+# save model to binary local file
+xgb.save(bst, "xgboost.model")
+```
+
+```
+## [1] TRUE
+```
+
+> `xgb.save` function should return TRUE if everything goes well and crashes otherwise.
+
+An interesting test to see how identical our saved model is to the original one would be to compare the two predictions.
+
+
+```r
+# load binary model to R
+bst2 <- xgb.load("xgboost.model")
+pred2 <- predict(bst2, test$data)
+
+# And now the test
+print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
+```
+
+```
+## [1] "sum(abs(pred2-pred))= 0"
+```
+
+
+
+> result is `0`? We are good!
+
+In some very specific cases, like when you want to pilot **XGBoost** from `caret` package, you will want to save the model as a *R* binary vector. See below how to do it.
+
+
+```r
+# save model to R's raw vector
+rawVec <- xgb.save.raw(bst)
+
+# print class
+print(class(rawVec))
+```
+
+```
+## [1] "raw"
+```
+
+```r
+# load binary model to R
+bst3 <- xgb.load(rawVec)
+pred3 <- predict(bst3, test$data)
+
+# pred3 should be identical to pred
+print(paste("sum(abs(pred3-pred))=", sum(abs(pred3-pred))))
+```
+
+```
+## [1] "sum(abs(pred3-pred))= 0"
+```
+
+> Again `0`? It seems that `XGBoost` works pretty well!
+
+## References
diff --git a/doc/build.md b/doc/build.md
index 7eae0bbd0..ebc840c93 100644
--- a/doc/build.md
+++ b/doc/build.md
@@ -1,28 +1,144 @@
-Build XGBoost
-=============
-* Run ```bash build.sh``` (you can also type make)
-* If you have C++11 compiler, it is recommended to type ```make cxx11=1```
-  - C++11 is not used by default
-* If your compiler does not come with OpenMP support, it will fire an warning telling you that the code will compile into single thread mode, and you will get single thread xgboost
-* You may get a error: -lgomp is not found
-  - You can type ```make no_omp=1```, this will get you single thread xgboost
-  - Alternatively, you can upgrade your compiler to compile multi-thread version
-* Windows(VS 2010): see [../windows](../windows) folder
-  - In principle, you put all the cpp files in the Makefile to the project, and build
-* OS X with multi-threading support: see [next section](#openmp-for-os-x)
+Installation Guide
+==================
 
-Build XGBoost in OS X with OpenMP
----------------------------------
-Here is the complete solution to use OpenMp-enabled compilers to install XGBoost.
+This page gives instructions of how to build and install the xgboost package from
+scratch on various systems. It consists of two steps:
 
-1. Obtain gcc-5.x.x with openmp support by `brew install gcc --without-multilib`. (`brew` is the de facto standard of `apt-get` on OS X. So installing [HPC](http://hpc.sourceforge.net/) separately is not recommended, but it should work.)
+1. First build the shared library from the C++ codes (`libxgboost.so` for linux/osx and `libxgboost.dll` for windows).
+   - Exception: for R-package installation please directly refer to the R package section.
+2. Then install the language packages (e.g. Python Package).
 
-2. `cd xgboost` then `bash build.sh` to compile XGBoost.
+Please refer to [Installation FAQ](#frequently-asked-questions) first if you had any problem
+during installation. If the instructions do not work for you, please feel free
+to ask questions at [xgboost/issues](https://github.com/dmlc/xgboost/issues), or
+even better to send pull request if you can fix the problem.
 
-3. Install xgboost package for Python and R
+## Contents
+- [Build the Shared Library](#build-the-shared-library)
+  - [Prerequisites](#prerequisites)
+  - [Building on Ubuntu/Debian](#building-on-ubuntu-debian)
+  - [Building on OSX](#building-on-osx)
+  - [Building on Windows](#building-on-windows)
+  - [Customized Building](#customized-building)
+- [Python Package Installation](#python-package-installation)
+- [R Package Installation](#r-package-installation)
+- [Frequently asked questions](#frequently-asked-questions)
 
-- For Python: go to `python-package` sub-folder to install python version with `python setup.py install` (or `sudo python setup.py install`).
-- For R: Set the `Makevars` file in highest piority for R.
+## Build the Shared Library
+
+Our goal is to build the shared library:
+- On Linux/OSX the target library is ```libxgboost.so```
+- On Windows the target library is ```libxgboost.dll```
+
+The minimal building requirement is
+
+- A recent c++ compiler supporting C++ 11 (g++-4.6 or higher)
+
+We can edit `make/config.mk` to change the compile options, and then build by
+`make`. If everything goes well, we can go to the specific language installation section.
+
+### Building on Ubuntu/Debian
+
+On Ubuntu, one can build xgboost by cloning the repository with its submodules.
+
+Then build xgboost
+```bash
+git clone --recursive https://github.com/dmlc/xgboost
+cd xgboost; make -j4
+```
+
+### Building on OSX
+
+On OSX, one can build xgboost by
+
+```bash
+git clone --recursive https://github.com/dmlc/xgboost
+cd xgboost; cp make/minimum.mk ./config.mk; make -j4
+```
+
+This builds xgboost without multi-threading, because by default clang in OSX does not come with OpenMP.
+See the following paragraph for OpenMP enabled xgboost.
+
+
+Here is the complete solution to use OpenMP-enabled compilers to install XGBoost.
+Obtain gcc-5.x.x with openmp support by `brew install gcc --without-multilib`. (`brew` is the de facto standard of `apt-get` on OS X. So installing [HPC](http://hpc.sourceforge.net/) separately is not recommended, but it should work.)
+
+```bash
+git clone --recursive https://github.com/dmlc/xgboost
+cd xgboost; cp make/config.mk ./config.mk; make -j4
+```
+
+### Building on Windows
+
+XGBoost supports building with both MSVC and MinGW. Here is how you can build the xgboost library using MinGW.
+
+Build with mingw
+```bash
+cp make/mingw64.mk config.mk; make -j4
+```
+
+The MSVC build for new version is not yet updated.
+
+
+### Customized Building
+
+The configuration of xgboost can be modified by ```config.mk```
+- modify configuration on various distributed filesystem such as HDFS/Amazon S3/...
+- First copy [make/config.mk](../make/config.mk) to the project root, on which
+  any local modification will be ignored by git, then modify the corresponding flags.
+
+
+
+## Python Package Installation
+
+The python package is located at [python-package](../python-package).
+There are several ways to install the package:
+
+1. Install system-wide, which requires root permission
+
+   ```bash
+   cd python-package; sudo python setup.py install
+   ```
+
+   You will however need Python `distutils` module for this to
+   work. It is often part of the core python package or it can be installed using your
+   package manager, e.g. in Debian use
+
+   ```bash
+   sudo apt-get install python-setuptools
+   ```
+
+   *NOTE: If you recompiled xgboost, then you need to reinstall it again to
+    make the new library take effect*
+
+2. Only set the environment variable `PYTHONPATH` to tell python where to find
+   the library. For example, assume we cloned `xgboost` on the home directory
+   `~`. Then we can add the following line to `~/.bashrc`.
+   It is ***recommended for developers*** who may change the code. The changes will be immediately reflected once you pull the code and rebuild the project (no need to call ```setup``` again).
+
+    ```bash
+    export PYTHONPATH=~/xgboost/python-package
+    ```
+
+3. Install only for the current user.
+
+    ```bash
+    cd python-package; python setup.py develop --user
+    ```
+
+## R Package Installation
+
+You can install R package using devtools
+
+```r
+devtools::install_git('git://github.com/dmlc/xgboost',subdir='R-package')
+
+```
+
+For OSX users, a single-threaded version will be installed by default. To install the multi-threaded version,
+first follow [Building on OSX](#building-on-osx) to get the OpenMP enabled compiler, then:
+
+- Set the `Makevars` file in highest priority for R.
 
   The point is, there are three `Makevars` : `~/.R/Makevars`, `xgboost/R-package/src/Makevars`, and `/usr/local/Cellar/r/3.2.0/R.framework/Resources/etc/Makeconf` (the last one obtained by running `file.path(R.home("etc"), "Makeconf")` in R), and `SHLIB_OPENMP_CXXFLAGS` is not set by default!! After trying, it seems that the first one has highest piority (surprise!).
 
@@ -38,13 +154,19 @@ Here is the complete solution to use OpenMp-enabled compilers to install XGBoost
   devtools::install_local('xgboost/', subdir = 'R-package') # you may use devtools
   ```
 
+## Frequently Asked Questions
 
-Build with HDFS and S3 Support
-------------------------------
-* To build xgboost use with HDFS/S3 support and distributed learnig. It is recommended to build with dmlc, with the following steps
-  - ```git clone https://github.com/dmlc/dmlc-core```
-  - Follow instruction in dmlc-core/make/config.mk to compile libdmlc.a
-  - In root folder of xgboost, type ```make dmlc=dmlc-core```
-* This will allow xgboost to directly load data and save model from/to hdfs and s3
-  - Simply replace the filename with prefix s3:// or hdfs://
-* This xgboost that can be used for distributed learning
+1. **Compile failed after `git pull`**
+
+   Please first update the submodules, clean all and recompile:
+
+   ```bash
+   git submodule update && make clean_all && make -j4
+   ```
+
+2. **Compile failed after `config.mk` is modified**
+   Need to clean all first:
+
+    ```bash
+    make clean_all && make -j4
+    ```
diff --git a/doc/conf.py b/doc/conf.py
index 05e1e91ba..9de8930fc 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -26,7 +26,7 @@ from sphinx_util import MarkdownParser, AutoStructify
 
 # -- mock out modules
 import mock
-MOCK_MODULES = ['numpy', 'scipy', 'scipy.sparse', 'sklearn', 'matplotlib']
+MOCK_MODULES = ['numpy', 'scipy', 'scipy.sparse', 'sklearn', 'matplotlib', 'pandas', 'graphviz']
 for mod_name in MOCK_MODULES:
     sys.modules[mod_name] = mock.Mock()
 
@@ -120,6 +120,7 @@ todo_include_todos = False
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 # html_theme = 'alabaster'
+html_theme = 'sphinx_rtd_theme'
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
diff --git a/doc/dev-guide/contribute.md b/doc/dev-guide/contribute.md
index 03060ab59..e248f7d4b 100644
--- a/doc/dev-guide/contribute.md
+++ b/doc/dev-guide/contribute.md
@@ -1,13 +1,145 @@
-Developer Guide
-===============
-This page contains guide for developers of xgboost. XGBoost has been developed and used by a group of active community.
-Everyone is more than welcomed to is a great way to make the project better.
-The project is maintained by a committee of [committers](../../CONTRIBUTORS.md#comitters) who will review and merge pull requests from contributors.
+Contribute to XGBoost
+=====================
+XGBoost has been developed and used by a group of active community members.
+Everyone is more than welcome to contribute. It is a way to make the project better and more accessible to more users.
 
-Contributing Code
------------------
-* The C++ code follows Google C++ style
-* We follow numpy style to document our python module
-* Tools to precheck codestyle
-  - clone https://github.com/dmlc/dmlc-core into root directory
-  - type ```make lint``` and fix possible errors.
+- Please add your name to [CONTRIBUTORS.md](../CONTRIBUTORS.md) after your patch has been merged.
+- Please also update [NEWS.md](../NEWS.md) to add a note on your changes to the API or on any new document you added.
+
+Guidelines
+----------
+* [Submit Pull Request](#submit-pull-request)
+* [Git Workflow Howtos](#git-workflow-howtos)
+  - [How to resolve conflict with master](#how-to-resolve-conflict-with-master)
+  - [How to combine multiple commits into one](#how-to-combine-multiple-commits-into-one)
+  - [What is the consequence of force push](#what-is-the-consequence-of-force-push)
+* [Documents](#documents)
+* [Testcases](#testcases)
+* [Examples](#examples)
+* [Core Library](#core-library)
+* [Python Package](#python-package)
+* [R Package](#r-package)
+
+Submit Pull Request
+-------------------
+* Before submitting, please rebase your code on the most recent version of master. You can do it by
+```bash
+git remote add upstream https://github.com/dmlc/xgboost
+git fetch upstream
+git rebase upstream/master
+```
+* If you have multiple small commits,
+  it might be good to merge them together(use git rebase then squash) into more meaningful groups.
+* Send the pull request!
+  - Fix the problems reported by automatic checks
+  - If you are contributing a new module, consider adding a testcase in [tests](../tests)
+
+Git Workflow Howtos
+-------------------
+### How to resolve conflict with master
+- First rebase to most recent master
+```bash
+# The first two steps can be skipped after you do it once.
+git remote add upstream https://github.com/dmlc/xgboost
+git fetch upstream
+git rebase upstream/master
+```
+- The git may show some conflicts it cannot merge, say ```conflicted.py```.
+  - Manually modify the file to resolve the conflict.
+  - After you resolved the conflict, mark it as resolved by
+```bash
+git add conflicted.py
+```
+- Then you can continue rebase by
+```bash
+git rebase --continue
+```
+- Finally push to your fork, you may need to force push here.
+```bash
+git push --force
+```
+
+### How to combine multiple commits into one
+Sometimes we want to combine multiple commits, especially when later commits are only fixes to previous ones,
+to create a PR with set of meaningful commits. You can do it by following steps.
+- Before doing so, configure the default editor of git if you haven't done so before.
+```bash
+git config core.editor the-editor-you-like
+```
+- Assume we want to merge last 3 commits, type the following commands
+```bash
+git rebase -i HEAD~3
+```
+- It will pop up a text editor. Set the first commit as ```pick```, and change later ones to ```squash```.
+- After you save the file, it will pop up another text editor to ask you to modify the combined commit message.
+- Push the changes to your fork, you need to force push.
+```bash
+git push --force
+```
+
+### What is the consequence of force push
+The previous two tips require a force push; this is because we altered the history of the commits.
+It is fine to force push to your own fork, as long as the commits changed are only yours.
+
+Documents
+---------
+* The document is created using sphinx and [recommonmark](http://recommonmark.readthedocs.org/en/latest/)
+* You can build document locally to see the effect.
+
+Testcases
+---------
+* All the testcases are in [tests](../tests)
+* We use python nose for python test cases.
+
+Examples
+--------
+* Usecases and examples will be in [demo](../demo)
+* We are super excited to hear about your story, if you have blogposts,
+  tutorials, or code solutions using xgboost, please tell us and we will add
+  a link in the example pages.
+
+Core Library
+------------
+- Follow the Google C++ style guide for C++ code.
+- We use doxygen to document all the interface code.
+- You can reproduce the linter checks by typing ```make lint```
+
+Python Package
+--------------
+- Always add docstring to the new functions in numpydoc format.
+- You can reproduce the linter checks by typing ```make lint```
+
+R Package
+---------
+### Code Style
+- We follow Google's C++ Style guide on C++ code.
+  - This is mainly to be consistent with the rest of the project.
+  - Another reason is we will be able to check style automatically with a linter.
+- You can check the style of the code by typing the following command at root folder.
+```bash
+make rcpplint
+```
+- When needed, you can disable the linter warning of certain line with ```// NOLINT(*)``` comments.
+
+### Rmarkdown Vignettes
+Rmarkdown vignettes are placed in [R-package/vignettes](../R-package/vignettes)
+These Rmarkdown files are not compiled. We host the compiled version on [doc/R-package](R-package)
+
+The following steps are followed to add a new Rmarkdown vignette:
+- Add the original rmarkdown to ```R-package/vignettes```
+- Modify ```doc/R-package/Makefile``` to add the markdown files to be built
+- Clone the [dmlc/web-data](https://github.com/dmlc/web-data) repo to folder ```doc```
+- Now type the following command on ```doc/R-package```
+```bash
+make the-markdown-to-make.md
+```
+- This will generate the markdown, as well as the figures into ```doc/web-data/xgboost/knitr```
+- Modify the ```doc/R-package/index.md``` to point to the generated markdown.
+- Add the generated figure to the ```dmlc/web-data``` repo.
+  - If you already cloned the repo to doc, this means a ```git add```
+- Create PR for both the markdown  and ```dmlc/web-data```
+- You can also build the document locally by typing the following command at ```doc```
+```bash
+make html
+```
+The reason we do this is to avoid exploded repo size due to generated images sizes.
diff --git a/doc/index.md b/doc/index.md
index 2329c9677..a03b46e21 100644
--- a/doc/index.md
+++ b/doc/index.md
@@ -5,23 +5,26 @@ XGBoost is short for eXtreme gradient boosting. This is a library that is design
 The goal of this library is to push the extreme of the computation limits of machines to provide a ***scalable***, ***portable*** and ***accurate***
 for large scale tree boosting.
 
-
 This document is hosted at http://xgboost.readthedocs.org/. You can also browse most of the documents in github directly.
 
-How to Get Started
-------------------
-The best way to get started to learn xgboost is by the examples. There are three types of examples you can find in xgboost.
-* [Tutorials](#tutorials) are self-contained tutorials on complete data science tasks.
-* [XGBoost Code Examples](../demo/) are collections of code and benchmarks of xgboost.
-  - There is a walkthrough section in this to walk you through specific API features.
-* [Highlight Solutions](#highlight-solutions) are presentations using xgboost to solve real world problems.
-  - These examples are usually more advanced. You can usually find state-of-art solutions to many problems and challenges in here.
-
-After you gets familiar with the interface, checkout the following additional resources
+User Guide
+----------
+* [Installation Guide](build.md)
+* [Introduction to Boosted Trees](model.md)
+* [Python Package Document](python/index.md)
+* [R Package Document](R-package/index.md)
+* [XGBoost.jl Julia Package](https://github.com/dmlc/XGBoost.jl)
+* [Distributed Training](../demo/distributed-training)
 * [Frequently Asked Questions](faq.md)
-* [Learning what is in Behind: Introduction to Boosted Trees](model.md)
-* [User Guide](#user-guide) contains comprehensive list of documents of xgboost.
-* [Developer Guide](dev-guide/contribute.md)
+* [External Memory Version](external_memory.md)
+* [Learning to use XGBoost by Example](../demo)
+* [Parameters](parameter.md)
+* [Text input format](input_format.md)
+* [Notes on Parameter Tuning](param_tuning.md)
+
+Developer Guide
+---------------
+* [Contributor Guide](dev-guide/contribute.md)
 
 Tutorials
 ---------
@@ -31,14 +34,13 @@ are great resources to learn xgboost by real examples. If you think you have som
   - This tutorial introduces the basic usage of CLI version of xgboost
 * [Introduction of XGBoost in Python](python/python_intro.md) (python)
   - This tutorial introduces the python package of xgboost
-* [Introduction to XGBoost in R](../R-package/vignettes/xgboostPresentation.Rmd) (R package)
+* [Introduction to XGBoost in R](R-package/xgboostPresentation.md) (R package)
   - This is a general presentation about xgboost in R.
-* [Discover your data with XGBoost in R](../R-package/vignettes/discoverYourData.Rmd) (R package)
+* [Discover your data with XGBoost in R](R-package/discoverYourData.md) (R package)
   - This tutorial explaining feature analysis in xgboost.
 * [Understanding XGBoost Model on Otto Dataset](../demo/kaggle-otto/understandingXGBoostModel.Rmd) (R package)
   - This tutorial teaches you how to use xgboost to compete kaggle otto challenge.
 
-
 Highlight Solutions
 -------------------
 This section is about blogposts, presentation and videos discussing how to use xgboost to solve your interesting problem. If you think something belongs to here, send a pull request.
@@ -49,23 +51,11 @@ This section is about blogposts, presentation and videos discussing how to use x
 * Video tutorial: [Better Optimization with Repeated Cross Validation and the XGBoost model](https://www.youtube.com/watch?v=Og7CGAfSr_Y)
 * [Winning solution of Kaggle Higgs competition: what a single model can do](http://no2147483647.wordpress.com/2014/09/17/winning-solution-of-kaggle-higgs-competition-what-a-single-model-can-do/)
 
-User Guide
-----------
-* [Frequently Asked Questions](faq.md)
-* [Introduction to Boosted Trees](model.md)
-* [Using XGBoost in Python](python/python_intro.md)
-* [Using XGBoost in R](../R-package/vignettes/xgboostPresentation.Rmd)
-* [Learning to use XGBoost by Example](../demo)
-* [External Memory Version](external_memory.md)
-* [Text input format](input_format.md)
-* [Build Instruction](build.md)
-* [Parameters](parameter.md)
-* [Notes on Parameter Tunning](param_tuning.md)
+Indices and tables
+------------------
 
-Developer Guide
----------------
-* [Developer Guide](dev-guide/contribute.md)
-
-API Reference
--------------
-* [Python API Reference](python/python_api.rst)
+```eval_rst
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+```
diff --git a/doc/python/index.md b/doc/python/index.md
new file mode 100644
index 000000000..c377fb784
--- /dev/null
+++ b/doc/python/index.md
@@ -0,0 +1,10 @@
+XGBoost Python Package
+======================
+This page contains links to all the documents related to the python package.
+To install the package, check out [Build and Installation Instruction](../build.md).
+
+Contents
+--------
+* [Python Overview Tutorial](python_intro.md)
+* [Learning to use XGBoost by Example](../../demo)
+* [Python API Reference](python_api.rst)
diff --git a/doc/sphinx_util.py b/doc/sphinx_util.py
index a09f1e08b..e6b578cbe 100644
--- a/doc/sphinx_util.py
+++ b/doc/sphinx_util.py
@@ -5,11 +5,24 @@ import os
 import docutils
 import subprocess
 
-if os.environ.get('READTHEDOCS', None) == 'True':
+READTHEDOCS_BUILD = (os.environ.get('READTHEDOCS', None) is not None)
+
+if not os.path.exists('../recommonmark'):
     subprocess.call('cd ..; rm -rf recommonmark;' +
-                    'git clone https://github.com/tqchen/recommonmark', shell=True)
+                    'git clone https://github.com/tqchen/recommonmark', shell = True)
+else:
+    subprocess.call('cd ../recommonmark/; git pull', shell=True)
+
+if not os.path.exists('web-data'):
+    subprocess.call('rm -rf web-data;' +
+                    'git clone https://github.com/dmlc/web-data', shell = True)
+else:
+    subprocess.call('cd web-data; git pull', shell=True)
+
 
 sys.path.insert(0, os.path.abspath('../recommonmark/'))
+sys.stderr.write('READTHEDOCS=%s\n' % (READTHEDOCS_BUILD))
+
 
 from recommonmark import parser, transform
 
diff --git a/include/xgboost/base.h b/include/xgboost/base.h
new file mode 100644
index 000000000..3674c6117
--- /dev/null
+++ b/include/xgboost/base.h
@@ -0,0 +1,85 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file base.h
+ * \brief defines configuration macros of xgboost.
+ */
+#ifndef XGBOOST_BASE_H_
+#define XGBOOST_BASE_H_
+
+#include <dmlc/base.h>
+#include <dmlc/omp.h>
+
+/*!
+ * \brief strict mode flag for R library, to leave hooks when needed.
+ */
+#ifndef XGBOOST_STRICT_R_MODE
+#define XGBOOST_STRICT_R_MODE 0
+#endif
+
+/*!
+ * \brief Whether always log console message with time.
+ *  Messages are displayed with a timestamp prepended, for example:
+ *  "[21:47:50] 6513x126 matrix with 143286 entries loaded from ../data/agaricus.txt.train"
+ */
+#ifndef XGBOOST_LOG_WITH_TIME
+#define XGBOOST_LOG_WITH_TIME 1
+#endif
+
+/*!
+ * \brief Whether customize the logger outputs.
+ */
+#ifndef XGBOOST_CUSTOMIZE_LOGGER
+#define XGBOOST_CUSTOMIZE_LOGGER XGBOOST_STRICT_R_MODE
+#endif
+
+/*!
+ * \brief Whether to customize global PRNG.
+ */
+#ifndef XGBOOST_CUSTOMIZE_GLOBAL_PRNG
+#define XGBOOST_CUSTOMIZE_GLOBAL_PRNG  XGBOOST_STRICT_R_MODE
+#endif
+
+/*! \brief namespace of xgboost */
+namespace xgboost {
+/*!
+ * \brief unsigned integer type used in boost,
+ *  used for feature index and row index.
+ */
+typedef uint32_t bst_uint;
+/*! \brief long integers */
+typedef unsigned long bst_ulong;  // NOLINT(*)
+/*! \brief float type, used for storing statistics */
+typedef float bst_float;
+
+/*! \brief gradient statistics pair usually needed in gradient boosting */
+struct bst_gpair {
+  /*! \brief gradient statistics */
+  bst_float grad;
+  /*! \brief second order gradient statistics */
+  bst_float hess;
+  bst_gpair() {}
+  bst_gpair(bst_float grad, bst_float hess) : grad(grad), hess(hess) {}
+};
+
+/*! \brief small eps gap for minimum split decision. */
+const float rt_eps = 1e-5f;
+/*! \brief min gap between feature values to allow a split happen */
+const float rt_2eps = rt_eps * 2.0f;
+
+/*! \brief define unsigned long for openmp loop */
+typedef dmlc::omp_ulong omp_ulong;
+/*! \brief define unsigned int for openmp loop */
+typedef dmlc::omp_uint bst_omp_uint;
+
+/*!
+ * \brief define compatible keywords in g++
+ *  Used to support g++-4.6 and g++-4.7
+ */
+#if DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__)
+#if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+#define override
+#define final
+#endif
+#endif
+}  // namespace xgboost
+#endif  // XGBOOST_BASE_H_
diff --git a/wrapper/xgboost_wrapper.h b/include/xgboost/c_api.h
similarity index 95%
rename from wrapper/xgboost_wrapper.h
rename to include/xgboost/c_api.h
index 8d0e78a91..5c8db3911 100644
--- a/wrapper/xgboost_wrapper.h
+++ b/include/xgboost/c_api.h
@@ -1,12 +1,11 @@
 /*!
- * Copyright (c) 2014 by Contributors
- * \file xgboost_wrapper.h
+ * Copyright (c) 2015 by Contributors
+ * \file c_api.h
  * \author Tianqi Chen
- * \brief a C style wrapper of xgboost
- *  can be used to create wrapper of other languages
+ * \brief C API of XGBoost, used to interfacing with other languages.
  */
-#ifndef XGBOOST_WRAPPER_H_
-#define XGBOOST_WRAPPER_H_
+#ifndef XGBOOST_C_API_H_
+#define XGBOOST_C_API_H_
 
 #ifdef __cplusplus
 #define XGB_EXTERN_C extern "C"
@@ -170,7 +169,8 @@ XGB_DLL int XGDMatrixGetFloatInfo(const DMatrixHandle handle,
  * \brief get uint32 info vector from matrix
  * \param handle a instance of data matrix
  * \param field field name
- * \param out_ptr pointer to the result
+ * \param out_len The length of the field.
+ * \param out_dptr pointer to the result
  * \return 0 when success, -1 when failure happens
  */
 XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
@@ -178,8 +178,9 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
                                  bst_ulong* out_len,
                                  const unsigned **out_dptr);
 /*!
- * \brief get number of rows
+ * \brief get number of rows.
  * \param handle the handle to the DMatrix
+ * \param out The address to hold number of rows.
  * \return 0 when success, -1 when failure happens
  */
 XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle,
@@ -187,6 +188,7 @@ XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle,
 /*!
  * \brief get number of columns
  * \param handle the handle to the DMatrix
+ * \param out The output of number of columns
  * \return 0 when success, -1 when failure happens
  */
 XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle,
@@ -213,7 +215,7 @@ XGB_DLL int XGBoosterFree(BoosterHandle handle);
  * \brief set parameters
  * \param handle handle
  * \param name  parameter name
- * \param val value of parameter
+ * \param value value of parameter
  * \return 0 when success, -1 when failure happens
  */
 XGB_DLL int XGBoosterSetParam(BoosterHandle handle,
@@ -336,11 +338,11 @@ XGB_DLL int XGBoosterDumpModel(BoosterHandle handle,
  * \brief dump model, return array of strings representing model dump
  * \param handle handle
  * \param fnum number of features
- * \param fnum names of features
- * \param fnum types of features
+ * \param fname names of features
+ * \param ftype types of features
  * \param with_stats whether to dump with statistics
  * \param out_len length of output array
- * \param out_dump_array pointer to hold representing dump of each model
+ * \param out_models pointer to hold representing dump of each model
  * \return 0 when success, -1 when failure happens
  */
 XGB_DLL int XGBoosterDumpModelWithFeatures(BoosterHandle handle,
@@ -348,7 +350,7 @@ XGB_DLL int XGBoosterDumpModelWithFeatures(BoosterHandle handle,
                                            const char **fname,
                                            const char **ftype,
                                            int with_stats,
-                                           bst_ulong *len,
+                                           bst_ulong *out_len,
                                            const char ***out_models);
 
-#endif  // XGBOOST_WRAPPER_H_
+#endif  // XGBOOST_C_API_H_
diff --git a/include/xgboost/data.h b/include/xgboost/data.h
new file mode 100644
index 000000000..65e7ff0f6
--- /dev/null
+++ b/include/xgboost/data.h
@@ -0,0 +1,298 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file data.h
+ * \brief The input data structure of xgboost.
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_DATA_H_
+#define XGBOOST_DATA_H_
+
+#include <dmlc/base.h>
+#include <dmlc/data.h>
+#include <string>
+#include <memory>
+#include <vector>
+#include "./base.h"
+
+namespace xgboost {
+// forward declare learner.
+class LearnerImpl;
+
+/*! \brief data type accepted by xgboost interface */
+enum DataType {
+  kFloat32 = 1,
+  kDouble = 2,
+  kUInt32 = 3,
+  kUInt64 = 4
+};
+
+/*!
+ * \brief Meta information about dataset, always sit in memory.
+ */
+struct MetaInfo {
+  /*! \brief number of rows in the data */
+  uint64_t num_row;
+  /*! \brief number of columns in the data */
+  uint64_t num_col;
+  /*! \brief number of nonzero entries in the data */
+  uint64_t num_nonzero;
+  /*! \brief label of each instance */
+  std::vector<bst_float> labels;
+  /*!
+   * \brief specified root index of each instance,
+   *  can be used for multi task setting
+   */
+  std::vector<bst_uint> root_index;
+  /*!
+   * \brief the index of begin and end of a group
+   *  needed when the learning task is ranking.
+   */
+  std::vector<bst_uint> group_ptr;
+  /*! \brief weights of each instance, optional */
+  std::vector<bst_float> weights;
+  /*!
+   * \brief initialized margins,
+   * if specified, xgboost will start from this init margin
+   * can be used to specify initial prediction to boost from.
+   */
+  std::vector<bst_float> base_margin;
+  /*! \brief version flag, used to check version of this info */
+  static const int kVersion = 1;
+  /*! \brief default constructor */
+  MetaInfo() : num_row(0), num_col(0), num_nonzero(0) {}
+  /*!
+   * \brief Get the weight of the i-th instance.
+   * \param i Instance index.
+   * \return The weight.
+   */
+  inline float GetWeight(size_t i) const {
+    return weights.size() != 0 ?  weights[i] : 1.0f;
+  }
+  /*!
+   * \brief Get the root index of i-th instance.
+   * \param i Instance index.
+   * \return The pre-defined root index of i-th instance.
+   */
+  inline unsigned GetRoot(size_t i) const {
+    return root_index.size() != 0 ? root_index[i] : 0U;
+  }
+  /*! \brief clear all the information */
+  void Clear();
+  /*!
+   * \brief Load the Meta info from binary stream.
+   * \param fi The input stream
+   */
+  void LoadBinary(dmlc::Stream* fi);
+  /*!
+   * \brief Save the Meta info to binary stream
+   * \param fo The output stream.
+   */
+  void SaveBinary(dmlc::Stream* fo) const;
+  /*!
+   * \brief Set information in the meta info.
+   * \param key The key of the information.
+   * \param dptr The data pointer of the source array.
+   * \param dtype The type of the source data.
+   * \param num Number of elements in the source array.
+   */
+  void SetInfo(const char* key, const void* dptr, DataType dtype, size_t num);
+};
+
+/*! \brief read-only sparse instance batch in CSR format */
+struct SparseBatch {
+  /*! \brief an entry of sparse vector */
+  struct Entry {
+    /*! \brief feature index */
+    bst_uint index;
+    /*! \brief feature value */
+    bst_float fvalue;
+    /*! \brief default constructor */
+    Entry() {}
+    /*!
+     * \brief constructor with index and value
+     * \param index The feature or row index.
+     * \param fvalue The feature value.
+     */
+    Entry(bst_uint index, bst_float fvalue) : index(index), fvalue(fvalue) {}
+    /*! \brief compare feature values, true when a's value is smaller than b's */
+    inline static bool CmpValue(const Entry& a, const Entry& b) {
+      return a.fvalue < b.fvalue;
+    }
+  };
+
+  /*! \brief an instance of sparse vector in the batch */
+  struct Inst {
+    /*! \brief pointer to the elements*/
+    const Entry *data;
+    /*! \brief length of the instance */
+    bst_uint length;
+    /*! \brief constructor */
+    Inst(const Entry *data, bst_uint length) : data(data), length(length) {}
+    /*! \brief get i-th pair in the sparse vector*/
+    inline const Entry& operator[](size_t i) const {
+      return data[i];
+    }
+  };
+
+  /*! \brief batch size */
+  size_t size;
+};
+
+/*! \brief read-only row batch, used to access row continuously */
+struct RowBatch : public SparseBatch {
+  /*! \brief the offset of rowid of this batch */
+  size_t base_rowid;
+  /*! \brief array[size+1], row pointer of each of the elements */
+  const size_t *ind_ptr;
+  /*! \brief array[ind_ptr.back()], content of the sparse element */
+  const Entry *data_ptr;
+  /*! \brief get i-th row from the batch */
+  inline Inst operator[](size_t i) const {
+    return Inst(data_ptr + ind_ptr[i], static_cast<bst_uint>(ind_ptr[i + 1] - ind_ptr[i]));
+  }
+};
+
+/*!
+ * \brief read-only column batch, used to access columns,
+ * the columns are not required to be continuous
+ */
+struct ColBatch : public SparseBatch {
+  /*! \brief column index of each columns in the data */
+  const bst_uint *col_index;
+  /*! \brief pointer to the column data */
+  const Inst *col_data;
+  /*! \brief get i-th column from the batch */
+  inline Inst operator[](size_t i) const {
+    return col_data[i];
+  }
+};
+
+/*!
+ * \brief This is data structure that user can pass to DMatrix::Create
+ *  to create a DMatrix for training, user can create this data structure
+ *  for customized Data Loading on single machine.
+ *
+ *  In the distributed setting, usually a customized dmlc::Parser is needed instead.
+ */
+class DataSource : public dmlc::DataIter<RowBatch> {
+ public:
+  /*!
+   * \brief Meta information about the dataset
+   * The subclass need to be able to load this correctly from data.
+   */
+  MetaInfo info;
+};
+
+/*!
+ * \brief Internal data structure used by XGBoost during training.
+ *  There are two ways to create a customized DMatrix that reads in user defined-format.
+ *
+ *  - Provide a dmlc::Parser and pass into the DMatrix::Create
+ *  - Alternatively, if data can be represented by an URL, define a new dmlc::Parser and register by DMLC_REGISTER_DATA_PARSER;
+ *      - This works best for user defined data input source, such as data-base, filesystem.
+ *  - Provide a DataSource, that can be passed to DMatrix::Create
+ *      This can be used to re-use inmemory data structure into DMatrix.
+ */
+class DMatrix {
+ public:
+  /*! \brief default constructor */
+  DMatrix() : cache_learner_ptr_(nullptr) {}
+  /*! \brief meta information of the dataset */
+  virtual MetaInfo& info() = 0;
+  /*! \brief meta information of the dataset */
+  virtual const MetaInfo& info() const = 0;
+  /*!
+   * \brief get the row iterator, reset to beginning position
+   * \note Only one of RowIterator or ColIterator can be active at a time.
+   */
+  virtual dmlc::DataIter<RowBatch>* RowIterator() = 0;
+  /*!\brief get column iterator, reset to the beginning position */
+  virtual dmlc::DataIter<ColBatch>* ColIterator() = 0;
+  /*!
+   * \brief get the column iterator associated with subset of column features.
+   * \param fset is the list of column index set that must be contained in the returning Column iterator
+   * \return the column iterator, initialized so that it reads the elements in fset
+   */
+  virtual dmlc::DataIter<ColBatch>* ColIterator(const std::vector<bst_uint>& fset) = 0;
+  /*!
+   * \brief check if column access is supported, if not, initialize column access.
+   * \param enabled whether certain feature should be included in column access.
+   * \param subsample subsample ratio when generating column access.
+   * \param max_row_perbatch auxiliary information, maximum number of rows used in
+   *         each column batch. This is only a hint and can be ignored by the
+   *         implementation.
+   */
+  virtual void InitColAccess(const std::vector<bool>& enabled,
+                             float subsample,
+                             size_t max_row_perbatch) = 0;
+  // the following are column meta data, should be able to answer them fast.
+  /*! \return whether column access is enabled */
+  virtual bool HaveColAccess() const = 0;
+  /*! \return Whether the column data is stored in a single column block. */
+  virtual bool SingleColBlock() const = 0;
+  /*! \brief get number of non-missing entries in column */
+  virtual size_t GetColSize(size_t cidx) const = 0;
+  /*! \brief get column density */
+  virtual float GetColDensity(size_t cidx) const = 0;
+  /*! \return reference of buffered rowset, in column access */
+  virtual const std::vector<bst_uint>& buffered_rowset() const = 0;
+  /*! \brief virtual destructor */
+  virtual ~DMatrix() {}
+  /*!
+   * \brief Save DMatrix to local file.
+   *  The saved file only works for non-sharded dataset(single machine training).
+   *  This API is deprecated and dis-encouraged to use.
+   * \param fname The file name to be saved.
+   * \return The created DMatrix.
+   */
+  virtual void SaveToLocalFile(const std::string& fname);
+  /*!
+   * \brief Load DMatrix from URI.
+   * \param uri The URI of input.
+   * \param silent Whether print information during loading.
+   * \param load_row_split Flag to read in part of rows, divided among the workers in distributed mode.
+   * \param file_format The format type of the file, used for dmlc::Parser::Create.
+   *   By default "auto" will be able to load in both local binary file.
+   * \return The created DMatrix.
+   */
+  static DMatrix* Load(const std::string& uri,
+                       bool silent,
+                       bool load_row_split,
+                       const std::string& file_format = "auto");
+  /*!
+   * \brief create a new DMatrix, by wrapping a row_iterator, and meta info.
+   * \param source The source iterator of the data, the create function takes ownership of the source.
+   * \param cache_prefix The path to prefix of temporary cache file of the DMatrix when used in external memory mode.
+   *     This can be empty for common cases, and in-memory mode will be used.
+   * \return a Created DMatrix.
+   */
+  static DMatrix* Create(std::unique_ptr<DataSource>&& source,
+                         const std::string& cache_prefix = "");
+  /*!
+   * \brief Create a DMatrix by loading data from parser.
+   *  Parser can later be deleted after the DMatrix is created.
+   * \param parser The input data parser
+   * \param cache_prefix The path to prefix of temporary cache file of the DMatrix when used in external memory mode.
+   *     This can be empty for common cases, and in-memory mode will be used.
+   * \sa dmlc::Parser
+   * \note dmlc-core provides efficient distributed data parser for libsvm format.
+   *  User can create and register customized parser to load their own format using DMLC_REGISTER_DATA_PARSER.
+   *  See "dmlc-core/include/dmlc/data.h" for detail.
+   * \return A created DMatrix.
+   */
+  static DMatrix* Create(dmlc::Parser<uint32_t>* parser,
+                         const std::string& cache_prefix = "");
+
+ private:
+  // allow learner class to access this field.
+  friend class LearnerImpl;
+  /*! \brief private field to back ref cached matrix, accessed by the friend LearnerImpl. */
+  LearnerImpl* cache_learner_ptr_;
+};
+
+}  // namespace xgboost
+
+namespace dmlc {
+DMLC_DECLARE_TRAITS(is_pod, xgboost::SparseBatch::Entry, true);
+}
+#endif  // XGBOOST_DATA_H_
diff --git a/include/xgboost/feature_map.h b/include/xgboost/feature_map.h
new file mode 100644
index 000000000..bdb39a9ee
--- /dev/null
+++ b/include/xgboost/feature_map.h
@@ -0,0 +1,92 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file feature_map.h
+ * \brief Feature map data structure to help visualization and model dump.
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_FEATURE_MAP_H_
+#define XGBOOST_FEATURE_MAP_H_
+
+#include <vector>
+#include <string>
+#include <cstring>
+#include <iostream>
+
+namespace xgboost {
+/*!
+ * \brief Feature map data structure to help text model dump.
+ * TODO(tqchen) consider make it even more lightweight.
+ */
+class FeatureMap {
+ public:
+  /*! \brief type of feature maps */
+  enum Type {
+    kIndicator = 0,
+    kQuantitive = 1,
+    kInteger = 2,
+    kFloat = 3
+  };
+  /*!
+   * \brief load feature map from input stream
+   * \param is Input text stream
+   */
+  inline void LoadText(std::istream& is) { // NOLINT(*)
+    int fid;
+    std::string fname, ftype;
+    while (is >> fid >> fname >> ftype) {
+      this->PushBack(fid, fname.c_str(), ftype.c_str());
+    }
+  }
+  /*!
+   * \brief push back feature map.
+   * \param fid The feature index.
+   * \param fname The feature name.
+   * \param ftype The feature type.
+   */
+  inline void PushBack(int fid, const char *fname, const char *ftype) {
+    CHECK_EQ(fid, static_cast<int>(names_.size()));
+    names_.push_back(std::string(fname));
+    types_.push_back(GetType(ftype));
+  }
+  /*! \brief clear the feature map */
+  inline void Clear() {
+    names_.clear();
+    types_.clear();
+  }
+  /*! \return number of known features */
+  inline size_t size() const {
+    return names_.size();
+  }
+  /*! \return name of specific feature */
+  inline const char* name(size_t idx) const {
+    CHECK_LT(idx,  names_.size()) << "FeatureMap feature index exceed bound";
+    return names_[idx].c_str();
+  }
+  /*! \return type of specific feature */
+  const Type type(size_t idx) const {
+    CHECK_LT(idx, names_.size()) << "FeatureMap feature index exceed bound";
+    return types_[idx];
+  }
+
+ private:
+  /*!
+   * \brief Translate a feature type name into the feature type enum.
+   * \param tname The type name, one of "i", "q", "int" or "float".
+   * \return The translated type.
+   */
+  inline static Type GetType(const char* tname) {
+    using namespace std;
+    if (!strcmp("i", tname)) return kIndicator;
+    if (!strcmp("q", tname)) return kQuantitive;
+    if (!strcmp("int", tname)) return kInteger;
+    if (!strcmp("float", tname)) return kFloat;
+    LOG(FATAL) << "unknown feature type, use one of: i (indicator), q (quantity), int, float";
+    return kIndicator;
+  }
+  /*! \brief name of the feature */
+  std::vector<std::string> names_;
+  /*! \brief type of the feature */
+  std::vector<Type> types_;
+};
+}  // namespace xgboost
+#endif  // XGBOOST_FEATURE_MAP_H_
diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h
new file mode 100644
index 000000000..6766b0b8a
--- /dev/null
+++ b/include/xgboost/gbm.h
@@ -0,0 +1,168 @@
+/*!
+ * Copyright by Contributors
+ * \file gbm.h
+ * \brief Interface of gradient booster,
+ *  that learns through gradient statistics.
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_GBM_H_
+#define XGBOOST_GBM_H_
+
+#include <dmlc/registry.h>
+#include <vector>
+#include <utility>
+#include <string>
+#include <functional>
+#include "./base.h"
+#include "./data.h"
+#include "./feature_map.h"
+
+namespace xgboost {
+/*!
+ * \brief interface of gradient boosting model.
+ */
+class GradientBooster {
+ public:
+  /*! \brief virtual destructor */
+  virtual ~GradientBooster() {}
+  /*!
+   * \brief set configuration from pair iterators.
+   * \param begin The beginning iterator.
+   * \param end The end iterator.
+   * \tparam PairIter iterator<std::pair<std::string, std::string> >
+   */
+  template<typename PairIter>
+  inline void Configure(PairIter begin, PairIter end);
+  /*!
+   * \brief Set the configuration of gradient boosting.
+   *  User must call configure once before InitModel and Training.
+   *
+   * \param cfg configurations on both training and model parameters.
+   */
+  virtual void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) = 0;
+  /*!
+   * \brief load model from stream
+   * \param fi input stream.
+   */
+  virtual void Load(dmlc::Stream* fi) = 0;
+  /*!
+   * \brief save model to stream.
+   * \param fo output stream
+   */
+  virtual void Save(dmlc::Stream* fo) const = 0;
+  /*!
+   * \brief reset the predict buffer size.
+   *  This will invalidate all the previous cached results
+   *  and recalculate from scratch
+   * \param num_pbuffer The size of predict buffer.
+   */
+  virtual void ResetPredBuffer(size_t num_pbuffer) {}
+  /*!
+   * \brief whether the model allow lazy checkpoint
+   * return true if model is only updated in DoBoost
+   * after all Allreduce calls
+   */
+  virtual bool AllowLazyCheckPoint() const {
+    return false;
+  }
+  /*!
+   * \brief perform update to the model(boosting)
+   * \param p_fmat feature matrix that provide access to features
+   * \param buffer_offset buffer index offset of these instances, if equals -1
+   *        this means we do not have buffer index allocated to the gbm
+   * \param in_gpair address of the gradient pair statistics of the data
+   * the booster may change content of gpair
+   */
+  virtual void DoBoost(DMatrix* p_fmat,
+                       int64_t buffer_offset,
+                       std::vector<bst_gpair>* in_gpair) = 0;
+  /*!
+   * \brief generate predictions for given feature matrix
+   * \param dmat feature matrix
+   * \param buffer_offset buffer index offset of these instances, if equals -1
+   *        this means we do not have buffer index allocated to the gbm
+   *  a buffer index is assigned to each instance that requires repetitive prediction
+   *  the size of buffer is set by convention using GradientBooster.ResetPredBuffer(size);
+   * \param out_preds output vector to hold the predictions
+   * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
+   *    we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
+   */
+  virtual void Predict(DMatrix* dmat,
+                       int64_t buffer_offset,
+                       std::vector<float>* out_preds,
+                       unsigned ntree_limit = 0) = 0;
+  /*!
+   * \brief online prediction function, predict score for one instance at a time
+   *  NOTE: use the batch prediction interface if possible, batch prediction is usually
+   *        more efficient than online prediction
+   *        This function is NOT threadsafe, make sure you only call from one thread
+   *
+   * \param inst the instance you want to predict
+   * \param out_preds output vector to hold the predictions
+   * \param ntree_limit limit the number of trees used in prediction
+   * \param root_index the root index
+   * \sa Predict
+   */
+  virtual void Predict(const SparseBatch::Inst& inst,
+                       std::vector<float>* out_preds,
+                       unsigned ntree_limit = 0,
+                       unsigned root_index = 0) = 0;
+  /*!
+   * \brief predict the leaf index of each tree, the output will be nsample * ntree vector
+   *        this is only valid in gbtree predictor
+   * \param dmat feature matrix
+   * \param out_preds output vector to hold the predictions
+   * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
+   *    we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
+   */
+  virtual void PredictLeaf(DMatrix* dmat,
+                           std::vector<float>* out_preds,
+                           unsigned ntree_limit = 0) = 0;
+  /*!
+   * \brief dump the model to text format
+   * \param fmap feature map that may help give interpretations of feature
+   * \param option extra option of the dump model
+   * \return a vector of dump for boosters.
+   */
+  virtual std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) const = 0;
+  /*!
+   * \brief create a gradient booster from given name
+   * \param name name of gradient booster
+   * \return The created booster.
+   */
+  static GradientBooster* Create(const std::string& name);
+};
+
+// implementing configure.
+template<typename PairIter>
+inline void GradientBooster::Configure(PairIter begin, PairIter end) {
+  std::vector<std::pair<std::string, std::string> > vec(begin, end);
+  this->Configure(vec);
+}
+
+/*!
+ * \brief Registry entry for gradient booster.
+ */
+struct GradientBoosterReg
+    : public dmlc::FunctionRegEntryBase<GradientBoosterReg,
+                                        std::function<GradientBooster* ()> > {
+};
+
+/*!
+ * \brief Macro to register gradient booster.
+ *
+ * \code
+ * // example of registering a gradient booster named "gbtree"
+ * XGBOOST_REGISTER_GBM(GBTree, "gbtree")
+ * .describe("Boosting tree ensembles.")
+ * .set_body([]() {
+ *     return new GradientBooster<TStats>();
+ *   });
+ * \endcode
+ */
+#define XGBOOST_REGISTER_GBM(UniqueId, Name)                            \
+  static ::xgboost::GradientBoosterReg & __make_ ## GradientBoosterReg ## _ ## UniqueId ## __ = \
+      ::dmlc::Registry< ::xgboost::GradientBoosterReg>::Get()->__REGISTER__(Name)
+
+}  // namespace xgboost
+#endif  // XGBOOST_GBM_H_
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
new file mode 100644
index 000000000..2d5c5702e
--- /dev/null
+++ b/include/xgboost/learner.h
@@ -0,0 +1,178 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file learner.h
+ * \brief Learner interface that integrates objective, gbm and evaluation together.
+ *  This is the user facing XGBoost training module.
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_LEARNER_H_
+#define XGBOOST_LEARNER_H_
+
+#include <rabit.h>
+#include <utility>
+#include <string>
+#include <vector>
+#include "./base.h"
+#include "./gbm.h"
+#include "./metric.h"
+#include "./objective.h"
+
+namespace xgboost {
+/*!
+ * \brief Learner class that does training and prediction.
+ *  This is the user facing module of xgboost training.
+ *  The Load/Save function corresponds to the model used in python/R.
+ *  \code
+ *
+ *  std::unique_ptr<Learner> learner(Learner::Create(cache_mats));
+ *  learner->Configure(configs);
+ *  learner->InitModel();
+ *  for (int iter = 0; iter < max_iter; ++iter) {
+ *    learner->UpdateOneIter(iter, train_mat);
+ *    LOG(INFO) << learner->EvalOneIter(iter, data_sets, data_names);
+ *  }
+ *
+ *  \endcode
+ */
+class Learner : public rabit::Serializable {
+ public:
+  /*! \brief virtual destructor */
+  virtual ~Learner() {}
+  /*!
+   * \brief set configuration from pair iterators.
+   * \param begin The beginning iterator.
+   * \param end The end iterator.
+   * \tparam PairIter iterator<std::pair<std::string, std::string> >
+   */
+  template<typename PairIter>
+  inline void Configure(PairIter begin, PairIter end);
+  /*!
+   * \brief Set the configuration of gradient boosting.
+   *  User must call configure once before InitModel and Training.
+   *
+   * \param cfg configurations on both training and model parameters.
+   */
+  virtual void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) = 0;
+  /*!
+   * \brief Initialize the model using the specified configurations via Configure.
+   *  A model has to be either loaded or initialized before Update/Predict/Save can be called.
+   */
+  virtual void InitModel() = 0;
+  /*!
+   * \brief load model from stream
+   * \param fi input stream.
+   */
+  virtual void Load(dmlc::Stream* fi) = 0;
+  /*!
+   * \brief save model to stream.
+   * \param fo output stream
+   */
+  virtual void Save(dmlc::Stream* fo) const = 0;
+  /*!
+   * \brief update the model for one iteration
+   *  With the specified objective function.
+   * \param iter current iteration number
+   * \param train reference to the data matrix.
+   */
+  virtual void UpdateOneIter(int iter, DMatrix* train) = 0;
+  /*!
+   * \brief Do customized gradient boosting with in_gpair.
+   *  in_gpair can be mutated after this call.
+   * \param iter current iteration number
+   * \param train reference to the data matrix.
+   * \param in_gpair The input gradient statistics.
+   */
+  virtual void BoostOneIter(int iter,
+                            DMatrix* train,
+                            std::vector<bst_gpair>* in_gpair) = 0;
+  /*!
+   * \brief evaluate the model for specific iteration using the configured metrics.
+   * \param iter iteration number
+   * \param data_sets datasets to be evaluated.
+   * \param data_names name of each dataset
+   * \return a string corresponding to the evaluation result
+   */
+  virtual std::string EvalOneIter(int iter,
+                                  const std::vector<DMatrix*>& data_sets,
+                                  const std::vector<std::string>& data_names) = 0;
+  /*!
+   * \brief get prediction given the model.
+   * \param data input data
+   * \param output_margin whether to only predict margin value instead of transformed prediction
+   * \param out_preds output vector that stores the prediction
+   * \param ntree_limit limit number of trees used for boosted tree
+   *   predictor, when it equals 0, this means we are using all the trees
+   * \param pred_leaf whether to only predict the leaf index of each tree in a boosted tree predictor
+   */
+  virtual void Predict(DMatrix* data,
+                       bool output_margin,
+                       std::vector<float> *out_preds,
+                       unsigned ntree_limit = 0,
+                       bool pred_leaf = false) const = 0;
+  /*!
+   * \return whether the model allows lazy checkpoint in rabit.
+   */
+  bool AllowLazyCheckPoint() const;
+  /*!
+   * \brief dump the model in text format
+   * \param fmap feature map that may help give interpretations of feature
+   * \param option extra option of the dump model
+   * \return a vector of dump for boosters.
+   */
+  std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) const;
+  /*!
+   * \brief online prediction function, predict score for one instance at a time
+   *  NOTE: use the batch prediction interface if possible, batch prediction is usually
+   *        more efficient than online prediction
+   *        This function is NOT threadsafe, make sure you only call from one thread.
+   *
+   * \param inst the instance you want to predict
+   * \param output_margin whether to only predict margin value instead of transformed prediction
+   * \param out_preds output vector to hold the predictions
+   * \param ntree_limit limit the number of trees used in prediction
+   */
+  inline void Predict(const SparseBatch::Inst &inst,
+                      bool output_margin,
+                      std::vector<float> *out_preds,
+                      unsigned ntree_limit = 0) const;
+  /*!
+   * \brief Create a new instance of learner.
+   * \param cache_data The matrix to cache the prediction.
+   * \return Created learner.
+   */
+  static Learner* Create(const std::vector<DMatrix*>& cache_data);
+
+ protected:
+  /*! \brief internal base score of the model */
+  bst_float base_score_;
+  /*! \brief objective function */
+  std::unique_ptr<ObjFunction> obj_;
+  /*! \brief The gradient booster used by the model */
+  std::unique_ptr<GradientBooster> gbm_;
+  /*! \brief The evaluation metrics used to evaluate the model. */
+  std::vector<std::unique_ptr<Metric> > metrics_;
+};
+
+// Online (single-instance) prediction: booster margin first, then the optional transform.
+inline void Learner::Predict(const SparseBatch::Inst& inst,
+                             bool output_margin,
+                             std::vector<float>* out_preds,
+                             unsigned ntree_limit) const {
+  gbm_->Predict(inst, out_preds, ntree_limit);
+  std::vector<float>& preds = *out_preds;
+  // the base score is folded in only when there is exactly one output value.
+  if (preds.size() == 1) preds.front() += base_score_;
+  // callers asking for the raw margin skip the objective's transform.
+  if (output_margin) return;
+  obj_->PredTransform(out_preds);
+}
+
+// Iterator-range overload: materialize [begin, end) and hand it to the vector overload.
+template<typename PairIter>
+inline void Learner::Configure(PairIter begin, PairIter end) {
+  typedef std::vector<std::pair<std::string, std::string> > ConfigList;
+  this->Configure(ConfigList(begin, end));
+}
+
+}  // namespace xgboost
+#endif  // XGBOOST_LEARNER_H_
diff --git a/include/xgboost/logging.h b/include/xgboost/logging.h
new file mode 100644
index 000000000..03887fb61
--- /dev/null
+++ b/include/xgboost/logging.h
@@ -0,0 +1,50 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file logging.h
+ * \brief defines console logging options for xgboost.
+ *  Use to enforce unified print behavior.
+ *  For debug loggers, use LOG(INFO) and LOG(ERROR).
+ */
+#ifndef XGBOOST_LOGGING_H_
+#define XGBOOST_LOGGING_H_
+
+#include <dmlc/logging.h>
+#include <sstream>
+#include "./base.h"
+
+namespace xgboost {
+
+class BaseLogger {  // common base: holds the buffer one log statement streams into.
+ public:
+  BaseLogger() {  // adds a date prefix when XGBOOST_LOG_WITH_TIME is enabled.
+#if XGBOOST_LOG_WITH_TIME
+    log_stream_ << "[" << dmlc::DateLogger().HumanDate() << "] ";
+#endif
+  }
+  std::ostream& stream() { return log_stream_; }  // stream that LOG(...) text is written to.
+
+ protected:
+  std::ostringstream log_stream_;  // accumulated message text.
+};
+
+class ConsoleLogger : public BaseLogger {  // temporary object created by LOG(CONSOLE).
+ public:
+  ~ConsoleLogger();  // declared only; NOTE(review): presumably emits the buffered message.
+};
+
+class TrackerLogger : public BaseLogger {  // temporary object created by LOG(TRACKER).
+ public:
+  ~TrackerLogger();  // declared only; NOTE(review): presumably emits the buffered message.
+};
+
+// define the logging macro only if it does not already exist
+#ifndef LOG
+#define LOG(severity) LOG_##severity.stream()
+#endif
+
+// Enable LOG(CONSOLE) for print messages to console.
+#define LOG_CONSOLE ::xgboost::ConsoleLogger()
+// Enable LOG(TRACKER) for print messages to tracker
+#define LOG_TRACKER ::xgboost::TrackerLogger()
+}  // namespace xgboost.
+#endif  // XGBOOST_LOGGING_H_
diff --git a/include/xgboost/metric.h b/include/xgboost/metric.h
new file mode 100644
index 000000000..853065cd0
--- /dev/null
+++ b/include/xgboost/metric.h
@@ -0,0 +1,76 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file metric.h
+ * \brief interface of evaluation metric function supported in xgboost.
+ * \author Tianqi Chen, Kailong Chen
+ */
+#ifndef XGBOOST_METRIC_H_
+#define XGBOOST_METRIC_H_
+
+#include <dmlc/registry.h>
+#include <vector>
+#include <string>
+#include <functional>
+#include "./data.h"
+#include "./base.h"
+
+namespace xgboost {
+/*!
+ * \brief interface of evaluation metric used to evaluate model performance.
+ *  This has nothing to do with training; it serves evaluation purposes only.
+ */
+class Metric {
+ public:
+  /*!
+   * \brief evaluate a specific metric
+   * \param preds prediction
+   * \param info information, including label etc.
+   * \param distributed whether a call to Allreduce is needed to gather the average
+   *        statistics across all the nodes; this is only supported by some metrics
+   * \return the evaluated metric value
+   */
+  virtual float Eval(const std::vector<float>& preds,
+                     const MetaInfo& info,
+                     bool distributed) const = 0;
+  /*! \return name of metric */
+  virtual const char* Name() const = 0;
+  /*! \brief virtual destructor */
+  virtual ~Metric() {}
+  /*!
+   * \brief create a metric according to name.
+   * \param name name of the metric.
+   *  name can be in form metric[@]param
+   *  and the name will be matched in the registry.
+   * \return the created metric.
+   */
+  static Metric* Create(const std::string& name);
+};
+
+/*!
+ * \brief Registry entry for Metric factory functions.
+ *  The additional parameter const char* param gives the value after @, can be null.
+ *  For example, metric map@3, then: param == "3".
+ */
+struct MetricReg
+    : public dmlc::FunctionRegEntryBase<MetricReg,
+                                        std::function<Metric* (const char*)> > {
+};
+
+/*!
+ * \brief Macro to register metric.
+ *  The entry is a file-local (static) reference, matching the other registration macros.
+ * \code
+ * // example of registering a metric ndcg@k
+ * XGBOOST_REGISTER_METRIC(NDCG, "ndcg")
+ * .describe("Normalized discounted cumulative gain at k.")
+ * .set_body([](const char* param) {
+ *     int at_k = atoi(param);
+ *     return new NDCG(at_k);
+ *   });
+ * \endcode
+ */
+#define XGBOOST_REGISTER_METRIC(UniqueId, Name)                         \
+  static ::xgboost::MetricReg & __make_ ## MetricReg ## _ ## UniqueId ## __ = \
+      ::dmlc::Registry< ::xgboost::MetricReg>::Get()->__REGISTER__(Name)
+}  // namespace xgboost
+#endif  // XGBOOST_METRIC_H_
diff --git a/include/xgboost/objective.h b/include/xgboost/objective.h
new file mode 100644
index 000000000..732644dd5
--- /dev/null
+++ b/include/xgboost/objective.h
@@ -0,0 +1,111 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file objective.h
+ * \brief interface of objective function used by xgboost.
+ * \author Tianqi Chen, Kailong Chen
+ */
+#ifndef XGBOOST_OBJECTIVE_H_
+#define XGBOOST_OBJECTIVE_H_
+
+#include <dmlc/registry.h>
+#include <vector>
+#include <utility>
+#include <string>
+#include <functional>
+#include "./data.h"
+#include "./base.h"
+
+namespace xgboost {
+/*! \brief interface of objective function */
+class ObjFunction {
+ public:
+  /*! \brief virtual destructor */
+  virtual ~ObjFunction() {}
+  /*!
+   * \brief set configuration from pair iterators.
+   * \param begin The beginning iterator.
+   * \param end The end iterator.
+   * \tparam PairIter iterator<std::pair<std::string, std::string> >
+   */
+  template<typename PairIter>
+  inline void Configure(PairIter begin, PairIter end);
+  /*!
+   * \brief Configure the objective with the specified parameters.
+   * \param args arguments to the objective function.
+   */
+  virtual void Configure(const std::vector<std::pair<std::string, std::string> >& args) = 0;
+  /*!
+   * \brief Get gradient over each of predictions, given existing information.
+   * \param preds prediction of current round
+   * \param info information about labels, weights, groups in rank
+   * \param iteration current iteration number.
+   * \param out_gpair output vector that receives the gradient and second order gradient
+   */
+  virtual void GetGradient(const std::vector<float>& preds,
+                           const MetaInfo& info,
+                           int iteration,
+                           std::vector<bst_gpair>* out_gpair) = 0;
+  /*! \return the default evaluation metric for the objective */
+  virtual const char* DefaultEvalMetric() const = 0;
+  // the following functions are optional; most of the time the default implementation suffices
+  /*!
+   * \brief transform prediction values, this is only called when Prediction is called
+   * \param io_preds prediction values, saves to this vector as well
+   */
+  virtual void PredTransform(std::vector<float> *io_preds) {}
+  /*!
+   * \brief transform prediction values, this is only called when Eval is called,
+   *  by default it redirects to PredTransform
+   * \param io_preds prediction values, saves to this vector as well
+   */
+  virtual void EvalTransform(std::vector<float> *io_preds) {
+    this->PredTransform(io_preds);
+  }
+  /*!
+   * \brief transform probability value back to margin, this is used to transform
+   *  user-set base_score back to margin used by gradient boosting
+   * \param base_score the value to transform; the default implementation returns it unchanged
+   * \return transformed value
+   */
+  virtual float ProbToMargin(float base_score) const {
+    return base_score;
+  }
+  /*!
+   * \brief Create an objective function according to name.
+   * \param name Name of the objective.
+   */
+  static ObjFunction* Create(const std::string& name);
+};
+
+// Iterator-range overload: materialize [begin, end) and hand it to the vector overload.
+template<typename PairIter>
+inline void ObjFunction::Configure(PairIter begin, PairIter end) {
+  typedef std::vector<std::pair<std::string, std::string> > ArgList;
+  this->Configure(ArgList(begin, end));
+}
+
+/*!
+ * \brief Registry entry for objective factory functions.
+ */
+struct ObjFunctionReg
+    : public dmlc::FunctionRegEntryBase<ObjFunctionReg,
+                                        std::function<ObjFunction* ()> > {
+};
+
+/*!
+ * \brief Macro to register objective function.
+ *
+ * \code
+ * // example of registering an objective
+ * XGBOOST_REGISTER_OBJECTIVE(LinearRegression, "reg:linear")
+ * .describe("Linear regression objective")
+ * .set_body([]() {
+ *     return new RegLossObj(LossType::kLinearSquare);
+ *   });
+ * \endcode
+ */
+#define XGBOOST_REGISTER_OBJECTIVE(UniqueId, Name)                      \
+  static ::xgboost::ObjFunctionReg & __make_ ## ObjFunctionReg ## _ ## UniqueId ## __ = \
+      ::dmlc::Registry< ::xgboost::ObjFunctionReg>::Get()->__REGISTER__(Name)
+}  // namespace xgboost
+#endif  // XGBOOST_OBJECTIVE_H_
diff --git a/src/tree/model.h b/include/xgboost/tree_model.h
similarity index 50%
rename from src/tree/model.h
rename to include/xgboost/tree_model.h
index 6f2479cc2..168d1e936 100644
--- a/src/tree/model.h
+++ b/include/xgboost/tree_model.h
@@ -1,25 +1,66 @@
 /*!
  * Copyright 2014 by Contributors
- * \file model.h
+ * \file tree_model.h
  * \brief model structure for tree
  * \author Tianqi Chen
  */
 #ifndef XGBOOST_TREE_MODEL_H_
 #define XGBOOST_TREE_MODEL_H_
 
+#include <dmlc/io.h>
+#include <dmlc/parameter.h>
+#include <limits>
+#include <vector>
 #include <string>
 #include <cstring>
-#include <sstream>
-#include <limits>
 #include <algorithm>
-#include <vector>
-#include <cmath>
-#include "../utils/io.h"
-#include "../utils/fmap.h"
-#include "../utils/utils.h"
+#include "./base.h"
+#include "./data.h"
+#include "./logging.h"
+#include "./feature_map.h"
 
 namespace xgboost {
-namespace tree {
+
+/*! \brief meta parameters of the tree; TreeModel::Load/Save read and write it as raw bytes */
+struct TreeParam : public dmlc::Parameter<TreeParam> {
+  /*! \brief number of start roots */
+  int num_roots;
+  /*! \brief total number of nodes */
+  int num_nodes;
+  /*! \brief number of deleted nodes */
+  int num_deleted;
+  /*! \brief maximum depth, this is a statistic of the tree */
+  int max_depth;
+  /*! \brief number of features used for tree construction */
+  int num_feature;
+  /*!
+   * \brief leaf vector size, used for vector tree
+   * used to store more than one dimensional information in tree
+   */
+  int size_leaf_vector;
+  /*! \brief reserved part, make sure alignment works for 64bit */
+  int reserved[31];
+  /*! \brief constructor: zeroes every field, then starts from one root and one node */
+  TreeParam() {
+    // assert compact layout: exactly 6 int fields plus 31 reserved ints, no padding
+    static_assert(sizeof(TreeParam) == (31 + 6) * sizeof(int),
+                  "TreeParam: 64 bit align");
+    std::memset(this, 0, sizeof(TreeParam));  // also clears the reserved words
+    num_nodes = num_roots = 1;
+  }
+  // declare the parameters
+  DMLC_DECLARE_PARAMETER(TreeParam) {
+    // only declare the parameters that can be set by the user.
+    // other fields (num_nodes, num_deleted, max_depth) are set by the algorithm.
+    DMLC_DECLARE_FIELD(num_roots).set_lower_bound(1).set_default(1)
+        .describe("Number of start root of trees.");
+    DMLC_DECLARE_FIELD(num_feature)
+        .describe("Number of features used in tree construction.");
+    DMLC_DECLARE_FIELD(size_leaf_vector).set_lower_bound(0).set_default(0)
+        .describe("Size of leaf vector, reserved for vector tree");
+  }
+};
+
 /*!
  * \brief template class of TreeModel
  * \tparam TSplitCond data type to indicate split condition
@@ -32,98 +73,65 @@ class TreeModel {
   typedef TNodeStat  NodeStat;
   /*! \brief auxiliary statistics of node to help tree building */
   typedef TSplitCond SplitCond;
-  /*! \brief parameters of the tree */
-  struct Param{
-    /*! \brief number of start root */
-    int num_roots;
-    /*! \brief total number of nodes */
-    int num_nodes;
-    /*!\brief number of deleted nodes */
-    int num_deleted;
-    /*! \brief maximum depth, this is a statistics of the tree */
-    int max_depth;
-    /*! \brief  number of features used for tree construction */
-    int num_feature;
-    /*!
-     * \brief leaf vector size, used for vector tree
-     * used to store more than one dimensional information in tree
-     */
-    int size_leaf_vector;
-    /*! \brief reserved part */
-    int reserved[31];
-    /*! \brief constructor */
-    Param(void) {
-      max_depth = 0;
-      size_leaf_vector = 0;
-      std::memset(reserved, 0, sizeof(reserved));
-    }
-    /*!
-     * \brief set parameters from outside
-     * \param name name of the parameter
-     * \param val  value of the parameter
-     */
-    inline void SetParam(const char *name, const char *val) {
-      using namespace std;
-      if (!strcmp("num_roots", name)) num_roots = atoi(val);
-      if (!strcmp("num_feature", name)) num_feature = atoi(val);
-      if (!strcmp("size_leaf_vector", name)) size_leaf_vector = atoi(val);
-    }
-  };
   /*! \brief tree node */
   class Node {
    public:
-    Node(void) : sindex_(0) {}
+    Node() : sindex_(0) {
+      // assert compact alignment
+      static_assert(sizeof(Node) == 4 * sizeof(int) + sizeof(Info),
+                    "Node: 64 bit align");
+    }
     /*! \brief index of left child */
-    inline int cleft(void) const {
+    inline int cleft() const {
       return this->cleft_;
     }
     /*! \brief index of right child */
-    inline int cright(void) const {
+    inline int cright() const {
       return this->cright_;
     }
     /*! \brief index of default child when feature is missing */
-    inline int cdefault(void) const {
+    inline int cdefault() const {
       return this->default_left() ? this->cleft() : this->cright();
     }
     /*! \brief feature index of split condition */
-    inline unsigned split_index(void) const {
+    inline unsigned split_index() const {
       return sindex_ & ((1U << 31) - 1U);
     }
     /*! \brief when feature is unknown, whether goes to left child */
-    inline bool default_left(void) const {
+    inline bool default_left() const {
       return (sindex_ >> 31) != 0;
     }
     /*! \brief whether current node is leaf node */
-    inline bool is_leaf(void) const {
+    inline bool is_leaf() const {
       return cleft_ == -1;
     }
-    /*! \brief get leaf value of leaf node */
-    inline float leaf_value(void) const {
+    /*! \return leaf value of the leaf node */
+    inline float leaf_value() const {
       return (this->info_).leaf_value;
     }
-    /*! \brief get split condition of the node */
-    inline TSplitCond split_cond(void) const {
+    /*! \return split condition of the node */
+    inline TSplitCond split_cond() const {
       return (this->info_).split_cond;
     }
     /*! \brief get parent of the node */
-    inline int parent(void) const {
+    inline int parent() const {
       return parent_ & ((1U << 31) - 1);
     }
     /*! \brief whether current node is left child */
-    inline bool is_left_child(void) const {
+    inline bool is_left_child() const {
       return (parent_ & (1U << 31)) != 0;
     }
     /*! \brief whether this node is deleted */
-    inline bool is_deleted(void) const {
+    inline bool is_deleted() const {
       return sindex_ == std::numeric_limits<unsigned>::max();
     }
     /*! \brief whether current node is root */
-    inline bool is_root(void) const {
+    inline bool is_root() const {
       return parent_ == -1;
     }
     /*!
      * \brief set the right child
-     * \param nide node id to right child
+     * \param nid node id to right child
      */
     inline void set_right_child(int nid) {
       this->cright_ = nid;
@@ -152,7 +160,7 @@ class TreeModel {
       this->cright_ = right;
     }
     /*! \brief mark that this node is deleted */
-    inline void mark_delete(void) {
+    inline void mark_delete() {
       this->sindex_ = std::numeric_limits<unsigned>::max();
     }
 
@@ -193,7 +201,7 @@ class TreeModel {
   std::vector<bst_float> leaf_vector;
   // allocate a new node,
   // !!!!!! NOTE: may cause BUG here, nodes.resize
-  inline int AllocNode(void) {
+  inline int AllocNode() {
     if (param.num_deleted != 0) {
       int nd = deleted_nodes.back();
       deleted_nodes.pop_back();
@@ -201,8 +209,8 @@ class TreeModel {
       return nd;
     }
     int nd = param.num_nodes++;
-    utils::Check(param.num_nodes < std::numeric_limits<int>::max(),
-                 "number of nodes in the tree exceed 2^31");
+    CHECK_LT(param.num_nodes, std::numeric_limits<int>::max())
+        << "number of nodes in the tree exceed 2^31";
     nodes.resize(param.num_nodes);
     stats.resize(param.num_nodes);
     leaf_vector.resize(param.num_nodes * param.size_leaf_vector);
@@ -210,7 +218,7 @@ class TreeModel {
   }
   // delete a tree node, keep the parent field to allow trace back
   inline void DeleteNode(int nid) {
-    utils::Assert(nid >= param.num_roots, "can not delete root");
+    CHECK_GE(nid, param.num_roots);
     deleted_nodes.push_back(nid);
     nodes[nid].mark_delete();
     ++param.num_deleted;
@@ -220,13 +228,11 @@ class TreeModel {
   /*!
    * \brief change a non leaf node to a leaf node, delete its children
    * \param rid node id of the node
-   * \param new leaf value
+   * \param value new leaf value
    */
   inline void ChangeToLeaf(int rid, float value) {
-    utils::Assert(nodes[nodes[rid].cleft() ].is_leaf(),
-                  "can not delete a non termial child");
-    utils::Assert(nodes[nodes[rid].cright()].is_leaf(),
-                  "can not delete a non termial child");
+    CHECK(nodes[nodes[rid].cleft() ].is_leaf());
+    CHECK(nodes[nodes[rid].cright()].is_leaf());
     this->DeleteNode(nodes[rid].cleft());
     this->DeleteNode(nodes[rid].cright());
     nodes[rid].set_leaf(value);
@@ -234,7 +240,7 @@ class TreeModel {
   /*!
    * \brief collapse a non leaf node to a leaf node, delete its children
    * \param rid node id of the node
-   * \param new leaf value
+   * \param value new leaf value
    */
   inline void CollapseToLeaf(int rid, float value) {
     if (nodes[rid].is_leaf()) return;
@@ -249,38 +255,42 @@ class TreeModel {
 
  public:
   /*! \brief model parameter */
-  Param param;
+  TreeParam param;
   /*! \brief constructor */
-  TreeModel(void) {
+  TreeModel() {
     param.num_nodes = 1;
     param.num_roots = 1;
     param.num_deleted = 0;
     nodes.resize(1);
   }
   /*! \brief get node given nid */
-  inline Node &operator[](int nid) {
+  inline Node& operator[](int nid) {
     return nodes[nid];
   }
   /*! \brief get node given nid */
-  inline const Node &operator[](int nid) const {
+  inline const Node& operator[](int nid) const {
     return nodes[nid];
   }
   /*! \brief get node statistics given nid */
-  inline NodeStat &stat(int nid) {
+  inline NodeStat& stat(int nid) {
+    return stats[nid];
+  }
+  /*! \brief get node statistics given nid */
+  inline const NodeStat& stat(int nid) const {
     return stats[nid];
   }
   /*! \brief get leaf vector given nid */
   inline bst_float* leafvec(int nid) {
-    if (leaf_vector.size() == 0) return NULL;
-    return &leaf_vector[nid * param.size_leaf_vector];
+    if (leaf_vector.size() == 0) return nullptr;
+    return& leaf_vector[nid * param.size_leaf_vector];
   }
   /*! \brief get leaf vector given nid */
   inline const bst_float* leafvec(int nid) const {
-    if (leaf_vector.size() == 0) return NULL;
-    return &leaf_vector[nid * param.size_leaf_vector];
+    if (leaf_vector.size() == 0) return nullptr;
+    return& leaf_vector[nid * param.size_leaf_vector];
   }
   /*! \brief initialize the model */
-  inline void InitModel(void) {
+  inline void InitModel() {
     param.num_nodes = param.num_roots;
     nodes.resize(param.num_nodes);
     stats.resize(param.num_nodes);
@@ -294,41 +304,37 @@ class TreeModel {
    * \brief load model from stream
    * \param fi input stream
    */
-  inline void LoadModel(utils::IStream &fi) { // NOLINT(*)
-    utils::Check(fi.Read(&param, sizeof(Param)) > 0,
-                 "TreeModel: wrong format");
-    nodes.resize(param.num_nodes); stats.resize(param.num_nodes);
-    utils::Assert(param.num_nodes != 0, "invalid model");
-    utils::Check(fi.Read(BeginPtr(nodes), sizeof(Node) * nodes.size()) > 0,
-                 "TreeModel: wrong format");
-    utils::Check(fi.Read(BeginPtr(stats), sizeof(NodeStat) * stats.size()) > 0,
-                 "TreeModel: wrong format");
+  inline void Load(dmlc::Stream* fi) {
+    CHECK_EQ(fi->Read(&param, sizeof(TreeParam)), sizeof(TreeParam));
+    nodes.resize(param.num_nodes);
+    stats.resize(param.num_nodes);
+    CHECK_NE(param.num_nodes, 0);
+    CHECK_EQ(fi->Read(dmlc::BeginPtr(nodes), sizeof(Node) * nodes.size()),
+             sizeof(Node) * nodes.size());
+    CHECK_EQ(fi->Read(dmlc::BeginPtr(stats), sizeof(NodeStat) * stats.size()),
+             sizeof(NodeStat) * stats.size());
     if (param.size_leaf_vector != 0) {
-      utils::Check(fi.Read(&leaf_vector), "TreeModel: wrong format");
+      CHECK(fi->Read(&leaf_vector));
     }
     // chg deleted nodes
     deleted_nodes.resize(0);
     for (int i = param.num_roots; i < param.num_nodes; ++i) {
       if (nodes[i].is_deleted()) deleted_nodes.push_back(i);
     }
-    utils::Assert(static_cast<int>(deleted_nodes.size()) == param.num_deleted,
-                  "number of deleted nodes do not match, num_deleted=%d, dnsize=%lu, num_nodes=%d",
-                  param.num_deleted, deleted_nodes.size(), param.num_nodes);
+    CHECK_EQ(static_cast<int>(deleted_nodes.size()), param.num_deleted);
   }
   /*!
    * \brief save model to stream
    * \param fo output stream
    */
-  inline void SaveModel(utils::IStream &fo) const { // NOLINT(*)
-    utils::Assert(param.num_nodes == static_cast<int>(nodes.size()),
-                  "TreeModel::SaveModel");
-    utils::Assert(param.num_nodes == static_cast<int>(stats.size()),
-                  "TreeModel::SaveModel");
-    fo.Write(&param, sizeof(Param));
-    utils::Assert(param.num_nodes != 0, "invalid model");
-    fo.Write(BeginPtr(nodes), sizeof(Node) * nodes.size());
-    fo.Write(BeginPtr(stats), sizeof(NodeStat) * nodes.size());
-    if (param.size_leaf_vector != 0) fo.Write(leaf_vector);
+  inline void Save(dmlc::Stream* fo) const {
+    CHECK_NE(param.num_nodes, 0);  // validate everything before the first write.
+    CHECK_EQ(param.num_nodes, static_cast<int>(nodes.size()));
+    CHECK_EQ(param.num_nodes, static_cast<int>(stats.size()));
+    fo->Write(&param, sizeof(TreeParam));  // raw header, read back by Load.
+    fo->Write(dmlc::BeginPtr(nodes), sizeof(Node) * nodes.size());
+    fo->Write(dmlc::BeginPtr(stats), sizeof(NodeStat) * stats.size());  // sized by stats.
+    if (param.size_leaf_vector != 0) fo->Write(leaf_vector);  // only written for vector trees.
+  }
   /*!
    * \brief add child nodes to node
@@ -344,7 +350,7 @@ class TreeModel {
   }
   /*!
    * \brief only add a right child to a leaf node
-   * \param node id to add right child
+   * \param nid node id to add right child
    */
   inline void AddRightChild(int nid) {
     int pright = this->AllocNode();
@@ -376,7 +382,7 @@ class TreeModel {
   /*!
    * \brief get maximum depth
    */
-  inline int MaxDepth(void) {
+  inline int MaxDepth() {
     int maxd = 0;
     for (int i = 0; i < param.num_roots; ++i) {
       maxd = std::max(maxd, MaxDepth(i));
@@ -384,80 +390,9 @@ class TreeModel {
     return maxd;
   }
   /*! \brief number of extra nodes besides the root */
-  inline int num_extra_nodes(void) const {
+  inline int num_extra_nodes() const {
     return param.num_nodes - param.num_roots - param.num_deleted;
   }
-  /*!
-   * \brief dump model to text string
-   * \param fmap feature map of feature types
-   * \param with_stats whether dump out statistics as well
-   * \return the string of dumped model
-   */
-  inline std::string DumpModel(const utils::FeatMap& fmap, bool with_stats) {
-    std::stringstream fo("");
-    for (int i = 0; i < param.num_roots; ++i) {
-      this->Dump(i, fo, fmap, 0, with_stats);
-    }
-    return fo.str();
-  }
-
- private:
-  void Dump(int nid, std::stringstream &fo, // NOLINT(*)
-            const utils::FeatMap& fmap, int depth, bool with_stats) {
-    for (int i = 0;  i < depth; ++i) {
-      fo << '\t';
-    }
-    if (nodes[nid].is_leaf()) {
-      fo << nid << ":leaf=" << nodes[nid].leaf_value();
-      if (with_stats) {
-        stat(nid).Print(fo, true);
-      }
-      fo << '\n';
-    } else {
-      // right then left,
-      TSplitCond cond = nodes[nid].split_cond();
-      const unsigned split_index = nodes[nid].split_index();
-      if (split_index < fmap.size()) {
-        switch (fmap.type(split_index)) {
-          case utils::FeatMap::kIndicator: {
-            int nyes = nodes[nid].default_left() ?
-                nodes[nid].cright() : nodes[nid].cleft();
-            fo << nid << ":[" << fmap.name(split_index) << "] yes=" << nyes
-               << ",no=" << nodes[nid].cdefault();
-            break;
-          }
-          case utils::FeatMap::kInteger: {
-            fo << nid << ":[" << fmap.name(split_index) << "<"
-               << int(float(cond)+1.0f)
-               << "] yes=" << nodes[nid].cleft()
-               << ",no=" << nodes[nid].cright()
-               << ",missing=" << nodes[nid].cdefault();
-            break;
-          }
-          case utils::FeatMap::kFloat:
-          case utils::FeatMap::kQuantitive: {
-            fo << nid << ":[" << fmap.name(split_index) << "<"<< float(cond)
-               << "] yes=" << nodes[nid].cleft()
-               << ",no=" << nodes[nid].cright()
-               << ",missing=" << nodes[nid].cdefault();
-            break;
-          }
-          default: utils::Error("unknown fmap type");
-        }
-      } else {
-        fo << nid << ":[f" << split_index << "<"<< float(cond)
-           << "] yes=" << nodes[nid].cleft()
-           << ",no=" << nodes[nid].cright()
-           << ",missing=" << nodes[nid].cdefault();
-      }
-      if (with_stats) {
-        stat(nid).Print(fo, false);
-      }
-      fo << '\n';
-      this->Dump(nodes[nid].cleft(), fo, fmap, depth+1, with_stats);
-      this->Dump(nodes[nid].cright(), fo, fmap, depth+1, with_stats);
-    }
-  }
 };
 
 /*! \brief node statistics used in regression tree */
@@ -469,63 +404,59 @@ struct RTreeNodeStat {
   /*! \brief weight of current node */
   float base_weight;
   /*! \brief number of child that is leaf node known up to now */
-  int   leaf_child_cnt;
-  /*! \brief print information of current stats to fo */
-  inline void Print(std::stringstream &fo, bool is_leaf) const { // NOLINT(*)
-    if (!is_leaf) {
-      fo << ",gain=" << loss_chg << ",cover=" << sum_hess;
-    } else {
-      fo << ",cover=" << sum_hess;
-    }
-  }
+  int leaf_child_cnt;
 };
 
-/*! \brief define regression tree to be the most common tree model */
-class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
+/*!
+ * \brief define regression tree to be the most common tree model.
+ *  This is the data structure used in xgboost's major tree models.
+ */
+class RegTree: public TreeModel<bst_float, RTreeNodeStat> {
  public:
   /*!
    * \brief dense feature vector that can be taken by RegTree
-   * to do traverse efficiently
-   * and can be construct from sparse feature vector
+   * and can be constructed from a sparse feature vector.
    */
   struct FVec {
+   public:
+    /*!
+     * \brief initialize the vector to the given size, with every entry marked missing
+     * \param size The size of the feature vector.
+     */
+    inline void Init(size_t size);
+    /*!
+     * \brief fill the vector with sparse vector
+     * \param inst The sparse instance to fill.
+     */
+    inline void Fill(const RowBatch::Inst& inst);
+    /*!
+     * \brief drop the trace after fill, must be called after fill.
+     * \param inst The sparse instance to drop.
+     */
+    inline void Drop(const RowBatch::Inst& inst);
+    /*!
+     * \brief get ith value
+     * \param i feature index.
+     * \return the i-th feature value
+     */
+    inline float fvalue(size_t i) const;
+    /*!
+     * \brief check whether i-th entry is missing
+     * \param i feature index.
+     * \return whether i-th value is missing.
+     */
+    inline bool is_missing(size_t i) const;
+
+   private:
     /*!
      * \brief a union value of value and flag
-     * when flag == -1, this indicate the value is missing
+     *  when flag == -1, this indicate the value is missing
      */
-    union Entry{
+    union Entry {
       float fvalue;
       int flag;
     };
     std::vector<Entry> data;
-    /*! \brief initialize the vector with size vector */
-    inline void Init(size_t size) {
-      Entry e; e.flag = -1;
-      data.resize(size);
-      std::fill(data.begin(), data.end(), e);
-    }
-    /*! \brief fill the vector with sparse vector */
-    inline void Fill(const RowBatch::Inst &inst) {
-      for (bst_uint i = 0; i < inst.length; ++i) {
-        if (inst[i].index >= data.size()) continue;
-        data[inst[i].index].fvalue = inst[i].fvalue;
-      }
-    }
-    /*! \brief drop the trace after fill, must be called after fill */
-    inline void Drop(const RowBatch::Inst &inst) {
-      for (bst_uint i = 0; i < inst.length; ++i) {
-        if (inst[i].index >= data.size()) continue;
-        data[inst[i].index].flag = -1;
-      }
-    }
-    /*! \brief get ith value */
-    inline float fvalue(size_t i) const {
-      return data[i].fvalue;
-    }
-    /*! \brief check whether i-th entry is missing */
-    inline bool is_missing(size_t i) const {
-      return data[i].flag == -1;
-    }
   };
   /*!
    * \brief get the leaf index
@@ -533,41 +464,86 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
    * \param root_id starting root index of the instance
    * \return the leaf index of the given feature
    */
-  inline int GetLeafIndex(const FVec &feat, unsigned root_id = 0) const {
-    // start from groups that belongs to current data
-    int pid = static_cast<int>(root_id);
-    // traverse tree
-    while (!(*this)[ pid ].is_leaf()) {
-      unsigned split_index = (*this)[pid].split_index();
-      pid = this->GetNext(pid, feat.fvalue(split_index), feat.is_missing(split_index));
-    }
-    return pid;
-  }
+  inline int GetLeafIndex(const FVec& feat, unsigned root_id = 0) const;
   /*!
    * \brief get the prediction of regression tree, only accepts dense feature vector
-   * \param feats dense feature vector, if the feature is missing the field is set to NaN
+   * \param feat dense feature vector, if the feature is missing the field is set to NaN
    * \param root_id starting root index of the instance
    * \return the leaf index of the given feature
    */
-  inline float Predict(const FVec &feat, unsigned root_id = 0) const {
-    int pid = this->GetLeafIndex(feat, root_id);
-    return (*this)[pid].leaf_value();
-  }
-  /*! \brief get next position of the tree given current pid */
-  inline int GetNext(int pid, float fvalue, bool is_unknown) const {
-    float split_value = (*this)[pid].split_cond();
-    if (is_unknown) {
-      return (*this)[pid].cdefault();
-    } else {
-      if (fvalue < split_value) {
-        return (*this)[pid].cleft();
-      } else {
-        return (*this)[pid].cright();
-      }
-    }
-  }
+  inline float Predict(const FVec& feat, unsigned root_id = 0) const;
+  /*!
+   * \brief get next position of the tree given current pid
+   * \param pid Current node id.
+   * \param fvalue feature value if not missing.
+   * \param is_unknown Whether current required feature is missing.
+   */
+  inline int GetNext(int pid, float fvalue, bool is_unknown) const;
+  /*!
+   * \brief dump model to text string
+   * \param fmap feature map of feature types
+   * \param with_stats whether dump out statistics as well
+   * \return the string of dumped model
+   */
+  std::string Dump2Text(const FeatureMap& fmap, bool with_stats) const;
 };
 
-}  // namespace tree
+// implementations of inline functions
+// do not need to read if only use the model
+inline void RegTree::FVec::Init(size_t size) {
+  // every slot starts out flagged as missing (flag == -1)
+  Entry missing;
+  missing.flag = -1;
+  data.assign(size, missing);
+}
+
+inline void RegTree::FVec::Fill(const RowBatch::Inst& inst) {
+  for (bst_uint k = 0; k < inst.length; ++k) {
+    const auto& entry = inst[k];  // entries past data.size() are skipped
+    if (entry.index < data.size()) data[entry.index].fvalue = entry.fvalue;
+  }
+}
+
+inline void RegTree::FVec::Drop(const RowBatch::Inst& inst) {
+  for (bst_uint k = 0; k < inst.length; ++k) {
+    const auto fid = inst[k].index;  // indices past data.size() are skipped
+    if (fid < data.size()) data[fid].flag = -1;
+  }
+}
+
+inline float RegTree::FVec::fvalue(size_t i) const {
+  return data[i].fvalue;  // unchecked access: i must be < data.size()
+}
+
+inline bool RegTree::FVec::is_missing(size_t i) const {
+  return data[i].flag == -1;  // flag shares storage with fvalue; -1 marks missing
+}
+
+inline int RegTree::GetLeafIndex(const RegTree::FVec& feat, unsigned root_id) const {
+  int nid = static_cast<int>(root_id);  // descend from here until a leaf is reached
+  while (!(*this)[nid].is_leaf()) {
+    const unsigned fid = (*this)[nid].split_index();
+    nid = this->GetNext(nid, feat.fvalue(fid), feat.is_missing(fid));
+  }
+  return nid;
+}
+
+inline float RegTree::Predict(const RegTree::FVec& feat, unsigned root_id) const {
+  // the prediction is the value stored on the leaf the instance falls into
+  return (*this)[this->GetLeafIndex(feat, root_id)].leaf_value();
+}
+
+/*!
+ * \brief choose the child of node pid an instance moves to next:
+ *  the default child when the feature is missing, otherwise the left child
+ *  for values strictly below the split condition and the right child if not.
+ */
+inline int RegTree::GetNext(int pid, float fvalue, bool is_unknown) const {
+  const auto& node = (*this)[pid];
+  const float split_value = node.split_cond();
+  // a missing feature follows the default direction stored on the node
+  if (is_unknown) return node.cdefault();
+  // note: a NaN fvalue fails the comparison and therefore goes right
+  return fvalue < split_value ? node.cleft() : node.cright();
+}
 }  // namespace xgboost
 #endif  // XGBOOST_TREE_MODEL_H_
diff --git a/include/xgboost/tree_updater.h b/include/xgboost/tree_updater.h
new file mode 100644
index 000000000..ecace6571
--- /dev/null
+++ b/include/xgboost/tree_updater.h
@@ -0,0 +1,85 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file tree_updater.h
+ * \brief General primitive for tree learning,
+ *   Updating a collection of trees given the information.
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_UPDATER_H_
+#define XGBOOST_TREE_UPDATER_H_
+
+#include <dmlc/registry.h>
+#include <vector>
+#include <utility>
+#include <string>
+#include "./base.h"
+#include "./data.h"
+#include "./tree_model.h"
+
+namespace xgboost {
+/*!
+ * \brief interface of tree update module, that performs update of a tree.
+ */
+class TreeUpdater {
+ public:
+  /*! \brief virtual destructor */
+  virtual ~TreeUpdater() {}
+  /*!
+   * \brief Initialize the updater with given arguments.
+   * \param args arguments to the tree updater, as (name, value) string pairs.
+   */
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& args) = 0;
+  /*!
+   * \brief perform update to the tree models
+   * \param gpair the gradient pair statistics of the data
+   * \param data The data matrix passed to the updater.
+   * \param trees references the trees to be updated, updater will change the content of trees
+   *   note: all the trees in the vector are updated, with the same statistics,
+   *         but maybe different random seeds, usually one tree is passed in at a time,
+   *         there can be multiple trees when we train random forest style model
+   */
+  virtual void Update(const std::vector<bst_gpair>& gpair,
+                      DMatrix* data,
+                      const std::vector<RegTree*>& trees) = 0;
+  /*!
+   * \brief performance hook: lets the caller reuse the leaf position of each
+   *  instance from the previously performed update, when the updater cached it.
+   *  The default implementation caches nothing and returns nullptr.
+   * \return array of leaf position of each instance in the last updated tree
+   */
+  virtual const int* GetLeafPosition() const {
+    return nullptr;
+  }
+  /*!
+   * \brief Create a tree updater given name
+   * \param name Name of the tree updater.
+   */
+  static TreeUpdater* Create(const std::string& name);
+};
+
+/*!
+ * \brief Registry entry for tree updater.
+ */
+struct TreeUpdaterReg
+    : public dmlc::FunctionRegEntryBase<TreeUpdaterReg,
+                                        std::function<TreeUpdater* ()> > {
+};
+
+/*!
+ * \brief Macro to register tree updater.
+ *
+ * \code
+ * // example of registering a tree updater named "colmaker"
+ * XGBOOST_REGISTER_TREE_UPDATER(ColMaker, "colmaker")
+ * .describe("Column based tree maker.")
+ * .set_body([]() {
+ *     return new ColMaker<TStats>();
+ *   });
+ * \endcode
+ */
+#define XGBOOST_REGISTER_TREE_UPDATER(UniqueId, Name)                   \
+  static ::xgboost::TreeUpdaterReg& __make_ ## TreeUpdaterReg ## _ ## UniqueId ## __ = \
+      ::dmlc::Registry< ::xgboost::TreeUpdaterReg>::Get()->__REGISTER__(Name)
+
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_H_
diff --git a/java/xgboost4j_wrapper.cpp b/java/xgboost4j_wrapper.cpp
index d8ba5fb9b..865426752 100644
--- a/java/xgboost4j_wrapper.cpp
+++ b/java/xgboost4j_wrapper.cpp
@@ -12,7 +12,7 @@
  limitations under the License.
  */
 
-#include "../wrapper/xgboost_wrapper.h"
+#include "xgboost/c_api.h"
 #include "xgboost4j_wrapper.h"
 #include <cstring>
 
diff --git a/make/config.mk b/make/config.mk
new file mode 100644
index 000000000..ff6844617
--- /dev/null
+++ b/make/config.mk
@@ -0,0 +1,54 @@
+#-----------------------------------------------------
+#  xgboost: the configuration compile script
+#
+#  If you want to change the configuration, please use the following
+#  steps. Assume you are on the root directory of xgboost.
+#  First copy this file so that any local changes will be ignored by git
+#
+#  $ cp make/config.mk .
+#
+#  Next modify the according entries, and then compile by
+#
+#  $ make
+#
+#  or build in parallel with 8 threads
+#
+#  $ make -j8
+#----------------------------------------------------
+
+# choice of compiler, by default use system preference.
+# export CC = gcc
+# export CXX = g++
+# export MPICXX = mpicxx
+
+# the additional link flags you want to add
+ADD_LDFLAGS =
+
+# the additional compile flags you want to add
+ADD_CFLAGS =
+
+# Whether enable openmp support, needed for multi-threading.
+USE_OPENMP = 1
+
+# whether use HDFS support during compile
+USE_HDFS = 0
+
+# whether use AWS S3 support during compile
+USE_S3 = 0
+
+# whether use Azure blob support during compile
+USE_AZURE = 0
+
+# Rabit library version,
+# - librabit.a Normal distributed version.
+# - librabit_empty.a Non distributed mock version,
+LIB_RABIT = librabit.a
+
+# path to libjvm.so
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# List of additional plugins, checkout plugin folder.
+# uncomment the following lines to include these plugins
+# you can also add your own plugin like this
+#
+# XGB_PLUGINS += plugin/example/plugin.mk
diff --git a/make/mingw64.mk b/make/mingw64.mk
new file mode 100644
index 000000000..e20220bfc
--- /dev/null
+++ b/make/mingw64.mk
@@ -0,0 +1,30 @@
+#-----------------------------------------------------------
+# xgboost: Configuration for MinGW(Windows 64bit)
+# This allows to compile xgboost on windows by using mingw.
+# You will need to install a MinGW toolchain.
+# g++-4.6 or later is required.
+#
+# see config.mk for template.
+#-----------------------------------------------------------
+export CXX=g++ -m64
+export CC=gcc -m64
+
+# Whether enable openmp support, needed for multi-threading.
+USE_OPENMP = 1
+
+# whether use HDFS support during compile
+USE_HDFS = 0
+
+# whether use AWS S3 support during compile
+USE_S3 = 0
+
+# whether use Azure blob support during compile
+USE_AZURE = 0
+
+# Rabit library version,
+# - librabit.a Normal distributed version.
+# - librabit_empty.a Non distributed mock version,
+LIB_RABIT = librabit_empty.a
+
+DMLC_CFLAGS = -DDMLC_ENABLE_STD_THREAD=0
+ADD_CFLAGS = -DDMLC_ENABLE_STD_THREAD=0
\ No newline at end of file
diff --git a/make/minimum.mk b/make/minimum.mk
new file mode 100644
index 000000000..f9bef57bd
--- /dev/null
+++ b/make/minimum.mk
@@ -0,0 +1,22 @@
+#-----------------------------------------------------
+# xgboost: minimum dependency configuration,
+# see config.mk for template.
+#----------------------------------------------------
+
+# Whether enable openmp support, needed for multi-threading.
+USE_OPENMP = 0
+
+# whether use HDFS support during compile
+USE_HDFS = 0
+
+# whether use AWS S3 support during compile
+USE_S3 = 0
+
+# whether use Azure blob support during compile
+USE_AZURE = 0
+
+# Rabit library version,
+# - librabit.a Normal distributed version.
+# - librabit_empty.a Non distributed mock version,
+LIB_RABIT = librabit_empty.a
+
diff --git a/make/minimum_parallel.mk b/make/minimum_parallel.mk
new file mode 100644
index 000000000..c41158aaf
--- /dev/null
+++ b/make/minimum_parallel.mk
@@ -0,0 +1,23 @@
+#------------------------------------------------------------------------
+# xgboost: minimum dependency configuration with Parallelization.
+# This configuration is standard but cannot run distributed computing.
+#
+# see config.mk for template.
+#------------------------------------------------------------------------
+
+# Whether enable openmp support, needed for multi-threading.
+USE_OPENMP = 1
+
+# whether use HDFS support during compile
+USE_HDFS = 0
+
+# whether use AWS S3 support during compile
+USE_S3 = 0
+
+# whether use Azure blob support during compile
+USE_AZURE = 0
+
+# Rabit library version,
+# - librabit.a Normal distributed version.
+# - librabit_empty.a Non distributed mock version,
+LIB_RABIT = librabit_empty.a
diff --git a/make/travis.mk b/make/travis.mk
new file mode 100644
index 000000000..82a9696bd
--- /dev/null
+++ b/make/travis.mk
@@ -0,0 +1,33 @@
+
+# the additional link flags you want to add
+ADD_LDFLAGS =
+
+# the additional compile flags you want to add
+ADD_CFLAGS =
+
+# Whether enable openmp support, needed for multi-threading.
+USE_OPENMP = 1
+
+# whether use HDFS support during compile
+USE_HDFS = 0
+
+# whether use AWS S3 support during compile
+USE_S3 = 0
+
+# whether use Azure blob support during compile
+USE_AZURE = 0
+
+# Rabit library version,
+# - librabit.a Normal distributed version.
+# - librabit_empty.a Non distributed mock version,
+LIB_RABIT = librabit.a
+
+# path to libjvm.so
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# List of additional plugins, checkout plugin folder.
+# uncomment the following lines to include these plugins
+# you can also add your own plugin like this
+#
+XGB_PLUGINS += plugin/example/plugin.mk
+XGB_PLUGINS += plugin/lz4/plugin.mk
diff --git a/multi-node/README.md b/multi-node/README.md
deleted file mode 100644
index 593a7d3c8..000000000
--- a/multi-node/README.md
+++ /dev/null
@@ -1,28 +0,0 @@
-Distributed XGBoost
-======
-Distributed XGBoost is now part of [Wormhole](https://github.com/dmlc/wormhole).
-Checkout this [Link](https://github.com/dmlc/wormhole/tree/master/learn/xgboost) for usage examples, build and job submissions.
-* The distributed version is built on Rabit:[Reliable Allreduce and Broadcast Library](https://github.com/dmlc/rabit)
-  - Rabit is a portable library that provides fault-tolerance for Allreduce calls for distributed machine learning  
-  - This makes xgboost portable and fault-tolerant against node failures
-
-Notes
-====
-* Rabit handles all the fault tolerant and communications efficiently, we only use platform specific command to start programs
-  - The Hadoop version does not rely on Mapreduce to do iterations
-  - You can expect xgboost not suffering the drawbacks of iterative MapReduce program
-* The design choice was made because Allreduce is very natural and efficient for distributed tree building
-  - In current version of xgboost, the distributed version is only adds several lines of Allreduce synchronization code
-* The multi-threading nature of xgboost is inheritated in distributed mode
-  - This means xgboost efficiently use all the threads in one machine, and communicates only between machines
-  - Remember to run on xgboost process per machine and this will give you maximum speedup
-* For more information about rabit and how it works, see the [Rabit's Tutorial](https://github.com/dmlc/rabit/tree/master/guide)
-
-Solvers
-=====
-* Column-based solver split data by column, each node work on subset of columns, 
-  it uses exactly the same algorithm as single node version.
-* Row-based solver split data by row, each node work on subset of rows,
-  it uses an approximate histogram count algorithm, and will only examine subset of 
-  potential split points as opposed to all split points.
-  - This is the mode used by current hadoop version, since usually data was stored by rows in many industry system
diff --git a/multi-node/col-split/README.md b/multi-node/col-split/README.md
deleted file mode 100644
index 3ea0799fe..000000000
--- a/multi-node/col-split/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-Distributed XGBoost: Column Split Version
-====
-* run ```bash mushroom-col-rabit.sh <n-process>```
-  - mushroom-col-rabit.sh starts xgboost job using rabit's allreduce
-* run ```bash mushroom-col-rabit-mock.sh <n-process>```
-  - mushroom-col-rabit-mock.sh starts xgboost job using rabit's allreduce, inserts suicide signal at certain point and test recovery
-
-How to Use
-====
-* First split the data by column, 
-* In the config, specify data file as containing a wildcard %d, where %d is the rank of the node, each node will load their part of data
-* Enable column split mode by ```dsplit=col```
-
-Notes
-====
-* The code is multi-threaded, so you want to run one process per node
-* The code will work correctly as long as union of each column subset is all the columns we are interested in.
-  - The column subset can overlap with each other.
-* It uses exactly the same algorithm as single node version, to examine all potential split points.
diff --git a/multi-node/col-split/mushroom-col-rabit-mock.sh b/multi-node/col-split/mushroom-col-rabit-mock.sh
deleted file mode 100755
index b4208f04c..000000000
--- a/multi-node/col-split/mushroom-col-rabit-mock.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-if [[ $# -ne 1 ]]
-then
-    echo "Usage: nprocess"
-    exit -1
-fi
-
-#
-# This script is same as mushroom-col except that we will be using xgboost instead of xgboost-mpi
-# xgboost used built in tcp-based allreduce module, and can be run on more enviroment, so long as we know how to start job by modifying ../submit_job_tcp.py
-#
-rm -rf train.col* *.model
-k=$1
-
-# split the lib svm file into k subfiles
-python splitsvm.py ../../demo/data/agaricus.txt.train train $k
-
-# run xgboost mpi
-../../subtree/rabit/tracker/rabit_demo.py -n $k  ../../xgboost.mock mushroom-col.conf dsplit=col mock=0,2,0,0 mock=1,2,0,0 mock=2,2,8,0 mock=2,3,0,0
-
-# the model can be directly loaded by single machine xgboost solver, as usuall
-#../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
-
-
-#cat dump.nice.$k.txt
diff --git a/multi-node/col-split/mushroom-col-rabit.sh b/multi-node/col-split/mushroom-col-rabit.sh
deleted file mode 100755
index 77e0c904c..000000000
--- a/multi-node/col-split/mushroom-col-rabit.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-if [[ $# -ne 1 ]]
-then
-    echo "Usage: nprocess"
-    exit -1
-fi
-
-#
-# This script is same as mushroom-col except that we will be using xgboost instead of xgboost-mpi
-# xgboost used built in tcp-based allreduce module, and can be run on more enviroment, so long as we know how to start job by modifying ../submit_job_tcp.py
-#
-rm -rf train.col* *.model
-k=$1
-
-# split the lib svm file into k subfiles
-python splitsvm.py ../../demo/data/agaricus.txt.train train $k
-
-# run xgboost mpi
-../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col
-
-# the model can be directly loaded by single machine xgboost solver, as usuall
-../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt 
-
-# run for one round, and continue training
-../../subtree/rabit/tracker/rabit_demo.py -n $k  ../../xgboost mushroom-col.conf dsplit=col num_round=1
-../../subtree/rabit/tracker/rabit_demo.py -n $k  ../../xgboost mushroom-col.conf  mushroom-col.conf dsplit=col model_in=0001.model
-
-cat dump.nice.$k.txt
diff --git a/multi-node/col-split/mushroom-col.conf b/multi-node/col-split/mushroom-col.conf
deleted file mode 100644
index 2c779a44d..000000000
--- a/multi-node/col-split/mushroom-col.conf
+++ /dev/null
@@ -1,35 +0,0 @@
-# General Parameters, see comment for each definition
-# choose the booster, can be gbtree or gblinear
-booster = gbtree
-# choose logistic regression loss function for binary classification
-objective = binary:logistic
-
-# Tree Booster Parameters
-# step size shrinkage
-eta = 1.0 
-# minimum loss reduction required to make a further partition
-gamma = 1.0 
-# minimum sum of instance weight(hessian) needed in a child
-min_child_weight = 1 
-# maximum depth of a tree
-max_depth = 3 
-
-# Task Parameters
-# the number of round to do boosting
-num_round = 2
-# 0 means do not save any model except the final round model
-save_period = 0 
-use_buffer = 0
-
-# The path of training data %d is the wildcard for the rank of the data
-# The idea is each process take a feature matrix with subset of columns
-#
-data = "train.col%d" 
-
-# The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "../../demo/data/agaricus.txt.test" 
-# evaluate on training data as well each round
-eval_train = 1
-
-# The path of test data, need to use full data of test, try not use it, or keep an subsampled version
-test:data = "../../demo/data/agaricus.txt.test"      
diff --git a/multi-node/col-split/splitsvm.py b/multi-node/col-split/splitsvm.py
deleted file mode 100644
index 365aef610..000000000
--- a/multi-node/col-split/splitsvm.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/python
-import sys
-import random
-
-# split libsvm file into different subcolumns
-if len(sys.argv) < 4:
-    print ('Usage:<fin> <fo> k')
-    exit(0)
-
-random.seed(10)
-fmap = {}
-
-k = int(sys.argv[3])
-fi = open( sys.argv[1], 'r' )
-fos = []
-
-for i in range(k):
-    fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
-    
-for l in open(sys.argv[1]):
-    arr = l.split()
-    for f in fos:
-        f.write(arr[0])
-    for it in arr[1:]:
-        fid = int(it.split(':')[0])
-        if fid not in fmap:
-            fmap[fid] = random.randint(0, k-1)
-        fos[fmap[fid]].write(' '+it)
-    for f in fos:
-        f.write('\n')
-for f in fos:    
-    f.close()
diff --git a/plugin/README.md b/plugin/README.md
new file mode 100644
index 000000000..56d973fd3
--- /dev/null
+++ b/plugin/README.md
@@ -0,0 +1,32 @@
+XGBoost Plugins Modules
+=======================
+This folder contains plugin modules to xgboost that can be optionally installed.
+The plugin system helps us to extend xgboost with additional features,
+and add experimental features that may not yet be ready to be included in the main project.
+
+To include a certain plugin, say ```plugin_a```, you only need to add the following line to the config.mk.
+
+```makefile
+# Add plugin by include the plugin in config
+XGB_PLUGINS += plugin/plugin_a/plugin.mk
+```
+
+Then rebuild libxgboost by typing make, you can get a new library with the plugin enabled.
+
+Link Static XGBoost Library with Plugins
+----------------------------------------
+This problem only happens when you link ```libxgboost.a```.
+If you only use ```libxgboost.so```(this includes python and other bindings),
+you can ignore this section.
+
+When you want to link ```libxgboost.a``` with additional plugins included,
+you will need to enable whole-archive linking via the following option.
+```bash
+--whole-archive libxgboost.a --no-whole-archive
+```
+
+Write Your Own Plugin
+---------------------
+You can plug your own modules into xgboost by adding code to this folder,
+without modification to the main code repo.
+The [example](example) folder provides an example to write a plugin.
diff --git a/plugin/example/README.md b/plugin/example/README.md
new file mode 100644
index 000000000..f0ff5478c
--- /dev/null
+++ b/plugin/example/README.md
@@ -0,0 +1,21 @@
+XGBoost Plugin Example
+======================
+This folder provides an example of xgboost plugin.
+
+There are three steps you need to do to add a plugin to xgboost
+- Create your source .cc file, implement a new extension
+  - In this example [custom_obj.cc](custom_obj.cc)
+- Register this extension to xgboost via registration macro
+  - In this example ```XGBOOST_REGISTER_OBJECTIVE``` in [this line](custom_obj.cc#L75)
+- Create a [plugin.mk](plugin.mk) on this folder
+
+To add this plugin, add the following line to ```config.mk```(template in make/config.mk).
+```makefile
+# Add plugin by include the plugin in config
+XGB_PLUGINS += plugin/example/plugin.mk
+```
+
+Then you can test this plugin by using ```objective=mylogistic``` parameter.
+
+
+
diff --git a/plugin/example/custom_obj.cc b/plugin/example/custom_obj.cc
new file mode 100644
index 000000000..95384f21d
--- /dev/null
+++ b/plugin/example/custom_obj.cc
@@ -0,0 +1,80 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file custom_obj.cc
+ * \brief This is an example to define plugin of xgboost.
+ *  This plugin defines an additional objective function.
+ */
+#include <xgboost/base.h>
+#include <dmlc/parameter.h>
+#include <xgboost/objective.h>
+
+namespace xgboost {
+namespace obj {
+
+// This is a helpful data structure to define parameters
+// You do not have to use it.
+// see http://dmlc-core.readthedocs.org/en/latest/parameter.html
+// for introduction of this module.
+struct MyLogisticParam : public dmlc::Parameter<MyLogisticParam> {
+  float scale_neg_weight;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(MyLogisticParam) {
+    DMLC_DECLARE_FIELD(scale_neg_weight).set_default(1.0f).set_lower_bound(0.0f)
+        .describe("Scale the weight of negative examples by this factor");
+  }
+};
+
+DMLC_REGISTER_PARAMETER(MyLogisticParam);
+
+// Define a customized logistic regression objective in C++.
+// Implement the interface.
+class MyLogistic : public ObjFunction {
+ public:
+  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
+    param_.InitAllowUnknown(args);  // hands the key/value arguments to param_
+  }
+  // Writes one (gradient, hessian) pair per prediction into out_gpair.
+  void GetGradient(const std::vector<float> &preds,
+                   const MetaInfo &info,
+                   int iter,
+                   std::vector<bst_gpair> *out_gpair) override {
+    const size_t ndata = preds.size();
+    out_gpair->resize(ndata);
+    for (size_t i = 0; i < ndata; ++i) {
+      const auto label = info.labels[i];
+      float weight = info.GetWeight(i);
+      // negative examples (label exactly 0) have their weight rescaled
+      if (label == 0.0f) weight *= param_.scale_neg_weight;
+      // logistic transformation of the margin
+      const float prob = 1.0f / (1.0f + expf(-preds[i]));
+      // first order gradient, then second order gradient
+      out_gpair->at(i) = bst_gpair((prob - label) * weight,
+                                   prob * (1.0f - prob) * weight);
+    }
+  }
+  const char* DefaultEvalMetric() const override {
+    return "error";
+  }
+  // Maps every margin in io_preds to a probability, in place.
+  void PredTransform(std::vector<float> *io_preds) override {
+    for (float &margin : *io_preds) {
+      margin = 1.0f / (1.0f + expf(-margin));
+    }
+  }
+  // Inverse of the logistic transformation: probability -> margin.
+  float ProbToMargin(float base_score) const override {
+    return -std::log(1.0f / base_score - 1.0f);
+  }
+
+ private:
+  MyLogisticParam param_;
+};
+
+// Finally register the objective function.
+// After it succeeds you can try use xgboost with objective=mylogistic
+XGBOOST_REGISTER_OBJECTIVE(MyLogistic, "mylogistic")
+.describe("User defined logistic regression plugin")
+.set_body([]() { return new MyLogistic(); });
+
+}  // namespace obj
+}  // namespace xgboost
diff --git a/plugin/example/plugin.mk b/plugin/example/plugin.mk
new file mode 100644
index 000000000..8ebd26d61
--- /dev/null
+++ b/plugin/example/plugin.mk
@@ -0,0 +1,4 @@
+# Add the object files you like to include in this plugin.
+PLUGIN_OBJS += build_plugin/example/custom_obj.o
+# Add additional dependent libraries this plugin might have
+PLUGIN_LDFLAGS +=
\ No newline at end of file
diff --git a/plugin/lz4/plugin.mk b/plugin/lz4/plugin.mk
new file mode 100644
index 000000000..7a69027c7
--- /dev/null
+++ b/plugin/lz4/plugin.mk
@@ -0,0 +1,2 @@
+PLUGIN_OBJS += build_plugin/lz4/sparse_page_lz4_format.o
+PLUGIN_LDFLAGS += -llz4
diff --git a/plugin/lz4/sparse_page_lz4_format.cc b/plugin/lz4/sparse_page_lz4_format.cc
new file mode 100644
index 000000000..cad2ceadd
--- /dev/null
+++ b/plugin/lz4/sparse_page_lz4_format.cc
@@ -0,0 +1,327 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file sparse_page_lz4_format.cc
+ *  XGBoost Plugin to enable LZ4 compressed format on the external memory pages.
+ */
+#include <xgboost/data.h>
+#include <xgboost/logging.h>
+#include <dmlc/registry.h>
+#include <dmlc/parameter.h>
+#include <lz4.h>
+#include <lz4hc.h>
+#include "../../src/data/sparse_batch_page.h"
+
+namespace xgboost {
+namespace data {
+
+DMLC_REGISTRY_FILE_TAG(sparse_page_lz4_format);
+
+// Array helper for chunk-wise LZ4 compression or decompression of DType elements.
+template<typename DType>
+class CompressArray {
+ public:
+  // the uncompressed data content.
+  std::vector<DType> data;
+  // Decompression helper
+  // number of chunks, i.e. number of chunk boundaries minus one.
+  inline int num_chunk() const {
+    CHECK_GT(raw_chunks_.size(), 1);
+    return static_cast<int>(raw_chunks_.size() - 1);
+  }
+  // total raw bytes: the last element boundary times sizeof(DType).
+  inline size_t RawBytes() const {
+    return raw_chunks_.back() * sizeof(DType);
+  }
+  // encoded payload bytes plus the bytes taken by the two chunk boundary tables.
+  inline size_t EncodedBytes() const {
+    return encoded_chunks_.back() +
+        (encoded_chunks_.size() + raw_chunks_.size()) * sizeof(bst_uint);
+  }
+  // load the chunk tables and the encoded buffer from file.
+  inline void Read(dmlc::SeekStream* fi);
+  // run decode on chunk_id, writing the result into data.
+  inline void Decompress(int chunk_id);
+  // Compression helper
+  // initialize the compression chunks from explicit element boundaries.
+  inline void InitCompressChunks(const std::vector<bst_uint>& chunk_ptr);
+  // initialize the compression chunks from a target chunk size and max_nchunk.
+  inline void InitCompressChunks(size_t chunk_size, size_t max_nchunk);
+  // run encode on chunk_id; use_hc selects LZ4 HC instead of the default codec.
+  inline void Compress(int chunk_id, bool use_hc);
+  // save the chunk tables and the output buffers into file.
+  inline void Write(dmlc::Stream* fo);
+
+ private:
+  // chunk boundaries of data, in number of elements
+  std::vector<bst_uint> raw_chunks_;
+  // chunk boundaries of the encoded stream, in number of bytes
+  std::vector<bst_uint> encoded_chunks_;
+  // output buffers of compression, one per chunk.
+  std::vector<std::string> out_buffer_;
+  // input buffer holding the encoded bytes read from file.
+  std::string in_buffer_;
+};
+
+// Load the chunk tables and encoded bytes; CHECK-fails on truncated or mismatched tables.
+template<typename DType>
+inline void CompressArray<DType>::Read(dmlc::SeekStream* fi) {
+  CHECK(fi->Read(&raw_chunks_) && fi->Read(&encoded_chunks_));
+  CHECK(raw_chunks_.size() > 1 && raw_chunks_.size() == encoded_chunks_.size());
+  in_buffer_.resize(encoded_chunks_.back());  // total encoded bytes
+  CHECK_EQ(fi->Read(dmlc::BeginPtr(in_buffer_), in_buffer_.size()), in_buffer_.size());
+  data.resize(raw_chunks_.back());  // room for every decoded element
+}
+
+template<typename DType>
+inline void CompressArray<DType>::Decompress(int chunk_id) {  // decode one chunk into data
+  int chunk_size = static_cast<int>(
+      (raw_chunks_[chunk_id + 1] - raw_chunks_[chunk_id]) * sizeof(DType));  // raw bytes
+  int encoded_size = static_cast<int>(
+      encoded_chunks_[chunk_id + 1] - encoded_chunks_[chunk_id]);  // compressed bytes
+  // Bounded decode: reads at most encoded_size bytes, writes at most chunk_size bytes.
+  int dst_size = LZ4_decompress_safe(
+      dmlc::BeginPtr(in_buffer_) + encoded_chunks_[chunk_id],
+      reinterpret_cast<char*>(dmlc::BeginPtr(data) + raw_chunks_[chunk_id]),
+      encoded_size, chunk_size);
+  CHECK_EQ(chunk_size, dst_size);  // a negative return means malformed input
+}
+
+template<typename DType>
+inline void CompressArray<DType>::InitCompressChunks(
+    const std::vector<bst_uint>& chunk_ptr) {
+  raw_chunks_ = chunk_ptr;
+  CHECK_GE(raw_chunks_.size(), 2);
+  out_buffer_.resize(raw_chunks_.size() - 1);
+  for (size_t i = 0; i < out_buffer_.size(); ++i) {
+    out_buffer_[i].resize(raw_chunks_[i + 1] - raw_chunks_[i]);
+  }
+}
+
+template<typename DType>
+inline void CompressArray<DType>::InitCompressChunks(size_t chunk_size, size_t max_nchunk) {
+  raw_chunks_.clear();
+  raw_chunks_.push_back(0);
+  size_t min_chunk_size = data.size() / max_nchunk;
+  chunk_size = std::max(min_chunk_size, chunk_size);
+  size_t nstep = data.size() / chunk_size;
+  for (size_t i = 0; i < nstep; ++i) {
+    raw_chunks_.push_back(raw_chunks_.back() + chunk_size);
+    CHECK_LE(raw_chunks_.back(), data.size());
+  }
+  if (nstep == 0) raw_chunks_.push_back(0);
+  raw_chunks_.back() = data.size();
+  CHECK_GE(raw_chunks_.size(), 2);
+  out_buffer_.resize(raw_chunks_.size() - 1);
+  for (size_t i = 0; i < out_buffer_.size(); ++i) {
+    out_buffer_[i].resize(raw_chunks_[i + 1] - raw_chunks_[i]);
+  }
+}
+
+template<typename DType>
+inline void CompressArray<DType>::Compress(int chunk_id, bool use_hc) {
+  CHECK_LT(static_cast<size_t>(chunk_id + 1), raw_chunks_.size());
+  std::string& buf = out_buffer_[chunk_id];
+  size_t raw_chunk_size = (raw_chunks_[chunk_id + 1] - raw_chunks_[chunk_id]) * sizeof(DType);
+  int bound = LZ4_compressBound(raw_chunk_size);
+  CHECK_NE(bound, 0);
+  buf.resize(bound);
+  int encoded_size;
+  if (use_hc) {
+    encoded_size = LZ4_compress_HC(
+        reinterpret_cast<char*>(dmlc::BeginPtr(data) + raw_chunks_[chunk_id]),
+        dmlc::BeginPtr(buf), raw_chunk_size, buf.length(), 9);
+  } else {
+    encoded_size = LZ4_compress_default(
+        reinterpret_cast<char*>(dmlc::BeginPtr(data) + raw_chunks_[chunk_id]),
+        dmlc::BeginPtr(buf), raw_chunk_size, buf.length());
+  }
+  CHECK_NE(encoded_size, 0);
+  CHECK_LE(static_cast<size_t>(encoded_size), buf.length());
+  buf.resize(encoded_size);
+}
+
+template<typename DType>
+inline void CompressArray<DType>::Write(dmlc::Stream* fo) {
+  encoded_chunks_.clear();
+  encoded_chunks_.push_back(0);
+  for (size_t i = 0; i < out_buffer_.size(); ++i) {
+    encoded_chunks_.push_back(encoded_chunks_.back() + out_buffer_[i].length());
+  }
+  fo->Write(raw_chunks_);
+  fo->Write(encoded_chunks_);
+  for (const std::string& buf : out_buffer_) {
+    fo->Write(dmlc::BeginPtr(buf), buf.length());
+  }
+}
+
+template<typename StorageIndex>
+class SparsePageLZ4Format : public SparsePage::Format {
+ public:
+  explicit SparsePageLZ4Format(bool use_lz4_hc)
+      : use_lz4_hc_(use_lz4_hc) {
+    raw_bytes_ = raw_bytes_value_ = raw_bytes_index_ = 0;
+    encoded_bytes_value_ = encoded_bytes_index_ = 0;
+    nthread_ = dmlc::GetEnv("XGBOOST_LZ4_DECODE_NTHREAD", 4);
+    nthread_write_ = dmlc::GetEnv("XGBOOST_LZ4_COMPRESS_NTHREAD", 12);
+  }
+  virtual ~SparsePageLZ4Format() {
+    size_t encoded_bytes = raw_bytes_ +  encoded_bytes_value_ + encoded_bytes_index_;
+    raw_bytes_ += raw_bytes_value_ + raw_bytes_index_;
+    if (raw_bytes_ != 0) {
+      LOG(CONSOLE) << "raw_bytes=" << raw_bytes_
+                   << ", encoded_bytes=" << encoded_bytes
+                   << ", ratio=" << double(encoded_bytes) / raw_bytes_
+                   << ", ratio-index=" << double(encoded_bytes_index_) /raw_bytes_index_
+                   << ", ratio-value=" << double(encoded_bytes_value_) /raw_bytes_value_;
+    }
+  }
+
+  bool Read(SparsePage* page, dmlc::SeekStream* fi) override {
+    if (!fi->Read(&(page->offset))) return false;
+    CHECK_NE(page->offset.size(), 0) << "Invalid SparsePage file";
+    this->LoadIndexValue(fi);
+
+    page->data.resize(page->offset.back());
+    CHECK_EQ(index_.data.size(), value_.data.size());
+    CHECK_EQ(index_.data.size(), page->data.size());
+    for (size_t i = 0; i < page->data.size(); ++i) {
+      page->data[i] = SparseBatch::Entry(index_.data[i] + min_index_, value_.data[i]);
+    }
+    return true;
+  }
+
+  bool Read(SparsePage* page,
+            dmlc::SeekStream* fi,
+            const std::vector<bst_uint>& sorted_index_set) override {
+    if (!fi->Read(&disk_offset_)) return false;
+    this->LoadIndexValue(fi);
+
+    page->offset.clear();
+    page->offset.push_back(0);
+    for (bst_uint cid : sorted_index_set) {
+      page->offset.push_back(
+          page->offset.back() + disk_offset_[cid + 1] - disk_offset_[cid]);
+    }
+    page->data.resize(page->offset.back());
+    CHECK_EQ(index_.data.size(), value_.data.size());
+    CHECK_EQ(index_.data.size(), disk_offset_.back());
+
+    for (size_t i = 0; i < sorted_index_set.size(); ++i) {
+      bst_uint cid = sorted_index_set[i];
+      size_t dst_begin = page->offset[i];
+      size_t src_begin = disk_offset_[cid];
+      size_t num = disk_offset_[cid + 1] - disk_offset_[cid];
+      for (size_t j = 0; j < num; ++j) {
+        page->data[dst_begin + j] = SparseBatch::Entry(
+            index_.data[src_begin + j] + min_index_, value_.data[src_begin + j]);
+      }
+    }
+    return true;
+  }
+
+  void Write(const SparsePage& page, dmlc::Stream* fo) override {
+    CHECK(page.offset.size() != 0 && page.offset[0] == 0);
+    CHECK_EQ(page.offset.back(), page.data.size());
+    fo->Write(page.offset);
+    min_index_ = page.min_index;
+    fo->Write(&min_index_, sizeof(min_index_));
+    index_.data.resize(page.data.size());
+    value_.data.resize(page.data.size());
+
+    for (size_t i = 0; i < page.data.size(); ++i) {
+      bst_uint idx = page.data[i].index - min_index_;
+      CHECK_LE(idx, static_cast<bst_uint>(std::numeric_limits<StorageIndex>::max()))
+          << "The storage index is chosen to limited to smaller equal than "
+          << std::numeric_limits<StorageIndex>::max()
+          << "min_index=" << min_index_;
+      index_.data[i] = static_cast<StorageIndex>(idx);
+      value_.data[i] = page.data[i].fvalue;
+    }
+
+    index_.InitCompressChunks(kChunkSize, kMaxChunk);
+    value_.InitCompressChunks(kChunkSize, kMaxChunk);
+
+    int nindex = index_.num_chunk();
+    int nvalue = value_.num_chunk();
+    int ntotal = nindex + nvalue;
+    #pragma omp parallel for schedule(dynamic, 1)  num_threads(nthread_write_)
+    for (int i = 0; i < ntotal; ++i) {
+      if (i < nindex) {
+        index_.Compress(i, use_lz4_hc_);
+      } else {
+        value_.Compress(i - nindex, use_lz4_hc_);
+      }
+    }
+    index_.Write(fo);
+    value_.Write(fo);
+    // statistics
+    raw_bytes_index_ += index_.RawBytes() * sizeof(bst_uint) / sizeof(StorageIndex);
+    raw_bytes_value_ += value_.RawBytes();
+    encoded_bytes_index_ += index_.EncodedBytes();
+    encoded_bytes_value_ += value_.EncodedBytes();
+    raw_bytes_ += page.offset.size() * sizeof(size_t);
+  }
+
+  inline void LoadIndexValue(dmlc::SeekStream* fi) {
+    CHECK_EQ(fi->Read(&min_index_, sizeof(min_index_)), sizeof(min_index_));  // no short read
+    index_.Read(fi);
+    value_.Read(fi);
+    // Index and value chunks are independent: decode all of them in one parallel loop.
+    int nindex = index_.num_chunk();
+    int nvalue = value_.num_chunk();
+    int ntotal = nindex + nvalue;
+    #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_)
+    for (int i = 0; i < ntotal; ++i) {
+      if (i < nindex) {
+        index_.Decompress(i);
+      } else {
+        value_.Decompress(i - nindex);
+      }
+    }
+  }
+
+ private:
+  // default chunk size.
+  static const size_t kChunkSize = 64 << 10UL;
+  // maximum number of chunks.
+  static const size_t kMaxChunk = 128;
+  // bool whether use hc
+  bool use_lz4_hc_;
+  // number of threads
+  int nthread_;
+  // number of writing threads
+  int nthread_write_;
+  // raw bytes
+  size_t raw_bytes_, raw_bytes_index_, raw_bytes_value_;
+  // encoded bytes
+  size_t encoded_bytes_index_, encoded_bytes_value_;
+  /*! \brief minimum index value */
+  uint32_t min_index_;
+  /*! \brief external memory column offset */
+  std::vector<size_t> disk_offset_;
+  // internal index
+  CompressArray<StorageIndex> index_;
+  // value set.
+  CompressArray<bst_float> value_;
+};
+
+XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(lz4)
+.describe("Apply LZ4 binary data compression for ext memory.")
+.set_body([]() {
+    return new SparsePageLZ4Format<bst_uint>(false);
+  });
+
+XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(lz4hc)
+.describe("Apply LZ4 binary data compression(high compression ratio) for ext memory.")
+.set_body([]() {
+    return new SparsePageLZ4Format<bst_uint>(true);
+  });
+
+XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(lz4i16hc)
+.describe("Apply LZ4 binary data compression(16 bit index mode) for ext memory.")
+.set_body([]() {
+    return new SparsePageLZ4Format<uint16_t>(true);
+  });
+
+}  // namespace data
+}  // namespace xgboost
diff --git a/python-package/MANIFEST.in b/python-package/MANIFEST.in
index 83596d826..04ca379a6 100644
--- a/python-package/MANIFEST.in
+++ b/python-package/MANIFEST.in
@@ -1,14 +1,11 @@
-include *.sh *.md *.rst
+include *.md *.rst
 recursive-include xgboost *
-recursive-include xgboost/wrapper *
-recursive-include xgboost/windows *
-recursive-include xgboost/subtree *
+recursive-include xgboost/include *
 recursive-include xgboost/src *
-recursive-include xgboost/multi-node *
 #exclude pre-compiled .o file for less confusions
 #include the pre-compiled .so is needed as a placeholder
 #since it will be copy after compiling on the fly
-global-exclude xgboost/wrapper/*.so.gz
+global-exclude xgboost/build/*
 global-exclude xgboost/*.o
 global-exclude *.pyo
 global-exclude *.pyc
diff --git a/python-package/setup.py b/python-package/setup.py
index 4b05bc710..155a30bc8 100644
--- a/python-package/setup.py
+++ b/python-package/setup.py
@@ -17,7 +17,7 @@ libpath = {'__file__': libpath_py}
 exec(compile(open(libpath_py, "rb").read(), libpath_py, 'exec'), libpath, libpath)
 
 LIB_PATH = libpath['find_lib_path']()
-
+print("Install libxgboost from: %s" % LIB_PATH)
 #Please use setup_pip.py for generating and deploying pip installation
 #detailed instruction in setup_pip.py
 setup(name='xgboost',
diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py
index cd50ca6cc..1fe438289 100644
--- a/python-package/xgboost/__init__.py
+++ b/python-package/xgboost/__init__.py
@@ -14,7 +14,7 @@ try:
     from .sklearn import XGBModel, XGBClassifier, XGBRegressor
     from .plotting import plot_importance, plot_tree, to_graphviz
 except ImportError:
-    print('Error when loading sklearn/plotting. Please install scikit-learn')
+    pass
 
 VERSION_FILE = os.path.join(os.path.dirname(__file__), 'VERSION')
 __version__ = open(VERSION_FILE).read().strip()
diff --git a/python-package/xgboost/libpath.py b/python-package/xgboost/libpath.py
index 5df72dd3d..a703dcd7b 100644
--- a/python-package/xgboost/libpath.py
+++ b/python-package/xgboost/libpath.py
@@ -20,8 +20,8 @@ def find_lib_path():
     """
     curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
     # make pythonpack hack: copy this directory one level upper for setup.py
-    dll_path = [curr_path, os.path.join(curr_path, '../../wrapper/'),
-                os.path.join(curr_path, './wrapper/')]
+    dll_path = [curr_path, os.path.join(curr_path, '../../lib/'),
+                os.path.join(curr_path, './lib/')]
     if os.name == 'nt':
         if platform.architecture()[0] == '64bit':
             dll_path.append(os.path.join(curr_path, '../../windows/x64/Release/'))
@@ -32,9 +32,9 @@ def find_lib_path():
             # hack for pip installation when copy all parent source directory here
             dll_path.append(os.path.join(curr_path, './windows/Release/'))
     if os.name == 'nt':
-        dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path]
+        dll_path = [os.path.join(p, 'libxgboost.dll') for p in dll_path]
     else:
-        dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path]
+        dll_path = [os.path.join(p, 'libxgboost.so') for p in dll_path]
     lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
     #From github issues, most of installation errors come from machines w/o compilers
     if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False):
diff --git a/rabit b/rabit
new file mode 160000
index 000000000..112d866dc
--- /dev/null
+++ b/rabit
@@ -0,0 +1 @@
+Subproject commit 112d866dc92354304c0891500374fe40cdf13a50
diff --git a/scripts/travis_R_script.sh b/scripts/travis_R_script.sh
deleted file mode 100755
index 5a9ea7528..000000000
--- a/scripts/travis_R_script.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-# Test R package of xgboost
-set -e
-export _R_CHECK_TIMINGS_=0
-export R_BUILD_ARGS="--no-build-vignettes --no-manual"
-export R_CHECK_ARGS="--no-vignettes --no-manual"
-
-curl -OL http://raw.github.com/craigcitro/r-travis/master/scripts/travis-tool.sh
-chmod 755 ./travis-tool.sh
-./travis-tool.sh bootstrap
-make Rpack
-cd ./xgboost
-../travis-tool.sh install_deps
-../travis-tool.sh run_tests
\ No newline at end of file
diff --git a/scripts/travis_after_failure.sh b/scripts/travis_after_failure.sh
deleted file mode 100755
index 921e14953..000000000
--- a/scripts/travis_after_failure.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-if [ ${TASK} == "R-package" ]; then
-    cat xgboost/xgboost.Rcheck/*.log
-fi
diff --git a/scripts/travis_java_script.sh b/scripts/travis_java_script.sh
deleted file mode 100755
index e0583e1fb..000000000
--- a/scripts/travis_java_script.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-# Test java package of xgboost
-set -e
-cd java
-./create_wrap.sh
-cd xgboost4j
-mvn clean install -DskipTests=true
-mvn test
diff --git a/scripts/travis_osx_install.sh b/scripts/travis_osx_install.sh
deleted file mode 100755
index 8c449c843..000000000
--- a/scripts/travis_osx_install.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-if [ ${TRAVIS_OS_NAME} != "osx" ]; then
-    exit 0
-fi
-
-brew update
diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh
deleted file mode 100755
index 1e62b5b46..000000000
--- a/scripts/travis_script.sh
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-
-# main script of travis
-if [ ${TASK} == "lint" ]; then
-    if [ ${TRAVIS_OS_NAME} != "osx" ]; then
-        make lint  || exit -1
-    fi
-fi
-
-if [ ${TRAVIS_OS_NAME} == "osx" ]; then
-    export no_omp=1
-    export NO_OPENMP=1
-fi
-
-if [ ${TASK} == "build" ]; then
-    make all CXX=${CXX} || exit -1
-fi
-
-if [ ${TASK} == "build-with-dmlc" ]; then
-    cd dmlc-core
-    cp make/config.mk .
-    if [ ${TRAVIS_OS_NAME} != "osx" ]; then
-        echo "USE_S3=1" >> config.mk
-    else
-        echo "USE_S3=0" >> config.mk
-    fi
-    make all CXX=${CXX}|| exit -1
-    cd ..
-    make dmlc=dmlc-core CXX=${CXX} || exit -1
-fi
-
-if [ ${TASK} == "R-package" ]; then
-    scripts/travis_R_script.sh || exit -1
-fi
-
-if [ ${TASK} == "python-package" -o ${TASK} == "python-package3" ]; then
-
-    if [ ${TRAVIS_OS_NAME} == "osx" ]; then
-        brew install graphviz
-        if [ ${TASK} == "python-package3" ]; then
-            wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
-        else
-            wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh
-        fi
-    else
-        sudo apt-get install graphviz
-        if [ ${TASK} == "python-package3" ]; then
-            wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
-        else
-            wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh
-        fi
-    fi
-    bash conda.sh -b -p $HOME/miniconda
-    export PATH="$HOME/miniconda/bin:$PATH"
-    hash -r
-    conda config --set always_yes yes --set changeps1 no
-    conda update -q conda
-    # Useful for debugging any issues with conda
-    conda info -a
-
-    if [ ${TASK} == "python-package3" ]; then
-        conda create -n myenv python=3.4
-    else
-        conda create -n myenv python=2.7
-    fi
-    source activate myenv
-    conda install numpy scipy pandas matplotlib nose scikit-learn
-    python -m pip install graphviz
-
-    make all CXX=${CXX} || exit -1
-
-    python -m nose tests/python || exit -1
-    python --version
-fi
-
-# only test java under linux for now
-if [ ${TASK} == "java-package" ]; then
-    if [ ${TRAVIS_OS_NAME} != "osx" ]; then
-        make java CXX=${CXX} || exit -1
-        scripts/travis_java_script.sh || exit -1
-    fi
-fi
diff --git a/src/README.md b/src/README.md
deleted file mode 100644
index 4b8420306..000000000
--- a/src/README.md
+++ /dev/null
@@ -1,26 +0,0 @@
-Coding Guide
-======
-This file is intended to be notes about code structure in xgboost
-
-Project Logical Layout
-=======
-* Dependency order: io->learner->gbm->tree
-  - All module depends on data.h
-* tree are implementations of tree construction algorithms.
-* gbm is gradient boosting interface, that takes trees and other base learner to do boosting.
-  - gbm only takes gradient as sufficient statistics, it does not compute the gradient.
-* learner is learning module that computes gradient for specific object, and pass it to GBM
-
-File Naming Convention
-======= 
-* .h files are data structures and interface, which are needed to use functions in that layer.
-* -inl.hpp files are implementations of interface, like cpp file in most project.
-  - You only need to understand the interface file to understand the usage of that layer
-* In each folder, there can be a .cpp file, that compiles the module of that layer
-
-How to Hack the Code
-======
-* Add objective function: add to learner/objective-inl.hpp and register it in learner/objective.h ```CreateObjFunction``` 
-  - You can also directly do it in python
-* Add new evaluation metric: add to learner/evaluation-inl.hpp and register it in learner/evaluation.h ```CreateEvaluator``` 
-* Add wrapper for a new language, most likely you can do it by taking the functions in python/xgboost_wrapper.h, which is purely C based, and call these C functions to use xgboost
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
new file mode 100644
index 000000000..b734349c3
--- /dev/null
+++ b/src/c_api/c_api.cc
@@ -0,0 +1,528 @@
+// Copyright (c) 2014 by Contributors
+
+#include <xgboost/data.h>
+#include <xgboost/learner.h>
+#include <xgboost/c_api.h>
+#include <cstdio>
+#include <vector>
+#include <string>
+#include <cstring>
+#include <memory>
+
+#include "./c_api_error.h"
+#include "../data/simple_csr_source.h"
+#include "../common/thread_local.h"
+#include "../common/math.h"
+#include "../common/io.h"
+#include "../common/group_data.h"
+
+namespace xgboost {
+
+// booster wrapper for backward compatible reason.
+class Booster {
+ public:
+  explicit Booster(const std::vector<DMatrix*>& cache_mats)
+      : configured_(false),
+        initialized_(false),
+        learner_(Learner::Create(cache_mats)) {}
+
+  inline Learner* learner() {
+    return learner_.get();
+  }
+
+  inline void SetParam(const std::string& name, const std::string& val) {
+    cfg_.push_back(std::make_pair(name, val));
+    if (configured_) {
+      learner_->Configure(cfg_);
+    }
+  }
+
+  inline void LazyInit() {
+    if (!configured_) {
+      learner_->Configure(cfg_);
+      configured_ = true;
+    }
+    if (!initialized_) {
+      learner_->InitModel();
+      initialized_ = true;
+    }
+  }
+
+  inline void LoadModel(dmlc::Stream* fi) {
+    learner_->Load(fi);
+    initialized_ = true;
+  }
+
+ public:
+  bool configured_;
+  bool initialized_;
+  std::unique_ptr<Learner> learner_;
+  std::vector<std::pair<std::string, std::string> > cfg_;
+};
+}  // namespace xgboost
+
+using namespace xgboost; // NOLINT(*);
+
+/*! \brief entry to easily hold returning information */
+struct XGBAPIThreadLocalEntry {
+  /*! \brief result holder for returning string */
+  std::string ret_str;
+  /*! \brief result holder for returning strings */
+  std::vector<std::string> ret_vec_str;
+  /*! \brief result holder for returning string pointers */
+  std::vector<const char *> ret_vec_charp;
+  /*! \brief returning float vector. */
+  std::vector<float> ret_vec_float;
+  /*! \brief temp variable of gradient pairs. */
+  std::vector<bst_gpair> tmp_gpair;
+};
+
+// define the threadlocal store.
+typedef xgboost::common::ThreadLocalStore<XGBAPIThreadLocalEntry> XGBAPIThreadLocalStore;
+
+int XGDMatrixCreateFromFile(const char *fname,
+                            int silent,
+                            DMatrixHandle *out) {
+  API_BEGIN();
+  *out = DMatrix::Load(
+      fname, silent != 0, false);
+  API_END();
+}
+
+int XGDMatrixCreateFromCSR(const bst_ulong* indptr,
+                           const unsigned *indices,
+                           const float* data,
+                           bst_ulong nindptr,
+                           bst_ulong nelem,
+                           DMatrixHandle* out) {
+  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
+  // CSR -> DMatrix. Column ids are widened before the +1 so UINT_MAX cannot wrap to 0.
+  API_BEGIN();
+  CHECK_NE(nindptr, 0) << "indptr must hold at least one offset";
+  data::SimpleCSRSource& mat = *source;
+  mat.row_ptr_.resize(nindptr);
+  for (bst_ulong i = 0; i < nindptr; ++i) {
+    mat.row_ptr_[i] = static_cast<size_t>(indptr[i]);
+  }
+  mat.row_data_.resize(nelem);
+  for (bst_ulong i = 0; i < nelem; ++i) {
+    mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
+    mat.info.num_col = std::max(mat.info.num_col, static_cast<uint64_t>(indices[i]) + 1);
+  }
+  mat.info.num_row = nindptr - 1;
+  mat.info.num_nonzero = static_cast<uint64_t>(nelem);
+  *out  = DMatrix::Create(std::move(source));
+  API_END();
+}
+
+int XGDMatrixCreateFromCSC(const bst_ulong* col_ptr,
+                           const unsigned* indices,
+                           const float* data,
+                           bst_ulong nindptr,
+                           bst_ulong nelem,
+                           DMatrixHandle* out) {
+  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
+  // CSC -> CSR source in two parallel passes over the columns: AddBudget, then Push.
+  API_BEGIN();
+  int nthread;
+  #pragma omp parallel
+  {
+    nthread = omp_get_num_threads();
+  }
+  data::SimpleCSRSource& mat = *source;
+  common::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_);
+  builder.InitBudget(0, nthread);
+  long ncol = static_cast<long>(nindptr - 1);  // NOLINT(*)
+  #pragma omp parallel for schedule(static)
+  for (long i = 0; i < ncol; ++i) {  // NOLINT(*)
+    int tid = omp_get_thread_num();
+    for (bst_ulong j = col_ptr[i]; j < col_ptr[i+1]; ++j) {  // as wide as col_ptr
+      builder.AddBudget(indices[j], tid);
+    }
+  }
+  builder.InitStorage();
+  #pragma omp parallel for schedule(static)
+  for (long i = 0; i < ncol; ++i) {  // NOLINT(*)
+    int tid = omp_get_thread_num();
+    for (bst_ulong j = col_ptr[i]; j < col_ptr[i+1]; ++j) {  // as wide as col_ptr
+      builder.Push(indices[j],
+                   RowBatch::Entry(static_cast<bst_uint>(i), data[j]),
+                   tid);
+    }
+  }
+  mat.info.num_row = mat.row_ptr_.size() - 1;
+  mat.info.num_col = static_cast<uint64_t>(ncol);
+  mat.info.num_nonzero = nelem;
+  *out  = DMatrix::Create(std::move(source));
+  API_END();
+}
+
+int XGDMatrixCreateFromMat(const float* data,
+                           bst_ulong nrow,
+                           bst_ulong ncol,
+                           float  missing,
+                           DMatrixHandle* out) {
+  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
+
+  API_BEGIN();
+  data::SimpleCSRSource& mat = *source;
+  bool nan_missing = common::CheckNAN(missing);
+  mat.info.num_row = nrow;
+  mat.info.num_col = ncol;
+  for (bst_ulong i = 0; i < nrow; ++i, data += ncol) {
+    bst_ulong nelem = 0;
+    for (bst_ulong j = 0; j < ncol; ++j) {
+      if (common::CheckNAN(data[j])) {
+        CHECK(nan_missing)
+            << "There are NAN in the matrix, however, you did not set missing=NAN";
+      } else {
+        if (nan_missing || data[j] != missing) {
+          mat.row_data_.push_back(RowBatch::Entry(j, data[j]));
+          ++nelem;
+        }
+      }
+    }
+    mat.row_ptr_.push_back(mat.row_ptr_.back() + nelem);
+  }
+  mat.info.num_nonzero = mat.row_data_.size();
+  *out  = DMatrix::Create(std::move(source));
+  API_END();
+}
+
+int XGDMatrixSliceDMatrix(DMatrixHandle handle,
+                          const int* idxset,
+                          bst_ulong len,
+                          DMatrixHandle* out) {
+  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
+  // Copy the rows listed in idxset (and their per-row meta info) into a new DMatrix.
+  API_BEGIN();
+  data::SimpleCSRSource src;
+  src.CopyFrom(static_cast<DMatrix*>(handle));
+  data::SimpleCSRSource& ret = *source;
+
+  CHECK_EQ(src.info.group_ptr.size(), 0)
+      << "slice does not support group structure";
+
+  ret.Clear();
+  ret.info.num_row = len;
+  ret.info.num_col = src.info.num_col;
+
+  dmlc::DataIter<RowBatch>* iter = &src;
+  iter->BeforeFirst();
+  CHECK(iter->Next());
+
+  const RowBatch& batch = iter->Value();
+  for (bst_ulong i = 0; i < len; ++i) {
+    const int ridx = idxset[i];
+    CHECK_LT(static_cast<bst_ulong>(ridx), batch.size);  // validate before indexing
+    RowBatch::Inst inst = batch[ridx];
+    ret.row_data_.resize(ret.row_data_.size() + inst.length);
+    std::memcpy(dmlc::BeginPtr(ret.row_data_) + ret.row_ptr_.back(), inst.data,
+                sizeof(RowBatch::Entry) * inst.length);
+    ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length);
+    ret.info.num_nonzero += inst.length;
+    // carry over whichever per-row fields the source matrix actually has.
+    if (src.info.labels.size() != 0) {
+      ret.info.labels.push_back(src.info.labels[ridx]);
+    }
+    if (src.info.weights.size() != 0) {
+      ret.info.weights.push_back(src.info.weights[ridx]);
+    }
+    if (src.info.root_index.size() != 0) {
+      ret.info.root_index.push_back(src.info.root_index[ridx]);
+    }
+  }
+  *out  = DMatrix::Create(std::move(source));
+  API_END();
+}
+
+int XGDMatrixFree(DMatrixHandle handle) {
+  API_BEGIN();
+  delete static_cast<DMatrix*>(handle);
+  API_END();
+}
+
+int XGDMatrixSaveBinary(DMatrixHandle handle,
+                        const char* fname,
+                        int silent) {
+  API_BEGIN();
+  static_cast<DMatrix*>(handle)->SaveToLocalFile(fname);
+  API_END();
+}
+
+int XGDMatrixSetFloatInfo(DMatrixHandle handle,
+                          const char* field,
+                          const float* info,
+                          bst_ulong len) {
+  API_BEGIN();
+  static_cast<DMatrix*>(handle)->info().SetInfo(field, info, kFloat32, len);
+  API_END();
+}
+
+int XGDMatrixSetUIntInfo(DMatrixHandle handle,
+                         const char* field,
+                         const unsigned* info,
+                         bst_ulong len) {
+  API_BEGIN();
+  static_cast<DMatrix*>(handle)->info().SetInfo(field, info, kUInt32, len);
+  API_END();
+}
+
+int XGDMatrixSetGroup(DMatrixHandle handle,
+                      const unsigned* group,
+                      bst_ulong len) {
+  API_BEGIN();
+  DMatrix *pmat = static_cast<DMatrix*>(handle);
+  MetaInfo& info = pmat->info();
+  info.group_ptr.resize(len + 1);
+  info.group_ptr[0] = 0;
+  for (uint64_t i = 0; i < len; ++i) {
+    info.group_ptr[i + 1] = info.group_ptr[i] + group[i];
+  }
+  API_END();
+}
+
+int XGDMatrixGetFloatInfo(const DMatrixHandle handle,
+                          const char* field,
+                          bst_ulong* out_len,
+                          const float** out_dptr) {
+  API_BEGIN();
+  const MetaInfo& info = static_cast<const DMatrix*>(handle)->info();
+  const std::vector<float>* vec = nullptr;
+  if (!std::strcmp(field, "label")) {
+    vec = &info.labels;
+  } else if (!std::strcmp(field, "weight")) {
+    vec = &info.weights;
+  } else if (!std::strcmp(field, "base_margin")) {
+    vec = &info.base_margin;
+  } else {
+    LOG(FATAL) << "Unknown float field name " << field;
+  }
+  *out_len = static_cast<bst_ulong>(vec->size());
+  *out_dptr = dmlc::BeginPtr(*vec);
+  API_END();
+}
+
+int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
+                         const char *field,
+                         bst_ulong *out_len,
+                         const unsigned **out_dptr) {
+  API_BEGIN();
+  const MetaInfo& info = static_cast<const DMatrix*>(handle)->info();
+  const std::vector<unsigned>* vec = nullptr;
+  if (!std::strcmp(field, "root_index")) {
+    vec = &info.root_index;
+  } else {
+    LOG(FATAL) << "Unknown uint field name " << field;
+  }
+  *out_len = static_cast<bst_ulong>(vec->size());
+  *out_dptr = dmlc::BeginPtr(*vec);
+  API_END();
+}
+
+int XGDMatrixNumRow(const DMatrixHandle handle,
+                    bst_ulong *out) {
+  API_BEGIN();
+  *out = static_cast<bst_ulong>(static_cast<const DMatrix*>(handle)->info().num_row);
+  API_END();
+}
+
+int XGDMatrixNumCol(const DMatrixHandle handle,
+                    bst_ulong *out) {
+  API_BEGIN();  // stores the matrix's column count in *out
+  *out = static_cast<bst_ulong>(static_cast<const DMatrix*>(handle)->info().num_col);
+  API_END();
+}
+
+// xgboost implementation
+int XGBoosterCreate(DMatrixHandle dmats[],
+                    bst_ulong len,
+                    BoosterHandle *out) {
+  API_BEGIN();
+  std::vector<DMatrix*> cache_mats;  // the handles, re-typed as DMatrix pointers
+  for (bst_ulong k = 0; k != len; ++k) {
+    cache_mats.push_back(static_cast<DMatrix*>(dmats[k]));
+  }
+  *out = new Booster(cache_mats);
+  API_END();
+}
+
+int XGBoosterFree(BoosterHandle handle) {
+  API_BEGIN();
+  delete static_cast<Booster*>(handle);  // destroys the Booster object behind the handle
+  API_END();
+}
+
+int XGBoosterSetParam(BoosterHandle handle,
+                      const char *name,
+                      const char *value) {
+  API_BEGIN();
+  static_cast<Booster*>(handle)->SetParam(name, value);  // forwarded as-is to the Booster
+  API_END();
+}
+
+int XGBoosterUpdateOneIter(BoosterHandle handle,
+                           int iter,
+                           DMatrixHandle dtrain) {
+  API_BEGIN();
+  Booster* booster = static_cast<Booster*>(handle);
+  // LazyInit is always called before the learner is used
+  booster->LazyInit();
+  DMatrix* train_mat = static_cast<DMatrix*>(dtrain);
+  booster->learner()->UpdateOneIter(iter, train_mat);
+  API_END();
+}
+
+int XGBoosterBoostOneIter(BoosterHandle handle,
+                          DMatrixHandle dtrain,
+                          float *grad,
+                          float *hess,
+                          bst_ulong len) {
+  std::vector<bst_gpair>& gpairs = XGBAPIThreadLocalStore::Get()->tmp_gpair;
+  API_BEGIN();
+  // pack the caller's grad/hess arrays into the buffer kept in XGBAPIThreadLocalStore
+  gpairs.resize(len);
+  for (bst_ulong k = 0; k != len; ++k) {
+    gpairs[k] = bst_gpair(grad[k], hess[k]);
+  }
+  Booster* booster = static_cast<Booster*>(handle);
+  booster->LazyInit();
+  // the iteration argument is always passed as 0 here
+  booster->learner()->BoostOneIter(0, static_cast<DMatrix*>(dtrain), &gpairs);
+  API_END();
+}
+
+int XGBoosterEvalOneIter(BoosterHandle handle,
+                         int iter,
+                         DMatrixHandle dmats[],
+                         const char* evnames[],
+                         bst_ulong len,
+                         const char** out_str) {
+  std::string& result = XGBAPIThreadLocalStore::Get()->ret_str;
+  API_BEGIN();
+  // pair every evaluation matrix with its display name
+  std::vector<DMatrix*> eval_mats;
+  std::vector<std::string> eval_names;
+  for (bst_ulong k = 0; k != len; ++k) {
+    eval_mats.push_back(static_cast<DMatrix*>(dmats[k]));
+    eval_names.emplace_back(evnames[k]);
+  }
+  Booster* booster = static_cast<Booster*>(handle);
+  booster->LazyInit();
+  result = booster->learner()->EvalOneIter(iter, eval_mats, eval_names);
+  // the returned pointer refers to the string kept in XGBAPIThreadLocalStore
+  *out_str = result.c_str();
+  API_END();
+}
+
+int XGBoosterPredict(BoosterHandle handle,
+                     DMatrixHandle dmat,
+                     int option_mask,
+                     unsigned ntree_limit,
+                     bst_ulong *len,
+                     const float **out_result) {
+  std::vector<float>& preds = XGBAPIThreadLocalStore::Get()->ret_vec_float;
+  API_BEGIN();
+  // bits 0 and 1 of option_mask become the two boolean arguments of Predict
+  const bool flag_bit0 = (option_mask & 1) != 0;
+  const bool flag_bit1 = (option_mask & 2) != 0;
+  Booster* booster = static_cast<Booster*>(handle);
+  booster->LazyInit();
+  booster->learner()->Predict(
+      static_cast<DMatrix*>(dmat), flag_bit0, &preds, ntree_limit, flag_bit1);
+  *len = static_cast<bst_ulong>(preds.size());
+  *out_result = dmlc::BeginPtr(preds);
+  API_END();
+}
+
+int XGBoosterLoadModel(BoosterHandle handle, const char* fname) {
+  API_BEGIN();
+  std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));  // stream is owned locally
+  static_cast<Booster*>(handle)->LoadModel(fi.get());
+  API_END();
+}
+
+int XGBoosterSaveModel(BoosterHandle handle, const char* fname) {
+  API_BEGIN();
+  std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname, "w"));  // opened before LazyInit
+  Booster *bst = static_cast<Booster*>(handle);
+  bst->LazyInit();
+  bst->learner()->Save(fo.get());  // writes the learner to the output stream
+  API_END();
+}
+
+int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
+                                 const void* buf,
+                                 bst_ulong len) {
+  API_BEGIN();
+  common::MemoryFixSizeBuffer reader(const_cast<void*>(buf), len);  // ctor takes void*
+  static_cast<Booster*>(handle)->LoadModel(&reader);
+  API_END();
+}
+
+int XGBoosterGetModelRaw(BoosterHandle handle,
+                         bst_ulong* out_len,
+                         const char** out_dptr) {
+  // the serialized model is written into the string kept in XGBAPIThreadLocalStore
+  std::string& model_bytes = XGBAPIThreadLocalStore::Get()->ret_str;
+  model_bytes.clear();
+  API_BEGIN();
+  Booster* booster = static_cast<Booster*>(handle);
+  booster->LazyInit();
+  common::MemoryBufferStream sink(&model_bytes);
+  booster->learner()->Save(&sink);
+  *out_len = static_cast<bst_ulong>(model_bytes.length());
+  *out_dptr = dmlc::BeginPtr(model_bytes);
+  API_END();
+}
+
+inline void XGBoostDumpModelImpl(
+    BoosterHandle handle,
+    const FeatureMap& fmap,
+    int with_stats,
+    bst_ulong* len,
+    const char*** out_models) {
+  auto* store = XGBAPIThreadLocalStore::Get();
+  Booster* booster = static_cast<Booster*>(handle);
+  booster->LazyInit();
+  store->ret_vec_str = booster->learner()->Dump2Text(fmap, with_stats != 0);
+  // expose each dumped string as a C pointer; the pointers refer into ret_vec_str
+  store->ret_vec_charp.clear();
+  for (const std::string& text : store->ret_vec_str) {
+    store->ret_vec_charp.push_back(text.c_str());
+  }
+  *len = static_cast<bst_ulong>(store->ret_vec_charp.size());
+  *out_models = dmlc::BeginPtr(store->ret_vec_charp);
+}
+int XGBoosterDumpModel(BoosterHandle handle,
+                       const char* fmap,
+                       int with_stats,
+                       bst_ulong* len,
+                       const char*** out_models) {
+  API_BEGIN();
+  FeatureMap featmap;
+  const bool has_fmap_path = fmap[0] != '\0';  // an empty path leaves the map empty
+  if (has_fmap_path) {
+    std::unique_ptr<dmlc::Stream> fs(dmlc::Stream::Create(fmap, "r"));
+    dmlc::istream is(fs.get());
+    featmap.LoadText(is);
+  }
+  XGBoostDumpModelImpl(handle, featmap, with_stats, len, out_models);
+  API_END();
+}
+
+int XGBoosterDumpModelWithFeatures(BoosterHandle handle,
+                                   int fnum,
+                                   const char** fname,
+                                   const char** ftype,
+                                   int with_stats,
+                                   bst_ulong* len,
+                                   const char*** out_models) {
+  API_BEGIN();
+  FeatureMap featmap;  // filled from the caller's name/type arrays, ids 0..fnum-1
+  for (int fid = 0; fid < fnum; ++fid) {
+    featmap.PushBack(fid, fname[fid], ftype[fid]);
+  }
+  XGBoostDumpModelImpl(handle, featmap, with_stats, len, out_models);
+  API_END();
+}
diff --git a/src/c_api/c_api_error.cc b/src/c_api/c_api_error.cc
new file mode 100644
index 000000000..e1949e560
--- /dev/null
+++ b/src/c_api/c_api_error.cc
@@ -0,0 +1,21 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file c_api_error.cc
+ * \brief C error handling
+ */
+#include "./c_api_error.h"
+#include "../common/thread_local.h"
+
+struct XGBAPIErrorEntry {
+  std::string last_error;
+};
+
+typedef xgboost::common::ThreadLocalStore<XGBAPIErrorEntry> XGBAPIErrorStore;
+
+const char *XGBGetLastError() {
+  return XGBAPIErrorStore::Get()->last_error.c_str();  // points into the stored std::string
+}
+
+void XGBAPISetLastError(const char* msg) {
+  XGBAPIErrorStore::Get()->last_error = msg;  // copies msg into the stored std::string
+}
diff --git a/src/c_api/c_api_error.h b/src/c_api/c_api_error.h
new file mode 100644
index 000000000..4bb631ecd
--- /dev/null
+++ b/src/c_api/c_api_error.h
@@ -0,0 +1,39 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file c_api_error.h
+ * \brief Error handling for C API.
+ */
+#ifndef XGBOOST_C_API_C_API_ERROR_H_
+#define XGBOOST_C_API_C_API_ERROR_H_
+
+#include <dmlc/base.h>
+#include <dmlc/logging.h>
+#include <xgboost/c_api.h>
+
+/*! \brief  macro to guard beginning and end section of all functions */
+#define API_BEGIN() try {
+/*! \brief every function starts with API_BEGIN();
+     and finishes with API_END() or API_END_HANDLE_ERROR */
+#define API_END() } catch(dmlc::Error &_except_) { return XGBAPIHandleException(_except_); } return 0;  // NOLINT(*)
+/*!
+ * \brief every function starts with API_BEGIN();
+ *   and finishes with API_END() or API_END_HANDLE_ERROR
+ *   The finally clause contains procedure to cleanup states when an error happens.
+ */
+#define API_END_HANDLE_ERROR(Finalize) } catch(dmlc::Error &_except_) { Finalize; return XGBAPIHandleException(_except_); } return 0; // NOLINT(*)
+
+/*!
+ * \brief Set the last error message needed by C API
+ * \param msg The error message to set.
+ */
+void XGBAPISetLastError(const char* msg);
+/*!
+ * \brief handle exception thrown out
+ * \param e the exception
+ * \return the return value of API after exception is handled
+ */
+inline int XGBAPIHandleException(const dmlc::Error &e) {
+  XGBAPISetLastError(e.what());  // keep the message so the caller can query it afterwards
+  return -1;  // the value the API_END macros return when an exception was caught
+}
+#endif  // XGBOOST_C_API_C_API_ERROR_H_
diff --git a/src/cli_main.cc b/src/cli_main.cc
new file mode 100644
index 000000000..a08e3fd6d
--- /dev/null
+++ b/src/cli_main.cc
@@ -0,0 +1,352 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file cli_main.cc
+ * \brief The command line interface program of xgboost.
+ *  This file is not included in dynamic library.
+ */
+// Copyright 2014 by Contributors
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
+
+#include <xgboost/learner.h>
+#include <xgboost/data.h>
+#include <xgboost/logging.h>
+#include <dmlc/timer.h>
+#include <iomanip>
+#include <ctime>
+#include <string>
+#include <cstdio>
+#include <cstring>
+#include <vector>
+#include "./common/sync.h"
+#include "./common/config.h"
+
+
+namespace xgboost {
+
+enum CLITask {
+  kTrain = 0,      // selected by task=train
+  kDump2Text = 1,  // selected by task=dump
+  kPredict = 2     // selected by task=pred
+};
+
+struct CLIParam : public dmlc::Parameter<CLIParam> {
+  /*! \brief the task name */
+  int task;
+  /*! \brief whether silent */
+  int silent;
+  /*! \brief whether evaluate training statistics */
+  bool eval_train;
+  /*! \brief number of boosting iterations */
+  int num_round;
+  /*! \brief the period to save the model, 0 means only save the final round model */
+  int save_period;
+  /*! \brief the path of training set */
+  std::string train_path;
+  /*! \brief path of test dataset */
+  std::string test_path;
+  /*! \brief the path of test model file, or file to restart training */
+  std::string model_in;
+  /*! \brief the path of final model file, to be saved */
+  std::string model_out;
+  /*! \brief the path of directory containing the saved models */
+  std::string model_dir;
+  /*! \brief name of predict file */
+  std::string name_pred;
+  /*! \brief data split mode */
+  int dsplit;
+  /*!\brief limit number of trees in prediction */
+  int ntree_limit;
+  /*!\brief whether to directly output margin value */
+  bool pred_margin;
+  /*! \brief whether dump statistics along with model */
+  int dump_stats;
+  /*! \brief name of feature map */
+  std::string name_fmap;
+  /*! \brief name of dump file */
+  std::string name_dump;
+  /*! \brief the paths of validation data sets */
+  std::vector<std::string> eval_data_paths;
+  /*! \brief the names of the evaluation data used in output log */
+  std::vector<std::string> eval_data_names;
+  /*! \brief all the configurations */
+  std::vector<std::pair<std::string, std::string> > cfg;
+
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(CLIParam) {
+    // NOTE: declare everything except eval_data_paths.
+    DMLC_DECLARE_FIELD(task).set_default(kTrain)
+        .add_enum("train", kTrain)
+        .add_enum("dump", kDump2Text)
+        .add_enum("pred", kPredict)
+        .describe("Task to be performed by the CLI program.");
+    DMLC_DECLARE_FIELD(silent).set_default(0).set_range(0, 2)
+        .describe("Silent level during the task.");
+    DMLC_DECLARE_FIELD(eval_train).set_default(false)
+        .describe("Whether evaluate on training data during training.");
+    DMLC_DECLARE_FIELD(num_round).set_default(10).set_lower_bound(1)
+        .describe("Number of boosting iterations");
+    DMLC_DECLARE_FIELD(save_period).set_default(0).set_lower_bound(0)
+        .describe("The period to save the model, 0 means only save final model.");
+    DMLC_DECLARE_FIELD(train_path).set_default("NULL")
+        .describe("Training data path.");
+    DMLC_DECLARE_FIELD(test_path).set_default("NULL")
+        .describe("Test data path.");
+    DMLC_DECLARE_FIELD(model_in).set_default("NULL")
+        .describe("Input model path, if any.");
+    DMLC_DECLARE_FIELD(model_out).set_default("NULL")
+        .describe("Output model path, if any.");
+    DMLC_DECLARE_FIELD(model_dir).set_default("./")
+        .describe("Output directory of period checkpoint.");
+    DMLC_DECLARE_FIELD(name_pred).set_default("pred.txt")
+        .describe("Name of the prediction file.");
+    DMLC_DECLARE_FIELD(dsplit).set_default(0)
+        .add_enum("auto", 0)
+        .add_enum("col", 1)
+        .add_enum("row", 2)
+        .describe("Data split mode.");
+    DMLC_DECLARE_FIELD(ntree_limit).set_default(0).set_lower_bound(0)
+        .describe("Number of trees used for prediction, 0 means use all trees.");
+    DMLC_DECLARE_FIELD(pred_margin).set_default(false)
+        .describe("Whether to predict margin value instead of probability.");
+    DMLC_DECLARE_FIELD(dump_stats).set_default(false)
+        .describe("Whether dump the model statistics.");
+    DMLC_DECLARE_FIELD(name_fmap).set_default("NULL")
+        .describe("Name of the feature map file.");
+    DMLC_DECLARE_FIELD(name_dump).set_default("dump.txt")
+        .describe("Name of the output dump text file.");
+    // alias
+    DMLC_DECLARE_ALIAS(train_path, data);
+    DMLC_DECLARE_ALIAS(test_path, test:data);
+    DMLC_DECLARE_ALIAS(name_fmap, fmap);
+  }
+  // customized configure: keeps cfg, fills the declared fields, collects eval[name]=path pairs
+  inline void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) {
+    this->cfg = cfg;
+    this->InitAllowUnknown(cfg);
+    for (const auto& kv : cfg) {
+      if (!strncmp("eval[", kv.first.c_str(), 5)) {
+        char evname[256];  // the field width below keeps sscanf inside this buffer
+        CHECK_EQ(sscanf(kv.first.c_str(), "eval[%255[^]]", evname), 1)
+            << "must specify evaluation name for display";
+        eval_data_names.push_back(std::string(evname));
+        eval_data_paths.push_back(kv.second);
+      }
+    }
+    // constraint.
+    if (name_pred == "stdout") {
+      save_period = 0;
+      silent = 1;
+    }
+    if (dsplit == 0 && rabit::IsDistributed()) {
+      dsplit = 2;  // "auto" becomes "row" when running distributed
+    }
+    if (rabit::GetRank() != 0) {
+      silent = 2;  // every rank other than 0 gets the highest silent level
+    }
+  }
+};
+
+DMLC_REGISTER_PARAMETER(CLIParam);
+
+void CLITrain(const CLIParam& param) {  // train task: load data, boost, checkpoint, save
+  if (rabit::IsDistributed()) {
+    std::string pname = rabit::GetProcessorName();
+    LOG(CONSOLE) << "start " << pname << ":" << rabit::GetRank();
+  }
+  // load in data.
+  std::unique_ptr<DMatrix> dtrain(
+      DMatrix::Load(param.train_path, param.silent != 0, param.dsplit == 2));
+  std::vector<std::unique_ptr<DMatrix> > deval;
+  std::vector<DMatrix*> cache_mats, eval_datasets;
+  cache_mats.push_back(dtrain.get());
+  for (size_t i = 0; i < param.eval_data_names.size(); ++i) {
+    deval.emplace_back(
+        DMatrix::Load(param.eval_data_paths[i], param.silent != 0, param.dsplit == 2));
+    eval_datasets.push_back(deval.back().get());
+    cache_mats.push_back(deval.back().get());
+  }
+  std::vector<std::string> eval_data_names = param.eval_data_names;
+  if (param.eval_train) {
+    eval_datasets.push_back(dtrain.get());  // training data is evaluated under the name "train"
+    eval_data_names.push_back(std::string("train"));
+  }
+  // initialize the learner.
+  std::unique_ptr<Learner> learner(Learner::Create(cache_mats));
+  learner->Configure(param.cfg);
+  int version = rabit::LoadCheckPoint(learner.get());
+  if (version == 0) {
+    // initialize the model if needed.
+    if (param.model_in != "NULL") {
+      std::unique_ptr<dmlc::Stream> fi(
+          dmlc::Stream::Create(param.model_in.c_str(), "r"));
+      learner->Load(fi.get());
+    } else {
+      learner->InitModel();
+    }
+  }
+  // start training.
+  const double start = dmlc::GetTime();
+  for (int i = version / 2; i < param.num_round; ++i) {  // version grows by 2 per round
+    double elapsed = dmlc::GetTime() - start;
+    if (version % 2 == 0) {  // even version: this round's update has not been done yet
+      if (param.silent == 0) {
+        LOG(CONSOLE) << "boosting round " << i << ", " << elapsed << " sec elapsed";
+      }
+      learner->UpdateOneIter(i, dtrain.get());
+      if (learner->AllowLazyCheckPoint()) {
+        rabit::LazyCheckPoint(learner.get());
+      } else {
+        rabit::CheckPoint(learner.get());
+      }
+      version += 1;
+    }
+    CHECK_EQ(version, rabit::VersionNumber());
+    std::string res = learner->EvalOneIter(i, eval_datasets, eval_data_names);
+    if (rabit::IsDistributed()) {
+      if (rabit::GetRank() == 0) {
+        LOG(TRACKER) << res;
+      }
+    } else {
+      if (param.silent < 2) {
+        LOG(CONSOLE) << res;
+      }
+    }
+    if (param.save_period != 0 && (i + 1) % param.save_period == 0) {
+      std::ostringstream os;
+      os << param.model_dir << '/'
+         << std::setfill('0') << std::setw(4)
+         << i + 1 << ".model";
+      std::unique_ptr<dmlc::Stream> fo(
+          dmlc::Stream::Create(os.str().c_str(), "w"));
+      learner->Save(fo.get());
+    }
+
+    if (learner->AllowLazyCheckPoint()) {  // second checkpoint of the round, after evaluation
+      rabit::LazyCheckPoint(learner.get());
+    } else {
+      rabit::CheckPoint(learner.get());
+    }
+    version += 1;
+    CHECK_EQ(version, rabit::VersionNumber());
+  }
+  // save the final round unless the periodic save already wrote it or model_out is "NONE"
+  if ((param.save_period == 0 || param.num_round % param.save_period != 0) &&
+      param.model_out != "NONE") {
+    std::ostringstream os;
+    if (param.model_out == "NULL") {  // no explicit name: use <model_dir>/<num_round>.model
+      os << param.model_dir << '/'
+         << std::setfill('0') << std::setw(4)
+         << param.num_round << ".model";
+    } else {
+      os << param.model_out;
+    }
+    std::unique_ptr<dmlc::Stream> fo(
+        dmlc::Stream::Create(os.str().c_str(), "w"));
+    learner->Save(fo.get());
+  }
+
+  if (param.silent == 0) {
+    double elapsed = dmlc::GetTime() - start;
+    LOG(CONSOLE) << "update end, " << elapsed << " sec in all";
+  }
+}
+
+void CLIDump2Text(const CLIParam& param) {
+  // optional feature map; "NULL" means none was given
+  FeatureMap feature_map;
+  if (param.name_fmap != "NULL") {
+    std::unique_ptr<dmlc::Stream> map_stream(dmlc::Stream::Create(param.name_fmap.c_str(), "r"));
+    dmlc::istream map_in(map_stream.get());
+    feature_map.LoadText(map_in);
+  }
+  // the model to dump is mandatory
+  CHECK_NE(param.model_in, "NULL")
+      << "Must specifiy model_in for dump";
+  std::unique_ptr<Learner> learner(Learner::Create({}));
+  std::unique_ptr<dmlc::Stream> model_stream(
+      dmlc::Stream::Create(param.model_in.c_str(), "r"));
+  learner->Load(model_stream.get());
+  // render every booster as text, then write them out in order
+  const std::vector<std::string> texts = learner->Dump2Text(feature_map, param.dump_stats);
+  std::unique_ptr<dmlc::Stream> out_stream(
+      dmlc::Stream::Create(param.name_dump.c_str(), "w"));
+  dmlc::ostream out(out_stream.get());
+  size_t booster_id = 0;
+  for (const std::string& text : texts) {
+    out << "booster[" << booster_id++ << "]:\n" << text;
+  }
+  // detach the stream so buffered output is flushed before out_stream is destroyed
+  out.set_stream(nullptr);
+}
+
+void CLIPredict(const CLIParam& param) {
+  // pred task: load test data and a model, then write one prediction per line to name_pred
+  CHECK_NE(param.test_path, "NULL") << "Test dataset parameter test:data must be specified.";
+  // load the data to predict on
+  std::unique_ptr<DMatrix> dtest(
+      DMatrix::Load(param.test_path, param.silent != 0, param.dsplit == 2));
+  // load model; the error message names the predict task
+  CHECK_NE(param.model_in, "NULL")
+      << "Must specify model_in for predict";
+  std::unique_ptr<Learner> learner(Learner::Create({}));
+  std::unique_ptr<dmlc::Stream> fi(
+      dmlc::Stream::Create(param.model_in.c_str(), "r"));
+  learner->Load(fi.get());
+
+  if (param.silent == 0) {
+    LOG(CONSOLE) << "start prediction...";
+  }
+  std::vector<float> preds;
+  learner->Predict(dtest.get(), param.pred_margin, &preds, param.ntree_limit);
+  if (param.silent == 0) {
+    LOG(CONSOLE) << "writing prediction to " << param.name_pred;
+  }
+  std::unique_ptr<dmlc::Stream> fo(
+      dmlc::Stream::Create(param.name_pred.c_str(), "w"));
+  dmlc::ostream os(fo.get());
+  for (float p : preds) {
+    os << p << '\n';
+  }
+  // force flush before fo destruct.
+  os.set_stream(nullptr);
+}
+
+int CLIRunTask(int argc, char *argv[]) {
+  if (argc < 2) {
+    printf("Usage: <config>\n");
+    return 0;
+  }
+  // rabit must be initialized first: CLIParam::Configure queries IsDistributed()/GetRank()
+  rabit::Init(argc, argv);
+
+  std::vector<std::pair<std::string, std::string> > cfg;
+  cfg.push_back(std::make_pair("seed", "0"));
+  common::ConfigIterator itr(argv[1]);
+  while (itr.Next()) {
+    cfg.push_back(std::make_pair(std::string(itr.name()), std::string(itr.val())));
+  }
+  // name=value overrides from the command line; field widths keep sscanf inside the buffers
+  for (int i = 2; i < argc; ++i) {
+    char name[256], val[256];
+    if (sscanf(argv[i], "%255[^=]=%255s", name, val) == 2) {
+      cfg.push_back(std::make_pair(std::string(name), std::string(val)));
+    }
+  }
+  CLIParam param;
+  param.Configure(cfg);
+
+  switch (param.task) {
+    case kTrain: CLITrain(param); break;
+    case kDump2Text: CLIDump2Text(param); break;
+    case kPredict: CLIPredict(param); break;
+  }
+  rabit::Finalize();
+  return 0;
+}
+}  // namespace xgboost
+
+int main(int argc, char *argv[]) {
+  return xgboost::CLIRunTask(argc, argv);  // the process exit code is CLIRunTask's result
+}
diff --git a/src/utils/base64-inl.h b/src/common/base64.h
similarity index 82%
rename from src/utils/base64-inl.h
rename to src/common/base64.h
index be99e07b7..4c876b5f8 100644
--- a/src/utils/base64-inl.h
+++ b/src/common/base64.h
@@ -5,16 +5,17 @@
  * base64 is easier to store and pass as text format in mapreduce
  * \author Tianqi Chen
  */
-#ifndef XGBOOST_UTILS_BASE64_INL_H_
-#define XGBOOST_UTILS_BASE64_INL_H_
+#ifndef XGBOOST_COMMON_BASE64_H_
+#define XGBOOST_COMMON_BASE64_H_
 
+#include <xgboost/logging.h>
 #include <cctype>
 #include <cstdio>
 #include <string>
 #include "./io.h"
 
 namespace xgboost {
-namespace utils {
+namespace common {
 /*! \brief buffer reader of the stream that allows you to get */
 class StreamBufferReader {
  public:
@@ -26,7 +27,7 @@ class StreamBufferReader {
   /*!
    * \brief set input stream
    */
-  inline void set_stream(IStream *stream) {
+  inline void set_stream(dmlc::Stream *stream) {
     stream_ = stream;
     read_len_ = read_ptr_ = 1;
   }
@@ -51,7 +52,7 @@ class StreamBufferReader {
 
  private:
   /*! \brief the underlying stream */
-  IStream *stream_;
+  dmlc::Stream *stream_;
   /*! \brief buffer to hold data */
   std::string buffer_;
   /*! \brief length of valid data in buffer */
@@ -80,9 +81,9 @@ static const char EncodeTable[] =
     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 }  // namespace base64
 /*! \brief the stream that reads from base64, note we take from file pointers */
-class Base64InStream: public IStream {
+class Base64InStream: public dmlc::Stream {
  public:
-  explicit Base64InStream(IStream *fs) : reader_(256) {
+  explicit Base64InStream(dmlc::Stream *fs) : reader_(256) {
     reader_.set_stream(fs);
     num_prev = 0; tmp_ch = 0;
   }
@@ -134,20 +135,22 @@ class Base64InStream: public IStream {
       nvalue = DecodeTable[tmp_ch] << 18;
       {
         // second byte
-        utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
-                     "invalid base64 format");
+        tmp_ch = reader_.GetChar();
+        CHECK(tmp_ch != EOF && !isspace(tmp_ch)) << "invalid base64 format";
         nvalue |= DecodeTable[tmp_ch] << 12;
         *cptr++ = (nvalue >> 16) & 0xFF; --tlen;
-      }
+        }
       {
         // third byte
-        utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
-                     "invalid base64 format");
+        tmp_ch = reader_.GetChar();
+        CHECK(tmp_ch != EOF && !isspace(tmp_ch)) << "invalid base64 format";
         // handle termination
         if (tmp_ch == '=') {
-          utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == '='), "invalid base64 format");
-          utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
-                       "invalid base64 format");
+          tmp_ch = reader_.GetChar();
+          CHECK(tmp_ch == '=') << "invalid base64 format";
+          tmp_ch = reader_.GetChar();
+          CHECK(tmp_ch == EOF || isspace(tmp_ch))
+              << "invalid base64 format";
           break;
         }
         nvalue |= DecodeTable[tmp_ch] << 6;
@@ -159,11 +162,13 @@ class Base64InStream: public IStream {
       }
       {
         // fourth byte
-        utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
-                     "invalid base64 format");
+        tmp_ch = reader_.GetChar();
+        CHECK(tmp_ch != EOF && !isspace(tmp_ch))
+            << "invalid base64 format";
         if (tmp_ch == '=') {
-          utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
-                       "invalid base64 format");
+          tmp_ch = reader_.GetChar();
+          CHECK(tmp_ch == EOF || isspace(tmp_ch))
+              << "invalid base64 format";
           break;
         }
         nvalue |= DecodeTable[tmp_ch];
@@ -177,12 +182,12 @@ class Base64InStream: public IStream {
       tmp_ch = reader_.GetChar();
     }
     if (kStrictCheck) {
-      utils::Check(tlen == 0, "Base64InStream: read incomplete");
+      CHECK_EQ(tlen, 0) << "Base64InStream: read incomplete";
     }
     return size - tlen;
   }
   virtual void Write(const void *ptr, size_t size) {
-    utils::Error("Base64InStream do not support write");
+    LOG(FATAL) << "Base64InStream do not support write";
   }
 
  private:
@@ -194,9 +199,9 @@ class Base64InStream: public IStream {
   static const bool kStrictCheck = false;
 };
 /*! \brief the stream that write to base64, note we take from file pointers */
-class Base64OutStream: public IStream {
+class Base64OutStream: public dmlc::Stream {
  public:
-  explicit Base64OutStream(IStream *fp) : fp(fp) {
+  explicit Base64OutStream(dmlc::Stream *fp) : fp(fp) {
     buf_top = 0;
   }
   virtual void Write(const void *ptr, size_t size) {
@@ -218,7 +223,7 @@ class Base64OutStream: public IStream {
     }
   }
   virtual size_t Read(void *ptr, size_t size) {
-    utils::Error("Base64OutStream do not support read");
+    LOG(FATAL) << "Base64OutStream do not support read";
     return 0;
   }
   /*!
@@ -245,7 +250,7 @@ class Base64OutStream: public IStream {
   }
 
  private:
-  IStream *fp;
+  dmlc::Stream *fp;
   int buf_top;
   unsigned char buf[4];
   std::string out_buf;
@@ -262,6 +267,6 @@ class Base64OutStream: public IStream {
     }
   }
 };
-}  // namespace utils
+}  // namespace common
 }  // namespace xgboost
-#endif  // XGBOOST_UTILS_BASE64_INL_H_
+#endif  // XGBOOST_COMMON_BASE64_H_
diff --git a/src/utils/bitmap.h b/src/common/bitmap.h
similarity index 85%
rename from src/utils/bitmap.h
rename to src/common/bitmap.h
index eecccbda5..4f14d8b37 100644
--- a/src/utils/bitmap.h
+++ b/src/common/bitmap.h
@@ -5,15 +5,14 @@
  *  NOTE: bitmap is only threadsafe per word access, remember this when using bitmap
  * \author Tianqi Chen
  */
-#ifndef XGBOOST_UTILS_BITMAP_H_
-#define XGBOOST_UTILS_BITMAP_H_
+#ifndef XGBOOST_COMMON_BITMAP_H_
+#define XGBOOST_COMMON_BITMAP_H_
 
+#include <dmlc/omp.h>
 #include <vector>
-#include "./utils.h"
-#include "./omp.h"
 
 namespace xgboost {
-namespace utils {
+namespace common {
 /*! \brief bit map that contains set of bit indicators */
 struct BitMap {
   /*! \brief internal data structure */
@@ -40,7 +39,7 @@ struct BitMap {
     data[i >> 5] |= (1 << (i & 31U));
   }
   /*! \brief initialize the value of bit map from vector of bool*/
-  inline void InitFromBool(const std::vector<int> &vec) {
+  inline void InitFromBool(const std::vector<int>& vec) {
     this->Resize(vec.size());
     // parallel over the full cases
     bst_omp_uint nsize = static_cast<bst_omp_uint>(vec.size() / 32);
@@ -59,10 +58,10 @@ struct BitMap {
     }
   }
   /*! \brief clear the bitmap, set all places to false */
-  inline void Clear(void) {
+  inline void Clear() {
     std::fill(data.begin(), data.end(), 0U);
   }
 };
-}  // namespace utils
+}  // namespace common
 }  // namespace xgboost
-#endif  // XGBOOST_UTILS_BITMAP_H_
+#endif  // XGBOOST_COMMON_BITMAP_H_
diff --git a/src/common/common.cc b/src/common/common.cc
new file mode 100644
index 000000000..6e12045f6
--- /dev/null
+++ b/src/common/common.cc
@@ -0,0 +1,15 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file common.cc
+ * \brief Enable all kinds of global variables in common.
+ */
+#include "./random.h"
+
+namespace xgboost {
+namespace common {
+GlobalRandomEngine& GlobalRandom() {
+  static GlobalRandomEngine inst;  // function-local static, returned by reference on every call
+  return inst;
+}
+}  // namespace common
+}  // namespace xgboost
diff --git a/src/utils/config.h b/src/common/config.h
similarity index 88%
rename from src/utils/config.h
rename to src/common/config.h
index 43d7bc8bd..7385dff4e 100644
--- a/src/utils/config.h
+++ b/src/common/config.h
@@ -4,18 +4,17 @@
  * \brief helper class to load in configures from file
  * \author Tianqi Chen
  */
-#ifndef XGBOOST_UTILS_CONFIG_H_
-#define XGBOOST_UTILS_CONFIG_H_
+#ifndef XGBOOST_COMMON_CONFIG_H_
+#define XGBOOST_COMMON_CONFIG_H_
 
 #include <cstdio>
 #include <cstring>
 #include <string>
 #include <istream>
 #include <fstream>
-#include "./utils.h"
 
 namespace xgboost {
-namespace utils {
+namespace common {
 /*!
  * \brief base implementation of config reader
  */
@@ -79,11 +78,11 @@ class ConfigReaderBase {
         case '\\': *tok += this->GetChar(); break;
         case '\"': return;
         case '\r':
-        case '\n': Error("ConfigReader: unterminated string");
+        case '\n': LOG(FATAL)<< "ConfigReader: unterminated string";
         default: *tok += ch_buf;
       }
     }
-    Error("ConfigReader: unterminated string");
+    LOG(FATAL) << "ConfigReader: unterminated string";
   }
   inline void ParseStrML(std::string *tok) {
     while ((ch_buf = this->GetChar()) != EOF) {
@@ -93,7 +92,7 @@ class ConfigReaderBase {
         default: *tok += ch_buf;
       }
     }
-    Error("unterminated string");
+    LOG(FATAL) << "unterminated string";
   }
   // return newline
   inline bool GetNextToken(std::string *tok) {
@@ -106,13 +105,13 @@ class ConfigReaderBase {
           if (tok->length() == 0) {
             ParseStr(tok); ch_buf = this->GetChar(); return new_line;
           } else {
-            Error("ConfigReader: token followed directly by string");
+            LOG(FATAL) << "ConfigReader: token followed directly by string";
           }
         case '\'':
           if (tok->length() == 0) {
             ParseStrML(tok); ch_buf = this->GetChar(); return new_line;
           } else {
-            Error("ConfigReader: token followed directly by string");
+            LOG(FATAL) << "ConfigReader: token followed directly by string";
           }
         case '=':
           if (tok->length() == 0) {
@@ -148,7 +147,7 @@ class ConfigStreamReader: public ConfigReaderBase {
  public:
   /*!
    * \brief constructor
-   * \param istream input stream
+   * \param fin istream input stream
    */
   explicit ConfigStreamReader(std::istream &fin) : fin(fin) {}
 
@@ -177,7 +176,7 @@ class ConfigIterator: public ConfigStreamReader {
   explicit ConfigIterator(const char *fname) : ConfigStreamReader(fi) {
     fi.open(fname);
     if (fi.fail()) {
-      utils::Error("cannot open file %s", fname);
+      LOG(FATAL) << "cannot open file " << fname;
     }
     ConfigReaderBase::Init();
   }
@@ -189,6 +188,6 @@ class ConfigIterator: public ConfigStreamReader {
  private:
   std::ifstream fi;
 };
-}  // namespace utils
+}  // namespace common
 }  // namespace xgboost
-#endif  // XGBOOST_UTILS_CONFIG_H_
+#endif  // XGBOOST_COMMON_CONFIG_H_
diff --git a/src/utils/group_data.h b/src/common/group_data.h
similarity index 91%
rename from src/utils/group_data.h
rename to src/common/group_data.h
index 31f9c3a50..3759e1ee3 100644
--- a/src/utils/group_data.h
+++ b/src/common/group_data.h
@@ -11,13 +11,13 @@
  * The major algorithm is a two pass linear scan algorithm that requires two pass scan over the data
  * \author Tianqi Chen
  */
-#ifndef XGBOOST_UTILS_GROUP_DATA_H_
-#define XGBOOST_UTILS_GROUP_DATA_H_
+#ifndef XGBOOST_COMMON_GROUP_DATA_H_
+#define XGBOOST_COMMON_GROUP_DATA_H_
 
 #include <vector>
 
 namespace xgboost {
-namespace utils {
+namespace common {
 /*!
  * \brief multi-thread version of group builder
  * \tparam ValueType type of entries in the sparse matrix
@@ -91,7 +91,8 @@ struct ParallelGroupBuilder {
    * \brief step 4: add data to the allocated space,
    *   the calls to this function should be exactly match previous call to AddBudget
    *
-   * \param key the key of
+   * \param key the key of group.
+   * \param value The value to be pushed to the group.
    * \param threadid the id of thread that calls this function
    */
   inline void Push(size_t key, ValueType value, int threadid) {
@@ -105,10 +106,10 @@ struct ParallelGroupBuilder {
   /*! \brief index of nonzero entries in each row */
   std::vector<ValueType> &data;
   /*! \brief thread local data structure */
-  std::vector< std::vector<SizeType> > &thread_rptr;
+  std::vector<std::vector<SizeType> > &thread_rptr;
   /*! \brief local temp thread ptr, use this if not specified by the constructor */
-  std::vector< std::vector<SizeType> > tmp_thread_rptr;
+  std::vector<std::vector<SizeType> > tmp_thread_rptr;
 };
-}  // namespace utils
+}  // namespace common
 }  // namespace xgboost
-#endif  // XGBOOST_UTILS_GROUP_DATA_H_
+#endif  // XGBOOST_COMMON_GROUP_DATA_H_
diff --git a/src/common/io.h b/src/common/io.h
new file mode 100644
index 000000000..c6e3a11c0
--- /dev/null
+++ b/src/common/io.h
@@ -0,0 +1,75 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file io.h
+ * \brief general stream interface for serialization, I/O
+ * \author Tianqi Chen
+ */
+
+#ifndef XGBOOST_COMMON_IO_H_
+#define XGBOOST_COMMON_IO_H_
+
+#include <dmlc/io.h>
+#include <string>
+#include <cstring>
+#include "./sync.h"
+
+namespace xgboost {
+namespace common {
+typedef rabit::utils::MemoryFixSizeBuffer MemoryFixSizeBuffer;
+typedef rabit::utils::MemoryBufferStream MemoryBufferStream;
+
+/*!
+ * \brief Read-only stream adaptor that adds a PeekRead operation,
+ *  which fetches upcoming bytes without consuming them.
+ */
+class PeekableInStream : public dmlc::Stream {
+ public:
+  explicit PeekableInStream(dmlc::Stream* strm)
+      : strm_(strm), buffer_ptr_(0) {}
+  /*! \brief read up to size bytes, serving previously peeked bytes first */
+  size_t Read(void* dptr, size_t size) override {
+    const size_t pending = buffer_.length() - buffer_ptr_;
+    if (pending == 0) return strm_->Read(dptr, size);
+    const char* src = dmlc::BeginPtr(buffer_) + buffer_ptr_;
+    if (pending >= size) {
+      std::memcpy(dptr, src, size);
+      buffer_ptr_ += size;
+      return size;
+    }
+    // drain what is buffered, then take the remainder from the wrapped stream
+    std::memcpy(dptr, src, pending);
+    buffer_ptr_ += pending;
+    return pending + strm_->Read(reinterpret_cast<char*>(dptr) + pending, size - pending);
+  }
+  /*! \brief copy up to size upcoming bytes into dptr without consuming them */
+  size_t PeekRead(void* dptr, size_t size) {
+    const size_t pending = buffer_.length() - buffer_ptr_;
+    if (pending >= size) {
+      std::memcpy(dptr, dmlc::BeginPtr(buffer_) + buffer_ptr_, size);
+      return size;
+    }
+    // drop the consumed prefix, then top the buffer up from the wrapped stream
+    buffer_ = buffer_.substr(buffer_ptr_, buffer_.length());
+    buffer_ptr_ = 0;
+    buffer_.resize(size);
+    const size_t nadd = strm_->Read(dmlc::BeginPtr(buffer_) + pending, size - pending);
+    buffer_.resize(pending + nadd);
+    std::memcpy(dptr, dmlc::BeginPtr(buffer_), buffer_.length());
+    return buffer_.length();
+  }
+  /*! \brief writing is not supported; always logs a fatal error */
+  void Write(const void* dptr, size_t size) override {
+    LOG(FATAL) << "Not implemented";
+  }
+
+ private:
+  /*! \brief input stream */
+  dmlc::Stream *strm_;
+  /*! \brief current buffer pointer */
+  size_t buffer_ptr_;
+  /*! \brief internal buffer */
+  std::string buffer_;
+};
+}  // namespace common
+}  // namespace xgboost
+#endif  // XGBOOST_COMMON_IO_H_
diff --git a/src/common/math.h b/src/common/math.h
new file mode 100644
index 000000000..518da23bb
--- /dev/null
+++ b/src/common/math.h
@@ -0,0 +1,138 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file math.h
+ * \brief additional math utils
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_COMMON_MATH_H_
+#define XGBOOST_COMMON_MATH_H_
+
+#include <xgboost/logging.h>
+#include <utility>
+#include <vector>
+#include <cmath>
+#include <algorithm>
+
+namespace xgboost {
+namespace common {
+/*!
+ * \brief logistic sigmoid, 1 / (1 + exp(-x)).
+ * \param x input parameter
+ * \return the sigmoid of x.
+ */
+inline float Sigmoid(float x) {
+  return 1.0f / (std::exp(-x) + 1.0f);
+}
+
+/*!
+ * \brief do inplace softmax transformation on p_rec
+ * \param p_rec the input/output vector of the values; an empty vector is left untouched.
+ */
+inline void Softmax(std::vector<float>* p_rec) {
+  std::vector<float> &rec = *p_rec;
+  // guard: reading rec[0] below is undefined behavior on an empty vector
+  if (rec.empty()) return;
+  // subtract the maximum before exponentiating so exp() cannot overflow
+  float wmax = rec[0];
+  for (size_t i = 1; i < rec.size(); ++i) wmax = std::max(rec[i], wmax);
+  double wsum = 0.0;
+  for (size_t i = 0; i < rec.size(); ++i) {
+    rec[i] = std::exp(rec[i] - wmax);
+    wsum += rec[i];
+  }
+  const float norm = static_cast<float>(wsum);
+  for (size_t i = 0; i < rec.size(); ++i) rec[i] /= norm;
+}
+
+/*!
+ * \brief Locate the largest element in the range [begin, end).
+ * \param begin The beginning iterator.
+ * \param end The end iterator.
+ * \return iterator to the first largest element; begin if the range is empty.
+ * \tparam Iterator The type of the iterator.
+ */
+template<typename Iterator>
+inline Iterator FindMaxIndex(Iterator begin, Iterator end) {
+  Iterator best = begin;
+  for (Iterator cur = begin; cur != end; ++cur) {
+    best = (*cur > *best) ? cur : best;
+  }
+  return best;
+}
+
+/*!
+ * \brief numerically safe log of a sum of two exponentials
+ * \param x left input operand
+ * \param y right input operand
+ * \return  log(exp(x) + exp(y))
+ */
+inline float LogSum(float x, float y) {
+  // factor out the larger operand so exp() is never given a positive argument
+  const bool y_larger = x < y;
+  const float hi = y_larger ? y : x;
+  const float lo = y_larger ? x : y;
+  return hi + std::log(std::exp(lo - hi) + 1.0f);
+}
+
+/*!
+ * \brief perform numerically safe logsum over a range
+ * \param begin The beginning iterator; it is dereferenced, so the range must not be empty.
+ * \param end The end iterator.
+ * \return log of the sum of exp(v) over every element v in [begin, end).
+ * \tparam Iterator The type of the iterator.
+ */
+template<typename Iterator>
+inline float LogSum(Iterator begin, Iterator end) {
+  // shift by the maximum so that exp() below cannot overflow
+  float vmax = *begin;
+  for (Iterator p = begin; p != end; ++p) vmax = std::max(vmax, *p);
+  // sum of the shifted exponentials
+  float acc = 0.0f;
+  for (Iterator p = begin; p != end; ++p) {
+    acc += std::exp(*p - vmax);
+  }
+  return vmax + std::log(acc);
+}
+
+// comparators ordering pairs in descending order, by .first and by .second (for sorting)
+inline static bool CmpFirst(const std::pair<float, unsigned> &a,
+                            const std::pair<float, unsigned> &b) {
+  return b.first < a.first;
+}
+inline static bool CmpSecond(const std::pair<float, unsigned> &a,
+                             const std::pair<float, unsigned> &b) {
+  return b.second < a.second;
+}
+
+#if XGBOOST_STRICT_R_MODE
+// strict R mode: only declared here; the definitions are expected elsewhere
+bool CheckNAN(double v);
+double LogGamma(double v);
+#else
+/*! \brief check whether v is NaN */
+template<typename T>
+inline bool CheckNAN(T v) {
+#ifdef _MSC_VER
+  return (_isnan(v) != 0);
+#else
+  return std::isnan(v);
+#endif
+}
+/*!
+ * \brief log-gamma of v; fatal error on MSVC older than VS2013, which lacks lgamma.
+ */
+template<typename T>
+inline T LogGamma(T v) {
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#pragma message("Warning: lgamma function was not available until VS2013"\
+                ", poisson regression will be disabled")
+  LOG(FATAL) << "lgamma function was not available until VS2013";
+  return static_cast<T>(1.0);
+#else
+  return lgamma(v);
+#endif
+}
+#endif  // XGBOOST_STRICT_R_MODE
+}  // namespace common
+}  // namespace xgboost
+#endif  // XGBOOST_COMMON_MATH_H_
diff --git a/src/utils/quantile.h b/src/common/quantile.h
similarity index 89%
rename from src/utils/quantile.h
rename to src/common/quantile.h
index d1c029f65..9c427470f 100644
--- a/src/utils/quantile.h
+++ b/src/common/quantile.h
@@ -4,19 +4,19 @@
  * \brief util to compute quantiles
  * \author Tianqi Chen
  */
-#ifndef XGBOOST_UTILS_QUANTILE_H_
-#define XGBOOST_UTILS_QUANTILE_H_
+#ifndef XGBOOST_COMMON_QUANTILE_H_
+#define XGBOOST_COMMON_QUANTILE_H_
 
+#include <dmlc/base.h>
+#include <xgboost/logging.h>
 #include <cmath>
 #include <vector>
 #include <cstring>
 #include <algorithm>
 #include <iostream>
-#include "./io.h"
-#include "./utils.h"
 
 namespace xgboost {
-namespace utils {
+namespace common {
 /*!
  * \brief experimental wsummary
  * \tparam DType type of data content
@@ -35,7 +35,7 @@ struct WQSummary {
     /*! \brief the value of data */
     DType value;
     // constructor
-    Entry(void) {}
+    Entry() {}
     // constructor
     Entry(RType rmin, RType rmax, RType wmin, DType value)
         : rmin(rmin), rmax(rmax), wmin(wmin), value(value) {}
@@ -44,15 +44,15 @@ struct WQSummary {
      * \param eps the tolerate level for violating the relation
      */
     inline void CheckValid(RType eps = 0) const {
-      utils::Assert(rmin >= 0 && rmax >= 0 && wmin >= 0, "nonneg constraint");
-      utils::Assert(rmax- rmin - wmin > -eps, "relation constraint: min/max");
+      CHECK(rmin >= 0 && rmax >= 0 && wmin >= 0) << "nonneg constraint";
+      CHECK(rmax- rmin - wmin > -eps) <<  "relation constraint: min/max";
     }
     /*! \return rmin estimation for v strictly bigger than value */
-    inline RType rmin_next(void) const {
+    inline RType rmin_next() const {
       return rmin + wmin;
     }
     /*! \return rmax estimation for v strictly smaller than value */
-    inline RType rmax_prev(void) const {
+    inline RType rmax_prev() const {
       return rmax - wmin;
     }
   };
@@ -65,7 +65,7 @@ struct WQSummary {
       // weight of instance
       RType weight;
       // default constructor
-      QEntry(void) {}
+      QEntry() {}
       // constructor
       QEntry(DType value, RType weight)
           : value(value), weight(weight) {}
@@ -113,7 +113,7 @@ struct WQSummary {
   /*!
    * \return the maximum error of the Summary
    */
-  inline RType MaxError(void) const {
+  inline RType MaxError() const {
     RType res = data[0].rmax - data[0].rmin - data[0].wmin;
     for (size_t i = 1; i < size; ++i) {
       res = std::max(data[i].rmax_prev() - data[i - 1].rmin_next(), res);
@@ -147,7 +147,7 @@ struct WQSummary {
     }
   }
   /*! \return maximum rank in the summary */
-  inline RType MaxRank(void) const {
+  inline RType MaxRank() const {
     return data[size - 1].rmax;
   }
   /*!
@@ -168,8 +168,8 @@ struct WQSummary {
     for (size_t i = 0; i < size; ++i) {
       data[i].CheckValid(eps);
       if (i != 0) {
-        utils::Assert(data[i].rmin >= data[i - 1].rmin + data[i - 1].wmin, "rmin range constraint");
-        utils::Assert(data[i].rmax >= data[i - 1].rmax + data[i].wmin, "rmax range constraint");
+        CHECK(data[i].rmin >= data[i - 1].rmin + data[i - 1].wmin) << "rmin range constraint";
+        CHECK(data[i].rmax >= data[i - 1].rmax + data[i].wmin) << "rmax range constraint";
       }
     }
   }
@@ -196,7 +196,7 @@ struct WQSummary {
       // find first i such that  d < (rmax[i+1] + rmin[i+1]) / 2
       while (i < src.size - 1
              && dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
-      utils::Assert(i != src.size - 1, "this cannot happen");
+      CHECK(i != src.size - 1);
       if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
         if (i != lastidx) {
           data[size++] = src.data[i]; lastidx = i;
@@ -224,7 +224,7 @@ struct WQSummary {
     if (sb.size == 0) {
       this->CopyFrom(sa); return;
     }
-    utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge");
+    CHECK(sa.size > 0 && sb.size > 0);
     const Entry *a = sa.data, *a_end = sa.data + sa.size;
     const Entry *b = sb.data, *b_end = sb.data + sb.size;
     // extended rmin value
@@ -272,18 +272,19 @@ struct WQSummary {
     RType err_mingap, err_maxgap, err_wgap;
     this->FixError(&err_mingap, &err_maxgap, &err_wgap);
     if (err_mingap > tol || err_maxgap > tol || err_wgap > tol) {
-      utils::Printf("INFO: mingap=%g, maxgap=%g, wgap=%g\n",
-                    err_mingap, err_maxgap, err_wgap);
+      LOG(INFO) << "mingap=" << err_mingap
+                << ", maxgap=" << err_maxgap
+                << ", wgap=" << err_wgap;
     }
-
-    utils::Assert(size <= sa.size + sb.size, "bug in combine");
+    CHECK(size <= sa.size + sb.size) << "bug in combine";
   }
   // helper function to print the current content of sketch
   inline void Print() const {
     for (size_t i = 0; i < this->size; ++i) {
-      utils::Printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g\n",
-                    i, data[i].rmin, data[i].rmax,
-                    data[i].wmin, data[i].value);
+      LOG(INFO) << "[" << i << "] rmin=" << data[i].rmin
+                << ", rmax=" << data[i].rmax
+                << ", wmin=" << data[i].wmin
+                << ", v=" << data[i].value;
     }
   }
   // try to fix rounding error
@@ -320,7 +321,7 @@ struct WQSummary {
     for (size_t i = 0; i < this->size; ++i) {
       if (data[i].rmin + data[i].wmin > data[i].rmax + tol ||
           data[i].rmin < -1e-6f || data[i].rmax < -1e-6f) {
-        utils::Printf("----%s: Check not Pass------\n", msg);
+        LOG(INFO) << "----------check not pass----------";
         this->Print();
         return false;
       }
@@ -380,12 +381,11 @@ struct WXQSummary : public WQSummary<DType, RType> {
     }
     if (nbig >= n - 1) {
       // see what was the case
-      utils::Printf("LOG: check quantile stats, nbig=%lu, n=%lu\n", nbig, n);
-      utils::Printf("LOG: srcsize=%lu, maxsize=%lu, range=%g, chunk=%g\n",
-                    src.size, maxsize, static_cast<double>(range),
-                    static_cast<double>(chunk));
+      LOG(INFO) << " check quantile stats, nbig=" << nbig << ", n=" << n;
+      LOG(INFO) << " srcsize=" << src.size << ", maxsize=" << maxsize
+                << ", range=" << range << ", chunk=" << chunk;
       src.Print();
-      utils::Assert(nbig < n - 1, "quantile: too many large chunk");
+      CHECK(nbig < n - 1) << "quantile: too many large chunk";
     }
     this->data[0] = src.data[0];
     this->size = 1;
@@ -440,7 +440,7 @@ struct GKSummary {
     /*! \brief the value of data */
     DType value;
     // constructor
-    Entry(void) {}
+    Entry() {}
     // constructor
     Entry(RType rmin, RType rmax, DType value)
         : rmin(rmin), rmax(rmax), value(value) {}
@@ -470,7 +470,7 @@ struct GKSummary {
   GKSummary(Entry *data, size_t size)
       : data(data), size(size) {}
   /*! \brief the maximum error of the summary */
-  inline RType MaxError(void) const {
+  inline RType MaxError() const {
     RType res = 0;
     for (size_t i = 1; i < size; ++i) {
       res = std::max(data[i].rmax - data[i-1].rmin, res);
@@ -478,7 +478,7 @@ struct GKSummary {
     return res;
   }
   /*! \return maximum rank in the summary */
-  inline RType MaxRank(void) const {
+  inline RType MaxRank() const {
     return data[size - 1].rmax;
   }
   /*!
@@ -493,7 +493,7 @@ struct GKSummary {
     // assume always valid
   }
   /*! \brief used for debug purpose, print the summary */
-  inline void Print(void) const {
+  inline void Print() const {
     for (size_t i = 0; i < size; ++i) {
       std::cout << "x=" << data[i].value << "\t"
                 << "[" << data[i].rmin << "," << data[i].rmax << "]"
@@ -536,7 +536,7 @@ struct GKSummary {
     if (sb.size == 0) {
       this->CopyFrom(sa); return;
     }
-    utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge");
+    CHECK(sa.size > 0 && sb.size > 0) << "invalid input for merge";
     const Entry *a = sa.data, *a_end = sa.data + sa.size;
     const Entry *b = sb.data, *b_end = sb.data + sb.size;
     this->size = sa.size + sb.size;
@@ -569,7 +569,7 @@ struct GKSummary {
         ++dst; ++b;
       } while (b != b_end);
     }
-    utils::Assert(dst == data + size, "bug in combine");
+    CHECK(dst == data + size) << "bug in combine";
   }
 };
 
@@ -592,15 +592,15 @@ class QuantileSketchTemplate {
     std::vector<Entry> space;
     SummaryContainer(const SummaryContainer &src) : Summary(NULL, src.size) {
       this->space = src.space;
-      this->data = BeginPtr(this->space);
+      this->data = dmlc::BeginPtr(this->space);
     }
-    SummaryContainer(void) : Summary(NULL, 0) {
+    SummaryContainer() : Summary(NULL, 0) {
     }
     /*! \brief reserve space for summary */
     inline void Reserve(size_t size) {
       if (size > space.size()) {
         space.resize(size);
-        this->data = BeginPtr(space);
+        this->data = dmlc::BeginPtr(space);
       }
     }
     /*!
@@ -610,7 +610,7 @@ class QuantileSketchTemplate {
      */
     inline void SetMerge(const Summary *begin,
                          const Summary *end) {
-      utils::Assert(begin < end, "can not set combine to empty instance");
+      CHECK(begin < end) << "can not set combine to empty instance";
       size_t len = end - begin;
       if (len == 1) {
         this->Reserve(begin[0].size);
@@ -631,7 +631,7 @@ class QuantileSketchTemplate {
      * \brief do elementwise combination of summary array
      *        this[i] = combine(this[i], src[i]) for each i
      * \param src the source summary
-     * \param max_nbyte, maximum number of byte allowed in here
+     * \param max_nbyte maximum number of byte allowed in here
      */
     inline void Reduce(const Summary &src, size_t max_nbyte) {
       this->Reserve((max_nbyte - sizeof(this->size)) / sizeof(Entry));
@@ -655,11 +655,11 @@ class QuantileSketchTemplate {
     /*! \brief load data structure from input stream */
     template<typename TStream>
     inline void Load(TStream &fi) {  // NOLINT(*)
-      utils::Check(fi.Read(&this->size, sizeof(this->size)) != 0, "invalid SummaryArray 1");
+      CHECK_EQ(fi.Read(&this->size, sizeof(this->size)), sizeof(this->size));
       this->Reserve(this->size);
       if (this->size != 0) {
-        utils::Check(fi.Read(this->data, this->size * sizeof(Entry)) != 0,
-                     "invalid SummaryArray 2");
+        CHECK_EQ(fi.Read(this->data, this->size * sizeof(Entry)),
+                 this->size * sizeof(Entry));
       }
     }
   };
@@ -678,8 +678,8 @@ class QuantileSketchTemplate {
     }
     // check invariant
     size_t n = (1UL << nlevel);
-    utils::Assert(n * limit_size >= maxn, "invalid init parameter");
-    utils::Assert(nlevel <= limit_size * eps, "invalid init parameter");
+    CHECK(n * limit_size >= maxn) << "invalid init parameter";
+    CHECK(nlevel <= limit_size * eps) << "invalid init parameter";
     // lazy reserve the space, if there is only one value, no need to allocate space
     inqueue.queue.resize(1);
     inqueue.qtail = 0;
@@ -688,7 +688,8 @@ class QuantileSketchTemplate {
   }
   /*!
    * \brief add an element to a sketch
-   * \param x the element added to the sketch
+   * \param x The element added to the sketch
+   * \param w The weight of the element.
    */
   inline void Push(DType x, RType w = 1) {
     if (w == static_cast<RType>(0)) return;
@@ -707,7 +708,7 @@ class QuantileSketchTemplate {
     inqueue.Push(x, w);
   }
   /*! \brief push up temp */
-  inline void PushTemp(void) {
+  inline void PushTemp() {
     temp.Reserve(limit_size * 2);
     for (size_t l = 1; true; ++l) {
       this->InitLevel(l + 1);
@@ -769,7 +770,7 @@ class QuantileSketchTemplate {
     data.resize(limit_size * nlevel);
     level.resize(nlevel, Summary(NULL, 0));
     for (size_t l = 0; l < level.size(); ++l) {
-      level[l].data = BeginPtr(data) + l * limit_size;
+      level[l].data = dmlc::BeginPtr(data) + l * limit_size;
     }
   }
   // input data queue
@@ -793,7 +794,7 @@ class QuantileSketchTemplate {
  */
 template<typename DType, typename RType = unsigned>
 class WQuantileSketch :
-      public QuantileSketchTemplate<DType, RType, WQSummary<DType, RType> >{
+      public QuantileSketchTemplate<DType, RType, WQSummary<DType, RType> > {
 };
 
 /*!
@@ -803,7 +804,7 @@ class WQuantileSketch :
  */
 template<typename DType, typename RType = unsigned>
 class WXQuantileSketch :
-      public QuantileSketchTemplate<DType, RType, WXQSummary<DType, RType> >{
+      public QuantileSketchTemplate<DType, RType, WXQSummary<DType, RType> > {
 };
 /*!
  * \brief Quantile sketch use WQSummary
@@ -812,9 +813,8 @@ class WXQuantileSketch :
  */
 template<typename DType, typename RType = unsigned>
 class GKQuantileSketch :
-      public QuantileSketchTemplate<DType, RType, GKSummary<DType, RType> >{
+      public QuantileSketchTemplate<DType, RType, GKSummary<DType, RType> > {
 };
-
-}  // namespace utils
+}  // namespace common
 }  // namespace xgboost
-#endif  // XGBOOST_UTILS_QUANTILE_H_
+#endif  // XGBOOST_COMMON_QUANTILE_H_
diff --git a/src/common/random.h b/src/common/random.h
new file mode 100644
index 000000000..f47ff5f75
--- /dev/null
+++ b/src/common/random.h
@@ -0,0 +1,70 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file random.h
+ * \brief Utility related to random.
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_COMMON_RANDOM_H_
+#define XGBOOST_COMMON_RANDOM_H_
+
+#include <random>
+#include <limits>
+
+namespace xgboost {
+namespace common {
+/*!
+ * \brief Define mt19937 as default type Random Engine.
+ */
+typedef std::mt19937 RandomEngine;
+
+#if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
+/*!
+ * \brief A customized random engine, used to plug in a PRNG from other systems.
+ *  The implementation of this class is not provided by xgboost core library.
+ *  Instead the other library can implement this class, which will be used as GlobalRandomEngine.
+ *  Only compiled when XGBOOST_CUSTOMIZE_GLOBAL_PRNG is set to a nonzero value.
+ */
+class CustomGlobalRandomEngine {
+ public:
+  /*! \brief The result type */
+  typedef size_t result_type;
+  /*! \brief The minimum of random numbers generated */
+  inline static constexpr result_type min() {
+    return std::numeric_limits<result_type>::min();
+  }
+  /*! \brief The maximum random numbers generated */
+  inline static constexpr result_type max() {
+    return std::numeric_limits<result_type>::max();
+  }
+  /*!
+   * \brief seed function, declared here and to be implemented by the plugging library
+   * \param val The value of the seed.
+   */
+  void seed(result_type val);
+  /*!
+   * \return next random number.
+   */
+  result_type operator()();
+};
+
+/*!
+ * \brief global random engine
+ */
+typedef CustomGlobalRandomEngine GlobalRandomEngine;
+
+#else
+/*!
+ * \brief global random engine
+ */
+typedef RandomEngine GlobalRandomEngine;
+#endif
+
+/*!
+ * \brief global singleton of a random engine.
+ *  Only use this engine when necessary, not thread-safe.
+ */
+GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
+
+}  // namespace common
+}  // namespace xgboost
+#endif  // XGBOOST_COMMON_RANDOM_H_
diff --git a/src/common/sync.h b/src/common/sync.h
new file mode 100644
index 000000000..e85e17ba2
--- /dev/null
+++ b/src/common/sync.h
@@ -0,0 +1,13 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file sync.h
+ * \brief the synchronization module of rabit
+ *        redirects to rabit header
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_COMMON_SYNC_H_
+#define XGBOOST_COMMON_SYNC_H_
+
+#include <rabit.h>
+
+#endif  // XGBOOST_COMMON_SYNC_H_
diff --git a/src/common/thread_local.h b/src/common/thread_local.h
new file mode 100644
index 000000000..6ea8eb5ab
--- /dev/null
+++ b/src/common/thread_local.h
@@ -0,0 +1,87 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file thread_local.h
+ * \brief Common utility for thread local storage.
+ */
+#ifndef XGBOOST_COMMON_THREAD_LOCAL_H_
+#define XGBOOST_COMMON_THREAD_LOCAL_H_
+
+#if DMLC_ENABLE_STD_THREAD
+#include <mutex>
+#endif
+
+#include <memory>
+#include <vector>
+
+namespace xgboost {
+namespace common {
+
+// macro handling for threadlocal variables: pick the compiler's storage keyword
+#ifdef __GNUC__
+  #define MX_TREAD_LOCAL __thread
+#elif __STDC_VERSION__ >= 201112L
+  #define  MX_TREAD_LOCAL _Thread_local
+#elif defined(_MSC_VER)
+  #define MX_TREAD_LOCAL __declspec(thread)
+#endif
+
+#ifndef MX_TREAD_LOCAL
+#pragma message("Warning: Threadlocal is not enabled")
+#endif  // MX_TREAD_LOCAL
+
+/*!
+ * \brief Store handing out one lazily created instance of T per thread.
+ *  Every created instance is recorded in the shared Singleton() registry,
+ *  which deletes them all when the registry itself is destroyed.
+ * \tparam T the type we like to store
+ */
+template<typename T>
+class ThreadLocalStore {
+ public:
+  /*! \return get a thread local singleton */
+  static T* Get() {
+    // MX_TREAD_LOCAL gives each thread its own copy of this pointer
+    static MX_TREAD_LOCAL T* ptr = nullptr;
+    if (ptr != nullptr) return ptr;
+    // first call on this thread: create the object and hand ownership to the registry
+    ptr = new T();
+    Singleton()->RegisterDelete(ptr);
+    return ptr;
+  }
+
+ private:
+  /*! \brief constructor */
+  ThreadLocalStore() {}
+  /*! \brief destructor, frees every registered object */
+  ~ThreadLocalStore() {
+    for (T* item : data_) {
+      delete item;
+    }
+  }
+  /*! \return singleton of the store */
+  static ThreadLocalStore<T> *Singleton() {
+    static ThreadLocalStore<T> inst;
+    return &inst;
+  }
+  /*!
+   * \brief register str for internal deletion
+   * \param str pointer to the object that the destructor will delete
+   */
+  void RegisterDelete(T *str) {
+#if DMLC_ENABLE_STD_THREAD
+    // serialize registrations; the lock is released when this function returns
+    std::lock_guard<std::mutex> lock(mutex_);
+#endif
+    data_.push_back(str);
+  }
+
+#if DMLC_ENABLE_STD_THREAD
+  /*! \brief internal mutex */
+  std::mutex mutex_;
+#endif
+  /*!\brief internal data */
+  std::vector<T*> data_;
+};
+}  // namespace common
+}  // namespace xgboost
+#endif  // XGBOOST_COMMON_THREAD_LOCAL_H_
diff --git a/src/data.h b/src/data.h
deleted file mode 100644
index 9bcb84ced..000000000
--- a/src/data.h
+++ /dev/null
@@ -1,166 +0,0 @@
-/*!
- * Copyright (c) 2014 by Contributors
- * \file data.h
- * \brief the input data structure for gradient boosting
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_DATA_H_
-#define XGBOOST_DATA_H_
-
-#include <cstdio>
-#include <vector>
-#include "utils/utils.h"
-#include "utils/iterator.h"
-
-namespace xgboost {
-/*!
- * \brief unsigned integer type used in boost,
- *        used for feature index and row index
- */
-typedef unsigned bst_uint;
-/*! \brief float type, used for storing statistics */
-typedef float bst_float;
-const float rt_eps = 1e-5f;
-// min gap between feature values to allow a split happen
-const float rt_2eps = rt_eps * 2.0f;
-
-/*! \brief gradient statistics pair usually needed in gradient boosting */
-struct bst_gpair {
-  /*! \brief gradient statistics */
-  bst_float grad;
-  /*! \brief second order gradient statistics */
-  bst_float hess;
-  bst_gpair(void) {}
-  bst_gpair(bst_float grad, bst_float hess) : grad(grad), hess(hess) {}
-};
-
-/*!
- * \brief extra information that might be needed by gbm and tree module
- * this information is not necessarily present, and can be empty
- */
-struct BoosterInfo {
-  /*! \brief number of rows in the data */
-  size_t num_row;
-  /*! \brief number of columns in the data */
-  size_t num_col;
-  /*!
-   * \brief specified root index of each instance,
-   *  can be used for multi task setting
-   */
-  std::vector<unsigned> root_index;
-  /*! \brief set fold indicator */
-  std::vector<unsigned> fold_index;
-  /*! \brief number of rows, number of columns */
-  BoosterInfo(void) : num_row(0), num_col(0) {
-  }
-  /*! \brief get root of i-th instance */
-  inline unsigned GetRoot(size_t i) const {
-    return root_index.size() == 0 ? 0 : root_index[i];
-  }
-};
-
-/*! \brief read-only sparse instance batch in CSR format */
-struct SparseBatch {
-  /*! \brief an entry of sparse vector */
-  struct Entry {
-    /*! \brief feature index */
-    bst_uint index;
-    /*! \brief feature value */
-    bst_float fvalue;
-    // default constructor
-    Entry(void) {}
-    Entry(bst_uint index, bst_float fvalue) : index(index), fvalue(fvalue) {}
-    /*! \brief reversely compare feature values */
-    inline static bool CmpValue(const Entry &a, const Entry &b) {
-      return a.fvalue < b.fvalue;
-    }
-  };
-  /*! \brief an instance of sparse vector in the batch */
-  struct Inst {
-    /*! \brief pointer to the elements*/
-    const Entry *data;
-    /*! \brief length of the instance */
-    bst_uint length;
-    /*! \brief constructor */
-    Inst(const Entry *data, bst_uint length) : data(data), length(length) {}
-    /*! \brief get i-th pair in the sparse vector*/
-    inline const Entry& operator[](size_t i) const {
-      return data[i];
-    }
-  };
-  /*! \brief batch size */
-  size_t size;
-};
-/*! \brief read-only row batch, used to access row continuously */
-struct RowBatch : public SparseBatch {
-  /*! \brief the offset of rowid of this batch */
-  size_t base_rowid;
-  /*! \brief array[size+1], row pointer of each of the elements */
-  const size_t *ind_ptr;
-  /*! \brief array[ind_ptr.back()], content of the sparse element */
-  const Entry *data_ptr;
-  /*! \brief get i-th row from the batch */
-  inline Inst operator[](size_t i) const {
-    return Inst(data_ptr + ind_ptr[i], static_cast<bst_uint>(ind_ptr[i+1] - ind_ptr[i]));
-  }
-};
-/*!
- * \brief read-only column batch, used to access columns,
- * the columns are not required to be continuous
- */
-struct ColBatch : public SparseBatch {
-  /*! \brief column index of each columns in the data */
-  const bst_uint *col_index;
-  /*! \brief pointer to the column data */
-  const Inst *col_data;
-  /*! \brief get i-th column from the batch */
-  inline Inst operator[](size_t i) const {
-    return col_data[i];
-  }
-};
-/**
- * \brief interface of feature matrix, needed for tree construction
- *  this interface defines two ways to access features:
- *   row access is defined by iterator of RowBatch
- *   col access is optional, checked by HaveColAccess, and defined by iterator of ColBatch
- */
-class IFMatrix {
- public:
-  // the interface only need to guarantee row iter
-  // column iter is active, when ColIterator is called, row_iter can be disabled
-  /*! \brief get the row iterator associated with FMatrix */
-  virtual utils::IIterator<RowBatch> *RowIterator(void) = 0;
-  /*!\brief get column iterator */
-  virtual utils::IIterator<ColBatch> *ColIterator(void) = 0;
-  /*!
-   * \brief get the column iterator associated with FMatrix with subset of column features
-   * \param fset is the list of column index set that must be contained in the returning Column iterator
-   * \return the column iterator, initialized so that it reads the elements in fset
-   */
-  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) = 0;
-  /*!
-   * \brief check if column access is supported, if not, initialize column access
-   * \param enabled whether certain feature should be included in column access
-   * \param subsample subsample ratio when generating column access
-   * \param max_row_perbatch auxiliary information, maximum row used in each column batch
-   *         this is a hint information that can be ignored by the implementation
-   */
-  virtual void InitColAccess(const std::vector<bool> &enabled,
-                             float subsample,
-                             size_t max_row_perbatch) = 0;
-  // the following are column meta data, should be able to answer them fast
-  /*! \return whether column access is enabled */
-  virtual bool HaveColAccess(void) const = 0;
-  /*! \return number of columns in the FMatrix */
-  virtual size_t NumCol(void) const = 0;
-  /*! \brief get number of non-missing entries in column */
-  virtual size_t GetColSize(size_t cidx) const = 0;
-  /*! \brief get column density */
-  virtual float GetColDensity(size_t cidx) const = 0;
-  /*! \brief reference of buffered rowset */
-  virtual const std::vector<bst_uint> &buffered_rowset(void) const = 0;
-  // virtual destructor
-  virtual ~IFMatrix(void){}
-};
-}  // namespace xgboost
-#endif  // XGBOOST_DATA_H_
diff --git a/src/data/data.cc b/src/data/data.cc
new file mode 100644
index 000000000..9c63f8aa2
--- /dev/null
+++ b/src/data/data.cc
@@ -0,0 +1,278 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file data.cc
+ */
+#include <xgboost/data.h>
+#include <xgboost/logging.h>
+#include <dmlc/registry.h>
+#include <cstring>
+#include "./sparse_batch_page.h"
+#include "./simple_dmatrix.h"
+#include "./simple_csr_source.h"
+#include "../common/io.h"
+
+#if DMLC_ENABLE_STD_THREAD
+#include "./sparse_page_source.h"
+#include "./sparse_page_dmatrix.h"
+#endif
+
+namespace dmlc {
+DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg);
+}  // namespace dmlc
+
+namespace xgboost {
+// Reset MetaInfo to its empty state: zero shape counters and no side information.
+void MetaInfo::Clear() {
+  base_margin.clear();
+  weights.clear();
+  group_ptr.clear();
+  root_index.clear();
+  labels.clear();
+  num_nonzero = num_col = num_row = 0;
+}
+
+void MetaInfo::SaveBinary(dmlc::Stream *fo) const {
+  const int format_tag = kVersion;  // local copy so the tag has an address
+  fo->Write(&format_tag, sizeof(format_tag));
+  fo->Write(&num_row, sizeof(num_row));  // fixed-size counters come first ...
+  fo->Write(&num_col, sizeof(num_col));
+  fo->Write(&num_nonzero, sizeof(num_nonzero));
+  fo->Write(labels);  // ... then the vector fields, in the order LoadBinary reads them
+  fo->Write(group_ptr);
+  fo->Write(weights);
+  fo->Write(root_index);
+  fo->Write(base_margin);
+}
+
+void MetaInfo::LoadBinary(dmlc::Stream *fi) {
+  int ver;  // format tag stored at the head of the stream
+  CHECK(fi->Read(&ver, sizeof(ver)) == sizeof(ver)) << "MetaInfo: invalid version";
+  CHECK_EQ(ver, kVersion) << "MetaInfo: invalid format";
+  CHECK(fi->Read(&num_row, sizeof(num_row)) == sizeof(num_row)) << "MetaInfo: invalid format";
+  CHECK(fi->Read(&num_col, sizeof(num_col)) == sizeof(num_col)) << "MetaInfo: invalid format";
+  CHECK(fi->Read(&num_nonzero, sizeof(num_nonzero)) == sizeof(num_nonzero))
+      << "MetaInfo: invalid format";
+  CHECK(fi->Read(&labels)) << "MetaInfo: invalid format";  // vectors, in SaveBinary order
+  CHECK(fi->Read(&group_ptr)) << "MetaInfo: invalid format";
+  CHECK(fi->Read(&weights)) << "MetaInfo: invalid format";
+  CHECK(fi->Read(&root_index)) << "MetaInfo: invalid format";
+  CHECK(fi->Read(&base_margin)) << "MetaInfo: invalid format";
+}
+
+// Try to read whitespace-separated group sizes from `fname` into *group as
+// cumulative offsets starting at 0. Returns false if the file cannot be opened.
+inline bool MetaTryLoadGroup(const std::string& fname, std::vector<unsigned>* group) {
+  std::unique_ptr<dmlc::Stream> stream(dmlc::Stream::Create(fname.c_str(), "r", true));
+  if (!stream) return false;
+  dmlc::istream reader(stream.get());
+  group->assign(1, 0U);  // offsets always begin at zero
+  unsigned count;
+  while (reader >> count) {
+    const unsigned next_offset = group->back() + count;
+    group->push_back(next_offset);
+  }
+  return true;
+}
+
+// Try to read whitespace-separated float values from `fname` into *data,
+// replacing its contents. Returns false if the file cannot be opened.
+inline bool MetaTryLoadFloatInfo(const std::string& fname, std::vector<float>* data) {
+  std::unique_ptr<dmlc::Stream> stream(dmlc::Stream::Create(fname.c_str(), "r", true));
+  if (!stream) return false;
+  dmlc::istream reader(stream.get());
+  data->clear();
+  float parsed;
+  while (reader >> parsed) {
+    data->push_back(parsed);
+  }
+  return true;
+}
+
+// Run `proc` with `cast_ptr` bound to `old_ptr` viewed as the C type selected by `dtype`.
+#define DISPATCH_CONST_PTR(dtype, old_ptr, cast_ptr, proc)              \
+  switch (dtype) {                                                      \
+    case kFloat32: {                                                    \
+      const float* cast_ptr = reinterpret_cast<const float*>(old_ptr); proc; break; \
+    }                                                                   \
+    case kDouble: {                                                     \
+      const double* cast_ptr = reinterpret_cast<const double*>(old_ptr); proc; break; \
+    }                                                                   \
+    case kUInt32: {                                                     \
+      const uint32_t* cast_ptr = reinterpret_cast<const uint32_t*>(old_ptr); proc; break; \
+    }                                                                   \
+    case kUInt64: {                                                     \
+      const uint64_t* cast_ptr = reinterpret_cast<const uint64_t*>(old_ptr); proc; break; \
+    }                                                                   \
+    default: LOG(FATAL) << "Unknown data type " << dtype;               \
+  }
+
+
+// Overwrite the meta field selected by `key` with `num` values read from `dptr`,
+// whose element type is described by `dtype` (values are converted on copy).
+// Recognised keys: "root_index", "label", "weight", "base_margin"; any other key
+// leaves the object unchanged.
+void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t num) {
+  if (std::strcmp(key, "root_index") == 0) {
+    root_index.resize(num);
+    DISPATCH_CONST_PTR(dtype, dptr, src, std::copy(src, src + num, root_index.begin()));
+  } else if (std::strcmp(key, "label") == 0) {
+    labels.resize(num);
+    DISPATCH_CONST_PTR(dtype, dptr, src, std::copy(src, src + num, labels.begin()));
+  } else if (std::strcmp(key, "weight") == 0) {
+    weights.resize(num);
+    DISPATCH_CONST_PTR(dtype, dptr, src, std::copy(src, src + num, weights.begin()));
+  } else if (std::strcmp(key, "base_margin") == 0) {
+    base_margin.resize(num);
+    DISPATCH_CONST_PTR(dtype, dptr, src, std::copy(src, src + num, base_margin.begin()));
+  }
+}
+
+
+// Load a DMatrix from `uri` ("path" or "path#cache_prefix"): a binary SimpleCSRSource dump is
+// recognised by its magic number (format "auto", no row split); else the text parser is used.
+DMatrix* DMatrix::Load(const std::string& uri, bool silent, bool load_row_split,
+                       const std::string& file_format) {
+  std::string fname = uri, cache_file;
+  size_t dlm_pos = uri.find('#');
+  if (dlm_pos != std::string::npos) {
+    cache_file = uri.substr(dlm_pos + 1, uri.length());
+    fname = uri.substr(0, dlm_pos);
+    CHECK_EQ(cache_file.find('#'), std::string::npos)
+        << "Only one `#` is allowed in file path for cache file specification.";
+    if (load_row_split) {
+      std::ostringstream os;  // every worker gets its own cache file name
+      os << cache_file << ".r" << rabit::GetRank();
+      cache_file = os.str();
+    }
+  }
+  int partid = 0, npart = 1;
+  if (load_row_split) {
+    partid = rabit::GetRank();
+    npart = rabit::GetWorldSize();
+  } else {
+    // test option to load in part
+    npart = dmlc::GetEnv("XGBOOST_TEST_NPART", 1);
+    if (npart != 1) {
+      LOG(CONSOLE) << "Partial load option on npart=" << npart;
+    }
+  }
+  // legacy handling of binary data loading
+  if (file_format == "auto" && !load_row_split) {
+    int magic;
+    std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
+    if (fi.get() != nullptr) {
+      common::PeekableInStream is(fi.get());
+      if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic) &&
+          magic == data::SimpleCSRSource::kMagic) {
+        std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
+        source->LoadBinary(&is);
+        DMatrix* dmat = DMatrix::Create(std::move(source), cache_file);
+        if (!silent) {
+          LOG(CONSOLE) << dmat->info().num_row << 'x' << dmat->info().num_col << " matrix with "
+                       << dmat->info().num_nonzero << " entries loaded from " << uri;
+        }
+        return dmat;
+      }
+    }
+  }
+  const std::string ftype = (file_format == "auto") ? "libsvm" : file_format;  // text parser
+  std::unique_ptr<dmlc::Parser<uint32_t> > parser(
+      dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, ftype.c_str()));
+  DMatrix* dmat = DMatrix::Create(parser.get(), cache_file);
+  if (!silent) {
+    LOG(CONSOLE) << dmat->info().num_row << 'x' << dmat->info().num_col << " matrix with "
+                 << dmat->info().num_nonzero << " entries loaded from " << uri;
+  }
+  // backward compatibility: pick up optional side files stored next to the data file
+  if (!load_row_split) {
+    MetaInfo& info = dmat->info();
+    if (MetaTryLoadGroup(fname + ".group", &info.group_ptr) && !silent) {
+      LOG(CONSOLE) << info.group_ptr.size() - 1
+                   << " groups are loaded from " << fname << ".group";
+    }
+    if (MetaTryLoadFloatInfo(fname + ".weight", &info.weights) && !silent) {
+      LOG(CONSOLE) << info.weights.size()
+                   << " weights are loaded from " << fname << ".weight";
+    }
+    if (MetaTryLoadFloatInfo(fname + ".base_margin", &info.base_margin) && !silent) {
+      LOG(CONSOLE) << info.base_margin.size()
+                   << " base_margin are loaded from " << fname << ".base_margin";
+    }
+  }
+  return dmat;
+}
+
+DMatrix* DMatrix::Create(dmlc::Parser<uint32_t>* parser, const std::string& cache_prefix) {
+  // no cache prefix: copy everything the parser yields into an in-memory source
+  if (cache_prefix.empty()) {
+    std::unique_ptr<data::SimpleCSRSource> mem_source(new data::SimpleCSRSource());
+    mem_source->CopyFrom(parser);
+    return DMatrix::Create(std::move(mem_source), cache_prefix);
+  }
+#if DMLC_ENABLE_STD_THREAD
+  // external memory: create the cache only when it does not exist yet, then open it
+  if (!data::SparsePageSource::CacheExist(cache_prefix)) {
+    data::SparsePageSource::Create(parser, cache_prefix);
+  }
+  std::unique_ptr<data::SparsePageSource> disk_source(new data::SparsePageSource(cache_prefix));
+  return DMatrix::Create(std::move(disk_source), cache_prefix);
+#else
+  LOG(FATAL) << "External memory is not enabled in mingw";
+  return nullptr;
+#endif
+}
+
+void DMatrix::SaveToLocalFile(const std::string& fname) {
+  data::SimpleCSRSource snapshot;  // row-oriented in-memory copy of this matrix
+  snapshot.CopyFrom(this);
+  std::unique_ptr<dmlc::Stream> out(dmlc::Stream::Create(fname.c_str(), "w"));
+  snapshot.SaveBinary(out.get());  // SimpleCSRSource binary format (magic number first)
+}
+
+DMatrix* DMatrix::Create(std::unique_ptr<DataSource>&& source,
+                         const std::string& cache_prefix) {
+  // an empty prefix selects the in-memory matrix; otherwise the paged one is used
+  if (cache_prefix.empty()) {
+    return new data::SimpleDMatrix(std::move(source));
+  }
+#if DMLC_ENABLE_STD_THREAD
+  return new data::SparsePageDMatrix(std::move(source), cache_prefix);
+#else
+  LOG(FATAL) << "External memory is not enabled in mingw";
+  return nullptr;
+#endif
+}
+}  // namespace xgboost
+
+namespace xgboost {
+namespace data {
+SparsePage::Format* SparsePage::Format::Create(const std::string& name) {
+  auto *entry = ::dmlc::Registry< ::xgboost::data::SparsePageFormatReg>::Get()->Find(name);
+  if (!entry) {  // the registry lookup found nothing under this name
+    LOG(FATAL) << "Unknown format type " << name;
+  }
+  return (entry->body)();
+}
+
+std::pair<std::string, std::string>
+SparsePage::Format::DecideFormat(const std::string& cache_prefix) {
+  // The format spec is whatever follows the last ".fmt-" in the prefix; it is either
+  // "<name>" (used for both elements) or "<first>-<second>" split at the last '-'.
+  const std::string kTag = ".fmt-";
+  const size_t tag_pos = cache_prefix.rfind(kTag);
+  if (tag_pos == std::string::npos) {
+    // no tag present: both elements default to "raw"
+    return std::make_pair(std::string("raw"), std::string("raw"));
+  }
+  const std::string spec = cache_prefix.substr(tag_pos + kTag.length());
+  const size_t dash = spec.rfind('-');
+  if (dash == std::string::npos) {
+    return std::make_pair(spec, spec);
+  }
+  return std::make_pair(spec.substr(0, dash), spec.substr(dash + 1));
+}
+
+// List of files that will be force linked in static links.
+DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format);
+}  // namespace data
+}  // namespace xgboost
diff --git a/src/data/simple_csr_source.cc b/src/data/simple_csr_source.cc
new file mode 100644
index 000000000..db23bb9d7
--- /dev/null
+++ b/src/data/simple_csr_source.cc
@@ -0,0 +1,101 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file simple_csr_source.cc
+ */
+#include <dmlc/base.h>
+#include <xgboost/logging.h>
+#include "./simple_csr_source.h"
+
+namespace xgboost {
+namespace data {
+
+void SimpleCSRSource::Clear() {
+  // back to an empty matrix: no meta info, no entries, row pointer holding only offset 0
+  info.Clear();
+  row_data_.clear();
+  row_ptr_.assign(1, static_cast<size_t>(0));
+}
+
+void SimpleCSRSource::CopyFrom(DMatrix* src) {
+  // start from an empty source, take over the meta info, then append row by row
+  this->Clear();
+  this->info = src->info();
+  dmlc::DataIter<RowBatch>* rows = src->RowIterator();
+  for (rows->BeforeFirst(); rows->Next();) {
+    const RowBatch &page = rows->Value();
+    for (size_t r = 0; r < page.size; ++r) {
+      const RowBatch::Inst row = page[r];
+      row_ptr_.push_back(row_ptr_.back() + row.length);
+      row_data_.insert(row_data_.end(), row.data, row.data + row.length);
+    }
+  }
+}
+
+void SimpleCSRSource::CopyFrom(dmlc::Parser<uint32_t>* parser) {
+  this->Clear();  // existing content is replaced, not appended to
+  while (parser->Next()) {
+    const dmlc::RowBlock<uint32_t>& batch = parser->Value();
+    if (batch.label != nullptr) {
+      info.labels.insert(info.labels.end(), batch.label, batch.label + batch.size);
+    }
+    if (batch.weight != nullptr) {
+      info.weights.insert(info.weights.end(), batch.weight, batch.weight + batch.size);
+    }
+    row_data_.reserve(row_data_.size() + batch.offset[batch.size] - batch.offset[0]);
+    CHECK(batch.index != nullptr);
+    this->info.num_row += batch.size;
+    // copy the entries; a block without a value array means every entry has value 1
+    for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
+      uint32_t index = batch.index[i];
+      bst_float fvalue = batch.value == nullptr ? 1.0f : batch.value[i];
+      row_data_.push_back(SparseBatch::Entry(index, fvalue));
+      // widen before adding 1 so that index == UINT32_MAX cannot wrap around to 0
+      info.num_col = std::max(info.num_col, static_cast<uint64_t>(index) + 1);
+    }
+    // rebase the block's row offsets onto the end of the existing row pointer
+    size_t top = row_ptr_.size();
+    row_ptr_.resize(top + batch.size);
+    for (size_t i = 0; i < batch.size; ++i) {
+      row_ptr_[top + i] = row_ptr_[top - 1] + batch.offset[i + 1] - batch.offset[0];
+    }
+  }
+  this->info.num_nonzero = static_cast<uint64_t>(row_data_.size());
+}
+
+void SimpleCSRSource::LoadBinary(dmlc::Stream* fi) {
+  int tmagic;
+  CHECK(fi->Read(&tmagic, sizeof(tmagic)) == sizeof(tmagic)) << "invalid input file format";
+  CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch";
+  info.LoadBinary(fi);
+  // a truncated file must fail loudly instead of yielding a partially filled matrix
+  CHECK(fi->Read(&row_ptr_) && fi->Read(&row_data_)) << "invalid input file format";
+}
+
+void SimpleCSRSource::SaveBinary(dmlc::Stream* fo) const {
+  const int magic_tag = kMagic;  // written first; LoadBinary checks it before anything else
+  fo->Write(&magic_tag, sizeof(magic_tag));
+  info.SaveBinary(fo);
+  fo->Write(row_ptr_);   // row offsets ...
+  fo->Write(row_data_);  // ... followed by the entries
+}
+
+void SimpleCSRSource::BeforeFirst() {
+  at_first_ = true;  // re-arm the iterator: the next call to Next() succeeds again
+}
+
+bool SimpleCSRSource::Next() {
+  if (!at_first_) return false;  // the single batch was already handed out
+  batch_.base_rowid = 0;
+  batch_.size = row_ptr_.size() - 1;  // one row per pair of consecutive offsets
+  batch_.data_ptr = dmlc::BeginPtr(row_data_);
+  batch_.ind_ptr = dmlc::BeginPtr(row_ptr_);
+  at_first_ = false;
+  return true;
+}
+
+const RowBatch& SimpleCSRSource::Value() const {
+  return batch_;  // the batch filled in by the last successful Next()
+}
+
+}  // namespace data
+}  // namespace xgboost
diff --git a/src/data/simple_csr_source.h b/src/data/simple_csr_source.h
new file mode 100644
index 000000000..1e7adb0b2
--- /dev/null
+++ b/src/data/simple_csr_source.h
@@ -0,0 +1,81 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file simple_csr_source.h
+ * \brief The simplest form of data source, can be used to create DMatrix.
+ *  This is an in-memory data structure that holds the data in row oriented format.
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_DATA_SIMPLE_CSR_SOURCE_H_
+#define XGBOOST_DATA_SIMPLE_CSR_SOURCE_H_
+
+#include <xgboost/base.h>
+#include <xgboost/data.h>
+#include <vector>
+#include <algorithm>
+
+
+namespace xgboost {
+namespace data {
+/*!
+ * \brief The simplest form of data holder, can be used to create DMatrix.
+ *  This is an in-memory data structure that holds the data in row oriented format.
+ * \code
+ * std::unique_ptr<DataSource> source(new SimpleCSRSource());
+ * // add data to source
+ * DMatrix* dmat = DMatrix::Create(std::move(source));
+ * \endcode
+ */
+class SimpleCSRSource : public DataSource {
+ public:
+  // public data members
+  // MetaInfo info;  // inherited from DataSource
+  /*! \brief row pointer of CSR sparse storage */
+  std::vector<size_t> row_ptr_;
+  /*! \brief data in the CSR sparse storage */
+  std::vector<RowBatch::Entry> row_data_;
+  // functions
+  /*! \brief default constructor */
+  SimpleCSRSource() : row_ptr_(1, 0), at_first_(true) {}
+  /*! \brief destructor */
+  virtual ~SimpleCSRSource() {}
+  /*! \brief clear the data structure */
+  void Clear();
+  /*!
+   * \brief copy content of data from src
+   * \param src source DMatrix; its meta info and all of its rows are copied.
+   */
+  void CopyFrom(DMatrix* src);
+  /*!
+   * \brief copy content of data from parser, also set the additional information
+   *  (labels, weights and the shape counters stored in info).
+   * \param src source parser.
+   */
+  void CopyFrom(dmlc::Parser<uint32_t>* src);
+  /*!
+   * \brief Load data from binary stream.
+   * \param fi the pointer to load data from.
+   */
+  void LoadBinary(dmlc::Stream* fi);
+  /*!
+   * \brief Save data into binary stream
+   * \param fo The output stream.
+   */
+  void SaveBinary(dmlc::Stream* fo) const;
+  // implement Next
+  bool Next() override;
+  // implement BeforeFirst
+  void BeforeFirst() override;
+  // implement Value
+  const RowBatch &Value() const override;
+  /*! \brief magic number used to identify SimpleCSRSource */
+  static const int kMagic = 0xffffab01;
+
+ private:
+  /*! \brief internal variable, used to support iterator interface */
+  bool at_first_;
+  /*! \brief batch returned by Value(); filled in by Next() */
+  RowBatch batch_;
+};
+}  // namespace data
+}  // namespace xgboost
+#endif  // XGBOOST_DATA_SIMPLE_CSR_SOURCE_H_
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
new file mode 100644
index 000000000..69700f45b
--- /dev/null
+++ b/src/data/simple_dmatrix.cc
@@ -0,0 +1,265 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file simple_dmatrix.cc
+ * \brief the input data structure for gradient boosting
+ * \author Tianqi Chen
+ */
+#include <xgboost/data.h>
+#include <limits>
+#include <algorithm>
+#include <vector>
+#include "./simple_dmatrix.h"
+#include "../common/random.h"
+#include "../common/group_data.h"
+
+namespace xgboost {
+namespace data {
+
+bool SimpleDMatrix::ColBatchIter::Next() {
+  if (data_ptr_ >= cpages_.size()) return false;  // all column pages consumed
+  SparsePage* page = cpages_[data_ptr_++].get();
+  const size_t nselected = col_index_.size();
+  col_data_.resize(nselected, SparseBatch::Inst(NULL, 0));
+  for (size_t k = 0; k < nselected; ++k) {
+    const bst_uint cidx = col_index_[k];  // selected column = segment cidx of the page
+    const size_t begin = page->offset[cidx], end = page->offset[cidx + 1];
+    col_data_[k] = SparseBatch::Inst(dmlc::BeginPtr(page->data) + begin,
+                                     static_cast<bst_uint>(end - begin));
+  }
+  batch_.size = nselected;
+  batch_.col_index = dmlc::BeginPtr(col_index_);
+  batch_.col_data = dmlc::BeginPtr(col_data_);
+  return true;
+}
+
+dmlc::DataIter<ColBatch>* SimpleDMatrix::ColIterator() {
+  std::vector<bst_uint>& selected = col_iter_.col_index_;
+  selected.resize(this->info().num_col);  // select every column, in index order
+  for (size_t c = 0; c < selected.size(); ++c) {
+    selected[c] = static_cast<bst_uint>(c);
+  }
+  col_iter_.BeforeFirst();
+  return &col_iter_;
+}
+
+dmlc::DataIter<ColBatch>* SimpleDMatrix::ColIterator(const std::vector<bst_uint>&fset) {
+  const size_t ncol = this->info().num_col;
+  col_iter_.col_index_.clear();
+  for (bst_uint fid : fset) {  // ids that do not name an existing column are dropped
+    if (fid < ncol) col_iter_.col_index_.push_back(fid);
+  }
+  col_iter_.BeforeFirst();
+  return &col_iter_;
+}
+
+void SimpleDMatrix::InitColAccess(const std::vector<bool> &enabled,
+                                  float pkeep,
+                                  size_t max_row_perbatch) {
+  // nothing to do when column access was already initialised
+  if (this->HaveColAccess()) return;
+  std::vector<std::unique_ptr<SparsePage> >& pages = col_iter_.cpages_;
+  pages.clear();
+  if (info().num_row >= max_row_perbatch) {
+    this->MakeManyBatch(enabled, pkeep, max_row_perbatch);
+  } else {
+    std::unique_ptr<SparsePage> single(new SparsePage());
+    this->MakeOneBatch(enabled, pkeep, single.get());
+    pages.push_back(std::move(single));
+  }
+  // per-column entry counts, accumulated over all column pages
+  col_size_.assign(info().num_col, 0);
+  for (size_t p = 0; p < pages.size(); ++p) {
+    const std::vector<size_t>& off = pages[p]->offset;
+    for (size_t c = 0; c + 1 < off.size(); ++c) {
+      col_size_[c] += off[c + 1] - off[c];
+    }
+  }
+}
+
+// Build one column page from all rows; each row is kept with probability pkeep.
+void SimpleDMatrix::MakeOneBatch(const std::vector<bool>& enabled,
+                                 float pkeep,
+                                 SparsePage *pcol) {
+  // ids of the kept rows; only features flagged in `enabled` end up in pcol
+  buffered_rowset_.clear();
+  // bmap[r] is true iff global row r was kept by the sampling
+  int nthread;
+  std::vector<bool> bmap;
+  #pragma omp parallel
+  {
+    nthread = omp_get_num_threads();
+  }
+
+  pcol->Clear();
+  common::ParallelGroupBuilder<SparseBatch::Entry>
+      builder(&pcol->offset, &pcol->data);
+  builder.InitBudget(info().num_col, nthread);
+  // pass 1: sample the rows and count the entries every column will receive
+  dmlc::DataIter<RowBatch>* iter = this->RowIterator();
+  iter->BeforeFirst();
+  while (iter->Next()) {
+    const RowBatch& batch = iter->Value();
+    bmap.resize(bmap.size() + batch.size, true);
+    std::bernoulli_distribution coin_flip(pkeep);
+    auto& rnd = common::GlobalRandom();
+    // bmap is indexed by global row id (as it is read below), not by position in the batch
+    long batch_size = static_cast<long>(batch.size); // NOLINT(*)
+    for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
+      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+      if (pkeep == 1.0f || coin_flip(rnd)) {
+        buffered_rowset_.push_back(ridx);
+      } else {
+        bmap[ridx] = false;
+      }
+    }
+    #pragma omp parallel for schedule(static)
+    for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
+      int tid = omp_get_thread_num();
+      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+      if (bmap[ridx]) {
+        RowBatch::Inst inst = batch[i];
+        for (bst_uint j = 0; j < inst.length; ++j) {
+          if (enabled[inst[j].index]) {
+            builder.AddBudget(inst[j].index, tid);
+          }
+        }
+      }
+    }
+  }
+  builder.InitStorage();
+  // pass 2: scatter the entries of the kept rows into their columns
+  iter->BeforeFirst();
+  while (iter->Next()) {
+    const RowBatch& batch = iter->Value();
+    #pragma omp parallel for schedule(static)
+    for (long i = 0; i < static_cast<long>(batch.size); ++i) { // NOLINT(*)
+      int tid = omp_get_thread_num();
+      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+      if (bmap[ridx]) {
+        RowBatch::Inst inst = batch[i];
+        for (bst_uint j = 0; j < inst.length; ++j) {
+          if (enabled[inst[j].index]) {
+            builder.Push(inst[j].index,
+                         SparseBatch::Entry((bst_uint)(batch.base_rowid+i),
+                                            inst[j].fvalue), tid);
+          }
+        }
+      }
+    }
+  }
+
+  CHECK_EQ(pcol->Size(), info().num_col);
+  // sort the entries of every column with Entry::CmpValue
+  bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
+  #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
+  for (bst_omp_uint i = 0; i < ncol; ++i) {
+    if (pcol->offset[i] < pcol->offset[i + 1]) {
+      std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i],
+                dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1],
+                SparseBatch::Entry::CmpValue);
+    }
+  }
+}
+
+void SimpleDMatrix::MakeManyBatch(const std::vector<bool>& enabled,
+                                  float pkeep,
+                                  size_t max_row_perbatch) {
+  // Sampled rows are collected into chunks of max_row_perbatch rows; every chunk
+  // becomes its own column page. The row ids of the current chunk are the tail of
+  // buffered_rowset_ that starts at chunk_begin.
+  buffered_rowset_.clear();
+  size_t chunk_begin = 0;
+  SparsePage chunk;
+  // convert the collected rows into a column page and append it to the iterator
+  auto flush_chunk = [&]() {
+    std::unique_ptr<SparsePage> page(new SparsePage());
+    this->MakeColPage(chunk.GetRowBatch(0),
+                      dmlc::BeginPtr(buffered_rowset_) + chunk_begin,
+                      enabled, page.get());
+    col_iter_.cpages_.push_back(std::move(page));
+  };
+
+  std::bernoulli_distribution keep_row(pkeep);
+  auto& rng = common::GlobalRandom();
+  dmlc::DataIter<RowBatch>* rows = this->RowIterator();
+  rows->BeforeFirst();
+  while (rows->Next()) {
+    const RowBatch &batch = rows->Value();
+    for (size_t i = 0; i < batch.size; ++i) {
+      if (pkeep == 1.0f || keep_row(rng)) {
+        buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid + i));
+        chunk.Push(batch[i]);
+      }
+      if (chunk.Size() >= max_row_perbatch) {
+        flush_chunk();
+        chunk_begin = buffered_rowset_.size();
+        chunk.Clear();
+      }
+    }
+  }
+  // rows left over after the last full chunk form the final page
+  if (chunk.Size() != 0) {
+    flush_chunk();
+  }
+}
+
+// Build column page `pcol` from `batch`; entries of batch[i] are stored with row id ridx[i].
+void SimpleDMatrix::MakeColPage(const RowBatch& batch,
+                                const bst_uint* ridx,
+                                const std::vector<bool>& enabled,
+                                SparsePage* pcol) {
+  // thread count capped at max(nproc / 2 - 2, 1); written by the master thread only
+  int nthread;
+  #pragma omp parallel
+  {
+    #pragma omp master
+    nthread = std::min(omp_get_num_threads(), std::max(omp_get_num_procs() / 2 - 2, 1));
+  }
+  pcol->Clear();
+  common::ParallelGroupBuilder<SparseBatch::Entry>
+      builder(&pcol->offset, &pcol->data);
+  builder.InitBudget(info().num_col, nthread);
+  bst_omp_uint ndata = static_cast<bst_omp_uint>(batch.size);
+  // pass 1: count, per thread, how many entries each enabled column receives
+  #pragma omp parallel for schedule(static) num_threads(nthread)
+  for (bst_omp_uint i = 0; i < ndata; ++i) {
+    int tid = omp_get_thread_num();
+    RowBatch::Inst inst = batch[i];
+    for (bst_uint j = 0; j < inst.length; ++j) {
+      const SparseBatch::Entry &e = inst[j];
+      if (enabled[e.index]) {
+        builder.AddBudget(e.index, tid);
+      }
+    }
+  }
+  builder.InitStorage();
+  // pass 2: write the entries, applying the same `enabled` filter that sized the storage
+  #pragma omp parallel for schedule(static) num_threads(nthread)
+  for (bst_omp_uint i = 0; i < ndata; ++i) {
+    int tid = omp_get_thread_num();
+    RowBatch::Inst inst = batch[i];
+    for (bst_uint j = 0; j < inst.length; ++j) {
+      const SparseBatch::Entry &e = inst[j];
+      if (enabled[e.index]) {
+        builder.Push(e.index, SparseBatch::Entry(ridx[i], e.fvalue), tid);
+      }
+    }
+  }
+  CHECK_EQ(pcol->Size(), info().num_col);
+  // sort the entries of every column with Entry::CmpValue
+  bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
+  #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
+  for (bst_omp_uint i = 0; i < ncol; ++i) {
+    if (pcol->offset[i] < pcol->offset[i + 1]) {
+      std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i],
+                dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1],
+                SparseBatch::Entry::CmpValue);
+    }
+  }
+}
+
+bool SimpleDMatrix::SingleColBlock() const {
+  return col_iter_.cpages_.size() <= 1;  // true when at most one column page exists
+}
+}  // namespace data
+}  // namespace xgboost
diff --git a/src/data/simple_dmatrix.h b/src/data/simple_dmatrix.h
new file mode 100644
index 000000000..3b63e1e97
--- /dev/null
+++ b/src/data/simple_dmatrix.h
@@ -0,0 +1,119 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file simple_dmatrix.h
+ * \brief In-memory version of DMatrix.
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_DATA_SIMPLE_DMATRIX_H_
+#define XGBOOST_DATA_SIMPLE_DMATRIX_H_
+
+#include <xgboost/base.h>
+#include <xgboost/data.h>
+#include <vector>
+#include <algorithm>
+#include <cstring>
+#include "./sparse_batch_page.h"
+
+namespace xgboost {
+namespace data {
+
+class SimpleDMatrix : public DMatrix {  // in-memory DMatrix backed by a DataSource
+ public:
+  explicit SimpleDMatrix(std::unique_ptr<DataSource>&& source)
+      : source_(std::move(source)) {}
+  // meta information lives in the data source
+  MetaInfo& info() override {
+    return source_->info;
+  }
+
+  const MetaInfo& info() const override {
+    return source_->info;
+  }
+  // row access: the source itself is the iterator; it is rewound before being returned
+  dmlc::DataIter<RowBatch>* RowIterator() override {
+    dmlc::DataIter<RowBatch>* iter = source_.get();
+    iter->BeforeFirst();
+    return iter;
+  }
+  // column access counts as initialised once InitColAccess has filled col_size_
+  bool HaveColAccess() const override {
+    return col_size_.size() != 0;
+  }
+
+  const std::vector<bst_uint>& buffered_rowset() const override {
+    return buffered_rowset_;
+  }
+  // number of entries stored for column cidx (filled in by InitColAccess)
+  size_t GetColSize(size_t cidx) const override {
+    return col_size_[cidx];
+  }
+  // fraction of the buffered rows that have an entry in column cidx
+  float GetColDensity(size_t cidx) const override {
+    size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
+    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
+  }
+
+  dmlc::DataIter<ColBatch>* ColIterator() override;
+
+  dmlc::DataIter<ColBatch>* ColIterator(const std::vector<bst_uint>& fset) override;
+
+  void InitColAccess(const std::vector<bool>& enabled,
+                     float subsample,
+                     size_t max_row_perbatch) override;
+
+  bool SingleColBlock() const override;
+
+ private:
+  // in-memory column batch iterator.
+  struct ColBatchIter: dmlc::DataIter<ColBatch> {
+   public:
+    ColBatchIter() : data_ptr_(0) {}
+    void BeforeFirst() override {
+      data_ptr_ = 0;
+    }
+    const ColBatch &Value() const override {
+      return batch_;
+    }
+    bool Next() override;
+
+   private:
+    // allow SimpleDMatrix to access it.
+    friend class SimpleDMatrix;
+    // indices of the columns selected by the last ColIterator call
+    std::vector<bst_uint> col_index_;
+    // column content
+    std::vector<ColBatch::Inst> col_data_;
+    // column sparse pages
+    std::vector<std::unique_ptr<SparsePage> > cpages_;
+    // index of the next column page to serve
+    size_t data_ptr_;
+    // temporal space for batch
+    ColBatch batch_;
+  };
+
+  // source data pointer.
+  std::unique_ptr<DataSource> source_;
+  // column iterator
+  ColBatchIter col_iter_;
+  // list of row index that are buffered.
+  std::vector<bst_uint> buffered_rowset_;
+  /*! \brief sizeof column data */
+  std::vector<size_t> col_size_;
+
+  // internal function to make one batch from row iter.
+  void MakeOneBatch(const std::vector<bool>& enabled,
+                    float pkeep,
+                    SparsePage *pcol);
+  // internal function to split the rows into several column pages.
+  void MakeManyBatch(const std::vector<bool>& enabled,
+                     float pkeep,
+                     size_t max_row_perbatch);
+  // internal function to build one column page from a batch of rows.
+  void MakeColPage(const RowBatch& batch,
+                   const bst_uint* ridx,
+                   const std::vector<bool>& enabled,
+                   SparsePage* pcol);
+};
+}  // namespace data
+}  // namespace xgboost
+#endif  // XGBOOST_DATA_SIMPLE_DMATRIX_H_
diff --git a/src/data/sparse_batch_page.h b/src/data/sparse_batch_page.h
new file mode 100644
index 000000000..41893e6b5
--- /dev/null
+++ b/src/data/sparse_batch_page.h
@@ -0,0 +1,199 @@
+/*!
+ * Copyright (c) 2014 by Contributors
+ * \file sparse_batch_page.h
+ *   content holder of sparse batch that can be saved to disk
+ *   the representation can be effectively
+ *   use in external memory computation
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_DATA_SPARSE_BATCH_PAGE_H_
+#define XGBOOST_DATA_SPARSE_BATCH_PAGE_H_
+
+#include <xgboost/data.h>
+#include <dmlc/io.h>
+#include <vector>
+#include <algorithm>
+#include <cstring>
+#include <string>
+#include <utility>
+
+namespace xgboost {
+namespace data {
+/*!
+ * \brief in-memory storage unit of sparse batch
+ */
+class SparsePage {
+ public:
+  /*! \brief Format of the sparse page. */
+  class Format;
+  /*! \brief minimum index over all entries, used as hint for compression. */
+  bst_uint min_index;
+  /*! \brief offset of the segments; offset[0] is 0, offset.back() equals data.size() */
+  std::vector<size_t> offset;
+  /*! \brief the data of the segments */
+  std::vector<SparseBatch::Entry> data;
+
+  /*! \brief constructor, creates an empty page */
+  SparsePage() {
+    this->Clear();
+  }
+  /*! \return number of instances (segments) in the page */
+  inline size_t Size() const {
+    return offset.size() - 1;
+  }
+  /*! \return estimation of memory cost of this page, in bytes */
+  inline size_t MemCostBytes(void) const {
+    return offset.size() * sizeof(size_t) + data.size() * sizeof(SparseBatch::Entry);
+  }
+  /*! \brief clear the page, leaving the single leading 0 in offset */
+  inline void Clear(void) {
+    min_index = 0;
+    offset.clear();
+    offset.push_back(0);
+    data.clear();
+  }
+
+  /*!
+   * \brief Push row batch into the page
+   * \param batch the row batch; batch.ind_ptr[0] may be non-zero, only its own entries are copied
+   */
+  inline void Push(const RowBatch &batch) {
+    const size_t nnz = batch.ind_ptr[batch.size] - batch.ind_ptr[0];
+    const size_t top = offset.back();
+    data.resize(top + nnz);
+    std::memcpy(dmlc::BeginPtr(data) + top, batch.data_ptr + batch.ind_ptr[0],
+                sizeof(SparseBatch::Entry) * nnz);
+    const size_t begin = offset.size();
+    offset.resize(begin + batch.size);
+    for (size_t i = 0; i < batch.size; ++i) {
+      offset[i + begin] = top + batch.ind_ptr[i + 1] - batch.ind_ptr[0];
+    }
+  }
+  /*!
+   * \brief Push row block into the page.
+   * \param batch the row block; a null value array means every entry has value 1.0
+   */
+  inline void Push(const dmlc::RowBlock<uint32_t>& batch) {
+    data.reserve(data.size() + batch.offset[batch.size] - batch.offset[0]);
+    offset.reserve(offset.size() + batch.size);
+    CHECK(batch.index != nullptr);
+    for (size_t i = 0; i < batch.size; ++i) {
+      offset.push_back(offset.back() + batch.offset[i + 1] - batch.offset[i]);
+    }
+    for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
+      uint32_t index = batch.index[i];
+      bst_float fvalue = batch.value == nullptr ? 1.0f : batch.value[i];
+      data.push_back(SparseBatch::Entry(index, fvalue));
+    }
+    CHECK_EQ(offset.back(), data.size());
+  }
+  /*!
+   * \brief Push a sparse page
+   * \param batch the page whose segments are appended after the existing ones
+   */
+  inline void Push(const SparsePage &batch) {
+    size_t top = offset.back();
+    data.resize(top + batch.data.size());
+    std::memcpy(dmlc::BeginPtr(data) + top,
+                dmlc::BeginPtr(batch.data),
+                sizeof(SparseBatch::Entry) * batch.data.size());
+    size_t begin = offset.size();
+    offset.resize(begin + batch.Size());
+    for (size_t i = 0; i < batch.Size(); ++i) {
+      offset[i + begin] = top + batch.offset[i + 1];
+    }
+  }
+  /*!
+   * \brief Push one instance into page
+   *  \param inst an instance row
+   */
+  inline void Push(const SparseBatch::Inst &inst) {
+    offset.push_back(offset.back() + inst.length);
+    size_t begin = data.size();
+    data.resize(begin + inst.length);
+    if (inst.length != 0) {
+      std::memcpy(dmlc::BeginPtr(data) + begin, inst.data,
+                  sizeof(SparseBatch::Entry) * inst.length);
+    }
+  }
+  /*!
+   * \param base_rowid base_rowid of the data
+   * \return row batch view of the page; it points into offset and data, no copy is made
+   */
+  inline RowBatch GetRowBatch(size_t base_rowid) const {
+    RowBatch out;
+    out.base_rowid  = base_rowid;
+    out.ind_ptr = dmlc::BeginPtr(offset);
+    out.data_ptr = dmlc::BeginPtr(data);
+    out.size = offset.size() - 1;
+    return out;
+  }
+};
+
+/*!
+ * \brief Format specification of SparsePage.
+ */
+class SparsePage::Format {
+ public:
+  /*! \brief virtual destructor */
+  virtual ~Format() {}
+  /*!
+   * \brief Load all the segments into page, advance fi to end of the block.
+   * \param page The data to read page into.
+   * \param fi the input stream of the file
+   * \return true if the loading was successful, false if end of file was reached
+   */
+  virtual bool Read(SparsePage* page, dmlc::SeekStream* fi) = 0;
+  /*!
+   * \brief read only the segments we are interested in, advance fi to end of the block.
+   * \param page The page to load the data into.
+   * \param fi the input stream of the file
+   * \param sorted_index_set sorted index of segments we are interested in
+   * \return true if the loading was successful, false if end of file was reached
+   */
+  virtual bool Read(SparsePage* page,
+                    dmlc::SeekStream* fi,
+                    const std::vector<bst_uint>& sorted_index_set) = 0;
+  /*!
+   * \brief write the content of page to the output stream fo.
+   * \param fo output stream
+   */
+  virtual void Write(const SparsePage& page, dmlc::Stream* fo) = 0;
+  /*!
+   * \brief Create a sparse page format by name.
+   * \return The created format object.
+   */
+  static Format* Create(const std::string& name);
+  /*!
+   * \brief decide the format from cache prefix.
+   * \return pair of (row format name, column format name) for the cache prefix.
+   */
+  static std::pair<std::string, std::string> DecideFormat(const std::string& cache_prefix);
+};
+
+/*!
+ * \brief Registry entry for sparse page format; its body is a factory returning a new Format.
+ */
+struct SparsePageFormatReg
+    : public dmlc::FunctionRegEntryBase<SparsePageFormatReg,
+                                        std::function<SparsePage::Format* ()> > {
+};
+
+/*!
+ * \brief Macro to register sparse page format.
+ *
+ * \code
+ * // example of registering a sparse page format
+ * XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)
+ * .describe("Raw binary data format.")
+ * .set_body([]() {
+ *     return new RawFormat();
+ *   });
+ * \endcode
+ */
+#define XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(Name)                       \
+  DMLC_REGISTRY_REGISTER(::xgboost::data::SparsePageFormatReg, SparsePageFormat, Name)
+
+}  // namespace data
+}  // namespace xgboost
+#endif  // XGBOOST_DATA_SPARSE_BATCH_PAGE_H_
diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc
new file mode 100644
index 000000000..0cbf27e5d
--- /dev/null
+++ b/src/data/sparse_page_dmatrix.cc
@@ -0,0 +1,283 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file sparse_page_dmatrix.cc
+ * \brief The external memory version of Page Iterator.
+ * \author Tianqi Chen
+ */
+#include <dmlc/base.h>
+#include <dmlc/timer.h>
+#include <xgboost/logging.h>
+#include <memory>
+
+#if DMLC_ENABLE_STD_THREAD
+#include "./sparse_page_dmatrix.h"
+#include "../common/random.h"
+#include "../common/group_data.h"
+
+namespace xgboost {
+namespace data {
+
+SparsePageDMatrix::ColPageIter::ColPageIter(std::unique_ptr<dmlc::SeekStream>&& fi)
+    : fi_(std::move(fi)), page_(nullptr) {
+  load_all_ = false;
+  // the stream starts with the name of the page format, followed by the pages.
+  std::string format;
+  CHECK(fi_->Read(&format)) << "Invalid page format";
+  format_.reset(SparsePage::Format::Create(format));
+  size_t fbegin = fi_->Tell();
+  // first callback loads the next page; second one rewinds to the first page.
+  prefetcher_.Init([this](SparsePage** dptr) {
+      if (*dptr == nullptr) {
+        *dptr = new SparsePage();
+      }
+      if (load_all_) {
+        return format_->Read(*dptr, fi_.get());
+      } else {
+        return format_->Read(*dptr, fi_.get(), index_set_);
+      }
+    }, [this, fbegin] () {
+      fi_->Seek(fbegin);
+      index_set_ = set_index_set_;  // adopt the selection stored by the last Init()
+      load_all_ = set_load_all_;
+    });
+}
+
+SparsePageDMatrix::ColPageIter::~ColPageIter() {
+  delete page_;  // page not yet handed back to the prefetcher; nullptr is a no-op
+}
+
+bool SparsePageDMatrix::ColPageIter::Next() {
+  // hand the previously served page back to the prefetcher for reuse.
+  if (page_ != nullptr) prefetcher_.Recycle(&page_);
+  if (!prefetcher_.Next(&page_)) return false;
+  // one Inst per segment of the page, each pointing into page_->data.
+  const std::vector<size_t>& seg = page_->offset;
+  auto* base = dmlc::BeginPtr(page_->data);
+  const size_t num_seg = seg.size() - 1;
+  col_data_.resize(num_seg, SparseBatch::Inst(nullptr, 0));
+  for (size_t cid = 0; cid < col_data_.size(); ++cid) {
+    const size_t len = seg[cid + 1] - seg[cid];
+    col_data_[cid] = SparseBatch::Inst(base + seg[cid], static_cast<bst_uint>(len));
+  }
+  // publish the batch view over the freshly filled buffers.
+  out_.col_index = dmlc::BeginPtr(index_set_);
+  out_.col_data = dmlc::BeginPtr(col_data_);
+  out_.size = col_data_.size();
+  return true;
+}
+
+void SparsePageDMatrix::ColPageIter::Init(const std::vector<bst_uint>& index_set,
+                                          bool load_all) {
+  set_index_set_ = index_set;
+  set_load_all_ = load_all;
+  std::sort(set_index_set_.begin(), set_index_set_.end());  // readers take a sorted set
+  // restart; the rewind callback installed in the constructor adopts the new selection.
+  this->BeforeFirst();
+}
+
+dmlc::DataIter<ColBatch>* SparsePageDMatrix::ColIterator() {
+  CHECK(col_iter_.get() != nullptr);
+  // select every column id in [0, num_col) and load whole pages.
+  std::vector<bst_uint> all_cols(this->info().num_col);
+  for (size_t cid = 0; cid < all_cols.size(); ++cid) {
+    all_cols[cid] = static_cast<bst_uint>(cid);
+  }
+  col_iter_->Init(all_cols, true);
+  return col_iter_.get();
+}
+
+dmlc::DataIter<ColBatch>* SparsePageDMatrix::
+ColIterator(const std::vector<bst_uint>& fset) {
+  CHECK(col_iter_.get() != nullptr);
+  // keep only the requested ids that name an existing column.
+  const size_t num_col = this->info().num_col;
+  std::vector<bst_uint> selected;
+  for (bst_uint fid : fset) {
+    if (fid >= num_col) continue;
+    selected.push_back(fid);
+  }
+  col_iter_->Init(selected, false);
+  return col_iter_.get();
+}
+
+
+bool SparsePageDMatrix::TryInitColData() {
+  // step 1: buffered row set and column sizes from the meta file, if it exists.
+  {
+    const std::string meta_path = cache_prefix_ + ".col.meta";
+    std::unique_ptr<dmlc::Stream> meta_in(
+        dmlc::Stream::Create(meta_path.c_str(), "r", true));
+    if (!meta_in) return false;
+    CHECK(meta_in->Read(&buffered_rowset_)) << "invalid col.meta file";
+    CHECK(meta_in->Read(&col_size_)) << "invalid col.meta file";
+  }
+  // step 2: open the column pages and hand the stream over to the iterator.
+  {
+    const std::string page_path = cache_prefix_ + ".col.page";
+    std::unique_ptr<dmlc::SeekStream> page_in(
+        dmlc::SeekStream::CreateForRead(page_path.c_str(), true));
+    if (!page_in) return false;
+    col_iter_.reset(new ColPageIter(std::move(page_in)));
+  }
+  return true;
+}
+
+void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
+                                      float pkeep,
+                                      size_t max_row_perbatch) {
+  if (HaveColAccess()) return;
+  if (TryInitColData()) return;
+  // no usable cache on disk: build <cache_prefix>.col.page and .col.meta from the rows.
+  const MetaInfo& info = this->info();
+  if (max_row_perbatch == std::numeric_limits<size_t>::max()) {
+    max_row_perbatch = kMaxRowPerBatch;
+  }
+  buffered_rowset_.clear();
+  col_size_.resize(info.num_col);
+  std::fill(col_size_.begin(), col_size_.end(), 0);
+  // make the sparse page.
+  dmlc::ThreadedIter<SparsePage> cmaker;
+  SparsePage tmp;
+  size_t batch_ptr = 0, batch_top = 0;
+  dmlc::DataIter<RowBatch>* iter = this->RowIterator();
+  std::bernoulli_distribution coin_flip(pkeep);
+
+  auto& rnd = common::GlobalRandom();
+
+  // function to create the page: transposes the rows in prow into one column page.
+  auto make_col_batch = [&] (
+      const SparsePage& prow,
+      const bst_uint* ridx,
+      SparsePage **dptr) {
+    if (*dptr == nullptr) {
+      *dptr = new SparsePage();
+    }
+    SparsePage* pcol = *dptr;
+    pcol->Clear();
+    pcol->min_index = ridx[0];
+    int nthread;
+    #pragma omp parallel
+    {
+      nthread = omp_get_num_threads();
+      nthread = std::max(nthread, std::max(omp_get_num_procs() / 2 - 1, 1));
+    }
+    common::ParallelGroupBuilder<SparseBatch::Entry>
+    builder(&pcol->offset, &pcol->data);
+    builder.InitBudget(info.num_col, nthread);
+    bst_omp_uint ndata = static_cast<bst_uint>(prow.Size());
+    #pragma omp parallel for schedule(static) num_threads(nthread)
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
+      int tid = omp_get_thread_num();
+      for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
+        const SparseBatch::Entry &e = prow.data[j];
+        if (enabled[e.index]) {
+          builder.AddBudget(e.index, tid);
+        }
+      }
+    }
+    builder.InitStorage();
+    #pragma omp parallel for schedule(static) num_threads(nthread)
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
+      int tid = omp_get_thread_num();
+      for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
+        const SparseBatch::Entry &e = prow.data[j];
+        // must mirror the budget pass: no storage was reserved for disabled features.
+        if (!enabled[e.index]) continue;
+        builder.Push(e.index, SparseBatch::Entry(ridx[i], e.fvalue), tid);
+      }
+    }
+    CHECK_EQ(pcol->Size(), info.num_col);
+    // sort columns
+    bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
+    #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
+    for (bst_omp_uint i = 0; i < ncol; ++i) {
+      if (pcol->offset[i] < pcol->offset[i + 1]) {
+        std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i],
+                  dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1],
+                  SparseBatch::Entry::CmpValue);
+      }
+    }
+  };
+
+  auto make_next_col = [&] (SparsePage** dptr) {
+    tmp.Clear();
+    size_t btop = buffered_rowset_.size();
+
+    while (true) {
+      if (batch_ptr != batch_top) {
+        const RowBatch& batch = iter->Value();
+        CHECK_EQ(batch_top, batch.size);
+        for (size_t i = batch_ptr; i < batch_top; ++i) {
+          bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+          if (pkeep == 1.0f || coin_flip(rnd)) {
+            buffered_rowset_.push_back(ridx);
+            tmp.Push(batch[i]);
+          }
+
+          if (tmp.Size() >= max_row_perbatch ||
+              tmp.MemCostBytes() >= kPageSize) {
+            make_col_batch(tmp, dmlc::BeginPtr(buffered_rowset_) + btop, dptr);
+            batch_ptr = i + 1;
+            return true;
+          }
+        }
+        batch_ptr = batch_top;
+      }
+      if (!iter->Next()) break;
+      batch_ptr = 0;
+      batch_top = iter->Value().size;
+    }
+
+    if (tmp.Size() != 0) {
+      make_col_batch(tmp, dmlc::BeginPtr(buffered_rowset_) + btop, dptr);
+      return true;
+    } else {
+      return false;
+    }
+  };
+
+  cmaker.Init(make_next_col, []() {});
+
+  std::string col_data_name = cache_prefix_ + ".col.page";
+  std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(col_data_name.c_str(), "w"));
+  // find format.
+  std::string name_format = SparsePage::Format::DecideFormat(cache_prefix_).second;
+  fo->Write(name_format);
+  std::unique_ptr<SparsePage::Format> format(SparsePage::Format::Create(name_format));
+
+  double tstart = dmlc::GetTime();
+  size_t bytes_write = 0;
+  // print every 4 sec.
+  const double kStep = 4.0;
+  size_t tick_expected = kStep;
+  SparsePage* pcol = nullptr;
+
+  while (cmaker.Next(&pcol)) {
+    for (size_t i = 0; i < pcol->Size(); ++i) {
+      col_size_[i] += pcol->offset[i + 1] - pcol->offset[i];
+    }
+    format->Write(*pcol, fo.get());
+    size_t spage = pcol->MemCostBytes();
+    bytes_write += spage;
+    double tdiff = dmlc::GetTime() - tstart;
+    if (tdiff >= tick_expected) {
+      LOG(CONSOLE) << "Writing to " << col_data_name
+                   << " in " << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
+                   << (bytes_write >> 20UL) << " MB writen";
+      tick_expected += kStep;
+    }
+    cmaker.Recycle(&pcol);
+  }
+  // save meta data
+  std::string col_meta_name = cache_prefix_ + ".col.meta";
+  fo.reset(dmlc::Stream::Create(col_meta_name.c_str(), "w"));
+  fo->Write(buffered_rowset_);
+  fo->Write(col_size_);
+  fo.reset(nullptr);
+  // initialize column data
+  CHECK(TryInitColData());
+}
+
+}  // namespace data
+}  // namespace xgboost
+#endif
diff --git a/src/data/sparse_page_dmatrix.h b/src/data/sparse_page_dmatrix.h
new file mode 100644
index 000000000..e4aebee9c
--- /dev/null
+++ b/src/data/sparse_page_dmatrix.h
@@ -0,0 +1,130 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file sparse_page_dmatrix.h
+ * \brief External-memory version of DMatrix.
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
+#define XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
+
+#include <xgboost/base.h>
+#include <xgboost/data.h>
+#include <dmlc/threadediter.h>
+#include <vector>
+#include <algorithm>
+#include <string>
+#include "./sparse_batch_page.h"
+
+namespace xgboost {
+namespace data {
+
+class SparsePageDMatrix : public DMatrix {
+ public:
+  explicit SparsePageDMatrix(std::unique_ptr<DataSource>&& source,
+                             const std::string& cache_prefix)
+      : source_(std::move(source)),
+        cache_prefix_(cache_prefix) {}
+
+  MetaInfo& info() override {
+    return source_->info;
+  }
+
+  const MetaInfo& info() const override {
+    return source_->info;
+  }
+
+  dmlc::DataIter<RowBatch>* RowIterator() override {
+    dmlc::DataIter<RowBatch>* iter = source_.get();
+    iter->BeforeFirst();  // always hand out the source rewound to its first batch
+    return iter;
+  }
+
+  bool HaveColAccess() const override {
+    return col_iter_.get() != nullptr;  // set once the column cache has been opened
+  }
+
+  const std::vector<bst_uint>& buffered_rowset() const override {
+    return buffered_rowset_;
+  }
+
+  size_t GetColSize(size_t cidx) const {
+    return col_size_[cidx];
+  }
+
+  float GetColDensity(size_t cidx) const override {
+    size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
+    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
+  }
+
+  bool SingleColBlock() const override {
+    return false;
+  }
+
+  dmlc::DataIter<ColBatch>* ColIterator() override;
+
+  dmlc::DataIter<ColBatch>* ColIterator(const std::vector<bst_uint>& fset) override;
+
+  void InitColAccess(const std::vector<bool>& enabled,
+                     float subsample,
+                     size_t max_row_perbatch) override;
+
+  /*! \brief page size 256 MB */
+  static const size_t kPageSize = 256UL << 20UL;
+  /*! \brief Maximum number of rows per batch. */
+  static const size_t kMaxRowPerBatch = 64UL << 10UL;
+
+ private:
+  // declare the column batch iter.
+  class ColPageIter : public dmlc::DataIter<ColBatch> {
+   public:
+    explicit ColPageIter(std::unique_ptr<dmlc::SeekStream>&& fi);
+    virtual ~ColPageIter();
+    void BeforeFirst() override {
+      prefetcher_.BeforeFirst();
+    }
+    const ColBatch &Value() const override {
+      return out_;
+    }
+    bool Next() override;
+    // initialize the column iterator with the specified index set.
+    void Init(const std::vector<bst_uint>& index_set, bool load_all);
+
+   private:
+    // data file pointer.
+    std::unique_ptr<dmlc::SeekStream> fi_;
+    // the page currently served to the consumer, nullptr if none.
+    SparsePage* page_;
+    // page format.
+    std::unique_ptr<SparsePage::Format> format_;
+    // The index set to be loaded.
+    std::vector<bst_uint> index_set_;
+    // The index set requested through Init().
+    std::vector<bst_uint> set_index_set_;
+    // whether to load all columns: value requested through Init(), and value in use.
+    bool set_load_all_, load_all_;
+    // data prefetcher.
+    dmlc::ThreadedIter<SparsePage> prefetcher_;
+    // temporary space for the batch returned by Value()
+    ColBatch out_;
+    // per-column views into the current page.
+    std::vector<SparseBatch::Inst> col_data_;
+  };
+  /*!
+   * \brief Try to initialize column data.
+   * \return true if data already exists, false if they do not.
+   */
+  bool TryInitColData();
+  // source data pointer.
+  std::unique_ptr<DataSource> source_;
+  // the cache prefix
+  std::string cache_prefix_;
+  /*! \brief list of row index that are buffered */
+  std::vector<bst_uint> buffered_rowset_;
+  // count for column data
+  std::vector<size_t> col_size_;
+  // internal column iter.
+  std::unique_ptr<ColPageIter> col_iter_;
+};
+}  // namespace data
+}  // namespace xgboost
+#endif  // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
diff --git a/src/data/sparse_page_raw_format.cc b/src/data/sparse_page_raw_format.cc
new file mode 100644
index 000000000..d0019fde6
--- /dev/null
+++ b/src/data/sparse_page_raw_format.cc
@@ -0,0 +1,99 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file sparse_page_raw_format.cc
+ *  Raw binary format of sparse page.
+ */
+#include <xgboost/data.h>
+#include <dmlc/registry.h>
+#include "./sparse_batch_page.h"
+
+namespace xgboost {
+namespace data {
+
+DMLC_REGISTRY_FILE_TAG(sparse_page_raw_format);
+
+class SparsePageRawFormat : public SparsePage::Format {
+ public:
+  bool Read(SparsePage* page, dmlc::SeekStream* fi) override {
+    if (!fi->Read(&(page->offset))) return false;
+    CHECK_NE(page->offset.size(), 0) << "Invalid SparsePage file";
+    page->data.resize(page->offset.back());
+    if (page->data.size() != 0) {
+      CHECK_EQ(fi->Read(dmlc::BeginPtr(page->data),
+                        (page->data).size() * sizeof(SparseBatch::Entry)),
+               (page->data).size() * sizeof(SparseBatch::Entry))
+          << "Invalid SparsePage file";
+    }
+    return true;
+  }
+
+  bool Read(SparsePage* page,
+            dmlc::SeekStream* fi,
+            const std::vector<bst_uint>& sorted_index_set) override {
+    if (!fi->Read(&disk_offset_)) return false;
+    CHECK_NE(disk_offset_.size(), 0) << "Invalid SparsePage file";
+    // offsets of the selected segments only, packed back to back.
+    page->offset.assign(1, 0);
+    for (size_t i = 0; i < sorted_index_set.size(); ++i) {
+      bst_uint fid = sorted_index_set[i];
+      CHECK_LT(fid + 1, disk_offset_.size());
+      size_t size = disk_offset_[fid + 1] - disk_offset_[fid];
+      page->offset.push_back(page->offset.back() + size);
+    }
+    page->data.resize(page->offset.back());
+    // read in the data; curr_offset is the stream position in entries past begin.
+    size_t begin = fi->Tell();
+    size_t curr_offset = 0;
+    for (size_t i = 0; i < sorted_index_set.size();) {
+      bst_uint fid = sorted_index_set[i];
+      if (disk_offset_[fid] != curr_offset) {
+        CHECK_GT(disk_offset_[fid], curr_offset);
+        fi->Seek(begin + disk_offset_[fid] * sizeof(SparseBatch::Entry));
+        curr_offset = disk_offset_[fid];
+      }
+      size_t j, size_to_read = 0;
+      for (j = i; j < sorted_index_set.size(); ++j) {
+        if (disk_offset_[sorted_index_set[j]] == disk_offset_[fid] + size_to_read) {
+          size_to_read += page->offset[j + 1] - page->offset[j];
+        } else {
+          break;
+        }
+      }
+      // segments [i, j) are adjacent on disk, so a single read fetches all of them.
+      if (size_to_read != 0) {
+        CHECK_EQ(fi->Read(dmlc::BeginPtr(page->data) + page->offset[i],
+                          size_to_read * sizeof(SparseBatch::Entry)),
+                 size_to_read * sizeof(SparseBatch::Entry))
+            << "Invalid SparsePage file";
+        curr_offset += size_to_read;
+      }
+      i = j;
+    }
+    // seek to end of record
+    if (curr_offset != disk_offset_.back()) {
+      fi->Seek(begin + disk_offset_.back() * sizeof(SparseBatch::Entry));
+    }
+    return true;
+  }
+
+  void Write(const SparsePage& page, dmlc::Stream* fo) override {
+    CHECK(page.offset.size() != 0 && page.offset[0] == 0);
+    CHECK_EQ(page.offset.back(), page.data.size());
+    fo->Write(page.offset);
+    if (page.data.size() != 0) {
+      fo->Write(dmlc::BeginPtr(page.data), page.data.size() * sizeof(SparseBatch::Entry));
+    }
+  }
+
+ private:
+  /*! \brief offsets of the page on disk, reloaded by every selective Read */
+  std::vector<size_t> disk_offset_;
+};
+
+XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)  // registry entry for SparsePageRawFormat
+.describe("Raw binary data format.")
+.set_body([]() {
+    return new SparsePageRawFormat();
+  });
+}  // namespace data
+}  // namespace xgboost
diff --git a/src/data/sparse_page_source.cc b/src/data/sparse_page_source.cc
new file mode 100644
index 000000000..5730da9b5
--- /dev/null
+++ b/src/data/sparse_page_source.cc
@@ -0,0 +1,180 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file sparse_page_source.cc
+ */
+#include <dmlc/base.h>
+#include <dmlc/timer.h>
+#include <xgboost/logging.h>
+#include <memory>
+
+#if DMLC_ENABLE_STD_THREAD
+#include "./sparse_page_source.h"
+
+namespace xgboost {
+namespace data {
+
+SparsePageSource::SparsePageSource(const std::string& cache_prefix)
+    : base_rowid_(0), page_(nullptr), cache_prefix_(cache_prefix) {
+  // read in the info file: magic number written by Create(), then the MetaInfo.
+  {
+    std::unique_ptr<dmlc::Stream> finfo(dmlc::Stream::Create(cache_prefix.c_str(), "r"));
+    int tmagic;
+    CHECK_EQ(finfo->Read(&tmagic, sizeof(tmagic)), sizeof(tmagic));
+    CHECK(tmagic == kMagic) << "Invalid SparsePageSource cache file: " << cache_prefix;
+    this->info.LoadBinary(finfo.get());
+  }
+  // open the row pages; the file starts with the name of the page format.
+  std::string name_row = cache_prefix + ".row.page";
+  fi_.reset(dmlc::SeekStream::CreateForRead(name_row.c_str()));
+
+  std::string format;
+  CHECK(fi_->Read(&format)) << "Invalid page format";
+  format_.reset(SparsePage::Format::Create(format));
+  size_t fbegin = fi_->Tell();
+  // first callback loads the next page; second one rewinds to the first page.
+  prefetcher_.Init([this] (SparsePage** dptr) {
+      if (*dptr == nullptr) {
+        *dptr = new SparsePage();
+      }
+      return format_->Read(*dptr, fi_.get());
+    }, [this, fbegin] () { fi_->Seek(fbegin); });
+}
+
+SparsePageSource::~SparsePageSource() {
+  delete page_;  // page not yet handed back to the prefetcher; nullptr is a no-op
+}
+
+bool SparsePageSource::Next() {
+  // return the page served last time so the prefetcher can reuse it.
+  if (page_ != nullptr) {
+    prefetcher_.Recycle(&page_);
+  }
+  // no more pages: leave batch_ and base_rowid_ untouched.
+  if (!prefetcher_.Next(&page_)) return false;
+  // expose the page as a row batch and advance the running row offset.
+  batch_ = page_->GetRowBatch(base_rowid_);
+  base_rowid_ += batch_.size;
+  return true;
+}
+
+void SparsePageSource::BeforeFirst() {
+  base_rowid_ = 0;  // row ids restart from zero
+  prefetcher_.BeforeFirst();  // runs the rewind callback installed in the constructor
+}
+
+const RowBatch& SparsePageSource::Value() const {
+  return batch_;  // view built by the last successful Next()
+}
+
+bool SparsePageSource::CacheExist(const std::string& cache_prefix) {
+  // both the info file and the row page file must be openable for reading.
+  const std::string row_path = cache_prefix + ".row.page";
+  std::unique_ptr<dmlc::Stream> info_in(dmlc::Stream::Create(cache_prefix.c_str(), "r", true));
+  std::unique_ptr<dmlc::Stream> row_in(dmlc::Stream::Create(row_path.c_str(), "r", true));
+  return info_in && row_in;
+}
+
+void SparsePageSource::Create(dmlc::Parser<uint32_t>* src,
+                              const std::string& cache_prefix) {
+  // write the row pages to <cache_prefix>.row.page, then the info to <cache_prefix>.
+  std::string name_info = cache_prefix;
+  std::string name_row = cache_prefix + ".row.page";
+  std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(name_row.c_str(), "w"));
+  std::string name_format = SparsePage::Format::DecideFormat(cache_prefix).first;
+  fo->Write(name_format);
+  std::unique_ptr<SparsePage::Format> format(SparsePage::Format::Create(name_format));
+
+  MetaInfo info;
+  SparsePage page;
+  size_t bytes_write = 0;
+  double tstart = dmlc::GetTime();
+  // print every 4 sec.
+  const double kStep = 4.0;
+  size_t tick_expected = kStep;
+
+  while (src->Next()) {
+    const dmlc::RowBlock<uint32_t>& batch = src->Value();
+    if (batch.label != nullptr) {
+      info.labels.insert(info.labels.end(), batch.label, batch.label + batch.size);
+    }
+    if (batch.weight != nullptr) {
+      info.weights.insert(info.weights.end(), batch.weight, batch.weight + batch.size);
+    }
+    info.num_row += batch.size;
+    info.num_nonzero +=  batch.offset[batch.size] - batch.offset[0];
+    for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
+      uint32_t index = batch.index[i];
+      info.num_col = std::max(info.num_col,
+                              static_cast<uint64_t>(index) + 1);  // widen first: no 32-bit wrap
+    }
+    page.Push(batch);
+    if (page.MemCostBytes() >= kPageSize) {
+      bytes_write += page.MemCostBytes();
+      format->Write(page, fo.get());
+      page.Clear();
+      double tdiff = dmlc::GetTime() - tstart;
+      if (tdiff >= tick_expected) {
+        LOG(CONSOLE) << "Writing to " << name_row << " in "
+                     << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
+                     << (bytes_write >> 20UL) << " MB written";
+        tick_expected += kStep;
+      }
+    }
+  }
+  // flush the last partial page; it may hold rows even when it has no entries.
+  if (page.Size() != 0) {
+    format->Write(page, fo.get());
+  }
+
+  fo.reset(dmlc::Stream::Create(name_info.c_str(), "w"));
+  int tmagic = kMagic;
+  fo->Write(&tmagic, sizeof(tmagic));
+  info.SaveBinary(fo.get());
+
+  LOG(CONSOLE) << "SparsePageSource: Finished writing to " << cache_prefix;
+}
+
+void SparsePageSource::Create(DMatrix* src,
+                              const std::string& cache_prefix) {
+  // write the row pages to <cache_prefix>.row.page, then the info to <cache_prefix>.
+  std::string name_info = cache_prefix;
+  std::string name_row = cache_prefix + ".row.page";
+  std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(name_row.c_str(), "w"));
+  // find format.
+  std::string name_format = SparsePage::Format::DecideFormat(cache_prefix).first;
+  fo->Write(name_format);
+  std::unique_ptr<SparsePage::Format> format(SparsePage::Format::Create(name_format));
+
+  SparsePage page;
+  size_t bytes_write = 0;
+  double tstart = dmlc::GetTime();
+  dmlc::DataIter<RowBatch>* iter = src->RowIterator();
+
+  while (iter->Next()) {
+    page.Push(iter->Value());
+    if (page.MemCostBytes() >= kPageSize) {
+      bytes_write += page.MemCostBytes();
+      format->Write(page, fo.get());
+      page.Clear();
+      double tdiff = dmlc::GetTime() - tstart;
+      LOG(CONSOLE) << "Writing to " << name_row << " in "
+                   << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
+                   << (bytes_write >> 20UL) << " MB written";
+    }
+  }
+  // flush the last partial page; it may hold rows even when it has no entries.
+  if (page.Size() != 0) {
+    format->Write(page, fo.get());
+  }
+
+  fo.reset(dmlc::Stream::Create(name_info.c_str(), "w"));
+  int tmagic = kMagic;
+  fo->Write(&tmagic, sizeof(tmagic));
+  src->info().SaveBinary(fo.get());
+
+  LOG(CONSOLE) << "SparsePageSource: Finished writing to " << cache_prefix;
+}
+
+}  // namespace data
+}  // namespace xgboost
+#endif
diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h
new file mode 100644
index 000000000..79c55b4ba
--- /dev/null
+++ b/src/data/sparse_page_source.h
@@ -0,0 +1,85 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file sparse_page_source.h
+ *  External memory data source, saved with sparse_batch_page binary format.
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
+#define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
+
+#include <xgboost/base.h>
+#include <xgboost/data.h>
+#include <dmlc/threadediter.h>
+#include <vector>
+#include <algorithm>
+#include <string>
+#include "./sparse_batch_page.h"
+
+namespace xgboost {
+namespace data {
+/*!
+ * \brief External memory data source.
+ * \code
+ * std::unique_ptr<DataSource> source(new SparsePageSource(cache_prefix));
+ * // the source reads its data from the cache files
+ * DMatrix* dmat = DMatrix::Create(std::move(source));
+ * \endcode
+ */
+class SparsePageSource : public DataSource {
+ public:
+  /*!
+   * \brief Create source from the cache files under cache_prefix.
+   * \param cache_prefix The prefix of the cache we want to load.
+   */
+  explicit SparsePageSource(const std::string& cache_prefix) noexcept(false);
+  /*! \brief destructor */
+  virtual ~SparsePageSource();
+  // implement Next
+  bool Next() override;
+  // implement BeforeFirst
+  void BeforeFirst() override;
+  // implement Value
+  const RowBatch& Value() const override;
+  /*!
+   * \brief Create source by taking data from parser.
+   * \param src source parser.
+   * \param cache_prefix The cache_prefix of cache file location.
+   */
+  static void Create(dmlc::Parser<uint32_t>* src,
+                     const std::string& cache_prefix);
+  /*!
+   * \brief Create source cache by copying content from DMatrix.
+   * \param cache_prefix The cache_prefix of cache file location.
+   */
+  static void Create(DMatrix* src,
+                     const std::string& cache_prefix);
+  /*!
+   * \brief Check if the cache file already exists.
+   * \param cache_prefix The cache prefix of files.
+   * \return Whether cache file already exists.
+   */
+  static bool CacheExist(const std::string& cache_prefix);
+  /*! \brief page size 32 MB */
+  static const size_t kPageSize = 32UL << 20UL;
+  /*! \brief magic number used to identify Page */
+  static const int kMagic = 0xffffab02;
+
+ private:
+  /*! \brief row id of the first row of the next batch */
+  size_t base_rowid_;
+  /*! \brief temp data. */
+  RowBatch batch_;
+  /*! \brief page currently on hold. */
+  SparsePage *page_;
+  /*! \brief The cache prefix of the dataset. */
+  std::string cache_prefix_;
+  /*! \brief file pointer to the row blob file. */
+  std::unique_ptr<dmlc::SeekStream> fi_;
+  /*! \brief Sparse page format file. */
+  std::unique_ptr<SparsePage::Format> format_;
+  /*! \brief internal prefetcher. */
+  dmlc::ThreadedIter<SparsePage> prefetcher_;
+};
+}  // namespace data
+}  // namespace xgboost
+#endif  // XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear.cc
similarity index 51%
rename from src/gbm/gblinear-inl.hpp
rename to src/gbm/gblinear.cc
index 17d90e556..f4d235e1b 100644
--- a/src/gbm/gblinear-inl.hpp
+++ b/src/gbm/gblinear.cc
@@ -1,53 +1,112 @@
 /*!
- * Copyright by Contributors
- * \file gblinear-inl.hpp
+ * Copyright 2014 by Contributors
+ * \file gblinear.cc
  * \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net
  *        the update rule is parallel coordinate descent (shotgun)
  * \author Tianqi Chen
  */
-#ifndef XGBOOST_GBM_GBLINEAR_INL_HPP_
-#define XGBOOST_GBM_GBLINEAR_INL_HPP_
-
+#include <dmlc/omp.h>
+#include <dmlc/parameter.h>
+#include <xgboost/gbm.h>
+#include <xgboost/logging.h>
 #include <vector>
 #include <string>
 #include <sstream>
+#include <cstring>
 #include <algorithm>
-#include "./gbm.h"
-#include "../tree/updater.h"
 
 namespace xgboost {
 namespace gbm {
+
+DMLC_REGISTRY_FILE_TAG(gblinear);
+
+// Model parameters of the linear booster; (de)serialized as a raw struct by GBLinear::Model.
+struct GBLinearModelParam :public dmlc::Parameter<GBLinearModelParam> {
+  // number of feature dimensions
+  unsigned num_feature;
+  // number of output groups
+  int num_output_group;
+  // reserved field, zeroed by the constructor
+  int reserved[32];
+  // constructor: zero-fills the whole struct; the declared defaults are not applied here
+  GBLinearModelParam() {
+    std::memset(this, 0, sizeof(GBLinearModelParam));
+  }
+  DMLC_DECLARE_PARAMETER(GBLinearModelParam) {
+    DMLC_DECLARE_FIELD(num_feature).set_lower_bound(0)
+        .describe("Number of features used in classification.");
+    DMLC_DECLARE_FIELD(num_output_group).set_lower_bound(1).set_default(1)
+        .describe("Number of output groups in the setting.");
+  }
+};
+
+// Training parameters of the linear booster, plus the per-coordinate update formulas.
+struct GBLinearTrainParam : public dmlc::Parameter<GBLinearTrainParam> {
+  /*! \brief learning rate (alias: eta) */
+  float learning_rate;
+  /*! \brief regularization weight for L2 norm (alias: lambda) */
+  float reg_lambda;
+  /*! \brief regularization weight for L1 norm (alias: alpha) */
+  float reg_alpha;
+  /*! \brief regularization weight for L2 norm in bias (alias: lambda_bias) */
+  float reg_lambda_bias;
+  // declare parameters: each has lower bound 0; learning_rate defaults to 1, the rest to 0
+  DMLC_DECLARE_PARAMETER(GBLinearTrainParam) {
+    DMLC_DECLARE_FIELD(learning_rate).set_lower_bound(0.0f).set_default(1.0f)
+        .describe("Learning rate of each update.");
+    DMLC_DECLARE_FIELD(reg_lambda).set_lower_bound(0.0f).set_default(0.0f)
+        .describe("L2 regularization on weights.");
+    DMLC_DECLARE_FIELD(reg_alpha).set_lower_bound(0.0f).set_default(0.0f)
+        .describe("L1 regularization on weights.");
+    DMLC_DECLARE_FIELD(reg_lambda_bias).set_lower_bound(0.0f).set_default(0.0f)
+        .describe("L2 regularization on bias.");
+    // short aliases accepted for the parameters above
+    DMLC_DECLARE_ALIAS(learning_rate, eta);
+    DMLC_DECLARE_ALIAS(reg_lambda, lambda);
+    DMLC_DECLARE_ALIAS(reg_alpha, alpha);
+    DMLC_DECLARE_ALIAS(reg_lambda_bias, lambda_bias);
+  }
+  // delta to add to weight w given summed gradient/hessian; the step is clamped at -w
+  inline double CalcDelta(double sum_grad, double sum_hess, double w) const {
+    if (sum_hess < 1e-5f) return 0.0f;  // hessian sum too small: no update
+    double tmp = w - (sum_grad + reg_lambda * w) / (sum_hess + reg_lambda);
+    if (tmp >=0) {  // tmp is the weight after a step that ignores reg_alpha
+      return std::max(-(sum_grad + reg_lambda * w + reg_alpha) / (sum_hess + reg_lambda), -w);
+    } else {
+      return std::min(-(sum_grad + reg_lambda * w - reg_alpha) / (sum_hess + reg_lambda), -w);
+    }
+  }
+  // delta for bias w: -(gradient + L2 term) / (hessian + L2 weight); no L1 term or clamping
+  inline double CalcDeltaBias(double sum_grad, double sum_hess, double w) const {
+    return - (sum_grad + reg_lambda_bias * w) / (sum_hess + reg_lambda_bias);
+  }
+};
+
 /*!
  * \brief gradient boosted linear model
- * \tparam FMatrix the data type updater taking
  */
-class GBLinear : public IGradBooster {
+class GBLinear : public GradientBooster {
  public:
-  virtual ~GBLinear(void) {
-  }
-  // set model parameters
-  virtual void SetParam(const char *name, const char *val) {
-    using namespace std;
-    if (!strncmp(name, "bst:", 4)) {
-      param.SetParam(name + 4, val);
-    }
+  void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
     if (model.weight.size() == 0) {
-      model.param.SetParam(name, val);
+      model.param.InitAllowUnknown(cfg);
     }
+    param.InitAllowUnknown(cfg);
   }
-  virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) { // NOLINT(*)
-    model.LoadModel(fi);
+  void Load(dmlc::Stream* fi) override {
+    model.Load(fi);
   }
-  virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const { // NOLINT(*)
-    model.SaveModel(fo);
+  void Save(dmlc::Stream* fo) const override {
+    model.Save(fo);
   }
-  virtual void InitModel(void) {
-    model.InitModel();
-  }
-  virtual void DoBoost(IFMatrix *p_fmat,
+  virtual void DoBoost(DMatrix *p_fmat,
                        int64_t buffer_offset,
-                       const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) {
+    // lazily initialize the model when not ready.
+    if (model.weight.size() == 0) {
+      model.InitModel();
+    }
+
     std::vector<bst_gpair> &gpair = *in_gpair;
     const int ngroup = model.param.num_output_group;
     const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
@@ -75,7 +134,7 @@ class GBLinear : public IGradBooster {
         }
       }
     }
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+    dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
     while (iter->Next()) {
       // number of features
       const ColBatch &batch = iter->Value();
@@ -108,22 +167,23 @@ class GBLinear : public IGradBooster {
     }
   }
 
-  virtual void Predict(IFMatrix *p_fmat,
-                       int64_t buffer_offset,
-                       const BoosterInfo &info,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit = 0) {
-    utils::Check(ntree_limit == 0,
-                 "GBLinear::Predict ntrees is only valid for gbtree predictor");
+  void Predict(DMatrix *p_fmat,
+               int64_t buffer_offset,
+               std::vector<float> *out_preds,
+               unsigned ntree_limit) override {
+    if (model.weight.size() == 0) {
+      model.InitModel();
+    }
+    CHECK_EQ(ntree_limit, 0)
+        << "GBLinear::Predict ntrees is only valid for gbtree predictor";
     std::vector<float> &preds = *out_preds;
     preds.resize(0);
     // start collecting the prediction
-    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+    dmlc::DataIter<RowBatch> *iter = p_fmat->RowIterator();
     const int ngroup = model.param.num_output_group;
     while (iter->Next()) {
       const RowBatch &batch = iter->Value();
-      utils::Assert(batch.base_rowid * ngroup == preds.size(),
-                    "base_rowid is not set correctly");
+      CHECK_EQ(batch.base_rowid * ngroup, preds.size());
       // output convention: nrow * k, where nrow is number of rows
       // k is number of group
       preds.resize(preds.size() + batch.size * ngroup);
@@ -139,22 +199,22 @@ class GBLinear : public IGradBooster {
       }
     }
   }
-  virtual void Predict(const SparseBatch::Inst &inst,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit,
-                       unsigned root_index) {
+  void Predict(const SparseBatch::Inst &inst,
+               std::vector<float> *out_preds,
+               unsigned ntree_limit,
+               unsigned root_index) override {
     const int ngroup = model.param.num_output_group;
     for (int gid = 0; gid < ngroup; ++gid) {
-      this->Pred(inst, BeginPtr(*out_preds));
+      this->Pred(inst, dmlc::BeginPtr(*out_preds));
     }
   }
-  virtual void PredictLeaf(IFMatrix *p_fmat,
-                           const BoosterInfo &info,
-                           std::vector<float> *out_preds,
-                           unsigned ntree_limit = 0) {
-    utils::Error("gblinear does not support predict leaf index");
+  void PredictLeaf(DMatrix *p_fmat,
+                   std::vector<float> *out_preds,
+                   unsigned ntree_limit) override {
+    LOG(FATAL) << "gblinear does not support predict leaf index";
   }
-  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
+
+  std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) const override {
     std::stringstream fo("");
     fo << "bias:\n";
     for (int i = 0; i < model.param.num_output_group; ++i) {
@@ -182,76 +242,11 @@ class GBLinear : public IGradBooster {
       preds[gid] = psum;
     }
   }
-  // training parameter
-  struct ParamTrain {
-    /*! \brief learning_rate */
-    float learning_rate;
-    /*! \brief regularization weight for L2 norm */
-    float reg_lambda;
-    /*! \brief regularization weight for L1 norm */
-    float reg_alpha;
-    /*! \brief regularization weight for L2 norm in bias */
-    float reg_lambda_bias;
-    // parameter
-    ParamTrain(void) {
-      reg_alpha = 0.0f;
-      reg_lambda = 0.0f;
-      reg_lambda_bias = 0.0f;
-      learning_rate = 1.0f;
-    }
-    inline void SetParam(const char *name, const char *val) {
-      using namespace std;
-      // sync-names
-      if (!strcmp("eta", name)) learning_rate = static_cast<float>(atof(val));
-      if (!strcmp("lambda", name)) reg_lambda = static_cast<float>(atof(val));
-      if (!strcmp( "alpha", name)) reg_alpha = static_cast<float>(atof(val));
-      if (!strcmp( "lambda_bias", name)) reg_lambda_bias = static_cast<float>(atof(val));
-      // real names
-      if (!strcmp( "learning_rate", name)) learning_rate = static_cast<float>(atof(val));
-      if (!strcmp( "reg_lambda", name)) reg_lambda = static_cast<float>(atof(val));
-      if (!strcmp( "reg_alpha", name)) reg_alpha = static_cast<float>(atof(val));
-      if (!strcmp( "reg_lambda_bias", name)) reg_lambda_bias = static_cast<float>(atof(val));
-    }
-    // given original weight calculate delta
-    inline double CalcDelta(double sum_grad, double sum_hess, double w) {
-      if (sum_hess < 1e-5f) return 0.0f;
-      double tmp = w - (sum_grad + reg_lambda * w) / (sum_hess + reg_lambda);
-      if (tmp >=0) {
-        return std::max(-(sum_grad + reg_lambda * w + reg_alpha) / (sum_hess + reg_lambda), -w);
-      } else {
-        return std::min(-(sum_grad + reg_lambda * w - reg_alpha) / (sum_hess + reg_lambda), -w);
-      }
-    }
-    // given original weight calculate delta bias
-    inline double CalcDeltaBias(double sum_grad, double sum_hess, double w) {
-      return - (sum_grad + reg_lambda_bias * w) / (sum_hess + reg_lambda_bias);
-    }
-  };
   // model for linear booster
   class Model {
    public:
-    // model parameter
-    struct Param {
-      // number of feature dimension
-      unsigned num_feature;
-      // number of output group
-      int num_output_group;
-      // reserved field
-      int reserved[32];
-      // constructor
-      Param(void) {
-        num_feature = 0;
-        num_output_group = 1;
-        std::memset(reserved, 0, sizeof(reserved));
-      }
-      inline void SetParam(const char *name, const char *val) {
-        using namespace std;
-        if (!strcmp(name, "bst:num_feature")) num_feature = static_cast<unsigned>(atoi(val));
-        if (!strcmp(name, "num_output_group")) num_output_group = atoi(val);
-      }
-    };
     // parameter
-    Param param;
+    GBLinearModelParam param;
     // weight for each of feature, bias is the last one
     std::vector<float> weight;
     // initialize the model parameter
@@ -261,32 +256,46 @@ class GBLinear : public IGradBooster {
       std::fill(weight.begin(), weight.end(), 0.0f);
     }
     // save the model to file
-    inline void SaveModel(utils::IStream &fo) const { // NOLINT(*)
-      fo.Write(&param, sizeof(Param));
-      fo.Write(weight);
+    inline void Save(dmlc::Stream* fo) const {
+      fo->Write(&param, sizeof(param));
+      fo->Write(weight);
     }
     // load model from file
-    inline void LoadModel(utils::IStream &fi) { // NOLINT(*)
-      utils::Assert(fi.Read(&param, sizeof(Param)) != 0, "Load LinearBooster");
-      fi.Read(&weight);
+    inline void Load(dmlc::Stream* fi) {
+      CHECK_EQ(fi->Read(&param, sizeof(param)), sizeof(param));
+      fi->Read(&weight);
     }
     // model bias
-    inline float* bias(void) {
+    inline float* bias() {
+      return &weight[param.num_feature * param.num_output_group];
+    }
+    inline const float* bias() const {
       return &weight[param.num_feature * param.num_output_group];
     }
     // get i-th weight
     inline float* operator[](size_t i) {
       return &weight[i * param.num_output_group];
     }
+    inline const float* operator[](size_t i) const {
+      return &weight[i * param.num_output_group];
+    }
   };
   // model field
   Model model;
   // training parameter
-  ParamTrain param;
+  GBLinearTrainParam param;
   // Per feature: shuffle index of each feature index
   std::vector<bst_uint> feat_index;
 };
 
+// register the parameter structs and the gradient booster
+DMLC_REGISTER_PARAMETER(GBLinearModelParam);
+DMLC_REGISTER_PARAMETER(GBLinearTrainParam);
+
+XGBOOST_REGISTER_GBM(GBLinear, "gblinear")
+.describe("Linear booster, implement generalized linear model.")
+.set_body([]() {
+    return new GBLinear();
+  });
 }  // namespace gbm
 }  // namespace xgboost
-#endif  // XGBOOST_GBM_GBLINEAR_INL_HPP_
diff --git a/src/gbm/gbm.cc b/src/gbm/gbm.cc
new file mode 100644
index 000000000..ae5185867
--- /dev/null
+++ b/src/gbm/gbm.cc
@@ -0,0 +1,29 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file gbm.cc
+ * \brief Registry of gradient boosters.
+ */
+#include <xgboost/gbm.h>
+#include <dmlc/registry.h>
+
+namespace dmlc {
+DMLC_REGISTRY_ENABLE(::xgboost::GradientBoosterReg);
+}  // namespace dmlc
+
+namespace xgboost {
+GradientBooster* GradientBooster::Create(const std::string& name) {  // factory by name
+  auto *e = ::dmlc::Registry< ::xgboost::GradientBoosterReg>::Get()->Find(name);
+  if (e == nullptr) {  // no registry entry under this name
+    LOG(FATAL) << "Unknown gbm type " << name;
+  }
+  return (e->body)();  // call the entry's registered factory body
+}
+}  // namespace xgboost
+
+namespace xgboost {
+namespace gbm {
+// List of files that will be force linked in static links.
+DMLC_REGISTRY_LINK_TAG(gblinear);
+DMLC_REGISTRY_LINK_TAG(gbtree);
+}  // namespace gbm
+}  // namespace xgboost
diff --git a/src/gbm/gbm.cpp b/src/gbm/gbm.cpp
deleted file mode 100644
index 13ad44c57..000000000
--- a/src/gbm/gbm.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright by Contributors
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <cstring>
-#include "./gbm.h"
-#include "./gbtree-inl.hpp"
-#include "./gblinear-inl.hpp"
-
-namespace xgboost {
-namespace gbm {
-IGradBooster* CreateGradBooster(const char *name) {
-  using namespace std;
-  if (!strcmp("gbtree", name)) return new GBTree();
-  if (!strcmp("gblinear", name)) return new GBLinear();
-  utils::Error("unknown booster type: %s", name);
-  return NULL;
-}
-}  // namespace gbm
-}  // namespace xgboost
-
diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h
deleted file mode 100644
index 8ff692c05..000000000
--- a/src/gbm/gbm.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*!
- * Copyright by Contributors
- * \file gbm.h
- * \brief interface of gradient booster, that learns through gradient statistics
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_GBM_GBM_H_
-#define XGBOOST_GBM_GBM_H_
-
-#include <vector>
-#include <string>
-#include "../data.h"
-#include "../utils/io.h"
-#include "../utils/fmap.h"
-
-namespace xgboost {
-/*! \brief namespace for gradient booster */
-namespace gbm {
-/*!
- * \brief interface of gradient boosting model
- */
-class IGradBooster {
- public:
-  /*!
-   * \brief set parameters from outside
-   * \param name name of the parameter
-   * \param val  value of the parameter
-   */
-  virtual void SetParam(const char *name, const char *val) = 0;
-  /*!
-   * \brief load model from stream
-   * \param fi input stream
-   * \param with_pbuffer whether the incoming data contains pbuffer
-   */
-  virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) = 0; // NOLINT(*)
-  /*!
-   * \brief save model to stream
-   * \param fo output stream
-   * \param with_pbuffer whether save out pbuffer
-   */
-  virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const = 0; // NOLINT(*)
-  /*!
-   * \brief initialize the model
-   */
-  virtual void InitModel(void) = 0;
-  /*!
-   * \brief reset the predict buffer
-   * this will invalidate all the previous cached results
-   * and recalculate from scratch
-   */
-  virtual void ResetPredBuffer(size_t num_pbuffer) {}
-  /*!
-   * \brief whether the model allow lazy checkpoint
-   * return true if model is only updated in DoBoost
-   * after all Allreduce calls
-   */
-  virtual bool AllowLazyCheckPoint(void) const {
-    return false;
-  }
-  /*!
-   * \brief perform update to the model(boosting)
-   * \param p_fmat feature matrix that provide access to features
-   * \param buffer_offset buffer index offset of these instances, if equals -1
-   *        this means we do not have buffer index allocated to the gbm
-   * \param info meta information about training
-   * \param in_gpair address of the gradient pair statistics of the data
-   * the booster may change content of gpair
-   */
-  virtual void DoBoost(IFMatrix *p_fmat,
-                       int64_t buffer_offset,
-                       const BoosterInfo &info,
-                       std::vector<bst_gpair> *in_gpair) = 0;
-  /*!
-   * \brief generate predictions for given feature matrix
-   * \param p_fmat feature matrix
-   * \param buffer_offset buffer index offset of these instances, if equals -1
-   *        this means we do not have buffer index allocated to the gbm
-   *  a buffer index is assigned to each instance that requires repeative prediction
-   *  the size of buffer is set by convention using IGradBooster.SetParam("num_pbuffer","size")
-   * \param info extra side information that may be needed for prediction
-   * \param out_preds output vector to hold the predictions
-   * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
-   *    we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
-   */
-  virtual void Predict(IFMatrix *p_fmat,
-                       int64_t buffer_offset,
-                       const BoosterInfo &info,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit = 0) = 0;
-  /*!
-   * \brief online prediction function, predict score for one instance at a time
-   *  NOTE: use the batch prediction interface if possible, batch prediction is usually
-   *        more efficient than online prediction
-   *        This function is NOT threadsafe, make sure you only call from one thread
-   *
-   * \param inst the instance you want to predict
-   * \param out_preds output vector to hold the predictions
-   * \param ntree_limit limit the number of trees used in prediction
-   * \param root_index the root index
-   * \sa Predict
-   */
-  virtual void Predict(const SparseBatch::Inst &inst,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit = 0,
-                       unsigned root_index = 0)  = 0;
-  /*!
-   * \brief predict the leaf index of each tree, the output will be nsample * ntree vector
-   *        this is only valid in gbtree predictor
-   * \param p_fmat feature matrix
-   * \param info extra side information that may be needed for prediction
-   * \param out_preds output vector to hold the predictions
-   * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
-   *    we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
-   */
-  virtual void PredictLeaf(IFMatrix *p_fmat,
-                           const BoosterInfo &info,
-                           std::vector<float> *out_preds,
-                           unsigned ntree_limit = 0) = 0;
-  /*!
-   * \brief dump the model in text format
-   * \param fmap feature map that may help give interpretations of feature
-   * \param option extra option of the dump model
-   * \return a vector of dump for boosters
-   */
-  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) = 0;
-  // destrcutor
-  virtual ~IGradBooster(void){}
-};
-/*!
- * \breif create a gradient booster from given name
- * \param name name of gradient booster
- */
-IGradBooster* CreateGradBooster(const char *name);
-}  // namespace gbm
-}  // namespace xgboost
-#endif  // XGBOOST_GBM_GBM_H_
diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
deleted file mode 100644
index 65fe7e9da..000000000
--- a/src/gbm/gbtree-inl.hpp
+++ /dev/null
@@ -1,520 +0,0 @@
-/*!
- * Copyright by Contributors
- * \file gbtree-inl.hpp
- * \brief gradient boosted tree implementation
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_GBM_GBTREE_INL_HPP_
-#define XGBOOST_GBM_GBTREE_INL_HPP_
-
-#include <vector>
-#include <utility>
-#include <string>
-#include <limits>
-#include "./gbm.h"
-#include "../utils/omp.h"
-#include "../tree/updater.h"
-
-namespace xgboost {
-namespace gbm {
-/*!
- * \brief gradient boosted tree
- */
-class GBTree : public IGradBooster {
- public:
-  GBTree(void) {
-  }
-  virtual ~GBTree(void) {
-    this->Clear();
-  }
-  virtual void SetParam(const char *name, const char *val) {
-    using namespace std;
-    if (!strncmp(name, "bst:", 4)) {
-      cfg.push_back(std::make_pair(std::string(name+4), std::string(val)));
-      // set into updaters, if already initialized
-      for (size_t i = 0; i < updaters.size(); ++i) {
-        updaters[i]->SetParam(name+4, val);
-      }
-    }
-    if (!strcmp(name, "silent")) {
-      this->SetParam("bst:silent", val);
-    }
-    tparam.SetParam(name, val);
-    if (trees.size() == 0) mparam.SetParam(name, val);
-  }
-  virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) { // NOLINT(*)
-    this->Clear();
-    utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
-                 "GBTree: invalid model file");
-    trees.resize(mparam.num_trees);
-    for (size_t i = 0; i < trees.size(); ++i) {
-      trees[i] = new tree::RegTree();
-      trees[i]->LoadModel(fi);
-    }
-    tree_info.resize(mparam.num_trees);
-    if (mparam.num_trees != 0) {
-      utils::Check(fi.Read(&tree_info[0], sizeof(int) * mparam.num_trees) != 0,
-                   "GBTree: invalid model file");
-    }
-    if (mparam.num_pbuffer != 0 && with_pbuffer) {
-      pred_buffer.resize(mparam.PredBufferSize());
-      pred_counter.resize(mparam.PredBufferSize());
-      utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
-                   "GBTree: invalid model file");
-      utils::Check(fi.Read(&pred_counter[0], pred_counter.size() * sizeof(unsigned)) != 0,
-                   "GBTree: invalid model file");
-    }
-  }
-  virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const { // NOLINT(*)
-    utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree");
-    if (with_pbuffer) {
-      fo.Write(&mparam, sizeof(ModelParam));
-    } else {
-      ModelParam p = mparam;
-      p.num_pbuffer = 0;
-      fo.Write(&p, sizeof(ModelParam));
-    }
-    for (size_t i = 0; i < trees.size(); ++i) {
-      trees[i]->SaveModel(fo);
-    }
-    if (tree_info.size() != 0) {
-      fo.Write(BeginPtr(tree_info), sizeof(int) * tree_info.size());
-    }
-    if (mparam.num_pbuffer != 0 && with_pbuffer) {
-      fo.Write(BeginPtr(pred_buffer), pred_buffer.size() * sizeof(float));
-      fo.Write(BeginPtr(pred_counter), pred_counter.size() * sizeof(unsigned));
-    }
-  }
-  // initialize the predict buffer
-  virtual void InitModel(void) {
-    pred_buffer.clear(); pred_counter.clear();
-    pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
-    pred_counter.resize(mparam.PredBufferSize(), 0);
-    utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
-    utils::Assert(trees.size() == 0, "GBTree: model already initialized");
-  }
-  virtual void ResetPredBuffer(size_t num_pbuffer) {
-    mparam.num_pbuffer = static_cast<int64_t>(num_pbuffer);
-    pred_buffer.clear(); pred_counter.clear();
-    pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
-    pred_counter.resize(mparam.PredBufferSize(), 0);
-  }
-  virtual bool AllowLazyCheckPoint(void) const {
-    return !(tparam.distcol_mode != 0  && mparam.num_output_group != 1);
-  }
-  virtual void DoBoost(IFMatrix *p_fmat,
-                       int64_t buffer_offset,
-                       const BoosterInfo &info,
-                       std::vector<bst_gpair> *in_gpair) {
-    const std::vector<bst_gpair> &gpair = *in_gpair;
-    std::vector<std::vector<tree::RegTree*> > new_trees;
-    if (mparam.num_output_group == 1) {
-      new_trees.push_back(BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0));
-    } else {
-      const int ngroup = mparam.num_output_group;
-      utils::Check(gpair.size() % ngroup == 0,
-                   "must have exactly ngroup*nrow gpairs");
-      std::vector<bst_gpair> tmp(gpair.size()/ngroup);
-      for (int gid = 0; gid < ngroup; ++gid) {
-        bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
-        #pragma omp parallel for schedule(static)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          tmp[i] = gpair[i * ngroup + gid];
-        }
-        new_trees.push_back(BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid));
-      }
-    }
-    for (int gid = 0; gid < mparam.num_output_group; ++gid) {
-      this->CommitModel(new_trees[gid], gid);
-    }
-  }
-  virtual void Predict(IFMatrix *p_fmat,
-                       int64_t buffer_offset,
-                       const BoosterInfo &info,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit = 0) {
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
-    InitThreadTemp(nthread);
-    std::vector<float> &preds = *out_preds;
-    const size_t stride = info.num_row * mparam.num_output_group;
-    preds.resize(stride * (mparam.size_leaf_vector+1));
-    // start collecting the prediction
-    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      // parallel over local batch
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        const int tid = omp_get_thread_num();
-        tree::RegTree::FVec &feats = thread_temp[tid];
-        int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
-        utils::Assert(static_cast<size_t>(ridx) < info.num_row, "data row index exceed bound");
-        // loop over output groups
-        for (int gid = 0; gid < mparam.num_output_group; ++gid) {
-          this->Pred(batch[i],
-                     buffer_offset < 0 ? -1 : buffer_offset + ridx,
-                     gid, info.GetRoot(ridx), &feats,
-                     &preds[ridx * mparam.num_output_group + gid], stride,
-                     ntree_limit);
-        }
-      }
-    }
-  }
-  virtual void Predict(const SparseBatch::Inst &inst,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit,
-                       unsigned root_index) {
-    if (thread_temp.size() == 0) {
-      thread_temp.resize(1, tree::RegTree::FVec());
-      thread_temp[0].Init(mparam.num_feature);
-    }
-    out_preds->resize(mparam.num_output_group * (mparam.size_leaf_vector+1));
-    // loop over output groups
-    for (int gid = 0; gid < mparam.num_output_group; ++gid) {
-      this->Pred(inst, -1, gid, root_index, &thread_temp[0],
-                 &(*out_preds)[gid], mparam.num_output_group,
-                 ntree_limit);
-    }
-  }
-  virtual void PredictLeaf(IFMatrix *p_fmat,
-                           const BoosterInfo &info,
-                           std::vector<float> *out_preds,
-                           unsigned ntree_limit) {
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
-    InitThreadTemp(nthread);
-    this->PredPath(p_fmat, info, out_preds, ntree_limit);
-  }
-  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
-    std::vector<std::string> dump;
-    for (size_t i = 0; i < trees.size(); i++) {
-      dump.push_back(trees[i]->DumpModel(fmap, option&1));
-    }
-    return dump;
-  }
-
- protected:
-  // clear the model
-  inline void Clear(void) {
-    for (size_t i = 0; i < trees.size(); ++i) {
-      delete trees[i];
-    }
-    for (size_t i = 0; i < updaters.size(); ++i) {
-      delete updaters[i];
-    }
-    updaters.clear();
-    trees.clear();
-    pred_buffer.clear();
-    pred_counter.clear();
-  }
-  // initialize updater before using them
-  inline void InitUpdater(void) {
-    if (tparam.updater_initialized != 0) return;
-    for (size_t i = 0; i < updaters.size(); ++i) {
-      delete updaters[i];
-    }
-    updaters.clear();
-    std::string tval = tparam.updater_seq;
-    char *pstr;
-    pstr = std::strtok(&tval[0], ",");
-    while (pstr != NULL) {
-      updaters.push_back(tree::CreateUpdater(pstr));
-      for (size_t j = 0; j < cfg.size(); ++j) {
-        // set parameters
-        updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
-      }
-      pstr = std::strtok(NULL, ",");
-    }
-    tparam.updater_initialized = 1;
-  }
-  // do group specific group
-  inline std::vector<tree::RegTree*>
-  BoostNewTrees(const std::vector<bst_gpair> &gpair,
-                IFMatrix *p_fmat,
-                int64_t buffer_offset,
-                const BoosterInfo &info,
-                int bst_group) {
-    std::vector<tree::RegTree *> new_trees;
-    this->InitUpdater();
-    // create the trees
-    for (int i = 0; i < tparam.num_parallel_tree; ++i) {
-      new_trees.push_back(new tree::RegTree());
-      for (size_t j = 0; j < cfg.size(); ++j) {
-        new_trees.back()->param.SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
-      }
-      new_trees.back()->InitModel();
-    }
-    // update the trees
-    for (size_t i = 0; i < updaters.size(); ++i) {
-      updaters[i]->Update(gpair, p_fmat, info, new_trees);
-    }
-    // optimization, update buffer, if possible
-    // this is only under distributed column mode
-    // for safety check of lazy checkpoint
-    if (
-        buffer_offset >= 0 &&
-        new_trees.size() == 1 && updaters.size() > 0 &&
-        updaters.back()->GetLeafPosition() != NULL) {
-      utils::Check(info.num_row == p_fmat->buffered_rowset().size(),
-                   "distributed mode is not compatible with prob_buffer_row");
-      this->UpdateBufferByPosition(p_fmat,
-                                   buffer_offset, bst_group,
-                                   *new_trees[0],
-                                   updaters.back()->GetLeafPosition());
-    }
-    return new_trees;
-  }
-  // commit new trees all at once
-  inline void CommitModel(const std::vector<tree::RegTree*> &new_trees, int bst_group) {
-    for (size_t i = 0; i < new_trees.size(); ++i) {
-      trees.push_back(new_trees[i]);
-      tree_info.push_back(bst_group);
-    }
-    mparam.num_trees += static_cast<int>(new_trees.size());
-  }
-  // update buffer by pre-cached position
-  inline void UpdateBufferByPosition(IFMatrix *p_fmat,
-                                     int64_t buffer_offset,
-                                     int bst_group,
-                                     const tree::RegTree &new_tree,
-                                     const int* leaf_position) {
-    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      const bst_uint ridx = rowset[i];
-      const int64_t bid = mparam.BufferOffset(buffer_offset + ridx, bst_group);
-      const int tid = leaf_position[ridx];
-      utils::Assert(pred_counter[bid] == trees.size(), "cached buffer not up to date");
-      utils::Assert(tid >= 0, "invalid leaf position");
-      pred_buffer[bid] += new_tree[tid].leaf_value();
-      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
-        pred_buffer[bid + i + 1] += new_tree.leafvec(tid)[i];
-      }
-      pred_counter[bid] += tparam.num_parallel_tree;
-    }
-  }
-  // make a prediction for a single instance
-  inline void Pred(const RowBatch::Inst &inst,
-                   int64_t buffer_index,
-                   int bst_group,
-                   unsigned root_index,
-                   tree::RegTree::FVec *p_feats,
-                   float *out_pred, size_t stride,
-                   unsigned ntree_limit) {
-    size_t itop = 0;
-    float  psum = 0.0f;
-    // sum of leaf vector
-    std::vector<float> vec_psum(mparam.size_leaf_vector, 0.0f);
-    const int64_t bid = mparam.BufferOffset(buffer_index, bst_group);
-    // number of valid trees
-    unsigned treeleft = ntree_limit == 0 ? std::numeric_limits<unsigned>::max() : ntree_limit;
-    // load buffered results if any
-    if (bid >= 0 && ntree_limit == 0) {
-      itop = pred_counter[bid];
-      psum = pred_buffer[bid];
-      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
-        vec_psum[i] = pred_buffer[bid + i + 1];
-      }
-    }
-    if (itop != trees.size()) {
-      p_feats->Fill(inst);
-      for (size_t i = itop; i < trees.size(); ++i) {
-        if (tree_info[i] == bst_group) {
-          int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
-          psum += (*trees[i])[tid].leaf_value();
-          for (int j = 0; j < mparam.size_leaf_vector; ++j) {
-            vec_psum[j] += trees[i]->leafvec(tid)[j];
-          }
-          if (--treeleft == 0) break;
-        }
-      }
-      p_feats->Drop(inst);
-    }
-    // updated the buffered results
-    if (bid >= 0 && ntree_limit == 0) {
-      pred_counter[bid] = static_cast<unsigned>(trees.size());
-      pred_buffer[bid] = psum;
-      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
-        pred_buffer[bid + i + 1] = vec_psum[i];
-      }
-    }
-    out_pred[0] = psum;
-    for (int i = 0; i < mparam.size_leaf_vector; ++i) {
-      out_pred[stride * (i + 1)] = vec_psum[i];
-    }
-  }
-  // predict independent leaf index
-  inline void PredPath(IFMatrix *p_fmat,
-                       const BoosterInfo &info,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit) {
-    // number of valid trees
-    if (ntree_limit == 0 || ntree_limit > trees.size()) {
-      ntree_limit = static_cast<unsigned>(trees.size());
-    }
-    std::vector<float> &preds = *out_preds;
-    preds.resize(info.num_row * ntree_limit);
-    // start collecting the prediction
-    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      // parallel over local batch
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        const int tid = omp_get_thread_num();
-        size_t ridx = static_cast<size_t>(batch.base_rowid + i);
-        tree::RegTree::FVec &feats = thread_temp[tid];
-        feats.Fill(batch[i]);
-        for (unsigned j = 0; j < ntree_limit; ++j) {
-          int tid = trees[j]->GetLeafIndex(feats, info.GetRoot(ridx));
-          preds[ridx * ntree_limit + j] = static_cast<float>(tid);
-        }
-        feats.Drop(batch[i]);
-      }
-    }
-  }
-  // init thread buffers
-  inline void InitThreadTemp(int nthread) {
-    int prev_thread_temp_size = thread_temp.size();
-    if (prev_thread_temp_size < nthread) {
-      thread_temp.resize(nthread, tree::RegTree::FVec());
-      for (int i = prev_thread_temp_size; i < nthread; ++i) {
-        thread_temp[i].Init(mparam.num_feature);
-      }
-    }
-  }
-
-  // --- data structure ---
-  /*! \brief training parameters */
-  struct TrainParam {
-    /*! \brief number of threads */
-    int nthread;
-    /*!
-     * \brief number of parallel trees constructed each iteration
-     *  use this option to support boosted random forest
-     */
-    int num_parallel_tree;
-    /*! \brief whether updater is already initialized */
-    int updater_initialized;
-    /*! \brief distributed column mode */
-    int distcol_mode;
-    /*! \brief tree updater sequence */
-    std::string updater_seq;
-    // construction
-    TrainParam(void) {
-      nthread = 0;
-      updater_seq = "grow_colmaker,prune";
-      num_parallel_tree = 1;
-      updater_initialized = 0;
-      distcol_mode = 0;
-    }
-    inline void SetParam(const char *name, const char *val){
-      using namespace std;
-      if (!strcmp(name, "updater") &&
-          strcmp(updater_seq.c_str(), val) != 0) {
-        updater_seq = val;
-        updater_initialized = 0;
-      }
-      if (!strcmp(name, "dsplit") && !strcmp(val, "col")) {
-        distcol_mode = 1;
-      }
-      if (!strcmp(name, "nthread")) {
-        omp_set_num_threads(nthread = atoi(val));
-      }
-      if (!strcmp(name, "num_parallel_tree")) {
-        num_parallel_tree = atoi(val);
-      }
-    }
-  };
-  /*! \brief model parameters */
-  struct ModelParam {
-    /*! \brief number of trees */
-    int num_trees;
-    /*! \brief number of root: default 0, means single tree */
-    int num_roots;
-    /*! \brief number of features to be used by trees */
-    int num_feature;
-    /*! \brief size of prediction buffer allocated used for buffering */
-    int64_t num_pbuffer;
-    /*!
-     * \brief how many output group a single instance can produce
-     *  this affects the behavior of number of output we have:
-     *    suppose we have n instance and k group, output will be k*n
-     */
-    int num_output_group;
-    /*! \brief size of leaf vector needed in tree */
-    int size_leaf_vector;
-    /*! \brief reserved parameters */
-    int reserved[31];
-    /*! \brief constructor */
-    ModelParam(void) {
-      std::memset(this, 0, sizeof(ModelParam));
-      num_trees = 0;
-      num_roots = num_feature = 0;
-      num_pbuffer = 0;
-      num_output_group = 1;
-      size_leaf_vector = 0;
-    }
-    /*!
-     * \brief set parameters from outside
-     * \param name name of the parameter
-     * \param val  value of the parameter
-     */
-    inline void SetParam(const char *name, const char *val) {
-      using namespace std;
-      if (!strcmp("num_pbuffer", name)) num_pbuffer = atol(val);
-      if (!strcmp("num_output_group", name)) num_output_group = atol(val);
-      if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
-      if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
-      if (!strcmp("bst:size_leaf_vector", name)) size_leaf_vector = atoi(val);
-    }
-    /*! \return size of prediction buffer actually needed */
-    inline size_t PredBufferSize(void) const {
-      return num_output_group * num_pbuffer * (size_leaf_vector + 1);
-    }
-    /*!
-     * \brief get the buffer offset given a buffer index and group id
-     * \return calculated buffer offset
-     */
-    inline int64_t BufferOffset(int64_t buffer_index, int bst_group) const {
-      if (buffer_index < 0) return -1;
-      utils::Check(buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer");
-      return (buffer_index + num_pbuffer * bst_group) * (size_leaf_vector + 1);
-    }
-  };
-  // training parameter
-  TrainParam tparam;
-  // model parameter
-  ModelParam mparam;
-  /*! \brief vector of trees stored in the model */
-  std::vector<tree::RegTree*> trees;
-  /*! \brief some information indicator of the tree, reserved */
-  std::vector<int> tree_info;
-  /*! \brief prediction buffer */
-  std::vector<float>  pred_buffer;
-  /*! \brief prediction buffer counter, remember the prediction */
-  std::vector<unsigned> pred_counter;
-  // ----training fields----
-  // configurations for tree
-  std::vector< std::pair<std::string, std::string> > cfg;
-  // temporal storage for per thread
-  std::vector<tree::RegTree::FVec> thread_temp;
-  // the updaters that can be applied to each of tree
-  std::vector<tree::IUpdater*> updaters;
-};
-
-}  // namespace gbm
-}  // namespace xgboost
-#endif  // XGBOOST_GBM_GBTREE_INL_HPP_
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
new file mode 100644
index 000000000..6618cd503
--- /dev/null
+++ b/src/gbm/gbtree.cc
@@ -0,0 +1,486 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file gbtree.cc
+ * \brief gradient boosted tree implementation.
+ * \author Tianqi Chen
+ */
+#include <dmlc/omp.h>
+#include <dmlc/parameter.h>
+#include <xgboost/logging.h>
+#include <xgboost/gbm.h>
+#include <xgboost/tree_updater.h>
+
+#include <vector>
+#include <memory>
+#include <utility>
+#include <string>
+#include <limits>
+
+namespace xgboost {
+namespace gbm {
+
+DMLC_REGISTRY_FILE_TAG(gbtree);
+
+/*! \brief training parameters of the tree booster, filled from the configuration pairs */
+struct GBTreeTrainParam : public dmlc::Parameter<GBTreeTrainParam> {
+  /*! \brief number of threads; 0 means the OpenMP thread count is left untouched */
+  int nthread;
+  /*!
+   * \brief number of parallel trees constructed each iteration
+   *  use this option to support boosted random forest
+   */
+  int num_parallel_tree;
+  /*! \brief comma separated names of the tree updaters, applied in the listed order */
+  std::string updater_seq;
+  // declare the fields that can be set by name, with their bounds and defaults
+  DMLC_DECLARE_PARAMETER(GBTreeTrainParam) {
+    DMLC_DECLARE_FIELD(nthread).set_lower_bound(0).set_default(0)
+        .describe("Number of threads used for training.");
+    DMLC_DECLARE_FIELD(num_parallel_tree).set_lower_bound(1).set_default(1)
+        .describe("Number of parallel trees constructed during each iteration."\
+                  " This option is used to support boosted random forest");
+    DMLC_DECLARE_FIELD(updater_seq).set_default("grow_colmaker,prune")
+        .describe("Tree updater sequence.");
+    // the key `updater` is accepted as an alias of `updater_seq`
+    DMLC_DECLARE_ALIAS(updater_seq, updater);
+  }
+};
+
+/*! \brief model parameters; stored as raw bytes in the model file, keep the layout fixed */
+struct GBTreeModelParam : public dmlc::Parameter<GBTreeModelParam> {
+  /*! \brief number of trees */
+  int num_trees;
+  /*! \brief number of roots */
+  int num_roots;
+  /*! \brief number of features to be used by trees */
+  int num_feature;
+  /*! \brief pad this space, for backward compatibility reason. */
+  int pad_32bit;
+  /*! \brief deprecated slot, kept only so the binary layout does not change. */
+  int64_t num_pbuffer_deprecated;
+  /*!
+   * \brief how many output group a single instance can produce
+   *  this affects the behavior of number of output we have:
+   *    suppose we have n instance and k group, output will be k * n
+   */
+  int num_output_group;
+  /*! \brief size of leaf vector needed in tree */
+  int size_leaf_vector;
+  /*! \brief reserved parameters */
+  int reserved[32];
+  /*! \brief constructor: zero every byte, including the padding and reserved slots */
+  GBTreeModelParam() {
+    std::memset(this, 0, sizeof(GBTreeModelParam));
+    static_assert(sizeof(GBTreeModelParam) == (4 + 2 + 2 + 32) * sizeof(int),
+                  "64/32 bit compatibility issue");
+  }
+  // declare parameters, only declare those that need to be set.
+  DMLC_DECLARE_PARAMETER(GBTreeModelParam) {
+    DMLC_DECLARE_FIELD(num_output_group).set_lower_bound(1).set_default(1)
+        .describe("Number of output groups to be predicted,"\
+                  " used for multi-class classification.");
+    DMLC_DECLARE_FIELD(num_roots).set_lower_bound(1).set_default(1)
+        .describe("Number of start roots of the trees.");
+    DMLC_DECLARE_FIELD(num_feature).set_lower_bound(0)
+        .describe("Number of features used for training and prediction.");
+    DMLC_DECLARE_FIELD(size_leaf_vector).set_lower_bound(0).set_default(0)
+        .describe("Reserved option for vector tree.");
+  }
+};
+
+// gradient boosted trees
+class GBTree : public GradientBooster {
+ public:
+  GBTree() : num_pbuffer(0) {}
+
+  void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
+    this->cfg = cfg;
+    // initialize model parameters if not yet been initialized.
+    if (trees.size() == 0) {
+      mparam.InitAllowUnknown(cfg);
+    }
+    // initialize the updaters only when needed.
+    std::string updater_seq = tparam.updater_seq;
+    tparam.InitAllowUnknown(cfg);
+    if (updater_seq != tparam.updater_seq) updaters.clear();
+    for (const auto& up : updaters) {
+      up->Init(cfg);
+    }
+    if (tparam.nthread != 0) {
+      omp_set_num_threads(tparam.nthread);
+    }
+  }
+
+  void Load(dmlc::Stream* fi) override {
+    CHECK_EQ(fi->Read(&mparam, sizeof(mparam)), sizeof(mparam))
+        << "GBTree: invalid model file";
+    trees.clear();
+    for (int i = 0; i < mparam.num_trees; ++i) {
+      std::unique_ptr<RegTree> ptr(new RegTree());
+      ptr->Load(fi);
+      trees.push_back(std::move(ptr));
+    }
+    tree_info.resize(mparam.num_trees);
+    if (mparam.num_trees != 0) {
+      CHECK_EQ(fi->Read(dmlc::BeginPtr(tree_info), sizeof(int) * mparam.num_trees),
+               sizeof(int) * mparam.num_trees);
+    }
+    // clear the predict buffer.
+    this->ResetPredBuffer(num_pbuffer);
+  }
+
+  void Save(dmlc::Stream* fo) const override {
+    CHECK_EQ(mparam.num_trees, static_cast<int>(trees.size()));
+    fo->Write(&mparam, sizeof(mparam));
+    for (size_t i = 0; i < trees.size(); ++i) {
+      trees[i]->Save(fo);
+    }
+    if (tree_info.size() != 0) {
+      fo->Write(dmlc::BeginPtr(tree_info), sizeof(int) * tree_info.size());
+    }
+  }
+
+  void ResetPredBuffer(size_t num_pbuffer) override {
+    this->num_pbuffer = num_pbuffer;
+    pred_buffer.clear();
+    pred_counter.clear();
+    pred_buffer.resize(this->PredBufferSize(), 0.0f);
+    pred_counter.resize(this->PredBufferSize(), 0);
+  }
+
+  bool AllowLazyCheckPoint() const override {
+    return mparam.num_output_group == 1 ||
+        tparam.updater_seq.find("distcol") != std::string::npos;
+  }
+
+  void DoBoost(DMatrix* p_fmat,
+               int64_t buffer_offset,
+               std::vector<bst_gpair>* in_gpair) override {
+    const std::vector<bst_gpair>& gpair = *in_gpair;
+    std::vector<std::vector<std::unique_ptr<RegTree> > > new_trees;
+    if (mparam.num_output_group == 1) {
+      std::vector<std::unique_ptr<RegTree> > ret;
+      BoostNewTrees(gpair, p_fmat, buffer_offset, 0, &ret);
+      new_trees.push_back(std::move(ret));
+    } else {
+      const int ngroup = mparam.num_output_group;
+      CHECK_EQ(gpair.size() % ngroup, 0)
+          << "must have exactly ngroup*nrow gpairs";
+      std::vector<bst_gpair> tmp(gpair.size() / ngroup);
+      for (int gid = 0; gid < ngroup; ++gid) {
+        bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          tmp[i] = gpair[i * ngroup + gid];
+        }
+        std::vector<std::unique_ptr<RegTree> > ret;
+        BoostNewTrees(tmp, p_fmat, buffer_offset, gid, &ret);
+        new_trees.push_back(std::move(ret));
+      }
+    }
+    for (int gid = 0; gid < mparam.num_output_group; ++gid) {
+      this->CommitModel(std::move(new_trees[gid]), gid);
+    }
+  }
+
+  void Predict(DMatrix* p_fmat,
+               int64_t buffer_offset,
+               std::vector<float>* out_preds,
+               unsigned ntree_limit) override {
+    const MetaInfo& info = p_fmat->info();
+    int nthread;
+    #pragma omp parallel
+    {
+      nthread = omp_get_num_threads();
+    }
+    InitThreadTemp(nthread);
+    std::vector<float> &preds = *out_preds;
+    const size_t stride = p_fmat->info().num_row * mparam.num_output_group;
+    preds.resize(stride * (mparam.size_leaf_vector+1));
+    // start collecting the prediction
+    dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
+
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const RowBatch &batch = iter->Value();
+      // parallel over local batch
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
+        const int tid = omp_get_thread_num();
+        RegTree::FVec &feats = thread_temp[tid];
+        int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
+        CHECK_LT(static_cast<size_t>(ridx), info.num_row);
+        // loop over output groups
+        for (int gid = 0; gid < mparam.num_output_group; ++gid) {
+          this->Pred(batch[i],
+                     buffer_offset < 0 ? -1 : buffer_offset + ridx,
+                     gid, info.GetRoot(ridx), &feats,
+                     &preds[ridx * mparam.num_output_group + gid], stride,
+                     ntree_limit);
+        }
+      }
+    }
+  }
+
+  void Predict(const SparseBatch::Inst& inst,
+               std::vector<float>* out_preds,
+               unsigned ntree_limit,
+               unsigned root_index) override {
+    if (thread_temp.size() == 0) {
+      thread_temp.resize(1, RegTree::FVec());
+      thread_temp[0].Init(mparam.num_feature);
+    }
+    out_preds->resize(mparam.num_output_group * (mparam.size_leaf_vector+1));
+    // loop over output groups
+    for (int gid = 0; gid < mparam.num_output_group; ++gid) {
+      this->Pred(inst, -1, gid, root_index, &thread_temp[0],
+                 &(*out_preds)[gid], mparam.num_output_group,
+                 ntree_limit);
+    }
+  }
+
+  void PredictLeaf(DMatrix* p_fmat,
+                   std::vector<float>* out_preds,
+                   unsigned ntree_limit) override {
+    int nthread;
+    #pragma omp parallel
+    {
+      nthread = omp_get_num_threads();
+    }
+    InitThreadTemp(nthread);
+    this->PredPath(p_fmat, out_preds, ntree_limit);
+  }
+
+  std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) const override {
+    std::vector<std::string> dump;
+    for (size_t i = 0; i < trees.size(); i++) {
+      dump.push_back(trees[i]->Dump2Text(fmap, option & 1));
+    }
+    return dump;
+  }
+
+ protected:
+  // lazily create the updaters named in tparam.updater_seq; no-op if any already exist
+  inline void InitUpdater() {
+    if (updaters.size() != 0) return;
+    std::string tval = tparam.updater_seq;  // private copy: strtok overwrites the delimiters
+    char *pstr;
+    pstr = std::strtok(&tval[0], ",");
+    while (pstr != nullptr) {
+      std::unique_ptr<TreeUpdater> up(TreeUpdater::Create(pstr));
+      up->Init(this->cfg);
+      updaters.push_back(std::move(up));
+      pstr = std::strtok(nullptr, ",");  // next comma separated name, nullptr when exhausted
+    }
+  }
+  // grow num_parallel_tree new trees for group bst_group; *ret receives their ownership
+  inline void
+  BoostNewTrees(const std::vector<bst_gpair> &gpair,
+                DMatrix *p_fmat,
+                int64_t buffer_offset,
+                int bst_group,
+                std::vector<std::unique_ptr<RegTree> >* ret) {
+    this->InitUpdater();
+    std::vector<RegTree*> new_trees;
+    ret->clear();
+    // create the trees; new_trees keeps non-owning pointers to the trees held by *ret
+    for (int i = 0; i < tparam.num_parallel_tree; ++i) {
+      std::unique_ptr<RegTree> ptr(new RegTree());
+      ptr->param.InitAllowUnknown(this->cfg);
+      ptr->InitModel();
+      new_trees.push_back(ptr.get());
+      ret->push_back(std::move(ptr));
+    }
+    // run every updater, in order, over the new trees
+    for (auto& up : updaters) {
+      up->Update(gpair, p_fmat, new_trees);
+    }
+    // optimization: update the prediction buffer right away, if possible
+    // this is only under distributed column mode
+    // for safety check of lazy checkpoint
+    if (buffer_offset >= 0 &&
+        new_trees.size() == 1 && updaters.size() > 0 &&
+        updaters.back()->GetLeafPosition() != nullptr) {
+      CHECK_EQ(p_fmat->info().num_row, p_fmat->buffered_rowset().size());
+      this->UpdateBufferByPosition(p_fmat,
+                                   buffer_offset,
+                                   bst_group,
+                                   *new_trees[0],
+                                   updaters.back()->GetLeafPosition());
+    }
+  }
+  // commit new trees all at once
+  inline void CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
+                          int bst_group) {
+    for (size_t i = 0; i < new_trees.size(); ++i) {
+      trees.push_back(std::move(new_trees[i]));
+      tree_info.push_back(bst_group);
+    }
+    mparam.num_trees += static_cast<int>(new_trees.size());
+  }
+  // fold new_tree into the prediction buffer using the cached per-row leaf positions
+  inline void UpdateBufferByPosition(DMatrix *p_fmat,
+                                     int64_t buffer_offset,
+                                     int bst_group,
+                                     const RegTree &new_tree,
+                                     const int* leaf_position) {
+    const std::vector<bst_uint>& rowset = p_fmat->buffered_rowset();
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
+      const bst_uint ridx = rowset[i];
+      const int64_t bid = this->BufferOffset(buffer_offset + ridx, bst_group);
+      const int tid = leaf_position[ridx];  // leaf of new_tree that row ridx ended in
+      CHECK_EQ(pred_counter[bid], trees.size());  // buffer must be current before adding
+      CHECK_GE(tid, 0);
+      pred_buffer[bid] += new_tree[tid].leaf_value();
+      for (int j = 0; j < mparam.size_leaf_vector; ++j) {  // leaf vector follows the scalar
+        pred_buffer[bid + j + 1] += new_tree.leafvec(tid)[j];
+      }
+      pred_counter[bid] += tparam.num_parallel_tree;
+    }
+  }
+  // predict one instance for one output group, reusing the prediction buffer when allowed
+  inline void Pred(const RowBatch::Inst &inst,
+                   int64_t buffer_index,
+                   int bst_group,
+                   unsigned root_index,
+                   RegTree::FVec *p_feats,
+                   float *out_pred,
+                   size_t stride,
+                   unsigned ntree_limit) {
+    size_t itop = 0;
+    float  psum = 0.0f;
+    // sum of leaf vector
+    std::vector<float> vec_psum(mparam.size_leaf_vector, 0.0f);
+    const int64_t bid = this->BufferOffset(buffer_index, bst_group);
+    // number of trees of this group still allowed to contribute; a limit of 0 means no limit
+    unsigned treeleft = ntree_limit == 0 ? std::numeric_limits<unsigned>::max() : ntree_limit;
+    // load buffered results if any; the buffer is only used when no tree limit is set
+    if (bid >= 0 && ntree_limit == 0) {
+      itop = pred_counter[bid];
+      psum = pred_buffer[bid];
+      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
+        vec_psum[i] = pred_buffer[bid + i + 1];
+      }
+    }
+    if (itop != trees.size()) {  // some trees are not accumulated yet
+      p_feats->Fill(inst);
+      for (size_t i = itop; i < trees.size(); ++i) {
+        if (tree_info[i] == bst_group) {  // only trees that belong to this output group
+          int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
+          psum += (*trees[i])[tid].leaf_value();
+          for (int j = 0; j < mparam.size_leaf_vector; ++j) {
+            vec_psum[j] += trees[i]->leafvec(tid)[j];
+          }
+          if (--treeleft == 0) break;
+        }
+      }
+      p_feats->Drop(inst);
+    }
+    // update the buffered results
+    if (bid >= 0 && ntree_limit == 0) {
+      pred_counter[bid] = static_cast<unsigned>(trees.size());
+      pred_buffer[bid] = psum;
+      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
+        pred_buffer[bid + i + 1] = vec_psum[i];
+      }
+    }
+    out_pred[0] = psum;
+    for (int i = 0; i < mparam.size_leaf_vector; ++i) {  // leaf vector entries, stride apart
+      out_pred[stride * (i + 1)] = vec_psum[i];
+    }
+  }
+  // for every row, record the leaf index it reaches in each of the first ntree_limit trees
+  inline void PredPath(DMatrix *p_fmat,
+                       std::vector<float> *out_preds,
+                       unsigned ntree_limit) {
+    const MetaInfo& info = p_fmat->info();
+    // a limit of 0, or one larger than the model, means: use every tree
+    if (ntree_limit == 0 || ntree_limit > trees.size()) {
+      ntree_limit = static_cast<unsigned>(trees.size());
+    }
+    std::vector<float>& preds = *out_preds;
+    preds.resize(info.num_row * ntree_limit);  // ntree_limit consecutive entries per row
+    // start collecting the prediction
+    dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const RowBatch& batch = iter->Value();
+      // parallel over local batch
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
+        const int tid = omp_get_thread_num();  // thread id, selects the scratch vector
+        size_t ridx = static_cast<size_t>(batch.base_rowid + i);
+        RegTree::FVec &feats = thread_temp[tid];
+        feats.Fill(batch[i]);
+        for (unsigned j = 0; j < ntree_limit; ++j) {
+          int leaf_id = trees[j]->GetLeafIndex(feats, info.GetRoot(ridx));
+          preds[ridx * ntree_limit + j] = static_cast<float>(leaf_id);
+        }
+        feats.Drop(batch[i]);
+      }
+    }
+  }
+  // init thread buffers
+  inline void InitThreadTemp(int nthread) {
+    int prev_thread_temp_size = thread_temp.size();
+    if (prev_thread_temp_size < nthread) {
+      thread_temp.resize(nthread, RegTree::FVec());
+      for (int i = prev_thread_temp_size; i < nthread; ++i) {
+        thread_temp[i].Init(mparam.num_feature);
+      }
+    }
+  }
+  /*! \return size of prediction buffer actually needed */
+  inline size_t PredBufferSize() const {
+    return mparam.num_output_group * num_pbuffer * (mparam.size_leaf_vector + 1);
+  }
+  /*!
+   * \brief get the buffer offset given a buffer index and group id
+   * \return calculated buffer offset
+   */
+  inline int64_t BufferOffset(int64_t buffer_index, int bst_group) const {
+    if (buffer_index < 0) return -1;
+    size_t bidx = static_cast<size_t>(buffer_index);
+    CHECK_LT(bidx, num_pbuffer);
+    return (bidx + num_pbuffer * bst_group) * (mparam.size_leaf_vector + 1);
+  }
+
+  // --- data structure ---
+  // training parameter
+  GBTreeTrainParam tparam;
+  // model parameter
+  GBTreeModelParam mparam;
+  /*! \brief vector of trees stored in the model */
+  std::vector<std::unique_ptr<RegTree> > trees;
+  /*! \brief some information indicator of the tree, reserved */
+  std::vector<int> tree_info;
+  /*! \brief predict buffer size */
+  size_t num_pbuffer;
+  /*! \brief prediction buffer */
+  std::vector<float> pred_buffer;
+  /*! \brief prediction buffer counter, remember the prediction */
+  std::vector<unsigned> pred_counter;
+  // ----training fields----
+  // configurations for tree
+  std::vector<std::pair<std::string, std::string> > cfg;
+  // temporal storage for per thread
+  std::vector<RegTree::FVec> thread_temp;
+  // the updaters that can be applied to each of tree
+  std::vector<std::unique_ptr<TreeUpdater> > updaters;
+};
+
+// register the parameter structs and the gbtree gradient booster
+DMLC_REGISTER_PARAMETER(GBTreeModelParam);
+DMLC_REGISTER_PARAMETER(GBTreeTrainParam);
+
+XGBOOST_REGISTER_GBM(GBTree, "gbtree")
+.describe("Tree booster, gradient boosted trees.")
+.set_body([]() {
+    return new GBTree();
+  });
+}  // namespace gbm
+}  // namespace xgboost
diff --git a/src/io/dmlc_simple.cpp b/src/io/dmlc_simple.cpp
deleted file mode 100644
index 0448bd578..000000000
--- a/src/io/dmlc_simple.cpp
+++ /dev/null
@@ -1,229 +0,0 @@
-// Copyright by Contributors
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <string>
-#include "../utils/io.h"
-
-// implements a single no split version of DMLC
-// in case we want to avoid dependency on dmlc-core
-
-namespace xgboost {
-namespace utils {
-/*!
- * \brief line split implementation from single FILE
- * simply returns lines of files, used for stdin
- */
-class SingleFileSplit : public dmlc::InputSplit {
- public:
-  explicit SingleFileSplit(const char *fname)
-      : use_stdin_(false),
-        chunk_begin_(NULL), chunk_end_(NULL) {
-    if (!std::strcmp(fname, "stdin")) {
-#ifndef XGBOOST_STRICT_CXX98_
-      use_stdin_ = true; fp_ = stdin;
-#endif
-    }
-    if (!use_stdin_) {
-      fp_ = utils::FopenCheck(fname, "rb");
-    }
-    buffer_.resize(kBufferSize);
-  }
-  virtual ~SingleFileSplit(void) {
-    if (!use_stdin_) std::fclose(fp_);
-  }
-  virtual size_t Read(void *ptr, size_t size) {
-    return std::fread(ptr, 1, size, fp_);
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    utils::Error("cannot do write in inputsplit");
-  }
-  virtual void BeforeFirst(void) {
-    std::fseek(fp_, 0, SEEK_SET);
-  }
-  virtual bool NextRecord(Blob *out_rec) {
-    if (chunk_begin_ == chunk_end_) {
-      if (!LoadChunk()) return false;
-    }
-    char *next = FindNextRecord(chunk_begin_,
-                                chunk_end_);
-    out_rec->dptr = chunk_begin_;
-    out_rec->size = next - chunk_begin_;
-    chunk_begin_ = next;
-    return true;
-  }
-  virtual bool NextChunk(Blob *out_chunk) {
-    if (chunk_begin_ == chunk_end_) {
-      if (!LoadChunk()) return false;
-    }
-    out_chunk->dptr = chunk_begin_;
-    out_chunk->size = chunk_end_ - chunk_begin_;
-    chunk_begin_ = chunk_end_;
-    return true;
-  }
-  inline bool ReadChunk(void *buf, size_t *size) {
-    size_t max_size = *size;
-    if (max_size <= overflow_.length()) {
-      *size = 0; return true;
-    }
-    if (overflow_.length() != 0) {
-      std::memcpy(buf, BeginPtr(overflow_), overflow_.length());
-    }
-    size_t olen = overflow_.length();
-    overflow_.resize(0);
-    size_t nread = this->Read(reinterpret_cast<char*>(buf) + olen,
-                              max_size - olen);
-    nread += olen;
-    if (nread == 0) return false;
-    if (nread != max_size) {
-      *size = nread;
-      return true;
-    } else {
-      const char *bptr = reinterpret_cast<const char*>(buf);
-      // return the last position where a record starts
-      const char *bend = this->FindLastRecordBegin(bptr, bptr + max_size);
-      *size = bend - bptr;
-      overflow_.resize(max_size - *size);
-      if (overflow_.length() != 0) {
-        std::memcpy(BeginPtr(overflow_), bend, overflow_.length());
-      }
-      return true;
-    }
-  }
-
- protected:
-  inline const char* FindLastRecordBegin(const char *begin,
-                                         const char *end) {
-    if (begin == end) return begin;
-    for (const char *p = end - 1; p != begin; --p) {
-      if (*p == '\n' || *p == '\r') return p + 1;
-    }
-    return begin;
-  }
-  inline char* FindNextRecord(char *begin, char *end) {
-    char *p;
-    for (p = begin; p != end; ++p) {
-      if (*p == '\n' || *p == '\r') break;
-    }
-    for (; p != end; ++p) {
-      if (*p != '\n' && *p != '\r') return p;
-    }
-    return end;
-  }
-  inline bool LoadChunk(void) {
-    while (true) {
-      size_t size = buffer_.length();
-      if (!ReadChunk(BeginPtr(buffer_), &size)) return false;
-      if (size == 0) {
-        buffer_.resize(buffer_.length() * 2);
-      } else {
-        chunk_begin_ = reinterpret_cast<char *>(BeginPtr(buffer_));
-        chunk_end_ = chunk_begin_ + size;
-        break;
-      }
-    }
-    return true;
-  }
-
- private:
-  // buffer size
-  static const size_t kBufferSize = 1 << 18UL;
-  // file
-  std::FILE *fp_;
-  bool use_stdin_;
-  // internal overflow
-  std::string overflow_;
-  // internal buffer
-  std::string buffer_;
-  // beginning of chunk
-  char *chunk_begin_;
-  // end of chunk
-  char *chunk_end_;
-};
-
-class StdFile : public dmlc::Stream {
- public:
-  explicit StdFile(std::FILE *fp, bool use_stdio)
-      : fp(fp), use_stdio(use_stdio) {
-  }
-  virtual ~StdFile(void) {
-    this->Close();
-  }
-  virtual size_t Read(void *ptr, size_t size) {
-    return std::fread(ptr, 1, size, fp);
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    Check(std::fwrite(ptr, size, 1, fp) == 1, "StdFile::Write: fwrite error!");
-  }
-  virtual void Seek(size_t pos) {
-    std::fseek(fp, static_cast<long>(pos), SEEK_SET);  // NOLINT(*)
-  }
-  virtual size_t Tell(void) {
-    return std::ftell(fp);
-  }
-  virtual bool AtEnd(void) const {
-    return std::feof(fp) != 0;
-  }
-  inline void Close(void) {
-    if (fp != NULL && !use_stdio) {
-      std::fclose(fp); fp = NULL;
-    }
-  }
-
- private:
-  std::FILE *fp;
-  bool use_stdio;
-};
-}  // namespace utils
-}  // namespace xgboost
-
-namespace dmlc {
-InputSplit* InputSplit::Create(const char *uri,
-                               unsigned part,
-                               unsigned nsplit,
-                               const char *type) {
-  using namespace std;
-  using namespace xgboost;
-  const char *msg = "xgboost is compiled in local mode\n"\
-      "to use hdfs, s3 or distributed version, compile with make dmlc=1";
-  utils::Check(strncmp(uri, "s3://", 5) != 0, msg);
-  utils::Check(strncmp(uri, "hdfs://", 7) != 0, msg);
-  utils::Check(nsplit == 1, msg);
-  return new utils::SingleFileSplit(uri);
-}
-
-Stream *Stream::Create(const char *fname, const char * const mode, bool allow_null) {
-  using namespace std;
-  using namespace xgboost;
-  const char *msg = "xgboost is compiled in local mode\n"\
-      "to use hdfs, s3 or distributed version, compile with make dmlc=1";
-  utils::Check(strncmp(fname, "s3://", 5) != 0, msg);
-  utils::Check(strncmp(fname, "hdfs://", 7) != 0, msg);
-
-  std::FILE *fp = NULL;
-  bool use_stdio = false;
-  using namespace std;
-#ifndef XGBOOST_STRICT_CXX98_
-  if (!strcmp(fname, "stdin")) {
-    use_stdio = true; fp = stdin;
-  }
-  if (!strcmp(fname, "stdout")) {
-    use_stdio = true; fp = stdout;
-  }
-#endif
-  if (!strncmp(fname, "file://", 7)) fname += 7;
-  if (!use_stdio) {
-    std::string flag = mode;
-    if (flag == "w") flag = "wb";
-    if (flag == "r") flag = "rb";
-    fp = fopen64(fname, flag.c_str());
-  }
-  if (fp != NULL) {
-    return new utils::StdFile(fp, use_stdio);
-  } else {
-    utils::Check(allow_null, "fail to open file %s", fname);
-    return NULL;
-  }
-}
-}  // namespace dmlc
-
diff --git a/src/io/io.cpp b/src/io/io.cpp
deleted file mode 100644
index b3713f0c5..000000000
--- a/src/io/io.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright 2014 by Contributors
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <string>
-#include "./io.h"
-#include "../utils/io.h"
-#include "../utils/utils.h"
-#include "simple_dmatrix-inl.hpp"
-#include "page_dmatrix-inl.hpp"
-
-namespace xgboost {
-namespace io {
-DataMatrix* LoadDataMatrix(const char *fname,
-                           bool silent,
-                           bool savebuffer,
-                           bool loadsplit,
-                           const char *cache_file) {
-  using namespace std;
-  std::string fname_ = fname;
-
-  const char *dlm = strchr(fname, '#');
-  if (dlm != NULL) {
-    utils::Check(strchr(dlm + 1, '#') == NULL,
-                 "only one `#` is allowed in file path for cachefile specification");
-    utils::Check(cache_file == NULL,
-                 "can only specify the cachefile with `#` or argument, not both");
-    fname_ = std::string(fname, dlm - fname);
-    fname = fname_.c_str();
-    cache_file = dlm +1;
-  }
-
-  if (cache_file == NULL) {
-    if (!std::strcmp(fname, "stdin") ||
-        !std::strncmp(fname, "s3://", 5) ||
-        !std::strncmp(fname, "hdfs://", 7) ||
-        loadsplit) {
-      DMatrixSimple *dmat = new DMatrixSimple();
-      dmat->LoadText(fname, silent, loadsplit);
-      return dmat;
-    }
-    int magic;
-    utils::FileStream fs(utils::FopenCheck(fname, "rb"));
-    utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
-    fs.Seek(0);
-    if (magic == DMatrixSimple::kMagic) {
-      DMatrixSimple *dmat = new DMatrixSimple();
-      dmat->LoadBinary(fs, silent, fname);
-      fs.Close();
-      return dmat;
-    }
-    fs.Close();
-    DMatrixSimple *dmat = new DMatrixSimple();
-    dmat->CacheLoad(fname, silent, savebuffer);
-    return dmat;
-  } else {
-    std::string cache_fname = cache_file;
-    if (loadsplit) {
-      std::ostringstream os;
-      os << cache_file << ".r" << rabit::GetRank();
-      cache_fname = os.str();
-      cache_file = cache_fname.c_str();
-    }
-    FILE *fi = fopen64(cache_file, "rb");
-    if (fi != NULL) {
-      DMatrixPage *dmat = new DMatrixPage();
-      utils::FileStream fs(fi);
-      dmat->LoadBinary(fs, silent, cache_file);
-      fs.Close();
-      return dmat;
-    } else {
-      if (fname[0] == '!') {
-        DMatrixHalfRAM *dmat = new DMatrixHalfRAM();
-        dmat->LoadText(fname + 1, cache_file, false, loadsplit);
-        return dmat;
-      } else {
-        DMatrixPage *dmat = new DMatrixPage();
-        dmat->LoadText(fname, cache_file, false, loadsplit);
-        return dmat;
-      }
-    }
-  }
-}
-
-void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
-  if (dmat.magic == DMatrixSimple::kMagic) {
-    const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
-    p_dmat->SaveBinary(fname, silent);
-  } else {
-    DMatrixSimple smat;
-    smat.CopyFrom(dmat);
-    smat.SaveBinary(fname, silent);
-  }
-}
-
-}  // namespace io
-}  // namespace xgboost
diff --git a/src/io/io.h b/src/io/io.h
deleted file mode 100644
index 6ceff2698..000000000
--- a/src/io/io.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file io.h
- * \brief handles input data format of xgboost
- *    I/O module handles a specific DMatrix format
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_IO_H_
-#define XGBOOST_IO_IO_H_
-
-#include "../data.h"
-#include "../learner/dmatrix.h"
-
-namespace xgboost {
-/*! \brief namespace related to data format */
-namespace io {
-/*! \brief DMatrix object that I/O module support save/load */
-typedef learner::DMatrix DataMatrix;
-/*!
- * \brief load DataMatrix from stream
- * \param fname file name to be loaded
- * \param silent whether print message during loading
- * \param savebuffer whether temporal buffer the file if the file is in text format
- * \param loadsplit whether we only load a split of input files
- *        such that each worker node get a split of the data
- * \param cache_file name of cache_file, used by external memory version
- *        can be NULL, if cache_file is specified, this will be the temporal
- *        space that can be re-used to store intermediate data
- * \return a loaded DMatrix
- */
-DataMatrix* LoadDataMatrix(const char *fname,
-                           bool silent,
-                           bool savebuffer,
-                           bool loadsplit,
-                           const char *cache_file = NULL);
-/*!
- * \brief save DataMatrix into stream,
- *  note: the saved dmatrix format may not be in exactly same as input
- *  SaveDMatrix will choose the best way to materialize the dmatrix.
- * \param dmat the dmatrix to be saved
- * \param fname file name to be saved
- * \param silent whether print message during saving
- */
-void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent = false);
-}  // namespace io
-}  // namespace xgboost
-#endif  // XGBOOST_IO_IO_H_
diff --git a/src/io/libsvm_parser.h b/src/io/libsvm_parser.h
deleted file mode 100644
index 43b8d6b90..000000000
--- a/src/io/libsvm_parser.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/*!
- *  Copyright (c) 2015 by Contributors
- * \file libsvm_parser.h
- * \brief iterator parser to parse libsvm format
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_LIBSVM_PARSER_H_
-#define XGBOOST_IO_LIBSVM_PARSER_H_
-#define NOMINMAX
-#include <vector>
-#include <cstring>
-#include <cctype>
-#include <algorithm>
-#include "../utils/omp.h"
-#include "../utils/utils.h"
-#include "../sync/sync.h"
-#include "../utils/thread_buffer.h"
-#include "./sparse_batch_page.h"
-
-namespace xgboost {
-namespace io {
-/*! \brief page returned by libsvm parser */
-struct LibSVMPage : public SparsePage {
-  std::vector<float> label;
-  // overload clear
-  inline void Clear() {
-    SparsePage::Clear();
-    label.clear();
-  }
-};
-/*!
- * \brief libsvm parser that parses the input lines
- * and returns rows in input data
- * factory that was used by threadbuffer template
- */
-class LibSVMPageFactory  {
- public:
-  LibSVMPageFactory()
-      : bytes_read_(0), at_head_(true) {
-  }
-  inline bool Init(void) {
-    return true;
-  }
-  inline void Setup(dmlc::InputSplit *source,
-                    int nthread) {
-    source_ = source;
-    int maxthread;
-    #pragma omp parallel
-    {
-      maxthread = omp_get_num_procs();
-    }
-    maxthread = std::max(maxthread / 2, 1);
-    nthread_ = std::min(maxthread, nthread);
-  }
-  inline void SetParam(const char *name, const char *val) {}
-  inline bool LoadNext(std::vector<LibSVMPage> *data) {
-    return FillData(data);
-  }
-  inline void FreeSpace(std::vector<LibSVMPage> *a) {
-    delete a;
-  }
-  inline std::vector<LibSVMPage> *Create(void) {
-    return new std::vector<LibSVMPage>();
-  }
-  inline void BeforeFirst(void) {
-    utils::Assert(at_head_, "cannot call beforefirst");
-  }
-  inline void Destroy(void) {
-    delete source_;
-  }
-  inline size_t bytes_read(void) const {
-    return bytes_read_;
-  }
-
- protected:
-  inline bool FillData(std::vector<LibSVMPage> *data) {
-    dmlc::InputSplit::Blob chunk;
-    if (!source_->NextChunk(&chunk)) return false;
-    int nthread;
-    #pragma omp parallel num_threads(nthread_)
-    {
-      nthread = omp_get_num_threads();
-    }
-    // reserve space for data
-    data->resize(nthread);
-    bytes_read_ += chunk.size;
-    utils::Assert(chunk.size != 0, "LibSVMParser.FileData");
-    char *head = reinterpret_cast<char*>(chunk.dptr);
-    #pragma omp parallel num_threads(nthread_)
-    {
-      // threadid
-      int tid = omp_get_thread_num();
-      size_t nstep = (chunk.size + nthread - 1) / nthread;
-      size_t sbegin = std::min(tid * nstep, chunk.size);
-      size_t send = std::min((tid + 1) * nstep, chunk.size);
-      char *pbegin = BackFindEndLine(head + sbegin, head);
-      char *pend;
-      if (tid + 1 == nthread) {
-        pend = head + send;
-      } else {
-        pend = BackFindEndLine(head + send, head);
-      }
-      ParseBlock(pbegin, pend, &(*data)[tid]);
-    }
-    return true;
-  }
-  /*!
-   * \brief parse data into out
-   * \param begin beginning of buffer
-   * \param end end of buffer
-   */
-  inline void ParseBlock(char *begin,
-                         char *end,
-                         LibSVMPage *out) {
-    using namespace std;
-    out->Clear();
-    char *p = begin;
-    while (p != end) {
-      while (isspace(*p) && p != end) ++p;
-      if (p == end) break;
-      char *head = p;
-      while (isdigit(*p) && p != end) ++p;
-      if (*p == ':') {
-        out->data.push_back(SparseBatch::Entry(atol(head),
-                                               static_cast<bst_float>(atof(p + 1))));
-      } else {
-        if (out->label.size() != 0) {
-          out->offset.push_back(out->data.size());
-        }
-        out->label.push_back(static_cast<float>(atof(head)));
-      }
-      while (!isspace(*p) && p != end) ++p;
-    }
-    if (out->label.size() != 0) {
-      out->offset.push_back(out->data.size());
-    }
-    utils::Check(out->label.size() + 1 == out->offset.size(),
-                 "LibSVMParser inconsistent");
-  }
-  /*!
-   * \brief start from bptr, go backward and find first endof line
-   * \param bptr end position to go backward
-   * \param begin the beginning position of buffer
-   * \return position of first endof line going backward
-   */
-  inline char* BackFindEndLine(char *bptr,
-                               char *begin) {
-    for (; bptr != begin; --bptr) {
-      if (*bptr == '\n' || *bptr == '\r') return bptr;
-    }
-    return begin;
-  }
-
- private:
-  // nthread
-  int nthread_;
-  // number of bytes readed
-  size_t bytes_read_;
-  // at beginning, at end of stream
-  bool at_head_;
-  // source split that provides the data
-  dmlc::InputSplit *source_;
-};
-
-class LibSVMParser : public utils::IIterator<LibSVMPage> {
- public:
-  explicit LibSVMParser(dmlc::InputSplit *source,
-                        int nthread)
-      : at_end_(false), data_ptr_(0), data_(NULL) {
-    itr.SetParam("buffer_size", "2");
-    itr.get_factory().Setup(source, nthread);
-    itr.Init();
-  }
-  virtual void BeforeFirst(void) {
-    itr.BeforeFirst();
-  }
-  virtual bool Next(void) {
-    if (at_end_) return false;
-    while (true) {
-      if (data_ == NULL || data_ptr_ >= data_->size()) {
-        if (!itr.Next(data_)) {
-          at_end_ = true; return false;
-        } else {
-          data_ptr_ = 0;
-        }
-      }
-      while (data_ptr_ < data_->size()) {
-        data_ptr_ += 1;
-        if ((*data_)[data_ptr_ - 1].Size() != 0) {
-          return true;
-        }
-      }
-    }
-    return true;
-  }
-  virtual const LibSVMPage &Value(void) const {
-    return (*data_)[data_ptr_ - 1];
-  }
-  inline size_t bytes_read(void) const {
-    return itr.get_factory().bytes_read();
-  }
-
- private:
-  bool at_end_;
-  size_t data_ptr_;
-  std::vector<LibSVMPage> *data_;
-  utils::ThreadBuffer<std::vector<LibSVMPage>*, LibSVMPageFactory> itr;
-};
-
-}  // namespace io
-}  // namespace xgboost
-#endif  // XGBOOST_IO_LIBSVM_PARSER_H_
diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
deleted file mode 100644
index 3012af564..000000000
--- a/src/io/page_dmatrix-inl.hpp
+++ /dev/null
@@ -1,260 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file page_dmatrix-inl.hpp
- *   row iterator based on sparse page
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
-#define XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
-
-#include <vector>
-#include <string>
-#include <algorithm>
-#include "../data.h"
-#include "../utils/iterator.h"
-#include "../utils/thread_buffer.h"
-#include "./simple_fmatrix-inl.hpp"
-#include "./sparse_batch_page.h"
-#include "./page_fmatrix-inl.hpp"
-#include "./libsvm_parser.h"
-
-namespace xgboost {
-namespace io {
-/*! \brief thread buffer iterator */
-class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
- public:
-  ThreadRowPageIterator(void) {
-    itr.SetParam("buffer_size", "4");
-    page_ = NULL;
-    base_rowid_ = 0;
-  }
-  virtual ~ThreadRowPageIterator(void) {}
-  virtual void Init(void) {
-  }
-  virtual void BeforeFirst(void) {
-    itr.BeforeFirst();
-    base_rowid_ = 0;
-  }
-  virtual bool Next(void) {
-    if (!itr.Next(page_)) return false;
-    out_ = page_->GetRowBatch(base_rowid_);
-    base_rowid_ += out_.size;
-    return true;
-  }
-  virtual const RowBatch &Value(void) const {
-    return out_;
-  }
-  /*! \brief load and initialize the iterator with fi */
-  inline void Load(const utils::FileStream &fi) {
-    itr.get_factory().SetFile(fi, 0);
-    itr.Init();
-    this->BeforeFirst();
-  }
-
- private:
-  // base row id
-  size_t base_rowid_;
-  // output data
-  RowBatch out_;
-  SparsePage *page_;
-  utils::ThreadBuffer<SparsePage*, SparsePageFactory> itr;
-};
-
-/*! \brief data matrix using page */
-template<int TKMagic>
-class DMatrixPageBase : public DataMatrix {
- public:
-  DMatrixPageBase(void) : DataMatrix(kMagic) {
-    iter_ = new ThreadRowPageIterator();
-  }
-  // virtual destructor
-  virtual ~DMatrixPageBase(void) {
-    // do not delete row iterator, since it is owned by fmat
-    // to be cleaned up in a more clear way
-  }
-  /*! \brief save a DataMatrix as DMatrixPage */
-  inline static void Save(const char *fname_, const DataMatrix &mat, bool silent) {
-    std::string fname = fname_;
-    utils::FileStream fs(utils::FopenCheck(fname.c_str(), "wb"));
-    int magic = kMagic;
-    fs.Write(&magic, sizeof(magic));
-    mat.info.SaveBinary(fs);
-    fs.Close();
-    fname += ".row.blob";
-    utils::IIterator<RowBatch> *iter = mat.fmat()->RowIterator();
-    utils::FileStream fbin(utils::FopenCheck(fname.c_str(), "wb"));
-    SparsePage page;
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        page.Push(batch[i]);
-        if (page.MemCostBytes() >= kPageSize) {
-          page.Save(&fbin); page.Clear();
-        }
-      }
-    }
-    if (page.data.size() != 0) page.Save(&fbin);
-    fbin.Close();
-    if (!silent) {
-      utils::Printf("DMatrixPage: %lux%lu is saved to %s\n",
-                    static_cast<unsigned long>(mat.info.num_row()), // NOLINT(*)
-                    static_cast<unsigned long>(mat.info.num_col()), fname_); // NOLINT(*)
-    }
-  }
-  /*! \brief load and initialize the iterator with fi */
-  inline void LoadBinary(utils::FileStream &fi,  // NOLINT(*)
-                         bool silent,
-                         const char *fname_) {
-    this->set_cache_file(fname_);
-    std::string fname = fname_;
-    int tmagic;
-    utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
-    this->CheckMagic(tmagic);
-    this->info.LoadBinary(fi);
-    // load in the row data file
-    fname += ".row.blob";
-    utils::FileStream fs(utils::FopenCheck(fname.c_str(), "rb"));
-    iter_->Load(fs);
-    if (!silent) {
-      utils::Printf("DMatrixPage: %lux%lu matrix is loaded",
-                    static_cast<unsigned long>(info.num_row()),  // NOLINT(*)
-                    static_cast<unsigned long>(info.num_col()));  // NOLINT(*)
-      if (fname_ != NULL) {
-        utils::Printf(" from %s\n", fname_);
-      } else {
-        utils::Printf("\n");
-      }
-      if (info.group_ptr.size() != 0) {
-        utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size() - 1);
-      }
-    }
-  }
-  /*! \brief save a LibSVM format file as DMatrixPage */
-  inline void LoadText(const char *uri,
-                       const char* cache_file,
-                       bool silent,
-                       bool loadsplit) {
-    if (!silent) {
-      utils::Printf("start generate text file from %s\n", uri);
-    }
-    int rank = 0, npart = 1;
-    if (loadsplit) {
-      rank = rabit::GetRank();
-      npart = rabit::GetWorldSize();
-    }
-    this->set_cache_file(cache_file);
-    std::string fname_row = std::string(cache_file) + ".row.blob";
-    utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb"));
-    SparsePage page;
-    size_t bytes_write = 0;
-    double tstart = rabit::utils::GetTime();
-    LibSVMParser parser(
-        dmlc::InputSplit::Create(uri, rank, npart, "text"), 16);
-    info.Clear();
-    while (parser.Next()) {
-      const LibSVMPage &batch = parser.Value();
-      size_t nlabel = info.labels.size();
-      info.labels.resize(nlabel + batch.label.size());
-      if (batch.label.size() != 0) {
-        std::memcpy(BeginPtr(info.labels) + nlabel,
-                    BeginPtr(batch.label),
-                    batch.label.size() * sizeof(float));
-      }
-      page.Push(batch);
-      for (size_t i = 0; i < batch.data.size(); ++i) {
-        info.info.num_col = std::max(info.info.num_col,
-                                     static_cast<size_t>(batch.data[i].index+1));
-      }
-      if (page.MemCostBytes() >= kPageSize) {
-        bytes_write += page.MemCostBytes();
-        page.Save(&fo);
-        page.Clear();
-        double tdiff = rabit::utils::GetTime() - tstart;
-        if (!silent) {
-          utils::Printf("Writting to %s in %g MB/s, %lu MB written\n",
-                        cache_file, (bytes_write >> 20UL) / tdiff,
-                        (bytes_write >> 20UL));
-        }
-      }
-      info.info.num_row += batch.label.size();
-    }
-    if (page.data.size() != 0) {
-      page.Save(&fo);
-    }
-    fo.Close();
-    iter_->Load(utils::FileStream(utils::FopenCheck(fname_row.c_str(), "rb")));
-    // save data matrix
-    utils::FileStream fs(utils::FopenCheck(cache_file, "wb"));
-    int tmagic = kMagic;
-    fs.Write(&tmagic, sizeof(tmagic));
-    this->info.SaveBinary(fs);
-    fs.Close();
-    if (!silent) {
-      utils::Printf("DMatrixPage: %lux%lu is parsed from %s\n",
-                    static_cast<unsigned long>(info.num_row()),  // NOLINT(*)
-                    static_cast<unsigned long>(info.num_col()),  // NOLINT(*)
-                    uri);
-    }
-  }
-  /*! \brief magic number used to identify DMatrix */
-  static const int kMagic = TKMagic;
-  /*! \brief page size 32 MB */
-  static const size_t kPageSize = 32UL << 20UL;
-
- protected:
-  virtual void set_cache_file(const std::string &cache_file)  = 0;
-  virtual void CheckMagic(int tmagic)  = 0;
-  /*! \brief row iterator */
-  ThreadRowPageIterator *iter_;
-};
-
-class DMatrixPage : public DMatrixPageBase<0xffffab02> {
- public:
-  DMatrixPage(void) {
-    fmat_ = new FMatrixPage(iter_, this->info);
-  }
-  virtual ~DMatrixPage(void) {
-    delete fmat_;
-  }
-  virtual IFMatrix *fmat(void) const {
-    return fmat_;
-  }
-  virtual void set_cache_file(const std::string &cache_file) {
-    fmat_->set_cache_file(cache_file);
-  }
-  virtual void CheckMagic(int tmagic) {
-    utils::Check(tmagic == DMatrixPageBase<0xffffab02>::kMagic ||
-                 tmagic == DMatrixPageBase<0xffffab03>::kMagic,
-                 "invalid format,magic number mismatch");
-  }
-  /*! \brief the real fmatrix */
-  FMatrixPage *fmat_;
-};
-
-// mix of FMatrix S and DMatrix
-// cost half of ram usually as DMatrixSimple
-class DMatrixHalfRAM : public DMatrixPageBase<0xffffab03> {
- public:
-  DMatrixHalfRAM(void) {
-    fmat_ = new FMatrixS(iter_, this->info);
-  }
-  virtual ~DMatrixHalfRAM(void) {
-    delete fmat_;
-  }
-  virtual IFMatrix *fmat(void) const {
-    return fmat_;
-  }
-  virtual void set_cache_file(const std::string &cache_file) {
-  }
-  virtual void CheckMagic(int tmagic) {
-    utils::Check(tmagic == DMatrixPageBase<0xffffab02>::kMagic ||
-                 tmagic == DMatrixPageBase<0xffffab03>::kMagic,
-                 "invalid format,magic number mismatch");
-  }
-  /*! \brief the real fmatrix */
-  IFMatrix *fmat_;
-};
-}  // namespace io
-}  // namespace xgboost
-#endif  // XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
deleted file mode 100644
index d2b71e50f..000000000
--- a/src/io/page_fmatrix-inl.hpp
+++ /dev/null
@@ -1,360 +0,0 @@
-/*!
- * Copyright (c) 2014 by Contributors
- * \file page_fmatrix-inl.hpp
- *   col iterator based on sparse page
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
-#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
-
-#include <vector>
-#include <string>
-#include <algorithm>
-
-namespace xgboost {
-namespace io {
-/*! \brief thread buffer iterator */
-class ThreadColPageIterator: public utils::IIterator<ColBatch> {
- public:
-  ThreadColPageIterator(void) {
-    itr.SetParam("buffer_size", "2");
-    page_ = NULL;
-  }
-  virtual ~ThreadColPageIterator(void) {}
-  virtual void Init(void) {}
-  virtual void BeforeFirst(void) {
-    itr.BeforeFirst();
-  }
-  virtual bool Next(void) {
-    if (!itr.Next(page_)) return false;
-    out_.col_index = BeginPtr(itr.get_factory().index_set());
-    col_data_.resize(page_->offset.size() - 1, SparseBatch::Inst(NULL, 0));
-    for (size_t i = 0; i < col_data_.size(); ++i) {
-      col_data_[i] = SparseBatch::Inst
-          (BeginPtr(page_->data) + page_->offset[i],
-           static_cast<bst_uint>(page_->offset[i + 1] - page_->offset[i]));
-    }
-    out_.col_data = BeginPtr(col_data_);
-    out_.size = col_data_.size();
-    return true;
-  }
-  virtual const ColBatch &Value(void) const {
-    return out_;
-  }
-  /*! \brief load and initialize the iterator with fi */
-  inline void SetFile(const utils::FileStream &fi) {
-    itr.get_factory().SetFile(fi);
-    itr.Init();
-  }
-  // set index set
-  inline void SetIndexSet(const std::vector<bst_uint> &fset, bool load_all) {
-    itr.get_factory().SetIndexSet(fset, load_all);
-  }
-
- private:
-  // output data
-  ColBatch out_;
-  SparsePage *page_;
-  std::vector<SparseBatch::Inst> col_data_;
-  utils::ThreadBuffer<SparsePage*, SparsePageFactory> itr;
-};
-
-struct ColConvertFactory {
-  inline bool Init(void) {
-    return true;
-  }
-  inline void Setup(float pkeep,
-                    size_t max_row_perbatch,
-                    size_t num_col,
-                    utils::IIterator<RowBatch> *iter,
-                    std::vector<bst_uint> *buffered_rowset,
-                    const std::vector<bool> *enabled) {
-    pkeep_ = pkeep;
-    max_row_perbatch_ = max_row_perbatch;
-    num_col_ = num_col;
-    iter_ = iter;
-    buffered_rowset_ = buffered_rowset;
-    enabled_ = enabled;
-  }
-  inline SparsePage *Create(void) {
-    return new SparsePage();
-  }
-  inline void FreeSpace(SparsePage *a) {
-    delete a;
-  }
-  inline void SetParam(const char *name, const char *val) {}
-  inline bool LoadNext(SparsePage *val) {
-    tmp_.Clear();
-    size_t btop = buffered_rowset_->size();
-    while (iter_->Next()) {
-      const RowBatch &batch = iter_->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        if (pkeep_ == 1.0f || random::SampleBinary(pkeep_)) {
-          buffered_rowset_->push_back(ridx);
-          tmp_.Push(batch[i]);
-        }
-      }
-      if (tmp_.MemCostBytes() >= kPageSize ||
-          tmp_.Size() >= max_row_perbatch_) {
-        this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop,
-                          *enabled_, val);
-        return true;
-      }
-    }
-    if (tmp_.Size() != 0) {
-        this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop,
-                          *enabled_, val);
-        return true;
-    } else {
-      return false;
-    }
-  }
-  inline void Destroy(void) {}
-  inline void BeforeFirst(void) {}
-  inline void MakeColPage(const SparsePage &prow,
-                          const bst_uint *ridx,
-                          const std::vector<bool> &enabled,
-                          SparsePage *pcol) {
-    pcol->Clear();
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-      int max_nthread = std::max(omp_get_num_procs() / 2 - 4, 1);
-      if (nthread > max_nthread) {
-        nthread = max_nthread;
-      }
-    }
-    pcol->Clear();
-    utils::ParallelGroupBuilder<SparseBatch::Entry>
-        builder(&pcol->offset, &pcol->data);
-    builder.InitBudget(num_col_, nthread);
-    bst_omp_uint ndata = static_cast<bst_uint>(prow.Size());
-    #pragma omp parallel for schedule(static) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      int tid = omp_get_thread_num();
-      for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
-        const SparseBatch::Entry &e = prow.data[j];
-        if (enabled[e.index]) {
-          builder.AddBudget(e.index, tid);
-        }
-      }
-    }
-    builder.InitStorage();
-    #pragma omp parallel for schedule(static) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      int tid = omp_get_thread_num();
-      for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
-        const SparseBatch::Entry &e = prow.data[j];
-        builder.Push(e.index,
-                     SparseBatch::Entry(ridx[i], e.fvalue),
-                     tid);
-      }
-    }
-    utils::Assert(pcol->Size() == num_col_, "inconsistent col data");
-    // sort columns
-    bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
-    #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ncol; ++i) {
-      if (pcol->offset[i] < pcol->offset[i + 1]) {
-        std::sort(BeginPtr(pcol->data) + pcol->offset[i],
-                  BeginPtr(pcol->data) + pcol->offset[i + 1],
-                  SparseBatch::Entry::CmpValue);
-      }
-    }
-  }
-  // probability of keep
-  float pkeep_;
-  // maximum number of rows per batch
-  size_t max_row_perbatch_;
-  // number of columns
-  size_t num_col_;
-  // row batch iterator
-  utils::IIterator<RowBatch> *iter_;
-  // buffered rowset
-  std::vector<bst_uint> *buffered_rowset_;
-  // enabled marks
-  const std::vector<bool> *enabled_;
-  // internal temp cache
-  SparsePage tmp_;
-  /*! \brief page size 256 M */
-  static const size_t kPageSize = 256 << 20UL;
-};
-/*!
- * \brief sparse matrix that support column access, CSC
- */
-class FMatrixPage : public IFMatrix {
- public:
-  typedef SparseBatch::Entry Entry;
-  /*! \brief constructor */
-  FMatrixPage(utils::IIterator<RowBatch> *iter,
-              const learner::MetaInfo &info) : info(info) {
-    this->iter_ = iter;
-  }
-  // destructor
-  virtual ~FMatrixPage(void) {
-    if (iter_ != NULL) delete iter_;
-  }
-  /*! \return whether column access is enabled */
-  virtual bool HaveColAccess(void) const {
-    return col_size_.size() != 0;
-  }
-  /*! \brief get number of columns */
-  virtual size_t NumCol(void) const {
-    utils::Check(this->HaveColAccess(), "NumCol:need column access");
-    return col_size_.size();
-  }
-  /*! \brief get number of buffered rows */
-  virtual const std::vector<bst_uint> &buffered_rowset(void) const {
-    return buffered_rowset_;
-  }
-  /*! \brief get column size */
-  virtual size_t GetColSize(size_t cidx) const {
-    return col_size_[cidx];
-  }
-  /*! \brief get column density */
-  virtual float GetColDensity(size_t cidx) const {
-    size_t nmiss = num_buffered_row_ - (col_size_[cidx]);
-    return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_;
-  }
-  virtual void InitColAccess(const std::vector<bool> &enabled,
-                             float pkeep, size_t max_row_perbatch) {
-    if (this->HaveColAccess()) return;
-    if (TryLoadColData()) return;
-    this->InitColData(enabled, pkeep, max_row_perbatch);
-    utils::Check(TryLoadColData(), "failed on creating col.blob");
-  }
-  /*!
-   * \brief get the row iterator associated with FMatrix
-   */
-  virtual utils::IIterator<RowBatch>* RowIterator(void) {
-    iter_->BeforeFirst();
-    return iter_;
-  }
-  /*!
-   * \brief get the column based  iterator
-   */
-  virtual utils::IIterator<ColBatch>* ColIterator(void) {
-    size_t ncol = this->NumCol();
-    col_index_.resize(0);
-    for (size_t i = 0; i < ncol; ++i) {
-      col_index_.push_back(static_cast<bst_uint>(i));
-    }
-    col_iter_.SetIndexSet(col_index_, false);
-    col_iter_.BeforeFirst();
-    return &col_iter_;
-  }
-  /*!
-   * \brief column based iterator
-   */
-  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
-    size_t ncol = this->NumCol();
-    col_index_.resize(0);
-    for (size_t i = 0; i < fset.size(); ++i) {
-      if (fset[i] < ncol) col_index_.push_back(fset[i]);
-    }
-    col_iter_.SetIndexSet(col_index_, false);
-    col_iter_.BeforeFirst();
-    return &col_iter_;
-  }
-  // set the cache file name
-  inline void set_cache_file(const std::string &cache_file) {
-    col_data_name_ = std::string(cache_file) + ".col.blob";
-    col_meta_name_ = std::string(cache_file) + ".col.meta";
-  }
-
- protected:
-  inline bool TryLoadColData(void) {
-    std::FILE *fi = fopen64(col_meta_name_.c_str(), "rb");
-    if (fi == NULL) return false;
-    utils::FileStream fs(fi);
-    LoadMeta(&fs);
-    fs.Close();
-    fi = utils::FopenCheck(col_data_name_.c_str(), "rb");
-    if (fi == NULL) return false;
-    col_iter_.SetFile(utils::FileStream(fi));
-    return true;
-  }
-  inline void LoadMeta(utils::IStream *fi) {
-    utils::Check(fi->Read(&num_buffered_row_, sizeof(num_buffered_row_)) != 0,
-                 "invalid col.blob file");
-    utils::Check(fi->Read(&buffered_rowset_),
-                 "invalid col.blob file");
-    utils::Check(fi->Read(&col_size_),
-                 "invalid col.blob file");
-  }
-  inline void SaveMeta(utils::IStream *fo) {
-    fo->Write(&num_buffered_row_, sizeof(num_buffered_row_));
-    fo->Write(buffered_rowset_);
-    fo->Write(col_size_);
-  }
-  /*!
-   * \brief initialize column data
-   * \param enabled the list of enabled columns
-   * \param pkeep probability to keep a row
-   * \param max_row_perbatch maximum row per batch
-   */
-  inline void InitColData(const std::vector<bool> &enabled,
-                          float pkeep, size_t max_row_perbatch) {
-    // clear rowset
-    buffered_rowset_.clear();
-    col_size_.resize(info.num_col());
-    std::fill(col_size_.begin(), col_size_.end(), 0);
-    utils::FileStream fo;
-    fo = utils::FileStream(utils::FopenCheck(col_data_name_.c_str(), "wb"));
-    iter_->BeforeFirst();
-    double tstart = rabit::utils::GetTime();
-    size_t bytes_write = 0;
-    utils::ThreadBuffer<SparsePage*, ColConvertFactory> citer;
-    citer.SetParam("buffer_size", "2");
-    citer.get_factory().Setup(pkeep, max_row_perbatch, info.num_col(),
-                              iter_, &buffered_rowset_, &enabled);
-    citer.Init();
-    SparsePage *pcol;
-    while (citer.Next(pcol)) {
-      for (size_t i = 0; i < pcol->Size(); ++i) {
-        col_size_[i] += pcol->offset[i + 1] - pcol->offset[i];
-      }
-      pcol->Save(&fo);
-      size_t spage = pcol->MemCostBytes();
-      bytes_write += spage;
-      double tnow = rabit::utils::GetTime();
-      double tdiff = tnow - tstart;
-      utils::Printf("Writing to %s in %g MB/s, %lu MB written\n",
-                    col_data_name_.c_str(),
-                    (bytes_write >> 20UL) / tdiff,
-                    (bytes_write >> 20UL));
-    }
-    fo.Close();
-    num_buffered_row_ = buffered_rowset_.size();
-    fo = utils::FileStream(utils::FopenCheck(col_meta_name_.c_str(), "wb"));
-    this->SaveMeta(&fo);
-    fo.Close();
-  }
-
- private:
-  /*! \brief page size 256 M */
-  static const size_t kPageSize = 256 << 20UL;
-  // shared meta info with DMatrix
-  const learner::MetaInfo &info;
-  // row iterator
-  utils::IIterator<RowBatch> *iter_;
-  /*! \brief column based data file name */
-  std::string col_data_name_;
-  /*! \brief column based data file name */
-  std::string col_meta_name_;
-  /*! \brief list of row index that are buffered */
-  std::vector<bst_uint> buffered_rowset_;
-  // number of buffered rows
-  size_t num_buffered_row_;
-  // count for column data
-  std::vector<size_t> col_size_;
-  // internal column index for output
-  std::vector<bst_uint> col_index_;
-  // internal thread backed col iterator
-  ThreadColPageIterator col_iter_;
-};
-}  // namespace io
-}  // namespace xgboost
-#endif  // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp
deleted file mode 100644
index 063b01665..000000000
--- a/src/io/simple_dmatrix-inl.hpp
+++ /dev/null
@@ -1,324 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file simple_dmatrix-inl.hpp
- * \brief simple implementation of DMatrixS that can be used
- *  the data format of xgboost is templatized, which means it can accept
- *  any data structure that implements the function defined by FMatrix
- *  this file is a specific implementation of input data structure that can be used by BoostLearner
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
-#define XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
-
-#include <string>
-#include <cstring>
-#include <vector>
-#include <sstream>
-#include <algorithm>
-#include "../data.h"
-#include "../utils/utils.h"
-#include "../learner/dmatrix.h"
-#include "./io.h"
-#include "./simple_fmatrix-inl.hpp"
-#include "../sync/sync.h"
-#include "./libsvm_parser.h"
-
-namespace xgboost {
-namespace io {
-/*! \brief implementation of DataMatrix, in CSR format */
-class DMatrixSimple : public DataMatrix {
- public:
-  // constructor
-  DMatrixSimple(void) : DataMatrix(kMagic) {
-    fmat_ = new FMatrixS(new OneBatchIter(this), this->info);
-    this->Clear();
-  }
-  // virtual destructor
-  virtual ~DMatrixSimple(void) {
-    delete fmat_;
-  }
-  virtual IFMatrix *fmat(void) const {
-    return fmat_;
-  }
-  /*! \brief clear the storage */
-  inline void Clear(void) {
-    row_ptr_.clear();
-    row_ptr_.push_back(0);
-    row_data_.clear();
-    info.Clear();
-  }
-  /*! \brief copy content data from source matrix */
-  inline void CopyFrom(const DataMatrix &src) {
-    this->Clear();
-    this->info = src.info;
-    // clone data contents from src matrix
-    utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        RowBatch::Inst inst = batch[i];
-        row_data_.resize(row_data_.size() + inst.length);
-        if (inst.length != 0) {
-          std::memcpy(&row_data_[row_ptr_.back()], inst.data,
-                      sizeof(RowBatch::Entry) * inst.length);
-        }
-        row_ptr_.push_back(row_ptr_.back() + inst.length);
-      }
-    }
-  }
-  /*!
-   * \brief add a row to the matrix
-   * \param feats features
-   * \return the index of added row
-   */
-  inline size_t AddRow(const std::vector<RowBatch::Entry> &feats) {
-    for (size_t i = 0; i < feats.size(); ++i) {
-      row_data_.push_back(feats[i]);
-      info.info.num_col = std::max(info.info.num_col,
-                                   static_cast<size_t>(feats[i].index+1));
-    }
-    row_ptr_.push_back(row_ptr_.back() + feats.size());
-    info.info.num_row += 1;
-    return row_ptr_.size() - 2;
-  }
-  /*!
-   * \brief load split of input, used in distributed mode
-   * \param uri the uri of input
-   * \param loadsplit whether loadsplit of data or all the data
-   * \param silent whether print information or not
-   */
-  inline void LoadText(const char *uri, bool silent = false, bool loadsplit = false) {
-    int rank = 0, npart = 1;
-    if (loadsplit) {
-      rank = rabit::GetRank();
-      npart = rabit::GetWorldSize();
-    }
-    LibSVMParser parser(
-        dmlc::InputSplit::Create(uri, rank, npart, "text"), 16);
-    this->Clear();
-    while (parser.Next()) {
-      const LibSVMPage &batch = parser.Value();
-      size_t nlabel = info.labels.size();
-      info.labels.resize(nlabel + batch.label.size());
-      if (batch.label.size() != 0) {
-        std::memcpy(BeginPtr(info.labels) + nlabel,
-                    BeginPtr(batch.label),
-                    batch.label.size() * sizeof(float));
-      }
-      size_t ndata = row_data_.size();
-      row_data_.resize(ndata + batch.data.size());
-      if (batch.data.size() != 0) {
-        std::memcpy(BeginPtr(row_data_) + ndata,
-                    BeginPtr(batch.data),
-                    batch.data.size() * sizeof(RowBatch::Entry));
-      }
-      row_ptr_.resize(row_ptr_.size() + batch.label.size());
-      for (size_t i = 0; i < batch.label.size(); ++i) {
-        row_ptr_[nlabel + i + 1] = row_ptr_[nlabel] + batch.offset[i + 1];
-      }
-      info.info.num_row += batch.Size();
-      for (size_t i = 0; i < batch.data.size(); ++i) {
-        info.info.num_col = std::max(info.info.num_col,
-                                     static_cast<size_t>(batch.data[i].index+1));
-      }
-    }
-    if (!silent) {
-      utils::Printf("%lux%lu matrix with %lu entries is loaded from %s\n",
-                    static_cast<unsigned long>(info.num_row()),  // NOLINT(*)
-                    static_cast<unsigned long>(info.num_col()),  // NOLINT(*)
-                    static_cast<unsigned long>(row_data_.size()), uri);  // NOLINT(*)
-    }
-    // try to load in additional file
-    if (!loadsplit) {
-      std::string name = uri;
-      std::string gname = name + ".group";
-      if (info.TryLoadGroup(gname.c_str(), silent)) {
-        utils::Check(info.group_ptr.back() == info.num_row(),
-                     "DMatrix: group data does not match the number of rows in features");
-      }
-      std::string wname = name + ".weight";
-      if (info.TryLoadFloatInfo("weight", wname.c_str(), silent)) {
-        utils::Check(info.weights.size() == info.num_row(),
-                     "DMatrix: weight data does not match the number of rows in features");
-      }
-      std::string mname = name + ".base_margin";
-      if (info.TryLoadFloatInfo("base_margin", mname.c_str(), silent)) {
-      }
-    }
-  }
-  /*!
-   * \brief load from binary file
-   * \param fname name of binary data
-   * \param silent whether print information or not
-   * \return whether loading is success
-   */
-  inline bool LoadBinary(const char* fname, bool silent = false) {
-    std::FILE *fp = fopen64(fname, "rb");
-    if (fp == NULL) return false;
-    utils::FileStream fs(fp);
-    this->LoadBinary(fs, silent, fname);
-    fs.Close();
-    return true;
-  }
-  /*!
-   * \brief load from binary stream
-   * \param fs input file stream
-   * \param silent whether print information during loading
-   * \param fname file name, used to print message
-   */
-  inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) {  // NOLINT(*)
-    int tmagic;
-    utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
-    utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch",
-                 fname == NULL ? "" : fname);
-
-    info.LoadBinary(fs);
-    LoadBinary(fs, &row_ptr_, &row_data_);
-    fmat_->LoadColAccess(fs);
-
-    if (!silent) {
-      utils::Printf("%lux%lu matrix with %lu entries is loaded",
-                    static_cast<unsigned long>(info.num_row()),  // NOLINT(*)
-                    static_cast<unsigned long>(info.num_col()),  // NOLINT(*)
-                    static_cast<unsigned long>(row_data_.size()));  // NOLINT(*)
-      if (fname != NULL) {
-        utils::Printf(" from %s\n", fname);
-      } else {
-        utils::Printf("\n");
-      }
-      if (info.group_ptr.size() != 0) {
-        utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
-      }
-    }
-  }
-  /*!
-   * \brief save to binary file
-   * \param fname name of binary data
-   * \param silent whether print information or not
-   */
-  inline void SaveBinary(const char* fname, bool silent = false) const {
-    utils::FileStream fs(utils::FopenCheck(fname, "wb"));
-    int tmagic = kMagic;
-    fs.Write(&tmagic, sizeof(tmagic));
-    info.SaveBinary(fs);
-    SaveBinary(fs, row_ptr_, row_data_);
-    fmat_->SaveColAccess(fs);
-    fs.Close();
-
-    if (!silent) {
-      utils::Printf("%lux%lu matrix with %lu entries is saved to %s\n",
-                    static_cast<unsigned long>(info.num_row()),  // NOLINT(*)
-                    static_cast<unsigned long>(info.num_col()),  // NOLINT(*)
-                    static_cast<unsigned long>(row_data_.size()), fname);  // NOLINT(*)
-      if (info.group_ptr.size() != 0) {
-        utils::Printf("data contains %u groups\n",
-                      static_cast<unsigned>(info.group_ptr.size()-1));
-      }
-    }
-  }
-  /*!
-   * \brief cache load data given a file name, if filename ends with .buffer, direct load binary
-   *        otherwise the function will first check if fname + '.buffer' exists,
-   *        if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
-   *        and try to create a buffer file
-   * \param fname name of binary data
-   * \param silent whether print information or not
-   * \param savebuffer whether do save binary buffer if it is text
-   */
-  inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true) {
-    using namespace std;
-    size_t len = strlen(fname);
-    if (len > 8 && !strcmp(fname + len - 7, ".buffer")) {
-      if (!this->LoadBinary(fname, silent)) {
-        utils::Error("can not open file \"%s\"", fname);
-      }
-      return;
-    }
-    char bname[1024];
-    utils::SPrintf(bname, sizeof(bname), "%s.buffer", fname);
-    if (!this->LoadBinary(bname, silent)) {
-      this->LoadText(fname, silent);
-      if (savebuffer) this->SaveBinary(bname, silent);
-    }
-  }
-  // data fields
-  /*! \brief row pointer of CSR sparse storage */
-  std::vector<size_t> row_ptr_;
-  /*! \brief data in the row */
-  std::vector<RowBatch::Entry> row_data_;
-  /*! \brief the real fmatrix */
-  FMatrixS *fmat_;
-  /*! \brief magic number used to identify DMatrix */
-  static const int kMagic = 0xffffab01;
-
- protected:
-  /*!
-   * \brief save data to binary stream
-   * \param fo output stream
-   * \param ptr pointer data
-   * \param data data content
-   */
-  inline static void SaveBinary(utils::IStream &fo,  // NOLINT(*)
-                                const std::vector<size_t> &ptr,
-                                const std::vector<RowBatch::Entry> &data) {
-    size_t nrow = ptr.size() - 1;
-    fo.Write(&nrow, sizeof(size_t));
-    fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t));
-    if (data.size() != 0) {
-      fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry));
-    }
-  }
-  /*!
-   * \brief load data from binary stream
-   * \param fi input stream
-   * \param out_ptr pointer data
-   * \param out_data data content
-   */
-  inline static void LoadBinary(utils::IStream &fi,  // NOLINT(*)
-                                std::vector<size_t> *out_ptr,
-                                std::vector<RowBatch::Entry> *out_data) {
-    size_t nrow;
-    utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
-    out_ptr->resize(nrow + 1);
-    utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0,
-                  "invalid input file format");
-    out_data->resize(out_ptr->back());
-    if (out_data->size() != 0) {
-      utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0,
-                    "invalid input file format");
-    }
-  }
-  // one batch iterator that return content in the matrix
-  struct OneBatchIter: utils::IIterator<RowBatch> {
-    explicit OneBatchIter(DMatrixSimple *parent)
-        : at_first_(true), parent_(parent) {}
-    virtual ~OneBatchIter(void) {}
-    virtual void BeforeFirst(void) {
-      at_first_ = true;
-    }
-    virtual bool Next(void) {
-      if (!at_first_) return false;
-      at_first_ = false;
-      batch_.size = parent_->row_ptr_.size() - 1;
-      batch_.base_rowid = 0;
-      batch_.ind_ptr = BeginPtr(parent_->row_ptr_);
-      batch_.data_ptr = BeginPtr(parent_->row_data_);
-      return true;
-    }
-    virtual const RowBatch &Value(void) const {
-      return batch_;
-    }
-
-   private:
-    // whether is at first
-    bool at_first_;
-    // pointer to parent
-    DMatrixSimple *parent_;
-    // temporal space for batch
-    RowBatch batch_;
-  };
-};
-}  // namespace io
-}  // namespace xgboost
-#endif  // namespace XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp
deleted file mode 100644
index e467263fa..000000000
--- a/src/io/simple_fmatrix-inl.hpp
+++ /dev/null
@@ -1,374 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file simple_fmatrix-inl.hpp
- * \brief the input data structure for gradient boosting
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
-#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
-
-#include <limits>
-#include <algorithm>
-#include <vector>
-#include "../data.h"
-#include "../utils/utils.h"
-#include "../utils/random.h"
-#include "../utils/omp.h"
-#include "../learner/dmatrix.h"
-#include "../utils/group_data.h"
-#include "./sparse_batch_page.h"
-
-namespace xgboost {
-namespace io {
-/*!
- * \brief sparse matrix that support column access, CSC
- */
-class FMatrixS : public IFMatrix {
- public:
-  typedef SparseBatch::Entry Entry;
-  /*! \brief constructor */
-  FMatrixS(utils::IIterator<RowBatch> *iter,
-               const learner::MetaInfo &info)
-      : info_(info) {
-    this->iter_ = iter;
-  }
-  // destructor
-  virtual ~FMatrixS(void) {
-    if (iter_ != NULL) delete iter_;
-  }
-  /*! \return whether column access is enabled */
-  virtual bool HaveColAccess(void) const {
-    return col_size_.size() != 0;
-  }
-  /*! \brief get number of columns */
-  virtual size_t NumCol(void) const {
-    utils::Check(this->HaveColAccess(), "NumCol:need column access");
-    return col_size_.size();
-  }
-  /*! \brief get number of buffered rows */
-  virtual const std::vector<bst_uint> &buffered_rowset(void) const {
-    return buffered_rowset_;
-  }
-  /*! \brief get column size */
-  virtual size_t GetColSize(size_t cidx) const {
-    return col_size_[cidx];
-  }
-  /*! \brief get column density */
-  virtual float GetColDensity(size_t cidx) const {
-    size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
-    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
-  }
-  virtual void InitColAccess(const std::vector<bool> &enabled,
-                             float pkeep, size_t max_row_perbatch) {
-    if (this->HaveColAccess()) return;
-    this->InitColData(enabled, pkeep, max_row_perbatch);
-  }
-  /*!
-   * \brief get the row iterator associated with FMatrix
-   */
-  virtual utils::IIterator<RowBatch>* RowIterator(void) {
-    iter_->BeforeFirst();
-    return iter_;
-  }
-  /*!
-   * \brief get the column based  iterator
-   */
-  virtual utils::IIterator<ColBatch>* ColIterator(void) {
-    size_t ncol = this->NumCol();
-    col_iter_.col_index_.resize(ncol);
-    for (size_t i = 0; i < ncol; ++i) {
-      col_iter_.col_index_[i] = static_cast<bst_uint>(i);
-    }
-    col_iter_.BeforeFirst();
-    return &col_iter_;
-  }
-  /*!
-   * \brief column based iterator
-   */
-  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
-    size_t ncol = this->NumCol();
-    col_iter_.col_index_.resize(0);
-    for (size_t i = 0; i < fset.size(); ++i) {
-      if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
-    }
-    col_iter_.BeforeFirst();
-    return &col_iter_;
-  }
-  /*!
-   * \brief save column access data into stream
-   * \param fo output stream to save to
-   */
-  inline void SaveColAccess(utils::IStream &fo) const { // NOLINT(*)
-    size_t n = 0;
-    fo.Write(&n, sizeof(n));
-  }
-  /*!
-   * \brief load column access data from stream
-   * \param fo output stream to load from
-   */
-  inline void LoadColAccess(utils::IStream &fi) { // NOLINT(*)
-    // do nothing in load col access
-  }
-
- protected:
-  /*!
-   * \brief initialize column data
-   * \param enabled the list of enabled columns
-   * \param pkeep probability to keep a row
-   * \param max_row_perbatch maximum row per batch
-   */
-  inline void InitColData(const std::vector<bool> &enabled,
-                          float pkeep, size_t max_row_perbatch) {
-    col_iter_.Clear();
-    if (info_.num_row() < max_row_perbatch) {
-      SparsePage *page = new SparsePage();
-      this->MakeOneBatch(enabled, pkeep, page);
-      col_iter_.cpages_.push_back(page);
-    } else {
-      this->MakeManyBatch(enabled, pkeep, max_row_perbatch);
-    }
-    // setup col-size
-    col_size_.resize(info_.num_col());
-    std::fill(col_size_.begin(), col_size_.end(), 0);
-    for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) {
-      SparsePage *pcol = col_iter_.cpages_[i];
-      for (size_t j = 0; j < pcol->Size(); ++j) {
-        col_size_[j] += pcol->offset[j + 1] - pcol->offset[j];
-      }
-    }
-  }
-  /*!
-   * \brief make column page from iterator
-   * \param pkeep probability to keep a row
-   * \param pcol the target column
-   */
-  inline void MakeOneBatch(const std::vector<bool> &enabled,
-                           float pkeep,
-                           SparsePage *pcol) {
-    // clear rowset
-    buffered_rowset_.clear();
-    // bit map
-    int nthread;
-    std::vector<bool> bmap;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
-    pcol->Clear();
-    utils::ParallelGroupBuilder<SparseBatch::Entry>
-        builder(&pcol->offset, &pcol->data);
-    builder.InitBudget(info_.num_col(), nthread);
-    // start working
-    iter_->BeforeFirst();
-    while (iter_->Next()) {
-      const RowBatch &batch = iter_->Value();
-      bmap.resize(bmap.size() + batch.size, true);
-      long batch_size = static_cast<long>(batch.size); // NOLINT(*)
-      for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
-        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
-          buffered_rowset_.push_back(ridx);
-        } else {
-          bmap[i] = false;
-        }
-      }
-      #pragma omp parallel for schedule(static)
-      for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
-        int tid = omp_get_thread_num();
-        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        if (bmap[ridx]) {
-          RowBatch::Inst inst = batch[i];
-          for (bst_uint j = 0; j < inst.length; ++j) {
-            if (enabled[inst[j].index]) {
-              builder.AddBudget(inst[j].index, tid);
-            }
-          }
-        }
-      }
-    }
-    builder.InitStorage();
-
-    iter_->BeforeFirst();
-    while (iter_->Next()) {
-      const RowBatch &batch = iter_->Value();
-      #pragma omp parallel for schedule(static)
-      for (long i = 0; i < static_cast<long>(batch.size); ++i) { // NOLINT(*)
-        int tid = omp_get_thread_num();
-        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        if (bmap[ridx]) {
-          RowBatch::Inst inst = batch[i];
-          for (bst_uint j = 0; j < inst.length; ++j) {
-            if (enabled[inst[j].index]) {
-              builder.Push(inst[j].index,
-                           Entry((bst_uint)(batch.base_rowid+i),
-                                 inst[j].fvalue), tid);
-            }
-          }
-        }
-      }
-    }
-
-    utils::Assert(pcol->Size() == info_.num_col(),
-                  "inconsistent col data");
-    // sort columns
-    bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
-    #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ncol; ++i) {
-      if (pcol->offset[i] < pcol->offset[i + 1]) {
-        std::sort(BeginPtr(pcol->data) + pcol->offset[i],
-                  BeginPtr(pcol->data) + pcol->offset[i + 1],
-                  SparseBatch::Entry::CmpValue);
-      }
-    }
-  }
-
-  inline void MakeManyBatch(const std::vector<bool> &enabled,
-                            float pkeep, size_t max_row_perbatch) {
-    size_t btop = 0;
-    buffered_rowset_.clear();
-    // internal temp cache
-    SparsePage tmp; tmp.Clear();
-    iter_->BeforeFirst();
-    while (iter_->Next()) {
-      const RowBatch &batch = iter_->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
-          buffered_rowset_.push_back(ridx);
-          tmp.Push(batch[i]);
-        }
-        if (tmp.Size() >= max_row_perbatch) {
-          SparsePage *page = new SparsePage();
-          this->MakeColPage(tmp.GetRowBatch(0),
-                            BeginPtr(buffered_rowset_) + btop,
-                            enabled, page);
-          col_iter_.cpages_.push_back(page);
-          btop = buffered_rowset_.size();
-          tmp.Clear();
-        }
-      }
-    }
-    if (tmp.Size() != 0) {
-      SparsePage *page = new SparsePage();
-      this->MakeColPage(tmp.GetRowBatch(0),
-                        BeginPtr(buffered_rowset_) + btop,
-                        enabled, page);
-      col_iter_.cpages_.push_back(page);
-    }
-  }
-  // make column page from subset of rowbatchs
-  inline void MakeColPage(const RowBatch &batch,
-                          const bst_uint *ridx,
-                          const std::vector<bool> &enabled,
-                          SparsePage *pcol) {
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-      int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1);
-      if (nthread > max_nthread) {
-        nthread = max_nthread;
-      }
-    }
-    pcol->Clear();
-    utils::ParallelGroupBuilder<SparseBatch::Entry>
-        builder(&pcol->offset, &pcol->data);
-    builder.InitBudget(info_.num_col(), nthread);
-    bst_omp_uint ndata = static_cast<bst_uint>(batch.size);
-    #pragma omp parallel for schedule(static) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      int tid = omp_get_thread_num();
-      RowBatch::Inst inst = batch[i];
-      for (bst_uint j = 0; j < inst.length; ++j) {
-        const SparseBatch::Entry &e = inst[j];
-        if (enabled[e.index]) {
-          builder.AddBudget(e.index, tid);
-        }
-      }
-    }
-    builder.InitStorage();
-    #pragma omp parallel for schedule(static) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      int tid = omp_get_thread_num();
-      RowBatch::Inst inst = batch[i];
-      for (bst_uint j = 0; j < inst.length; ++j) {
-        const SparseBatch::Entry &e = inst[j];
-        builder.Push(e.index,
-                     SparseBatch::Entry(ridx[i], e.fvalue),
-                     tid);
-      }
-    }
-    utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data");
-    // sort columns
-    bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
-    #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ncol; ++i) {
-      if (pcol->offset[i] < pcol->offset[i + 1]) {
-        std::sort(BeginPtr(pcol->data) + pcol->offset[i],
-                  BeginPtr(pcol->data) + pcol->offset[i + 1],
-                  SparseBatch::Entry::CmpValue);
-      }
-    }
-  }
-
- private:
-  // one batch iterator that return content in the matrix
-  struct ColBatchIter: utils::IIterator<ColBatch> {
-    ColBatchIter(void) : data_ptr_(0) {}
-    virtual ~ColBatchIter(void) {
-      this->Clear();
-    }
-    virtual void BeforeFirst(void) {
-      data_ptr_ = 0;
-    }
-    virtual bool Next(void) {
-      if (data_ptr_ >= cpages_.size()) return false;
-      data_ptr_ += 1;
-      SparsePage *pcol = cpages_[data_ptr_ - 1];
-      batch_.size = col_index_.size();
-      col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0));
-      for (size_t i = 0; i < col_data_.size(); ++i) {
-        const bst_uint ridx = col_index_[i];
-        col_data_[i] = SparseBatch::Inst
-            (BeginPtr(pcol->data) + pcol->offset[ridx],
-             static_cast<bst_uint>(pcol->offset[ridx + 1] - pcol->offset[ridx]));
-      }
-      batch_.col_index = BeginPtr(col_index_);
-      batch_.col_data = BeginPtr(col_data_);
-      return true;
-    }
-    virtual const ColBatch &Value(void) const {
-      return batch_;
-    }
-    inline void Clear(void) {
-      for (size_t i = 0; i < cpages_.size(); ++i) {
-        delete cpages_[i];
-      }
-      cpages_.clear();
-    }
-    // data content
-    std::vector<bst_uint> col_index_;
-    // column content
-    std::vector<ColBatch::Inst> col_data_;
-    // column sparse pages
-    std::vector<SparsePage*> cpages_;
-    // data pointer
-    size_t data_ptr_;
-    // temporal space for batch
-    ColBatch batch_;
-  };
-  // --- data structure used to support InitColAccess --
-  // column iterator
-  ColBatchIter col_iter_;
-  // shared meta info with DMatrix
-  const learner::MetaInfo &info_;
-  // row iterator
-  utils::IIterator<RowBatch> *iter_;
-  /*! \brief list of row index that are buffered */
-  std::vector<bst_uint> buffered_rowset_;
-  // count for column data
-  std::vector<size_t> col_size_;
-};
-}  // namespace io
-}  // namespace xgboost
-#endif  // XGBOOST_IO_SLICE_FMATRIX_INL_HPP_
diff --git a/src/io/sparse_batch_page.h b/src/io/sparse_batch_page.h
deleted file mode 100644
index 96810c0fb..000000000
--- a/src/io/sparse_batch_page.h
+++ /dev/null
@@ -1,272 +0,0 @@
-/*!
- * Copyright (c) 2014 by Contributors
- * \file sparse_batch_page.h
- *   content holder of sparse batch that can be saved to disk
- *   the representation can be effectively
- *   use in external memory computation
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_
-#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_
-
-#include <vector>
-#include <algorithm>
-#include "../data.h"
-
-namespace xgboost {
-namespace io {
-/*!
- * \brief storage unit of sparse batch
- */
-class SparsePage {
- public:
-  /*! \brief offset of the segments */
-  std::vector<size_t> offset;
-  /*! \brief the data of the segments */
-  std::vector<SparseBatch::Entry> data;
-  /*! \brief constructor */
-  SparsePage() {
-    this->Clear();
-  }
-  /*! \return number of instance in the page */
-  inline size_t Size() const {
-    return offset.size() - 1;
-  }
-  /*!
-   * \brief load only the segments we are interested in
-   * \param fi the input stream of the file
-   * \param sorted_index_set sorted index of segments we are interested in
-   * \return true of the loading as successful, false if end of file was reached
-   */
-  inline bool Load(utils::ISeekStream *fi,
-                   const std::vector<bst_uint> &sorted_index_set) {
-    if (!fi->Read(&disk_offset_)) return false;
-    // setup the offset
-    offset.clear(); offset.push_back(0);
-    for (size_t i = 0; i < sorted_index_set.size(); ++i) {
-      bst_uint fid = sorted_index_set[i];
-      utils::Check(fid + 1 < disk_offset_.size(), "bad col.blob format");
-      size_t size = disk_offset_[fid + 1] - disk_offset_[fid];
-      offset.push_back(offset.back() + size);
-    }
-    data.resize(offset.back());
-    // read in the data
-    size_t begin = fi->Tell();
-    size_t curr_offset = 0;
-    for (size_t i = 0; i < sorted_index_set.size();) {
-      bst_uint fid = sorted_index_set[i];
-      if (disk_offset_[fid] != curr_offset) {
-        utils::Assert(disk_offset_[fid] > curr_offset, "fset index was not sorted");
-        fi->Seek(begin + disk_offset_[fid] * sizeof(SparseBatch::Entry));
-        curr_offset = disk_offset_[fid];
-      }
-      size_t j, size_to_read = 0;
-      for (j = i; j < sorted_index_set.size(); ++j) {
-        if (disk_offset_[sorted_index_set[j]] == disk_offset_[fid] + size_to_read) {
-          size_to_read += offset[j + 1] - offset[j];
-        } else {
-          break;
-        }
-      }
-      if (size_to_read != 0) {
-        utils::Check(fi->Read(BeginPtr(data) + offset[i],
-                              size_to_read * sizeof(SparseBatch::Entry)) != 0,
-                     "Invalid SparsePage file");
-        curr_offset += size_to_read;
-      }
-      i = j;
-    }
-    // seek to end of record
-    if (curr_offset != disk_offset_.back()) {
-      fi->Seek(begin + disk_offset_.back() * sizeof(SparseBatch::Entry));
-    }
-    return true;
-  }
-  /*!
-   * \brief load all the segments
-   * \param fi the input stream of the file
-   * \return true of the loading as successful, false if end of file was reached
-   */
-  inline bool Load(utils::IStream *fi) {
-    if (!fi->Read(&offset)) return false;
-    utils::Check(offset.size() != 0, "Invalid SparsePage file");
-    data.resize(offset.back());
-    if (data.size() != 0) {
-      utils::Check(fi->Read(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)) != 0,
-                   "Invalid SparsePage file");
-    }
-    return true;
-  }
-  /*!
-   * \brief save the data to fo, when a page was written
-   *    to disk it must contain all the elements in the
-   * \param fo output stream
-   */
-  inline void Save(utils::IStream *fo) const {
-    utils::Assert(offset.size() != 0 && offset[0] == 0, "bad offset");
-    utils::Assert(offset.back() == data.size(), "in consistent SparsePage");
-    fo->Write(offset);
-    if (data.size() != 0) {
-      fo->Write(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry));
-    }
-  }
-  /*! \return estimation of memory cost of this page */
-  inline size_t MemCostBytes(void) const {
-    return offset.size() * sizeof(size_t) + data.size() * sizeof(SparseBatch::Entry);
-  }
-  /*! \brief clear the page */
-  inline void Clear(void) {
-    offset.clear();
-    offset.push_back(0);
-    data.clear();
-  }
-  /*!
-   * \brief load all the segments and add it to existing batch
-   * \param fi the input stream of the file
-   * \return true of the loading as successful, false if end of file was reached
-   */
-  inline bool PushLoad(utils::IStream *fi) {
-    if (!fi->Read(&disk_offset_)) return false;
-    data.resize(offset.back() + disk_offset_.back());
-    if (disk_offset_.back() != 0) {
-      utils::Check(fi->Read(BeginPtr(data) + offset.back(),
-                            disk_offset_.back() * sizeof(SparseBatch::Entry)) != 0,
-                   "Invalid SparsePage file");
-    }
-    size_t top = offset.back();
-    size_t begin = offset.size();
-    offset.resize(offset.size() + disk_offset_.size());
-    for (size_t i = 0; i < disk_offset_.size(); ++i) {
-      offset[i + begin] = top + disk_offset_[i];
-    }
-    return true;
-  }
-  /*!
-   * \brief Push row batch into the page
-   * \param batch the row batch
-   */
-  inline void Push(const RowBatch &batch) {
-    data.resize(offset.back() + batch.ind_ptr[batch.size]);
-    std::memcpy(BeginPtr(data) + offset.back(),
-                batch.data_ptr + batch.ind_ptr[0],
-                sizeof(SparseBatch::Entry) * batch.ind_ptr[batch.size]);
-    size_t top = offset.back();
-    size_t begin = offset.size();
-    offset.resize(offset.size() + batch.size);
-    for (size_t i = 0; i < batch.size; ++i) {
-      offset[i + begin] = top + batch.ind_ptr[i + 1] - batch.ind_ptr[0];
-    }
-  }
-  /*!
-   * \brief Push a sparse page
-   * \param batch the row page
-   */
-  inline void Push(const SparsePage &batch) {
-    size_t top = offset.back();
-    data.resize(top + batch.data.size());
-    std::memcpy(BeginPtr(data) + top,
-                BeginPtr(batch.data),
-                sizeof(SparseBatch::Entry) * batch.data.size());
-    size_t begin = offset.size();
-    offset.resize(begin + batch.Size());
-    for (size_t i = 0; i < batch.Size(); ++i) {
-      offset[i + begin] = top + batch.offset[i + 1];
-    }
-  }
-  /*!
-   * \brief Push one instance into page
-   *  \param row an instance row
-   */
-  inline void Push(const SparseBatch::Inst &inst) {
-    offset.push_back(offset.back() + inst.length);
-    size_t begin = data.size();
-    data.resize(begin + inst.length);
-    if (inst.length != 0) {
-      std::memcpy(BeginPtr(data) + begin, inst.data,
-                  sizeof(SparseBatch::Entry) * inst.length);
-    }
-  }
-  /*!
-   * \param base_rowid base_rowid of the data
-   * \return row batch representation of the page
-   */
-  inline RowBatch GetRowBatch(size_t base_rowid) const {
-    RowBatch out;
-    out.base_rowid  = base_rowid;
-    out.ind_ptr = BeginPtr(offset);
-    out.data_ptr = BeginPtr(data);
-    out.size = offset.size() - 1;
-    return out;
-  }
-
- private:
-  /*! \brief external memory column offset */
-  std::vector<size_t> disk_offset_;
-};
-/*!
- * \brief factory class for SparsePage,
- *        used in threadbuffer template
- */
-class SparsePageFactory {
- public:
-  SparsePageFactory(void)
-      : action_load_all_(true), set_load_all_(true) {}
-  inline void SetFile(const utils::FileStream &fi,
-                      size_t file_begin = 0) {
-    fi_ = fi;
-    file_begin_ = file_begin;
-  }
-  inline const std::vector<bst_uint> &index_set(void) const {
-    return action_index_set_;
-  }
-  // set index set, will be used after next before first
-  inline void SetIndexSet(const std::vector<bst_uint> &index_set,
-                          bool load_all) {
-    set_load_all_ = load_all;
-    if (!set_load_all_) {
-      set_index_set_ = index_set;
-      std::sort(set_index_set_.begin(), set_index_set_.end());
-    }
-  }
-  inline bool Init(void) {
-    return true;
-  }
-  inline void SetParam(const char *name, const char *val) {}
-  inline bool LoadNext(SparsePage *val) {
-    if (!action_load_all_) {
-      if (action_index_set_.size() == 0) {
-        return false;
-      } else {
-        return val->Load(&fi_, action_index_set_);
-      }
-    } else {
-      return val->Load(&fi_);
-    }
-  }
-  inline SparsePage *Create(void) {
-    return new SparsePage();
-  }
-  inline void FreeSpace(SparsePage *a) {
-    delete a;
-  }
-  inline void Destroy(void) {
-    fi_.Close();
-  }
-  inline void BeforeFirst(void) {
-    fi_.Seek(file_begin_);
-    action_load_all_ = set_load_all_;
-    if (!set_load_all_) {
-      action_index_set_ = set_index_set_;
-    }
-  }
-
- private:
-  bool action_load_all_, set_load_all_;
-  size_t file_begin_;
-  utils::FileStream fi_;
-  std::vector<bst_uint> action_index_set_;
-  std::vector<bst_uint> set_index_set_;
-};
-}  // namespace io
-}  // namespace xgboost
-#endif  // XGBOOST_IO_SPARSE_BATCH_PAGE_H_
diff --git a/src/learner.cc b/src/learner.cc
new file mode 100644
index 000000000..f787be9fc
--- /dev/null
+++ b/src/learner.cc
@@ -0,0 +1,469 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file learner.cc
+ * \brief Implementation of learning algorithm.
+ * \author Tianqi Chen
+ */
+#include <xgboost/learner.h>
+#include <algorithm>
+#include <vector>
+#include <utility>
+#include <string>
+#include <sstream>
+#include <limits>
+#include <iomanip>
+#include "./common/io.h"
+#include "./common/random.h"
+
+namespace xgboost {
+// Base-class Learner methods that simply forward to the gradient booster gbm_.
+bool Learner::AllowLazyCheckPoint() const {
+  return gbm_->AllowLazyCheckPoint();  // no null check: gbm_ must already be set
+}
+
+std::vector<std::string>
+Learner::Dump2Text(const FeatureMap& fmap, int option) const {
+  return gbm_->Dump2Text(fmap, option);  // both arguments are passed through unchanged
+}
+
+// Format any value that supports operator<< as a std::string (default stream formatting).
+template<typename T>
+inline std::string ToString(const T& data) {
+  std::ostringstream os;
+  os << data;
+  return os.str();
+}
+
+/*! \brief model parameters of the learner; Save/Load copy this struct as raw bytes */
+struct LearnerModelParam
+    : public dmlc::Parameter<LearnerModelParam> {
+  /*! \brief global bias */
+  float base_score;
+  /*! \brief number of features */
+  unsigned num_feature;
+  /*! \brief number of classes, if it is multi-class classification */
+  int num_class;
+  /*! \brief reserved field, zero-filled by the constructor */
+  int reserved[31];
+  /*! \brief constructor: zero every byte of the struct, then set the default bias */
+  LearnerModelParam() {
+    std::memset(this, 0, sizeof(LearnerModelParam));
+    base_score = 0.5f;
+  }
+  // declare the configurable fields with defaults and descriptions (reserved is not one)
+  DMLC_DECLARE_PARAMETER(LearnerModelParam) {
+    DMLC_DECLARE_FIELD(base_score).set_default(0.5f)
+        .describe("Global bias of the model.");
+    DMLC_DECLARE_FIELD(num_feature).set_default(0)
+        .describe("Number of features in training data,"\
+                  " this parameter will be automatically detected by learner.");
+    DMLC_DECLARE_FIELD(num_class).set_default(0).set_lower_bound(0)
+        .describe("Number of class option for multi-class classifier. "\
+                  " By default equals 0 and corresponds to binary classifier.");
+  }
+};
+
+/*! \brief training-time parameters of the learner; not written by Save */
+struct LearnerTrainParam
+    : public dmlc::Parameter<LearnerTrainParam> {
+  // stored random seed
+  int seed;
+  // whether to re-seed the PRNG at every iteration
+  bool seed_per_iteration;
+  // data split mode: 0 = auto, 1 = col, 2 = row (see add_enum below)
+  int dsplit;
+  // internal test flag; "block" caps the batch size in LazyInitDMatrix
+  std::string test_flag;
+  // portion of rows to buffer, in [0, 1]; passed to DMatrix::InitColAccess
+  float prob_buffer_row;
+  // maximum number of rows per batch
+  size_t max_row_perbatch;
+  // declare the configurable fields with defaults and descriptions
+  DMLC_DECLARE_PARAMETER(LearnerTrainParam) {
+    DMLC_DECLARE_FIELD(seed).set_default(0)
+        .describe("Random number seed during training.");
+    DMLC_DECLARE_FIELD(seed_per_iteration).set_default(false)
+        .describe("Seed PRNG determnisticly via iterator number, "\
+                  "this option will be switched on automatically on distributed mode.");
+    DMLC_DECLARE_FIELD(dsplit).set_default(0)
+        .add_enum("auto", 0)
+        .add_enum("col", 1)
+        .add_enum("row", 2)
+        .describe("Data split mode for distributed trainig. ");
+    DMLC_DECLARE_FIELD(test_flag).set_default("")
+        .describe("Internal test flag");
+    DMLC_DECLARE_FIELD(prob_buffer_row).set_default(1.0f).set_range(0.0f, 1.0f)
+        .describe("Maximum buffered row portion");
+    DMLC_DECLARE_FIELD(max_row_perbatch).set_default(std::numeric_limits<size_t>::max())
+        .describe("maximum row per batch.");
+  }
+};
+
+DMLC_REGISTER_PARAMETER(LearnerModelParam);
+DMLC_REGISTER_PARAMETER(LearnerTrainParam);
+
+/*!
+ * \brief learner that performs gradient boosting for a specific objective function.
+ *  It does training and prediction.
+ */
+class LearnerImpl : public Learner {
+ public:
+  explicit LearnerImpl(const std::vector<DMatrix*>& cache_mats)
+      noexcept(false) {
+    // register every distinct matrix in the prediction cache, laid out back to back.
+    CHECK_EQ(cache_.size(), 0);
+    size_t buffer_size = 0;
+    for (auto it  = cache_mats.begin(); it != cache_mats.end(); ++it) {
+      // skip a pointer that already appeared earlier in cache_mats.
+      if (std::find(cache_mats.begin(), it, *it) != it) continue;
+      DMatrix* pmat = *it;
+      pmat->cache_learner_ptr_ = this;  // tag the matrix; FindBufferOffset checks this
+      cache_.push_back(CacheEntry(pmat, buffer_size, pmat->info().num_row));
+      buffer_size += pmat->info().num_row;
+    }
+    pred_buffer_size_ = buffer_size;
+    // default objective and booster names; Configure may replace them before init
+    name_obj_ = "reg:linear";
+    name_gbm_ = "gbtree";
+  }
+
+  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
+    tparam.InitAllowUnknown(args);
+    // rebuild cfg_ from args; eval_metric entries become Metric objects instead
+    cfg_.clear();
+    for (const auto& kv : args) {
+      if (kv.first == "eval_metric") {
+        // only create the metric if no metric with this name is registered yet
+        auto dup_check = [&kv](const std::unique_ptr<Metric>&m) {
+          return m->Name() != kv.second;
+        };
+        if (std::all_of(metrics_.begin(), metrics_.end(), dup_check)) {
+          metrics_.emplace_back(Metric::Create(kv.second));
+        }
+      } else {
+        cfg_[kv.first] = kv.second;
+      }
+    }
+    // derive additional parameters.
+    // These are constraints that need to be satisfied.
+    if (tparam.dsplit == 0 && rabit::IsDistributed()) {
+      tparam.dsplit = 2;  // auto -> row split when running distributed
+    }
+
+    if (cfg_.count("num_class") != 0) {
+      cfg_["num_output_group"] = cfg_["num_class"];
+      if (atoi(cfg_["num_class"].c_str()) > 1 && cfg_.count("objective") == 0) {
+        cfg_["objective"] = "multi:softmax";
+      }
+    }
+
+    if (cfg_.count("max_delta_step") == 0 &&
+        cfg_.count("objective") != 0 &&
+        cfg_["objective"] == "count:poisson") {
+      cfg_["max_delta_step"] = "0.7";
+    }
+
+    if (cfg_.count("updater") == 0) {  // pick an updater only if the user gave none
+      if (tparam.dsplit == 1) {
+        cfg_["updater"] = "distcol";
+      } else if (tparam.dsplit == 2) {
+        cfg_["updater"] = "grow_histmaker,prune";
+      }
+      if (tparam.prob_buffer_row != 1.0f) {  // overrides the split-based choice above
+        cfg_["updater"] = "grow_histmaker,refresh,prune";
+      }
+    }
+    if (cfg_.count("objective") == 0) {
+      cfg_["objective"] = "reg:linear";
+    }
+    if (cfg_.count("booster") == 0) {
+      cfg_["booster"] = "gbtree";
+    }
+
+    if (!this->ModelInitialized()) {  // model params and names are frozen once a model exists
+      mparam.InitAllowUnknown(args);
+      name_obj_ = cfg_["objective"];
+      name_gbm_ = cfg_["booster"];
+    }
+
+    common::GlobalRandom().seed(tparam.seed);
+
+    // always publish mparam.num_feature, replacing any num_feature given in args.
+    cfg_["num_feature"] = ToString(mparam.num_feature);
+    if (gbm_.get() != nullptr) {
+      gbm_->Configure(cfg_.begin(), cfg_.end());
+    }
+    if (obj_.get() != nullptr) {
+      obj_->Configure(cfg_.begin(), cfg_.end());
+    }
+  }
+
+  void InitModel() override {
+    this->LazyInitModel();  // no-op when the model is already initialized
+  }
+
+  void Load(dmlc::Stream* fi) override {
+    // TODO(tqchen) mark deprecation of old format.
+    common::PeekableInStream fp(fi);
+    // backward compatible header check: reject "bs64", consume a leading "binf".
+    std::string header;
+    header.resize(4);
+    if (fp.PeekRead(&header[0], 4) == 4) {
+      CHECK_NE(header, "bs64")
+          << "Base64 format is no longer supported in brick.";
+      if (header == "binf") {
+        CHECK_EQ(fp.Read(&header[0], 4), 4);
+      }
+    }
+    // use the peekable reader.
+    fi = &fp;
+    // read parameter
+    CHECK_EQ(fi->Read(&mparam, sizeof(mparam)), sizeof(mparam))
+        << "BoostLearner: wrong model format";
+    {
+      // backward compatibility code for reading the old model type;
+      // for a new model, Read(&name_obj_) would suffice
+      uint64_t len;
+      CHECK_EQ(fi->Read(&len, sizeof(len)), sizeof(len));
+      if (len >= std::numeric_limits<unsigned>::max()) {
+        int gap;
+        CHECK_EQ(fi->Read(&gap, sizeof(gap)), sizeof(gap))
+            << "BoostLearner: wrong model format";
+        len = len >> static_cast<uint64_t>(32UL);
+      }
+      if (len != 0) {  // when len is 0, name_obj_ keeps its previous value
+        name_obj_.resize(len);
+        CHECK_EQ(fi->Read(&name_obj_[0], len), len)
+            <<"BoostLearner: wrong model format";
+      }
+    }
+    CHECK(fi->Read(&name_gbm_))
+        << "BoostLearner: wrong model format";
+    // duplicated code with LazyInitModel
+    obj_.reset(ObjFunction::Create(name_obj_));
+    gbm_.reset(GradientBooster::Create(name_gbm_));
+    gbm_->Load(fi);
+
+    if (metrics_.size() == 0) {
+      metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric()));
+    }
+    this->base_score_ = mparam.base_score;
+    gbm_->ResetPredBuffer(pred_buffer_size_);
+    cfg_["num_class"] = ToString(mparam.num_class);
+    obj_->Configure(cfg_.begin(), cfg_.end());
+  }
+
+  // save model to a stream (e.g. a rabit checkpoint): raw mparam, obj/gbm names, booster
+  void Save(dmlc::Stream *fo) const override {
+    fo->Write(&mparam, sizeof(LearnerModelParam));
+    fo->Write(name_obj_);
+    fo->Write(name_gbm_);
+    gbm_->Save(fo);  // no null check: requires an initialized model
+  }
+
+  void UpdateOneIter(int iter, DMatrix* train) override {
+    CHECK(ModelInitialized())
+        << "Always call InitModel or LoadModel before update";
+    if (tparam.seed_per_iteration || rabit::IsDistributed()) {
+      common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter);  // per-iteration seed
+    }
+    this->LazyInitDMatrix(train);  // make sure column access is available
+    this->PredictRaw(train, &preds_);  // raw predictions of the current model
+    obj_->GetGradient(preds_, train->info(), iter, &gpair_);  // fills gpair_
+    gbm_->DoBoost(train, this->FindBufferOffset(train), &gpair_);
+  }
+
+  // Boost one round from caller-supplied gradients; the model must already be initialized.
+  void BoostOneIter(int iter, DMatrix* train, std::vector<bst_gpair>* in_gpair) override {
+    CHECK(ModelInitialized()) << "Always call InitModel or LoadModel before update";
+    if (tparam.seed_per_iteration || rabit::IsDistributed()) {
+      common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter);
+    }
+    this->LazyInitDMatrix(train);
+    gbm_->DoBoost(train, this->FindBufferOffset(train), in_gpair);
+  }
+
+  std::string EvalOneIter(int iter,
+                          const std::vector<DMatrix*>& data_sets,
+                          const std::vector<std::string>& data_names) override {
+    std::ostringstream os;
+    os << '[' << iter << ']'
+       << std::setiosflags(std::ios::fixed);  // fixed-point float formatting
+    for (size_t i = 0; i < data_sets.size(); ++i) {  // data_names[i] is read for every i
+      this->PredictRaw(data_sets[i], &preds_);  // raw predictions into the shared preds_
+      obj_->EvalTransform(&preds_);  // objective-specific transform applied to preds_
+      for (auto& ev : metrics_) {  // last Eval argument is true only in row-split mode
+        os << '\t' << data_names[i] << '-' << ev->Name() << ':'
+           << ev->Eval(preds_, data_sets[i]->info(), tparam.dsplit == 2);
+      }
+    }
+    return os.str();
+  }
+
+  std::pair<std::string, float> Evaluate(DMatrix* data, std::string metric) {
+    if (metric == "auto") metric = obj_->DefaultEvalMetric();  // resolve the alias
+    std::unique_ptr<Metric> ev(Metric::Create(metric.c_str()));  // temporary, not in metrics_
+    this->PredictRaw(data, &preds_);
+    obj_->EvalTransform(&preds_);
+    return std::make_pair(metric, ev->Eval(preds_, data->info(), tparam.dsplit == 2));
+  }
+
+  void Predict(DMatrix* data,
+               bool output_margin,
+               std::vector<float> *out_preds,
+               unsigned ntree_limit,
+               bool pred_leaf) const override {
+    if (pred_leaf) {  // leaf output skips PredictRaw and the transform below
+      gbm_->PredictLeaf(data, out_preds, ntree_limit);  // no null check on gbm_ here
+    } else {
+      this->PredictRaw(data, out_preds, ntree_limit);
+      if (!output_margin) {  // apply the objective's transform unless raw margin is wanted
+        obj_->PredTransform(out_preds);
+      }
+    }
+  }
+
+ protected:
+  // check if p_train is ready to be used by training.
+  // if not, initialize the column access.
+  inline void LazyInitDMatrix(DMatrix *p_train) {
+    if (p_train->HaveColAccess()) return;
+    int ncol = static_cast<int>(p_train->info().num_col);
+    std::vector<bool> enabled(ncol, true);  // every column is enabled
+    // cap rows per batch at 32K when test_flag is "block" or in row-split mode;
+    // otherwise tparam.max_row_perbatch is used unchanged
+    size_t max_row_perbatch = tparam.max_row_perbatch;
+    if (tparam.test_flag == "block" || tparam.dsplit == 2) {
+      max_row_perbatch = std::min(
+        static_cast<size_t>(32UL << 10UL), max_row_perbatch);
+    }
+    // initialize column access
+    p_train->InitColAccess(enabled,
+                           tparam.prob_buffer_row,
+                           max_row_perbatch);
+    if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) {  // no updater configured
+      cfg_["updater"] = "grow_histmaker,prune";
+      if (gbm_.get() != nullptr) {
+        gbm_->Configure(cfg_.begin(), cfg_.end());
+      }
+    }
+  }
+
+  // the model counts as initialized as soon as a booster object exists.
+  inline bool ModelInitialized() const {
+    return gbm_.get() != nullptr;
+  }
+  // lazily initialize the model if it has not been initialized yet.
+  inline void LazyInitModel() {
+    if (this->ModelInitialized()) return;
+    // estimate feature bound: largest column count among the cached matrices
+    unsigned num_feature = 0;
+    for (size_t i = 0; i < cache_.size(); ++i) {
+      num_feature = std::max(num_feature,
+                             static_cast<unsigned>(cache_[i].mat_->info().num_col));
+    }
+    // run allreduce on num_feature to find the maximum value
+    rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
+    if (num_feature > mparam.num_feature) {  // only ever grows the configured value
+      mparam.num_feature = num_feature;
+    }
+
+    // create and configure the objective and the booster from the stored names
+    cfg_["num_feature"] = ToString(mparam.num_feature);
+    CHECK(obj_.get() == nullptr && gbm_.get() == nullptr);
+    obj_.reset(ObjFunction::Create(name_obj_));
+    gbm_.reset(GradientBooster::Create(name_gbm_));
+    gbm_->Configure(cfg_.begin(), cfg_.end());
+    obj_->Configure(cfg_.begin(), cfg_.end());
+
+    // replace the base score by the value ProbToMargin returns for it
+    mparam.base_score = obj_->ProbToMargin(mparam.base_score);
+    if (metrics_.size() == 0) {  // fall back to the objective's default metric
+      metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric()));
+    }
+
+    this->base_score_ = mparam.base_score;
+    gbm_->ResetPredBuffer(pred_buffer_size_);
+  }
+  /*!
+   * \brief get un-transformed prediction, with base margin or base score added
+   * \param data data matrix to predict on
+   * \param out_preds output vector that stores the prediction
+   * \param ntree_limit limit number of trees used for boosted tree
+   *   predictor, when it equals 0, this means we are using all the trees
+   */
+  inline void PredictRaw(DMatrix* data,
+                         std::vector<float>* out_preds,
+                         unsigned ntree_limit = 0) const {
+    CHECK(gbm_.get() != nullptr)
+        << "Predict must happen after Load or InitModel";
+    gbm_->Predict(data,
+                  this->FindBufferOffset(data),  // -1 when data is not a cached matrix
+                  out_preds,
+                  ntree_limit);
+    // add the per-row base margin if the data has one, else the global base score
+    std::vector<float>& preds = *out_preds;
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
+    const std::vector<bst_float>& base_margin = data->info().base_margin;
+    if (base_margin.size() != 0) {
+      CHECK_EQ(preds.size(), base_margin.size())
+          << "base_margin.size does not match with prediction size";
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint j = 0; j < ndata; ++j) {
+        preds[j] += base_margin[j];
+      }
+    } else {
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint j = 0; j < ndata; ++j) {
+        preds[j] += this->base_score_;
+      }
+    }
+  }
+  // total number of rows over all cached matrices (size of the prediction buffer)
+  size_t pred_buffer_size_;
+  // model parameters, saved and loaded together with the model
+  LearnerModelParam mparam;
+  // training parameters, re-read from the arguments on every Configure call
+  LearnerTrainParam tparam;
+  // key/value configuration forwarded to the booster and the objective
+  std::map<std::string, std::string> cfg_;
+  // name of the gradient booster
+  std::string name_gbm_;
+  // name of the objective function
+  std::string name_obj_;
+  // temporary storage for predictions
+  std::vector<float> preds_;
+  // gradient pairs computed in UpdateOneIter
+  std::vector<bst_gpair> gpair_;
+
+ private:
+  /*! \brief multiplier combined with the iteration number to form per-iteration seeds. */
+  static const int kRandSeedMagic = 127;
+  // one cached matrix: where its rows start in the prediction buffer and how many it had
+  struct CacheEntry {
+    const DMatrix* mat_;  // identity of the cached matrix (compared by pointer)
+    size_t buffer_offset_;  // sum of the row counts of the entries registered before it
+    size_t num_row_;  // row count at registration time
+    CacheEntry(const DMatrix* mat, size_t buffer_offset, size_t num_row)
+        :mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
+  };
+
+  // find the prediction-buffer offset of a cached matrix; return -1 if it is not cached
+  inline int64_t FindBufferOffset(const DMatrix* mat) const {
+    for (size_t i = 0; i < cache_.size(); ++i) {
+      if (cache_[i].mat_ == mat && mat->cache_learner_ptr_ == this) {
+        if (cache_[i].num_row_ == mat->info().num_row) {  // row count must be unchanged
+          return static_cast<int64_t>(cache_[i].buffer_offset_);
+        }
+      }
+    }
+    return -1;
+  }
+  /*! \brief matrices that have a slot in the internal prediction cache */
+  std::vector<CacheEntry> cache_;
+};
+
+Learner* Learner::Create(const std::vector<DMatrix*>& cache_data) {
+  return new LearnerImpl(cache_data);  // heap-allocated; returned as a raw pointer
+}
+}  // namespace xgboost
diff --git a/src/learner/dmatrix.h b/src/learner/dmatrix.h
deleted file mode 100644
index 52828c3be..000000000
--- a/src/learner/dmatrix.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file dmatrix.h
- * \brief meta data and template data structure
- *        used for regression/classification/ranking
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_LEARNER_DMATRIX_H_
-#define XGBOOST_LEARNER_DMATRIX_H_
-
-#include <vector>
-#include <cstring>
-#include "../data.h"
-#include "../utils/io.h"
-namespace xgboost {
-namespace learner {
-/*!
- * \brief meta information needed in training, including label, weight
- */
-struct MetaInfo {
-  /*!
-   * \brief information needed by booster
-   * BoosterInfo does not implement save and load,
-   * all serialization is done in MetaInfo
-   */
-  BoosterInfo info;
-  /*! \brief label of each instance */
-  std::vector<float> labels;
-  /*!
-   * \brief the index of begin and end of a group
-   * needed when the learning task is ranking
-   */
-  std::vector<bst_uint> group_ptr;
-  /*! \brief weights of each instance, optional */
-  std::vector<float> weights;
-  /*!
-   * \brief initialized margins,
-   * if specified, xgboost will start from this initial margin
-   * can be used to specify initial prediction to boost from
-   */
-  std::vector<float> base_margin;
-  /*! \brief version flag, used to check version of this info */
-  static const int kVersion = 0;
-  // constructor
-  MetaInfo(void) {}
-  /*! \return number of rows in dataset */
-  inline size_t num_row(void) const {
-    return info.num_row;
-  }
-  /*! \return number of columns in dataset */
-  inline size_t num_col(void) const {
-    return info.num_col;
-  }
-  /*! \brief clear all the information */
-  inline void Clear(void) {
-    labels.clear();
-    group_ptr.clear();
-    weights.clear();
-    info.root_index.clear();
-    base_margin.clear();
-    info.num_row = info.num_col = 0;
-  }
-  /*! \brief get weight of each instances */
-  inline float GetWeight(size_t i) const {
-    if (weights.size() != 0) {
-      return weights[i];
-    } else {
-      return 1.0f;
-    }
-  }
-  inline void SaveBinary(utils::IStream &fo) const { // NOLINT(*)
-    int version = kVersion;
-    fo.Write(&version, sizeof(version));
-    fo.Write(&info.num_row, sizeof(info.num_row));
-    fo.Write(&info.num_col, sizeof(info.num_col));
-    fo.Write(labels);
-    fo.Write(group_ptr);
-    fo.Write(weights);
-    fo.Write(info.root_index);
-    fo.Write(base_margin);
-  }
-  inline void LoadBinary(utils::IStream &fi) { // NOLINT(*)
-    int version;
-    utils::Check(fi.Read(&version, sizeof(version)) != 0, "MetaInfo: invalid format");
-    utils::Check(fi.Read(&info.num_row, sizeof(info.num_row)) != 0, "MetaInfo: invalid format");
-    utils::Check(fi.Read(&info.num_col, sizeof(info.num_col)) != 0, "MetaInfo: invalid format");
-    utils::Check(fi.Read(&labels), "MetaInfo: invalid format");
-    utils::Check(fi.Read(&group_ptr), "MetaInfo: invalid format");
-    utils::Check(fi.Read(&weights), "MetaInfo: invalid format");
-    utils::Check(fi.Read(&info.root_index), "MetaInfo: invalid format");
-    utils::Check(fi.Read(&base_margin), "MetaInfo: invalid format");
-  }
-  // try to load group information from file, if exists
-  inline bool TryLoadGroup(const char* fname, bool silent = false) {
-    using namespace std;
-    FILE *fi = fopen64(fname, "r");
-    if (fi == NULL) return false;
-    group_ptr.push_back(0);
-    unsigned nline;
-    while (fscanf(fi, "%u", &nline) == 1) {
-      group_ptr.push_back(group_ptr.back()+nline);
-    }
-    if (!silent) {
-      utils::Printf("%u groups are loaded from %s\n",
-                    static_cast<unsigned>(group_ptr.size()-1), fname);
-    }
-    fclose(fi);
-    return true;
-  }
-  inline std::vector<float>& GetFloatInfo(const char *field) {
-    using namespace std;
-    if (!strcmp(field, "label")) return labels;
-    if (!strcmp(field, "weight")) return weights;
-    if (!strcmp(field, "base_margin")) return base_margin;
-    utils::Error("unknown field %s", field);
-    return labels;
-  }
-  inline const std::vector<float>& GetFloatInfo(const char *field) const {
-    return ((MetaInfo*)this)->GetFloatInfo(field); // NOLINT(*)
-  }
-  inline std::vector<unsigned> &GetUIntInfo(const char *field) {
-    using namespace std;
-    if (!strcmp(field, "root_index")) return info.root_index;
-    if (!strcmp(field, "fold_index")) return info.fold_index;
-    utils::Error("unknown field %s", field);
-    return info.root_index;
-  }
-  inline const std::vector<unsigned> &GetUIntInfo(const char *field) const {
-    return ((MetaInfo*)this)->GetUIntInfo(field);  // NOLINT(*)
-  }
-  // try to load weight information from file, if exists
-  inline bool TryLoadFloatInfo(const char *field, const char* fname, bool silent = false) {
-    using namespace std;
-    std::vector<float> &data = this->GetFloatInfo(field);
-    FILE *fi = fopen64(fname, "r");
-    if (fi == NULL) return false;
-    float wt;
-    while (fscanf(fi, "%f", &wt) == 1) {
-      data.push_back(wt);
-    }
-    if (!silent) {
-      utils::Printf("loading %s from %s\n", field, fname);
-    }
-    fclose(fi);
-    return true;
-  }
-};
-
-/*!
- * \brief data object used for learning,
- * \tparam FMatrix type of feature data source
- */
-struct DMatrix {
-  /*!
-   * \brief magic number associated with this object
-   *    used to check if it is specific instance
-   */
-  const int magic;
-  /*! \brief meta information about the dataset */
-  MetaInfo info;
-  /*!
-   * \brief cache pointer to verify if the data structure is cached in some learner
-   *  used to verify if DMatrix is cached
-   */
-  void *cache_learner_ptr_;
-  /*! \brief default constructor */
-  explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {}
-  /*! \brief get feature matrix about data content */
-  virtual IFMatrix *fmat(void) const = 0;
-  // virtual destructor
-  virtual ~DMatrix(void){}
-};
-
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_DMATRIX_H_
diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp
deleted file mode 100644
index d28702728..000000000
--- a/src/learner/evaluation-inl.hpp
+++ /dev/null
@@ -1,589 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file xgboost_evaluation-inl.hpp
- * \brief evaluation metrics for regression and classification and rank
- * \author Kailong Chen, Tianqi Chen
- */
-#ifndef XGBOOST_LEARNER_EVALUATION_INL_HPP_
-#define XGBOOST_LEARNER_EVALUATION_INL_HPP_
-
-#include <vector>
-#include <utility>
-#include <string>
-#include <cmath>
-#include <climits>
-#include <algorithm>
-#include "../sync/sync.h"
-#include "../utils/math.h"
-#include "./evaluation.h"
-#include "./helper_utils.h"
-
-namespace xgboost {
-namespace learner {
-/*!
- * \brief base class of element-wise evaluation
- * \tparam Derived the name of subclass
- */
-template<typename Derived>
-struct EvalEWiseBase : public IEvaluator {
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Check(preds.size() == info.labels.size(),
-                 "label and prediction size not match"\
-                 "hint: use merror or mlogloss for multi-class classification");
-
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
-
-    float sum = 0.0, wsum = 0.0;
-    #pragma omp parallel for reduction(+: sum, wsum) schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      const float wt = info.GetWeight(i);
-      sum += Derived::EvalRow(info.labels[i], preds[i]) * wt;
-      wsum += wt;
-    }
-    float dat[2]; dat[0] = sum, dat[1] = wsum;
-    if (distributed) {
-      rabit::Allreduce<rabit::op::Sum>(dat, 2);
-    }
-    return Derived::GetFinal(dat[0], dat[1]);
-  }
-  /*!
-   * \brief to be implemented by subclass,
-   *   get evaluation result from one row
-   * \param label label of current instance
-   * \param pred prediction value of current instance
-   */
-  inline static float EvalRow(float label, float pred);
-  /*!
-   * \brief to be overridden by subclass, final transformation
-   * \param esum the sum statistics returned by EvalRow
-   * \param wsum sum of weight
-   */
-  inline static float GetFinal(float esum, float wsum) {
-    return esum / wsum;
-  }
-};
-
-/*! \brief RMSE */
-struct EvalRMSE : public EvalEWiseBase<EvalRMSE> {
-  virtual const char *Name(void) const {
-    return "rmse";
-  }
-  inline static float EvalRow(float label, float pred) {
-    float diff = label - pred;
-    return diff * diff;
-  }
-  inline static float GetFinal(float esum, float wsum) {
-    return std::sqrt(esum / wsum);
-  }
-};
-
-/*! \brief logloss */
-struct EvalLogLoss : public EvalEWiseBase<EvalLogLoss> {
-  virtual const char *Name(void) const {
-    return "logloss";
-  }
-  inline static float EvalRow(float y, float py) {
-    const float eps = 1e-16f;
-    const float pneg = 1.0f - py;
-    if (py < eps) {
-      return -y * std::log(eps) - (1.0f - y)  * std::log(1.0f - eps);
-    } else if (pneg < eps) {
-      return -y * std::log(1.0f - eps) - (1.0f - y)  * std::log(eps);
-    } else {
-      return -y * std::log(py) - (1.0f - y) * std::log(pneg);
-    }
-  }
-};
-
-/*! \brief error */
-struct EvalError : public EvalEWiseBase<EvalError> {
-  virtual const char *Name(void) const {
-    return "error";
-  }
-  inline static float EvalRow(float label, float pred) {
-    // assume label is in [0,1]
-    return pred > 0.5f ? 1.0f - label : label;
-  }
-};
-
-/*! \brief log-likelihood of Poission distribution */
-struct EvalPoissionNegLogLik : public EvalEWiseBase<EvalPoissionNegLogLik> {
-  virtual const char *Name(void) const {
-    return "poisson-nloglik";
-  }
-  inline static float EvalRow(float y, float py) {
-    const float eps = 1e-16f;
-    if (py < eps) py = eps;
-    return utils::LogGamma(y + 1.0f) + py - std::log(py) * y;
-  }
-};
-
-/*!
- * \brief base class of multi-class evaluation
- * \tparam Derived the name of subclass
- */
-template<typename Derived>
-struct EvalMClassBase : public IEvaluator {
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Check(preds.size() % info.labels.size() == 0,
-                 "label and prediction size not match");
-    const size_t nclass = preds.size() / info.labels.size();
-    utils::Check(nclass > 1,
-                 "mlogloss and merror are only used for multi-class classification,"\
-                 " use logloss for binary classification");
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
-    float sum = 0.0, wsum = 0.0;
-    int label_error = 0;
-    #pragma omp parallel for reduction(+: sum, wsum) schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      const float wt = info.GetWeight(i);
-      int label =  static_cast<int>(info.labels[i]);
-      if (label >= 0 && label < static_cast<int>(nclass)) {
-        sum += Derived::EvalRow(label,
-                                BeginPtr(preds) + i * nclass,
-                                nclass) * wt;
-        wsum += wt;
-      } else {
-        label_error = label;
-      }
-    }
-    utils::Check(label_error >= 0 && label_error < static_cast<int>(nclass),
-                 "MultiClassEvaluation: label must be in [0, num_class)," \
-                 " num_class=%d but found %d in label",
-                 static_cast<int>(nclass), label_error);
-    float dat[2]; dat[0] = sum, dat[1] = wsum;
-    if (distributed) {
-      rabit::Allreduce<rabit::op::Sum>(dat, 2);
-    }
-    return Derived::GetFinal(dat[0], dat[1]);
-  }
-  /*!
-   * \brief to be implemented by subclass,
-   *   get evaluation result from one row
-   * \param label label of current instance
-   * \param pred prediction value of current instance
-   * \param nclass number of class in the prediction
-   */
-  inline static float EvalRow(int label,
-                              const float *pred,
-                              size_t nclass);
-  /*!
-   * \brief to be overridden by subclass, final transformation
-   * \param esum the sum statistics returned by EvalRow
-   * \param wsum sum of weight
-   */
-  inline static float GetFinal(float esum, float wsum) {
-    return esum / wsum;
-  }
-  // used to store error message
-  const char *error_msg_;
-};
-/*! \brief match error */
-struct EvalMatchError : public EvalMClassBase<EvalMatchError> {
-  virtual const char *Name(void) const {
-    return "merror";
-  }
-  inline static float EvalRow(int label,
-                              const float *pred,
-                              size_t nclass) {
-    return FindMaxIndex(pred, nclass) != static_cast<int>(label);
-  }
-};
-/*! \brief match error */
-struct EvalMultiLogLoss : public EvalMClassBase<EvalMultiLogLoss> {
-  virtual const char *Name(void) const {
-    return "mlogloss";
-  }
-  inline static float EvalRow(int label,
-                              const float *pred,
-                              size_t nclass) {
-    const float eps = 1e-16f;
-    size_t k = static_cast<size_t>(label);
-    if (pred[k] > eps) {
-      return -std::log(pred[k]);
-    } else {
-      return -std::log(eps);
-    }
-  }
-};
-
-/*! \brief ctest */
-struct EvalCTest: public IEvaluator {
-  EvalCTest(IEvaluator *base, const char *name)
-      : base_(base), name_(name) {}
-  virtual ~EvalCTest(void) {
-    delete base_;
-  }
-  virtual const char *Name(void) const {
-    return name_.c_str();
-  }
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(!distributed, "metric %s do not support distributed evaluation", name_.c_str());
-    utils::Check(preds.size() % info.labels.size() == 0,
-                 "label and prediction size not match");
-    size_t ngroup = preds.size() / info.labels.size() - 1;
-    const unsigned ndata = static_cast<unsigned>(info.labels.size());
-    utils::Check(ngroup > 1, "pred size does not meet requirement");
-    utils::Check(ndata == info.info.fold_index.size(), "need fold index");
-    double wsum = 0.0;
-    for (size_t k = 0; k < ngroup; ++k) {
-      std::vector<float> tpred;
-      MetaInfo tinfo;
-      for (unsigned i = 0; i < ndata; ++i) {
-        if (info.info.fold_index[i] == k) {
-          tpred.push_back(preds[i + (k + 1) * ndata]);
-          tinfo.labels.push_back(info.labels[i]);
-          tinfo.weights.push_back(info.GetWeight(i));
-        }
-      }
-      wsum += base_->Eval(tpred, tinfo);
-    }
-    return static_cast<float>(wsum / ngroup);
-  }
-
- private:
-  IEvaluator *base_;
-  std::string name_;
-};
-
-/*! \brief AMS: also records best threshold */
-struct EvalAMS : public IEvaluator {
- public:
-  explicit EvalAMS(const char *name) {
-    name_ = name;
-    // note: ams@0 will automatically select which ratio to go
-    utils::Check(std::sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
-  }
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(!distributed, "metric AMS do not support distributed evaluation");
-    using namespace std;
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
-
-    utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams");
-    std::vector< std::pair<float, unsigned> > rec(ndata);
-
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      rec[i] = std::make_pair(preds[i], i);
-    }
-    std::sort(rec.begin(), rec.end(), CmpFirst);
-    unsigned ntop = static_cast<unsigned>(ratio_ * ndata);
-    if (ntop == 0) ntop = ndata;
-    const double br = 10.0;
-    unsigned thresindex = 0;
-    double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
-    for (unsigned i = 0; i < static_cast<unsigned>(ndata-1) && i < ntop; ++i) {
-      const unsigned ridx = rec[i].second;
-      const float wt = info.weights[ridx];
-      if (info.labels[ridx] > 0.5f) {
-        s_tp += wt;
-      } else {
-        b_fp += wt;
-      }
-      if (rec[i].first != rec[i+1].first) {
-        double ams = sqrt(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp));
-        if (tams < ams) {
-          thresindex = i;
-          tams = ams;
-        }
-      }
-    }
-    if (ntop == ndata) {
-      utils::Printf("\tams-ratio=%g", static_cast<float>(thresindex) / ndata);
-      return static_cast<float>(tams);
-    } else {
-      return static_cast<float>(sqrt(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp)));
-    }
-  }
-  virtual const char *Name(void) const {
-    return name_.c_str();
-  }
-
- private:
-  std::string name_;
-  float ratio_;
-};
-
-/*! \brief precision with cut off at top percentile */
-struct EvalPrecisionRatio : public IEvaluator{
- public:
-  explicit EvalPrecisionRatio(const char *name) : name_(name) {
-    using namespace std;
-    if (sscanf(name, "apratio@%f", &ratio_) == 1) {
-      use_ap = 1;
-    } else {
-      utils::Assert(sscanf(name, "pratio@%f", &ratio_) == 1, "BUG");
-      use_ap = 0;
-    }
-  }
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(!distributed, "metric %s do not support distributed evaluation", Name());
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Assert(preds.size() % info.labels.size() == 0,
-                  "label size predict size not match");
-    std::vector< std::pair<float, unsigned> > rec;
-    for (size_t j = 0; j < info.labels.size(); ++j) {
-      rec.push_back(std::make_pair(preds[j], static_cast<unsigned>(j)));
-    }
-    std::sort(rec.begin(), rec.end(), CmpFirst);
-    double pratio = CalcPRatio(rec, info);
-    return static_cast<float>(pratio);
-  }
-  virtual const char *Name(void) const {
-    return name_.c_str();
-  }
-
- protected:
-  inline double CalcPRatio(const std::vector< std::pair<float, unsigned> >& rec,
-                           const MetaInfo &info) const {
-    size_t cutoff = static_cast<size_t>(ratio_ * rec.size());
-    double wt_hit = 0.0, wsum = 0.0, wt_sum = 0.0;
-    for (size_t j = 0; j < cutoff; ++j) {
-      const float wt = info.GetWeight(j);
-      wt_hit += info.labels[rec[j].second] * wt;
-      wt_sum += wt;
-      wsum += wt_hit / wt_sum;
-    }
-    if (use_ap != 0) {
-      return wsum / cutoff;
-    } else {
-      return wt_hit / wt_sum;
-    }
-  }
-  int use_ap;
-  float ratio_;
-  std::string name_;
-};
-
-/*! \brief Area Under Curve, for both classification and rank */
-struct EvalAuc : public IEvaluator {
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Check(preds.size() % info.labels.size() == 0,
-                 "label size predict size not match");
-    std::vector<unsigned> tgptr(2, 0);
-    tgptr[1] = static_cast<unsigned>(info.labels.size());
-
-    const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
-    utils::Check(gptr.back() == info.labels.size(),
-                 "EvalAuc: group structure must match number of prediction");
-    const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
-    // sum statistics
-    double sum_auc = 0.0f;
-    #pragma omp parallel reduction(+:sum_auc)
-    {
-      // each thread takes a local rec
-      std::vector< std::pair<float, unsigned> > rec;
-      #pragma omp for schedule(static)
-      for (bst_omp_uint k = 0; k < ngroup; ++k) {
-        rec.clear();
-        for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
-          rec.push_back(std::make_pair(preds[j], j));
-        }
-        std::sort(rec.begin(), rec.end(), CmpFirst);
-        // calculate AUC
-        double sum_pospair = 0.0;
-        double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
-        for (size_t j = 0; j < rec.size(); ++j) {
-          const float wt = info.GetWeight(rec[j].second);
-          const float ctr = info.labels[rec[j].second];
-          // keep bucketing predictions in same bucket
-          if (j != 0 && rec[j].first != rec[j - 1].first) {
-            sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
-            sum_npos += buf_pos;
-            sum_nneg += buf_neg;
-            buf_neg = buf_pos = 0.0f;
-          }
-          buf_pos += ctr * wt;
-          buf_neg += (1.0f - ctr) * wt;
-        }
-        sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
-        sum_npos += buf_pos;
-        sum_nneg += buf_neg;
-        // check weird conditions
-        utils::Check(sum_npos > 0.0 && sum_nneg > 0.0,
-                     "AUC: the dataset only contains pos or neg samples");
-        // this is the AUC
-        sum_auc += sum_pospair / (sum_npos*sum_nneg);
-      }
-    }
-    if (distributed) {
-      float dat[2];
-      dat[0] = static_cast<float>(sum_auc);
-      dat[1] = static_cast<float>(ngroup);
-      // approximately estimate auc using mean
-      rabit::Allreduce<rabit::op::Sum>(dat, 2);
-      return dat[0] / dat[1];
-    } else {
-      return static_cast<float>(sum_auc) / ngroup;
-    }
-  }
-  virtual const char *Name(void) const {
-    return "auc";
-  }
-};
-
-/*! \brief Evaluate rank list */
-struct EvalRankList : public IEvaluator {
- public:
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(preds.size() == info.labels.size(),
-                  "label size predict size not match");
-    // quick consistency when group is not available
-    std::vector<unsigned> tgptr(2, 0);
-    tgptr[1] = static_cast<unsigned>(preds.size());
-    const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
-    utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
-    utils::Assert(gptr.back() == preds.size(),
-                   "EvalRanklist: group structure must match number of prediction");
-    const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
-    // sum statistics
-    double sum_metric = 0.0f;
-    #pragma omp parallel reduction(+:sum_metric)
-    {
-      // each thread takes a local rec
-      std::vector< std::pair<float, unsigned> > rec;
-      #pragma omp for schedule(static)
-      for (bst_omp_uint k = 0; k < ngroup; ++k) {
-        rec.clear();
-        for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
-          rec.push_back(std::make_pair(preds[j], static_cast<int>(info.labels[j])));
-        }
-        sum_metric += this->EvalMetric(rec);
-      }
-    }
-    if (distributed) {
-      float dat[2];
-      dat[0] = static_cast<float>(sum_metric);
-      dat[1] = static_cast<float>(ngroup);
-      // approximately estimate the metric using mean
-      rabit::Allreduce<rabit::op::Sum>(dat, 2);
-      return dat[0] / dat[1];
-    } else {
-      return static_cast<float>(sum_metric) / ngroup;
-    }
-  }
-  virtual const char *Name(void) const {
-    return name_.c_str();
-  }
-
- protected:
-  explicit EvalRankList(const char *name) {
-    using namespace std;
-    name_ = name;
-    minus_ = false;
-    if (sscanf(name, "%*[^@]@%u[-]?", &topn_) != 1) {
-      topn_ = UINT_MAX;
-    }
-    if (name[strlen(name) - 1] == '-') {
-      minus_ = true;
-    }
-  }
-  /*! \return evaluation metric, given the pair_sort record, (pred,label) */
-  virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &pair_sort) const = 0; // NOLINT(*)
-
- protected:
-  unsigned topn_;
-  std::string name_;
-  bool minus_;
-};
-
-/*! \brief Precision at N, for both classification and rank */
-struct EvalPrecision : public EvalRankList{
- public:
-  explicit EvalPrecision(const char *name) : EvalRankList(name) {}
-
- protected:
-  virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
-    // calculate Precision
-    std::sort(rec.begin(), rec.end(), CmpFirst);
-    unsigned nhit = 0;
-    for (size_t j = 0; j < rec.size() && j < this->topn_; ++j) {
-      nhit += (rec[j].second != 0);
-    }
-    return static_cast<float>(nhit) / topn_;
-  }
-};
-
-/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */
-struct EvalNDCG : public EvalRankList{
- public:
-  explicit EvalNDCG(const char *name) : EvalRankList(name) {}
-
- protected:
-  inline float CalcDCG(const std::vector< std::pair<float, unsigned> > &rec) const {
-    double sumdcg = 0.0;
-    for (size_t i = 0; i < rec.size() && i < this->topn_; ++i) {
-      const unsigned rel = rec[i].second;
-      if (rel != 0) {
-        sumdcg += ((1 << rel) - 1) / std::log(i + 2.0);
-      }
-    }
-    return static_cast<float>(sumdcg);
-  }
-  virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const { // NOLINT(*)
-    std::stable_sort(rec.begin(), rec.end(), CmpFirst);
-    float dcg = this->CalcDCG(rec);
-    std::stable_sort(rec.begin(), rec.end(), CmpSecond);
-    float idcg = this->CalcDCG(rec);
-    if (idcg == 0.0f) {
-      if (minus_) {
-        return 0.0f;
-      } else {
-        return 1.0f;
-      }
-    }
-    return dcg/idcg;
-  }
-};
-
-/*! \brief Mean Average Precision at N, for both classification and rank */
-struct EvalMAP : public EvalRankList {
- public:
-  explicit EvalMAP(const char *name) : EvalRankList(name) {}
-
- protected:
-  virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
-    std::sort(rec.begin(), rec.end(), CmpFirst);
-    unsigned nhits = 0;
-    double sumap = 0.0;
-    for (size_t i = 0; i < rec.size(); ++i) {
-      if (rec[i].second != 0) {
-        nhits += 1;
-        if (i < this->topn_) {
-          sumap += static_cast<float>(nhits) / (i+1);
-        }
-      }
-    }
-    if (nhits != 0) {
-      sumap /= nhits;
-      return static_cast<float>(sumap);
-    } else {
-      if (minus_) {
-        return 0.0f;
-      } else {
-        return 1.0f;
-      }
-    }
-  }
-};
-
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_EVALUATION_INL_HPP_
diff --git a/src/learner/evaluation.h b/src/learner/evaluation.h
deleted file mode 100644
index a98c47495..000000000
--- a/src/learner/evaluation.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file evaluation.h
- * \brief interface of evaluation function supported in xgboost
- * \author Tianqi Chen, Kailong Chen
- */
-#ifndef XGBOOST_LEARNER_EVALUATION_H_
-#define XGBOOST_LEARNER_EVALUATION_H_
-
-#include <string>
-#include <vector>
-#include <cstdio>
-#include "../utils/utils.h"
-#include "./dmatrix.h"
-
-namespace xgboost {
-namespace learner {
-/*! \brief evaluator that evaluates the loss metrics */
-struct IEvaluator{
-  /*!
-   * \brief evaluate a specific metric
-   * \param preds prediction
-   * \param info information, including label etc.
-   * \param distributed whether a call to Allreduce is needed to gather
-   *        the average statistics across all the node,
-   *        this is only supported by some metrics
-   */
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed = false) const = 0;
-  /*! \return name of metric */
-  virtual const char *Name(void) const = 0;
-  /*! \brief virtual destructor */
-  virtual ~IEvaluator(void) {}
-};
-}  // namespace learner
-}  // namespace xgboost
-
-// include implementations of evaluation functions
-#include "evaluation-inl.hpp"
-// factory function
-namespace xgboost {
-namespace learner {
-inline IEvaluator* CreateEvaluator(const char *name) {
-  using namespace std;
-  if (!strcmp(name, "rmse")) return new EvalRMSE();
-  if (!strcmp(name, "error")) return new EvalError();
-  if (!strcmp(name, "merror")) return new EvalMatchError();
-  if (!strcmp(name, "logloss")) return new EvalLogLoss();
-  if (!strcmp(name, "mlogloss")) return new EvalMultiLogLoss();
-  if (!strcmp(name, "poisson-nloglik")) return new EvalPoissionNegLogLik();
-  if (!strcmp(name, "auc")) return new EvalAuc();
-  if (!strncmp(name, "ams@", 4)) return new EvalAMS(name);
-  if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name);
-  if (!strncmp(name, "pratio@", 7)) return new EvalPrecisionRatio(name);
-  if (!strncmp(name, "map", 3)) return new EvalMAP(name);
-  if (!strncmp(name, "ndcg", 4)) return new EvalNDCG(name);
-  if (!strncmp(name, "ct-", 3)) return new EvalCTest(CreateEvaluator(name+3), name);
-
-  utils::Error("unknown evaluation metric type: %s", name);
-  return NULL;
-}
-
-/*! \brief a set of evaluators */
-class EvalSet{
- public:
-  inline void AddEval(const char *name) {
-    using namespace std;
-    for (size_t i = 0; i < evals_.size(); ++i) {
-      if (!strcmp(name, evals_[i]->Name())) return;
-    }
-    evals_.push_back(CreateEvaluator(name));
-  }
-  ~EvalSet(void) {
-    for (size_t i = 0; i < evals_.size(); ++i) {
-      delete evals_[i];
-    }
-  }
-  inline std::string Eval(const char *evname,
-                          const std::vector<float> &preds,
-                          const MetaInfo &info,
-                          bool distributed = false) {
-    std::string result = "";
-    for (size_t i = 0; i < evals_.size(); ++i) {
-      float res = evals_[i]->Eval(preds, info, distributed);
-      char tmp[1024];
-      utils::SPrintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
-      result += tmp;
-    }
-    return result;
-  }
-  inline size_t Size(void) const {
-    return evals_.size();
-  }
-
- private:
-  std::vector<const IEvaluator*> evals_;
-};
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_EVALUATION_H_
diff --git a/src/learner/helper_utils.h b/src/learner/helper_utils.h
deleted file mode 100644
index 0db1b46f3..000000000
--- a/src/learner/helper_utils.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file helper_utils.h
- * \brief useful helper functions
- * \author Tianqi Chen, Kailong Chen
- */
-#ifndef XGBOOST_LEARNER_HELPER_UTILS_H_
-#define XGBOOST_LEARNER_HELPER_UTILS_H_
-
-#include <utility>
-#include <vector>
-#include <cmath>
-#include <algorithm>
-namespace xgboost {
-namespace learner {
-// simple helper function to do softmax
-inline static void Softmax(std::vector<float>* p_rec) {
-  std::vector<float> &rec = *p_rec;
-  float wmax = rec[0];
-  for (size_t i = 1; i < rec.size(); ++i) {
-    wmax = std::max(rec[i], wmax);
-  }
-  double wsum = 0.0f;
-  for (size_t i = 0; i < rec.size(); ++i) {
-    rec[i] = std::exp(rec[i]-wmax);
-    wsum += rec[i];
-  }
-  for (size_t i = 0; i < rec.size(); ++i) {
-    rec[i] /= static_cast<float>(wsum);
-  }
-}
-
-inline static int FindMaxIndex(const float  *rec, size_t size) {
-  size_t mxid = 0;
-  for (size_t i = 1; i < size; ++i) {
-    if (rec[i] > rec[mxid]) {
-      mxid = i;
-    }
-  }
-  return static_cast<int>(mxid);
-}
-
-// simple helper function to do softmax
-inline static int FindMaxIndex(const std::vector<float>& rec) {
-  return FindMaxIndex(BeginPtr(rec), rec.size());
-}
-
-// perform numerically safe logsum
-inline float LogSum(float x, float y) {
-  if (x < y) {
-    return y + std::log(std::exp(x - y) + 1.0f);
-  } else {
-    return x + std::log(std::exp(y - x) + 1.0f);
-  }
-}
-// numerically safe logsum
-inline float LogSum(const float *rec, size_t size) {
-  float mx = rec[0];
-  for (size_t i = 1; i < size; ++i) {
-    mx = std::max(mx, rec[i]);
-  }
-  float sum = 0.0f;
-  for (size_t i = 0; i < size; ++i) {
-    sum += std::exp(rec[i] - mx);
-  }
-  return mx + std::log(sum);
-}
-
-// comparator functions for sorting pairs in descending order
-inline static bool CmpFirst(const std::pair<float, unsigned> &a,
-                            const std::pair<float, unsigned> &b) {
-  return a.first > b.first;
-}
-inline static bool CmpSecond(const std::pair<float, unsigned> &a,
-                             const std::pair<float, unsigned> &b) {
-  return a.second > b.second;
-}
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_HELPER_UTILS_H_
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
deleted file mode 100644
index 0e8480663..000000000
--- a/src/learner/learner-inl.hpp
+++ /dev/null
@@ -1,547 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file learner-inl.hpp
- * \brief learning algorithm
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_LEARNER_LEARNER_INL_HPP_
-#define XGBOOST_LEARNER_LEARNER_INL_HPP_
-
-#include <algorithm>
-#include <vector>
-#include <utility>
-#include <string>
-#include <limits>
-#include "../sync/sync.h"
-#include "../utils/io.h"
-#include "./objective.h"
-#include "./evaluation.h"
-#include "../gbm/gbm.h"
-
-namespace xgboost {
-/*! \brief namespace for learning algorithm */
-namespace learner {
-/*!
- * \brief learner that performs gradient boosting for a specific objective function.
- *  It does training and prediction.
- */
-class BoostLearner : public rabit::Serializable {
- public:
-  BoostLearner(void) {
-    obj_ = NULL;
-    gbm_ = NULL;
-    name_obj_ = "reg:linear";
-    name_gbm_ = "gbtree";
-    silent = 0;
-    prob_buffer_row = 1.0f;
-    distributed_mode = 0;
-    updater_mode = 0;
-    pred_buffer_size = 0;
-    seed_per_iteration = 0;
-    seed = 0;
-    save_base64 = 0;
-  }
-  virtual ~BoostLearner(void) {
-    if (obj_ != NULL) delete obj_;
-    if (gbm_ != NULL) delete gbm_;
-  }
-  /*!
-   * \brief add internal cache space for mat, this can speedup prediction for matrix,
-   *        please cache prediction for training and eval data
-   *    warning: if the model is loaded from file from some previous training history
-   *             set cache data must be called with exactly SAME
-   *             data matrices to continue training otherwise it will cause error
-   * \param mats array of pointers to matrix whose prediction result need to be cached
-   */
-  inline void SetCacheData(const std::vector<DMatrix*>& mats) {
-    utils::Assert(cache_.size() == 0, "can only call cache data once");
-    // assign buffer index
-    size_t buffer_size = 0;
-    for (size_t i = 0; i < mats.size(); ++i) {
-      bool dupilicate = false;
-      for (size_t j = 0; j < i; ++j) {
-        if (mats[i] == mats[j]) dupilicate = true;
-      }
-      if (dupilicate) continue;
-      // set mats[i]'s cache learner pointer to this
-      mats[i]->cache_learner_ptr_ = this;
-      cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->info.num_row()));
-      buffer_size += mats[i]->info.num_row();
-    }
-    char str_temp[25];
-    utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
-                   static_cast<unsigned long>(buffer_size)); // NOLINT(*)
-    this->SetParam("num_pbuffer", str_temp);
-    this->pred_buffer_size = buffer_size;
-  }
-  /*!
-   * \brief set parameters from outside
-   * \param name name of the parameter
-   * \param val  value of the parameter
-   */
-  inline void SetParam(const char *name, const char *val) {
-    using namespace std;
-    // in this version, bst: prefix is no longer required
-    if (strncmp(name, "bst:", 4) != 0) {
-      std::string n = "bst:"; n += name;
-      this->SetParam(n.c_str(), val);
-    }
-    if (!strcmp(name, "silent")) silent = atoi(val);
-    if (!strcmp(name, "dsplit")) {
-      if (!strcmp(val, "col")) {
-        this->SetParam("updater", "distcol");
-        distributed_mode = 1;
-      } else if (!strcmp(val, "row")) {
-        this->SetParam("updater", "grow_histmaker,prune");
-        distributed_mode = 2;
-      } else {
-        utils::Error("%s is invalid value for dsplit, should be row or col", val);
-      }
-    }
-    if (!strcmp(name, "updater_mode")) updater_mode = atoi(val);
-    if (!strcmp(name, "prob_buffer_row")) {
-      prob_buffer_row = static_cast<float>(atof(val));
-      utils::Check(distributed_mode == 0,
-                   "prob_buffer_row can only be used in single node mode so far");
-      this->SetParam("updater", "grow_colmaker,refresh,prune");
-    }
-    if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
-    if (!strcmp("seed", name)) {
-      seed = atoi(val); random::Seed(seed);
-    }
-    if (!strcmp("seed_per_iter", name)) seed_per_iteration = atoi(val);
-    if (!strcmp("save_base64", name)) save_base64 = atoi(val);
-    if (!strcmp(name, "num_class")) {
-      this->SetParam("num_output_group", val);
-    }
-    if (!strcmp(name, "nthread")) {
-      omp_set_num_threads(atoi(val));
-    }
-    if (gbm_ == NULL) {
-      if (!strcmp(name, "objective")) name_obj_ = val;
-      if (!strcmp(name, "booster")) name_gbm_ = val;
-      mparam.SetParam(name, val);
-    }
-    if (gbm_ != NULL) gbm_->SetParam(name, val);
-    if (obj_ != NULL) obj_->SetParam(name, val);
-    if (gbm_ == NULL || obj_ == NULL) {
-      cfg_.push_back(std::make_pair(std::string(name), std::string(val)));
-    }
-  }
-  // this is an internal function
-  // initialize the trainer, called at InitModel and LoadModel
-  inline void InitTrainer(bool calc_num_feature = true) {
-    if (calc_num_feature) {
-      // estimate feature bound
-      unsigned num_feature = 0;
-      for (size_t i = 0; i < cache_.size(); ++i) {
-        num_feature = std::max(num_feature,
-                               static_cast<unsigned>(cache_[i].mat_->info.num_col()));
-      }
-      // run allreduce on num_feature to find the maximum value
-      rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
-      if (num_feature > mparam.num_feature) mparam.num_feature = num_feature;
-    }
-    char str_temp[25];
-    utils::SPrintf(str_temp, sizeof(str_temp), "%d", mparam.num_feature);
-    this->SetParam("bst:num_feature", str_temp);
-  }
-  /*!
-   * \brief initialize the model
-   */
-  inline void InitModel(void) {
-    this->InitTrainer();
-    // initialize model
-    this->InitObjGBM();
-    // reset the base score
-    mparam.base_score = obj_->ProbToMargin(mparam.base_score);
-    // initialize GBM model
-    gbm_->InitModel();
-  }
-  /*!
-   * \brief load model from stream
-   * \param fi input stream
-   * \param calc_num_feature whether call InitTrainer with calc_num_feature
-   */
-  inline void LoadModel(utils::IStream &fi,  // NOLINT(*)
-                        bool calc_num_feature = true) {
-    utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
-                 "BoostLearner: wrong model format");
-    {
-      // backward compatibility code for compatible with old model type
-      // for new model, Read(&name_obj_) is suffice
-      uint64_t len;
-      utils::Check(fi.Read(&len, sizeof(len)) != 0, "BoostLearner: wrong model format");
-      if (len >= std::numeric_limits<unsigned>::max()) {
-        int gap;
-        utils::Check(fi.Read(&gap, sizeof(gap)) != 0, "BoostLearner: wrong model format");
-        len = len >> static_cast<uint64_t>(32UL);
-      }
-      if (len != 0) {
-        name_obj_.resize(len);
-        utils::Check(fi.Read(&name_obj_[0], len) != 0, "BoostLearner: wrong model format");
-      }
-    }
-    utils::Check(fi.Read(&name_gbm_), "BoostLearner: wrong model format");
-    // delete existing gbm if any
-    if (obj_ != NULL) delete obj_;
-    if (gbm_ != NULL) delete gbm_;
-    this->InitTrainer(calc_num_feature);
-    this->InitObjGBM();
-    char tmp[32];
-    utils::SPrintf(tmp, sizeof(tmp), "%u", mparam.num_class);
-    obj_->SetParam("num_class", tmp);
-    gbm_->LoadModel(fi, mparam.saved_with_pbuffer != 0);
-    if (mparam.saved_with_pbuffer == 0) {
-      gbm_->ResetPredBuffer(pred_buffer_size);
-    }
-  }
-  // rabit load model from rabit checkpoint
-  virtual void Load(rabit::Stream *fi) {
-    // for row split, we should not keep pbuffer
-    this->LoadModel(*fi, false);
-  }
-  // rabit save model to rabit checkpoint
-  virtual void Save(rabit::Stream *fo) const {
-    // for row split, we should not keep pbuffer
-    this->SaveModel(*fo, distributed_mode != 2);
-  }
-  /*!
-   * \brief load model from file
-   * \param fname file name
-   */
-  inline void LoadModel(const char *fname) {
-    utils::IStream *fi = utils::IStream::Create(fname, "r");
-    std::string header; header.resize(4);
-    // check header for different binary encode
-    // can be base64 or binary
-    utils::Check(fi->Read(&header[0], 4) != 0, "invalid model");
-    // base64 format
-    if (header == "bs64") {
-      utils::Base64InStream bsin(fi);
-      bsin.InitPosition();
-      this->LoadModel(bsin, true);
-    } else if (header == "binf") {
-      this->LoadModel(*fi, true);
-    } else {
-      delete fi;
-      fi = utils::IStream::Create(fname, "r");
-      this->LoadModel(*fi, true);
-    }
-    delete fi;
-  }
-  inline void SaveModel(utils::IStream &fo, bool with_pbuffer) const { // NOLINT(*)
-    ModelParam p = mparam;
-    p.saved_with_pbuffer = static_cast<int>(with_pbuffer);
-    fo.Write(&p, sizeof(ModelParam));
-    fo.Write(name_obj_);
-    fo.Write(name_gbm_);
-    gbm_->SaveModel(fo, with_pbuffer);
-  }
-  /*!
-   * \brief save model into file
-   * \param fname file name
-   * \param with_pbuffer whether save pbuffer together
-   */
-  inline void SaveModel(const char *fname, bool with_pbuffer) const {
-    utils::IStream *fo = utils::IStream::Create(fname, "w");
-    if (save_base64 != 0 || !strcmp(fname, "stdout")) {
-      fo->Write("bs64\t", 5);
-      utils::Base64OutStream bout(fo);
-      this->SaveModel(bout, with_pbuffer);
-      bout.Finish('\n');
-    } else {
-      fo->Write("binf", 4);
-      this->SaveModel(*fo, with_pbuffer);
-    }
-    delete fo;
-  }
-  /*!
-   * \brief check if data matrix is ready to be used by training,
-   *  if not initialize it
-   * \param p_train pointer to the matrix used by training
-   */
-  inline void CheckInit(DMatrix *p_train) {
-    int ncol = static_cast<int>(p_train->info.info.num_col);
-    std::vector<bool> enabled(ncol, true);
-    // set max row per batch to limited value
-    // in distributed mode, use safe choice otherwise
-    size_t max_row_perbatch = std::numeric_limits<size_t>::max();
-    if (updater_mode != 0 || distributed_mode == 2) {
-      max_row_perbatch = 32UL << 10UL;
-    }
-    // initialize column access
-    p_train->fmat()->InitColAccess(enabled,
-                                   prob_buffer_row,
-                                   max_row_perbatch);
-    const int kMagicPage = 0xffffab02;
-    // check, if it is DMatrixPage, then use hist maker
-    if (p_train->magic == kMagicPage) {
-      this->SetParam("updater", "grow_histmaker,prune");
-    }
-  }
-  /*!
-   * \brief update the model for one iteration
-   * \param iter current iteration number
-   * \param train reference to the data matrix
-   */
-  inline void UpdateOneIter(int iter, const DMatrix &train) {
-    if (seed_per_iteration != 0 || rabit::IsDistributed()) {
-      random::Seed(this->seed * kRandSeedMagic + iter);
-    }
-    this->PredictRaw(train, &preds_);
-    obj_->GetGradient(preds_, train.info, iter, &gpair_);
-    gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
-  }
-  /*!
-   * \brief whether model allow lazy checkpoint
-   */
-  inline bool AllowLazyCheckPoint(void) const {
-    return gbm_->AllowLazyCheckPoint();
-  }
-  /*!
-   * \brief evaluate the model for specific iteration
-   * \param iter iteration number
-   * \param evals datas i want to evaluate
-   * \param evname name of each dataset
-   * \return a string corresponding to the evaluation result
-   */
-  inline std::string EvalOneIter(int iter,
-                                 const std::vector<const DMatrix*> &evals,
-                                 const std::vector<std::string> &evname) {
-    std::string res;
-    char tmp[256];
-    utils::SPrintf(tmp, sizeof(tmp), "[%d]", iter);
-    res = tmp;
-    for (size_t i = 0; i < evals.size(); ++i) {
-      this->PredictRaw(*evals[i], &preds_);
-      obj_->EvalTransform(&preds_);
-      res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info, distributed_mode == 2);
-    }
-    return res;
-  }
-  /*!
-   * \brief simple evaluation function, with a specified metric
-   * \param data input data
-   * \param metric name of metric
-   * \return a pair of <evaluation name, result>
-   */
-  std::pair<std::string, float> Evaluate(const DMatrix &data, std::string metric) {
-    if (metric == "auto") metric = obj_->DefaultEvalMetric();
-    IEvaluator *ev = CreateEvaluator(metric.c_str());
-    this->PredictRaw(data, &preds_);
-    obj_->EvalTransform(&preds_);
-    float res = ev->Eval(preds_, data.info);
-    delete ev;
-    return std::make_pair(metric, res);
-  }
-  /*!
-   * \brief get prediction
-   * \param data input data
-   * \param output_margin whether to only predict margin value instead of transformed prediction
-   * \param out_preds output vector that stores the prediction
-   * \param ntree_limit limit number of trees used for boosted tree
-   *   predictor, when it equals 0, this means we are using all the trees
-   * \param pred_leaf whether to only predict the leaf index of each tree in a boosted tree predictor
-   */
-  inline void Predict(const DMatrix &data,
-                      bool output_margin,
-                      std::vector<float> *out_preds,
-                      unsigned ntree_limit = 0,
-                      bool pred_leaf = false) const {
-    if (pred_leaf) {
-      gbm_->PredictLeaf(data.fmat(), data.info.info, out_preds, ntree_limit);
-    } else {
-      this->PredictRaw(data, out_preds, ntree_limit);
-      if (!output_margin) {
-        obj_->PredTransform(out_preds);
-      }
-    }
-  }
-  /*!
-   * \brief online prediction function, predict score for one instance at a time
-   *  NOTE: use the batch prediction interface if possible, batch prediction is usually
-   *        more efficient than online prediction
-   *        This function is NOT threadsafe, make sure you only call from one thread
-   *
-   * \param inst the instance you want to predict
-   * \param output_margin whether to only predict margin value instead of transformed prediction
-   * \param out_preds output vector to hold the predictions
-   * \param ntree_limit limit the number of trees used in prediction
-   * \sa Predict
-   */
-  inline void Predict(const SparseBatch::Inst &inst,
-                      bool output_margin,
-                      std::vector<float> *out_preds,
-                      unsigned ntree_limit = 0) const {
-    gbm_->Predict(inst, out_preds, ntree_limit);
-    if (out_preds->size() == 1) {
-      (*out_preds)[0] += mparam.base_score;
-    }
-    if (!output_margin) {
-      obj_->PredTransform(out_preds);
-    }
-  }
-  /*! \brief dump model out */
-  inline std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
-    return gbm_->DumpModel(fmap, option);
-  }
-
- protected:
-  /*!
-   * \brief initialize the objective function and GBM,
-   * if not yet done
-   */
-  inline void InitObjGBM(void) {
-    if (obj_ != NULL) return;
-    utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
-    obj_ = CreateObjFunction(name_obj_.c_str());
-    gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
-    this->InitAdditionDefaultParam();
-    // set parameters
-    for (size_t i = 0; i < cfg_.size(); ++i) {
-      obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
-      gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
-    }
-    if (evaluator_.Size() == 0) {
-      evaluator_.AddEval(obj_->DefaultEvalMetric());
-    }
-  }
-  /*!
-   * \brief additional default value for specific objs
-   */
-  inline void InitAdditionDefaultParam(void) {
-    if (name_obj_ == "count:poisson") {
-      obj_->SetParam("max_delta_step", "0.7");
-      gbm_->SetParam("max_delta_step", "0.7");
-    }
-  }
-  /*!
-   * \brief get un-transformed prediction
-   * \param data training data matrix
-   * \param out_preds output vector that stores the prediction
-   * \param ntree_limit limit number of trees used for boosted tree
-   *   predictor, when it equals 0, this means we are using all the trees
-   */
-  inline void PredictRaw(const DMatrix &data,
-                         std::vector<float> *out_preds,
-                         unsigned ntree_limit = 0) const {
-    gbm_->Predict(data.fmat(), this->FindBufferOffset(data),
-                  data.info.info, out_preds, ntree_limit);
-    // add base margin
-    std::vector<float> &preds = *out_preds;
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
-    if (data.info.base_margin.size() != 0) {
-      utils::Check(preds.size() == data.info.base_margin.size(),
-                   "base_margin.size does not match with prediction size");
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint j = 0; j < ndata; ++j) {
-        preds[j] += data.info.base_margin[j];
-      }
-    } else {
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint j = 0; j < ndata; ++j) {
-        preds[j] += mparam.base_score;
-      }
-    }
-  }
-
-  /*! \brief training parameter for regression */
-  struct ModelParam{
-    /* \brief global bias */
-    float base_score;
-    /* \brief number of features  */
-    unsigned num_feature;
-    /* \brief number of classes, if it is multi-class classification  */
-    int num_class;
-    /*! \brief whether the model itself is saved with pbuffer */
-    int saved_with_pbuffer;
-    /*! \brief reserved field */
-    int reserved[30];
-    /*! \brief constructor */
-    ModelParam(void) {
-      std::memset(this, 0, sizeof(ModelParam));
-      base_score = 0.5f;
-      num_feature = 0;
-      num_class = 0;
-      saved_with_pbuffer = 0;
-    }
-    /*!
-     * \brief set parameters from outside
-     * \param name name of the parameter
-     * \param val value of the parameter
-     */
-    inline void SetParam(const char *name, const char *val) {
-      using namespace std;
-      if (!strcmp("base_score", name)) base_score = static_cast<float>(atof(val));
-      if (!strcmp("num_class", name)) num_class = atoi(val);
-      if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
-    }
-  };
-  // data fields
-  // stored random seed
-  int seed;
-  // whether seed the PRNG each iteration
-  // this is important for restart from existing iterations
-  // default set to no, but will auto switch on in distributed mode
-  int seed_per_iteration;
-  // save model in base64 encoding
-  int save_base64;
-  // silent during training
-  int silent;
-  // distributed learning mode, if any, 0:none, 1:col, 2:row
-  int distributed_mode;
-  // updater mode, 0:normal, reserved for internal test
-  int updater_mode;
-  // cached size of predict buffer
-  size_t pred_buffer_size;
-  // maximum buffered row value
-  float prob_buffer_row;
-  // evaluation set
-  EvalSet evaluator_;
-  // model parameter
-  ModelParam  mparam;
-  // gbm model that back everything
-  gbm::IGradBooster *gbm_;
-  // name of gbm model used for training
-  std::string name_gbm_;
-  // objective function
-  IObjFunction *obj_;
-  // name of objective function
-  std::string name_obj_;
-  // configurations
-  std::vector< std::pair<std::string, std::string> > cfg_;
-  // temporal storages for prediction
-  std::vector<float> preds_;
-  // gradient pairs
-  std::vector<bst_gpair> gpair_;
-
- protected:
-  // magic number to transform random seed
-  static const int kRandSeedMagic = 127;
-  // cache entry object that helps handle feature caching
-  struct CacheEntry {
-    const DMatrix *mat_;
-    size_t buffer_offset_;
-    size_t num_row_;
-    CacheEntry(const DMatrix *mat, size_t buffer_offset, size_t num_row)
-        :mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
-  };
-  // find internal buffer offset for certain matrix, if not exist, return -1
-  inline int64_t FindBufferOffset(const DMatrix &mat) const {
-    for (size_t i = 0; i < cache_.size(); ++i) {
-      if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
-        if (cache_[i].num_row_ == mat.info.num_row()) {
-          return static_cast<int64_t>(cache_[i].buffer_offset_);
-        }
-      }
-    }
-    return -1;
-  }
-  // data structure field
-  /*! \brief the entries indicates that we have internal prediction cache */
-  std::vector<CacheEntry> cache_;
-};
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_LEARNER_INL_HPP_
diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp
deleted file mode 100644
index ce23b02fb..000000000
--- a/src/learner/objective-inl.hpp
+++ /dev/null
@@ -1,642 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file objective-inl.hpp
- * \brief objective function implementations
- * \author Tianqi Chen, Kailong Chen
- */
-#ifndef XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
-#define XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
-
-#include <vector>
-#include <algorithm>
-#include <utility>
-#include <cmath>
-#include <functional>
-#include "../data.h"
-#include "./objective.h"
-#include "./helper_utils.h"
-#include "../utils/random.h"
-#include "../utils/omp.h"
-
-namespace xgboost {
-namespace learner {
-/*! \brief defines functions to calculate some commonly used functions */
-struct LossType {
-  /*! \brief indicate which type we are using */
-  int loss_type;
-  // list of constants
-  static const int kLinearSquare = 0;
-  static const int kLogisticNeglik = 1;
-  static const int kLogisticClassify = 2;
-  static const int kLogisticRaw = 3;
-  /*!
-   * \brief transform the linear sum to prediction
-   * \param x linear sum of boosting ensemble
-   * \return transformed prediction
-   */
-  inline float PredTransform(float x) const {
-    switch (loss_type) {
-      case kLogisticRaw:
-      case kLinearSquare: return x;
-      case kLogisticClassify:
-      case kLogisticNeglik: return 1.0f / (1.0f + std::exp(-x));
-      default: utils::Error("unknown loss_type"); return 0.0f;
-    }
-  }
-  /*!
-   * \brief check if label range is valid
-   */
-  inline bool CheckLabel(float x) const {
-    if (loss_type != kLinearSquare) {
-      return x >= 0.0f && x <= 1.0f;
-    }
-    return true;
-  }
-  /*!
-   * \brief error message displayed when check label fail
-   */
-  inline const char * CheckLabelErrorMsg(void) const {
-    if (loss_type != kLinearSquare) {
-      return "label must be in [0,1] for logistic regression";
-    } else {
-      return "";
-    }
-  }
-  /*!
-   * \brief calculate first order gradient of loss, given transformed prediction
-   * \param predt transformed prediction
-   * \param label true label
-   * \return first order gradient
-   */
-  inline float FirstOrderGradient(float predt, float label) const {
-    switch (loss_type) {
-      case kLinearSquare: return predt - label;
-      case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt));
-      case kLogisticClassify:
-      case kLogisticNeglik: return predt - label;
-      default: utils::Error("unknown loss_type"); return 0.0f;
-    }
-  }
-  /*!
-   * \brief calculate second order gradient of loss, given transformed prediction
-   * \param predt transformed prediction
-   * \param label true label
-   * \return second order gradient
-   */
-  inline float SecondOrderGradient(float predt, float label) const {
-    // cap second order gradient to positive value
-    const float eps = 1e-16f;
-    switch (loss_type) {
-      case kLinearSquare: return 1.0f;
-      case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt));
-      case kLogisticClassify:
-      case kLogisticNeglik: return std::max(predt * (1.0f - predt), eps);
-      default: utils::Error("unknown loss_type"); return 0.0f;
-    }
-  }
-  /*!
-   * \brief transform probability value back to margin
-   */
-  inline float ProbToMargin(float base_score) const {
-    if (loss_type == kLogisticRaw ||
-        loss_type == kLogisticClassify ||
-        loss_type == kLogisticNeglik ) {
-      utils::Check(base_score > 0.0f && base_score < 1.0f,
-                   "base_score must be in (0,1) for logistic loss");
-      base_score = -std::log(1.0f / base_score - 1.0f);
-    }
-    return base_score;
-  }
-  /*! \brief get default evaluation metric for the objective */
-  inline const char *DefaultEvalMetric(void) const {
-    if (loss_type == kLogisticClassify) return "error";
-    if (loss_type == kLogisticRaw) return "auc";
-    return "rmse";
-  }
-};
-
-/*! \brief objective function that only need to */
-class RegLossObj : public IObjFunction {
- public:
-  explicit RegLossObj(int loss_type) {
-    loss.loss_type = loss_type;
-    scale_pos_weight = 1.0f;
-  }
-  virtual ~RegLossObj(void) {}
-  virtual void SetParam(const char *name, const char *val) {
-    using namespace std;
-    if (!strcmp("scale_pos_weight", name)) {
-      scale_pos_weight = static_cast<float>(atof(val));
-    }
-  }
-  virtual void GetGradient(const std::vector<float> &preds,
-                           const MetaInfo &info,
-                           int iter,
-                           std::vector<bst_gpair> *out_gpair) {
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Check(preds.size() % info.labels.size() == 0,
-                 "labels are not correctly provided");
-    std::vector<bst_gpair> &gpair = *out_gpair;
-    gpair.resize(preds.size());
-    // check if label in range
-    bool label_correct = true;
-    // start calculating gradient
-    const unsigned nstep = static_cast<unsigned>(info.labels.size());
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      const unsigned j = i % nstep;
-      float p = loss.PredTransform(preds[i]);
-      float w = info.GetWeight(j);
-      if (info.labels[j] == 1.0f) w *= scale_pos_weight;
-      if (!loss.CheckLabel(info.labels[j])) label_correct = false;
-      gpair[i] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
-                           loss.SecondOrderGradient(p, info.labels[j]) * w);
-    }
-    utils::Check(label_correct, loss.CheckLabelErrorMsg());
-  }
-  virtual const char* DefaultEvalMetric(void) const {
-    return loss.DefaultEvalMetric();
-  }
-  virtual void PredTransform(std::vector<float> *io_preds) {
-    std::vector<float> &preds = *io_preds;
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint j = 0; j < ndata; ++j) {
-      preds[j] = loss.PredTransform(preds[j]);
-    }
-  }
-  virtual float ProbToMargin(float base_score) const {
-    return loss.ProbToMargin(base_score);
-  }
-
- protected:
-  float scale_pos_weight;
-  LossType loss;
-};
-
-// poisson regression for count
-class PoissonRegression : public IObjFunction {
- public:
-  PoissonRegression(void) {
-    max_delta_step = 0.0f;
-  }
-  virtual ~PoissonRegression(void) {}
-
-  virtual void SetParam(const char *name, const char *val) {
-    using namespace std;
-    if (!strcmp("max_delta_step", name)) {
-      max_delta_step = static_cast<float>(atof(val));
-    }
-  }
-  virtual void GetGradient(const std::vector<float> &preds,
-                           const MetaInfo &info,
-                           int iter,
-                           std::vector<bst_gpair> *out_gpair) {
-    utils::Check(max_delta_step != 0.0f,
-                 "PoissonRegression: need to set max_delta_step");
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Check(preds.size() == info.labels.size(),
-                 "labels are not correctly provided");
-    std::vector<bst_gpair> &gpair = *out_gpair;
-    gpair.resize(preds.size());
-    // check if label in range
-    bool label_correct = true;
-    // start calculating gradient
-    const long ndata = static_cast<bst_omp_uint>(preds.size()); // NOLINT(*)
-    #pragma omp parallel for schedule(static)
-    for (long i = 0; i < ndata; ++i) { // NOLINT(*)
-      float p = preds[i];
-      float w = info.GetWeight(i);
-      float y = info.labels[i];
-      if (y >= 0.0f) {
-        gpair[i] = bst_gpair((std::exp(p) - y) * w,
-                             std::exp(p + max_delta_step) * w);
-      } else {
-        label_correct = false;
-      }
-    }
-    utils::Check(label_correct,
-                 "PoissonRegression: label must be nonnegative");
-  }
-  virtual void PredTransform(std::vector<float> *io_preds) {
-    std::vector<float> &preds = *io_preds;
-    const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
-    #pragma omp parallel for schedule(static)
-    for (long j = 0; j < ndata; ++j) {  // NOLINT(*)
-      preds[j] = std::exp(preds[j]);
-    }
-  }
-  virtual void EvalTransform(std::vector<float> *io_preds) {
-    PredTransform(io_preds);
-  }
-  virtual float ProbToMargin(float base_score) const {
-    return std::log(base_score);
-  }
-  virtual const char* DefaultEvalMetric(void) const {
-    return "poisson-nloglik";
-  }
-
- private:
-  float max_delta_step;
-};
-
-// softmax multi-class classification
-class SoftmaxMultiClassObj : public IObjFunction {
- public:
-  explicit SoftmaxMultiClassObj(int output_prob)
-      : output_prob(output_prob) {
-    nclass = 0;
-  }
-  virtual ~SoftmaxMultiClassObj(void) {}
-  virtual void SetParam(const char *name, const char *val) {
-    using namespace std;
-    if (!strcmp( "num_class", name )) nclass = atoi(val);
-  }
-  virtual void GetGradient(const std::vector<float> &preds,
-                           const MetaInfo &info,
-                           int iter,
-                           std::vector<bst_gpair> *out_gpair) {
-    utils::Check(nclass != 0, "must set num_class to use softmax");
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Check(preds.size() % (static_cast<size_t>(nclass) * info.labels.size()) == 0,
-                 "SoftmaxMultiClassObj: label size and pred size does not match");
-    std::vector<bst_gpair> &gpair = *out_gpair;
-    gpair.resize(preds.size());
-    const unsigned nstep = static_cast<unsigned>(info.labels.size() * nclass);
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size() / nclass);
-    int label_error = 0;
-    #pragma omp parallel
-    {
-      std::vector<float> rec(nclass);
-      #pragma omp for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        for (int k = 0; k < nclass; ++k) {
-          rec[k] = preds[i * nclass + k];
-        }
-        Softmax(&rec);
-        const unsigned j = i % nstep;
-        int label = static_cast<int>(info.labels[j]);
-        if (label < 0 || label >= nclass)  {
-          label_error = label; label = 0;
-        }
-        const float wt = info.GetWeight(j);
-        for (int k = 0; k < nclass; ++k) {
-          float p = rec[k];
-          const float h = 2.0f * p * (1.0f - p) * wt;
-          if (label == k) {
-            gpair[i * nclass + k] = bst_gpair((p - 1.0f) * wt, h);
-          } else {
-            gpair[i * nclass + k] = bst_gpair(p* wt, h);
-          }
-        }
-      }
-    }
-    utils::Check(label_error >= 0 && label_error < nclass,
-                 "SoftmaxMultiClassObj: label must be in [0, num_class),"\
-                 " num_class=%d but found %d in label", nclass, label_error);
-  }
-  virtual void PredTransform(std::vector<float> *io_preds) {
-    this->Transform(io_preds, output_prob);
-  }
-  virtual void EvalTransform(std::vector<float> *io_preds) {
-    this->Transform(io_preds, 1);
-  }
-  virtual const char* DefaultEvalMetric(void) const {
-    return "merror";
-  }
-
- private:
-  inline void Transform(std::vector<float> *io_preds, int prob) {
-    utils::Check(nclass != 0, "must set num_class to use softmax");
-    std::vector<float> &preds = *io_preds;
-    std::vector<float> tmp;
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size()/nclass);
-    if (prob == 0) tmp.resize(ndata);
-    #pragma omp parallel
-    {
-      std::vector<float> rec(nclass);
-      #pragma omp for schedule(static)
-      for (bst_omp_uint j = 0; j < ndata; ++j) {
-        for (int k = 0; k < nclass; ++k) {
-          rec[k] = preds[j * nclass + k];
-        }
-        if (prob == 0) {
-          tmp[j] = static_cast<float>(FindMaxIndex(rec));
-        } else {
-          Softmax(&rec);
-          for (int k = 0; k < nclass; ++k) {
-            preds[j * nclass + k] = rec[k];
-          }
-        }
-      }
-    }
-    if (prob == 0) preds = tmp;
-  }
-  // data field
-  int nclass;
-  int output_prob;
-};
-
-/*! \brief objective for lambda rank */
-class LambdaRankObj : public IObjFunction {
- public:
-  LambdaRankObj(void) {
-    loss.loss_type = LossType::kLogisticRaw;
-    fix_list_weight = 0.0f;
-    num_pairsample = 1;
-  }
-  virtual ~LambdaRankObj(void) {}
-  virtual void SetParam(const char *name, const char *val) {
-    using namespace std;
-    if (!strcmp( "loss_type", name )) loss.loss_type = atoi(val);
-    if (!strcmp( "fix_list_weight", name)) fix_list_weight = static_cast<float>(atof(val));
-    if (!strcmp( "num_pairsample", name)) num_pairsample = atoi(val);
-  }
-  virtual void GetGradient(const std::vector<float> &preds,
-                           const MetaInfo &info,
-                           int iter,
-                           std::vector<bst_gpair> *out_gpair) {
-    utils::Check(preds.size() == info.labels.size(), "label size predict size not match");
-    std::vector<bst_gpair> &gpair = *out_gpair;
-    gpair.resize(preds.size());
-    // quick consistency when group is not available
-    std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(info.labels.size());
-    const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
-    utils::Check(gptr.size() != 0 && gptr.back() == info.labels.size(),
-                 "group structure not consistent with #rows");
-    const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
-    #pragma omp parallel
-    {
-      // parall construct, declare random number generator here, so that each
-      // thread use its own random number generator, seed by thread id and current iteration
-      random::Random rnd; rnd.Seed(iter* 1111 + omp_get_thread_num());
-      std::vector<LambdaPair> pairs;
-      std::vector<ListEntry>  lst;
-      std::vector< std::pair<float, unsigned> > rec;
-      #pragma omp for schedule(static)
-      for (bst_omp_uint k = 0; k < ngroup; ++k) {
-        lst.clear(); pairs.clear();
-        for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) {
-          lst.push_back(ListEntry(preds[j], info.labels[j], j));
-          gpair[j] = bst_gpair(0.0f, 0.0f);
-        }
-        std::sort(lst.begin(), lst.end(), ListEntry::CmpPred);
-        rec.resize(lst.size());
-        for (unsigned i = 0; i < lst.size(); ++i) {
-          rec[i] = std::make_pair(lst[i].label, i);
-        }
-        std::sort(rec.begin(), rec.end(), CmpFirst);
-        // enumerate buckets with same label, for each item in the lst, grab another sample randomly
-        for (unsigned i = 0; i < rec.size(); ) {
-          unsigned j = i + 1;
-          while (j < rec.size() && rec[j].first == rec[i].first) ++j;
-          // bucket in [i,j), get a sample outside bucket
-          unsigned nleft = i, nright = static_cast<unsigned>(rec.size() - j);
-          if (nleft + nright != 0) {
-            int nsample = num_pairsample;
-            while (nsample --) {
-              for (unsigned pid = i; pid < j; ++pid) {
-                unsigned ridx = static_cast<unsigned>(rnd.RandDouble() * (nleft+nright));
-                if (ridx < nleft) {
-                  pairs.push_back(LambdaPair(rec[ridx].second, rec[pid].second));
-                } else {
-                  pairs.push_back(LambdaPair(rec[pid].second, rec[ridx+j-i].second));
-                }
-              }
-            }
-          }
-          i = j;
-        }
-        // get lambda weight for the pairs
-        this->GetLambdaWeight(lst, &pairs);
-        // rescale each gradient and hessian so that the lst have constant weighted
-        float scale = 1.0f / num_pairsample;
-        if (fix_list_weight != 0.0f) {
-          scale *= fix_list_weight / (gptr[k+1] - gptr[k]);
-        }
-        for (size_t i = 0; i < pairs.size(); ++i) {
-          const ListEntry &pos = lst[pairs[i].pos_index];
-          const ListEntry &neg = lst[pairs[i].neg_index];
-          const float w = pairs[i].weight * scale;
-          float p = loss.PredTransform(pos.pred - neg.pred);
-          float g = loss.FirstOrderGradient(p, 1.0f);
-          float h = loss.SecondOrderGradient(p, 1.0f);
-          // accumulate gradient and hessian in both pid, and nid
-          gpair[pos.rindex].grad += g * w;
-          gpair[pos.rindex].hess += 2.0f * w * h;
-          gpair[neg.rindex].grad -= g * w;
-          gpair[neg.rindex].hess += 2.0f * w * h;
-        }
-      }
-    }
-  }
-  virtual const char* DefaultEvalMetric(void) const {
-    return "map";
-  }
-
- protected:
-  /*! \brief helper information in a list */
-  struct ListEntry {
-    /*! \brief the predict score we in the data */
-    float pred;
-    /*! \brief the actual label of the entry */
-    float label;
-    /*! \brief row index in the data matrix */
-    unsigned rindex;
-    // constructor
-    ListEntry(float pred, float label, unsigned rindex)
-        : pred(pred), label(label), rindex(rindex) {}
-    // comparator by prediction
-    inline static bool CmpPred(const ListEntry &a, const ListEntry &b) {
-      return a.pred > b.pred;
-    }
-    // comparator by label
-    inline static bool CmpLabel(const ListEntry &a, const ListEntry &b) {
-      return a.label > b.label;
-    }
-  };
-  /*! \brief a pair in the lambda rank */
-  struct LambdaPair {
-    /*! \brief positive index: this is a position in the list */
-    unsigned pos_index;
-    /*! \brief negative index: this is a position in the list */
-    unsigned neg_index;
-    /*! \brief weight to be filled in */
-    float weight;
-    // constructor
-    LambdaPair(unsigned pos_index, unsigned neg_index)
-        : pos_index(pos_index), neg_index(neg_index), weight(1.0f) {}
-  };
-  /*!
-   * \brief get lambda weight for existing pairs
-   * \param list a list that is sorted by pred score
-   * \param io_pairs record of pairs, containing the pairs to fill in weights
-   */
-  virtual void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
-                               std::vector<LambdaPair> *io_pairs) = 0;
-
- private:
-  // loss function
-  LossType loss;
-  // number of samples peformed for each instance
-  int num_pairsample;
-  // fix weight of each elements in list
-  float fix_list_weight;
-};
-
-class PairwiseRankObj: public LambdaRankObj{
- public:
-  virtual ~PairwiseRankObj(void) {}
-
- protected:
-  virtual void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
-                               std::vector<LambdaPair> *io_pairs) {}
-};
-
-// beta version: NDCG lambda rank
-class LambdaRankObjNDCG : public LambdaRankObj {
- public:
-  virtual ~LambdaRankObjNDCG(void) {}
-
- protected:
-  virtual void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
-                               std::vector<LambdaPair> *io_pairs) {
-    std::vector<LambdaPair> &pairs = *io_pairs;
-    float IDCG;
-    {
-      std::vector<float> labels(sorted_list.size());
-      for (size_t i = 0; i < sorted_list.size(); ++i) {
-        labels[i] = sorted_list[i].label;
-      }
-      std::sort(labels.begin(), labels.end(), std::greater<float>());
-      IDCG = CalcDCG(labels);
-    }
-    if (IDCG == 0.0) {
-      for (size_t i = 0; i < pairs.size(); ++i) {
-        pairs[i].weight = 0.0f;
-      }
-    } else {
-      IDCG = 1.0f / IDCG;
-      for (size_t i = 0; i < pairs.size(); ++i) {
-        unsigned pos_idx = pairs[i].pos_index;
-        unsigned neg_idx = pairs[i].neg_index;
-        float pos_loginv = 1.0f / std::log(pos_idx + 2.0f);
-        float neg_loginv = 1.0f / std::log(neg_idx + 2.0f);
-        int pos_label = static_cast<int>(sorted_list[pos_idx].label);
-        int neg_label = static_cast<int>(sorted_list[neg_idx].label);
-        float original =
-            ((1 << pos_label) - 1) * pos_loginv + ((1 << neg_label) - 1) * neg_loginv;
-        float changed  =
-            ((1 << neg_label) - 1) * pos_loginv + ((1 << pos_label) - 1) * neg_loginv;
-        float delta = (original - changed) * IDCG;
-        if (delta < 0.0f) delta = - delta;
-        pairs[i].weight = delta;
-      }
-    }
-  }
-  inline static float CalcDCG(const std::vector<float> &labels) {
-    double sumdcg = 0.0;
-    for (size_t i = 0; i < labels.size(); ++i) {
-      const unsigned rel = static_cast<unsigned>(labels[i]);
-      if (rel != 0) {
-        sumdcg += ((1 << rel) - 1) / std::log(static_cast<float>(i + 2));
-      }
-    }
-    return static_cast<float>(sumdcg);
-  }
-};
-
-class LambdaRankObjMAP : public LambdaRankObj {
- public:
-  virtual ~LambdaRankObjMAP(void) {}
-
- protected:
-  struct MAPStats {
-    /*! \brief the accumulated precision */
-    float ap_acc;
-    /*!
-     * \brief the accumulated precision,
-     *   assuming a positive instance is missing
-     */
-    float ap_acc_miss;
-    /*!
-     * \brief the accumulated precision,
-     * assuming that one more positive instance is inserted ahead
-     */
-    float ap_acc_add;
-    /* \brief the accumulated positive instance count */
-    float hits;
-    MAPStats(void) {}
-    MAPStats(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits)
-        : ap_acc(ap_acc), ap_acc_miss(ap_acc_miss), ap_acc_add(ap_acc_add), hits(hits) {}
-  };
-  /*!
-   * \brief Obtain the delta MAP if trying to switch the positions of instances in index1 or index2
-   *        in sorted triples
-   * \param sorted_list the list containing entry information
-   * \param index1,index2 the instances switched
-   * \param map_stats a vector containing the accumulated precisions for each position in a list
-   */
-  inline float GetLambdaMAP(const std::vector<ListEntry> &sorted_list,
-                            int index1, int index2,
-                            std::vector<MAPStats> *p_map_stats) {
-    std::vector<MAPStats> &map_stats = *p_map_stats;
-    if (index1 == index2 || map_stats[map_stats.size() - 1].hits == 0) {
-      return 0.0f;
-    }
-    if (index1 > index2) std::swap(index1, index2);
-    float original = map_stats[index2].ap_acc;
-    if (index1 != 0) original -= map_stats[index1 - 1].ap_acc;
-    float changed = 0;
-    float label1 = sorted_list[index1].label > 0.0f ? 1.0f : 0.0f;
-    float label2 = sorted_list[index2].label > 0.0f ? 1.0f : 0.0f;
-    if (label1 == label2) {
-      return 0.0;
-    } else if (label1 < label2) {
-      changed += map_stats[index2 - 1].ap_acc_add - map_stats[index1].ap_acc_add;
-      changed += (map_stats[index1].hits + 1.0f) / (index1 + 1);
-    } else {
-      changed += map_stats[index2 - 1].ap_acc_miss - map_stats[index1].ap_acc_miss;
-      changed += map_stats[index2].hits / (index2 + 1);
-    }
-    float ans = (changed - original) / (map_stats[map_stats.size() - 1].hits);
-    if (ans < 0) ans = -ans;
-    return ans;
-  }
-  /*
-   * \brief obtain preprocessing results for calculating delta MAP
-   * \param sorted_list the list containing entry information
-   * \param map_stats a vector containing the accumulated precisions for each position in a list
-   */
-  inline void GetMAPStats(const std::vector<ListEntry> &sorted_list,
-                          std::vector<MAPStats> *p_map_acc) {
-    std::vector<MAPStats> &map_acc = *p_map_acc;
-    map_acc.resize(sorted_list.size());
-    float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0;
-    for (size_t i = 1; i <= sorted_list.size(); ++i) {
-      if (sorted_list[i - 1].label > 0.0f) {
-        hit++;
-        acc1 += hit / i;
-        acc2 += (hit - 1) / i;
-        acc3 += (hit + 1) / i;
-      }
-      map_acc[i - 1] = MAPStats(acc1, acc2, acc3, hit);
-    }
-  }
-  virtual void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
-                               std::vector<LambdaPair> *io_pairs) {
-    std::vector<LambdaPair> &pairs = *io_pairs;
-    std::vector<MAPStats> map_stats;
-    GetMAPStats(sorted_list, &map_stats);
-    for (size_t i = 0; i < pairs.size(); ++i) {
-      pairs[i].weight =
-          GetLambdaMAP(sorted_list, pairs[i].pos_index,
-                       pairs[i].neg_index, &map_stats);
-    }
-  }
-};
-
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
diff --git a/src/learner/objective.h b/src/learner/objective.h
deleted file mode 100644
index 774286854..000000000
--- a/src/learner/objective.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file objective.h
- * \brief interface of objective function used for gradient boosting
- * \author Tianqi Chen, Kailong Chen
- */
-#ifndef XGBOOST_LEARNER_OBJECTIVE_H_
-#define XGBOOST_LEARNER_OBJECTIVE_H_
-
-#include <vector>
-#include "./dmatrix.h"
-
-namespace xgboost {
-namespace learner {
-/*! \brief interface of objective function */
-class IObjFunction{
- public:
-  /*! \brief virtual destructor */
-  virtual ~IObjFunction(void) {}
-  /*!
-   * \brief set parameters from outside
-   * \param name name of the parameter
-   * \param val value of the parameter
-   */
-  virtual void SetParam(const char *name, const char *val) = 0;
-  /*!
-   * \brief get gradient over each of predictions, given existing information
-   * \param preds prediction of current round
-   * \param info information about labels, weights, groups in rank
-   * \param iter current iteration number
-   * \param out_gpair output of get gradient, saves gradient and second order gradient in
-   */
-  virtual void GetGradient(const std::vector<float> &preds,
-                           const MetaInfo &info,
-                           int iter,
-                           std::vector<bst_gpair> *out_gpair) = 0;
-  /*! \return the default evaluation metric for the objective */
-  virtual const char* DefaultEvalMetric(void) const = 0;
-  // the following functions are optional, most of time default implementation is good enough
-  /*!
-   * \brief transform prediction values, this is only called when Prediction is called
-   * \param io_preds prediction values, saves to this vector as well
-   */
-  virtual void PredTransform(std::vector<float> *io_preds) {}
-  /*!
-   * \brief transform prediction values, this is only called when Eval is called,
-   *  usually it redirect to PredTransform
-   * \param io_preds prediction values, saves to this vector as well
-   */
-  virtual void EvalTransform(std::vector<float> *io_preds) {
-    this->PredTransform(io_preds);
-  }
-  /*!
-   * \brief transform probability value back to margin
-   * this is used to transform user-set base_score back to margin
-   * used by gradient boosting
-   * \return transformed value
-   */
-  virtual float ProbToMargin(float base_score) const {
-    return base_score;
-  }
-};
-}  // namespace learner
-}  // namespace xgboost
-
-// this are implementations of objective functions
-#include "objective-inl.hpp"
-// factory function
-namespace xgboost {
-namespace learner {
-/*! \brief factory function to create objective function by name */
-inline IObjFunction* CreateObjFunction(const char *name) {
-  using namespace std;
-  if (!strcmp("reg:linear", name)) return new RegLossObj(LossType::kLinearSquare);
-  if (!strcmp("reg:logistic", name)) return new RegLossObj(LossType::kLogisticNeglik);
-  if (!strcmp("binary:logistic", name)) return new RegLossObj(LossType::kLogisticClassify);
-  if (!strcmp("binary:logitraw", name)) return new RegLossObj(LossType::kLogisticRaw);
-  if (!strcmp("count:poisson", name)) return new PoissonRegression();
-  if (!strcmp("multi:softmax", name)) return new SoftmaxMultiClassObj(0);
-  if (!strcmp("multi:softprob", name)) return new SoftmaxMultiClassObj(1);
-  if (!strcmp("rank:pairwise", name )) return new PairwiseRankObj();
-  if (!strcmp("rank:ndcg", name)) return new LambdaRankObjNDCG();
-  if (!strcmp("rank:map", name)) return new LambdaRankObjMAP();
-  utils::Error("unknown objective function type: %s", name);
-  return NULL;
-}
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_OBJECTIVE_H_
diff --git a/src/logging.cc b/src/logging.cc
new file mode 100644
index 000000000..f579f9a97
--- /dev/null
+++ b/src/logging.cc
@@ -0,0 +1,23 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file logging.cc
+ * \brief Implementation of loggers.
+ * \author Tianqi Chen
+ */
+#include <xgboost/logging.h>
+#include <iostream>
+#include "./common/sync.h"
+
+namespace xgboost {
+
+#if XGBOOST_CUSTOMIZE_LOGGER == 0
+ConsoleLogger::~ConsoleLogger() {  // write out the buffered message on destruction
+  std::cout << log_stream_.str() << '\n' << std::flush;
+}
+
+TrackerLogger::~TrackerLogger() {
+  // hand the newline-terminated message to rabit::TrackerPrint
+  rabit::TrackerPrint(log_stream_.str() + '\n');
+}
+#endif
+}  // namespace xgboost
diff --git a/src/metric/elementwise_metric.cc b/src/metric/elementwise_metric.cc
new file mode 100644
index 000000000..e0086c9c9
--- /dev/null
+++ b/src/metric/elementwise_metric.cc
@@ -0,0 +1,130 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file elementwise_metric.cc
+ * \brief evaluation metrics for elementwise binary or regression.
+ * \author Kailong Chen, Tianqi Chen
+ */
+#include <xgboost/metric.h>
+#include <dmlc/registry.h>
+#include <cmath>
+#include "../common/math.h"
+#include "../common/sync.h"
+
+namespace xgboost {
+namespace metric {
+// tag this file, used to force static linking later.
+DMLC_REGISTRY_FILE_TAG(elementwise_metric);
+
+/*!
+ * \brief base class of element-wise evaluation
+ * \tparam Derived the name of subclass
+ */
+template<typename Derived>
+struct EvalEWiseBase : public Metric {
+  float Eval(const std::vector<float>& preds,
+             const MetaInfo& info,
+             bool distributed) const override {
+    CHECK_NE(info.labels.size(), 0) << "label set cannot be empty";
+    CHECK_EQ(preds.size(), info.labels.size())
+        << "label and prediction size not match, "
+        << "hint: use merror or mlogloss for multi-class classification";
+    const omp_ulong nrow = static_cast<omp_ulong>(info.labels.size());
+    double esum = 0.0, wsum = 0.0;  // weighted row scores / total weight
+    #pragma omp parallel for reduction(+: esum, wsum) schedule(static)
+    for (omp_ulong ridx = 0; ridx < nrow; ++ridx) {
+      const float weight = info.GetWeight(ridx);
+      esum += Derived::EvalRow(info.labels[ridx], preds[ridx]) * weight;
+      wsum += weight;
+    }
+    double stats[2] = {esum, wsum};
+    if (distributed) {
+      rabit::Allreduce<rabit::op::Sum>(stats, 2);
+    }
+    return Derived::GetFinal(stats[0], stats[1]);
+  }
+  /*!
+   * \brief score of a single row; only declared here,
+   *   Eval calls the definition found in Derived
+   * \param label label of current instance
+   * \param pred prediction value of current instance
+   */
+  inline static float EvalRow(float label, float pred);
+  /*!
+   * \brief final transformation, may be redefined by a subclass
+   * \param esum weighted sum of the values returned by EvalRow
+   * \param wsum sum of weight
+   */
+  inline static float GetFinal(float esum, float wsum) {
+    return esum / wsum;
+  }
+};
+
+struct EvalRMSE : public EvalEWiseBase<EvalRMSE> {
+  const char *Name() const override {
+    return "rmse";
+  }
+  inline static float EvalRow(float label, float pred) {
+    const float residual = label - pred;  // per-row squared error
+    return residual * residual;
+  }
+  inline static float GetFinal(float esum, float wsum) {
+    return std::sqrt(esum / wsum);  // root of the weighted mean
+  }
+};
+
+struct EvalLogLoss : public EvalEWiseBase<EvalLogLoss> {
+  const char *Name() const override {
+    return "logloss";
+  }
+  inline static float EvalRow(float y, float py) {
+    const float eps = 1e-16f;
+    // clamp both class probabilities away from zero before taking logs
+    float ppos = py, pneg = 1.0f - py;
+    if (ppos < eps) {
+      ppos = eps; pneg = 1.0f - eps;
+    } else if (pneg < eps) {
+      ppos = 1.0f - eps; pneg = eps;
+    }
+    return -y * std::log(ppos) - (1.0f - y) * std::log(pneg);
+  }
+};
+
+struct EvalError : public EvalEWiseBase<EvalError> {
+  const char *Name() const override {
+    return "error";
+  }
+  inline static float EvalRow(float label, float pred) {
+    if (pred > 0.5f) return 1.0f - label;  // prediction above the 0.5 cut
+    return label;  // otherwise; label is assumed to lie in [0,1]
+  }
+};
+
+struct EvalPoissionNegLogLik : public EvalEWiseBase<EvalPoissionNegLogLik> {
+  const char *Name() const override {
+    return "poisson-nloglik";
+  }
+  inline static float EvalRow(float y, float py) {
+    // floor the prediction at 1e-16 so that the logarithm stays finite
+    const float mean = py < 1e-16f ? 1e-16f : py;
+    return common::LogGamma(y + 1.0f) + mean - std::log(mean) * y;
+  }
+};
+
+XGBOOST_REGISTER_METRIC(RMSE, "rmse")
+.describe("Rooted mean square error.")
+.set_body([](const char* param) { return new EvalRMSE(); });  // param is not used
+
+XGBOOST_REGISTER_METRIC(LogLoss, "logloss")
+.describe("Negative loglikelihood for logistic regression.")
+.set_body([](const char* param) { return new EvalLogLoss(); });  // param is not used
+
+XGBOOST_REGISTER_METRIC(Error, "error")
+.describe("Binary classification error.")
+.set_body([](const char* param) { return new EvalError(); });  // param is not used
+
+XGBOOST_REGISTER_METRIC(PossionNegLoglik, "poisson-nloglik")
+.describe("Negative loglikelihood for poisson regression.")
+.set_body([](const char* param) { return new EvalPoissionNegLogLik(); });  // param is not used
+
+}  // namespace metric
+}  // namespace xgboost
diff --git a/src/metric/metric.cc b/src/metric/metric.cc
new file mode 100644
index 000000000..7986dec6b
--- /dev/null
+++ b/src/metric/metric.cc
@@ -0,0 +1,42 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file metric.cc
+ * \brief Registry of evaluation metrics.
+ */
+#include <xgboost/metric.h>
+#include <dmlc/registry.h>
+
+namespace dmlc {
+DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
+}
+
+namespace xgboost {
+Metric* Metric::Create(const std::string& name) {
+  // A metric is requested either as "name" or as "name@param"; the text
+  // after the first '@' is handed to the registered factory as its parameter.
+  const auto pos = name.find('@');
+  if (pos == std::string::npos) {
+    auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(name);
+    if (e == nullptr) {
+      LOG(FATAL) << "Unknown metric function " << name;
+    }
+    return (e->body)(nullptr);
+  } else {
+    const std::string prefix = name.substr(0, pos);
+    auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str());
+    if (e == nullptr) {
+      LOG(FATAL) << "Unknown metric function " << name;
+    }
+    return (e->body)(name.substr(pos + 1).c_str());
+  }
+}
+}  // namespace xgboost
+
+namespace xgboost {
+namespace metric {
+// Files whose registration code must be force-linked when linking statically.
+DMLC_REGISTRY_LINK_TAG(elementwise_metric);
+DMLC_REGISTRY_LINK_TAG(multiclass_metric);
+DMLC_REGISTRY_LINK_TAG(rank_metric);
+}  // namespace metric
+}  // namespace xgboost
diff --git a/src/metric/multiclass_metric.cc b/src/metric/multiclass_metric.cc
new file mode 100644
index 000000000..d51379c64
--- /dev/null
+++ b/src/metric/multiclass_metric.cc
@@ -0,0 +1,119 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file multiclass_metric.cc
+ * \brief evaluation metrics for multiclass classification.
+ * \author Kailong Chen, Tianqi Chen
+ */
+#include <xgboost/metric.h>
+#include <cmath>
+#include "../common/sync.h"
+#include "../common/math.h"
+
+namespace xgboost {
+namespace metric {
+// tag this file, used to force static linking later.
+DMLC_REGISTRY_FILE_TAG(multiclass_metric);
+
+/*!
+ * \brief base class of multi-class evaluation
+ * \tparam Derived the name of subclass
+ */
+template<typename Derived>
+struct EvalMClassBase : public Metric {
+  float Eval(const std::vector<float> &preds,
+             const MetaInfo &info,
+             bool distributed) const override {
+    CHECK_NE(info.labels.size(), 0) << "label set cannot be empty";
+    CHECK(preds.size() % info.labels.size() == 0)
+        << "label and prediction size not match";
+    const size_t nclass = preds.size() / info.labels.size();
+    CHECK_GE(nclass, 1)
+        << "mlogloss and merror are only used for multi-class classification,"
+        << " use logloss for binary classification";
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
+    double sum = 0.0, wsum = 0.0;  // double: float sums lose precision on large data
+    int label_error = 0;
+    #pragma omp parallel for reduction(+: sum, wsum) schedule(static)
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
+      const float wt = info.GetWeight(i);
+      int label = static_cast<int>(info.labels[i]);
+      if (label >= 0 && label < static_cast<int>(nclass)) {
+        sum += Derived::EvalRow(label,
+                                dmlc::BeginPtr(preds) + i * nclass,
+                                nclass) * wt;
+        wsum += wt;
+      } else {
+        label_error = label;  // remember an offending label for the check below
+      }
+    }
+    CHECK(label_error >= 0 && label_error < static_cast<int>(nclass))
+        << "MultiClassEvaluation: label must be in [0, num_class),"
+        << " num_class=" << nclass << " but found " << label_error << " in label";
+    // the two sums are reduced across workers only when distributed is set
+    double dat[2]; dat[0] = sum, dat[1] = wsum;
+    if (distributed) {
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+    }
+    return Derived::GetFinal(dat[0], dat[1]);
+  }
+  /*!
+   * \brief to be implemented by subclass,
+   *   get evaluation result from one row
+   * \param label label of current instance
+   * \param pred pointer to the nclass prediction values of current instance
+   * \param nclass number of class in the prediction
+   */
+  inline static float EvalRow(int label,
+                              const float *pred,
+                              size_t nclass);
+  /*!
+   * \brief to be overridden by subclass, final transformation
+   * \param esum the weighted sum of the values returned by EvalRow
+   * \param wsum sum of weight
+   */
+  inline static float GetFinal(float esum, float wsum) {
+    return esum / wsum;
+  }
+  // used to store error message (not referenced by the code in this struct)
+  const char *error_msg_;
+};
+
+/*! \brief match error */
+struct EvalMatchError : public EvalMClassBase<EvalMatchError> {
+  const char* Name() const override {
+    return "merror";
+  }
+  inline static float EvalRow(int label, const float *pred, size_t nclass) {
+    // 1 when the entry picked by FindMaxIndex is not the labelled one, else 0
+    const auto best = common::FindMaxIndex(pred, pred + nclass);
+    return best != pred + label;
+  }
+};
+
+/*! \brief multi-class negative log-likelihood */
+struct EvalMultiLogLoss : public EvalMClassBase<EvalMultiLogLoss> {
+  const char* Name() const override {
+    return "mlogloss";
+  }
+  inline static float EvalRow(int label,
+                              const float *pred,
+                              size_t nclass) {
+    const float eps = 1e-16f;  // floor that keeps the logarithm finite
+    // value predicted for the labelled class
+    const float prob = pred[static_cast<size_t>(label)];
+    if (prob > eps) {
+      return -std::log(prob);
+    }
+    return -std::log(eps);
+  }
+};
+
+XGBOOST_REGISTER_METRIC(MatchError, "merror")
+.describe("Multiclass classification error.")
+.set_body([](const char* param) { return new EvalMatchError(); });  // param is not used
+
+XGBOOST_REGISTER_METRIC(MultiLogLoss, "mlogloss")
+.describe("Multiclass negative loglikelihood.")
+.set_body([](const char* param) { return new EvalMultiLogLoss(); });  // param is not used
+}  // namespace metric
+}  // namespace xgboost
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
new file mode 100644
index 000000000..feb0f37ff
--- /dev/null
+++ b/src/metric/rank_metric.cc
@@ -0,0 +1,326 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file rank_metric.cc
+ * \brief prediction rank based metrics.
+ * \author Kailong Chen, Tianqi Chen
+ */
+#include <xgboost/metric.h>
+#include <dmlc/registry.h>
+#include <cmath>
+#include "../common/sync.h"
+#include "../common/math.h"
+
+namespace xgboost {
+namespace metric {
+// tag this file, used to force static linking later.
+DMLC_REGISTRY_FILE_TAG(rank_metric);
+
+/*! \brief AMS metric (ams@k); logs the best threshold ratio when every row is scanned */
+struct EvalAMS : public Metric {
+ public:
+  explicit EvalAMS(const char* param) {
+    CHECK(param != nullptr)
+        << "AMS must be in format ams@k";
+    ratio_ = atof(param);  // the k of ams@k, used below as a fraction of the rows
+    std::ostringstream os;
+    os << "ams@" << ratio_;
+    name_ = os.str();
+  }
+  float Eval(const std::vector<float> &preds,
+             const MetaInfo &info,
+             bool distributed) const override {
+    CHECK(!distributed) << "metric AMS do not support distributed evaluation";
+    using namespace std;  // NOLINT(*)
+
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
+    CHECK_EQ(info.weights.size(), ndata) << "we need weight to evaluate ams";
+    std::vector<std::pair<float, unsigned> > rec(ndata);  // (prediction, row index)
+
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
+      rec[i] = std::make_pair(preds[i], i);
+    }
+    std::sort(rec.begin(), rec.end(), common::CmpFirst);
+    unsigned ntop = static_cast<unsigned>(ratio_ * ndata);
+    if (ntop == 0) ntop = ndata;  // a zero cutoff means: scan every row
+    const double br = 10.0;  // constant added to b_fp inside the AMS formula
+    unsigned thresindex = 0;
+    double s_tp = 0.0, b_fp = 0.0, tams = 0.0;  // label>0.5 weight, other weight, best AMS
+    for (unsigned i = 0; i < static_cast<unsigned>(ndata-1) && i < ntop; ++i) {
+      const unsigned ridx = rec[i].second;
+      const float wt = info.weights[ridx];
+      if (info.labels[ridx] > 0.5f) {
+        s_tp += wt;
+      } else {
+        b_fp += wt;
+      }
+      if (rec[i].first != rec[i + 1].first) {  // only where the prediction value changes
+        double ams = sqrt(2 * ((s_tp + b_fp + br) * log(1.0 + s_tp / (b_fp + br)) - s_tp));
+        if (tams < ams) {
+          thresindex = i;
+          tams = ams;
+        }
+      }
+    }
+    if (ntop == ndata) {  // full scan: report the best AMS found
+      LOG(INFO) << "best-ams-ratio=" << static_cast<float>(thresindex) / ndata;
+      return static_cast<float>(tams);
+    } else {  // fixed cutoff: AMS of the weights accumulated by the loop above
+      return static_cast<float>(
+          sqrt(2 * ((s_tp + b_fp + br) * log(1.0 + s_tp/(b_fp + br)) - s_tp)));
+    }
+  }
+
+  const char* Name() const override {
+    return name_.c_str();
+  }
+
+ private:
+  std::string name_;  // "ams@" followed by the parsed ratio
+  float ratio_;  // value parsed from the constructor parameter
+};
+
+/*! \brief Area Under Curve, averaged over groups; for both classification and rank */
+struct EvalAuc : public Metric {
+  float Eval(const std::vector<float> &preds,
+             const MetaInfo &info,
+             bool distributed) const override {
+    CHECK_NE(info.labels.size(), 0) << "label set cannot be empty";
+    CHECK_EQ(preds.size(), info.labels.size())
+        << "label size predict size not match";
+    std::vector<unsigned> tgptr(2, 0);  // fallback: one group spanning all rows
+    tgptr[1] = static_cast<unsigned>(info.labels.size());
+
+    const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
+    CHECK_EQ(gptr.back(), info.labels.size())
+        << "EvalAuc: group structure must match number of prediction";
+    const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
+    // sum of the per-group AUC values
+    double sum_auc = 0.0f;
+    #pragma omp parallel reduction(+:sum_auc)
+    {
+      // each thread takes a local rec of (prediction, row index) pairs
+      std::vector< std::pair<float, unsigned> > rec;
+      #pragma omp for schedule(static)
+      for (bst_omp_uint k = 0; k < ngroup; ++k) {
+        rec.clear();
+        for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
+          rec.push_back(std::make_pair(preds[j], j));
+        }
+        std::sort(rec.begin(), rec.end(), common::CmpFirst);
+        // calculate AUC
+        double sum_pospair = 0.0;
+        double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
+        for (size_t j = 0; j < rec.size(); ++j) {
+          const float wt = info.GetWeight(rec[j].second);
+          const float ctr = info.labels[rec[j].second];
+          // a new prediction value starts a new bucket: flush the previous one
+          if (j != 0 && rec[j].first != rec[j - 1].first) {
+            sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
+            sum_npos += buf_pos;
+            sum_nneg += buf_neg;
+            buf_neg = buf_pos = 0.0f;
+          }
+          buf_pos += ctr * wt;
+          buf_neg += (1.0f - ctr) * wt;
+        }
+        sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);  // flush the last bucket
+        sum_npos += buf_pos;
+        sum_nneg += buf_neg;
+        // both the positive and the negative weight must be non-zero
+        CHECK(sum_npos > 0.0 && sum_nneg > 0.0)
+            << "AUC: the dataset only contains pos or neg samples";
+        // this is the AUC
+        sum_auc += sum_pospair / (sum_npos*sum_nneg);
+      }
+    }
+    if (distributed) {
+      float dat[2];
+      dat[0] = static_cast<float>(sum_auc);
+      dat[1] = static_cast<float>(ngroup);
+      // approximately estimate auc using mean
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+      return dat[0] / dat[1];
+    } else {
+      return static_cast<float>(sum_auc) / ngroup;
+    }
+  }
+  const char* Name() const override {
+    return "auc";
+  }
+};
+
+/*! \brief Evaluate rank list */
+struct EvalRankList : public Metric {
+ public:
+  float Eval(const std::vector<float> &preds,
+             const MetaInfo &info,
+             bool distributed) const override {
+    CHECK_EQ(preds.size(), info.labels.size())
+        << "label size predict size not match";
+    // quick consistency when group is not available
+    std::vector<unsigned> tgptr(2, 0);
+    tgptr[1] = static_cast<unsigned>(preds.size());
+    const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
+    CHECK_NE(gptr.size(), 0) << "must specify group when constructing rank file";
+    CHECK_EQ(gptr.back(), preds.size())
+        << "EvalRanklist: group structure must match number of prediction";
+    const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
+    // sum of the per-group metric values
+    double sum_metric = 0.0f;
+    #pragma omp parallel reduction(+:sum_metric)
+    {
+      // each thread takes a local rec of (prediction, integer label) pairs
+      std::vector< std::pair<float, unsigned> > rec;
+      #pragma omp for schedule(static)
+      for (bst_omp_uint k = 0; k < ngroup; ++k) {
+        rec.clear();
+        for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
+          rec.push_back(std::make_pair(preds[j], static_cast<int>(info.labels[j])));
+        }
+        sum_metric += this->EvalMetric(rec);
+      }
+    }
+    if (distributed) {
+      float dat[2];
+      dat[0] = static_cast<float>(sum_metric);
+      dat[1] = static_cast<float>(ngroup);
+      // approximately estimate the metric using mean
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+      return dat[0] / dat[1];
+    } else {
+      return static_cast<float>(sum_metric) / ngroup;
+    }
+  }
+  const char* Name() const override {
+    return name_.c_str();
+  }
+
+ protected:
+  /*! \brief param is the optional top-n spec ("10" or "10-"); nullptr means no limit */
+  explicit EvalRankList(const char* name, const char* param) {
+    using namespace std;  // NOLINT(*)
+    minus_ = false;
+    if (param != nullptr) {
+      std::ostringstream os;
+      os << name << '@' << param;
+      name_ = os.str();
+      if (sscanf(param, "%u[-]?", &topn_) != 1) {
+        topn_ = std::numeric_limits<unsigned>::max();
+      }
+      const size_t len = strlen(param);  // 0 for an empty spec such as "ndcg@"
+      minus_ = (len != 0 && param[len - 1] == '-');  // never index before the buffer
+    } else {
+      name_ = name;  // no parameter: Name() must still report the bare metric name
+      topn_ = std::numeric_limits<unsigned>::max();
+    }
+  }
+  /*! \return evaluation metric, given the pair_sort record, (pred,label) */
+  virtual float EvalMetric(std::vector<std::pair<float, unsigned> > &pair_sort) const = 0; // NOLINT(*)
+
+  unsigned topn_;  // number of leading entries to score; UINT_MAX when unset
+  std::string name_;  // metric name returned by Name()
+  bool minus_;  // true when the parameter ends with '-'
+};
+
+/*! \brief Precision at N, for both classification and rank */
+struct EvalPrecision : public EvalRankList{
+ public:
+  explicit EvalPrecision(const char *name) : EvalRankList("pre", name) {}
+
+ protected:
+  virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
+    // order the records with CmpFirst, then count non-zero labels in the first topn_
+    std::sort(rec.begin(), rec.end(), common::CmpFirst);
+    unsigned hits = 0;
+    for (size_t pos = 0; pos < rec.size() && pos < this->topn_; ++pos) {
+      if (rec[pos].second != 0) ++hits;
+    }
+    return static_cast<float>(hits) / topn_;
+  }
+};
+
+/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */
+struct EvalNDCG : public EvalRankList{
+ public:
+  explicit EvalNDCG(const char *name) : EvalRankList("ndcg", name) {}
+
+ protected:
+  /*! \return DCG over the first topn_ entries of rec, taken in their current order */
+  inline float CalcDCG(const std::vector<std::pair<float, unsigned> > &rec) const {
+    double gain = 0.0;
+    for (size_t pos = 0; pos < rec.size() && pos < this->topn_; ++pos) {
+      const unsigned rel = rec[pos].second;
+      // entries with zero relevance contribute nothing
+      if (rel == 0) continue;
+      gain += ((1 << rel) - 1) / std::log(pos + 2.0);
+    }
+    return static_cast<float>(gain);
+  }
+  virtual float EvalMetric(std::vector<std::pair<float, unsigned> > &rec) const { // NOLINT(*)
+    // DCG of the list sorted on the prediction (pair.first)
+    std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
+    const float dcg = this->CalcDCG(rec);
+    // re-sort on the label (pair.second) to obtain the normalising DCG
+    std::stable_sort(rec.begin(), rec.end(), common::CmpSecond);
+    const float idcg = this->CalcDCG(rec);
+    // zero normaliser: the score is decided by the '-' suffix flag
+    if (idcg == 0.0f) {
+      return minus_ ? 0.0f : 1.0f;
+    }
+    return dcg / idcg;
+  }
+};
+
+/*! \brief Mean Average Precision at N, for both classification and rank */
+struct EvalMAP : public EvalRankList {
+ public:
+  explicit EvalMAP(const char *name) : EvalRankList("map", name) {}
+
+ protected:
+  virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
+    // hits are counted over the whole list, precision only within the first topn_
+    std::sort(rec.begin(), rec.end(), common::CmpFirst);
+    unsigned hits = 0;
+    double precision_sum = 0.0;
+    for (size_t pos = 0; pos < rec.size(); ++pos) {
+      // entries labelled zero are skipped
+      if (rec[pos].second == 0) continue;
+      ++hits;
+      if (pos < this->topn_) {
+        precision_sum += static_cast<float>(hits) / (pos + 1);
+      }
+    }
+    if (hits == 0) {
+      // no entry with a non-zero label:
+      // the '-' suffix flag picks the score
+      if (minus_) return 0.0f;
+      return 1.0f;
+    }
+    // average over every hit, including those beyond topn_
+    precision_sum /= hits;
+    return static_cast<float>(precision_sum);
+  }
+};
+
+XGBOOST_REGISTER_METRIC(AMS, "ams")
+.describe("AMS metric for higgs.")
+.set_body([](const char* param) { return new EvalAMS(param); });  // param: the k of ams@k
+
+XGBOOST_REGISTER_METRIC(Auc, "auc")
+.describe("Area under curve for both classification and rank.")
+.set_body([](const char* param) { return new EvalAuc(); });  // param is not used
+
+XGBOOST_REGISTER_METRIC(Precision, "pre")
+.describe("precision@k for rank.")
+.set_body([](const char* param) { return new EvalPrecision(param); });  // param: the k of pre@k
+
+XGBOOST_REGISTER_METRIC(NDCG, "ndcg")
+.describe("ndcg@k for rank.")
+.set_body([](const char* param) { return new EvalNDCG(param); });  // param: the k of ndcg@k
+
+XGBOOST_REGISTER_METRIC(MAP, "map")
+.describe("map@k for rank.")
+.set_body([](const char* param) { return new EvalMAP(param); });  // param: the k of map@k
+}  // namespace metric
+}  // namespace xgboost
diff --git a/src/objective/multiclass_obj.cc b/src/objective/multiclass_obj.cc
new file mode 100644
index 000000000..42b9fa255
--- /dev/null
+++ b/src/objective/multiclass_obj.cc
@@ -0,0 +1,137 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file multiclass_obj.cc
+ * \brief Definition of multi-class classification objectives.
+ * \author Tianqi Chen
+ */
+#include <dmlc/omp.h>
+#include <dmlc/parameter.h>
+#include <xgboost/logging.h>
+#include <xgboost/objective.h>
+#include <vector>
+#include <algorithm>
+#include <utility>
+#include "../common/math.h"
+
+namespace xgboost {
+namespace obj {
+
+DMLC_REGISTRY_FILE_TAG(multiclass_obj);
+
+struct SoftmaxMultiClassParam : public dmlc::Parameter<SoftmaxMultiClassParam> {
+  int num_class;  // number of output classes; lower bound 1, no default declared
+  // register the field with dmlc together with its bound and description
+  DMLC_DECLARE_PARAMETER(SoftmaxMultiClassParam) {
+    DMLC_DECLARE_FIELD(num_class).set_lower_bound(1)
+        .describe("Number of output class in the multi-class classification.");
+  }
+};
+
+class SoftmaxMultiClassObj : public ObjFunction {
+ public:
+  explicit SoftmaxMultiClassObj(bool output_prob)
+      : output_prob_(output_prob) {
+  }
+  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
+    param_.InitAllowUnknown(args);
+  }
+  void GetGradient(const std::vector<float>& preds,
+                   const MetaInfo& info,
+                   int iter,
+                   std::vector<bst_gpair>* out_gpair) override {
+    CHECK_NE(info.labels.size(), 0) << "label set cannot be empty";
+    CHECK(preds.size() == (static_cast<size_t>(param_.num_class) * info.labels.size()))
+        << "SoftmaxMultiClassObj: label size and pred size does not match";
+    out_gpair->resize(preds.size());
+    const int nclass = param_.num_class;
+    const omp_ulong ndata = static_cast<omp_ulong>(preds.size() / nclass);
+
+    int label_error = 0;  // stays 0 unless an out-of-range label is seen
+    #pragma omp parallel
+    {
+      std::vector<float> rec(nclass);  // per-thread scratch: the scores of one row
+      #pragma omp for schedule(static)
+      for (omp_ulong i = 0; i < ndata; ++i) {
+        for (int k = 0; k < nclass; ++k) {
+          rec[k] = preds[i * nclass + k];
+        }
+        common::Softmax(&rec);
+        int label = static_cast<int>(info.labels[i]);
+        if (label < 0 || label >= nclass)  {
+          label_error = label; label = 0;  // record the bad label, carry on with class 0
+        }
+        const float wt = info.GetWeight(i);
+        for (int k = 0; k < nclass; ++k) {
+          float p = rec[k];
+          const float h = 2.0f * p * (1.0f - p) * wt;
+          if (label == k) {
+            out_gpair->at(i * nclass + k) = bst_gpair((p - 1.0f) * wt, h);
+          } else {
+            out_gpair->at(i * nclass + k) = bst_gpair(p* wt, h);
+          }
+        }
+      }
+    }
+    CHECK(label_error >= 0 && label_error < nclass)
+        << "SoftmaxMultiClassObj: label must be in [0, num_class),"
+        << " num_class=" << nclass
+        << " but found " << label_error << " in label.";
+  }
+  void PredTransform(std::vector<float>* io_preds) override {
+    this->Transform(io_preds, output_prob_);
+  }
+  void EvalTransform(std::vector<float>* io_preds) override {
+    this->Transform(io_preds, true);  // evaluation always works on probabilities
+  }
+  const char* DefaultEvalMetric() const override {
+    return "merror";
+  }
+
+ private:
+  inline void Transform(std::vector<float> *io_preds, bool prob) {
+    std::vector<float> &preds = *io_preds;
+    std::vector<float> tmp;
+    const int nclass = param_.num_class;
+    const omp_ulong ndata = static_cast<omp_ulong>(preds.size() / nclass);
+    if (!prob) tmp.resize(ndata);  // class-index output needs one value per row
+
+    #pragma omp parallel
+    {
+      std::vector<float> rec(nclass);  // per-thread scratch: the scores of one row
+      #pragma omp for schedule(static)
+      for (omp_ulong j = 0; j < ndata; ++j) {
+        for (int k = 0; k < nclass; ++k) {
+          rec[k] = preds[j * nclass + k];
+        }
+        if (!prob) {
+          tmp[j] = static_cast<float>(
+              common::FindMaxIndex(rec.begin(), rec.end()) - rec.begin());
+        } else {
+          common::Softmax(&rec);
+          for (int k = 0; k < nclass; ++k) {
+            preds[j * nclass + k] = rec[k];
+          }
+        }
+      }
+    }
+    if (!prob) preds = tmp;  // replace the scores with the per-row class indices
+  }
+  // whether PredTransform outputs probabilities instead of the class index
+  bool output_prob_;
+  // parsed parameters (num_class)
+  SoftmaxMultiClassParam param_;
+};
+
+// register the parameter struct and the objective functions
+DMLC_REGISTER_PARAMETER(SoftmaxMultiClassParam);
+
+XGBOOST_REGISTER_OBJECTIVE(SoftmaxMultiClass, "multi:softmax")
+.describe("Softmax for multi-class classification, output class index.")
+.set_body([]() { return new SoftmaxMultiClassObj(false); });
+
+XGBOOST_REGISTER_OBJECTIVE(SoftprobMultiClass, "multi:softprob")
+.describe("Softmax for multi-class classification, output probability distribution.")
+.set_body([]() { return new SoftmaxMultiClassObj(true); });
+
+}  // namespace obj
+}  // namespace xgboost
diff --git a/src/objective/objective.cc b/src/objective/objective.cc
new file mode 100644
index 000000000..413494d3d
--- /dev/null
+++ b/src/objective/objective.cc
@@ -0,0 +1,34 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file objective.cc
+ * \brief Registry of all objective functions.
+ */
+#include <xgboost/objective.h>
+#include <dmlc/registry.h>
+
+namespace dmlc {
+DMLC_REGISTRY_ENABLE(::xgboost::ObjFunctionReg);
+}  // namespace dmlc
+
+namespace xgboost {
+// implement factory functions
+ObjFunction* ObjFunction::Create(const std::string& name) {  // registry-backed factory
+  auto *found = ::dmlc::Registry< ::xgboost::ObjFunctionReg>::Get()->Find(name);
+  if (found == nullptr) {
+    for (const auto& candidate : ::dmlc::Registry< ::xgboost::ObjFunctionReg>::List()) {
+      LOG(INFO) << "Objective candidate: " << candidate->name;  // list what is registered
+    }
+    LOG(FATAL) << "Unknown objective function " << name;
+  }
+  return (found->body)();
+}
+}  // namespace xgboost
+
+namespace xgboost {
+namespace obj {
+// Files whose registration code must be force-linked when linking statically.
+DMLC_REGISTRY_LINK_TAG(regression_obj);
+DMLC_REGISTRY_LINK_TAG(multiclass_obj);
+DMLC_REGISTRY_LINK_TAG(rank_obj);
+}  // namespace obj
+}  // namespace xgboost
diff --git a/src/objective/rank_obj.cc b/src/objective/rank_obj.cc
new file mode 100644
index 000000000..faa17c322
--- /dev/null
+++ b/src/objective/rank_obj.cc
@@ -0,0 +1,328 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file rank_obj.cc
+ * \brief Definition of rank loss.
+ * \author Tianqi Chen, Kailong Chen
+ */
+#include <dmlc/omp.h>
+#include <xgboost/logging.h>
+#include <xgboost/objective.h>
+#include <vector>
+#include <algorithm>
+#include <utility>
+#include "../common/math.h"
+#include "../common/random.h"
+
+namespace xgboost {
+namespace obj {
+
+DMLC_REGISTRY_FILE_TAG(rank_obj);
+
+struct LambdaRankParam : public dmlc::Parameter<LambdaRankParam> {
+  int num_pairsample;  // pairs sampled per instance; declared with lower bound 1, default 1
+  float fix_list_weight;  // per-list weight normalizer; declared >= 0, default 0 (no effect)
+  // declare parameters: bounds, defaults and help text of the two fields above
+  DMLC_DECLARE_PARAMETER(LambdaRankParam) {
+    DMLC_DECLARE_FIELD(num_pairsample).set_lower_bound(1).set_default(1)
+        .describe("Number of pair generated for each instance.");
+    DMLC_DECLARE_FIELD(fix_list_weight).set_lower_bound(0.0f).set_default(0.0f)
+        .describe("Normalize the weight of each list by this value,"
+                  " if equals 0, no effect will happen");
+  }
+};
+
+// Base class of the lambda rank objectives; subclasses supply the pair weights.
+class LambdaRankObj : public ObjFunction {
+ public:
+  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
+    param_.InitAllowUnknown(args);
+  }
+  void GetGradient(const std::vector<float>& preds,
+                   const MetaInfo& info,
+                   int iter,
+                   std::vector<bst_gpair>* out_gpair) override {
+    CHECK_EQ(preds.size(), info.labels.size()) << "label size predict size not match";
+    std::vector<bst_gpair>& gpair = *out_gpair;
+    gpair.resize(preds.size());
+    // when no group information is available, treat all rows as one single group
+    std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(info.labels.size());
+    const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
+    CHECK(gptr.size() != 0 && gptr.back() == info.labels.size())
+        << "group structure not consistent with #rows";
+    const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
+    #pragma omp parallel
+    {
+      // parallel construct: declare the random number generator here, so that each
+      // thread uses its own generator, seeded by thread id and current iteration
+      common::RandomEngine rnd(iter * 1111 + omp_get_thread_num());
+
+      std::vector<LambdaPair> pairs;
+      std::vector<ListEntry>  lst;
+      std::vector< std::pair<float, unsigned> > rec;
+      #pragma omp for schedule(static)
+      for (bst_omp_uint k = 0; k < ngroup; ++k) {
+        lst.clear(); pairs.clear();
+        for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) {
+          lst.push_back(ListEntry(preds[j], info.labels[j], j));
+          gpair[j] = bst_gpair(0.0f, 0.0f);  // reset, pairs accumulate into it below
+        }
+        std::sort(lst.begin(), lst.end(), ListEntry::CmpPred);  // descending prediction
+        rec.resize(lst.size());
+        for (unsigned i = 0; i < lst.size(); ++i) {
+          rec[i] = std::make_pair(lst[i].label, i);  // (label, position in lst)
+        }
+        std::sort(rec.begin(), rec.end(), common::CmpFirst);  // orders by label, see CmpFirst
+        // enumerate buckets with same label, for each item in the lst, grab another sample randomly
+        for (unsigned i = 0; i < rec.size(); ) {
+          unsigned j = i + 1;
+          while (j < rec.size() && rec[j].first == rec[i].first) ++j;
+          // bucket is [i,j); nleft/nright count the items of rec before/after the bucket
+          unsigned nleft = i, nright = static_cast<unsigned>(rec.size() - j);
+          if (nleft + nright != 0) {
+            int nsample = param_.num_pairsample;
+            while (nsample --) {
+              for (unsigned pid = i; pid < j; ++pid) {
+                unsigned ridx = std::uniform_int_distribution<unsigned>(0, nleft + nright - 1)(rnd);
+                if (ridx < nleft) {  // partner precedes the bucket: it fills the pos slot
+                  pairs.push_back(LambdaPair(rec[ridx].second, rec[pid].second));
+                } else {  // partner follows the bucket: ridx+j-i skips over [i,j)
+                  pairs.push_back(LambdaPair(rec[pid].second, rec[ridx+j-i].second));
+                }
+              }
+            }
+          }
+          i = j;
+        }
+        // get lambda weight for the pairs
+        this->GetLambdaWeight(lst, &pairs);
+        // rescale by the number of sampled pairs and, if requested, by the list size
+        float scale = 1.0f / param_.num_pairsample;
+        if (param_.fix_list_weight != 0.0f) {
+          scale *= param_.fix_list_weight / (gptr[k + 1] - gptr[k]);
+        }
+        for (size_t i = 0; i < pairs.size(); ++i) {
+          const ListEntry &pos = lst[pairs[i].pos_index];
+          const ListEntry &neg = lst[pairs[i].neg_index];
+          const float w = pairs[i].weight * scale;
+          const float eps = 1e-16f;  // floor of the hessian term below
+          float p = common::Sigmoid(pos.pred - neg.pred);
+          float g = p - 1.0f;
+          float h = std::max(p * (1.0f - p), eps);
+          // accumulate gradient and hessian in both the positive and the negative row
+          gpair[pos.rindex].grad += g * w;
+          gpair[pos.rindex].hess += 2.0f * w * h;
+          gpair[neg.rindex].grad -= g * w;
+          gpair[neg.rindex].hess += 2.0f * w * h;
+        }
+      }
+    }
+  }
+  const char* DefaultEvalMetric(void) const override {
+    return "map";
+  }
+
+ protected:
+  /*! \brief helper information in a list */
+  struct ListEntry {
+    /*! \brief the predicted score of the entry */
+    float pred;
+    /*! \brief the actual label of the entry */
+    float label;
+    /*! \brief row index in the data matrix */
+    unsigned rindex;
+    // constructor
+    ListEntry(float pred, float label, unsigned rindex)
+        : pred(pred), label(label), rindex(rindex) {}
+    // comparator by prediction, larger prediction first
+    inline static bool CmpPred(const ListEntry &a, const ListEntry &b) {
+      return a.pred > b.pred;
+    }
+    // comparator by label, larger label first
+    inline static bool CmpLabel(const ListEntry &a, const ListEntry &b) {
+      return a.label > b.label;
+    }
+  };
+  /*! \brief a pair in the lambda rank */
+  struct LambdaPair {
+    /*! \brief positive index: this is a position in the list */
+    unsigned pos_index;
+    /*! \brief negative index: this is a position in the list */
+    unsigned neg_index;
+    /*! \brief weight to be filled in */
+    float weight;
+    // constructor, the weight starts at 1.0f
+    LambdaPair(unsigned pos_index, unsigned neg_index)
+        : pos_index(pos_index), neg_index(neg_index), weight(1.0f) {}
+  };
+  /*!
+   * \brief get lambda weight for existing pairs
+   * \param sorted_list a list that is sorted by pred score
+   * \param io_pairs record of pairs, containing the pairs to fill in weights
+   */
+  virtual void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
+                               std::vector<LambdaPair> *io_pairs) = 0;
+
+ private:
+  LambdaRankParam param_;
+};
+
+class PairwiseRankObj: public LambdaRankObj{  // pairwise rank: leaves the pair weights as is
+ protected:
+  void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
+                       std::vector<LambdaPair> *io_pairs) override {}  // intentionally empty
+};
+
+// beta version: NDCG lambda rank, weights each pair by the |NDCG change| of swapping it
+class LambdaRankObjNDCG : public LambdaRankObj {
+ protected:
+  void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
+                       std::vector<LambdaPair> *io_pairs) override {
+    std::vector<LambdaPair> &pairs = *io_pairs;
+    float IDCG;  // DCG of the labels in descending order, computed in the scope below
+    {
+      std::vector<float> labels(sorted_list.size());
+      for (size_t i = 0; i < sorted_list.size(); ++i) {
+        labels[i] = sorted_list[i].label;
+      }
+      std::sort(labels.begin(), labels.end(), std::greater<float>());
+      IDCG = CalcDCG(labels);
+    }
+    if (IDCG == 0.0) {  // CalcDCG summed nothing: give every pair zero weight
+      for (size_t i = 0; i < pairs.size(); ++i) {
+        pairs[i].weight = 0.0f;
+      }
+    } else {
+      IDCG = 1.0f / IDCG;  // from here on IDCG holds the reciprocal
+      for (size_t i = 0; i < pairs.size(); ++i) {
+        unsigned pos_idx = pairs[i].pos_index;
+        unsigned neg_idx = pairs[i].neg_index;
+        float pos_loginv = 1.0f / std::log(pos_idx + 2.0f);  // discount of the pos position
+        float neg_loginv = 1.0f / std::log(neg_idx + 2.0f);  // discount of the neg position
+        // NOTE(review): 1 << label below assumes small non-negative integer labels — confirm
+        int pos_label = static_cast<int>(sorted_list[pos_idx].label);
+        int neg_label = static_cast<int>(sorted_list[neg_idx].label);
+        float original =
+            ((1 << pos_label) - 1) * pos_loginv + ((1 << neg_label) - 1) * neg_loginv;
+        float changed  =
+            ((1 << neg_label) - 1) * pos_loginv + ((1 << pos_label) - 1) * neg_loginv;
+        float delta = (original - changed) * IDCG;  // DCG change of the swap, normalized
+        if (delta < 0.0f) delta = - delta;
+        pairs[i].weight = delta;
+      }
+    }
+  }
+  inline static float CalcDCG(const std::vector<float> &labels) {
+    double sumdcg = 0.0;  // sums (2^rel - 1) / log(i + 2) over the entries, in double
+    for (size_t i = 0; i < labels.size(); ++i) {
+      const unsigned rel = static_cast<unsigned>(labels[i]);
+      if (rel != 0) {  // entries whose label truncates to 0 contribute nothing
+        sumdcg += ((1 << rel) - 1) / std::log(static_cast<float>(i + 2));
+      }
+    }
+    return static_cast<float>(sumdcg);
+  }
+};
+
+class LambdaRankObjMAP : public LambdaRankObj {  // weights each pair by its |delta MAP|
+ protected:
+  struct MAPStats {
+    /*! \brief the accumulated precision */
+    float ap_acc;
+    /*!
+     * \brief the accumulated precision,
+     *   assuming a positive instance is missing
+     */
+    float ap_acc_miss;
+    /*!
+     * \brief the accumulated precision,
+     * assuming that one more positive instance is inserted ahead
+     */
+    float ap_acc_add;
+    /*! \brief the accumulated positive instance count */
+    float hits;
+    MAPStats(void) {}  // leaves all fields uninitialized
+    MAPStats(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits)
+        : ap_acc(ap_acc), ap_acc_miss(ap_acc_miss), ap_acc_add(ap_acc_add), hits(hits) {}
+  };
+  /*!
+   * \brief Obtain the absolute delta MAP of switching the positions of the instances
+   *        at index1 and index2 in the sorted list
+   * \param sorted_list the list containing entry information
+   * \param index1,index2 the positions of the instances switched
+   * \param p_map_stats the accumulated precisions for each position in a list
+   */
+  inline float GetLambdaMAP(const std::vector<ListEntry> &sorted_list,
+                            int index1, int index2,
+                            std::vector<MAPStats> *p_map_stats) {
+    std::vector<MAPStats> &map_stats = *p_map_stats;
+    if (index1 == index2 || map_stats[map_stats.size() - 1].hits == 0) {
+      return 0.0f;  // same position, or the whole list holds no positive instance
+    }
+    if (index1 > index2) std::swap(index1, index2);  // from here on index1 < index2
+    float original = map_stats[index2].ap_acc;  // precision accumulated over [index1, index2]
+    if (index1 != 0) original -= map_stats[index1 - 1].ap_acc;
+    float changed = 0;  // the same range, accumulated as if the two were switched
+    float label1 = sorted_list[index1].label > 0.0f ? 1.0f : 0.0f;  // binarized labels
+    float label2 = sorted_list[index2].label > 0.0f ? 1.0f : 0.0f;
+    if (label1 == label2) {
+      return 0.0;
+    } else if (label1 < label2) {  // the positive instance moves up to index1
+      changed += map_stats[index2 - 1].ap_acc_add - map_stats[index1].ap_acc_add;
+      changed += (map_stats[index1].hits + 1.0f) / (index1 + 1);
+    } else {  // the positive instance moves down to index2
+      changed += map_stats[index2 - 1].ap_acc_miss - map_stats[index1].ap_acc_miss;
+      changed += map_stats[index2].hits / (index2 + 1);
+    }
+    float ans = (changed - original) / (map_stats[map_stats.size() - 1].hits);
+    if (ans < 0) ans = -ans;
+    return ans;
+  }
+  /*!
+   * \brief obtain preprocessing results for calculating delta MAP
+   * \param sorted_list the list containing entry information
+   * \param p_map_acc output, the accumulated precisions for each position in a list
+   */
+  inline void GetMAPStats(const std::vector<ListEntry> &sorted_list,
+                          std::vector<MAPStats> *p_map_acc) {
+    std::vector<MAPStats> &map_acc = *p_map_acc;
+    map_acc.resize(sorted_list.size());
+    float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0;  // hits, ap_acc, ap_acc_miss, ap_acc_add
+    for (size_t i = 1; i <= sorted_list.size(); ++i) {  // i is the 1-based position
+      if (sorted_list[i - 1].label > 0.0f) {
+        hit++;
+        acc1 += hit / i;
+        acc2 += (hit - 1) / i;
+        acc3 += (hit + 1) / i;
+      }
+      map_acc[i - 1] = MAPStats(acc1, acc2, acc3, hit);
+    }
+  }
+  void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
+                       std::vector<LambdaPair> *io_pairs) override {
+    std::vector<LambdaPair> &pairs = *io_pairs;
+    std::vector<MAPStats> map_stats;
+    GetMAPStats(sorted_list, &map_stats);  // computed once, shared by all pairs
+    for (size_t i = 0; i < pairs.size(); ++i) {
+      pairs[i].weight =
+          GetLambdaMAP(sorted_list, pairs[i].pos_index,
+                       pairs[i].neg_index, &map_stats);
+    }
+  }
+};
+
+// register the parameter struct and the three ranking objective functions
+DMLC_REGISTER_PARAMETER(LambdaRankParam);
+
+XGBOOST_REGISTER_OBJECTIVE(PairwiseRankObj, "rank:pairwise")
+.describe("Pairwise rank objective.")
+.set_body([]() { return new PairwiseRankObj(); });
+
+XGBOOST_REGISTER_OBJECTIVE(LambdaRankNDCG, "rank:ndcg")
+.describe("LambdaRank with NDCG as objective.")
+.set_body([]() { return new LambdaRankObjNDCG(); });
+
+XGBOOST_REGISTER_OBJECTIVE(LambdaRankObjMAP, "rank:map")
+.describe("LambdaRank with MAP as objective.")
+.set_body([]() { return new LambdaRankObjMAP(); });
+
+}  // namespace obj
+}  // namespace xgboost
diff --git a/src/objective/regression_obj.cc b/src/objective/regression_obj.cc
new file mode 100644
index 000000000..6eb0f0a78
--- /dev/null
+++ b/src/objective/regression_obj.cc
@@ -0,0 +1,221 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file regression_obj.cc
+ * \brief Definition of single-value regression and classification objectives.
+ * \author Tianqi Chen, Kailong Chen
+ */
+#include <dmlc/omp.h>
+#include <xgboost/logging.h>
+#include <xgboost/objective.h>
+#include <vector>
+#include <algorithm>
+#include <utility>
+#include "../common/math.h"
+
+namespace xgboost {
+namespace obj {
+
+DMLC_REGISTRY_FILE_TAG(regression_obj);
+
+// common regression losses; each struct offers the static functions RegLossObj<Loss> calls
+// linear regression: identity transform, gradient predt - label, constant hessian 1
+struct LinearSquareLoss {
+  static float PredTransform(float x) { return x; }  // identity
+  static bool CheckLabel(float x) { return true; }  // every label is accepted
+  static float FirstOrderGradient(float predt, float label) { return predt - label; }
+  static float SecondOrderGradient(float predt, float label) { return 1.0f; }
+  static float ProbToMargin(float base_score) { return base_score; }  // identity
+  static const char* LabelErrorMsg() { return ""; }  // empty, CheckLabel never fails
+  static const char* DefaultEvalMetric() { return "rmse"; }
+};
+// logistic loss for probability regression task, labels must lie in [0,1]
+struct LogisticRegression {
+  static float PredTransform(float x) { return common::Sigmoid(x); }
+  static bool CheckLabel(float x) { return x >= 0.0f && x <= 1.0f; }
+  static float FirstOrderGradient(float predt, float label) { return predt - label; }
+  static float SecondOrderGradient(float predt, float label) {
+    const float eps = 1e-16f;  // lower bound of the returned value
+    return std::max(predt * (1.0f - predt), eps);
+  }
+  static float ProbToMargin(float base_score) {
+    CHECK(base_score > 0.0f && base_score < 1.0f)
+        << "base_score must be in (0,1) for logistic loss";
+    return -std::log(1.0f / base_score - 1.0f);  // equals log(p / (1 - p))
+  }
+  static const char* LabelErrorMsg() {
+    return "label must be in [0,1] for logistic regression";
+  }
+  static const char* DefaultEvalMetric() { return "rmse"; }
+};
+// logistic loss for binary classification task; only the default metric differs.
+struct LogisticClassification : public LogisticRegression {
+  static const char* DefaultEvalMetric() { return "error"; }  // hides the base "rmse"
+};
+// logistic loss on raw margins: predictions are output without the sigmoid transform
+struct LogisticRaw : public LogisticRegression {
+  static float PredTransform(float x) { return x; }
+  static float FirstOrderGradient(float predt, float label) {
+    const float prob = common::Sigmoid(predt);  // margin -> probability
+    return prob - label;
+  }
+  static float SecondOrderGradient(float predt, float label) {
+    const float prob = common::Sigmoid(predt);  // margin -> probability
+    const float min_hess = 1e-16f;
+    return std::max(prob * (1.0f - prob), min_hess);
+  }
+  static const char* DefaultEvalMetric() { return "auc"; }
+};
+
+struct RegLossParam : public dmlc::Parameter<RegLossParam> {
+  float scale_pos_weight;  // weight multiplier of positive examples; >= 0, default 1
+  // declare parameters: bound, default and help text of the field above
+  DMLC_DECLARE_PARAMETER(RegLossParam) {
+    DMLC_DECLARE_FIELD(scale_pos_weight).set_default(1.0f).set_lower_bound(0.0f)
+        .describe("Scale the weight of positive examples by this factor");
+  }
+};
+
+// regression loss objective; Loss supplies the static transform/gradient/label functions
+template<typename Loss>
+class RegLossObj : public ObjFunction {
+ public:
+  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
+    param_.InitAllowUnknown(args);
+  }
+  void GetGradient(const std::vector<float> &preds,
+                   const MetaInfo &info,
+                   int iter,
+                   std::vector<bst_gpair> *out_gpair) override {
+    CHECK_NE(info.labels.size(), 0) << "label set cannot be empty";
+    CHECK_EQ(preds.size(), info.labels.size())
+        << "labels are not correctly provided, "
+        << "preds.size=" << preds.size() << ", label.size=" << info.labels.size();
+    out_gpair->resize(preds.size());
+    // becomes false when any label fails Loss::CheckLabel; reduced over the threads
+    bool label_correct = true;
+    // start calculating gradient
+    const omp_ulong ndata = static_cast<omp_ulong>(preds.size());
+    #pragma omp parallel for schedule(static) reduction(&&:label_correct)
+    for (omp_ulong i = 0; i < ndata; ++i) {
+      float p = Loss::PredTransform(preds[i]);
+      float w = info.GetWeight(i);
+      if (info.labels[i] == 1.0f) w *= param_.scale_pos_weight;  // re-weight positives
+      if (!Loss::CheckLabel(info.labels[i])) label_correct = false;
+      out_gpair->at(i) = bst_gpair(Loss::FirstOrderGradient(p, info.labels[i]) * w,
+                                   Loss::SecondOrderGradient(p, info.labels[i]) * w);
+    }
+    if (!label_correct) {  // report once, after the parallel loop has finished
+      LOG(FATAL) << Loss::LabelErrorMsg();
+    }
+  }
+  const char* DefaultEvalMetric() const override {
+    return Loss::DefaultEvalMetric();
+  }
+  void PredTransform(std::vector<float> *io_preds) override {
+    std::vector<float> &preds = *io_preds;
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint j = 0; j < ndata; ++j) {
+      preds[j] = Loss::PredTransform(preds[j]);  // transformed in place
+    }
+  }
+  float ProbToMargin(float base_score) const override {
+    return Loss::ProbToMargin(base_score);
+  }
+
+ protected:
+  RegLossParam param_;
+};
+
+// register the parameter struct and the RegLossObj based objective functions
+DMLC_REGISTER_PARAMETER(RegLossParam);
+
+XGBOOST_REGISTER_OBJECTIVE(LinearRegression, "reg:linear")
+.describe("Linear regression.")
+.set_body([]() { return new RegLossObj<LinearSquareLoss>(); });
+
+XGBOOST_REGISTER_OBJECTIVE(LogisticRegression, "reg:logistic")
+.describe("Logistic regression for probability regression task.")
+.set_body([]() { return new RegLossObj<LogisticRegression>(); });
+
+XGBOOST_REGISTER_OBJECTIVE(LogisticClassification, "binary:logistic")
+.describe("Logistic regression for binary classification task.")
+.set_body([]() { return new RegLossObj<LogisticClassification>(); });
+
+XGBOOST_REGISTER_OBJECTIVE(LogisticRaw, "binary:logitraw")
+.describe("Logistic regression for classification, output score before logistic transformation")
+.set_body([]() { return new RegLossObj<LogisticRaw>(); });
+
+// parameter of the Poisson regression objective
+struct PoissonRegressionParam : public dmlc::Parameter<PoissonRegressionParam> {
+  float max_delta_step;  // declared with lower bound 0 and without a default value
+  DMLC_DECLARE_PARAMETER(PoissonRegressionParam) {
+    DMLC_DECLARE_FIELD(max_delta_step).set_lower_bound(0.0f)
+        .describe("Maximum delta step we allow each weight estimation to be." \
+                  " This parameter is required for Poisson regression.");
+  }
+};
+
+// Poisson regression for count data: predictions are log-rates, labels must be >= 0
+class PoissonRegression : public ObjFunction {
+ public:
+  // declare functions
+  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
+    param_.InitAllowUnknown(args);
+  }
+
+  void GetGradient(const std::vector<float> &preds,
+                   const MetaInfo &info,
+                   int iter,
+                   std::vector<bst_gpair> *out_gpair) override {
+    CHECK_NE(info.labels.size(), 0) << "label set cannot be empty";
+    CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided";
+    out_gpair->resize(preds.size());
+    // becomes false when any label is negative; reduced over the threads
+    bool label_correct = true;
+    // start calculating gradient
+    const omp_ulong ndata = static_cast<omp_ulong>(preds.size()); // NOLINT(*)
+    #pragma omp parallel for schedule(static) reduction(&&:label_correct)
+    for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*)
+      float p = preds[i];
+      float w = info.GetWeight(i);
+      float y = info.labels[i];
+      if (y >= 0.0f) {  // gradient exp(p) - y, hessian exp(p + max_delta_step)
+        out_gpair->at(i) = bst_gpair((std::exp(p) - y) * w,
+                                     std::exp(p + param_.max_delta_step) * w);
+      } else {  // invalid label: entry i is left untouched, the CHECK below fails
+        label_correct = false;
+      }
+    }
+    CHECK(label_correct) << "PoissonRegression: label must be nonnegative";
+  }
+  void PredTransform(std::vector<float> *io_preds) override {
+    std::vector<float> &preds = *io_preds;
+    const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
+    #pragma omp parallel for schedule(static)
+    for (long j = 0; j < ndata; ++j) {  // NOLINT(*)
+      preds[j] = std::exp(preds[j]);  // transformed in place
+    }
+  }
+  void EvalTransform(std::vector<float> *io_preds) override {
+    PredTransform(io_preds);  // evaluation uses the same exp transform
+  }
+  float ProbToMargin(float base_score) const override {
+    return std::log(base_score);  // inverse of the exp in PredTransform
+  }
+  const char* DefaultEvalMetric(void) const override {
+    return "poisson-nloglik";
+  }
+
+ private:
+  PoissonRegressionParam param_;
+};
+
+// register the parameter struct and the Poisson objective function
+DMLC_REGISTER_PARAMETER(PoissonRegressionParam);
+
+XGBOOST_REGISTER_OBJECTIVE(PoissonRegression, "count:poisson")
+.describe("Poisson regression for count data.")
+.set_body([]() { return new PoissonRegression(); });
+}  // namespace obj
+}  // namespace xgboost
diff --git a/src/sync/sync.h b/src/sync/sync.h
deleted file mode 100644
index b9bdf89fe..000000000
--- a/src/sync/sync.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file sync.h
- * \brief the synchronization module of rabit
- *        redirects to subtree rabit header
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_SYNC_SYNC_H_
-#define XGBOOST_SYNC_SYNC_H_
-
-#include "../../subtree/rabit/include/rabit.h"
-#include "../../subtree/rabit/include/rabit/timer.h"
-#endif  // XGBOOST_SYNC_SYNC_H_
diff --git a/src/tree/param.h b/src/tree/param.h
index 364e3572d..b6ac89aef 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -1,7 +1,7 @@
 /*!
  * Copyright 2014 by Contributors
  * \file param.h
- * \brief training parameters, statistics used to support tree construction
+ * \brief training parameters, statistics used to support tree construction.
  * \author Tianqi Chen
  */
 #ifndef XGBOOST_TREE_PARAM_H_
@@ -9,13 +9,12 @@
 
 #include <vector>
 #include <cstring>
-#include "../data.h"
 
 namespace xgboost {
 namespace tree {
 
 /*! \brief training parameters for regression tree */
-struct TrainParam{
+struct TrainParam : public dmlc::Parameter<TrainParam> {
   // learning step size for a time
   float learning_rate;
   // minimum loss change required for a split
@@ -52,67 +51,63 @@ struct TrainParam{
   // option for parallelization
   int parallel_option;
   // option to open cacheline optimization
-  int cache_opt;
+  bool cache_opt;
   // number of threads to be used for tree construction,
   // if OpenMP is enabled, if equals 0, use system default
   int nthread;
-  /*! \brief constructor */
-  TrainParam(void) {
-    learning_rate = 0.3f;
-    min_split_loss = 0.0f;
-    min_child_weight = 1.0f;
-    max_delta_step = 0.0f;
-    max_depth = 6;
-    reg_lambda = 1.0f;
-    reg_alpha = 0.0f;
-    default_direction = 0;
-    subsample = 1.0f;
-    colsample_bytree = 1.0f;
-    colsample_bylevel = 1.0f;
-    opt_dense_col = 1.0f;
-    nthread = 0;
-    size_leaf_vector = 0;
-    // enforce parallel option to 0 for now, investigate the other strategy
-    parallel_option = 0;
-    sketch_eps = 0.1f;
-    sketch_ratio = 2.0f;
-    cache_opt = 1;
-  }
-  /*!
-   * \brief set parameters from outside
-   * \param name name of the parameter
-   * \param val  value of the parameter
-   */
-  inline void SetParam(const char *name, const char *val) {
-    using namespace std;
-    // sync-names
-    if (!strcmp(name, "gamma")) min_split_loss = static_cast<float>(atof(val));
-    if (!strcmp(name, "eta")) learning_rate = static_cast<float>(atof(val));
-    if (!strcmp(name, "lambda")) reg_lambda = static_cast<float>(atof(val));
-    if (!strcmp(name, "alpha")) reg_alpha = static_cast<float>(atof(val));
-    if (!strcmp(name, "learning_rate")) learning_rate = static_cast<float>(atof(val));
-    if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val));
-    if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val));
-    if (!strcmp(name, "max_delta_step")) max_delta_step = static_cast<float>(atof(val));
-    if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val));
-    if (!strcmp(name, "reg_alpha")) reg_alpha = static_cast<float>(atof(val));
-    if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
-    if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
-    if (!strcmp(name, "colsample_bytree")) colsample_bytree  = static_cast<float>(atof(val));
-    if (!strcmp(name, "sketch_eps")) sketch_eps  = static_cast<float>(atof(val));
-    if (!strcmp(name, "sketch_ratio")) sketch_ratio  = static_cast<float>(atof(val));
-    if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
-    if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
-    if (!strcmp(name, "cache_opt")) cache_opt = atoi(val);
-    if (!strcmp(name, "max_depth")) max_depth = atoi(val);
-    if (!strcmp(name, "nthread")) nthread = atoi(val);
-    if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
-    if (!strcmp(name, "default_direction")) {
-      if (!strcmp(val, "learn")) default_direction = 0;
-      if (!strcmp(val, "left")) default_direction = 1;
-      if (!strcmp(val, "right")) default_direction = 2;
-    }
+  // whether to not print info during training.
+  bool silent;
+  // declare the parameters
+  DMLC_DECLARE_PARAMETER(TrainParam) {
+    DMLC_DECLARE_FIELD(learning_rate).set_lower_bound(0.0f).set_default(0.3f)
+        .describe("Learning rate(step size) of update.");
+    DMLC_DECLARE_FIELD(min_split_loss).set_lower_bound(0.0f).set_default(0.0f)
+        .describe("Minimum loss reduction required to make a further partition.");
+    DMLC_DECLARE_FIELD(max_depth).set_lower_bound(0).set_default(6)
+        .describe("Maximum depth of the tree.");
+    DMLC_DECLARE_FIELD(min_child_weight).set_lower_bound(0.0f).set_default(1.0f)
+        .describe("Minimum sum of instance weight(hessian) needed in a child.");
+    DMLC_DECLARE_FIELD(reg_lambda).set_lower_bound(0.0f).set_default(1.0f)
+        .describe("L2 regularization on leaf weight");
+    DMLC_DECLARE_FIELD(reg_alpha).set_lower_bound(0.0f).set_default(0.0f)
+        .describe("L1 regularization on leaf weight");
+    DMLC_DECLARE_FIELD(default_direction).set_default(0)
+        .add_enum("learn", 0)
+        .add_enum("left", 1)
+        .add_enum("right", 2)
+        .describe("Default direction choice when encountering a missing value");
+    DMLC_DECLARE_FIELD(max_delta_step).set_lower_bound(0.0f).set_default(0.0f)
+        .describe("Maximum delta step we allow each tree's weight estimate to be. "\
+                  "If the value is set to 0, it means there is no constraint");
+    DMLC_DECLARE_FIELD(subsample).set_range(0.0f, 1.0f).set_default(1.0f)
+        .describe("Row subsample ratio of training instance.");
+    DMLC_DECLARE_FIELD(colsample_bylevel).set_range(0.0f, 1.0f).set_default(1.0f)
+        .describe("Subsample ratio of columns, resample on each level.");
+    DMLC_DECLARE_FIELD(colsample_bytree).set_range(0.0f, 1.0f).set_default(1.0f)
+        .describe("Subsample ratio of columns, resample on each tree construction.");
+    DMLC_DECLARE_FIELD(opt_dense_col).set_range(0.0f, 1.0f).set_default(1.0f)
+        .describe("EXP Param: speed optimization for dense column.");
+    DMLC_DECLARE_FIELD(sketch_eps).set_range(0.0f, 1.0f).set_default(0.1f)
+        .describe("EXP Param: Sketch accuracy of approximate algorithm.");
+    DMLC_DECLARE_FIELD(sketch_ratio).set_lower_bound(0.0f).set_default(2.0f)
+        .describe("EXP Param: Sketch accuracy related parameter of approximate algorithm.");
+    DMLC_DECLARE_FIELD(size_leaf_vector).set_lower_bound(0).set_default(0)
+        .describe("Size of leaf vectors, reserved for vector trees");
+    DMLC_DECLARE_FIELD(parallel_option).set_default(0)
+        .describe("Different types of parallelization algorithm.");
+    DMLC_DECLARE_FIELD(cache_opt).set_default(true)
+        .describe("EXP Param: Cache aware optimization.");
+    DMLC_DECLARE_FIELD(nthread).set_default(0)
+        .describe("Number of threads used for training.");
+    DMLC_DECLARE_FIELD(silent).set_default(false)
+        .describe("Not print information during trainig.");
+    // add alias of parameters
+    DMLC_DECLARE_ALIAS(reg_lambda, lambda);
+    DMLC_DECLARE_ALIAS(reg_alpha, alpha);
+    DMLC_DECLARE_ALIAS(min_split_loss, gamma);
+    DMLC_DECLARE_ALIAS(learning_rate, eta);
   }
+
   // calculate the cost of loss function
   inline double CalcGain(double sum_grad, double sum_hess) const {
     if (sum_hess < min_child_weight) return 0.0;
@@ -176,9 +171,9 @@ struct TrainParam{
     return sum_hess < this->min_child_weight * 2.0;
   }
   /*! \brief maximum sketch size */
-  inline unsigned max_sketch_size(void) const {
+  inline unsigned max_sketch_size() const {
     unsigned ret = static_cast<unsigned>(sketch_ratio / sketch_eps);
-    utils::Check(ret > 0, "sketch_ratio/sketch_eps must be bigger than 1");
+    CHECK_GT(ret, 0);
     return ret;
   }
 
@@ -206,15 +201,15 @@ struct GradStats {
    */
   static const int kSimpleStats = 1;
   /*! \brief constructor, the object must be cleared during construction */
-  explicit GradStats(const TrainParam &param) {
+  explicit GradStats(const TrainParam& param) {
     this->Clear();
   }
   /*! \brief clear the statistics */
-  inline void Clear(void) {
+  inline void Clear() {
     sum_grad = sum_hess = 0.0f;
   }
   /*! \brief check if necessary information is ready */
-  inline static void CheckInfo(const BoosterInfo &info) {
+  inline static void CheckInfo(const MetaInfo& info) {
   }
   /*!
    * \brief accumulate statistics
@@ -229,130 +224,53 @@ struct GradStats {
    * \param info the additional information
    * \param ridx instance index of this instance
    */
-  inline void Add(const std::vector<bst_gpair> &gpair,
-                  const BoosterInfo &info,
+  inline void Add(const std::vector<bst_gpair>& gpair,
+                  const MetaInfo& info,
                   bst_uint ridx) {
-    const bst_gpair &b = gpair[ridx];
+    const bst_gpair& b = gpair[ridx];
     this->Add(b.grad, b.hess);
   }
   /*! \brief calculate leaf weight */
-  inline double CalcWeight(const TrainParam &param) const {
+  inline double CalcWeight(const TrainParam& param) const {
     return param.CalcWeight(sum_grad, sum_hess);
   }
   /*! \brief calculate gain of the solution */
-  inline double CalcGain(const TrainParam &param) const {
+  inline double CalcGain(const TrainParam& param) const {
     return param.CalcGain(sum_grad, sum_hess);
   }
   /*! \brief add statistics to the data */
-  inline void Add(const GradStats &b) {
+  inline void Add(const GradStats& b) {
     this->Add(b.sum_grad, b.sum_hess);
   }
   /*! \brief same as add, reduce is used in All Reduce */
-  inline static void Reduce(GradStats &a, const GradStats &b) { // NOLINT(*)
+  inline static void Reduce(GradStats& a, const GradStats& b) { // NOLINT(*)
     a.Add(b);
   }
   /*! \brief set current value to a - b */
-  inline void SetSubstract(const GradStats &a, const GradStats &b) {
+  inline void SetSubstract(const GradStats& a, const GradStats& b) {
     sum_grad = a.sum_grad - b.sum_grad;
     sum_hess = a.sum_hess - b.sum_hess;
   }
   /*! \return whether the statistics is not used yet */
-  inline bool Empty(void) const {
+  inline bool Empty() const {
     return sum_hess == 0.0;
   }
   /*! \brief set leaf vector value based on statistics */
-  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
+  inline void SetLeafVec(const TrainParam& param, bst_float *vec) const {
   }
   // constructor to allow inheritance
-  GradStats(void) {}
+  GradStats() {}
   /*! \brief add statistics to the data */
   inline void Add(double grad, double hess) {
     sum_grad += grad; sum_hess += hess;
   }
 };
 
-/*! \brief vectorized cv statistics */
-template<unsigned vsize>
-struct CVGradStats : public GradStats {
-  // additional statistics
-  GradStats train[vsize], valid[vsize];
-  // constructor
-  explicit CVGradStats(const TrainParam &param) {
-    utils::Check(param.size_leaf_vector == vsize,
-                 "CVGradStats: vsize must match size_leaf_vector");
-    this->Clear();
-  }
-  /*! \brief check if necessary information is ready */
-  inline static void CheckInfo(const BoosterInfo &info) {
-    utils::Check(info.fold_index.size() != 0,
-                 "CVGradStats: require fold_index");
-  }
-  /*! \brief clear the statistics */
-  inline void Clear(void) {
-    GradStats::Clear();
-    for (unsigned i = 0; i < vsize; ++i) {
-      train[i].Clear(); valid[i].Clear();
-    }
-  }
-  inline void Add(const std::vector<bst_gpair> &gpair,
-                  const BoosterInfo &info,
-                  bst_uint ridx) {
-    GradStats::Add(gpair[ridx].grad, gpair[ridx].hess);
-    const size_t step = info.fold_index.size();
-    for (unsigned i = 0; i < vsize; ++i) {
-      const bst_gpair &b = gpair[(i + 1) * step + ridx];
-      if (info.fold_index[ridx] == i) {
-        valid[i].Add(b.grad, b.hess);
-      } else {
-        train[i].Add(b.grad, b.hess);
-      }
-    }
-  }
-  /*! \brief calculate gain of the solution */
-  inline double CalcGain(const TrainParam &param) const {
-    double ret = 0.0;
-    for (unsigned i = 0; i < vsize; ++i) {
-      ret += param.CalcGain(train[i].sum_grad,
-                            train[i].sum_hess,
-                            vsize * valid[i].sum_grad,
-                            vsize * valid[i].sum_hess);
-    }
-    return ret / vsize;
-  }
-  /*! \brief add statistics to the data */
-  inline void Add(const CVGradStats &b) {
-    GradStats::Add(b);
-    for (unsigned i = 0; i < vsize; ++i) {
-      train[i].Add(b.train[i]);
-      valid[i].Add(b.valid[i]);
-    }
-  }
-  /*! \brief same as add, reduce is used in All Reduce */
-  inline static void Reduce(CVGradStats &a, const CVGradStats &b) { // NOLINT(*)
-    a.Add(b);
-  }
-  /*! \brief set current value to a - b */
-  inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
-    GradStats::SetSubstract(a, b);
-    for (int i = 0; i < vsize; ++i) {
-      train[i].SetSubstract(a.train[i], b.train[i]);
-      valid[i].SetSubstract(a.valid[i], b.valid[i]);
-    }
-  }
-  /*! \brief set leaf vector value based on statistics */
-  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
-    for (int i = 0; i < vsize; ++i) {
-      vec[i] = param.learning_rate *
-          param.CalcWeight(train[i].sum_grad, train[i].sum_hess);
-    }
-  }
-};
-
 /*!
  * \brief statistics that is helpful to store
  *   and represent a split solution for the tree
  */
-struct SplitEntry{
+struct SplitEntry {
   /*! \brief loss change after split this node */
   bst_float loss_chg;
   /*! \brief split index */
@@ -360,7 +278,7 @@ struct SplitEntry{
   /*! \brief split value */
   float split_value;
   /*! \brief constructor */
-  SplitEntry(void) : loss_chg(0.0f), sindex(0), split_value(0.0f) {}
+  SplitEntry() : loss_chg(0.0f), sindex(0), split_value(0.0f) {}
   /*!
    * \brief decides whether we can replace current entry with the given statistics
    *   This function gives better priority to lower index when loss_chg == new_loss_chg.
@@ -380,7 +298,7 @@ struct SplitEntry{
    * \param e candidate split solution
    * \return whether the proposed split is better and can replace current split
    */
-  inline bool Update(const SplitEntry &e) {
+  inline bool Update(const SplitEntry& e) {
     if (this->NeedReplace(e.loss_chg, e.split_index())) {
       this->loss_chg = e.loss_chg;
       this->sindex = e.sindex;
@@ -411,15 +329,15 @@ struct SplitEntry{
     }
   }
   /*! \brief same as update, used by AllReduce*/
-  inline static void Reduce(SplitEntry &dst, const SplitEntry &src) { // NOLINT(*)
+  inline static void Reduce(SplitEntry& dst, const SplitEntry& src) { // NOLINT(*)
     dst.Update(src);
   }
   /*!\return feature index to split on */
-  inline unsigned split_index(void) const {
+  inline unsigned split_index() const {
     return sindex & ((1U << 31) - 1U);
   }
   /*!\return whether missing value goes to left branch */
-  inline bool default_left(void) const {
+  inline bool default_left() const {
     return (sindex >> 31) != 0;
   }
 };
diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc
new file mode 100644
index 000000000..06fb0055b
--- /dev/null
+++ b/src/tree/tree_model.cc
@@ -0,0 +1,84 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file tree_model.cc
+ * \brief model structure for tree
+ */
+#include <xgboost/tree_model.h>
+#include <sstream>
+#include "./param.h"
+
+namespace xgboost {
+// register tree parameter
+DMLC_REGISTER_PARAMETER(TreeParam);
+
+namespace tree {
+DMLC_REGISTER_PARAMETER(TrainParam);
+}
+// Internal helper: recursively dump the subtree rooted at node nid as text,
+// one node per line, prefixed with one tab per depth level. Feature names and
+// types come from fmap; with_stats appends cover (plus gain on split nodes).
+void DumpRegTree2Text(std::stringstream& fo,  // NOLINT(*)
+                      const RegTree& tree,
+                      const FeatureMap& fmap,
+                      int nid, int depth, bool with_stats) {
+  for (int i = 0; i < depth; ++i) fo << '\t';
+  if (tree[nid].is_leaf()) {
+    fo << nid << ":leaf=" << tree[nid].leaf_value();
+    if (with_stats) fo << ",cover=" << tree.stat(nid).sum_hess;
+    fo << '\n';
+  } else {
+    // split node: print the split condition, then recurse left and right
+    bst_float cond = tree[nid].split_cond();
+    const unsigned split_index = tree[nid].split_index();
+    if (split_index < fmap.size()) {
+      switch (fmap.type(split_index)) {
+        case FeatureMap::kIndicator: {
+          int nyes = tree[nid].default_left() ?
+              tree[nid].cright() : tree[nid].cleft();
+          fo << nid << ":[" << fmap.name(split_index) << "] yes=" << nyes
+             << ",no=" << tree[nid].cdefault();
+          break;
+        }
+        case FeatureMap::kInteger: {
+          // integer feature: the threshold is printed as int(cond + 1)
+          fo << nid << ":[" << fmap.name(split_index) << "<"
+             << static_cast<int>(static_cast<float>(cond) + 1.0f)
+             << "] yes=" << tree[nid].cleft()
+             << ",no=" << tree[nid].cright()
+             << ",missing=" << tree[nid].cdefault();
+          break;
+        }
+        case FeatureMap::kFloat:
+        case FeatureMap::kQuantitive: {
+          fo << nid << ":[" << fmap.name(split_index) << "<" << static_cast<float>(cond)
+             << "] yes=" << tree[nid].cleft()
+             << ",no=" << tree[nid].cright()
+             << ",missing=" << tree[nid].cdefault();
+          break;
+        }
+        default: LOG(FATAL) << "unknown fmap type";
+      }
+    } else {
+      // split index is outside fmap: fall back to the generic name f<index>
+      fo << nid << ":[f" << split_index << "<" << static_cast<float>(cond)
+         << "] yes=" << tree[nid].cleft()
+         << ",no=" << tree[nid].cright()
+         << ",missing=" << tree[nid].cdefault();
+    }
+    if (with_stats) {
+      fo << ",gain=" << tree.stat(nid).loss_chg << ",cover=" << tree.stat(nid).sum_hess;
+    }
+    fo << '\n';
+    DumpRegTree2Text(fo, tree, fmap, tree[nid].cleft(), depth + 1, with_stats);
+    DumpRegTree2Text(fo, tree, fmap, tree[nid].cright(), depth + 1, with_stats);
+  }
+}
+
+std::string RegTree::Dump2Text(const FeatureMap& fmap, bool with_stats) const {
+  std::stringstream text("");  // one buffer shared by all roots
+  const int n_roots = param.num_roots;
+  for (int root = 0; root < n_roots; ++root)  // each root is dumped at depth 0
+    DumpRegTree2Text(text, *this, fmap, root, 0, with_stats);
+  return text.str();
+}
+}  // namespace xgboost
diff --git a/src/tree/tree_updater.cc b/src/tree/tree_updater.cc
new file mode 100644
index 000000000..ca04a2c84
--- /dev/null
+++ b/src/tree/tree_updater.cc
@@ -0,0 +1,35 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file tree_updater.cc
+ * \brief Registry of tree updaters.
+ */
+#include <xgboost/tree_updater.h>
+#include <dmlc/registry.h>
+
+namespace dmlc {
+DMLC_REGISTRY_ENABLE(::xgboost::TreeUpdaterReg);
+}  // namespace dmlc
+
+namespace xgboost {
+
+TreeUpdater* TreeUpdater::Create(const std::string& name) {
+  // look the name up in the dmlc registry; a miss goes to LOG(FATAL)
+  using UpdaterRegistry = ::dmlc::Registry< ::xgboost::TreeUpdaterReg>;
+  auto *entry = UpdaterRegistry::Get()->Find(name);
+  if (entry == nullptr) LOG(FATAL) << "Unknown tree updater " << name;
+  // call the registered body and hand its result back to the caller
+  return (entry->body)();
+}
+
+}  // namespace xgboost
+
+namespace xgboost {
+namespace tree {
+// List of files that will be force linked in static links.
+DMLC_REGISTRY_LINK_TAG(updater_colmaker);
+DMLC_REGISTRY_LINK_TAG(updater_skmaker);
+DMLC_REGISTRY_LINK_TAG(updater_refresh);
+DMLC_REGISTRY_LINK_TAG(updater_prune);
+DMLC_REGISTRY_LINK_TAG(updater_histmaker);
+DMLC_REGISTRY_LINK_TAG(updater_sync);
+}  // namespace tree
+}  // namespace xgboost
diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
deleted file mode 100644
index eb2e06925..000000000
--- a/src/tree/updater.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright 2014 by Contributors
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <cstring>
-#include "./updater.h"
-#include "./updater_prune-inl.hpp"
-#include "./updater_refresh-inl.hpp"
-#include "./updater_colmaker-inl.hpp"
-#ifndef XGBOOST_STRICT_CXX98_
-#include "./updater_sync-inl.hpp"
-#include "./updater_distcol-inl.hpp"
-#include "./updater_histmaker-inl.hpp"
-#include "./updater_skmaker-inl.hpp"
-#endif
-
-namespace xgboost {
-namespace tree {
-IUpdater* CreateUpdater(const char *name) {
-  using namespace std;
-  if (!strcmp(name, "prune")) return new TreePruner();
-  if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
-  if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
-#ifndef XGBOOST_STRICT_CXX98_
-  if (!strcmp(name, "sync")) return new TreeSyncher();
-  if (!strcmp(name, "grow_histmaker")) return new CQHistMaker<GradStats>();
-  if (!strcmp(name, "grow_skmaker")) return new SketchMaker();
-  if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
-#endif
-  utils::Error("unknown updater:%s", name);
-  return NULL;
-}
-
-}  // namespace tree
-}  // namespace xgboost
diff --git a/src/tree/updater.h b/src/tree/updater.h
deleted file mode 100644
index ff4da5e98..000000000
--- a/src/tree/updater.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater.h
- * \brief interface to update the tree
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_H_
-#define XGBOOST_TREE_UPDATER_H_
-
-#include <vector>
-
-#include "../data.h"
-#include "./model.h"
-
-namespace xgboost {
-namespace tree {
-/*!
- * \brief interface of tree update module, that performs update of a tree
- */
-class IUpdater {
- public:
-  /*!
-   * \brief set parameters from outside
-   * \param name name of the parameter
-   * \param val  value of the parameter
-   */
-  virtual void SetParam(const char *name, const char *val) = 0;
-  /*!
-   * \brief perform update to the tree models
-   * \param gpair the gradient pair statistics of the data
-   * \param p_fmat feature matrix that provide access to features
-   * \param info extra side information that may be need, such as root index
-   * \param trees references the trees to be updated, updater will change the content of trees
-   *   note: all the trees in the vector are updated, with the same statistics,
-   *         but maybe different random seeds, usually one tree is passed in at a time,
-   *         there can be multiple trees when we train random forest style model
-   */
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) = 0;
-
-  /*!
-   * \brief this is simply a function for optimizing performance
-   * this function asks the updater to return the leaf position of each instance in the p_fmat,
-   * if it is cached in the updater, if it is not available, return NULL
-   * \return array of leaf position of each instance in the last updated tree
-   */
-  virtual const int* GetLeafPosition(void) const {
-    return NULL;
-  }
-  // destructor
-  virtual ~IUpdater(void) {}
-};
-/*!
- * \brief create an updater based on name
- * \param name name of updater
- * \return return the updater instance
- */
-IUpdater* CreateUpdater(const char *name);
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_H_
diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.h
similarity index 82%
rename from src/tree/updater_basemaker-inl.hpp
rename to src/tree/updater_basemaker-inl.h
index 6204c47b7..25faaae4e 100644
--- a/src/tree/updater_basemaker-inl.hpp
+++ b/src/tree/updater_basemaker-inl.h
@@ -1,18 +1,24 @@
 /*!
  * Copyright 2014 by Contributors
- * \file updater_basemaker-inl.hpp
+ * \file updater_basemaker-inl.h
  * \brief implement a common tree constructor
  * \author Tianqi Chen
  */
-#ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
-#define XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
+#ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_H_
+#define XGBOOST_TREE_UPDATER_BASEMAKER_INL_H_
+
+#include <xgboost/base.h>
+#include <xgboost/tree_updater.h>
 #include <vector>
 #include <algorithm>
 #include <string>
 #include <limits>
-#include "../sync/sync.h"
-#include "../utils/random.h"
-#include "../utils/quantile.h"
+#include <utility>
+#include "./param.h"
+#include "../common/sync.h"
+#include "../common/io.h"
+#include "../common/random.h"
+#include "../common/quantile.h"
 
 namespace xgboost {
 namespace tree {
@@ -20,13 +26,10 @@ namespace tree {
  * \brief base tree maker class that defines common operation
  *  needed in tree making
  */
-class BaseMaker: public IUpdater {
+class BaseMaker: public TreeUpdater {
  public:
-  // destructor
-  virtual ~BaseMaker(void) {}
-  // set training parameter
-  virtual void SetParam(const char *name, const char *val) {
-    param.SetParam(name, val);
+  void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
+    param.InitAllowUnknown(args);
   }
 
  protected:
@@ -34,31 +37,31 @@ class BaseMaker: public IUpdater {
   struct FMetaHelper {
    public:
     /*! \brief find type of each feature, use column format */
-    inline void InitByCol(IFMatrix *p_fmat,
-                          const RegTree &tree) {
+    inline void InitByCol(DMatrix* p_fmat,
+                          const RegTree& tree) {
       fminmax.resize(tree.param.num_feature * 2);
       std::fill(fminmax.begin(), fminmax.end(),
                 -std::numeric_limits<bst_float>::max());
       // start accumulating statistics
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+      dmlc::DataIter<ColBatch>* iter = p_fmat->ColIterator();
       iter->BeforeFirst();
       while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
+        const ColBatch& batch = iter->Value();
         for (bst_uint i = 0; i < batch.size; ++i) {
           const bst_uint fid = batch.col_index[i];
-          const ColBatch::Inst &c = batch[i];
+          const ColBatch::Inst& c = batch[i];
           if (c.length != 0) {
             fminmax[fid * 2 + 0] = std::max(-c[0].fvalue, fminmax[fid * 2 + 0]);
             fminmax[fid * 2 + 1] = std::max(c[c.length - 1].fvalue, fminmax[fid * 2 + 1]);
           }
         }
       }
-      rabit::Allreduce<rabit::op::Max>(BeginPtr(fminmax), fminmax.size());
+      rabit::Allreduce<rabit::op::Max>(dmlc::BeginPtr(fminmax), fminmax.size());
     }
     // get feature type, 0:empty 1:binary 2:real
     inline int Type(bst_uint fid) const {
-      utils::Assert(fid * 2 + 1 < fminmax.size(),
-                    "FeatHelper fid exceed query bound ");
+      CHECK_LT(fid * 2 + 1, fminmax.size())
+          << "FeatHelper fid exceed query bound ";
       bst_float a = fminmax[fid * 2];
       bst_float b = fminmax[fid * 2 + 1];
       if (a == -std::numeric_limits<bst_float>::max()) return 0;
@@ -79,12 +82,12 @@ class BaseMaker: public IUpdater {
         if (this->Type(fid) != 0) findex.push_back(fid);
       }
       unsigned n = static_cast<unsigned>(p * findex.size());
-      random::Shuffle(findex);
+      std::shuffle(findex.begin(), findex.end(), common::GlobalRandom());
       findex.resize(n);
       // sync the findex if it is subsample
       std::string s_cache;
-      utils::MemoryBufferStream fc(&s_cache);
-      utils::IStream &fs = fc;
+      common::MemoryBufferStream fc(&s_cache);
+      dmlc::Stream& fs = fc;
       if (rabit::GetRank() == 0) {
         fs.Write(findex);
       }
@@ -113,7 +116,7 @@ class BaseMaker: public IUpdater {
     return n.cdefault();
   }
   /*! \brief get number of omp thread in current context */
-  inline static int get_nthread(void) {
+  inline static int get_nthread() {
     int nthread;
     #pragma omp parallel
     {
@@ -124,11 +127,11 @@ class BaseMaker: public IUpdater {
   //  ------class member helpers---------
   /*! \brief initialize temp data structure */
   inline void InitData(const std::vector<bst_gpair> &gpair,
-                       const IFMatrix &fmat,
-                       const std::vector<unsigned> &root_index,
+                       const DMatrix &fmat,
                        const RegTree &tree) {
-    utils::Assert(tree.param.num_nodes == tree.param.num_roots,
-                  "TreeMaker: can only grow new tree");
+    CHECK_EQ(tree.param.num_nodes, tree.param.num_roots)
+        << "TreeMaker: can only grow new tree";
+    const std::vector<unsigned> &root_index =  fmat.info().root_index;
     {
       // setup position
       position.resize(gpair.size());
@@ -137,8 +140,8 @@ class BaseMaker: public IUpdater {
       } else {
         for (size_t i = 0; i < position.size(); ++i) {
           position[i] = root_index[i];
-          utils::Assert(root_index[i] < (unsigned)tree.param.num_roots,
-                        "root index exceed setting");
+          CHECK_LT(root_index[i], (unsigned)tree.param.num_roots)
+              << "root index exceed setting";
         }
       }
       // mark delete for the deleted datas
@@ -147,9 +150,11 @@ class BaseMaker: public IUpdater {
       }
       // mark subsample
       if (param.subsample < 1.0f) {
+        std::bernoulli_distribution coin_flip(param.subsample);
+        auto& rnd = common::GlobalRandom();
         for (size_t i = 0; i < position.size(); ++i) {
           if (gpair[i].hess < 0.0f) continue;
-          if (random::SampleBinary(param.subsample) == 0) position[i] = ~position[i];
+          if (!coin_flip(rnd)) position[i] = ~position[i];
         }
       }
     }
@@ -197,7 +202,8 @@ class BaseMaker: public IUpdater {
    * \param tree the regression tree structure
    */
   inline void ResetPositionCol(const std::vector<int> &nodes,
-                               IFMatrix *p_fmat, const RegTree &tree) {
+                               DMatrix *p_fmat,
+                               const RegTree &tree) {
     // set the positions in the nondefault
     this->SetNonDefaultPositionCol(nodes, p_fmat, tree);
     // set rest of instances to default position
@@ -234,7 +240,8 @@ class BaseMaker: public IUpdater {
    * \param tree the regression tree structure
    */
   virtual void SetNonDefaultPositionCol(const std::vector<int> &nodes,
-                                        IFMatrix *p_fmat, const RegTree &tree) {
+                                        DMatrix *p_fmat,
+                                        const RegTree &tree) {
     // step 1, classify the non-default data into right places
     std::vector<unsigned> fsplits;
     for (size_t i = 0; i < nodes.size(); ++i) {
@@ -246,7 +253,7 @@ class BaseMaker: public IUpdater {
     std::sort(fsplits.begin(), fsplits.end());
     fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
 
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+    dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(fsplits);
     while (iter->Next()) {
       const ColBatch &batch = iter->Value();
       for (size_t i = 0; i < batch.size; ++i) {
@@ -273,12 +280,12 @@ class BaseMaker: public IUpdater {
   /*! \brief helper function to get statistics from a tree */
   template<typename TStats>
   inline void GetNodeStats(const std::vector<bst_gpair> &gpair,
-                           const IFMatrix &fmat,
+                           const DMatrix &fmat,
                            const RegTree &tree,
-                           const BoosterInfo &info,
                            std::vector< std::vector<TStats> > *p_thread_temp,
                            std::vector<TStats> *p_node_stats) {
     std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
+    const MetaInfo &info = fmat.info();
     thread_temp.resize(this->get_nthread());
     p_node_stats->resize(tree.param.num_nodes);
     #pragma omp parallel
@@ -323,7 +330,7 @@ class BaseMaker: public IUpdater {
     /*! \brief current size of sketch */
     double next_goal;
     // pointer to the sketch to put things in
-    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
+    common::WXQuantileSketch<bst_float, bst_float> *sketch;
     // initialize the space
     inline void Init(unsigned max_size) {
       next_goal = -1.0f;
@@ -351,13 +358,13 @@ class BaseMaker: public IUpdater {
               last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
             // push to sketch
             sketch->temp.data[sketch->temp.size] =
-                utils::WXQuantileSketch<bst_float, bst_float>::
+                common::WXQuantileSketch<bst_float, bst_float>::
                 Entry(static_cast<bst_float>(rmin),
                       static_cast<bst_float>(rmax),
                       static_cast<bst_float>(wmin), last_fvalue);
-            utils::Assert(sketch->temp.size < max_size,
-                          "invalid maximum size max_size=%u, stemp.size=%lu\n",
-                          max_size, sketch->temp.size);
+            CHECK_LT(sketch->temp.size, max_size)
+                << "invalid maximum size max_size=" << max_size
+                << ", stemp.size=" << sketch->temp.size;
             ++sketch->temp.size;
           }
           if (sketch->temp.size == max_size) {
@@ -367,8 +374,10 @@ class BaseMaker: public IUpdater {
           }
         } else {
           if (rmax >= next_goal) {
-            rabit::TrackerPrintf("INFO: rmax=%g, sum_total=%g, next_goal=%g, size=%lu\n",
-                                 rmax, sum_total, next_goal, sketch->temp.size);
+            LOG(TRACKER) << "INFO: rmax=" << rmax
+                         << ", sum_total=" << sum_total
+                         << ", next_goal=" << next_goal
+                         << ", size=" << sketch->temp.size;
           }
         }
         rmin = rmax;
@@ -382,12 +391,12 @@ class BaseMaker: public IUpdater {
     inline void Finalize(unsigned max_size) {
       double rmax = rmin + wmin;
       if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
-        utils::Assert(sketch->temp.size <= max_size,
-                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
-                      sketch->temp.size, max_size);
+        CHECK_LE(sketch->temp.size, max_size)
+            << "Finalize: invalid maximum size, max_size=" << max_size
+            << ", stemp.size=" << sketch->temp.size;
         // push to sketch
         sketch->temp.data[sketch->temp.size] =
-            utils::WXQuantileSketch<bst_float, bst_float>::
+            common::WXQuantileSketch<bst_float, bst_float>::
             Entry(static_cast<bst_float>(rmin),
                   static_cast<bst_float>(rmax),
                   static_cast<bst_float>(wmin), last_fvalue);
@@ -424,4 +433,4 @@ class BaseMaker: public IUpdater {
 };
 }  // namespace tree
 }  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
+#endif  // XGBOOST_TREE_UPDATER_BASEMAKER_INL_H_
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker.cc
similarity index 73%
rename from src/tree/updater_colmaker-inl.hpp
rename to src/tree/updater_colmaker.cc
index 1f89f7ed4..26efb33bc 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker.cc
@@ -1,45 +1,43 @@
 /*!
  * Copyright 2014 by Contributors
- * \file updater_colmaker-inl.hpp
+ * \file updater_colmaker.cc
  * \brief use columnwise update to construct a tree
  * \author Tianqi Chen
  */
-#ifndef XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
-#define XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
-
+#include <xgboost/tree_updater.h>
 #include <vector>
 #include <cmath>
 #include <algorithm>
 #include "./param.h"
-#include "./updater.h"
-#include "../utils/omp.h"
-#include "../utils/random.h"
+#include "../common/random.h"
+#include "../common/bitmap.h"
+#include "../common/sync.h"
 
 namespace xgboost {
 namespace tree {
+
+DMLC_REGISTRY_FILE_TAG(updater_colmaker);
+
 /*! \brief column-wise update to construct a tree */
 template<typename TStats>
-class ColMaker: public IUpdater {
+class ColMaker: public TreeUpdater {
  public:
-  virtual ~ColMaker(void) {}
-  // set training parameter
-  virtual void SetParam(const char *name, const char *val) {
-    param.SetParam(name, val);
+  void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
+    param.InitAllowUnknown(args);
   }
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
-    TStats::CheckInfo(info);
+
+  void Update(const std::vector<bst_gpair> &gpair,
+              DMatrix* dmat,
+              const std::vector<RegTree*> &trees) override {
+    TStats::CheckInfo(dmat->info());
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
     param.learning_rate = lr / trees.size();
     // build tree
     for (size_t i = 0; i < trees.size(); ++i) {
       Builder builder(param);
-      builder.Update(gpair, p_fmat, info, trees[i]);
+      builder.Update(gpair, dmat, trees[i]);
     }
-
     param.learning_rate = lr;
   }
 
@@ -74,27 +72,26 @@ class ColMaker: public IUpdater {
     /*! \brief current best solution */
     SplitEntry best;
     // constructor
-    explicit NodeEntry(const TrainParam &param)
+    explicit NodeEntry(const TrainParam& param)
         : stats(param), root_gain(0.0f), weight(0.0f){
     }
   };
   // actual builder that runs the algorithm
-  struct Builder{
+  struct Builder {
    public:
     // constructor
-    explicit Builder(const TrainParam &param) : param(param) {}
+    explicit Builder(const TrainParam& param) : param(param) {}
     // update one tree, growing
-    virtual void Update(const std::vector<bst_gpair> &gpair,
-                        IFMatrix *p_fmat,
-                        const BoosterInfo &info,
-                        RegTree *p_tree) {
-      this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
-      this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
+    virtual void Update(const std::vector<bst_gpair>& gpair,
+                        DMatrix* p_fmat,
+                        RegTree* p_tree) {
+      this->InitData(gpair, *p_fmat, *p_tree);
+      this->InitNewNode(qexpand_, gpair, *p_fmat, *p_tree);
       for (int depth = 0; depth < param.max_depth; ++depth) {
-        this->FindSplit(depth, qexpand_, gpair, p_fmat, info, p_tree);
+        this->FindSplit(depth, qexpand_, gpair, p_fmat, p_tree);
         this->ResetPosition(qexpand_, p_fmat, *p_tree);
         this->UpdateQueueExpand(*p_tree, &qexpand_);
-        this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
+        this->InitNewNode(qexpand_, gpair, *p_fmat, *p_tree);
         // if nothing left to be expand, break
         if (qexpand_.size() == 0) break;
       }
@@ -114,13 +111,13 @@ class ColMaker: public IUpdater {
 
    protected:
     // initialize temp data structure
-    inline void InitData(const std::vector<bst_gpair> &gpair,
-                         const IFMatrix &fmat,
-                         const std::vector<unsigned> &root_index,
-                         const RegTree &tree) {
-      utils::Assert(tree.param.num_nodes == tree.param.num_roots,
-                    "ColMaker: can only grow new tree");
-      const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+    inline void InitData(const std::vector<bst_gpair>& gpair,
+                         const DMatrix& fmat,
+                         const RegTree& tree) {
+      CHECK_EQ(tree.param.num_nodes, tree.param.num_roots)
+          << "ColMaker: can only grow new tree";
+      const std::vector<unsigned>& root_index = fmat.info().root_index;
+      const std::vector<bst_uint>& rowset = fmat.buffered_rowset();
       {
         // setup position
         position.resize(gpair.size());
@@ -132,8 +129,7 @@ class ColMaker: public IUpdater {
           for (size_t i = 0; i < rowset.size(); ++i) {
             const bst_uint ridx = rowset[i];
             position[ridx] = root_index[ridx];
-            utils::Assert(root_index[ridx] < (unsigned)tree.param.num_roots,
-                          "root index exceed setting");
+            CHECK_LT(root_index[ridx], (unsigned)tree.param.num_roots);
           }
         }
         // mark delete for the deleted datas
@@ -143,25 +139,28 @@ class ColMaker: public IUpdater {
         }
         // mark subsample
         if (param.subsample < 1.0f) {
+          std::bernoulli_distribution coin_flip(param.subsample);
+          auto& rnd = common::GlobalRandom();
           for (size_t i = 0; i < rowset.size(); ++i) {
             const bst_uint ridx = rowset[i];
             if (gpair[ridx].hess < 0.0f) continue;
-            if (random::SampleBinary(param.subsample) == 0) position[ridx] = ~position[ridx];
+            if (!coin_flip(rnd)) position[ridx] = ~position[ridx];
           }
         }
       }
       {
         // initialize feature index
-        unsigned ncol = static_cast<unsigned>(fmat.NumCol());
+        unsigned ncol = static_cast<unsigned>(fmat.info().num_col);
         for (unsigned i = 0; i < ncol; ++i) {
           if (fmat.GetColSize(i) != 0) {
             feat_index.push_back(i);
           }
         }
         unsigned n = static_cast<unsigned>(param.colsample_bytree * feat_index.size());
-        random::Shuffle(feat_index);
-        utils::Check(n > 0, "colsample_bytree=%g is too small that no feature can be included",
-                     param.colsample_bytree);
+        std::shuffle(feat_index.begin(), feat_index.end(), common::GlobalRandom());
+        CHECK_GT(n, 0)
+            << "colsample_bytree=" << param.colsample_bytree
+            << " is too small that no feature can be included";
         feat_index.resize(n);
       }
       {
@@ -190,11 +189,10 @@ class ColMaker: public IUpdater {
      * \brief initialize the base_weight, root_gain,
      *  and NodeEntry for all the new nodes in qexpand
      */
-    inline void InitNewNode(const std::vector<int> &qexpand,
-                            const std::vector<bst_gpair> &gpair,
-                            const IFMatrix &fmat,
-                            const BoosterInfo &info,
-                            const RegTree &tree) {
+    inline void InitNewNode(const std::vector<int>& qexpand,
+                            const std::vector<bst_gpair>& gpair,
+                            const DMatrix& fmat,
+                            const RegTree& tree) {
       {
         // setup statistics space for each tree node
         for (size_t i = 0; i < stemp.size(); ++i) {
@@ -203,6 +201,7 @@ class ColMaker: public IUpdater {
         snode.resize(tree.param.num_nodes, NodeEntry(param));
       }
       const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+      const MetaInfo& info = fmat.info();
       // setup position
       const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
       #pragma omp parallel for schedule(static)
@@ -226,7 +225,7 @@ class ColMaker: public IUpdater {
       }
     }
     /*! \brief update queue expand add in new leaves */
-    inline void UpdateQueueExpand(const RegTree &tree, std::vector<int> *p_qexpand) {
+    inline void UpdateQueueExpand(const RegTree& tree, std::vector<int>* p_qexpand) {
       std::vector<int> &qexpand = *p_qexpand;
       std::vector<int> newnodes;
       for (size_t i = 0; i < qexpand.size(); ++i) {
@@ -243,9 +242,9 @@ class ColMaker: public IUpdater {
     // this function does not support nested functions
     inline void ParallelFindSplit(const ColBatch::Inst &col,
                                   bst_uint fid,
-                                  const IFMatrix &fmat,
-                                  const std::vector<bst_gpair> &gpair,
-                                  const BoosterInfo &info) {
+                                  const DMatrix &fmat,
+                                  const std::vector<bst_gpair> &gpair) {
+      const MetaInfo& info = fmat.info();
       const bool ind = col.length != 0 && col.data[0].fvalue == col.data[col.length - 1].fvalue;
       bool need_forward = param.need_forward_search(fmat.GetColDensity(fid), ind);
       bool need_backward = param.need_backward_search(fmat.GetColDensity(fid), ind);
@@ -484,7 +483,7 @@ class ColMaker: public IUpdater {
                                int d_step,
                                bst_uint fid,
                                const std::vector<bst_gpair> &gpair,
-                               const BoosterInfo &info,
+                               const MetaInfo &info,
                                std::vector<ThreadEntry> &temp) { // NOLINT(*)
       // use cacheline aware optimization
       if (TStats::kSimpleStats != 0 && param.cache_opt != 0) {
@@ -542,10 +541,10 @@ class ColMaker: public IUpdater {
     }
 
     // update the solution candidate
-    virtual void UpdateSolution(const ColBatch &batch,
-                                const std::vector<bst_gpair> &gpair,
-                                const IFMatrix &fmat,
-                                const BoosterInfo &info) {
+    virtual void UpdateSolution(const ColBatch& batch,
+                                const std::vector<bst_gpair>& gpair,
+                                const DMatrix& fmat) {
+      const MetaInfo& info = fmat.info();
       // start enumeration
       const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
       #if defined(_OPENMP)
@@ -574,7 +573,7 @@ class ColMaker: public IUpdater {
       } else {
         for (bst_omp_uint i = 0; i < nsize; ++i) {
           this->ParallelFindSplit(batch[i], batch.col_index[i],
-                                  fmat, gpair, info);
+                                  fmat, gpair);
         }
       }
     }
@@ -582,19 +581,19 @@ class ColMaker: public IUpdater {
     inline void FindSplit(int depth,
                           const std::vector<int> &qexpand,
                           const std::vector<bst_gpair> &gpair,
-                          IFMatrix *p_fmat,
-                          const BoosterInfo &info,
+                          DMatrix *p_fmat,
                           RegTree *p_tree) {
       std::vector<bst_uint> feat_set = feat_index;
       if (param.colsample_bylevel != 1.0f) {
-        random::Shuffle(feat_set);
+        std::shuffle(feat_set.begin(), feat_set.end(), common::GlobalRandom());
         unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size());
-        utils::Check(n > 0, "colsample_bylevel is too small that no feature can be included");
+        CHECK_GT(n, 0)
+            << "colsample_bylevel is too small that no feature can be included";
         feat_set.resize(n);
       }
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
+      dmlc::DataIter<ColBatch>* iter = p_fmat->ColIterator(feat_set);
       while (iter->Next()) {
-        this->UpdateSolution(iter->Value(), gpair, *p_fmat, info);
+        this->UpdateSolution(iter->Value(), gpair, *p_fmat);
       }
       // after this each thread's stemp will get the best candidates, aggregate results
       this->SyncBestSolution(qexpand);
@@ -616,7 +615,8 @@ class ColMaker: public IUpdater {
     }
     // reset position of each data points after split is created in the tree
     inline void ResetPosition(const std::vector<int> &qexpand,
-                              IFMatrix *p_fmat, const RegTree &tree) {
+                              DMatrix* p_fmat,
+                              const RegTree& tree) {
       // set the positions in the nondefault
       this->SetNonDefaultPosition(qexpand, p_fmat, tree);
       // set rest of instances to default position
@@ -630,7 +630,7 @@ class ColMaker: public IUpdater {
       for (bst_omp_uint i = 0; i < ndata; ++i) {
         const bst_uint ridx = rowset[i];
         if (ridx >= position.size()) {
-          utils::Printf("ridx exceed bound\n");
+          LOG(INFO) << "ridx exceed bound\n";
         }
         const int nid = this->DecodePosition(ridx);
         if (tree[nid].is_leaf()) {
@@ -660,7 +660,8 @@ class ColMaker: public IUpdater {
       }
     }
     virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
-                                       IFMatrix *p_fmat, const RegTree &tree) {
+                                       DMatrix *p_fmat,
+                                       const RegTree &tree) {
       // step 1, classify the non-default data into right places
       std::vector<unsigned> fsplits;
       for (size_t i = 0; i < qexpand.size(); ++i) {
@@ -671,8 +672,7 @@ class ColMaker: public IUpdater {
       }
       std::sort(fsplits.begin(), fsplits.end());
       fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
-
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+      dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(fsplits);
       while (iter->Next()) {
         const ColBatch &batch = iter->Value();
         for (size_t i = 0; i < batch.size; ++i) {
@@ -711,7 +711,7 @@ class ColMaker: public IUpdater {
       }
     }
     //  --data fields--
-    const TrainParam &param;
+    const TrainParam& param;
     // number of omp thread used during training
     int nthread;
     // Per feature: shuffle index of each feature index
@@ -727,6 +727,170 @@ class ColMaker: public IUpdater {
   };
 };
 
+// distributed column maker: column-split tree grower that prunes the tree it builds
+template<typename TStats>
+class DistColMaker : public ColMaker<TStats> {
+ public:
+  DistColMaker() : builder(param) {
+    pruner.reset(TreeUpdater::Create("prune"));
+  }
+  void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
+    param.InitAllowUnknown(args);
+    pruner->Init(args);
+  }
+  void Update(const std::vector<bst_gpair> &gpair,
+              DMatrix* dmat,
+              const std::vector<RegTree*> &trees) override {
+    TStats::CheckInfo(dmat->info());
+    CHECK_EQ(trees.size(), 1U) << "DistColMaker: only support one tree at a time";
+    // build the tree
+    builder.Update(gpair, dmat, trees[0]);
+    // prune the tree, note that pruner will sync the tree
+    pruner->Update(gpair, dmat, trees);
+    // update position after the tree is pruned
+    builder.UpdatePosition(dmat, *trees[0]);
+  }
+  const int* GetLeafPosition() const override {
+    return builder.GetLeafPosition();
+  }
+
+ private:
+  struct Builder : public ColMaker<TStats>::Builder {
+   public:
+    explicit Builder(const TrainParam &param)
+        : ColMaker<TStats>::Builder(param) {
+    }
+    inline void UpdatePosition(DMatrix* p_fmat, const RegTree &tree) {
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
+        const bst_uint ridx = rowset[i];
+        int nid = this->DecodePosition(ridx);
+        while (tree[nid].is_deleted()) {  // climb to the nearest non-deleted ancestor
+          nid = tree[nid].parent();
+          CHECK_GE(nid, 0);
+        }
+        this->position[ridx] = nid;
+      }
+    }
+    inline const int* GetLeafPosition() const {
+      return dmlc::BeginPtr(this->position);
+    }
+
+   protected:
+    void SetNonDefaultPosition(const std::vector<int> &qexpand,
+                               DMatrix *p_fmat,
+                               const RegTree &tree) override {
+      // collect the split features of the qexpand nodes that are no longer leaves
+      std::vector<unsigned> fsplits;
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        if (!tree[nid].is_leaf()) {
+          fsplits.push_back(tree[nid].split_index());
+        }
+      }
+      // sort and dedup them, then drop trailing ids that are not below info().num_col
+      std::sort(fsplits.begin(), fsplits.end());
+      fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
+      while (fsplits.size() != 0 && fsplits.back() >= p_fmat->info().num_col) {
+        fsplits.pop_back();
+      }
+      // bitmap is only word concurrent, set to bool first
+      {
+        bst_omp_uint ndata = static_cast<bst_omp_uint>(this->position.size());
+        boolmap.resize(ndata);
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint j = 0; j < ndata; ++j) {
+          boolmap[j] = 0;
+        }
+      }
+      dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        for (size_t i = 0; i < batch.size; ++i) {
+          ColBatch::Inst col = batch[i];
+          const bst_uint fid = batch.col_index[i];
+          const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
+          #pragma omp parallel for schedule(static)
+          for (bst_omp_uint j = 0; j < ndata; ++j) {
+            const bst_uint ridx = col[j].index;
+            const float fvalue = col[j].fvalue;
+            const int nid = this->DecodePosition(ridx);
+            if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
+              if (fvalue < tree[nid].split_cond()) {  // row belongs to the left child
+                if (!tree[nid].default_left()) boolmap[ridx] = 1;
+              } else {
+                if (tree[nid].default_left()) boolmap[ridx] = 1;
+              }
+            }
+          }
+        }
+      }
+
+      bitmap.InitFromBool(boolmap);
+      // communicate bitmap: bitwise-OR allreduce across workers
+      rabit::Allreduce<rabit::op::BitOR>(dmlc::BeginPtr(bitmap.data), bitmap.data.size());
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+      // flagged rows take the non-default child of their current node
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
+        const bst_uint ridx = rowset[i];
+        const int nid = this->DecodePosition(ridx);
+        if (bitmap.Get(ridx)) {
+          CHECK(!tree[nid].is_leaf()) << "inconsistent reduce information";
+          if (tree[nid].default_left()) {
+            this->SetEncodePosition(ridx, tree[nid].cright());
+          } else {
+            this->SetEncodePosition(ridx, tree[nid].cleft());
+          }
+        }
+      }
+    }
+    // merge per-thread best splits into snode, then allreduce them across workers
+    void SyncBestSolution(const std::vector<int> &qexpand) override {
+      std::vector<SplitEntry> vec;
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        for (int tid = 0; tid < this->nthread; ++tid) {
+          this->snode[nid].best.Update(this->stemp[tid][nid].best);
+        }
+        vec.push_back(this->snode[nid].best);
+      }
+      // TODO(tqchen) lazy version
+      // communicate best solution
+      reducer.Allreduce(dmlc::BeginPtr(vec), vec.size());
+      // assign solution back
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        this->snode[nid].best = vec[i];
+      }
+    }
+
+   private:
+    common::BitMap bitmap;
+    std::vector<int> boolmap;
+    rabit::Reducer<SplitEntry, SplitEntry::Reduce> reducer;
+  };
+  // we directly introduce pruner here
+  std::unique_ptr<TreeUpdater> pruner;
+  // training parameter
+  TrainParam param;
+  // tree builder; declared after param since it is constructed from a reference to it
+  Builder builder;
+};
+
+XGBOOST_REGISTER_TREE_UPDATER(ColMaker, "grow_colmaker")
+.describe("Grow tree with parallelization over columns.")
+.set_body([]() {  // factory body: builds a ColMaker with GradStats as its TStats
+    return new ColMaker<GradStats>();
+  });
+
+XGBOOST_REGISTER_TREE_UPDATER(DistColMaker, "distcol")
+.describe("Distributed column split version of tree maker.")
+.set_body([]() {  // factory body: builds a DistColMaker with GradStats as its TStats
+    return new DistColMaker<GradStats>();
+  });
 }  // namespace tree
 }  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
diff --git a/src/tree/updater_distcol-inl.hpp b/src/tree/updater_distcol-inl.hpp
deleted file mode 100644
index e3d3f8b59..000000000
--- a/src/tree/updater_distcol-inl.hpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater_distcol-inl.hpp
- * \brief beta distributed version that takes a sub-column
- *        and construct a tree
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
-#define XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
-
-#include <vector>
-#include <algorithm>
-#include "../sync/sync.h"
-#include "../utils/bitmap.h"
-#include "../utils/io.h"
-#include "./updater_colmaker-inl.hpp"
-#include "./updater_prune-inl.hpp"
-
-namespace xgboost {
-namespace tree {
-template<typename TStats>
-class DistColMaker : public ColMaker<TStats> {
- public:
-  DistColMaker(void) : builder(param) {}
-  virtual ~DistColMaker(void) {}
-  // set training parameter
-  virtual void SetParam(const char *name, const char *val) {
-    param.SetParam(name, val);
-    pruner.SetParam(name, val);
-  }
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
-    TStats::CheckInfo(info);
-    utils::Check(trees.size() == 1, "DistColMaker: only support one tree at a time");
-    // build the tree
-    builder.Update(gpair, p_fmat, info, trees[0]);
-    //// prune the tree, note that pruner will sync the tree
-    pruner.Update(gpair, p_fmat, info, trees);
-    // update position after the tree is pruned
-    builder.UpdatePosition(p_fmat, *trees[0]);
-  }
-  virtual const int* GetLeafPosition(void) const {
-    return builder.GetLeafPosition();
-  }
-
- private:
-  struct Builder : public ColMaker<TStats>::Builder {
-   public:
-    explicit Builder(const TrainParam &param)
-        : ColMaker<TStats>::Builder(param) {
-    }
-    inline void UpdatePosition(IFMatrix *p_fmat, const RegTree &tree) {
-      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        const bst_uint ridx = rowset[i];
-        int nid = this->DecodePosition(ridx);
-        while (tree[nid].is_deleted()) {
-          nid = tree[nid].parent();
-          utils::Assert(nid >=0, "distributed learning error");
-        }
-        this->position[ridx] = nid;
-      }
-    }
-    virtual const int* GetLeafPosition(void) const {
-      return BeginPtr(this->position);
-    }
-
-   protected:
-    virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
-                                       IFMatrix *p_fmat, const RegTree &tree) {
-      // step 2, classify the non-default data into right places
-      std::vector<unsigned> fsplits;
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        if (!tree[nid].is_leaf()) {
-          fsplits.push_back(tree[nid].split_index());
-        }
-      }
-      // get the candidate split index
-      std::sort(fsplits.begin(), fsplits.end());
-      fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
-      while (fsplits.size() != 0 && fsplits.back() >= p_fmat->NumCol()) {
-        fsplits.pop_back();
-      }
-      // bitmap is only word concurrent, set to bool first
-      {
-        bst_omp_uint ndata = static_cast<bst_omp_uint>(this->position.size());
-        boolmap.resize(ndata);
-        #pragma omp parallel for schedule(static)
-        for (bst_omp_uint j = 0; j < ndata; ++j) {
-            boolmap[j] = 0;
-        }
-      }
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
-      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
-        for (size_t i = 0; i < batch.size; ++i) {
-          ColBatch::Inst col = batch[i];
-          const bst_uint fid = batch.col_index[i];
-          const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
-          #pragma omp parallel for schedule(static)
-          for (bst_omp_uint j = 0; j < ndata; ++j) {
-            const bst_uint ridx = col[j].index;
-            const float fvalue = col[j].fvalue;
-            const int nid = this->DecodePosition(ridx);
-            if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
-              if (fvalue < tree[nid].split_cond()) {
-                if (!tree[nid].default_left()) boolmap[ridx] = 1;
-              } else {
-                if (tree[nid].default_left()) boolmap[ridx] = 1;
-              }
-            }
-          }
-        }
-      }
-
-      bitmap.InitFromBool(boolmap);
-      // communicate bitmap
-      rabit::Allreduce<rabit::op::BitOR>(BeginPtr(bitmap.data), bitmap.data.size());
-      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-      // get the new position
-      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        const bst_uint ridx = rowset[i];
-        const int nid = this->DecodePosition(ridx);
-        if (bitmap.Get(ridx)) {
-          utils::Assert(!tree[nid].is_leaf(), "inconsistent reduce information");
-          if (tree[nid].default_left()) {
-            this->SetEncodePosition(ridx, tree[nid].cright());
-          } else {
-            this->SetEncodePosition(ridx, tree[nid].cleft());
-          }
-        }
-      }
-    }
-    // synchronize the best solution of each node
-    virtual void SyncBestSolution(const std::vector<int> &qexpand) {
-      std::vector<SplitEntry> vec;
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        for (int tid = 0; tid < this->nthread; ++tid) {
-          this->snode[nid].best.Update(this->stemp[tid][nid].best);
-        }
-        vec.push_back(this->snode[nid].best);
-      }
-      // TODO(tqchen) lazy version
-      // communicate best solution
-      reducer.Allreduce(BeginPtr(vec), vec.size());
-      // assign solution back
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        this->snode[nid].best = vec[i];
-      }
-    }
-
-   private:
-    utils::BitMap bitmap;
-    std::vector<int> boolmap;
-    rabit::Reducer<SplitEntry, SplitEntry::Reduce> reducer;
-  };
-  // we directly introduce pruner here
-  TreePruner pruner;
-  // training parameter
-  TrainParam param;
-  // pointer to the builder
-  Builder builder;
-};
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker.cc
similarity index 83%
rename from src/tree/updater_histmaker-inl.hpp
rename to src/tree/updater_histmaker.cc
index d86204e4b..c6d53b270 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker.cc
@@ -1,36 +1,36 @@
 /*!
  * Copyright 2014 by Contributors
- * \file updater_histmaker-inl.hpp
+ * \file updater_histmaker.cc
  * \brief use histogram counting to construct a tree
  * \author Tianqi Chen
  */
-#ifndef XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
-#define XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
-
+#include <xgboost/base.h>
+#include <xgboost/tree_updater.h>
 #include <vector>
 #include <algorithm>
-#include "../sync/sync.h"
-#include "../utils/quantile.h"
-#include "../utils/group_data.h"
-#include "./updater_basemaker-inl.hpp"
+#include "../common/sync.h"
+#include "../common/quantile.h"
+#include "../common/group_data.h"
+#include "./updater_basemaker-inl.h"
 
 namespace xgboost {
 namespace tree {
+
+DMLC_REGISTRY_FILE_TAG(updater_histmaker);
+
 template<typename TStats>
 class HistMaker: public BaseMaker {
  public:
-  virtual ~HistMaker(void) {}
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
-    TStats::CheckInfo(info);
+  void Update(const std::vector<bst_gpair> &gpair,
+              DMatrix *p_fmat,
+              const std::vector<RegTree*> &trees) override {
+    TStats::CheckInfo(p_fmat->info());
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
     param.learning_rate = lr / trees.size();
     // build tree
     for (size_t i = 0; i < trees.size(); ++i) {
-      this->Update(gpair, p_fmat, info, trees[i]);
+      this->Update(gpair, p_fmat, trees[i]);
     }
     param.learning_rate = lr;
   }
@@ -45,19 +45,18 @@ class HistMaker: public BaseMaker {
     /*! \brief size of histogram */
     unsigned size;
     // default constructor
-    HistUnit(void) {}
+    HistUnit() {}
     // constructor
     HistUnit(const bst_float *cut, TStats *data, unsigned size)
         : cut(cut), data(data), size(size) {}
     /*! \brief add a histogram to data */
     inline void Add(bst_float fv,
                     const std::vector<bst_gpair> &gpair,
-                    const BoosterInfo &info,
+                    const MetaInfo &info,
                     const bst_uint ridx) {
       unsigned i = std::upper_bound(cut, cut + size, fv) - cut;
-      utils::Assert(size != 0, "try insert into size=0");
-      utils::Assert(i < size,
-                    "maximum value must be in cut, fv = %g, cutmax=%g", fv, cut[size-1]);
+      CHECK_NE(size, 0U) << "try insert into size=0";
+      CHECK_LT(i, size) << "maximum value must be in cut, fv=" << fv << ", cutmax=" << cut[size-1];
       data[i].Add(gpair, info, ridx);
     }
   };
@@ -92,13 +91,13 @@ class HistMaker: public BaseMaker {
         for (size_t i = 0; i < hset[tid].data.size(); ++i) {
           hset[tid].data[i].Clear();
         }
-        hset[tid].rptr = BeginPtr(rptr);
-        hset[tid].cut = BeginPtr(cut);
+        hset[tid].rptr = dmlc::BeginPtr(rptr);
+        hset[tid].cut = dmlc::BeginPtr(cut);
         hset[tid].data.resize(cut.size(), TStats(param));
       }
     }
     // aggregate all statistics to hset[0]
-    inline void Aggregate(void) {
+    inline void Aggregate() {
       bst_omp_uint nsize = static_cast<bst_omp_uint>(cut.size());
       #pragma omp parallel for schedule(static)
       for (bst_omp_uint i = 0; i < nsize; ++i) {
@@ -108,11 +107,11 @@ class HistMaker: public BaseMaker {
       }
     }
     /*! \brief clear the workspace */
-    inline void Clear(void) {
+    inline void Clear() {
       cut.clear(); rptr.resize(1); rptr[0] = 0;
     }
     /*! \brief total size */
-    inline size_t Size(void) const {
+    inline size_t Size() const {
       return rptr.size() - 1;
     }
   };
@@ -124,18 +123,17 @@ class HistMaker: public BaseMaker {
   std::vector<bst_uint> fwork_set;
   // update function implementation
   virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
+                      DMatrix *p_fmat,
                       RegTree *p_tree) {
-    this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
+    this->InitData(gpair, *p_fmat, *p_tree);
     this->InitWorkSet(p_fmat, *p_tree, &fwork_set);
     for (int depth = 0; depth < param.max_depth; ++depth) {
       // reset and propose candidate split
-      this->ResetPosAndPropose(gpair, p_fmat, info, fwork_set, *p_tree);
+      this->ResetPosAndPropose(gpair, p_fmat, fwork_set, *p_tree);
       // create histogram
-      this->CreateHist(gpair, p_fmat, info, fwork_set, *p_tree);
+      this->CreateHist(gpair, p_fmat, fwork_set, *p_tree);
       // find split based on histogram statistics
-      this->FindSplit(depth, gpair, p_fmat, info, fwork_set, p_tree);
+      this->FindSplit(depth, gpair, p_fmat, fwork_set, p_tree);
       // reset position after split
       this->ResetPositionAfterSplit(p_fmat, *p_tree);
       this->UpdateQueueExpand(*p_tree);
@@ -151,12 +149,11 @@ class HistMaker: public BaseMaker {
   // (1) reset the position in array position, to be the latest leaf id
   // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly
   virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
-                                  IFMatrix *p_fmat,
-                                  const BoosterInfo &info,
+                                  DMatrix *p_fmat,
                                   const std::vector <bst_uint> &fset,
                                   const RegTree &tree) = 0;
   // initialize the current working set of features in this round
-  virtual void InitWorkSet(IFMatrix *p_fmat,
+  virtual void InitWorkSet(DMatrix *p_fmat,
                            const RegTree &tree,
                            std::vector<bst_uint> *p_fset) {
     p_fset->resize(tree.param.num_feature);
@@ -165,12 +162,11 @@ class HistMaker: public BaseMaker {
     }
   }
   // reset position after split, this is not a must, depending on implementation
-  virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
+  virtual void ResetPositionAfterSplit(DMatrix *p_fmat,
                                        const RegTree &tree) {
   }
   virtual void CreateHist(const std::vector<bst_gpair> &gpair,
-                          IFMatrix *p_fmat,
-                          const BoosterInfo &info,
+                          DMatrix *p_fmat,
                           const std::vector <bst_uint> &fset,
                           const RegTree &tree)  = 0;
 
@@ -212,8 +208,7 @@ class HistMaker: public BaseMaker {
   }
   inline void FindSplit(int depth,
                         const std::vector<bst_gpair> &gpair,
-                        IFMatrix *p_fmat,
-                        const BoosterInfo &info,
+                        DMatrix *p_fmat,
                         const std::vector <bst_uint> &fset,
                         RegTree *p_tree) {
     const size_t num_feature = fset.size();
@@ -224,8 +219,7 @@ class HistMaker: public BaseMaker {
     #pragma omp parallel for schedule(dynamic, 1)
     for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
       const int nid = qexpand[wid];
-      utils::Assert(node2workindex[nid] == static_cast<int>(wid),
-                    "node2workindex inconsistent");
+      CHECK_EQ(node2workindex[nid], static_cast<int>(wid));
       SplitEntry &best = sol[wid];
       TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
       for (size_t i = 0; i < fset.size(); ++i) {
@@ -279,10 +273,10 @@ class CQHistMaker: public HistMaker<TStats> {
      */
     inline void Add(bst_float fv,
                     const std::vector<bst_gpair> &gpair,
-                    const BoosterInfo &info,
+                    const MetaInfo &info,
                     const bst_uint ridx) {
       while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
-      utils::Assert(istart != hist.size, "the bound variable must be max");
+      CHECK_NE(istart, hist.size) << "the bound variable must be max";
       hist.data[istart].Add(gpair, info, ridx);
     }
     /*!
@@ -292,25 +286,25 @@ class CQHistMaker: public HistMaker<TStats> {
     inline void Add(bst_float fv,
                     bst_gpair gstats) {
       while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
-      utils::Assert(istart != hist.size, "the bound variable must be max");
+      CHECK_NE(istart, hist.size) << "the bound variable must be max";
       hist.data[istart].Add(gstats);
     }
   };
   // sketch type used for this
-  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+  typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
   // initialize the work set of tree
-  virtual void InitWorkSet(IFMatrix *p_fmat,
-                           const RegTree &tree,
-                           std::vector<bst_uint> *p_fset) {
+  void InitWorkSet(DMatrix *p_fmat,
+                   const RegTree &tree,
+                   std::vector<bst_uint> *p_fset) override {
     feat_helper.InitByCol(p_fmat, tree);
     feat_helper.SampleCol(this->param.colsample_bytree, p_fset);
   }
   // code to create histogram
-  virtual void CreateHist(const std::vector<bst_gpair> &gpair,
-                          IFMatrix *p_fmat,
-                          const BoosterInfo &info,
-                          const std::vector<bst_uint> &fset,
-                          const RegTree &tree) {
+  void CreateHist(const std::vector<bst_gpair> &gpair,
+                  DMatrix *p_fmat,
+                  const std::vector<bst_uint> &fset,
+                  const RegTree &tree) override {
+    const MetaInfo &info = p_fmat->info();
     // fill in reverse map
     feat2workindex.resize(tree.param.num_feature);
     std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
@@ -327,7 +321,7 @@ class CQHistMaker: public HistMaker<TStats> {
     {
       thread_hist.resize(this->get_nthread());
       // start accumulating statistics
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fset);
+      dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(fset);
       iter->BeforeFirst();
       while (iter->Next()) {
         const ColBatch &batch = iter->Value();
@@ -353,21 +347,22 @@ class CQHistMaker: public HistMaker<TStats> {
     // sync the histogram
     // if it is C++11, use lazy evaluation for Allreduce
 #if __cplusplus >= 201103L
-    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data),
+    this->histred.Allreduce(dmlc::BeginPtr(this->wspace.hset[0].data),
                             this->wspace.hset[0].data.size(), lazy_get_hist);
 #else
-    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());
+    this->histred.Allreduce(dmlc::BeginPtr(this->wspace.hset[0].data),
+                            this->wspace.hset[0].data.size());
 #endif
   }
-  virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
-                                       const RegTree &tree) {
+  void ResetPositionAfterSplit(DMatrix *p_fmat,
+                                       const RegTree &tree) override {
     this->ResetPositionCol(this->qexpand, p_fmat, tree);
   }
-  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
-                                  IFMatrix *p_fmat,
-                                  const BoosterInfo &info,
-                                  const std::vector<bst_uint> &fset,
-                                  const RegTree &tree) {
+  void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
+                          DMatrix *p_fmat,
+                          const std::vector<bst_uint> &fset,
+                          const RegTree &tree) override {
+    const MetaInfo &info = p_fmat->info();
     // fill in reverse map
     feat2workindex.resize(tree.param.num_feature);
     std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
@@ -380,7 +375,7 @@ class CQHistMaker: public HistMaker<TStats> {
         feat2workindex[fset[i]] = -2;
       }
     }
-    this->GetNodeStats(gpair, *p_fmat, tree, info,
+    this->GetNodeStats(gpair, *p_fmat, tree,
                        &thread_stats, &node_stats);
     sketchs.resize(this->qexpand.size() * freal_set.size());
     for (size_t i = 0; i < sketchs.size(); ++i) {
@@ -403,7 +398,7 @@ class CQHistMaker: public HistMaker<TStats> {
       // number of rows in
       const size_t nrows = p_fmat->buffered_rowset().size();
       // start accumulating statistics
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(freal_set);
+      dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(freal_set);
       iter->BeforeFirst();
       while (iter->Next()) {
         const ColBatch &batch = iter->Value();
@@ -422,18 +417,19 @@ class CQHistMaker: public HistMaker<TStats> {
         }
       }
       for (size_t i = 0; i < sketchs.size(); ++i) {
-        utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+        common::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
         sketchs[i].GetSummary(&out);
         summary_array[i].SetPrune(out, max_size);
       }
-      utils::Assert(summary_array.size() == sketchs.size(), "shape mismatch");
+      CHECK_EQ(summary_array.size(), sketchs.size());
     };
     if (summary_array.size() != 0) {
       size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
 #if __cplusplus >= 201103L
-      sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size(), lazy_get_summary);
+      sreducer.Allreduce(dmlc::BeginPtr(summary_array),
+                         nbytes, summary_array.size(), lazy_get_summary);
 #else
-      sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
+      sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
 #endif
     }
     // now we get the final result of sketch, setup the cut
@@ -460,7 +456,7 @@ class CQHistMaker: public HistMaker<TStats> {
           }
           this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
         } else {
-          utils::Assert(offset == -2, "BUG in mark");
+          CHECK_EQ(offset, -2);
           bst_float cpt = feat_helper.MaxValue(fset[i]);
           this->wspace.cut.push_back(cpt + fabs(cpt) + rt_eps);
           this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
@@ -470,15 +466,14 @@ class CQHistMaker: public HistMaker<TStats> {
       this->wspace.cut.push_back(0.0f);
       this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
     }
-    utils::Assert(this->wspace.rptr.size() ==
-                  (fset.size() + 1) * this->qexpand.size() + 1,
-                  "cut space inconsistent");
+    CHECK_EQ(this->wspace.rptr.size(),
+             (fset.size() + 1) * this->qexpand.size() + 1);
   }
 
  private:
   inline void UpdateHistCol(const std::vector<bst_gpair> &gpair,
                             const ColBatch::Inst &c,
-                            const BoosterInfo &info,
+                            const MetaInfo &info,
                             const RegTree &tree,
                             const std::vector<bst_uint> &fset,
                             bst_uint fid_offset,
@@ -623,11 +618,11 @@ class CQHistMaker: public HistMaker<TStats> {
   // set of index from fset that are real
   std::vector<bst_uint> freal_set;
   // thread temp data
-  std::vector< std::vector<BaseMaker::SketchEntry> > thread_sketch;
+  std::vector<std::vector<BaseMaker::SketchEntry> > thread_sketch;
   // used to hold statistics
-  std::vector< std::vector<TStats> > thread_stats;
+  std::vector<std::vector<TStats> > thread_stats;
   // used to hold start pointer
-  std::vector< std::vector<HistEntry> > thread_hist;
+  std::vector<std::vector<HistEntry> > thread_hist;
   // node statistics
   std::vector<TStats> node_stats;
   // summary array
@@ -635,18 +630,18 @@ class CQHistMaker: public HistMaker<TStats> {
   // reducer for summary
   rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
   // per node, per feature sketch
-  std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;
+  std::vector<common::WXQuantileSketch<bst_float, bst_float> > sketchs;
 };
 
 template<typename TStats>
 class QuantileHistMaker: public HistMaker<TStats> {
  protected:
-  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
-  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
-                                  IFMatrix *p_fmat,
-                                  const BoosterInfo &info,
-                                  const std::vector <bst_uint> &fset,
-                                  const RegTree &tree) {
+  typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+  void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
+                          DMatrix *p_fmat,
+                          const std::vector <bst_uint> &fset,
+                          const RegTree &tree) override {
+    const MetaInfo &info = p_fmat->info();
     // initialize the data structure
     int nthread = BaseMaker::get_nthread();
     sketchs.resize(this->qexpand.size() * tree.param.num_feature);
@@ -654,12 +649,13 @@ class QuantileHistMaker: public HistMaker<TStats> {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
     }
     // start accumulating statistics
-    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+    dmlc::DataIter<RowBatch> *iter = p_fmat->RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
       const RowBatch &batch = iter->Value();
       // parallel convert to column major format
-      utils::ParallelGroupBuilder<SparseBatch::Entry> builder(&col_ptr, &col_data, &thread_col_ptr);
+      common::ParallelGroupBuilder<SparseBatch::Entry>
+          builder(&col_ptr, &col_data, &thread_col_ptr);
       builder.InitBudget(tree.param.num_feature, nthread);
 
       const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
@@ -711,14 +707,14 @@ class QuantileHistMaker: public HistMaker<TStats> {
     // synchronize sketch
     summary_array.resize(sketchs.size());
     for (size_t i = 0; i < sketchs.size(); ++i) {
-      utils::WQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+      common::WQuantileSketch<bst_float, bst_float>::SummaryContainer out;
       sketchs[i].GetSummary(&out);
       summary_array[i].Reserve(max_size);
       summary_array[i].SetPrune(out, max_size);
     }
 
     size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
-    sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
+    sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
     // now we get the final result of sketch, setup the cut
     this->wspace.cut.clear();
     this->wspace.rptr.clear();
@@ -745,9 +741,8 @@ class QuantileHistMaker: public HistMaker<TStats> {
       this->wspace.cut.push_back(0.0f);
       this->wspace.rptr.push_back(this->wspace.cut.size());
     }
-    utils::Assert(this->wspace.rptr.size() ==
-                  (tree.param.num_feature + 1) * this->qexpand.size() + 1,
-                  "cut space inconsistent");
+    CHECK_EQ(this->wspace.rptr.size(),
+             (tree.param.num_feature + 1) * this->qexpand.size() + 1);
   }
 
  private:
@@ -759,11 +754,15 @@ class QuantileHistMaker: public HistMaker<TStats> {
   std::vector<size_t> col_ptr;
   // local storage of column data
   std::vector<SparseBatch::Entry> col_data;
-  std::vector< std::vector<size_t> > thread_col_ptr;
+  std::vector<std::vector<size_t> > thread_col_ptr;
   // per node, per feature sketch
-  std::vector< utils::WQuantileSketch<bst_float, bst_float> > sketchs;
+  std::vector<common::WQuantileSketch<bst_float, bst_float> > sketchs;
 };
 
+XGBOOST_REGISTER_TREE_UPDATER(HistMaker, "grow_histmaker")
+.describe("Tree constructor that uses approximate histogram construction.")
+.set_body([]() {
+    return new CQHistMaker<GradStats>();
+  });
 }  // namespace tree
 }  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
diff --git a/src/tree/updater_prune-inl.hpp b/src/tree/updater_prune.cc
similarity index 58%
rename from src/tree/updater_prune-inl.hpp
rename to src/tree/updater_prune.cc
index 6f964d6c3..af52f73f4 100644
--- a/src/tree/updater_prune-inl.hpp
+++ b/src/tree/updater_prune.cc
@@ -1,35 +1,37 @@
 /*!
  * Copyright 2014 by Contributors
- * \file updater_prune-inl.hpp
+ * \file updater_prune.cc
  * \brief prune a tree given the statistics
  * \author Tianqi Chen
  */
-#ifndef XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
-#define XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
 
-#include <vector>
+#include <xgboost/tree_updater.h>
+#include <string>
+#include <memory>
 #include "./param.h"
-#include "./updater.h"
-#include "./updater_sync-inl.hpp"
+#include "../common/sync.h"
+#include "../common/io.h"
 
 namespace xgboost {
 namespace tree {
+
+DMLC_REGISTRY_FILE_TAG(updater_prune);
+
 /*! \brief pruner that prunes a tree after growing finishes */
-class TreePruner: public IUpdater {
+class TreePruner: public TreeUpdater {
  public:
-  virtual ~TreePruner(void) {}
+  TreePruner() {
+    syncher.reset(TreeUpdater::Create("sync"));
+  }
   // set training parameter
-  virtual void SetParam(const char *name, const char *val) {
-    using namespace std;
-    param.SetParam(name, val);
-    syncher.SetParam(name, val);
-    if (!strcmp(name, "silent")) silent = atoi(val);
+  void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
+    param.InitAllowUnknown(args);
+    syncher->Init(args);
   }
   // update the tree, do pruning
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
+  void Update(const std::vector<bst_gpair> &gpair,
+              DMatrix *p_fmat,
+              const std::vector<RegTree*> &trees) override {
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
     param.learning_rate = lr / trees.size();
@@ -37,7 +39,7 @@ class TreePruner: public IUpdater {
       this->DoPrune(*trees[i]);
     }
     param.learning_rate = lr;
-    syncher.Update(gpair, p_fmat, info, trees);
+    syncher->Update(gpair, p_fmat, trees);
   }
 
  private:
@@ -51,7 +53,7 @@ class TreePruner: public IUpdater {
       // need to be pruned
       tree.ChangeToLeaf(pid, param.learning_rate * s.base_weight);
       // tail recursion
-      return this->TryPruneLeaf(tree, pid, depth - 1, npruned+2);
+      return this->TryPruneLeaf(tree, pid, depth - 1, npruned + 2);
     } else {
       return npruned;
     }
@@ -68,20 +70,24 @@ class TreePruner: public IUpdater {
         npruned = this->TryPruneLeaf(tree, nid, tree.GetDepth(nid), npruned);
       }
     }
-    if (silent == 0) {
-      utils::Printf("tree pruning end, %d roots, %d extra nodes, %d pruned nodes, max_depth=%d\n",
-                    tree.param.num_roots, tree.num_extra_nodes(), npruned, tree.MaxDepth());
+    if (!param.silent) {
+      LOG(INFO) << "tree pruning end, " << tree.param.num_roots << " roots, "
+                << tree.num_extra_nodes() << " extra nodes, " << npruned
+                << " pruned nodes, max_depth=" << tree.MaxDepth();
     }
   }
 
  private:
   // synchronizer
-  TreeSyncher syncher;
-  // shutup
-  int silent;
+  std::unique_ptr<TreeUpdater> syncher;
   // training parameter
   TrainParam param;
 };
+
+XGBOOST_REGISTER_TREE_UPDATER(TreePruner, "prune")
+.describe("Pruner that prune the tree according to statistics.")
+.set_body([]() {
+    return new TreePruner();
+  });
 }  // namespace tree
 }  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
diff --git a/src/tree/updater_refresh-inl.hpp b/src/tree/updater_refresh.cc
similarity index 78%
rename from src/tree/updater_refresh-inl.hpp
rename to src/tree/updater_refresh.cc
index b6c5ee89e..3fef13ef6 100644
--- a/src/tree/updater_refresh-inl.hpp
+++ b/src/tree/updater_refresh.cc
@@ -1,39 +1,37 @@
 /*!
  * Copyright 2014 by Contributors
- * \file updater_refresh-inl.hpp
+ * \file updater_refresh.cc
  * \brief refresh the statistics and leaf value on the tree on the dataset
  * \author Tianqi Chen
  */
-#ifndef XGBOOST_TREE_UPDATER_REFRESH_INL_HPP_
-#define XGBOOST_TREE_UPDATER_REFRESH_INL_HPP_
 
+#include <xgboost/tree_updater.h>
 #include <vector>
 #include <limits>
-#include "../sync/sync.h"
 #include "./param.h"
-#include "./updater.h"
-#include "../utils/omp.h"
+#include "../common/sync.h"
+#include "../common/io.h"
 
 namespace xgboost {
 namespace tree {
+
+DMLC_REGISTRY_FILE_TAG(updater_refresh);
+
 /*! \brief pruner that prunes a tree after growing finishs */
 template<typename TStats>
-class TreeRefresher: public IUpdater {
+class TreeRefresher: public TreeUpdater {
  public:
-  virtual ~TreeRefresher(void) {}
-  // set training parameter
-  virtual void SetParam(const char *name, const char *val) {
-    param.SetParam(name, val);
+  void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
+    param.InitAllowUnknown(args);
   }
   // update the tree, do pruning
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
+  void Update(const std::vector<bst_gpair> &gpair,
+              DMatrix *p_fmat,
+              const std::vector<RegTree*> &trees) {
     if (trees.size() == 0) return;
     // number of threads
     // thread temporal space
-    std::vector< std::vector<TStats> > stemp;
+    std::vector<std::vector<TStats> > stemp;
     std::vector<RegTree::FVec> fvec_temp;
     // setup temp space for each thread
     int nthread;
@@ -60,13 +58,13 @@ class TreeRefresher: public IUpdater {
     auto lazy_get_stats = [&]()
 #endif
     {
+      const MetaInfo &info = p_fmat->info();
       // start accumulating statistics
-      utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+      dmlc::DataIter<RowBatch> *iter = p_fmat->RowIterator();
       iter->BeforeFirst();
       while (iter->Next()) {
         const RowBatch &batch = iter->Value();
-        utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
-                     "too large batch size ");
+        CHECK_LT(batch.size, std::numeric_limits<unsigned>::max());
         const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
         #pragma omp parallel for schedule(static)
         for (bst_omp_uint i = 0; i < nbatch; ++i) {
@@ -78,7 +76,7 @@ class TreeRefresher: public IUpdater {
           int offset = 0;
           for (size_t j = 0; j < trees.size(); ++j) {
             AddStats(*trees[j], feats, gpair, info, ridx,
-                     BeginPtr(stemp[tid]) + offset);
+                     dmlc::BeginPtr(stemp[tid]) + offset);
             offset += trees[j]->param.num_nodes;
           }
           feats.Drop(inst);
@@ -94,9 +92,9 @@ class TreeRefresher: public IUpdater {
       }
     };
 #if __cplusplus >= 201103L
-    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
+    reducer.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
 #else
-    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size());
+    reducer.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size());
 #endif
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
@@ -104,7 +102,7 @@ class TreeRefresher: public IUpdater {
     int offset = 0;
     for (size_t i = 0; i < trees.size(); ++i) {
       for (int rid = 0; rid < trees[i]->param.num_roots; ++rid) {
-        this->Refresh(BeginPtr(stemp[0]) + offset, rid, trees[i]);
+        this->Refresh(dmlc::BeginPtr(stemp[0]) + offset, rid, trees[i]);
       }
       offset += trees[i]->param.num_nodes;
     }
@@ -116,7 +114,7 @@ class TreeRefresher: public IUpdater {
   inline static void AddStats(const RegTree &tree,
                               const RegTree::FVec &feat,
                               const std::vector<bst_gpair> &gpair,
-                              const BoosterInfo &info,
+                              const MetaInfo &info,
                               const bst_uint ridx,
                               TStats *gstats) {
     // start from groups that belongs to current data
@@ -152,6 +150,10 @@ class TreeRefresher: public IUpdater {
   rabit::Reducer<TStats, TStats::Reduce> reducer;
 };
 
+XGBOOST_REGISTER_TREE_UPDATER(TreeRefresher, "refresh")
+.describe("Refresher that refreshes the weight and statistics according to data.")
+.set_body([]() {
+    return new TreeRefresher<GradStats>();
+  });
 }  // namespace tree
 }  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_REFRESH_INL_HPP_
diff --git a/src/tree/updater_skmaker-inl.hpp b/src/tree/updater_skmaker.cc
similarity index 87%
rename from src/tree/updater_skmaker-inl.hpp
rename to src/tree/updater_skmaker.cc
index ade22011b..c0d62ce5e 100644
--- a/src/tree/updater_skmaker-inl.hpp
+++ b/src/tree/updater_skmaker.cc
@@ -1,57 +1,58 @@
 /*!
  * Copyright 2014 by Contributors
- * \file updater_skmaker-inl.hpp
+ * \file updater_skmaker.cc
  * \brief use approximation sketch to construct a tree,
           a refresh is needed to make the statistics exactly correct
  * \author Tianqi Chen
  */
-#ifndef XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
-#define XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
 
+#include <xgboost/base.h>
+#include <xgboost/tree_updater.h>
 #include <vector>
 #include <algorithm>
-#include "../sync/sync.h"
-#include "../utils/quantile.h"
-#include "./updater_basemaker-inl.hpp"
+#include "../common/sync.h"
+#include "../common/quantile.h"
+#include "../common/group_data.h"
+#include "./updater_basemaker-inl.h"
 
 namespace xgboost {
 namespace tree {
+
+DMLC_REGISTRY_FILE_TAG(updater_skmaker);
+
 class SketchMaker: public BaseMaker {
  public:
-  virtual ~SketchMaker(void) {}
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
+  void Update(const std::vector<bst_gpair> &gpair,
+              DMatrix *p_fmat,
+              const std::vector<RegTree*> &trees) override {
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
     param.learning_rate = lr / trees.size();
     // build tree
     for (size_t i = 0; i < trees.size(); ++i) {
-      this->Update(gpair, p_fmat, info, trees[i]);
+      this->Update(gpair, p_fmat, trees[i]);
     }
     param.learning_rate = lr;
   }
 
  protected:
   inline void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      RegTree *p_tree) {
-    this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
+                     DMatrix *p_fmat,
+                     RegTree *p_tree) {
+    this->InitData(gpair, *p_fmat, *p_tree);
     for (int depth = 0; depth < param.max_depth; ++depth) {
-      this->GetNodeStats(gpair, *p_fmat, *p_tree, info,
+      this->GetNodeStats(gpair, *p_fmat, *p_tree,
                          &thread_stats, &node_stats);
-      this->BuildSketch(gpair, p_fmat, info, *p_tree);
+      this->BuildSketch(gpair, p_fmat, *p_tree);
       this->SyncNodeStats();
-      this->FindSplit(depth, gpair, p_fmat, info, p_tree);
+      this->FindSplit(depth, gpair, p_fmat, p_tree);
       this->ResetPositionCol(qexpand, p_fmat, *p_tree);
       this->UpdateQueueExpand(*p_tree);
       // if nothing left to be expand, break
       if (qexpand.size() == 0) break;
     }
     if (qexpand.size() != 0) {
-      this->GetNodeStats(gpair, *p_fmat, *p_tree, info,
+      this->GetNodeStats(gpair, *p_fmat, *p_tree,
                          &thread_stats, &node_stats);
       this->SyncNodeStats();
     }
@@ -72,7 +73,7 @@ class SketchMaker: public BaseMaker {
     }
   }
   // define the sketch we want to use
-  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+  typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
 
  private:
   // statistics needed in the gradient calculation
@@ -94,7 +95,7 @@ class SketchMaker: public BaseMaker {
     }
     // accumulate statistics
     inline void Add(const std::vector<bst_gpair> &gpair,
-                    const BoosterInfo &info,
+                    const MetaInfo &info,
                     bst_uint ridx) {
       const bst_gpair &b = gpair[ridx];
       if (b.grad >= 0.0f) {
@@ -133,9 +134,9 @@ class SketchMaker: public BaseMaker {
     }
   };
   inline void BuildSketch(const std::vector<bst_gpair> &gpair,
-                          IFMatrix *p_fmat,
-                          const BoosterInfo &info,
+                          DMatrix *p_fmat,
                           const RegTree &tree) {
+    const MetaInfo& info = p_fmat->info();
     sketchs.resize(this->qexpand.size() * tree.param.num_feature * 3);
     for (size_t i = 0; i < sketchs.size(); ++i) {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
@@ -144,7 +145,7 @@ class SketchMaker: public BaseMaker {
     // number of rows in
     const size_t nrows = p_fmat->buffered_rowset().size();
     // start accumulating statistics
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+    dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
       const ColBatch &batch = iter->Value();
@@ -164,13 +165,13 @@ class SketchMaker: public BaseMaker {
     // synchronize sketch
     summary_array.resize(sketchs.size());
     for (size_t i = 0; i < sketchs.size(); ++i) {
-      utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+      common::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
       sketchs[i].GetSummary(&out);
       summary_array[i].Reserve(max_size);
       summary_array[i].SetPrune(out, max_size);
     }
     size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
-    sketch_reducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
+    sketch_reducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
   }
   // update sketch information in column fid
   inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
@@ -256,20 +257,19 @@ class SketchMaker: public BaseMaker {
     }
   }
   inline void SyncNodeStats(void) {
-    utils::Assert(qexpand.size() != 0, "qexpand must not be empty");
+    CHECK_NE(qexpand.size(), 0U) << "qexpand must not be empty";
     std::vector<SKStats> tmp(qexpand.size());
     for (size_t i = 0; i < qexpand.size(); ++i) {
       tmp[i] = node_stats[qexpand[i]];
     }
-    stats_reducer.Allreduce(BeginPtr(tmp), tmp.size());
+    stats_reducer.Allreduce(dmlc::BeginPtr(tmp), tmp.size());  // reduced in place
     for (size_t i = 0; i < qexpand.size(); ++i) {
       node_stats[qexpand[i]] = tmp[i];
     }
   }
   inline void FindSplit(int depth,
                         const std::vector<bst_gpair> &gpair,
-                        IFMatrix *p_fmat,
-                        const BoosterInfo &info,
+                        DMatrix *p_fmat,
                         RegTree *p_tree) {
     const bst_uint num_feature = p_tree->param.num_feature;
     // get the best split condition for each node
@@ -278,8 +278,7 @@ class SketchMaker: public BaseMaker {
     #pragma omp parallel for schedule(dynamic, 1)
     for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
       const int nid = qexpand[wid];
-      utils::Assert(node2workindex[nid] == static_cast<int>(wid),
-                    "node2workindex inconsistent");
+      CHECK_EQ(node2workindex[nid], static_cast<int>(wid));
       SplitEntry &best = sol[wid];
       for (bst_uint fid = 0; fid < num_feature; ++fid) {
         unsigned base = (wid * p_tree->param.num_feature + fid) * 3;
@@ -380,9 +379,9 @@ class SketchMaker: public BaseMaker {
 
   // thread temp data
   // used to hold temporal sketch
-  std::vector< std::vector<SketchEntry> > thread_sketch;
+  std::vector<std::vector<SketchEntry> > thread_sketch;
   // used to hold statistics
-  std::vector< std::vector<SKStats> > thread_stats;
+  std::vector<std::vector<SKStats> > thread_stats;
   // node statistics
   std::vector<SKStats> node_stats;
   // summary array
@@ -392,8 +391,13 @@ class SketchMaker: public BaseMaker {
   // reducer for summary
   rabit::SerializeReducer<WXQSketch::SummaryContainer> sketch_reducer;
   // per node, per feature sketch
-  std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;
+  std::vector<common::WXQuantileSketch<bst_float, bst_float> > sketchs;
 };
+
+XGBOOST_REGISTER_TREE_UPDATER(SketchMaker, "grow_skmaker")
+.describe("Approximate sketching maker.")
+.set_body([]() {
+    return new SketchMaker();
+  });
 }  // namespace tree
 }  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
diff --git a/src/tree/updater_sync-inl.hpp b/src/tree/updater_sync-inl.hpp
deleted file mode 100644
index e76d1f76d..000000000
--- a/src/tree/updater_sync-inl.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater_sync-inl.hpp
- * \brief synchronize the tree in all distributed nodes
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
-#define XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
-
-#include <vector>
-#include <string>
-#include <limits>
-#include "../sync/sync.h"
-#include "./updater.h"
-
-namespace xgboost {
-namespace tree {
-/*!
- * \brief syncher that synchronize the tree in all distributed nodes
- * can implement various strategies, so far it is always set to node 0's tree
- */
-class TreeSyncher: public IUpdater {
- public:
-  virtual ~TreeSyncher(void) {}
-  virtual void SetParam(const char *name, const char *val) {
-  }
-  // update the tree, do pruning
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
-    this->SyncTrees(trees);
-  }
-
- private:
-  // synchronize the trees in different nodes, take tree from rank 0
-  inline void SyncTrees(const std::vector<RegTree *> &trees) {
-    if (rabit::GetWorldSize() == 1) return;
-    std::string s_model;
-    utils::MemoryBufferStream fs(&s_model);
-    int rank = rabit::GetRank();
-    if (rank == 0) {
-      for (size_t i = 0; i < trees.size(); ++i) {
-        trees[i]->SaveModel(fs);
-      }
-    }
-    fs.Seek(0);
-    rabit::Broadcast(&s_model, 0);
-    for (size_t i = 0; i < trees.size(); ++i) {
-      trees[i]->LoadModel(fs);
-    }
-  }
-};
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
diff --git a/src/tree/updater_sync.cc b/src/tree/updater_sync.cc
new file mode 100644
index 000000000..bd17968cd
--- /dev/null
+++ b/src/tree/updater_sync.cc
@@ -0,0 +1,52 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file updater_sync.cc
+ * \brief synchronize the tree in all distributed nodes
+ */
+#include <xgboost/tree_updater.h>
+#include <vector>
+#include <string>
+#include <limits>
+#include "../common/sync.h"
+#include "../common/io.h"
+
+namespace xgboost {
+namespace tree {
+
+DMLC_REGISTRY_FILE_TAG(updater_sync);
+
+/*!
+ * \brief Syncher that synchronizes the trees across all distributed nodes.
+ *  Other strategies could be implemented; so far every node adopts rank 0's trees.
+ */
+class TreeSyncher: public TreeUpdater {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& args) override {}  // no-op
+
+  void Update(const std::vector<bst_gpair> &gpair,  // not used by this updater
+              DMatrix* dmat,  // not used by this updater
+              const std::vector<RegTree*> &trees) override {
+    if (rabit::GetWorldSize() == 1) return;  // single worker: nothing to sync
+    std::string s_model;
+    common::MemoryBufferStream fs(&s_model);  // stream constructed over s_model
+    int rank = rabit::GetRank();
+    if (rank == 0) {  // only rank 0 serializes its trees
+      for (size_t i = 0; i < trees.size(); ++i) {
+        trees[i]->Save(&fs);
+      }
+    }
+    fs.Seek(0);  // rewind before the trees are loaded back
+    rabit::Broadcast(&s_model, 0);  // second argument 0 is presumably the root rank
+    for (size_t i = 0; i < trees.size(); ++i) {  // runs on every rank, including 0
+      trees[i]->Load(&fs);
+    }
+  }
+};
+
+XGBOOST_REGISTER_TREE_UPDATER(TreeSyncher, "sync")
+.describe("Syncher that synchronize the tree in all distributed nodes.")
+.set_body([]() {
+    return new TreeSyncher();
+  });
+}  // namespace tree
+}  // namespace xgboost
diff --git a/src/utils/fmap.h b/src/utils/fmap.h
deleted file mode 100644
index cc06b7021..000000000
--- a/src/utils/fmap.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file fmap.h
- * \brief helper class that holds the feature names and interpretations
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_FMAP_H_
-#define XGBOOST_UTILS_FMAP_H_
-
-#include <vector>
-#include <string>
-#include <cstring>
-#include "./utils.h"
-
-namespace xgboost {
-namespace utils {
-/*! \brief helper class that holds the feature names and interpretations */
-class FeatMap {
- public:
-  enum Type {
-    kIndicator = 0,
-    kQuantitive = 1,
-    kInteger = 2,
-    kFloat = 3
-  };
-  // function definitions
-  /*! \brief load feature map from text format */
-  inline void LoadText(const char *fname) {
-    std::FILE *fi = utils::FopenCheck(fname, "r");
-    this->LoadText(fi);
-    std::fclose(fi);
-  }
-  /*! \brief load feature map from text format */
-  inline void LoadText(std::FILE *fi) {
-    int fid;
-    char fname[1256], ftype[1256];
-    while (std::fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3) {
-      this->PushBack(fid, fname, ftype);
-    }
-  }
-  /*!\brief push back feature map */
-  inline void PushBack(int fid, const char *fname, const char *ftype) {
-    utils::Check(fid == static_cast<int>(names_.size()), "invalid fmap format");
-    names_.push_back(std::string(fname));
-    types_.push_back(GetType(ftype));
-  }
-  inline void Clear(void) {
-    names_.clear(); types_.clear();
-  }
-  /*! \brief number of known features */
-  size_t size(void) const {
-    return names_.size();
-  }
-  /*! \brief return name of specific feature */
-  const char* name(size_t idx) const {
-    utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
-    return names_[idx].c_str();
-  }
-  /*! \brief return type of specific feature */
-  const Type& type(size_t idx) const {
-    utils::Assert(idx < names_.size(), "utils::FMap::type feature index exceed bound");
-    return types_[idx];
-  }
-
- private:
-  inline static Type GetType(const char *tname) {
-    using namespace std;
-    if (!strcmp("i", tname)) return kIndicator;
-    if (!strcmp("q", tname)) return kQuantitive;
-    if (!strcmp("int", tname)) return kInteger;
-    if (!strcmp("float", tname)) return kFloat;
-    utils::Error("unknown feature type, use i for indicator and q for quantity");
-    return kIndicator;
-  }
-  /*! \brief name of the feature */
-  std::vector<std::string> names_;
-  /*! \brief type of the feature */
-  std::vector<Type> types_;
-};
-
-}  // namespace utils
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_FMAP_H_
diff --git a/src/utils/io.h b/src/utils/io.h
deleted file mode 100644
index 1fd09310e..000000000
--- a/src/utils/io.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file io.h
- * \brief general stream interface for serialization, I/O
- * \author Tianqi Chen
- */
-
-#ifndef XGBOOST_UTILS_IO_H_
-#define XGBOOST_UTILS_IO_H_
-#include <cstdio>
-#include <vector>
-#include <string>
-#include <cstring>
-#include "./utils.h"
-#include "../sync/sync.h"
-
-namespace xgboost {
-namespace utils {
-// reuse the definitions of streams
-typedef rabit::Stream IStream;
-typedef rabit::utils::SeekStream ISeekStream;
-typedef rabit::utils::MemoryFixSizeBuffer MemoryFixSizeBuffer;
-typedef rabit::utils::MemoryBufferStream MemoryBufferStream;
-
-/*! \brief implementation of file i/o stream */
-class FileStream : public ISeekStream {
- public:
-  explicit FileStream(std::FILE *fp) : fp(fp) {}
-  FileStream(void) {
-    this->fp = NULL;
-  }
-  virtual size_t Read(void *ptr, size_t size) {
-    return std::fread(ptr, size, 1, fp);
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    Check(std::fwrite(ptr, size, 1, fp) == 1, "FileStream::Write: fwrite error!");
-  }
-  virtual void Seek(size_t pos) {
-    std::fseek(fp, static_cast<long>(pos), SEEK_SET); // NOLINT(*)
-  }
-  virtual size_t Tell(void) {
-    return std::ftell(fp);
-  }
-  virtual bool AtEnd(void) const {
-    return std::feof(fp) != 0;
-  }
-  inline void Close(void) {
-    if (fp != NULL) {
-      std::fclose(fp); fp = NULL;
-    }
-  }
-
- private:
-  std::FILE *fp;
-};
-}  // namespace utils
-}  // namespace xgboost
-#include "./base64-inl.h"
-#endif  // XGBOOST_UTILS_IO_H_
diff --git a/src/utils/iterator.h b/src/utils/iterator.h
deleted file mode 100644
index 73068dbbf..000000000
--- a/src/utils/iterator.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file iterator.h
- * \brief itertator interface
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_ITERATOR_H_
-#define XGBOOST_UTILS_ITERATOR_H_
-#include <cstdio>
-
-namespace xgboost {
-namespace utils {
-/*!
- * \brief iterator interface
- * \tparam DType data type
- */
-template<typename DType>
-class IIterator {
- public:
-  /*!
-   * \brief set the parameter
-   * \param name name of parameter
-   * \param val value of parameter
-   */
-  virtual void SetParam(const char *name, const char *val) {}
-  /*! \brief initialize the iterator so that we can use the iterator */
-  virtual void Init(void) {}
-  /*! \brief set before first of the item */
-  virtual void BeforeFirst(void) = 0;
-  /*! \brief move to next item */
-  virtual bool Next(void) = 0;
-  /*! \brief get current data */
-  virtual const DType &Value(void) const = 0;
- public:
-  /*! \brief constructor */
-  virtual ~IIterator(void) {}
-};
-
-}  // namespace utils
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_ITERATOR_H_
-
diff --git a/src/utils/math.h b/src/utils/math.h
deleted file mode 100644
index 7609df076..000000000
--- a/src/utils/math.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file math.h
- * \brief support additional math
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_MATH_H_
-#define XGBOOST_UTILS_MATH_H_
-
-#include <cmath>
-
-namespace xgboost {
-namespace utils {
-#ifdef XGBOOST_STRICT_CXX98_
-// check nan
-bool CheckNAN(double v);
-double LogGamma(double v);
-#else
-template<typename T>
-inline bool CheckNAN(T v) {
-#ifdef _MSC_VER
-  return (_isnan(v) != 0);
-#else
-  return isnan(v);
-#endif
-}
-template<typename T>
-inline T LogGamma(T v) {
-#ifdef _MSC_VER
-#if _MSC_VER >= 1800
-  return lgamma(v);
-#else
-#pragma message("Warning: lgamma function was not available until VS2013"\
-                ", poisson regression will be disabled")
-  utils::Error("lgamma function was not available until VS2013");
-  return static_cast<T>(1.0);
-#endif
-#else
-  return lgamma(v);
-#endif
-}
-#endif
-}  // namespace utils
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_MATH_H_
diff --git a/src/utils/omp.h b/src/utils/omp.h
deleted file mode 100644
index c7a04dc32..000000000
--- a/src/utils/omp.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file omp.h
- * \brief header to handle OpenMP compatibility issues
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_OMP_H_
-#define XGBOOST_UTILS_OMP_H_
-
-#if defined(_OPENMP) && !defined(DISABLE_OPENMP)
-#include <omp.h>
-#else
-#if !defined(DISABLE_OPENMP) && !defined(_MSC_VER)
-// use pragma message instead of warning
-#pragma message("Warning: OpenMP is not available,"\
-                "xgboost will be compiled into single-thread code."\
-                "Use OpenMP-enabled compiler to get benefit of multi-threading")
-#endif
-inline int omp_get_thread_num() { return 0; }
-inline int omp_get_num_threads() { return 1; }
-inline void omp_set_num_threads(int nthread) {}
-inline int omp_get_num_procs() { return 1; }
-#endif
-
-// loop variable used in openmp
-namespace xgboost {
-#ifdef _MSC_VER
-typedef int bst_omp_uint;
-#else
-typedef unsigned bst_omp_uint;
-#endif
-}  // namespace xgboost
-
-#endif  // XGBOOST_UTILS_OMP_H_
diff --git a/src/utils/random.h b/src/utils/random.h
deleted file mode 100644
index 8e3255cf3..000000000
--- a/src/utils/random.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file xgboost_random.h
- * \brief PRNG to support random number generation
- * \author Tianqi Chen: tianqi.tchen@gmail.com
- *
- * Use standard PRNG from stdlib
- */
-#ifndef XGBOOST_UTILS_RANDOM_H_
-#define XGBOOST_UTILS_RANDOM_H_
-
-#include <cmath>
-#include <cstdlib>
-#include <vector>
-#include <algorithm>
-#include "./utils.h"
-
-/*! namespace of PRNG */
-namespace xgboost {
-namespace random {
-#ifndef XGBOOST_CUSTOMIZE_PRNG_
-/*! \brief seed the PRNG */
-inline void Seed(unsigned seed) {
-  srand(seed);
-}
-/*! \brief basic function, uniform */
-inline double Uniform(void) {
-  return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX)+1.0); // NOLINT(*)
-}
-/*! \brief return a real number uniform in (0,1) */
-inline double NextDouble2(void) {
-  return (static_cast<double>(rand()) + 1.0) / (static_cast<double>(RAND_MAX)+2.0); // NOLINT(*)
-}
-/*! \brief return  x~N(0,1) */
-inline double Normal(void) {
-  double x, y, s;
-  do {
-    x = 2 * NextDouble2() - 1.0;
-    y = 2 * NextDouble2() - 1.0;
-    s = x*x + y*y;
-  } while (s >= 1.0 || s == 0.0);
-
-  return x * sqrt(-2.0 * log(s) / s);
-}
-#else
-// include declarations, to be implemented
-void Seed(unsigned seed);
-double Uniform(void);
-double Normal(void);
-#endif
-
-/*! \brief return a real number uniform in [0,1) */
-inline double NextDouble(void) {
-  return Uniform();
-}
-/*! \brief return a random number in n */
-inline uint32_t NextUInt32(uint32_t n) {
-  return (uint32_t)std::floor(NextDouble() * n);
-}
-/*! \brief return  x~N(mu,sigma^2) */
-inline double SampleNormal(double mu, double sigma) {
-  return Normal() * sigma + mu;
-}
-/*! \brief  return 1 with probability p, coin flip */
-inline int SampleBinary(double p) {
-  return NextDouble() < p;
-}
-
-template<typename T>
-inline void Shuffle(T *data, size_t sz) {
-  if (sz == 0) return;
-  for (uint32_t i = (uint32_t)sz - 1; i > 0; i--) {
-    std::swap(data[i], data[NextUInt32(i + 1)]);
-  }
-}
-// random shuffle the data inside, require PRNG
-template<typename T>
-inline void Shuffle(std::vector<T> &data) { // NOLINT(*)
-  Shuffle(&data[0], data.size());
-}
-
-/*! \brief random number generator with independent random number seed*/
-struct Random{
-  /*! \brief set random number seed */
-  inline void Seed(unsigned sd) {
-    this->rseed = sd;
-#if defined(_MSC_VER) || defined(_WIN32)
-    ::xgboost::random::Seed(sd);
-#endif
-  }
-  /*! \brief return a real number uniform in [0,1) */
-  inline double RandDouble(void) {
-    // use rand instead of rand_r in windows, for MSVC it is fine since rand is threadsafe
-    // For cygwin and mingw, this can slows down parallelism,
-    // but rand_r is only used in objective-inl.hpp, won't affect speed in general
-    // todo, replace with another PRNG
-#if defined(_MSC_VER) || defined(_WIN32) || defined(XGBOOST_STRICT_CXX98_)
-    return Uniform();
-#else
-    return static_cast<double>(rand_r(&rseed)) / (static_cast<double>(RAND_MAX) + 1.0);
-#endif
-  }
-  // random number seed
-  unsigned rseed;
-};
-}  // namespace random
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_RANDOM_H_
diff --git a/src/utils/thread.h b/src/utils/thread.h
deleted file mode 100644
index a6e8e7fdc..000000000
--- a/src/utils/thread.h
+++ /dev/null
@@ -1,260 +0,0 @@
-/*!
- * Copyright by Contributors
- * \file thread.h
- * \brief this header include the minimum necessary resource
- * for multi-threading that can be compiled in windows, linux, mac
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_THREAD_H_ // NOLINT(*)
-#define XGBOOST_UTILS_THREAD_H_ // NOLINT(*)
-
-#ifdef _MSC_VER
-#include <windows.h>
-#include <process.h>
-#include "./utils.h"
-namespace xgboost {
-namespace utils {
-/*! \brief simple semaphore used for synchronization */
-class Semaphore {
- public :
-  inline void Init(int init_val) {
-    sem = CreateSemaphore(NULL, init_val, 10, NULL);
-    utils::Check(sem != NULL, "create Semaphore error");
-  }
-  inline void Destroy(void) {
-    CloseHandle(sem);
-  }
-  inline void Wait(void) {
-    utils::Check(WaitForSingleObject(sem, INFINITE) == WAIT_OBJECT_0, "WaitForSingleObject error");
-  }
-  inline void Post(void) {
-    utils::Check(ReleaseSemaphore(sem, 1, NULL) != 0, "ReleaseSemaphore error");
-  }
-
- private:
-  HANDLE sem;
-};
-
-/*! \brief mutex under windows */
-class Mutex {
- public:
-  inline void Init(void) {
-    utils::Check(InitializeCriticalSectionAndSpinCount(&mutex, 0x00000400) != 0,
-                   "Mutex::Init fail");
-  }
-  inline void Lock(void) {
-    EnterCriticalSection(&mutex);
-  }
-  inline void Unlock(void) {
-    LeaveCriticalSection(&mutex);
-  }
-  inline void Destroy(void) {
-    DeleteCriticalSection(&mutex);
-  }
-
- private:
-  friend class ConditionVariable;
-  CRITICAL_SECTION mutex;
-};
-
-// conditional variable that uses pthread
-class ConditionVariable {
- public:
-  // initialize conditional variable
-  inline void Init(void) {
-    InitializeConditionVariable(&cond);
-  }
-  // destroy the thread
-  inline void Destroy(void) {
-    // DeleteConditionVariable(&cond);
-  }
-  // wait on the conditional variable
-  inline void Wait(Mutex *mutex) {
-    utils::Check(SleepConditionVariableCS(&cond, &(mutex->mutex), INFINITE) != 0,
-                 "ConditionVariable:Wait fail");
-  }
-  inline void Broadcast(void) {
-    WakeAllConditionVariable(&cond);
-  }
-  inline void Signal(void) {
-    WakeConditionVariable(&cond);
-  }
-
- private:
-  CONDITION_VARIABLE cond;
-};
-
-/*! \brief simple thread that wraps windows thread */
-class Thread {
- private:
-  HANDLE    thread_handle;
-  unsigned  thread_id;
- public:
-  inline void Start(unsigned int __stdcall entry(void*p), void *param) {
-    thread_handle = (HANDLE)_beginthreadex(NULL, 0, entry, param, 0, &thread_id);
-  }
-  inline int Join(void) {
-    WaitForSingleObject(thread_handle, INFINITE);
-    return 0;
-  }
-};
-/*! \brief exit function called from thread */
-inline void ThreadExit(void *status) {
-  _endthreadex(0);
-}
-#define XGBOOST_THREAD_PREFIX unsigned int __stdcall
-}  // namespace utils
-}  // namespace xgboost
-#else
-// thread interface using g++
-#include <semaphore.h>
-#include <pthread.h>
-#include <errno.h>
-namespace xgboost {
-namespace utils {
-/*!\brief semaphore class */
-class Semaphore {
-  #ifdef __APPLE__
-
- private:
-  sem_t* semPtr;
-  char sema_name[20];
-
- private:
-  inline void GenRandomString(char *s, const int len) {
-    static const char alphanum[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
-    for (int i = 0; i < len; ++i) {
-      s[i] = alphanum[rand() % (sizeof(alphanum) - 1)];
-    }
-    s[len] = 0;
-  }
-
- public:
-  inline void Init(int init_val) {
-    sema_name[0] = '/';
-    sema_name[1] = 's';
-    sema_name[2] = 'e';
-    sema_name[3] = '/';
-    GenRandomString(&sema_name[4], 16);
-    if ((semPtr = sem_open(sema_name, O_CREAT, 0644, init_val)) == SEM_FAILED) {
-      perror("sem_open");
-      exit(1);
-    }
-    utils::Check(semPtr != NULL, "create Semaphore error");
-  }
-  inline void Destroy(void) {
-    if (sem_close(semPtr) == -1) {
-      perror("sem_close");
-      exit(EXIT_FAILURE);
-    }
-    if (sem_unlink(sema_name) == -1) {
-      perror("sem_unlink");
-      exit(EXIT_FAILURE);
-    }
-  }
-  inline void Wait(void) {
-    sem_wait(semPtr);
-  }
-  inline void Post(void) {
-    sem_post(semPtr);
-  }
-  #else
-
- private:
-  sem_t sem;
-
- public:
-  inline void Init(int init_val) {
-    if (sem_init(&sem, 0, init_val) != 0) {
-      utils::Error("Semaphore.Init:%s", strerror(errno));
-    }
-  }
-  inline void Destroy(void) {
-    if (sem_destroy(&sem) != 0) {
-      utils::Error("Semaphore.Destroy:%s", strerror(errno));
-    }
-  }
-  inline void Wait(void) {
-    if (sem_wait(&sem) != 0) {
-      utils::Error("Semaphore.Wait:%s", strerror(errno));
-    }
-  }
-  inline void Post(void) {
-    if (sem_post(&sem) != 0) {
-      utils::Error("Semaphore.Post:%s", strerror(errno));
-    }
-  }
-  #endif
-};
-
-// mutex that works with pthread
-class Mutex {
- public:
-  inline void Init(void) {
-    pthread_mutex_init(&mutex, NULL);
-  }
-  inline void Lock(void) {
-    pthread_mutex_lock(&mutex);
-  }
-  inline void Unlock(void) {
-    pthread_mutex_unlock(&mutex);
-  }
-  inline void Destroy(void) {
-    pthread_mutex_destroy(&mutex);
-  }
-
- private:
-  friend class ConditionVariable;
-  pthread_mutex_t mutex;
-};
-
-// conditional variable that uses pthread
-class ConditionVariable {
- public:
-  // initialize conditional variable
-  inline void Init(void) {
-    pthread_cond_init(&cond, NULL);
-  }
-  // destroy the thread
-  inline void Destroy(void) {
-    pthread_cond_destroy(&cond);
-  }
-  // wait on the conditional variable
-  inline void Wait(Mutex *mutex) {
-    pthread_cond_wait(&cond, &(mutex->mutex));
-  }
-  inline void Broadcast(void) {
-    pthread_cond_broadcast(&cond);
-  }
-  inline void Signal(void) {
-    pthread_cond_signal(&cond);
-  }
-
- private:
-  pthread_cond_t cond;
-};
-
-/*!\brief simple thread class */
-class Thread {
- private:
-  pthread_t thread;
- public :
-  inline void Start(void * entry(void*), void *param) { // NOLINT(*)
-    pthread_attr_t attr;
-    pthread_attr_init(&attr);
-    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-    pthread_create(&thread, &attr, entry, param);
-  }
-  inline int Join(void) {
-    void *status;
-    return pthread_join(thread, &status);
-  }
-};
-inline void ThreadExit(void *status) {
-  pthread_exit(status);
-}
-}  // namespace utils
-}  // namespace xgboost
-#define XGBOOST_THREAD_PREFIX void *
-#endif  // Linux
-#endif  // XGBOOST_UTILS_THREAD_H_  NOLINT(*)
diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h
deleted file mode 100644
index 8acb8ffd0..000000000
--- a/src/utils/thread_buffer.h
+++ /dev/null
@@ -1,257 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file thread_buffer.h
- * \brief  multi-thread buffer, iterator, can be used to create parallel pipeline
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_THREAD_BUFFER_H_
-#define XGBOOST_UTILS_THREAD_BUFFER_H_
-
-#include <vector>
-#include <cstring>
-#include <cstdlib>
-#include "./utils.h"
-// threading util could not run on solaris
-#ifndef XGBOOST_STRICT_CXX98_
-#include "./thread.h"
-#endif
-
-namespace xgboost {
-namespace utils {
-#if !defined(XGBOOST_STRICT_CXX98_)
-/*!
- * \brief buffered loading iterator that uses multithread
- * this template method will assume the following parameters
- * \tparam Elem element type to be buffered
- * \tparam ElemFactory factory type to implement in order to use thread buffer
- */
-template<typename Elem, typename ElemFactory>
-class ThreadBuffer {
- public:
-  /*!\brief constructor */
-  ThreadBuffer(void) {
-    this->init_end = false;
-    this->buf_size = 30;
-  }
-  ~ThreadBuffer(void) {
-    if (init_end) this->Destroy();
-  }
-  /*!\brief set parameter, will also pass the parameter to factory */
-  inline void SetParam(const char *name, const char *val) {
-    using namespace std;
-    if (!strcmp( name, "buffer_size")) buf_size = atoi(val);
-    factory.SetParam(name, val);
-  }
-  /*!
-   * \brief initalize the buffered iterator
-   * \param param a initialize parameter that will pass to factory, ignore it if not necessary
-   * \return false if the initialization can't be done, e.g. buffer file hasn't been created
-   */
-  inline bool Init(void) {
-    if (!factory.Init()) return false;
-    for (int i = 0; i < buf_size; ++i) {
-      bufA.push_back(factory.Create());
-      bufB.push_back(factory.Create());
-    }
-    this->init_end = true;
-    this->StartLoader();
-    return true;
-  }
-  /*!\brief place the iterator before first value */
-  inline void BeforeFirst(void) {
-    // wait till last loader end
-    loading_end.Wait();
-    // critical zone
-    current_buf = 1;
-    factory.BeforeFirst();
-    // reset terminate limit
-    endA = endB = buf_size;
-    // wake up loader for first part
-    loading_need.Post();
-    // wait til first part is loaded
-    loading_end.Wait();
-    // set current buf to right value
-    current_buf = 0;
-    // wake loader for next part
-    data_loaded = false;
-    loading_need.Post();
-    // set buffer value
-    buf_index = 0;
-  }
-  /*! \brief destroy the buffer iterator, will deallocate the buffer */
-  inline void Destroy(void) {
-    // wait until the signal is consumed
-    this->destroy_signal = true;
-    loading_need.Post();
-    loader_thread.Join();
-    loading_need.Destroy();
-    loading_end.Destroy();
-    for (size_t i = 0; i < bufA.size(); ++i) {
-      factory.FreeSpace(bufA[i]);
-    }
-    for (size_t i = 0; i < bufB.size(); ++i) {
-      factory.FreeSpace(bufB[i]);
-    }
-    bufA.clear(); bufB.clear();
-    factory.Destroy();
-    this->init_end = false;
-  }
-  /*!
-   * \brief get the next element needed in buffer
-   * \param elem element to store into
-   * \return whether reaches end of data
-   */
-  inline bool Next(Elem &elem) { // NOLINT(*)
-    // end of buffer try to switch
-    if (buf_index == buf_size) {
-      this->SwitchBuffer();
-      buf_index = 0;
-    }
-    if (buf_index >= (current_buf ? endA : endB)) {
-      return false;
-    }
-    std::vector<Elem> &buf = current_buf ? bufA : bufB;
-    elem = buf[buf_index];
-    ++buf_index;
-    return true;
-  }
-  /*!
-   * \brief get the factory object
-   */
-  inline ElemFactory &get_factory(void) {
-    return factory;
-  }
-  inline const ElemFactory &get_factory(void) const {
-    return factory;
-  }
-  // size of buffer
-  int  buf_size;
-
- private:
-  // factory object used to load configures
-  ElemFactory factory;
-  // index in current buffer
-  int buf_index;
-  // indicate which one is current buffer
-  int current_buf;
-  // max limit of visit, also marks termination
-  int endA, endB;
-  // double buffer, one is accessed by loader
-  // the other is accessed by consumer
-  // buffer of the data
-  std::vector<Elem> bufA, bufB;
-  // initialization end
-  bool init_end;
-  // singal whether the data is loaded
-  bool data_loaded;
-  // signal to kill the thread
-  bool destroy_signal;
-  // thread object
-  Thread loader_thread;
-  // signal of the buffer
-  Semaphore loading_end, loading_need;
-  /*!
-   * \brief slave thread
-   * this implementation is like producer-consumer style
-   */
-  inline void RunLoader(void) {
-    while (!destroy_signal) {
-      // sleep until loading is needed
-      loading_need.Wait();
-      std::vector<Elem> &buf = current_buf ? bufB : bufA;
-      int i;
-      for (i = 0; i < buf_size ; ++i) {
-        if (!factory.LoadNext(buf[i])) {
-          int &end = current_buf ? endB : endA;
-          end = i;  // marks the termination
-          break;
-        }
-      }
-      // signal that loading is done
-      data_loaded = true;
-      loading_end.Post();
-    }
-  }
-  /*!\brief entry point of loader thread */
-  inline static XGBOOST_THREAD_PREFIX LoaderEntry(void *pthread) {
-    static_cast< ThreadBuffer<Elem, ElemFactory>* >(pthread)->RunLoader();
-    return NULL;
-  }
-  /*!\brief start loader thread */
-  inline void StartLoader(void) {
-    destroy_signal = false;
-    // set param
-    current_buf = 1;
-    loading_need.Init(1);
-    loading_end .Init(0);
-    // reset terminate limit
-    endA = endB = buf_size;
-    loader_thread.Start(LoaderEntry, this);
-    // wait until first part of data is loaded
-    loading_end.Wait();
-    // set current buf to right value
-    current_buf = 0;
-    // wake loader for next part
-    data_loaded = false;
-    loading_need.Post();
-    buf_index = 0;
-  }
-  /*!\brief switch double buffer */
-  inline void SwitchBuffer(void) {
-    loading_end.Wait();
-    // loader shall be sleep now, critcal zone!
-    current_buf = !current_buf;
-    // wake up loader
-    data_loaded = false;
-    loading_need.Post();
-  }
-};
-#else
-// a dummy single threaded ThreadBuffer
-// use this to resolve R's solaris compatibility for now
-template<typename Elem, typename ElemFactory>
-class ThreadBuffer {
- public:
-  ThreadBuffer() : init_end_(false) {}
-  ~ThreadBuffer() {
-    if (init_end_) {
-      factory_.FreeSpace(data_);
-      factory_.Destroy();
-    }
-  }
-  inline void SetParam(const char *name, const char *val) {
-  }
-  inline bool Init(void) {
-    if (!factory_.Init()) return false;
-    data_ = factory_.Create();
-    return (init_end_ = true);
-  }
-  inline void BeforeFirst(void) {
-    factory_.BeforeFirst();
-  }
-  inline bool Next(Elem &elem) { // NOLINT(*)
-    if (factory_.LoadNext(data_)) {
-      elem = data_; return true;
-    } else {
-      return false;
-    }
-  }
-  inline ElemFactory &get_factory() {
-    return factory_;
-  }
-  inline const ElemFactory &get_factory() const {
-    return factory_;
-  }
-
- private:
-  // initialized
-  bool init_end_;
-  // current data
-  Elem data_;
-  // factory object used to load configures
-  ElemFactory factory_;
-};
-#endif  // !defined(XGBOOST_STRICT_CXX98_)
-}  // namespace utils
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_THREAD_BUFFER_H_
diff --git a/src/utils/utils.h b/src/utils/utils.h
deleted file mode 100644
index 4d06d3c61..000000000
--- a/src/utils/utils.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file utils.h
- * \brief simple utils to support the code
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_UTILS_H_
-#define XGBOOST_UTILS_UTILS_H_
-
-#define _CRT_SECURE_NO_WARNINGS
-#include <cstdio>
-#include <string>
-#include <cstdlib>
-#include <vector>
-#include <stdexcept>
-
-#ifndef XGBOOST_STRICT_CXX98_
-#include <cstdarg>
-#endif
-
-#if !defined(__GNUC__)
-#define fopen64 std::fopen
-#endif
-#ifdef _MSC_VER
-// NOTE: sprintf_s is not equivalent to snprintf,
-// they are equivalent when success, which is sufficient for our case
-#define snprintf sprintf_s
-#define vsnprintf vsprintf_s
-#else
-#ifdef _FILE_OFFSET_BITS
-#if _FILE_OFFSET_BITS == 32
-#pragma message("Warning: FILE OFFSET BITS defined to be 32 bit")
-#endif
-#endif
-
-#ifdef __APPLE__
-#define off64_t off_t
-#define fopen64 std::fopen
-#endif
-
-extern "C" {
-#include <sys/types.h>
-}
-#endif
-
-#ifdef _MSC_VER
-typedef unsigned char uint8_t;
-typedef unsigned __int16 uint16_t;
-typedef unsigned __int32 uint32_t;
-typedef unsigned __int64 uint64_t;
-typedef __int64 int64_t;
-#else
-#include <inttypes.h>
-#endif
-
-namespace xgboost {
-/*! \brief namespace for helper utils of the project */
-namespace utils {
-
-/*! \brief error message buffer length */
-const int kPrintBuffer = 1 << 12;
-
-#ifndef XGBOOST_CUSTOMIZE_MSG_
-/*!
- * \brief handling of Assert error, caused by inappropriate input
- * \param msg error message
- */
-inline void HandleAssertError(const char *msg) {
-  fprintf(stderr, "AssertError:%s\n", msg);
-  exit(-1);
-}
-/*!
- * \brief handling of Check error, caused by inappropriate input
- * \param msg error message
- */
-inline void HandleCheckError(const char *msg) {
-  throw std::runtime_error(msg);
-}
-inline void HandlePrint(const char *msg) {
-  printf("%s", msg);
-}
-#else
-#ifndef XGBOOST_STRICT_CXX98_
-// include declarations, some one must implement this
-void HandleAssertError(const char *msg);
-void HandleCheckError(const char *msg);
-void HandlePrint(const char *msg);
-#endif
-#endif
-#ifdef XGBOOST_STRICT_CXX98_
-// these function pointers are to be assigned
-extern "C" void (*Printf)(const char *fmt, ...);
-extern "C" int (*SPrintf)(char *buf, size_t size, const char *fmt, ...);
-extern "C" void (*Assert)(int exp, const char *fmt, ...);
-extern "C" void (*Check)(int exp, const char *fmt, ...);
-extern "C" void (*Error)(const char *fmt, ...);
-#else
-/*! \brief printf, print message to the console */
-inline void Printf(const char *fmt, ...) {
-  std::string msg(kPrintBuffer, '\0');
-  va_list args;
-  va_start(args, fmt);
-  vsnprintf(&msg[0], kPrintBuffer, fmt, args);
-  va_end(args);
-  HandlePrint(msg.c_str());
-}
-/*! \brief portable version of snprintf */
-inline int SPrintf(char *buf, size_t size, const char *fmt, ...) {
-  va_list args;
-  va_start(args, fmt);
-  int ret = vsnprintf(buf, size, fmt, args);
-  va_end(args);
-  return ret;
-}
-
-/*! \brief assert an condition is true, use this to handle debug information */
-inline void Assert(bool exp, const char *fmt, ...) {
-  if (!exp) {
-    std::string msg(kPrintBuffer, '\0');
-    va_list args;
-    va_start(args, fmt);
-    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
-    va_end(args);
-    HandleAssertError(msg.c_str());
-  }
-}
-
-/*!\brief same as assert, but this is intended to be used as message for user*/
-inline void Check(bool exp, const char *fmt, ...) {
-  if (!exp) {
-    std::string msg(kPrintBuffer, '\0');
-    va_list args;
-    va_start(args, fmt);
-    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
-    va_end(args);
-    HandleCheckError(msg.c_str());
-  }
-}
-
-/*! \brief report error message, same as check */
-inline void Error(const char *fmt, ...) {
-  {
-    std::string msg(kPrintBuffer, '\0');
-    va_list args;
-    va_start(args, fmt);
-    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
-    va_end(args);
-    HandleCheckError(msg.c_str());
-  }
-}
-#endif
-
-/*! \brief replace fopen, report error when the file open fails */
-inline std::FILE *FopenCheck(const char *fname, const char *flag) {
-  std::FILE *fp = fopen64(fname, flag);
-  Check(fp != NULL, "can not open file \"%s\"\n", fname);
-  return fp;
-}
-}  // namespace utils
-// easy utils that can be directly accessed in xgboost
-/*! \brief get the beginning address of a vector */
-template<typename T>
-inline T *BeginPtr(std::vector<T> &vec) { // NOLINT(*)
-  if (vec.size() == 0) {
-    return NULL;
-  } else {
-    return &vec[0];
-  }
-}
-/*! \brief get the beginning address of a vector */
-template<typename T>
-inline const T *BeginPtr(const std::vector<T> &vec) {
-  if (vec.size() == 0) {
-    return NULL;
-  } else {
-    return &vec[0];
-  }
-}
-inline char* BeginPtr(std::string &str) { // NOLINT(*)
-  if (str.length() == 0) return NULL;
-  return &str[0];
-}
-inline const char* BeginPtr(const std::string &str) {
-  if (str.length() == 0) return NULL;
-  return &str[0];
-}
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_UTILS_H_
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
deleted file mode 100644
index 773001503..000000000
--- a/src/xgboost_main.cpp
+++ /dev/null
@@ -1,335 +0,0 @@
-// Copyright 2014 by Contributors
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <ctime>
-#include <string>
-#include <cstring>
-#include <vector>
-#include "./sync/sync.h"
-#include "./io/io.h"
-#include "./utils/utils.h"
-#include "./utils/config.h"
-#include "./learner/learner-inl.hpp"
-
-namespace xgboost {
-/*!
- * \brief wrapping the training process
- */
-class BoostLearnTask {
- public:
-  inline int Run(int argc, char *argv[]) {
-    if (argc < 2) {
-      printf("Usage: <config>\n");
-      return 0;
-    }
-    utils::ConfigIterator itr(argv[1]);
-    while (itr.Next()) {
-      this->SetParam(itr.name(), itr.val());
-    }
-    for (int i = 2; i < argc; ++i) {
-      char name[256], val[256];
-      if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
-        this->SetParam(name, val);
-      }
-    }
-    // do not save anything when save to stdout
-    if (model_out == "stdout" || name_pred == "stdout") {
-      this->SetParam("silent", "1");
-      save_period = 0;
-    }
-    // initialized the result
-    rabit::Init(argc, argv);
-    if (rabit::IsDistributed()) {
-      std::string pname = rabit::GetProcessorName();
-      fprintf(stderr, "start %s:%d\n", pname.c_str(), rabit::GetRank());
-    }
-    if (rabit::IsDistributed() && data_split == "NONE") {
-      this->SetParam("dsplit", "row");
-    }
-    if (rabit::GetRank() != 0) {
-      this->SetParam("silent", "2");
-    }
-    this->InitData();
-
-    if (task == "train") {
-      // if task is training, will try recover from checkpoint
-      this->TaskTrain();
-      return 0;
-    } else {
-      this->InitLearner();
-    }
-    if (task == "dump") {
-      this->TaskDump(); return 0;
-    }
-    if (task == "eval") {
-      this->TaskEval(); return 0;
-    }
-    if (task == "pred") {
-      this->TaskPred();
-    }
-    return 0;
-  }
-  inline void SetParam(const char *name, const char *val) {
-    if (!strcmp("silent", name)) silent = atoi(val);
-    if (!strcmp("use_buffer", name)) use_buffer = atoi(val);
-    if (!strcmp("num_round", name)) num_round = atoi(val);
-    if (!strcmp("pred_margin", name)) pred_margin = atoi(val);
-    if (!strcmp("ntree_limit", name)) ntree_limit = atoi(val);
-    if (!strcmp("save_period", name)) save_period = atoi(val);
-    if (!strcmp("eval_train", name)) eval_train = atoi(val);
-    if (!strcmp("task", name)) task = val;
-    if (!strcmp("data", name)) train_path = val;
-    if (!strcmp("test:data", name)) test_path = val;
-    if (!strcmp("model_in", name)) model_in = val;
-    if (!strcmp("model_out", name)) model_out = val;
-    if (!strcmp("model_dir", name)) model_dir_path = val;
-    if (!strcmp("fmap", name)) name_fmap = val;
-    if (!strcmp("name_dump", name)) name_dump = val;
-    if (!strcmp("name_pred", name)) name_pred = val;
-    if (!strcmp("dsplit", name)) data_split = val;
-    if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
-    if (!strcmp("save_pbuffer", name)) save_with_pbuffer = atoi(val);
-    if (!strncmp("eval[", name, 5)) {
-      char evname[256];
-      utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1,
-                    "must specify evaluation name for display");
-      eval_data_names.push_back(std::string(evname));
-      eval_data_paths.push_back(std::string(val));
-    }
-    learner.SetParam(name, val);
-  }
-
- public:
-  BoostLearnTask(void) {
-    // default parameters
-    silent = 0;
-    use_buffer = 1;
-    num_round = 10;
-    save_period = 0;
-    eval_train = 0;
-    pred_margin = 0;
-    ntree_limit = 0;
-    dump_model_stats = 0;
-    task = "train";
-    model_in = "NULL";
-    model_out = "NULL";
-    name_fmap = "NULL";
-    name_pred = "pred.txt";
-    name_dump = "dump.txt";
-    model_dir_path = "./";
-    data_split = "NONE";
-    load_part = 0;
-    save_with_pbuffer = 0;
-    data = NULL;
-  }
-  ~BoostLearnTask(void) {
-    for (size_t i = 0; i < deval.size(); i++) {
-      delete deval[i];
-    }
-    if (data != NULL) delete data;
-  }
-
- private:
-  inline void InitData(void) {
-    if (strchr(train_path.c_str(), '%') != NULL) {
-      char s_tmp[256];
-      utils::SPrintf(s_tmp, sizeof(s_tmp), train_path.c_str(), rabit::GetRank());
-      train_path = s_tmp;
-      load_part = 1;
-    }
-    bool loadsplit = data_split == "row";
-    if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str());
-    if (task == "dump") return;
-    if (task == "pred") {
-      data = io::LoadDataMatrix(test_path.c_str(), silent != 0, use_buffer != 0, loadsplit);
-    } else {
-      // training
-      data = io::LoadDataMatrix(train_path.c_str(),
-                                silent != 0 && load_part == 0,
-                                use_buffer != 0, loadsplit);
-      utils::Assert(eval_data_names.size() == eval_data_paths.size(), "BUG");
-      for (size_t i = 0; i < eval_data_names.size(); ++i) {
-        deval.push_back(io::LoadDataMatrix(eval_data_paths[i].c_str(),
-                                           silent != 0,
-                                           use_buffer != 0,
-                                           loadsplit));
-        devalall.push_back(deval.back());
-      }
-
-      std::vector<io::DataMatrix *> dcache(1, data);
-      for (size_t i = 0; i < deval.size(); ++i) {
-        dcache.push_back(deval[i]);
-      }
-      // set cache data to be all training and evaluation data
-      learner.SetCacheData(dcache);
-
-      // add training set to evaluation set if needed
-      if (eval_train != 0) {
-        devalall.push_back(data);
-        eval_data_names.push_back(std::string("train"));
-      }
-    }
-  }
-  inline void InitLearner(void) {
-    if (model_in != "NULL") {
-      learner.LoadModel(model_in.c_str());
-    } else {
-      utils::Assert(task == "train", "model_in not specified");
-      learner.InitModel();
-    }
-  }
-  inline void TaskTrain(void) {
-    int version = rabit::LoadCheckPoint(&learner);
-    if (version == 0) this->InitLearner();
-    const time_t start = time(NULL);
-    unsigned long elapsed = 0;  // NOLINT(*)
-    learner.CheckInit(data);
-
-    bool allow_lazy = learner.AllowLazyCheckPoint();
-    for (int i = version / 2; i < num_round; ++i) {
-      elapsed = (unsigned long)(time(NULL) - start);  // NOLINT(*)
-      if (version % 2 == 0) {
-        if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
-        learner.UpdateOneIter(i, *data);
-        if (allow_lazy) {
-          rabit::LazyCheckPoint(&learner);
-        } else {
-          rabit::CheckPoint(&learner);
-        }
-        version += 1;
-      }
-      utils::Assert(version == rabit::VersionNumber(), "consistent check");
-      std::string res = learner.EvalOneIter(i, devalall, eval_data_names);
-      if (rabit::IsDistributed()) {
-        if (rabit::GetRank() == 0) {
-          rabit::TrackerPrintf("%s\n", res.c_str());
-        }
-      } else {
-        if (silent < 2) {
-          fprintf(stderr, "%s\n", res.c_str());
-        }
-      }
-      if (save_period != 0 && (i + 1) % save_period == 0) {
-        this->SaveModel(i);
-      }
-      if (allow_lazy) {
-        rabit::LazyCheckPoint(&learner);
-      } else {
-        rabit::CheckPoint(&learner);
-      }
-      version += 1;
-      utils::Assert(version == rabit::VersionNumber(), "consistent check");
-      elapsed = (unsigned long)(time(NULL) - start);  // NOLINT(*)
-    }
-    // always save final round
-    if ((save_period == 0 || num_round % save_period != 0) && model_out != "NONE") {
-      if (model_out == "NULL") {
-        this->SaveModel(num_round - 1);
-      } else {
-        this->SaveModel(model_out.c_str());
-      }
-    }
-    if (!silent) {
-      printf("\nupdating end, %lu sec in all\n", elapsed);
-    }
-  }
-  inline void TaskEval(void) {
-    learner.EvalOneIter(0, devalall, eval_data_names);
-  }
-  inline void TaskDump(void) {
-    FILE *fo = utils::FopenCheck(name_dump.c_str(), "w");
-    std::vector<std::string> dump = learner.DumpModel(fmap, dump_model_stats != 0);
-    for (size_t i = 0; i < dump.size(); ++i) {
-      fprintf(fo, "booster[%lu]:\n", i);
-      fprintf(fo, "%s", dump[i].c_str());
-    }
-    fclose(fo);
-  }
-  inline void SaveModel(const char *fname) const {
-    if (rabit::GetRank() != 0) return;
-    learner.SaveModel(fname, save_with_pbuffer != 0);
-  }
-  inline void SaveModel(int i) const {
-    char fname[256];
-    utils::SPrintf(fname, sizeof(fname),
-                   "%s/%04d.model", model_dir_path.c_str(), i + 1);
-    this->SaveModel(fname);
-  }
-  inline void TaskPred(void) {
-    std::vector<float> preds;
-    if (!silent) printf("start prediction...\n");
-    learner.Predict(*data, pred_margin != 0, &preds, ntree_limit);
-    if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
-    FILE *fo;
-    if (name_pred != "stdout") {
-      fo = utils::FopenCheck(name_pred.c_str(), "w");
-    } else {
-      fo = stdout;
-    }
-    for (size_t i = 0; i < preds.size(); ++i) {
-      fprintf(fo, "%g\n", preds[i]);
-    }
-    if (fo != stdout) fclose(fo);
-  }
-
- private:
-  /*! \brief whether silent */
-  int silent;
-  /*! \brief special load */
-  int load_part;
-  /*! \brief whether use auto binary buffer */
-  int use_buffer;
-  /*! \brief whether evaluate training statistics */
-  int eval_train;
-  /*! \brief number of boosting iterations */
-  int num_round;
-  /*! \brief the period to save the model, 0 means only save the final round model */
-  int save_period;
-  /*! \brief the path of training/test data set */
-  std::string train_path, test_path;
-  /*! \brief the path of test model file, or file to restart training */
-  std::string model_in;
-  /*! \brief the path of final model file, to be saved */
-  std::string model_out;
-  /*! \brief the path of directory containing the saved models */
-  std::string model_dir_path;
-  /*! \brief task to perform */
-  std::string task;
-  /*! \brief name of predict file */
-  std::string name_pred;
-  /*! \brief data split mode */
-  std::string data_split;
-  /*!\brief limit number of trees in prediction */
-  int ntree_limit;
-  /*!\brief whether to directly output margin value */
-  int pred_margin;
-  /*! \brief whether dump statistics along with model */
-  int dump_model_stats;
-  /*! \brief whether save prediction buffer */
-  int save_with_pbuffer;
-  /*! \brief name of feature map */
-  std::string name_fmap;
-  /*! \brief name of dump file */
-  std::string name_dump;
-  /*! \brief the paths of validation data sets */
-  std::vector<std::string> eval_data_paths;
-  /*! \brief the names of the evaluation data used in output log */
-  std::vector<std::string> eval_data_names;
-
- private:
-  io::DataMatrix* data;
-  std::vector<io::DataMatrix*> deval;
-  std::vector<const io::DataMatrix*> devalall;
-  utils::FeatMap fmap;
-  learner::BoostLearner learner;
-};
-}  // namespace xgboost
-
-int main(int argc, char *argv[]) {
-  xgboost::BoostLearnTask tsk;
-  tsk.SetParam("seed", "0");
-  int ret = tsk.Run(argc, argv);
-  rabit::Finalize();
-  return ret;
-}
diff --git a/subtree/README.md b/subtree/README.md
deleted file mode 100644
index 9c3df6609..000000000
--- a/subtree/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-This folder contains git subtree projects of xgboost.
-Do not make changes to the subtree projects in xgboost,
-push changes to the original project instead and changes will be pulled back to this folder
-
-* rabit: https://github.com/tqchen/rabit
diff --git a/subtree/rabit/.gitignore b/subtree/rabit/.gitignore
deleted file mode 100644
index 121caaafe..000000000
--- a/subtree/rabit/.gitignore
+++ /dev/null
@@ -1,39 +0,0 @@
-# Compiled Object files
-*.slo
-*.lo
-*.o
-*.obj
-
-# Precompiled Headers
-*.gch
-*.pch
-*.lnk
-# Compiled Dynamic libraries
-*.so
-*.dylib
-*.dll
-
-# Fortran module files
-*.mod
-
-# Compiled Static libraries
-*.lai
-*.la
-*.a
-*.lib
-
-# Executables
-*.exe
-*.out
-*.app
-*~
-*.pyc
-*.mpi
-*.exe
-*.txt
-*tmp*
-*.rabit
-*.mock
-dmlc-core
-recommonmark
-recom
diff --git a/subtree/rabit/.travis.yml b/subtree/rabit/.travis.yml
deleted file mode 100644
index 339f5c692..000000000
--- a/subtree/rabit/.travis.yml
+++ /dev/null
@@ -1,51 +0,0 @@
-# disable sudo to use container based build
-sudo: false
-
-# Use Build Matrix to do lint and build seperately
-env:
-  matrix:
-    - TASK=lint LINT_LANG=cpp
-    - TASK=lint LINT_LANG=python
-    - TASK=doc
-    - TASK=build CXX=g++
-    - TASK=test CXX=g++
-
-# dependent apt packages
-addons:
-  apt:
-    packages:
-      - doxygen
-      - libopenmpi-dev
-      - wget
-      - git
-      - libcurl4-openssl-dev
-      - unzip
-      - python-numpy
-      
-before_install:
-  - git clone https://github.com/dmlc/dmlc-core
-  - export TRAVIS=dmlc-core/scripts/travis/
-  - source ${TRAVIS}/travis_setup_env.sh
-
-install:
-  - pip install cpplint pylint --user `whoami`
-
-script: scripts/travis_script.sh
-
-
-before_cache:
-  - ${TRAVIS}/travis_before_cache.sh
-
-
-cache:
-  directories:
-    - ${HOME}/.cache/usr
-
-
-notifications:
-# Emails are sent to the committer's git-configured email address by default,
-  email:
-    on_success: change
-    on_failure: always
-
-
diff --git a/subtree/rabit/LICENSE b/subtree/rabit/LICENSE
deleted file mode 100644
index 2485f4eaa..000000000
--- a/subtree/rabit/LICENSE
+++ /dev/null
@@ -1,28 +0,0 @@
-Copyright (c) 2014 by Contributors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-
-* Neither the name of rabit nor the names of its
-  contributors may be used to endorse or promote products derived from
-  this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
diff --git a/subtree/rabit/Makefile b/subtree/rabit/Makefile
deleted file mode 100644
index 8c9d9f403..000000000
--- a/subtree/rabit/Makefile
+++ /dev/null
@@ -1,76 +0,0 @@
-ifndef CXX
-export CXX = g++
-endif
-export MPICXX = mpicxx
-export LDFLAGS= -Llib -lrt
-export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++0x
-export CFLAGS = -O3 -msse2 $(WARNFLAGS)
-
-ifndef WITH_FPIC
-	WITH_FPIC = 1
-endif
-ifeq ($(WITH_FPIC), 1)
-	CFLAGS += -fPIC
-endif
-
-ifndef LINT_LANG
-	LINT_LANG="all"
-endif
-
-# build path
-BPATH=.
-# objectives that makes up rabit library
-MPIOBJ= $(BPATH)/engine_mpi.o
-OBJ= $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(BPATH)/engine_empty.o $(BPATH)/engine_mock.o\
-	$(BPATH)/rabit_wrapper.o $(BPATH)/engine_base.o
-SLIB= wrapper/librabit_wrapper.so wrapper/librabit_wrapper_mock.so wrapper/librabit_wrapper_mpi.so
-ALIB= lib/librabit.a lib/librabit_mpi.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a
-HEADERS=src/*.h include/*.h include/rabit/*.h
-DMLC=dmlc-core
-
-.PHONY: clean all install mpi python lint doc doxygen
-
-all: lib/librabit.a lib/librabit_mock.a  wrapper/librabit_wrapper.so wrapper/librabit_wrapper_mock.so lib/librabit_base.a
-mpi: lib/librabit_mpi.a wrapper/librabit_wrapper_mpi.so
-python: wrapper/librabit_wrapper.so wrapper/librabit_wrapper_mock.so
-
-$(BPATH)/allreduce_base.o: src/allreduce_base.cc $(HEADERS)
-$(BPATH)/engine.o: src/engine.cc $(HEADERS)
-$(BPATH)/allreduce_robust.o: src/allreduce_robust.cc $(HEADERS)
-$(BPATH)/engine_mpi.o: src/engine_mpi.cc $(HEADERS)
-$(BPATH)/engine_empty.o: src/engine_empty.cc $(HEADERS)
-$(BPATH)/engine_mock.o: src/engine_mock.cc $(HEADERS)
-$(BPATH)/engine_base.o: src/engine_base.cc $(HEADERS)
-
-lib/librabit.a: $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o
-lib/librabit_base.a: $(BPATH)/allreduce_base.o $(BPATH)/engine_base.o
-lib/librabit_mock.a: $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine_mock.o
-lib/librabit_empty.a: $(BPATH)/engine_empty.o
-lib/librabit_mpi.a: $(MPIOBJ)
-# wrapper code
-$(BPATH)/rabit_wrapper.o: wrapper/rabit_wrapper.cc
-wrapper/librabit_wrapper.so: $(BPATH)/rabit_wrapper.o lib/librabit.a
-wrapper/librabit_wrapper_mock.so: $(BPATH)/rabit_wrapper.o lib/librabit_mock.a
-wrapper/librabit_wrapper_mpi.so: $(BPATH)/rabit_wrapper.o lib/librabit_mpi.a
-
-$(OBJ) :
-	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
-
-$(MPIOBJ) :
-	$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
-
-$(ALIB):
-	ar cr $@ $+
-
-$(SLIB) :
-	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
-
-lint:
-	$(DMLC)/scripts/lint.py rabit $(LINT_LANG) src include wrapper
-
-doc doxygen:
-	cd include; doxygen ../doc/Doxyfile; cd -
-
-clean:
-	$(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~ wrapper/*~
-
diff --git a/subtree/rabit/README.md b/subtree/rabit/README.md
deleted file mode 100644
index 9302a2199..000000000
--- a/subtree/rabit/README.md
+++ /dev/null
@@ -1,39 +0,0 @@
-## rabit: Reliable Allreduce and Broadcast Interface
-[![Build Status](https://travis-ci.org/dmlc/rabit.svg?branch=master)](https://travis-ci.org/dmlc/rabit)
-[![Documentation Status](https://readthedocs.org/projects/rabit/badge/?version=latest)](http://rabit.readthedocs.org/)
-
-rabit is a light weight library that provides a fault tolerant interface of Allreduce and Broadcast. It is designed to support easy implementations of distributed machine learning programs, many of which fall naturally under the Allreduce abstraction. The goal of rabit is to support ***portable*** , ***scalable*** and ***reliable*** distributed machine learning programs.
-
-* [Tutorial](guide)
-* [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc)
-* You can also directly read the [interface header](include/rabit.h)
-* [Distributed Machine Learning Tools](https://github.com/dmlc/wormhole)
-  - Rabit is one of the backbone library to support wormhole machine learning tools
-
-Features
-====
-All these features comes from the facts about small rabbit:)
-* Portable: rabit is light weight and runs everywhere
-  - Rabit is a library instead of a framework, a program only needs to link the library to run
-  - Rabit only replies on a mechanism to start program, which was provided by most framework
-  - You can run rabit programs on many platforms, including Yarn(Hadoop), MPI using the same code
-* Scalable and Flexible: rabit runs fast
-  * Rabit program use Allreduce to communicate, and do not suffer the cost between iterations of MapReduce abstraction.
-  - Programs can call rabit functions in any order, as opposed to frameworks where callbacks are offered and called by the framework, i.e. inversion of control principle.
-  - Programs persist over all the iterations, unless they fail and recover.
-* Reliable: rabit dig burrows to avoid disasters
-  - Rabit programs can recover the model and results using synchronous function calls.
-
-Use Rabit
-====
-* Type make in the root folder will compile the rabit library in lib folder
-* Add lib to the library path and include to the include path of compiler
-* Languages: You can use rabit in C++ and python
-  - It is also possible to port the library to other languages
-
-Contributing
-====
-Rabit is an open-source library, contributions are welcomed, including:
-* The rabit core library.
-* Customized tracker script for new platforms and interface of new languages.
-* Tutorial and examples about the library.
diff --git a/subtree/rabit/doc/.gitignore b/subtree/rabit/doc/.gitignore
deleted file mode 100644
index 95f88be43..000000000
--- a/subtree/rabit/doc/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-html
-latex
-*.sh
-_*
-doxygen
diff --git a/subtree/rabit/doc/Doxyfile b/subtree/rabit/doc/Doxyfile
deleted file mode 100644
index 2c9c64ea7..000000000
--- a/subtree/rabit/doc/Doxyfile
+++ /dev/null
@@ -1,287 +0,0 @@
-# Doxyfile 1.7.6.1
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-DOXYFILE_ENCODING      = UTF-8
-PROJECT_NAME           = "rabit"
-PROJECT_NUMBER         =
-PROJECT_BRIEF          =
-PROJECT_LOGO           =
-OUTPUT_DIRECTORY       = ../doc/doxygen
-CREATE_SUBDIRS         = NO
-OUTPUT_LANGUAGE        = English
-BRIEF_MEMBER_DESC      = YES
-REPEAT_BRIEF           = YES
-ABBREVIATE_BRIEF       =
-ALWAYS_DETAILED_SEC    = NO
-INLINE_INHERITED_MEMB  = NO
-FULL_PATH_NAMES        = YES
-STRIP_FROM_PATH        =
-STRIP_FROM_INC_PATH    =
-SHORT_NAMES            = NO
-JAVADOC_AUTOBRIEF      = NO
-QT_AUTOBRIEF           = NO
-MULTILINE_CPP_IS_BRIEF = NO
-INHERIT_DOCS           = YES
-SEPARATE_MEMBER_PAGES  = NO
-TAB_SIZE               = 8
-ALIASES                =
-TCL_SUBST              =
-OPTIMIZE_OUTPUT_FOR_C  = YES
-OPTIMIZE_OUTPUT_JAVA   = NO
-OPTIMIZE_FOR_FORTRAN   = NO
-OPTIMIZE_OUTPUT_VHDL   = NO
-EXTENSION_MAPPING      =
-BUILTIN_STL_SUPPORT    = NO
-CPP_CLI_SUPPORT        = NO
-SIP_SUPPORT            = NO
-IDL_PROPERTY_SUPPORT   = YES
-DISTRIBUTE_GROUP_DOC   = NO
-SUBGROUPING            = YES
-INLINE_GROUPED_CLASSES = NO
-INLINE_SIMPLE_STRUCTS  = NO
-TYPEDEF_HIDES_STRUCT   = NO
-SYMBOL_CACHE_SIZE      = 0
-LOOKUP_CACHE_SIZE      = 0
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-EXTRACT_ALL            = NO
-EXTRACT_PRIVATE        = NO
-EXTRACT_STATIC         = NO
-EXTRACT_LOCAL_CLASSES  = YES
-EXTRACT_LOCAL_METHODS  = NO
-EXTRACT_ANON_NSPACES   = NO
-HIDE_UNDOC_MEMBERS     = NO
-HIDE_UNDOC_CLASSES     = YES
-HIDE_FRIEND_COMPOUNDS  = NO
-HIDE_IN_BODY_DOCS      = NO
-INTERNAL_DOCS          = NO
-CASE_SENSE_NAMES       = YES
-HIDE_SCOPE_NAMES       = NO
-SHOW_INCLUDE_FILES     = YES
-FORCE_LOCAL_INCLUDES   = NO
-INLINE_INFO            = YES
-SORT_MEMBER_DOCS       = YES
-SORT_BRIEF_DOCS        = NO
-SORT_MEMBERS_CTORS_1ST = NO
-SORT_GROUP_NAMES       = NO
-SORT_BY_SCOPE_NAME     = NO
-STRICT_PROTO_MATCHING  = NO
-GENERATE_TODOLIST      = YES
-GENERATE_TESTLIST      = YES
-GENERATE_BUGLIST       = YES
-GENERATE_DEPRECATEDLIST= YES
-ENABLED_SECTIONS       =
-MAX_INITIALIZER_LINES  = 30
-SHOW_USED_FILES        = YES
-SHOW_DIRECTORIES       = NO
-SHOW_FILES             = YES
-SHOW_NAMESPACES        = YES
-FILE_VERSION_FILTER    =
-LAYOUT_FILE            =
-CITE_BIB_FILES         =
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-QUIET                  = NO
-WARNINGS               = YES
-WARN_IF_UNDOCUMENTED   = YES
-WARN_IF_DOC_ERROR      = YES
-WARN_NO_PARAMDOC       = YES
-WARN_FORMAT            = "$file:$line: $text"
-WARN_LOGFILE           =
-#---------------------------------------------------------------------------
-# configuration options related to the input files
-#---------------------------------------------------------------------------
-INPUT                  = . dmlc
-INPUT_ENCODING         = UTF-8
-FILE_PATTERNS          =
-RECURSIVE              = NO
-EXCLUDE                =
-EXCLUDE_SYMLINKS       = NO
-EXCLUDE_PATTERNS       = *-inl.hpp
-EXCLUDE_SYMBOLS        =
-EXAMPLE_PATH           =
-EXAMPLE_PATTERNS       =
-EXAMPLE_RECURSIVE      = NO
-IMAGE_PATH             =
-INPUT_FILTER           =
-FILTER_PATTERNS        =
-FILTER_SOURCE_FILES    = NO
-FILTER_SOURCE_PATTERNS =
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-SOURCE_BROWSER         = NO
-INLINE_SOURCES         = NO
-STRIP_CODE_COMMENTS    = YES
-REFERENCED_BY_RELATION = NO
-REFERENCES_RELATION    = NO
-REFERENCES_LINK_SOURCE = YES
-USE_HTAGS              = NO
-VERBATIM_HEADERS       = YES
-#---------------------------------------------------------------------------
-# configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-ALPHABETICAL_INDEX     = YES
-COLS_IN_ALPHA_INDEX    = 5
-IGNORE_PREFIX          =
-#---------------------------------------------------------------------------
-# configuration options related to the HTML output
-#---------------------------------------------------------------------------
-GENERATE_HTML          = YES
-HTML_OUTPUT            = html
-HTML_FILE_EXTENSION    = .html
-HTML_HEADER            =
-HTML_FOOTER            =
-HTML_STYLESHEET        =
-HTML_EXTRA_FILES       =
-HTML_COLORSTYLE_HUE    = 220
-HTML_COLORSTYLE_SAT    = 100
-HTML_COLORSTYLE_GAMMA  = 80
-HTML_TIMESTAMP         = YES
-HTML_ALIGN_MEMBERS     = YES
-HTML_DYNAMIC_SECTIONS  = NO
-GENERATE_DOCSET        = NO
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-DOCSET_PUBLISHER_NAME  = Publisher
-GENERATE_HTMLHELP      = NO
-CHM_FILE               =
-HHC_LOCATION           =
-GENERATE_CHI           = NO
-CHM_INDEX_ENCODING     =
-BINARY_TOC             = NO
-TOC_EXPAND             = NO
-GENERATE_QHP           = NO
-QCH_FILE               =
-QHP_NAMESPACE          = org.doxygen.Project
-QHP_VIRTUAL_FOLDER     = doc
-QHP_CUST_FILTER_NAME   =
-QHP_CUST_FILTER_ATTRS  =
-QHP_SECT_FILTER_ATTRS  =
-QHG_LOCATION           =
-GENERATE_ECLIPSEHELP   = NO
-ECLIPSE_DOC_ID         = org.doxygen.Project
-DISABLE_INDEX          = NO
-GENERATE_TREEVIEW      = NO
-ENUM_VALUES_PER_LINE   = 4
-USE_INLINE_TREES       = NO
-TREEVIEW_WIDTH         = 250
-EXT_LINKS_IN_WINDOW    = NO
-FORMULA_FONTSIZE       = 10
-FORMULA_TRANSPARENT    = YES
-USE_MATHJAX            = NO
-MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
-MATHJAX_EXTENSIONS     =
-SEARCHENGINE           = YES
-SERVER_BASED_SEARCH    = NO
-#---------------------------------------------------------------------------
-# configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-GENERATE_LATEX         = YES
-LATEX_OUTPUT           = latex
-LATEX_CMD_NAME         = latex
-MAKEINDEX_CMD_NAME     = makeindex
-COMPACT_LATEX          = NO
-PAPER_TYPE             = a4
-EXTRA_PACKAGES         =
-LATEX_HEADER           =
-LATEX_FOOTER           =
-PDF_HYPERLINKS         = YES
-USE_PDFLATEX           = YES
-LATEX_BATCHMODE        = NO
-LATEX_HIDE_INDICES     = NO
-LATEX_SOURCE_CODE      = NO
-LATEX_BIB_STYLE        = plain
-#---------------------------------------------------------------------------
-# configuration options related to the RTF output
-#---------------------------------------------------------------------------
-GENERATE_RTF           = NO
-RTF_OUTPUT             = rtf
-COMPACT_RTF            = NO
-RTF_HYPERLINKS         = NO
-RTF_STYLESHEET_FILE    =
-RTF_EXTENSIONS_FILE    =
-#---------------------------------------------------------------------------
-# configuration options related to the man page output
-#---------------------------------------------------------------------------
-GENERATE_MAN           = NO
-MAN_OUTPUT             = man
-MAN_EXTENSION          = .3
-MAN_LINKS              = NO
-#---------------------------------------------------------------------------
-# configuration options related to the XML output
-#---------------------------------------------------------------------------
-GENERATE_XML           = YES
-XML_OUTPUT             = xml
-XML_SCHEMA             =
-XML_DTD                =
-XML_PROGRAMLISTING     = YES
-#---------------------------------------------------------------------------
-# configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-GENERATE_AUTOGEN_DEF   = NO
-#---------------------------------------------------------------------------
-# configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-GENERATE_PERLMOD       = NO
-PERLMOD_LATEX          = NO
-PERLMOD_PRETTY         = YES
-PERLMOD_MAKEVAR_PREFIX =
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-ENABLE_PREPROCESSING   = NO
-MACRO_EXPANSION        = NO
-EXPAND_ONLY_PREDEF     = NO
-SEARCH_INCLUDES        = YES
-INCLUDE_PATH           =
-INCLUDE_FILE_PATTERNS  =
-PREDEFINED             =
-EXPAND_AS_DEFINED      =
-SKIP_FUNCTION_MACROS   = YES
-#---------------------------------------------------------------------------
-# Configuration::additions related to external references
-#---------------------------------------------------------------------------
-TAGFILES               =
-GENERATE_TAGFILE       =
-ALLEXTERNALS           = NO
-EXTERNAL_GROUPS        = YES
-PERL_PATH              = /usr/bin/perl
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-CLASS_DIAGRAMS         = YES
-MSCGEN_PATH            =
-HIDE_UNDOC_RELATIONS   = YES
-HAVE_DOT               = NO
-DOT_NUM_THREADS        = 0
-DOT_FONTNAME           = Helvetica
-DOT_FONTSIZE           = 10
-DOT_FONTPATH           =
-CLASS_GRAPH            = YES
-COLLABORATION_GRAPH    = YES
-GROUP_GRAPHS           = YES
-UML_LOOK               = NO
-TEMPLATE_RELATIONS     = NO
-INCLUDE_GRAPH          = YES
-INCLUDED_BY_GRAPH      = YES
-CALL_GRAPH             = NO
-CALLER_GRAPH           = NO
-GRAPHICAL_HIERARCHY    = YES
-DIRECTORY_GRAPH        = YES
-DOT_IMAGE_FORMAT       = png
-INTERACTIVE_SVG        = NO
-DOT_PATH               =
-DOTFILE_DIRS           =
-MSCFILE_DIRS           =
-DOT_GRAPH_MAX_NODES    = 50
-MAX_DOT_GRAPH_DEPTH    = 0
-DOT_TRANSPARENT        = NO
-DOT_MULTI_TARGETS      = YES
-GENERATE_LEGEND        = YES
-DOT_CLEANUP            = YES
diff --git a/subtree/rabit/doc/Makefile b/subtree/rabit/doc/Makefile
deleted file mode 100644
index 40bba2a28..000000000
--- a/subtree/rabit/doc/Makefile
+++ /dev/null
@@ -1,192 +0,0 @@
-# Makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-PAPER         =
-BUILDDIR      = _build
-
-# User-friendly check for sphinx-build
-ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
-$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
-endif
-
-# Internal variables.
-PAPEROPT_a4     = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-# the i18n builder cannot share the environment and doctrees with the others
-I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-
-.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
-
-help:
-	@echo "Please use \`make <target>' where <target> is one of"
-	@echo "  html       to make standalone HTML files"
-	@echo "  dirhtml    to make HTML files named index.html in directories"
-	@echo "  singlehtml to make a single large HTML file"
-	@echo "  pickle     to make pickle files"
-	@echo "  json       to make JSON files"
-	@echo "  htmlhelp   to make HTML files and a HTML help project"
-	@echo "  qthelp     to make HTML files and a qthelp project"
-	@echo "  applehelp  to make an Apple Help Book"
-	@echo "  devhelp    to make HTML files and a Devhelp project"
-	@echo "  epub       to make an epub"
-	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
-	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
-	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
-	@echo "  text       to make text files"
-	@echo "  man        to make manual pages"
-	@echo "  texinfo    to make Texinfo files"
-	@echo "  info       to make Texinfo files and run them through makeinfo"
-	@echo "  gettext    to make PO message catalogs"
-	@echo "  changes    to make an overview of all changed/added/deprecated items"
-	@echo "  xml        to make Docutils-native XML files"
-	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
-	@echo "  linkcheck  to check all external links for integrity"
-	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
-	@echo "  coverage   to run coverage check of the documentation (if enabled)"
-
-clean:
-	rm -rf $(BUILDDIR)/*
-
-html:
-	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
-
-dirhtml:
-	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
-
-singlehtml:
-	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
-	@echo
-	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
-
-pickle:
-	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
-	@echo
-	@echo "Build finished; now you can process the pickle files."
-
-json:
-	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
-	@echo
-	@echo "Build finished; now you can process the JSON files."
-
-htmlhelp:
-	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
-	@echo
-	@echo "Build finished; now you can run HTML Help Workshop with the" \
-	      ".hhp project file in $(BUILDDIR)/htmlhelp."
-
-qthelp:
-	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
-	@echo
-	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
-	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
-	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/rabit.qhcp"
-	@echo "To view the help file:"
-	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/rabit.qhc"
-
-applehelp:
-	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
-	@echo
-	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
-	@echo "N.B. You won't be able to view it unless you put it in" \
-	      "~/Library/Documentation/Help or install it in your application" \
-	      "bundle."
-
-devhelp:
-	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
-	@echo
-	@echo "Build finished."
-	@echo "To view the help file:"
-	@echo "# mkdir -p $$HOME/.local/share/devhelp/rabit"
-	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/rabit"
-	@echo "# devhelp"
-
-epub:
-	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
-	@echo
-	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
-
-latex:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo
-	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
-	@echo "Run \`make' in that directory to run these through (pdf)latex" \
-	      "(use \`make latexpdf' here to do that automatically)."
-
-latexpdf:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo "Running LaTeX files through pdflatex..."
-	$(MAKE) -C $(BUILDDIR)/latex all-pdf
-	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-latexpdfja:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo "Running LaTeX files through platex and dvipdfmx..."
-	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
-	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-text:
-	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
-	@echo
-	@echo "Build finished. The text files are in $(BUILDDIR)/text."
-
-man:
-	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
-	@echo
-	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
-
-texinfo:
-	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
-	@echo
-	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
-	@echo "Run \`make' in that directory to run these through makeinfo" \
-	      "(use \`make info' here to do that automatically)."
-
-info:
-	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
-	@echo "Running Texinfo files through makeinfo..."
-	make -C $(BUILDDIR)/texinfo info
-	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
-
-gettext:
-	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
-	@echo
-	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
-
-changes:
-	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
-	@echo
-	@echo "The overview file is in $(BUILDDIR)/changes."
-
-linkcheck:
-	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
-	@echo
-	@echo "Link check complete; look for any errors in the above output " \
-	      "or in $(BUILDDIR)/linkcheck/output.txt."
-
-doctest:
-	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
-	@echo "Testing of doctests in the sources finished, look at the " \
-	      "results in $(BUILDDIR)/doctest/output.txt."
-
-coverage:
-	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
-	@echo "Testing of coverage in the sources finished, look at the " \
-	      "results in $(BUILDDIR)/coverage/python.txt."
-
-xml:
-	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
-	@echo
-	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
-
-pseudoxml:
-	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
-	@echo
-	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/subtree/rabit/doc/conf.py b/subtree/rabit/doc/conf.py
deleted file mode 100644
index ef89de489..000000000
--- a/subtree/rabit/doc/conf.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# documentation build configuration file, created by
-# sphinx-quickstart on Thu Jul 23 19:40:08 2015.
-#
-# This file is execfile()d with the current directory set to its
-# containing dir.
-#
-# Note that not all possible configuration values are present in this
-# autogenerated file.
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
-import sys
-import os, subprocess
-import shlex
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-libpath = os.path.join(curr_path, '../wrapper/')
-sys.path.insert(0, os.path.join(curr_path, '../wrapper/'))
-sys.path.insert(0, curr_path)
-from sphinx_util import MarkdownParser, AutoStructify
-
-# -- General configuration ------------------------------------------------
-
-# General information about the project.
-project = u'rabit'
-copyright = u'2015, rabit developers'
-author = u'rabit developers'
-github_doc_root = 'https://github.com/dmlc/rabit/tree/master/doc/'
-
-# add markdown parser
-MarkdownParser.github_doc_root = github_doc_root
-source_parsers = {
-    '.md': MarkdownParser,
-}
-# Version information.
-import rabit
-
-version = rabit.__version__
-release = rabit.__version__
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones
-extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.napoleon',
-    'sphinx.ext.mathjax',
-    'breathe',
-]
-
-# Use breathe to include doxygen documents
-breathe_projects = {'rabit' : 'doxygen/xml/'}
-breathe_default_project = 'rabit'
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-# source_suffix = ['.rst', '.md']
-source_suffix = ['.rst', '.md']
-
-# The encoding of source files.
-#source_encoding = 'utf-8-sig'
-
-# The master toctree document.
-master_doc = 'index'
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
-
-# There are two options for replacing |today|: either, you set today to some
-# non-false value, then it is used:
-#today = ''
-# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-exclude_patterns = ['_build']
-
-# The reST default role (used for this markup: `text`) to use for all
-# documents.
-#default_role = None
-
-# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
-
-# If true, the current module name will be prepended to all description
-# unit titles (such as .. function::).
-#add_module_names = True
-
-# If true, sectionauthor and moduleauthor directives will be shown in the
-# output. They are ignored by default.
-#show_authors = False
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
-
-# A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
-
-# If true, keep warnings as "system message" paragraphs in the built documents.
-#keep_warnings = False
-
-# If true, `todo` and `todoList` produce output, else they produce nothing.
-todo_include_todos = False
-
-# -- Options for HTML output ----------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-# html_theme = 'alabaster'
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = project + 'doc'
-
-# -- Options for LaTeX output ---------------------------------------------
-latex_elements = {
-}
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title,
-#  author, documentclass [howto, manual, or own class]).
-latex_documents = [
-  (master_doc, 'rabit.tex', project,
-   author, 'manual'),
-]
-
-# hook for doxygen
-def run_doxygen(folder):
-    """Run the doxygen make command in the designated folder."""
-    try:
-        retcode = subprocess.call("cd %s; make doxygen" % folder, shell=True)
-        if retcode < 0:
-            sys.stderr.write("doxygen terminated by signal %s" % (-retcode))
-    except OSError as e:
-        sys.stderr.write("doxygen execution failed: %s" % e)
-
-
-def run_build_lib(folder):
-    """Run the doxygen make command in the designated folder."""
-    try:
-        retcode = subprocess.call("cd %s; make" % folder, shell=True)
-        retcode = subprocess.call("rm -rf _build/html/doxygen", shell=True)
-        retcode = subprocess.call("mkdir _build", shell=True)
-        retcode = subprocess.call("mkdir _build/html", shell=True)
-        retcode = subprocess.call("cp -rf doxygen/html _build/html/doxygen", shell=True)
-        if retcode < 0:
-            sys.stderr.write("build terminated by signal %s" % (-retcode))
-    except OSError as e:
-        sys.stderr.write("build execution failed: %s" % e)
-
-
-def generate_doxygen_xml(app):
-    """Run the doxygen make commands if we're on the ReadTheDocs server"""
-    read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
-    if read_the_docs_build:
-        run_doxygen('..')
-        sys.stderr.write('Check if shared lib exists\n')
-        run_build_lib('..')
-    sys.stderr.write('The wrapper path: %s\n' % str(os.listdir('../wrapper')))
-    rabit._loadlib()
-
-
-def setup(app):
-    # Add hook for building doxygen xml when needed
-    app.connect("builder-inited", generate_doxygen_xml)
-    app.add_config_value('recommonmark_config', {
-            'url_resolver': lambda url: github_doc_root + url,
-            }, True)
-    app.add_transform(AutoStructify)
diff --git a/subtree/rabit/doc/cpp_api.md b/subtree/rabit/doc/cpp_api.md
deleted file mode 100644
index c6184aa08..000000000
--- a/subtree/rabit/doc/cpp_api.md
+++ /dev/null
@@ -1,9 +0,0 @@
-C++ Library API of Rabit
-========================
-This page contains document of Library API of rabit.
-
-```eval_rst
-.. toctree::
-
-.. doxygennamespace:: rabit
-```
diff --git a/subtree/rabit/doc/guide.md b/subtree/rabit/doc/guide.md
deleted file mode 100644
index e2bfa5ce8..000000000
--- a/subtree/rabit/doc/guide.md
+++ /dev/null
@@ -1,413 +0,0 @@
-Tutorial
-========
-This is rabit's tutorial, a ***Reliable Allreduce and Broadcast Interface***.
-All the example codes are in the [guide](https://github.com/dmlc/rabit/blob/master/guide/) folder of the project.
-To run the examples locally, you will need to build them with ```make```.
-
-**List of Topics**
-* [What is Allreduce](#what-is-allreduce)
-* [Common Use Case](#common-use-case)
-* [Use Rabit API](#use-rabit-api)
-  - [Structure of a Rabit Program](#structure-of-a-rabit-program)
-  - [Allreduce and Lazy Preparation](#allreduce-and-lazy-preparation)
-  - [Checkpoint and LazyCheckpoint](#checkpoint-and-lazycheckpoint)
-* [Compile Programs with Rabit](#compile-programs-with-rabit)
-* [Running Rabit Jobs](#running-rabit-jobs)
-  - [Running Rabit on Hadoop](#running-rabit-on-hadoop)
-  - [Running Rabit using MPI](#running-rabit-using-mpi)
-  - [Customize Tracker Script](#customize-tracker-script)
-* [Fault Tolerance](#fault-tolerance)
-
-What is Allreduce
------------------
-The main methods provided by rabit are Allreduce and Broadcast. Allreduce performs reduction across different computation nodes,
-and returns the result to every node. To understand the behavior of the function, consider the following example in [basic.cc](../guide/basic.cc) (there is a python example right after this if you are more familiar with python).
-```c++
-#include <rabit.h>
-using namespace rabit;
-const int N = 3;
-int main(int argc, char *argv[]) {
-  int a[N];
-  rabit::Init(argc, argv);
-  for (int i = 0; i < N; ++i) {
-    a[i] = rabit::GetRank() + i;
-  }
-  printf("@node[%d] before-allreduce: a={%d, %d, %d}\n",
-         rabit::GetRank(), a[0], a[1], a[2]);
-  // allreduce take max of each elements in all processes
-  Allreduce<op::Max>(&a[0], N);
-  printf("@node[%d] after-allreduce-max: a={%d, %d, %d}\n",
-         rabit::GetRank(), a[0], a[1], a[2]);
-  // second allreduce that sums everything up
-  Allreduce<op::Sum>(&a[0], N);
-  printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n",
-         rabit::GetRank(), a[0], a[1], a[2]);
-  rabit::Finalize();
-  return 0;
-}
-```
-You can run the example using the rabit_demo.py script. The following command
-starts the rabit program with two worker processes.
-```bash
-../tracker/rabit_demo.py -n 2 basic.rabit
-```
-This will start two processes, one process with rank 0 and the other with rank 1, both processes run the same code.
-The ```rabit::GetRank()``` function returns the rank of current process.
-
-Before the call to Allreduce, process 0 contains the array ```a = {0, 1, 2}```, while process 1 has the array
-```a = {1, 2, 3}```. After the call to Allreduce, the array contents in all processes are replaced by the
-reduction result (in this case, the maximum value in each position across all the processes). So, after the
-Allreduce call, the result will become ```a = {1, 2, 3}```.
-Rabit provides different reduction operators, for example,  if you change ```op::Max``` to ```op::Sum```,
-the reduction operation will be a summation, and the result will become ```a = {1, 3, 5}```.
-You can also run the example with different processes by setting -n to different values.
-
-If you are more familiar with python, you can also use rabit in python. The same example as before can be found in [basic.py](../guide/basic.py):
-
-```python
-import numpy as np
-import rabit
-
-rabit.init()
-n = 3
-rank = rabit.get_rank()
-a = np.zeros(n)
-for i in xrange(n):
-    a[i] = rank + i
-
-print '@node[%d] before-allreduce: a=%s' % (rank, str(a))
-a = rabit.allreduce(a, rabit.MAX)
-print '@node[%d] after-allreduce-max: a=%s' % (rank, str(a))
-a = rabit.allreduce(a, rabit.SUM)
-print '@node[%d] after-allreduce-sum: a=%s' % (rank, str(a))
-rabit.finalize()
-```
-You can run the program using the following command
-```bash
-../tracker/rabit_demo.py -n 2 basic.py
-```
-
-Broadcast is another method provided by rabit besides Allreduce. This function allows one node to broadcast its
-local data to all other nodes. The following code in [broadcast.cc](../guide/broadcast.cc) broadcasts a string from
-node 0 to all other nodes.
-```c++
-#include <rabit.h>
-using namespace rabit;
-const int N = 3;
-int main(int argc, char *argv[]) {
-  rabit::Init(argc, argv);
-  std::string s;
-  if (rabit::GetRank() == 0) s = "hello world";
-  printf("@node[%d] before-broadcast: s=\"%s\"\n",
-         rabit::GetRank(), s.c_str());
-  // broadcast s from node 0 to all other nodes
-  rabit::Broadcast(&s, 0);
-  printf("@node[%d] after-broadcast: s=\"%s\"\n",
-         rabit::GetRank(), s.c_str());
-  rabit::Finalize();
-  return 0;
-}
-```
-The following command starts the program with three worker processes.
-```bash
-../tracker/rabit_demo.py -n 3 broadcast.rabit
-```
-Besides strings, rabit also allows to broadcast constant size array and vectors.
-
-The counterpart in python can be found in [broadcast.py](../guide/broadcast.py). Here is a snippet so that you can get a better sense of how simple is to use the python library:
-
-```python
-import rabit
-rabit.init()
-n = 3
-rank = rabit.get_rank()
-s = None
-if rank == 0:
-    s = {'hello world':100, 2:3}
-print '@node[%d] before-broadcast: s=\"%s\"' % (rank, str(s))
-s = rabit.broadcast(s, 0)
-print '@node[%d] after-broadcast: s=\"%s\"' % (rank, str(s))
-rabit.finalize()
-```
-
-Common Use Case
----------------
-Many distributed machine learning algorithms involve splitting the data into different nodes,
-computing statistics locally, and finally aggregating them. Such workflow is usually done repetitively through many iterations before the algorithm converges. Allreduce naturally meets the structure of such programs,
-common use cases include:
-
-* Aggregation of gradient values, which can be used in optimization methods such as L-BFGS.
-* Aggregation of other statistics, which can be used in KMeans and Gaussian Mixture Models.
-* Find the best split candidate and aggregation of split statistics, used for tree based models.
-
-Rabit is a reliable and portable library for distributed machine learning programs, that allow programs to run reliably on different platforms.
-
-Use Rabit API
--------------
-This section introduces topics about how to use rabit API.
-You can always refer to [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc) for definition of each functions.
-This section trys to gives examples of different aspectes of rabit API.
-
-#### Structure of a Rabit Program
-The following code illustrates the common structure of a rabit program. This is an abstract example,
-you can also refer to [wormhole](https://github.com/dmlc/wormhole/blob/master/learn/kmeans/kmeans.cc) for an example implementation of kmeans algorithm.
-
-```c++
-#include <rabit.h>
-int main(int argc, char *argv[]) {
-  ...
-  rabit::Init(argc, argv);
-  // load the latest checked model
-  int version = rabit::LoadCheckPoint(&model);
-  // initialize the model if it is the first version
-  if (version == 0) model.InitModel();
-  // the version number marks the iteration to resume
-  for (int iter = version; iter < max_iter; ++iter) {
-    // at this point, the model object should allow us to recover the program state
-    ...
-    // each iteration can contain multiple calls of allreduce/broadcast
-    rabit::Allreduce<rabit::op::Max>(&data[0], n);
-    ...
-    // checkpoint model after one iteration finishes
-    rabit::CheckPoint(&model);
-  }
-  rabit::Finalize();
-  return 0;
-}
-```
-
-Besides the common Allreduce and Broadcast functions, there are two additional functions: ```LoadCheckPoint```
-and ```CheckPoint```. These two functions are used for fault-tolerance purposes.
-As mentioned before, traditional machine learning programs involve several iterations. In each iteration, we start with a model, make some calls
-to Allreduce or Broadcast and update the model. The calling sequence in each iteration does not need to be the same.
-
-* When the nodes start from the beginning (i.e. iteration 0), ```LoadCheckPoint``` returns 0, so we can initialize the model.
-* ```CheckPoint``` saves the model after each iteration.
-  - Efficiency Note: the model is only kept in local memory and no save to disk is performed when calling Checkpoint
-* When a node goes down and restarts, ```LoadCheckPoint``` will recover the latest saved model, and
-* When a node goes down, the rest of the nodes will block in the call of Allreduce/Broadcast and wait for
-  the recovery of the failed node until it catches up.
-
-Please see the [Fault Tolerance](#fault-tolerance) section to understand the recovery procedure executed by rabit.
-
-#### Allreduce and Lazy Preparation
-Allreduce is one of the most important function provided by rabit. You can call allreduce by specifying the
-reduction operator, pointer to the data and size of the buffer, as follows
-```c++
-Allreduce<operator>(pointer_of_data, size_of_data);
-```
-This is the basic use case of Allreduce function. It is common that user writes the code to prepare the data needed
-into the data buffer, pass the data to Allreduce function, and get the reduced result. However, when a node restarts
-from failure, we can directly recover the result from other nodes(see also [Fault Tolerance](#fault-tolerance)) and
-the data preparation procedure no longer necessary. Rabit Allreduce add an optional parameter preparation function
-to support such scenario. User can pass in a function that corresponds to the data preparation procedure to Allreduce
-calls, and the data preparation function will only be called when necessary. We use [lazy_allreduce.cc](../guide/lazy_allreduce.cc)
-as an example to demonstrate this feature. It is modified from [basic.cc](../guide/basic.cc), and you can compare the two codes.
-```c++
-#include <rabit.h>
-using namespace rabit;
-const int N = 3;
-int main(int argc, char *argv[]) {
-  int a[N] = {0};
-  rabit::Init(argc, argv);
-  // lazy preparation function
-  auto prepare = [&]() {
-    printf("@node[%d] run prepare function\n", rabit::GetRank());
-    for (int i = 0; i < N; ++i) {
-      a[i] = rabit::GetRank() + i;
-    }
-  };
-  printf("@node[%d] before-allreduce: a={%d, %d, %d}\n",
-         rabit::GetRank(), a[0], a[1], a[2]);
-  // allreduce take max of each elements in all processes
-  Allreduce<op::Max>(&a[0], N, prepare);
-  printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n",
-         rabit::GetRank(), a[0], a[1], a[2]);
-  // rum second allreduce
-  Allreduce<op::Sum>(&a[0], N);
-  printf("@node[%d] after-allreduce-max: a={%d, %d, %d}\n",
-         rabit::GetRank(), a[0], a[1], a[2]);
-  rabit::Finalize();
-  return 0;
-}
-```
-Here we use features of C++11 because the lambda function makes things much shorter.
-There is also C++ compatible callback interface provided in the [API](http://homes.cs.washington.edu/~tqchen/rabit/doc).
-You can compile the program by typing ```make lazy_allreduce.mock```. We link against the mock library so that we can see
-the effect when a process goes down. You can run the program using the following command
-```bash
-../tracker/rabit_demo.py -n 2 lazy_allreduce.mock mock=0,0,1,0
-```
-The additional arguments ```mock=0,0,1,0``` will cause node 0 to kill itself before second call of Allreduce (see also [mock test](#link-against-mock-test-rabit-library)).
-You will find that the prepare function's print is only executed once and node 0 will no longer execute the preparation function when it restarts from failure.
-
-You can also find python version of the example in [lazy_allreduce.py](../guide/lazy_allreduce.py), and run it using the followin command
-```bash
-../tracker/rabit_demo.py -n 2 lazy_allreduce.py mock=0,0,1,0
-
-```
-
-Since lazy preparation function may not be called during execution. User should be careful when using this feature. For example, a possible mistake
-could be putting some memory allocation code in the lazy preparation function, and the computing memory was not allocated when lazy preparation function is not called.
-The example in [lazy_allreduce.cc](../guide/lazy_allreduce.cc) provides a simple way to migrate normal prepration code([basic.cc](../guide/basic.cc)) to lazy version: wrap the preparation
-code with a lambda function, and pass it to allreduce.
-
-#### Checkpoint and LazyCheckpoint
-Common machine learning algorithms usually involves iterative computation. As mentioned in the section ([Structure of a Rabit Program](#structure-of-a-rabit-program)),
-user can and should use Checkpoint to ```save``` the progress so far, so that when a node fails, the latest checkpointed model can be loaded.
-
-There are two model arguments you can pass to Checkpoint and LoadCheckpoint: ```global_model``` and ```local_model```:
-* ```global_model``` refers to the model that is commonly shared across all the nodes
-  - For example, the centriods of clusters in kmeans is shared across all nodes
-* ```local_model``` refers to the model that is specifically tied to the current node
-  - For example, in topic modeling, the topic assignments of subset of documents in current node is local model
-
-Because the different nature of the two types of models, different strategy will be used for them.
-```global_model``` is simply saved in local memory of each node, while ```local_model``` will replicated to some other
-nodes (selected using a ring replication strategy). The checkpoint is only saved in the memory without touching the disk which makes rabit programs more efficient.
-User is encouraged to use ```global_model``` only when is sufficient for better efficiency.
-
-To enable a model class to be checked pointed, user can implement a [serialization interface](../include/rabit_serialization.h). The serialization interface already
-provide serialization functions of STL vector and string. For python API, user can checkpoint any python object that can be pickled.
-
-There is a special Checkpoint function called [LazyCheckpoint](http://homes.cs.washington.edu/~tqchen/rabit/doc/namespacerabit.html#a99f74c357afa5fba2c80cc0363e4e459),
-which can be used for ```global_model``` only cases under certain condition.
-When LazyCheckpoint is called, no action is taken and the rabit engine only remembers the pointer to the model.
-The serialization will only happen when another node fails and the recovery starts. So user basically pays no extra cost calling LazyCheckpoint.
-To use this function, the user need to ensure the model remain unchanged until the last call of Allreduce/Broadcast in the current version finishes.
-So that when recovery procedure happens in these function calls, the serialized model will be the same.
-
-For example, consider the following calling sequence
-```
-LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint
-```
-The user must only change the model in code3. Such condition can usually be satiesfied in many scenarios, and user can use LazyCheckpoint to further
-improve the efficiency of the program.
-
-
-Compile Programs with Rabit
----------------------------
-Rabit is a portable library, to use it, you only need to include the rabit header file.
-* You will need to add the path to [../include](../include) to the header search path of the compiler
-  - Solution 1: add ```-I/path/to/rabit/include``` to the compiler flag in gcc or clang
-  - Solution 2: add the path to the environment variable CPLUS_INCLUDE_PATH
-* You will need to add the path to [../lib](../lib) to the library search path of the compiler
-  - Solution 1: add ```-L/path/to/rabit/lib``` to the linker flag
-  - Solution 2: add the path to environment variable LIBRARY_PATH AND LD_LIBRARY_PATH
-* Link against lib/rabit.a
-  - Add ```-lrabit``` to the linker flag
-
-The procedure above allows you to compile a program with rabit. The following two sections contain additional
-options you can use to link against different backends other than the normal one.
-
-#### Link against MPI Allreduce
-You can link against ```rabit_mpi.a``` instead of using MPI Allreduce, however, the resulting program is backed by MPI and
-is not fault tolerant anymore.
-* Simply change the linker flag from ```-lrabit``` to ```-lrabit_mpi```
-* The final linking needs to be done by mpi wrapper compiler ```mpicxx```
-
-#### Link against Mock Test Rabit Library
-If you want to use a mock to test the program in order to see the behavior of the code when some nodes go down, you can link against ```rabit_mock.a``` .
-* Simply change the linker flag from ```-lrabit``` to ```-lrabit_mock```
-
-The resulting rabit mock program can take in additional arguments in the following format
-```
-mock=rank,version,seq,ndeath
-```
-
-The four integers specify an event that will cause the program to ```commit suicide```(exit with -2)
-* rank specifies the rank of the node to kill
-* version specifies the version (iteration) of the model where you want the process to die
-* seq specifies the sequence number of the Allreduce/Broadcast call since last checkpoint, where the process will be killed
-* ndeath specifies how many times this node died already
-
-For example, consider the following script in the test case
-```bash
-../tracker/rabit_demo.py -n 10 test_model_recover 10000\
-                         mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1
-```
-* The first mock will cause node 0 to exit when calling the second Allreduce/Broadcast (seq = 1) in iteration 0
-* The second mock will cause node 1 to exit when calling the second Allreduce/Broadcast (seq = 1) in iteration 1
-* The third mock will cause node 1 to exit again when calling second Allreduce/Broadcast (seq = 1) in iteration 1
-  - Note that ndeath = 1 means this will happen only if node 1 died once, which is our case
-
-Running Rabit Jobs
-------------------
-Rabit is a portable library that can run on multiple platforms.
-
-#### Running Rabit Locally
-* You can use [../tracker/rabit_demo.py](https://github.com/dmlc/rabit/blob/master/tracker/rabit_demo.py) to start n processes locally
-* This script will restart the program when it exits with -2, so it can be used for [mock test](#link-against-mock-test-library)
-
-#### Running Rabit on Hadoop
-* You can use [../tracker/rabit_yarn.py](https://github.com/dmlc/rabit/blob/master/tracker/rabit_yarn.py) to run rabit programs as Yarn application
-* This will start rabit programs as yarn applications
-  - This allows multi-threading programs in each node, which can be more efficient
-  - An easy multi-threading solution could be to use OpenMP with rabit code
-* It is also possible to run rabit program via hadoop streaming, however, YARN is highly recommended.
-
-#### Running Rabit using MPI
-* You can submit rabit programs to an MPI cluster using [../tracker/rabit_mpi.py](https://github.com/dmlc/rabit/blob/master/tracker/rabit_mpi.py).
-* If you linked your code against librabit_mpi.a, then you can directly use mpirun to submit the job
-
-#### Customize Tracker Script
-You can also modify the tracker script to allow rabit to run on other platforms. To do so, refer to existing
-tracker scripts, such as [../tracker/rabit_yarn.py](../tracker/rabit_yarn.py) and [../tracker/rabit_mpi.py](https://github.com/dmlc/rabit/blob/master/tracker/rabit_mpi.py) to get a sense of how it is done.
-
-You will need to implement a platform dependent submission function with the following definition
-```python
-def fun_submit(nworkers, worker_args, worker_envs):
-    """
-      customized submit script, that submits nslave jobs,
-      each must contain args as parameter
-      note this can be a lambda closure
-      Parameters
-         nworkers number of worker processes to start
-         worker_args addtiional arguments that needs to be passed to worker
-         worker_envs enviroment variables that need to be set to the worker
-    """
-```
-The submission function should start nworkers processes in the platform, and append worker_args to the end of the other arguments.
-Then you can simply call ```tracker.submit``` with fun_submit to submit jobs to the target platform
-
-Note that the current rabit tracker does not restart a worker when it dies, the restart of a node is done by the platform, otherwise we should write the fail-restart logic in the custom script.
-* Fail-restart is usually provided by most platforms.
-  - rabit-yarn provides such functionality in YARN
-
-Fault Tolerance
----------------
-This section introduces how fault tolerance works in rabit.
-The following figure shows how rabit deals with failures.
-
-![](http://homes.cs.washington.edu/~tqchen/rabit/fig/fault-tol.png)
-
-The scenario is as follows:
-* Node 1 fails between the first and second call of Allreduce after the second checkpoint
-* The other nodes wait in the call of the second Allreduce in order to help node 1 to recover.
-* When node 1 restarts, it will call ```LoadCheckPoint```, and get the latest checkpoint from one of the existing nodes.
-* Then node 1 can start from the latest checkpoint and continue running.
-* When node 1 calls the first Allreduce again, as the other nodes already know the result, node 1 can get it from one of them.
-* When node 1 reaches the second Allreduce, the other nodes find out that node 1 has catched up and they can continue the program normally.
-
-This fault tolerance model is based on a key property of Allreduce and
-Broadcast: All the nodes get the same result after calling Allreduce/Broadcast.
-Because of this property, any node can record the results of history
-Allreduce/Broadcast calls.  When a node is recovered, it can fetch the lost
-results from some alive nodes and rebuild its model.
-
-The checkpoint is introduced so that we can discard the history results of
-Allreduce/Broadcast calls before the latest checkpoint. This saves memory
-consumption used for backup.  The checkpoint of each node is a model defined by
-users and can be split into 2 parts: a global model and a local model. The
-global model is shared by all nodes and can be backed up by any nodes. The
-local model of a node is replicated to some other nodes (selected using a ring
-replication strategy).  The checkpoint is only saved in the memory without
-touching the disk which makes rabit programs more efficient.  The strategy of
-rabit is different from the fail-restart strategy where all the nodes restart
-from the same checkpoint when any of them fail.  In rabit, all the alive nodes
-will block in the Allreduce call and help the recovery.  To catch up, the
-recovered node fetches its latest checkpoint and the results of
-Allreduce/Broadcast calls after the checkpoint from some alive nodes.
-
-This is just a conceptual introduction to rabit's fault tolerance model. The actual implementation is more sophisticated,
-and can deal with more complicated cases such as multiple nodes failure and node failure during recovery phase.
diff --git a/subtree/rabit/doc/index.md b/subtree/rabit/doc/index.md
deleted file mode 100644
index d209d95ba..000000000
--- a/subtree/rabit/doc/index.md
+++ /dev/null
@@ -1,24 +0,0 @@
-Rabit Documentation
-=====================
-rabit is a light weight library that provides a fault tolerant interface of Allreduce and Broadcast. It is designed to support easy implementations of distributed machine learning programs, many of which fall naturally under the Allreduce abstraction. The goal of rabit is to support **portable** , **scalable** and **reliable** distributed machine learning programs.
-
-API Documents
--------------
-```eval_rst
-
-.. toctree::
-   :maxdepth: 2
-
-   python_api.md
-   cpp_api.md
-   parameters.md
-   guide.md
-```
-Indices and tables
-------------------
-
-```eval_rst
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
-```
\ No newline at end of file
diff --git a/subtree/rabit/doc/parameters.md b/subtree/rabit/doc/parameters.md
deleted file mode 100644
index 37580d5a1..000000000
--- a/subtree/rabit/doc/parameters.md
+++ /dev/null
@@ -1,21 +0,0 @@
-Parameters
-==========
-This section list all the parameters that can be passed to rabit::Init function as argv.
-All the parameters are passed in as string in format of ``parameter-name=parameter-value``.
-In most setting these parameters have default value or will be automatically detected,
-and do not need to be manually configured.
-
-* rabit_tracker_uri [passed in automatically by tracker]
-  - The uri/ip of rabit tracker
-* rabit_tracker_port [passed in automatically by tracker]
-  - The port of rabit tracker
-* rabit_task_id [automatically detected]
-  - The unique identifier of computing process
-  - When running on hadoop, this is automatically extracted from enviroment variable
-* rabit_reduce_buffer [default = 256MB]
-  - The memory buffer used to store intermediate result of reduction
-  - Format "digits + unit", can be 128M, 1G
-* rabit_global_replica [default = 5]
-  - Number of replication copies of result kept for each Allreduce/Broadcast call
-* rabit_local_replica [default = 2]
-  - Number of replication of local model in check point
diff --git a/subtree/rabit/doc/python-requirements.txt b/subtree/rabit/doc/python-requirements.txt
deleted file mode 100644
index 5970c4367..000000000
--- a/subtree/rabit/doc/python-requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-numpy
-breathe
-commonmark
-
diff --git a/subtree/rabit/doc/python_api.md b/subtree/rabit/doc/python_api.md
deleted file mode 100644
index 8a0eda921..000000000
--- a/subtree/rabit/doc/python_api.md
+++ /dev/null
@@ -1,11 +0,0 @@
-Python API of Rabit
-===================
-This page contains document of python API of rabit.
-
-```eval_rst
-.. toctree::
-
-.. automodule:: rabit
-    :members:
-    :show-inheritance:
-```
diff --git a/subtree/rabit/doc/sphinx_util.py b/subtree/rabit/doc/sphinx_util.py
deleted file mode 100644
index f6a33ffa3..000000000
--- a/subtree/rabit/doc/sphinx_util.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# -*- coding: utf-8 -*-
-"""Helper utilty function for customization."""
-import sys
-import os
-import docutils
-import subprocess
-
-if os.environ.get('READTHEDOCS', None) == 'True':
-    subprocess.call('cd ..; rm -rf recommonmark;' +
-                    'git clone https://github.com/tqchen/recommonmark', shell=True)
-
-sys.path.insert(0, os.path.abspath('../recommonmark/'))
-from recommonmark import parser, transform
-
-MarkdownParser = parser.CommonMarkParser
-AutoStructify = transform.AutoStructify
diff --git a/subtree/rabit/guide/Makefile b/subtree/rabit/guide/Makefile
deleted file mode 100644
index 7213e1bf7..000000000
--- a/subtree/rabit/guide/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-export CC  = gcc
-export CXX = g++
-export MPICXX = mpicxx
-export LDFLAGS= -pthread -lm -L../lib
-export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fPIC -I../include 
-
-.PHONY: clean all lib libmpi
-BIN = basic.rabit broadcast.rabit
-MOCKBIN= lazy_allreduce.mock
-
-all: $(BIN)
-basic.rabit: basic.cc lib
-broadcast.rabit: broadcast.cc lib
-lazy_allreduce.mock: lazy_allreduce.cc lib
-
-$(BIN) : 
-	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc,  $^) $(LDFLAGS) -lrabit
-
-$(MOCKBIN) : 
-	$(CXX) $(CFLAGS) -std=c++11 -o $@ $(filter %.cpp %.o %.c %.cc,  $^) $(LDFLAGS) -lrabit_mock
-
-$(OBJ) : 
-	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
-
-clean:
-	$(RM) $(OBJ) $(BIN) $(MOCKBIN) *~ ../src/*~
\ No newline at end of file
diff --git a/subtree/rabit/guide/README b/subtree/rabit/guide/README
deleted file mode 100644
index 2483d683f..000000000
--- a/subtree/rabit/guide/README
+++ /dev/null
@@ -1 +0,0 @@
-See tutorial at ../doc/guide.md
\ No newline at end of file
diff --git a/subtree/rabit/guide/basic.cc b/subtree/rabit/guide/basic.cc
deleted file mode 100644
index a9a729170..000000000
--- a/subtree/rabit/guide/basic.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file basic.cc
- * \brief This is an example demonstrating what is Allreduce
- *
- * \author Tianqi Chen
- */
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#include <vector>
-#include <rabit.h>
-using namespace rabit;
-int main(int argc, char *argv[]) {
-  int N = 3;
-  if (argc > 1) {
-    N = atoi(argv[1]);
-  }
-  std::vector<int> a(N);
-  rabit::Init(argc, argv);
-  for (int i = 0; i < N; ++i) {
-    a[i] = rabit::GetRank() + i;
-  } 
-  printf("@node[%d] before-allreduce: a={%d, %d, %d}\n",
-         rabit::GetRank(), a[0], a[1], a[2]);
-  // allreduce take max of each elements in all processes
-  Allreduce<op::Max>(&a[0], N);
-  printf("@node[%d] after-allreduce-max: a={%d, %d, %d}\n",
-         rabit::GetRank(), a[0], a[1], a[2]);
-  // second allreduce that sums everything up
-  Allreduce<op::Sum>(&a[0], N);
-  printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n",
-         rabit::GetRank(), a[0], a[1], a[2]);  
-  rabit::Finalize();
-  return 0;
-}
diff --git a/subtree/rabit/guide/basic.py b/subtree/rabit/guide/basic.py
deleted file mode 100755
index becdae07d..000000000
--- a/subtree/rabit/guide/basic.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/python
-"""
-demo python script of rabit
-"""
-import os
-import sys
-import numpy as np
-# import rabit, the tracker script will setup the lib path correctly
-# for normal run without tracker script, add following line
-# sys.path.append(os.path.dirname(__file__) + '/../wrapper')
-import rabit
-
-rabit.init()
-n = 3
-rank = rabit.get_rank()
-a = np.zeros(n)
-for i in xrange(n):
-    a[i] = rank + i
-    
-print '@node[%d] before-allreduce: a=%s' % (rank, str(a))
-a = rabit.allreduce(a, rabit.MAX)
-print '@node[%d] after-allreduce-max: a=%s' % (rank, str(a))
-a = rabit.allreduce(a, rabit.SUM)
-print '@node[%d] after-allreduce-sum: a=%s' % (rank, str(a))
-rabit.finalize()
diff --git a/subtree/rabit/guide/broadcast.cc b/subtree/rabit/guide/broadcast.cc
deleted file mode 100644
index 83dbe67fe..000000000
--- a/subtree/rabit/guide/broadcast.cc
+++ /dev/null
@@ -1,16 +0,0 @@
-#include <rabit.h>
-using namespace rabit;
-const int N = 3;
-int main(int argc, char *argv[]) {
-  rabit::Init(argc, argv);
-  std::string s;
-  if (rabit::GetRank() == 0) s = "hello world";
-  printf("@node[%d] before-broadcast: s=\"%s\"\n",
-         rabit::GetRank(), s.c_str());
-  // broadcast s from node 0 to all other nodes
-  rabit::Broadcast(&s, 0);
-  printf("@node[%d] after-broadcast: s=\"%s\"\n",
-         rabit::GetRank(), s.c_str());
-  rabit::Finalize();
-  return 0;
-}
diff --git a/subtree/rabit/guide/broadcast.py b/subtree/rabit/guide/broadcast.py
deleted file mode 100755
index defe69eaa..000000000
--- a/subtree/rabit/guide/broadcast.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/python
-"""
-demo python script of rabit
-"""
-import os
-import sys
-# add path to wrapper
-# for normal run without tracker script, add following line
-# sys.path.append(os.path.dirname(__file__) + '/../wrapper')
-import rabit
-
-rabit.init()
-n = 3
-rank = rabit.get_rank()
-s = None
-if rank == 0:
-    s = {'hello world':100, 2:3}
-print '@node[%d] before-broadcast: s=\"%s\"' % (rank, str(s))
-s = rabit.broadcast(s, 0)
-
-print '@node[%d] after-broadcast: s=\"%s\"' % (rank, str(s))
-rabit.finalize()
diff --git a/subtree/rabit/guide/lazy_allreduce.cc b/subtree/rabit/guide/lazy_allreduce.cc
deleted file mode 100644
index b54776ecc..000000000
--- a/subtree/rabit/guide/lazy_allreduce.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file basic.cc
- * \brief This is an example demonstrating what is Allreduce
- *
- * \author Tianqi Chen
- */
-#include <rabit.h>
-using namespace rabit;
-const int N = 3;
-int main(int argc, char *argv[]) {
-  int a[N] = {0};
-  rabit::Init(argc, argv);
-  // lazy preparation function
-  auto prepare = [&]() {
-    printf("@node[%d] run prepare function\n", rabit::GetRank());
-    for (int i = 0; i < N; ++i) {
-      a[i] = rabit::GetRank() + i;
-    } 
-  };
-  printf("@node[%d] before-allreduce: a={%d, %d, %d}\n",
-         rabit::GetRank(), a[0], a[1], a[2]);
-  // allreduce take max of each elements in all processes
-  Allreduce<op::Max>(&a[0], N, prepare);  
-  printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n",
-         rabit::GetRank(), a[0], a[1], a[2]);  
-  // rum second allreduce
-  Allreduce<op::Sum>(&a[0], N);
-  printf("@node[%d] after-allreduce-max: a={%d, %d, %d}\n",
-         rabit::GetRank(), a[0], a[1], a[2]);  
-  rabit::Finalize();
-  return 0;
-}
diff --git a/subtree/rabit/guide/lazy_allreduce.py b/subtree/rabit/guide/lazy_allreduce.py
deleted file mode 100755
index a195f58d2..000000000
--- a/subtree/rabit/guide/lazy_allreduce.py
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/usr/bin/python
-"""
-demo python script of rabit: Lazy preparation function
-"""
-import os
-import sys
-import numpy as np
-# import rabit, the tracker script will setup the lib path correctly
-# for normal run without tracker script, add following line
-# sys.path.append(os.path.dirname(__file__) + '/../wrapper')
-import rabit
-
-
-# use mock library so that we can run failure test
-rabit.init(lib = 'mock')
-n = 3
-rank = rabit.get_rank()
-a = np.zeros(n)
-
-def prepare(a):
-    print '@node[%d] run prepare function' % rank
-    # must take in reference and modify the reference
-    for i in xrange(n):
-        a[i] = rank + i    
-    
-print '@node[%d] before-allreduce: a=%s' % (rank, str(a))
-a = rabit.allreduce(a, rabit.MAX, prepare_fun = prepare)
-print '@node[%d] after-allreduce-max: a=%s' % (rank, str(a))
-a = rabit.allreduce(a, rabit.SUM)
-print '@node[%d] after-allreduce-sum: a=%s' % (rank, str(a))
-rabit.finalize()
diff --git a/subtree/rabit/include/README.md b/subtree/rabit/include/README.md
deleted file mode 100644
index 2512edc78..000000000
--- a/subtree/rabit/include/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-Library Header Files
-====
-* This folder contains all the header needed to use the library
-* To use it, add the "include" folder to the search path of the compiler
-* User only needs to know [rabit.h](rabit.h) and [rabit_serializable.h](rabit_serializable.h) in order to use the library
-* Folder [rabit](rabit) contains headers for internal engine and template's implementation
-* Not all .h files in the project are in the "include" folder, .h files that are internally used by the library remain at [src](../src)
diff --git a/subtree/rabit/include/dmlc/README.md b/subtree/rabit/include/dmlc/README.md
deleted file mode 100644
index 846cec006..000000000
--- a/subtree/rabit/include/dmlc/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-This folder is part of dmlc-core library, this allows rabit to use unified stream interface with other dmlc projects.
-
-- Since it is only interface dependency DMLC core is not required to compile rabit
-- To compile project that uses dmlc-core functions, link to libdmlc.a (provided by dmlc-core) will be required.
diff --git a/subtree/rabit/include/dmlc/io.h b/subtree/rabit/include/dmlc/io.h
deleted file mode 100644
index 66d590b2d..000000000
--- a/subtree/rabit/include/dmlc/io.h
+++ /dev/null
@@ -1,423 +0,0 @@
-/*!
- *  Copyright (c) 2015 by Contributors
- * \file io.h
- * \brief defines serializable interface of dmlc
- */
-#ifndef DMLC_IO_H_
-#define DMLC_IO_H_
-#include <cstdio>
-#include <string>
-#include <vector>
-#include <istream>
-#include <ostream>
-#include <streambuf>
-
-// include uint64_t only to make io standalone
-#ifdef _MSC_VER
-/*! \brief uint64 */
-typedef unsigned __int64 uint64_t;
-#else
-#include <inttypes.h>
-#endif
-
-/*! \brief namespace for dmlc */
-namespace dmlc {
-/*!
- * \brief interface of stream I/O for serialization
- */
-class Stream {  // NOLINT(*)
- public:
-  /*!
-   * \brief reads data from a stream
-   * \param ptr pointer to a memory buffer
-   * \param size block size
-   * \return the size of data read
-   */
-  virtual size_t Read(void *ptr, size_t size) = 0;
-  /*!
-   * \brief writes data to a stream
-   * \param ptr pointer to a memory buffer
-   * \param size block size
-   */
-  virtual void Write(const void *ptr, size_t size) = 0;
-  /*! \brief virtual destructor */
-  virtual ~Stream(void) {}
-  /*!
-   * \brief generic factory function
-   *  create an stream, the stream will close the underlying files upon deletion
-   *
-   * \param uri the uri of the input currently we support
-   *            hdfs://, s3://, and file:// by default file:// will be used
-   * \param flag can be "w", "r", "a"
-   * \param allow_null whether NULL can be returned, or directly report error
-   * \return the created stream, can be NULL when allow_null == true and file do not exist
-   */
-  static Stream *Create(const char *uri,
-                        const char* const flag,
-                        bool allow_null = false);
-  // helper functions to write/read different data structures
-  /*!
-   * \brief writes a vector
-   * \param vec vector to be written/serialized
-   */
-  template<typename T>
-  inline void Write(const std::vector<T> &vec);
-  /*!
-   * \brief loads a vector
-   * \param out_vec vector to be loaded/deserialized
-   * \return whether the load was successful
-   */
-  template<typename T>
-  inline bool Read(std::vector<T> *out_vec);
-  /*!
-   * \brief writes a string
-   * \param str the string to be written/serialized
-   */
-  inline void Write(const std::string &str);
-  /*!
-   * \brief loads a string
-   * \param out_str string to be loaded/deserialized
-   * \return whether the load/deserialization was successful
-   */
-  inline bool Read(std::string *out_str);
-};
-
-/*! \brief interface of i/o stream that support seek */
-class SeekStream: public Stream {
- public:
-  // virtual destructor
-  virtual ~SeekStream(void) {}
-  /*! \brief seek to certain position of the file */
-  virtual void Seek(size_t pos) = 0;
-  /*! \brief tell the position of the stream */
-  virtual size_t Tell(void) = 0;
-  /*!
-   * \brief generic factory function
-   *  create an SeekStream for read only,
-   *  the stream will close the underlying files upon deletion
-   *  error will be reported and the system will exit when create failed
-   * \param uri the uri of the input currently we support
-   *            hdfs://, s3://, and file:// by default file:// will be used
-   * \param allow_null whether NULL can be returned, or directly report error
-   * \return the created stream, can be NULL when allow_null == true and file do not exist
-   */
-  static SeekStream *CreateForRead(const char *uri,
-                                   bool allow_null = false);
-};
-
-/*! \brief interface for serializable objects */
-class Serializable {
- public:
-  /*!
-  * \brief load the model from a stream
-  * \param fi stream where to load the model from
-  */
-  virtual void Load(Stream *fi) = 0;
-  /*!
-  * \brief saves the model to a stream
-  * \param fo stream where to save the model to
-  */
-  virtual void Save(Stream *fo) const = 0;
-};
-
-/*!
- * \brief input split creates that allows reading
- *  of records from split of data,
- *  independent part that covers all the dataset
- *
- *  see InputSplit::Create for definition of record
- */
-class InputSplit {
- public:
-  /*! \brief a blob of memory region */
-  struct Blob {
-    /*! \brief points to start of the memory region */
-    void *dptr;
-    /*! \brief size of the memory region */
-    size_t size;
-  };
-  /*!
-   * \brief hint the inputsplit how large the chunk size
-   *  it should return when implementing NextChunk
-   *  this is a hint so may not be enforced,
-   *  but InputSplit will try adjust its internal buffer
-   *  size to the hinted value
-   * \param chunk_size the chunk size
-   */
-  virtual void HintChunkSize(size_t chunk_size) {}
-  /*! \brief reset the position of InputSplit to beginning */
-  virtual void BeforeFirst(void) = 0;
-  /*!
-   * \brief get the next record, the returning value
-   *   is valid until next call to NextRecord or NextChunk
-   *   caller can modify the memory content of out_rec
-   *
-   *   For text, out_rec contains a single line
-   *   For recordio, out_rec contains one record content(with header striped)
-   *
-   * \param out_rec used to store the result
-   * \return true if we can successfully get next record
-   *     false if we reached end of split
-   * \sa InputSplit::Create for definition of record
-   */
-  virtual bool NextRecord(Blob *out_rec) = 0;
-  /*!
-   * \brief get a chunk of memory that can contain multiple records,
-   *  the caller needs to parse the content of the resulting chunk,
-   *  for text file, out_chunk can contain data of multiple lines
-   *  for recordio, out_chunk can contain multiple records(including headers)
-   *
-   *  This function ensures there won't be partial record in the chunk
-   *  caller can modify the memory content of out_chunk,
-   *  the memory is valid until next call to NextRecord or NextChunk
-   *
-   *  Usually NextRecord is sufficient, NextChunk can be used by some
-   *  multi-threaded parsers to parse the input content
-   *
-   * \param out_chunk used to store the result
-   * \return true if we can successfully get next record
-   *     false if we reached end of split
-   * \sa InputSplit::Create for definition of record
-   * \sa RecordIOChunkReader to parse recordio content from out_chunk
-   */
-  virtual bool NextChunk(Blob *out_chunk) = 0;
-  /*! \brief destructor*/
-  virtual ~InputSplit(void) {}
-  /*!
-   * \brief factory function:
-   *  create input split given a uri
-   * \param uri the uri of the input, can contain hdfs prefix
-   * \param part_index the part id of current input
-   * \param num_parts total number of splits
-   * \param type type of record
-   *   List of possible types: "text", "recordio"
-   *     - "text":
-   *         text file, each line is treated as a record
-   *         input split will split on '\\n' or '\\r'
-   *     - "recordio":
-   *         binary recordio file, see recordio.h
-   * \return a new input split
-   * \sa InputSplit::Type
-   */
-  static InputSplit* Create(const char *uri,
-                            unsigned part_index,
-                            unsigned num_parts,
-                            const char *type);
-};
-
-/*!
- * \brief a std::ostream class that can can wrap Stream objects,
- *  can use ostream with that output to underlying Stream
- *
- * Usage example:
- * \code
- *
- *   Stream *fs = Stream::Create("hdfs:///test.txt", "w");
- *   dmlc::ostream os(fs);
- *   os << "hello world" << std::endl;
- *   delete fs;
- * \endcode
- */
-class ostream : public std::basic_ostream<char> {
- public:
-  /*!
-   * \brief construct std::ostream type
-   * \param stream the Stream output to be used
-   * \param buffer_size internal streambuf size
-   */
-  explicit ostream(Stream *stream,
-                   size_t buffer_size = (1 << 10))
-      : std::basic_ostream<char>(NULL), buf_(buffer_size) {
-    this->set_stream(stream);
-  }
-  // explictly synchronize the buffer
-  virtual ~ostream() {
-    buf_.pubsync();
-  }
-  /*!
-   * \brief set internal stream to be stream, reset states
-   * \param stream new stream as output
-   */
-  inline void set_stream(Stream *stream) {
-    buf_.set_stream(stream);
-    this->rdbuf(&buf_);
-  }
-
- private:
-  // internal streambuf
-  class OutBuf : public std::streambuf {
-   public:
-    explicit OutBuf(size_t buffer_size)
-        : stream_(NULL), buffer_(buffer_size) {
-      if (buffer_size == 0) buffer_.resize(2);
-    }
-    // set stream to the buffer
-    inline void set_stream(Stream *stream);
-
-   private:
-    /*! \brief internal stream by StreamBuf */
-    Stream *stream_;
-    /*! \brief internal buffer */
-    std::vector<char> buffer_;
-    // override sync
-    inline int_type sync(void);
-    // override overflow
-    inline int_type overflow(int c);
-  };
-  /*! \brief buffer of the stream */
-  OutBuf buf_;
-};
-
-/*!
- * \brief a std::istream class that can can wrap Stream objects,
- *  can use istream with that output to underlying Stream
- *
- * Usage example:
- * \code
- *
- *   Stream *fs = Stream::Create("hdfs:///test.txt", "r");
- *   dmlc::istream is(fs);
- *   is >> mydata;
- *   delete fs;
- * \endcode
- */
-class istream : public std::basic_istream<char> {
- public:
-  /*!
-   * \brief construct std::ostream type
-   * \param stream the Stream output to be used
-   * \param buffer_size internal buffer size
-   */
-  explicit istream(Stream *stream,
-                   size_t buffer_size = (1 << 10))
-      : std::basic_istream<char>(NULL), buf_(buffer_size) {
-    this->set_stream(stream);
-  }
-  virtual ~istream() {}
-  /*!
-   * \brief set internal stream to be stream, reset states
-   * \param stream new stream as output
-   */
-  inline void set_stream(Stream *stream) {
-    buf_.set_stream(stream);
-    this->rdbuf(&buf_);
-  }
-  /*! \return how many bytes we read so far */
-  inline size_t bytes_read(void) const {
-    return buf_.bytes_read();
-  }
-
- private:
-  // internal streambuf
-  class InBuf : public std::streambuf {
-   public:
-    explicit InBuf(size_t buffer_size)
-        : stream_(NULL), bytes_read_(0),
-          buffer_(buffer_size) {
-      if (buffer_size == 0) buffer_.resize(2);
-    }
-    // set stream to the buffer
-    inline void set_stream(Stream *stream);
-    // return how many bytes read so far
-    inline size_t bytes_read(void) const {
-      return bytes_read_;
-    }
-   private:
-    /*! \brief internal stream by StreamBuf */
-    Stream *stream_;
-    /*! \brief how many bytes we read so far */
-    size_t bytes_read_;
-    /*! \brief internal buffer */
-    std::vector<char> buffer_;
-    // override underflow
-    inline int_type underflow();
-  };
-  /*! \brief input buffer */
-  InBuf buf_;
-};
-
-// implementations of inline functions
-template<typename T>
-inline void Stream::Write(const std::vector<T> &vec) {
-  uint64_t sz = static_cast<uint64_t>(vec.size());
-  this->Write(&sz, sizeof(sz));
-  if (sz != 0) {
-    this->Write(&vec[0], sizeof(T) * vec.size());
-  }
-}
-template<typename T>
-inline bool Stream::Read(std::vector<T> *out_vec) {
-  uint64_t sz;
-  if (this->Read(&sz, sizeof(sz)) == 0) return false;
-  size_t size = static_cast<size_t>(sz);
-  out_vec->resize(size);
-  if (sz != 0) {
-    if (this->Read(&(*out_vec)[0], sizeof(T) * size) == 0) return false;
-  }
-  return true;
-}
-inline void Stream::Write(const std::string &str) {
-  uint64_t sz = static_cast<uint64_t>(str.length());
-  this->Write(&sz, sizeof(sz));
-  if (sz != 0) {
-    this->Write(&str[0], sizeof(char) * str.length());
-  }
-}
-inline bool Stream::Read(std::string *out_str) {
-  uint64_t sz;
-  if (this->Read(&sz, sizeof(sz)) == 0) return false;
-  size_t size = static_cast<size_t>(sz);
-  out_str->resize(size);
-  if (sz != 0) {
-    if (this->Read(&(*out_str)[0], sizeof(char) * size) == 0) {
-      return false;
-    }
-  }
-  return true;
-}
-
-// implementations for ostream
-inline void ostream::OutBuf::set_stream(Stream *stream) {
-  if (stream_ != NULL) this->pubsync();
-  this->stream_ = stream;
-  this->setp(&buffer_[0], &buffer_[0] + buffer_.size() - 1);
-}
-inline int ostream::OutBuf::sync(void) {
-  if (stream_ == NULL) return -1;
-  std::ptrdiff_t n = pptr() - pbase();
-  stream_->Write(pbase(), n);
-  this->pbump(-static_cast<int>(n));
-  return 0;
-}
-inline int ostream::OutBuf::overflow(int c) {
-  *(this->pptr()) = c;
-  std::ptrdiff_t n = pptr() - pbase();
-  this->pbump(-static_cast<int>(n));
-  if (c == EOF) {
-    stream_->Write(pbase(), n);
-  } else {
-    stream_->Write(pbase(), n + 1);
-  }
-  return c;
-}
-
-// implementations for istream
-inline void istream::InBuf::set_stream(Stream *stream) {
-  stream_ = stream;
-  this->setg(&buffer_[0], &buffer_[0], &buffer_[0]);
-}
-inline int istream::InBuf::underflow() {
-  char *bhead = &buffer_[0];
-  if (this->gptr() == this->egptr()) {
-    size_t sz = stream_->Read(bhead, buffer_.size());
-    this->setg(bhead, bhead, bhead + sz);
-    bytes_read_ += sz;
-  }
-  if (this->gptr() == this->egptr()) {
-    return traits_type::eof();
-  } else {
-    return traits_type::to_int_type(*gptr());
-  }
-}
-}  // namespace dmlc
-#endif  // DMLC_IO_H_
diff --git a/subtree/rabit/include/rabit.h b/subtree/rabit/include/rabit.h
deleted file mode 100644
index b0f1df39c..000000000
--- a/subtree/rabit/include/rabit.h
+++ /dev/null
@@ -1,342 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file rabit.h
- * \brief This file defines rabit's Allreduce/Broadcast interface
- *   The rabit engine contains the actual implementation
- *   Code that only uses this header can also be compiled with MPI Allreduce (non fault-tolerant),
- *
- *   rabit.h and serializable.h is all what the user needs to use the rabit interface
- * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
- */
-#ifndef RABIT_RABIT_H_  // NOLINT(*)
-#define RABIT_RABIT_H_  // NOLINT(*)
-#include <string>
-#include <vector>
-
-// whether or not use c++11 support
-#ifndef DMLC_USE_CXX11
-#define DMLC_USE_CXX11 (defined(__GXX_EXPERIMENTAL_CXX0X__) ||\
-                        __cplusplus >= 201103L || defined(_MSC_VER))
-#endif
-// optionally support of lambda functions in C++11, if available
-#if DMLC_USE_CXX11
-#include <functional>
-#endif  // C++11
-// contains definition of Serializable
-#include "./rabit_serializable.h"
-// engine definition of rabit, defines internal implementation
-// to use rabit interface, there is no need to read engine.h
-// rabit.h and serializable.h are enough to use the interface
-#include "./rabit/engine.h"
-
-/*! \brief rabit namespace */
-namespace rabit {
-/*!
- * \brief reduction operators namespace
- */
-namespace op {
-/*!
- * \class rabit::op::Max
- * \brief maximum reduction operator
- */
-struct Max;
-/*!
- * \class rabit::op::Min
- * \brief minimum reduction operator
- */
-struct Min;
-/*!
- * \class rabit::op::Sum
- * \brief sum reduction operator
- */
-struct Sum;
-/*!
- * \class rabit::op::BitOR
- * \brief bitwise OR reduction operator
- */
-struct BitOR;
-}  // namespace op
-/*!
- * \brief initializes rabit, call this once at the beginning of your program
- * \param argc number of arguments in argv
- * \param argv the array of input arguments
- */
-inline void Init(int argc, char *argv[]);
-/*!
- * \brief finalizes the rabit engine, call this function after you finished with all the jobs
- */
-inline void Finalize(void);
-/*! \brief gets rank of the current process */
-inline int GetRank(void);
-/*! \brief gets total number of processes */
-inline int GetWorldSize(void);
-/*! \brief whether rabit env is in distributed mode */
-inline bool IsDistributed(void);
-
-/*! \brief gets processor's name */
-inline std::string GetProcessorName(void);
-/*!
- * \brief prints the msg to the tracker,
- *    this function can be used to communicate progress information to
- *    the user who monitors the tracker
- * \param msg the message to be printed
- */
-inline void TrackerPrint(const std::string &msg);
-#ifndef RABIT_STRICT_CXX98_
-/*!
- * \brief prints the msg to the tracker, this function may not be available
- *    in very strict c++98 compilers, though it usually is.
- *    this function can be used to communicate progress information to
- *    the user who monitors the tracker
- * \param fmt the format string
- */
-inline void TrackerPrintf(const char *fmt, ...);
-#endif
-/*!
- * \brief broadcasts a memory region to every node from the root
- *
- *     Example: int a = 1; Broadcast(&a, sizeof(a), root);
- * \param sendrecv_data the pointer to the send/receive buffer,
- * \param size the data size
- * \param root the process root
- */
-inline void Broadcast(void *sendrecv_data, size_t size, int root);
-/*!
- * \brief broadcasts an std::vector<DType> to every node from root
- * \param sendrecv_data the pointer to send/receive vector,
- *        for the receiver, the vector does not need to be pre-allocated
- * \param root the process root
- * \tparam DType the data type stored in the vector, has to be a simple data type
- *               that can be directly transmitted by sending the sizeof(DType)
- */
-template<typename DType>
-inline void Broadcast(std::vector<DType> *sendrecv_data, int root);
-/*!
- * \brief broadcasts a std::string to every node from the root
- * \param sendrecv_data the pointer to the send/receive buffer,
- *        for the receiver, the vector does not need to be pre-allocated
- * \param root the process root
- */
-inline void Broadcast(std::string *sendrecv_data, int root);
-/*!
- * \brief performs in-place Allreduce on sendrecvbuf
- *        this function is NOT thread-safe
- *
- * Example Usage: the following code does an Allreduce and outputs the sum as the result
- * \code{.cpp}
- * vector<int> data(10);
- * ...
- * Allreduce<op::Sum>(&data[0], data.size());
- * ...
- * \endcode
- *
- * \param sendrecvbuf buffer for both sending and receiving data
- * \param count number of elements to be reduced
- * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
- *                    will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
- *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
- * \param prepare_arg argument used to pass into the lazy preprocessing function
- * \tparam OP see namespace op, reduce operator
- * \tparam DType data type
- */
-template<typename OP, typename DType>
-inline void Allreduce(DType *sendrecvbuf, size_t count,
-                      void (*prepare_fun)(void *) = NULL,
-                      void *prepare_arg = NULL);
-// C++11 support for lambda prepare function
-#if DMLC_USE_CXX11
-/*!
- * \brief performs in-place Allreduce, on sendrecvbuf
- *        with a prepare function specified by a lambda function
- *
- * Example Usage:
- * \code{.cpp}
- * // the following code does an Allreduce and outputs the sum as the result
- * vector<int> data(10);
- * ...
- * Allreduce<op::Sum>(&data[0], data.size(), [&]() {
- *                     for (int i = 0; i < 10; ++i) {
- *                       data[i] = i;
- *                     }
- *                    });
- *     ...
- * \endcode
- * \param sendrecvbuf buffer for both sending and receiving data
- * \param count number of elements to be reduced
- * \param prepare_fun  Lazy lambda preprocessing function, prepare_fun() will be invoked
- *                     by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
- *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
- * \tparam OP see namespace op, reduce operator
- * \tparam DType data type
- */
-template<typename OP, typename DType>
-inline void Allreduce(DType *sendrecvbuf, size_t count,
-                      std::function<void()> prepare_fun);
-#endif  // C++11
-/*!
- * \brief loads the latest check point
- * \param global_model pointer to the globally shared model/state
- *   when calling this function, the caller needs to guarantee that the global_model
- *   is the same in every node
- * \param local_model pointer to the local model that is specific to the current node/rank
- *   this can be NULL when no local model is needed
- *
- * \return the version number of the check point loaded
- *     if returned version == 0, this means no model has been CheckPointed
- *     the p_model is not touched, users should do the necessary initialization by themselves
- *
- * \code{.cpp}
- * // Example usage code of LoadCheckPoint
- * int iter = rabit::LoadCheckPoint(&model);
- * if (iter == 0) model.InitParameters();
- * for (i = iter; i < max_iter; ++i) {
- *   // do many things, include allreduce
- *   rabit::CheckPoint(model);
- * }
- * \endcode
- * \sa CheckPoint, VersionNumber
- */
-inline int LoadCheckPoint(Serializable *global_model,
-                          Serializable *local_model = NULL);
-/*!
- * \brief checkpoints the model, meaning a stage of execution has finished.
- *  every time we call check point, a version number will be increased by one
- *
- * \param global_model pointer to the globally shared model/state
- *   when calling this function, the caller needs to guarantee that the global_model
- *   is the same in every node
- * \param local_model pointer to the local model that is specific to the current node/rank
- *   this can be NULL when no local state is needed
-   * NOTE: local_model requires explicit replication of the model for fault-tolerance, which will
-   *       bring replication cost in the CheckPoint function. global_model does not need explicit replication.
-   *       So, only CheckPoint with the global_model if possible
-   * \sa LoadCheckPoint, VersionNumber
-   */
-inline void CheckPoint(const Serializable *global_model,
-                       const Serializable *local_model = NULL);
-/*!
- * \brief This function can be used to replace CheckPoint for global_model only,
- *   when certain condition is met (see detailed explanation).
- *
- *   This is a "lazy" checkpoint such that only the pointer to the global_model is
- *   remembered and no memory copy is taken. To use this function, the user MUST ensure that:
- *   The global_model must remain unchanged until the last call of Allreduce/Broadcast in the current version finishes.
- *   In other words, the global_model model can be changed only between the last call of
- *   Allreduce/Broadcast and LazyCheckPoint, both in the same version
- *
- *   For example, suppose the calling sequence is:
- *   LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint/(or can be CheckPoint)
- *
- *   Then the user MUST only change the global_model in code3.
- *
- *   The use of LazyCheckPoint instead of CheckPoint will improve the efficiency of the program.
- * \param global_model pointer to the globally shared model/state
- *   when calling this function, the caller needs to guarantee that the global_model
- *   is the same in every node
- * \sa LoadCheckPoint, CheckPoint, VersionNumber
- */
-inline void LazyCheckPoint(const Serializable *global_model);
-/*!
- * \return version number of the current stored model,
- *         which means how many calls to CheckPoint we made so far
- * \sa LoadCheckPoint, CheckPoint
- */
-inline int VersionNumber(void);
-// ----- extensions that allow customized reducer ------
-// helper class to do customized reduce, user do not need to know the type
-namespace engine {
-class ReduceHandle;
-}  // namespace engine
-/*!
- * \brief template class to make customized reduce and all reduce easy
- *  Do not use reducer directly in the function you call Finalize,
- *   because the destructor can execute after Finalize
- * \tparam DType data type that to be reduced
- * \tparam freduce the customized reduction function
- *  DType must be a struct, with no pointer
- */
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
-class Reducer {
- public:
-  Reducer(void);
-  /*!
-   * \brief customized in-place all reduce operation
-   * \param sendrecvbuf the in place send-recv buffer
-   * \param count number of elements to be reduced
-   * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
-   *                     will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
-   *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
-   * \param prepare_arg argument used to pass into the lazy preprocessing function
-   */
-  inline void Allreduce(DType *sendrecvbuf, size_t count,
-                        void (*prepare_fun)(void *) = NULL,
-                        void *prepare_arg = NULL);
-#if DMLC_USE_CXX11
-  /*!
-   * \brief customized in-place all reduce operation, with lambda function as preprocessor
-   * \param sendrecvbuf pointer to the array of objects to be reduced
-   * \param count number of elements to be reduced
-   * \param prepare_fun lambda function executed to prepare the data, if necessary
-   */
-  inline void Allreduce(DType *sendrecvbuf, size_t count,
-                        std::function<void()> prepare_fun);
-#endif
-
- private:
-  /*! \brief function handle to do reduce */
-  engine::ReduceHandle handle_;
-};
-/*!
- * \brief template class to make customized reduce,
- *  this class defines complex reducer handles all the data structure that can be
- *  serialized/deserialized into fixed size buffer
- *  Do not use reducer directly in the function you call Finalize, because the destructor can execute after Finalize
- *
- * \tparam DType data type that to be reduced, DType must contain the following functions:
- * \tparam freduce the customized reduction function
- *   (1) Save(IStream &fs)  (2) Load(IStream &fs) (3) Reduce(const DType &src, size_t max_nbyte)
- */
-template<typename DType>
-class SerializeReducer {
- public:
-  SerializeReducer(void);
-  /*!
-   * \brief customized in-place all reduce operation
-   * \param sendrecvobj pointer to the array of objects to be reduced
-   * \param max_nbyte maximum amount of memory needed to serialize each object
-   *        this includes budget limit for intermediate and final result
-   * \param count number of elements to be reduced
-   * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
-   *                     will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
-   *                     If the result of Allreduce can be recovered directly, then the prepare_func will NOT be called
-   * \param prepare_arg argument used to pass into the lazy preprocessing function
-   */
-  inline void Allreduce(DType *sendrecvobj,
-                        size_t max_nbyte, size_t count,
-                        void (*prepare_fun)(void *) = NULL,
-                        void *prepare_arg = NULL);
-// C++11 support for lambda prepare function
-#if DMLC_USE_CXX11
-  /*!
-   * \brief customized in-place all reduce operation, with lambda function as preprocessor
-   * \param sendrecvobj pointer to the array of objects to be reduced
-   * \param max_nbyte maximum amount of memory needed to serialize each object
-   *        this includes budget limit for intermediate and final result
-   * \param count number of elements to be reduced
-   * \param prepare_fun lambda function executed to prepare the data, if necessary
-   */
-  inline void Allreduce(DType *sendrecvobj,
-                        size_t max_nbyte, size_t count,
-                        std::function<void()> prepare_fun);
-#endif
-
- private:
-  /*! \brief function handle to do reduce */
-  engine::ReduceHandle handle_;
-  /*! \brief temporal buffer used to do reduce*/
-  std::string buffer_;
-};
-}  // namespace rabit
-// implementation of template functions
-#include "./rabit/rabit-inl.h"
-#endif  // RABIT_RABIT_H_ // NOLINT(*)
diff --git a/subtree/rabit/include/rabit/engine.h b/subtree/rabit/include/rabit/engine.h
deleted file mode 100644
index 272bbb8ef..000000000
--- a/subtree/rabit/include/rabit/engine.h
+++ /dev/null
@@ -1,260 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file engine.h
- * \brief This file defines the core interface of rabit library
- * \author Tianqi Chen, Nacho, Tianyi
- */
-#ifndef RABIT_ENGINE_H_
-#define RABIT_ENGINE_H_
-#include <string>
-#include "../rabit_serializable.h"
-
-namespace MPI {
-/*! \brief MPI data type just to be compatible with MPI reduce function*/
-class Datatype;
-}
-
-/*! \brief namespace of rabit */
-namespace rabit {
-/*! \brief core interface of the engine */
-namespace engine {
-/*! \brief interface of core Allreduce engine */
-class IEngine {
- public:
-  /*! 
-   * \brief Preprocessing function, that is called before AllReduce,
-   *        used to prepare the data used by AllReduce
-   * \param arg additional possible argument used to invoke the preprocessor
-   */
-  typedef void (PreprocFunction) (void *arg);
-  /*!
-   * \brief reduce function, the same form of MPI reduce function is used,
-   *        to be compatible with MPI interface
-   *        In all the functions, the memory is ensured to aligned to 64-bit
-   *        which means it is OK to cast src,dst to double* int* etc
-   * \param src pointer to source space
-   * \param dst pointer to destination reduction
-   * \param count total number of elements to be reduced (note this is total number of elements instead of bytes)
-   *              the definition of the reduce function should be type aware
-   * \param dtype the data type object, to be compatible with MPI reduce
-   */
-  typedef void (ReduceFunction) (const void *src,
-                                 void *dst, int count,
-                                 const MPI::Datatype &dtype);
-  /*!
-   * \brief performs in-place Allreduce, on sendrecvbuf
-   *        this function is NOT thread-safe
-   * \param sendrecvbuf_ buffer for both sending and receiving data
-   * \param type_nbytes the number of bytes the type has
-   * \param count number of elements to be reduced
-   * \param reducer reduce function
-   * \param prepare_func Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
-   *                     will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
-   *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
-   * \param prepare_arg argument used to pass into the lazy preprocessing function
-   */
-  virtual void Allreduce(void *sendrecvbuf_,
-                         size_t type_nbytes,
-                         size_t count,
-                         ReduceFunction reducer,
-                         PreprocFunction prepare_fun = NULL,
-                         void *prepare_arg = NULL) = 0;
-  /*!
-   * \brief broadcasts data from root to every other node
-   * \param sendrecvbuf_ buffer for both sending and receiving data
-   * \param size the size of the data to be broadcasted
-   * \param root the root worker id to broadcast the data
-   */
-  virtual void Broadcast(void *sendrecvbuf_, size_t size, int root) = 0;
-  /*!
-   * \brief explicitly re-initialize everything before calling LoadCheckPoint
-   *    call this function when IEngine throws an exception,
-   *    this function should only be used for test purposes
-   */
-  virtual void InitAfterException(void) = 0;
-  /*!
-   * \brief loads the latest check point
-   * \param global_model pointer to the globally shared model/state
-   *   when calling this function, the caller needs to guarantee that the global_model
-   *   is the same in all nodes
-   * \param local_model pointer to the local model that is specific to current node/rank
-   *   this can be NULL when no local model is needed
-   *
-   * \return the version number of the model loaded
-   *     if returned version == 0, this means no model has been CheckPointed
-   *     the p_model is not touched, users should do necessary initialization by themselves
-   *   
-   *   Common usage example:
-   *      int iter = rabit::LoadCheckPoint(&model);
-   *      if (iter == 0) model.InitParameters();
-   *      for (i = iter; i < max_iter; ++i) {
-   *        do many things, include allreduce
-   *        rabit::CheckPoint(model);
-   *      } 
-   *
-   * \sa CheckPoint, VersionNumber
-   */
-  virtual int LoadCheckPoint(Serializable *global_model,
-                             Serializable *local_model = NULL) = 0;
-  /*!
-   * \brief checkpoints the model, meaning a stage of execution was finished
-   *  every time we call check point, a version number increases by ones
-   * 
-   * \param global_model pointer to the globally shared model/state
-   *   when calling this function, the caller needs to guarantee that the global_model
-   *   is the same in every node
-   * \param local_model pointer to the local model that is specific to current node/rank
-   *   this can be NULL when no local state is needed
-   *
-   * NOTE: local_model requires explicit replication of the model for fault-tolerance, which will
-   *       bring replication cost in CheckPoint function. global_model does not need explicit replication.
-   *       So, only CheckPoint with global_model if possible
-   *
-   * \sa LoadCheckPoint, VersionNumber
-   */
-  virtual void CheckPoint(const Serializable *global_model,
-                          const Serializable *local_model = NULL) = 0;
-  /*!
-   * \brief This function can be used to replace CheckPoint for global_model only,
-   *   when certain condition is met (see detailed explanation).
-   * 
-   *   This is a "lazy" checkpoint such that only the pointer to global_model is
-   *   remembered and no memory copy is taken. To use this function, the user MUST ensure that:
-   *   The global_model must remain unchanged until the last call of Allreduce/Broadcast in the current version finishes.
-   *   In other words, global_model can be changed only between the last call of 
-   *   Allreduce/Broadcast and LazyCheckPoint in the current version
-   *   
-   *   For example, suppose the calling sequence is:
-   *   LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint
-   *   
-   *   If the user can only change global_model in code3, then LazyCheckPoint can be used to
-   *   improve the efficiency of the program.
-   * \param global_model pointer to the globally shared model/state
-   *   when calling this function, the caller needs to guarantee that global_model
-   *   is the same in every node
-   * \sa LoadCheckPoint, CheckPoint, VersionNumber
-   */
-  virtual void LazyCheckPoint(const Serializable *global_model) = 0;
-  /*!
-   * \return version number of the current stored model,
-   *         which means how many calls to CheckPoint we made so far
-   * \sa LoadCheckPoint, CheckPoint
-   */
-  virtual int VersionNumber(void) const = 0;
-  /*! \brief gets rank of current node */
-  virtual int GetRank(void) const = 0;
-  /*! \brief gets total number of nodes */
-  virtual int GetWorldSize(void) const = 0;
-  /*! \brief whether we run in distribted mode */
-  virtual bool IsDistributed(void) const = 0;
-  /*! \brief gets the host name of the current node */
-  virtual std::string GetHost(void) const = 0;
-  /*!
-   * \brief prints the msg in the tracker,
-   *    this function can be used to communicate progress information to
-   *    the user who monitors the tracker
-   * \param msg message to be printed in the tracker
-   */
-  virtual void TrackerPrint(const std::string &msg) = 0;
-};
-
-/*! \brief initializes the engine module */
-void Init(int argc, char *argv[]);
-/*! \brief finalizes the engine module */
-void Finalize(void);
-/*! \brief singleton method to get engine */
-IEngine *GetEngine(void);
-
-/*! \brief namespace that contains stubs to be compatible with MPI */
-namespace mpi {
-/*!\brief enum of all operators */
-enum OpType {
-  kMax = 0,
-  kMin = 1,
-  kSum = 2,
-  kBitwiseOR = 3
-};
-/*!\brief enum of supported data types */
-enum DataType {
-  kChar = 0,
-  kUChar = 1,
-  kInt = 2,
-  kUInt = 3,
-  kLong = 4,
-  kULong = 5,
-  kFloat = 6,
-  kDouble = 7,
-  kLongLong = 8,
-  kULongLong = 9
-};
-}  // namespace mpi
-/*!
- * \brief perform in-place Allreduce, on sendrecvbuf 
- *   this is an internal function used by rabit to be able to compile with MPI
- *   do not use this function directly
- * \param sendrecvbuf buffer for both sending and receiving data
- * \param type_nbytes the number of bytes the type has
- * \param count number of elements to be reduced
- * \param reducer reduce function
- * \param dtype the data type 
- * \param op the reduce operator type
- * \param prepare_func Lazy preprocessing function, lazy prepare_fun(prepare_arg)
- *                     will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf_.
- *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
- * \param prepare_arg argument used to pass into the lazy preprocessing function.
- */
-void Allreduce_(void *sendrecvbuf,
-                size_t type_nbytes,
-                size_t count,
-                IEngine::ReduceFunction red,
-                mpi::DataType dtype,
-                mpi::OpType op,
-                IEngine::PreprocFunction prepare_fun = NULL,
-                void *prepare_arg = NULL);
-
-/*!
- * \brief handle for customized reducer, used to handle customized reduce
- *  this class is mainly created for compatiblity issues with MPI's customized reduce
- */
-class ReduceHandle {
- public:
-  // constructor
-  ReduceHandle(void);
-  // destructor
-  ~ReduceHandle(void);
-  /*!
-   * \brief initialize the reduce function,
-   *   with the type the reduce function needs to deal with
-   *   the reduce function MUST be communicative
-   */
-  void Init(IEngine::ReduceFunction redfunc, size_t type_nbytes);
-  /*!
-   * \brief customized in-place all reduce operation 
-   * \param sendrecvbuf the in place send-recv buffer
-   * \param type_n4bytes size of the type, in terms of 4bytes
-   * \param count number of elements to send
-   * \param prepare_func Lazy preprocessing function, lazy prepare_fun(prepare_arg)
-   *                     will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf_.
-   *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
-   * \param prepare_arg argument used to pass into the lazy preprocessing function
-   */
-  void Allreduce(void *sendrecvbuf,
-                 size_t type_nbytes, size_t count,
-                 IEngine::PreprocFunction prepare_fun = NULL,
-                 void *prepare_arg = NULL);
-  /*! \return the number of bytes occupied by the type */
-  static int TypeSize(const MPI::Datatype &dtype);
-
- protected:
-  // handle function field
-  void *handle_;
-  // reduce function of the reducer
-  IEngine::ReduceFunction *redfunc_;
-  // handle to the type field
-  void *htype_;
-  // the created type in 4 bytes
-  size_t created_type_nbytes_;
-};
-}  // namespace engine
-}  // namespace rabit
-#endif  // RABIT_ENGINE_H_
diff --git a/subtree/rabit/include/rabit/io.h b/subtree/rabit/include/rabit/io.h
deleted file mode 100644
index 7ffca38f2..000000000
--- a/subtree/rabit/include/rabit/io.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file io.h
- * \brief utilities with different serializable implementations
- * \author Tianqi Chen
- */
-#ifndef RABIT_IO_H_
-#define RABIT_IO_H_
-#include <cstdio>
-#include <vector>
-#include <cstring>
-#include <string>
-#include <algorithm>
-#include "./utils.h"
-#include "../rabit_serializable.h"
-
-namespace rabit {
-namespace utils {
-/*! \brief re-use definition of dmlc::SeekStream */
-typedef dmlc::SeekStream SeekStream;
-/*! \brief fixed size memory buffer */
-struct MemoryFixSizeBuffer : public SeekStream {
- public:
-  MemoryFixSizeBuffer(void *p_buffer, size_t buffer_size)
-      : p_buffer_(reinterpret_cast<char*>(p_buffer)),
-        buffer_size_(buffer_size) {
-    curr_ptr_ = 0;
-  }
-  virtual ~MemoryFixSizeBuffer(void) {}
-  virtual size_t Read(void *ptr, size_t size) {
-    utils::Assert(curr_ptr_ + size <= buffer_size_,
-                  "read can not have position excceed buffer length");
-    size_t nread = std::min(buffer_size_ - curr_ptr_, size);
-    if (nread != 0) std::memcpy(ptr, p_buffer_ + curr_ptr_, nread);
-    curr_ptr_ += nread;
-    return nread;
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    if (size == 0) return;
-    utils::Assert(curr_ptr_ + size <=  buffer_size_,
-                  "write position exceed fixed buffer size");
-    std::memcpy(p_buffer_ + curr_ptr_, ptr, size);
-    curr_ptr_ += size;
-  }
-  virtual void Seek(size_t pos) {
-    curr_ptr_ = static_cast<size_t>(pos);
-  }
-  virtual size_t Tell(void) {
-    return curr_ptr_;
-  }
-  virtual bool AtEnd(void) const {
-    return curr_ptr_ == buffer_size_;
-  }
-
- private:
-  /*! \brief in memory buffer */
-  char *p_buffer_;
-  /*! \brief current pointer */
-  size_t buffer_size_;
-  /*! \brief current pointer */
-  size_t curr_ptr_;
-};  // class MemoryFixSizeBuffer
-
-/*! \brief a in memory buffer that can be read and write as stream interface */
-struct MemoryBufferStream : public SeekStream {
- public:
-  explicit MemoryBufferStream(std::string *p_buffer)
-      : p_buffer_(p_buffer) {
-    curr_ptr_ = 0;
-  }
-  virtual ~MemoryBufferStream(void) {}
-  virtual size_t Read(void *ptr, size_t size) {
-    utils::Assert(curr_ptr_ <= p_buffer_->length(),
-                  "read can not have position excceed buffer length");
-    size_t nread = std::min(p_buffer_->length() - curr_ptr_, size);
-    if (nread != 0) std::memcpy(ptr, &(*p_buffer_)[0] + curr_ptr_, nread);
-    curr_ptr_ += nread;
-    return nread;
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    if (size == 0) return;
-    if (curr_ptr_ + size > p_buffer_->length()) {
-      p_buffer_->resize(curr_ptr_+size);
-    }
-    std::memcpy(&(*p_buffer_)[0] + curr_ptr_, ptr, size);
-    curr_ptr_ += size;
-  }
-  virtual void Seek(size_t pos) {
-    curr_ptr_ = static_cast<size_t>(pos);
-  }
-  virtual size_t Tell(void) {
-    return curr_ptr_;
-  }
-  virtual bool AtEnd(void) const {
-    return curr_ptr_ == p_buffer_->length();
-  }
-
- private:
-  /*! \brief in memory buffer */
-  std::string *p_buffer_;
-  /*! \brief current pointer */
-  size_t curr_ptr_;
-};  // class MemoryBufferStream
-}  // namespace utils
-}  // namespace rabit
-#endif  // RABIT_IO_H_
diff --git a/subtree/rabit/include/rabit/rabit-inl.h b/subtree/rabit/include/rabit/rabit-inl.h
deleted file mode 100644
index e82b5a9a0..000000000
--- a/subtree/rabit/include/rabit/rabit-inl.h
+++ /dev/null
@@ -1,328 +0,0 @@
-/*!
- * Copyright by Contributors
- * \file rabit-inl.h
- * \brief implementation of inline template function for rabit interface
- *
- * \author Tianqi Chen
- */
-#ifndef RABIT_RABIT_INL_H_
-#define RABIT_RABIT_INL_H_
-// use engine for implementation
-#include <vector>
-#include <string>
-#include "./io.h"
-#include "./utils.h"
-#include "../rabit.h"
-
-namespace rabit {
-namespace engine {
-namespace mpi {
-// template function to translate type to enum indicator
-template<typename DType>
-inline DataType GetType(void);
-template<>
-inline DataType GetType<char>(void) {
-  return kChar;
-}
-template<>
-inline DataType GetType<unsigned char>(void) {
-  return kUChar;
-}
-template<>
-inline DataType GetType<int>(void) {
-  return kInt;
-}
-template<>
-inline DataType GetType<unsigned int>(void) { // NOLINT(*)
-  return kUInt;
-}
-template<>
-inline DataType GetType<long>(void) {  // NOLINT(*)
-  return kLong;
-}
-template<>
-inline DataType GetType<unsigned long>(void) { // NOLINT(*)
-  return kULong;
-}
-template<>
-inline DataType GetType<float>(void) {
-  return kFloat;
-}
-template<>
-inline DataType GetType<double>(void) {
-  return kDouble;
-}
-template<>
-inline DataType GetType<long long>(void) { // NOLINT(*)
-  return kLongLong;
-}
-template<>
-inline DataType GetType<unsigned long long>(void) { // NOLINT(*)
-  return kULongLong;
-}
-}  // namespace mpi
-}  // namespace engine
-
-namespace op {
-struct Max {
-  static const engine::mpi::OpType kType = engine::mpi::kMax;
-  template<typename DType>
-  inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*)
-    if (dst < src) dst = src;
-  }
-};
-struct Min {
-  static const engine::mpi::OpType kType = engine::mpi::kMin;
-  template<typename DType>
-  inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*)
-    if (dst > src) dst = src;
-  }
-};
-struct Sum {
-  static const engine::mpi::OpType kType = engine::mpi::kSum;
-  template<typename DType>
-  inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*)
-    dst += src;
-  }
-};
-struct BitOR {
-  static const engine::mpi::OpType kType = engine::mpi::kBitwiseOR;
-  template<typename DType>
-  inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*)
-    dst |= src;
-  }
-};
-template<typename OP, typename DType>
-inline void Reducer(const void *src_, void *dst_, int len, const MPI::Datatype &dtype) {
-  const DType *src = (const DType*)src_;
-  DType *dst = (DType*)dst_;  // NOLINT(*)
-  for (int i = 0; i < len; ++i) {
-    OP::Reduce(dst[i], src[i]);
-  }
-}
-}  // namespace op
-
-// intialize the rabit engine
-inline void Init(int argc, char *argv[]) {
-  engine::Init(argc, argv);
-}
-// finalize the rabit engine
-inline void Finalize(void) {
-  engine::Finalize();
-}
-// get the rank of current process
-inline int GetRank(void) {
-  return engine::GetEngine()->GetRank();
-}
-// the the size of the world
-inline int GetWorldSize(void) {
-  return engine::GetEngine()->GetWorldSize();
-}
-// whether rabit is distributed
-inline bool IsDistributed(void) {
-  return engine::GetEngine()->IsDistributed();
-}
-// get the name of current processor
-inline std::string GetProcessorName(void) {
-  return engine::GetEngine()->GetHost();
-}
-// broadcast data to all other nodes from root
-inline void Broadcast(void *sendrecv_data, size_t size, int root) {
-  engine::GetEngine()->Broadcast(sendrecv_data, size, root);
-}
-template<typename DType>
-inline void Broadcast(std::vector<DType> *sendrecv_data, int root) {
-  size_t size = sendrecv_data->size();
-  Broadcast(&size, sizeof(size), root);
-  if (sendrecv_data->size() != size) {
-    sendrecv_data->resize(size);
-  }
-  if (size != 0) {
-    Broadcast(&(*sendrecv_data)[0], size * sizeof(DType), root);
-  }
-}
-inline void Broadcast(std::string *sendrecv_data, int root) {
-  size_t size = sendrecv_data->length();
-  Broadcast(&size, sizeof(size), root);
-  if (sendrecv_data->length() != size) {
-    sendrecv_data->resize(size);
-  }
-  if (size != 0) {
-    Broadcast(&(*sendrecv_data)[0], size * sizeof(char), root);
-  }
-}
-
-// perform inplace Allreduce
-template<typename OP, typename DType>
-inline void Allreduce(DType *sendrecvbuf, size_t count,
-                      void (*prepare_fun)(void *arg),
-                      void *prepare_arg) {
-  engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer<OP, DType>,
-                     engine::mpi::GetType<DType>(), OP::kType, prepare_fun, prepare_arg);
-}
-
-// C++11 support for lambda prepare function
-#if DMLC_USE_CXX11
-inline void InvokeLambda_(void *fun) {
-  (*static_cast<std::function<void()>*>(fun))();
-}
-template<typename OP, typename DType>
-inline void Allreduce(DType *sendrecvbuf, size_t count, std::function<void()> prepare_fun) {
-  engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer<OP, DType>,
-                     engine::mpi::GetType<DType>(), OP::kType, InvokeLambda_, &prepare_fun);
-}
-#endif  // C++11
-
-// print message to the tracker
-inline void TrackerPrint(const std::string &msg) {
-  engine::GetEngine()->TrackerPrint(msg);
-}
-#ifndef RABIT_STRICT_CXX98_
-inline void TrackerPrintf(const char *fmt, ...) {
-  const int kPrintBuffer = 1 << 10;
-  std::string msg(kPrintBuffer, '\0');
-  va_list args;
-  va_start(args, fmt);
-  vsnprintf(&msg[0], kPrintBuffer, fmt, args);
-  va_end(args);
-  msg.resize(strlen(msg.c_str()));
-  TrackerPrint(msg);
-}
-#endif
-// load latest check point
-inline int LoadCheckPoint(Serializable *global_model,
-                          Serializable *local_model) {
-  return engine::GetEngine()->LoadCheckPoint(global_model, local_model);
-}
-// checkpoint the model, meaning we finished a stage of execution
-inline void CheckPoint(const Serializable *global_model,
-                       const Serializable *local_model) {
-  engine::GetEngine()->CheckPoint(global_model, local_model);
-}
-// lazy checkpoint the model, only remember the pointer to global_model
-inline void LazyCheckPoint(const Serializable *global_model) {
-  engine::GetEngine()->LazyCheckPoint(global_model);
-}
-// return the version number of currently stored model
-inline int VersionNumber(void) {
-  return engine::GetEngine()->VersionNumber();
-}
-// ---------------------------------
-// Code to handle customized Reduce
-// ---------------------------------
-// function to perform reduction for Reducer
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>
-inline void ReducerSafe_(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
-  const size_t kUnit = sizeof(DType);
-  const char *psrc = reinterpret_cast<const char*>(src_);
-  char *pdst = reinterpret_cast<char*>(dst_);
-  DType tdst, tsrc;
-  for (int i = 0; i < len_; ++i) {
-    // use memcpy to avoid alignment issue
-    std::memcpy(&tdst, pdst + i * kUnit, sizeof(tdst));
-    std::memcpy(&tsrc, psrc + i * kUnit, sizeof(tsrc));
-    freduce(tdst, tsrc);
-    std::memcpy(pdst + i * kUnit, &tdst, sizeof(tdst));
-  }
-}
-// function to perform reduction for Reducer
-template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)
-inline void ReducerAlign_(const void *src_, void *dst_,
-                          int len_, const MPI::Datatype &dtype) {
-  const DType *psrc = reinterpret_cast<const DType*>(src_);
-  DType *pdst = reinterpret_cast<DType*>(dst_);
-  for (int i = 0; i < len_; ++i) {
-    freduce(pdst[i], psrc[i]);
-  }
-}
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
-inline Reducer<DType, freduce>::Reducer(void) {
-  // it is safe to directly use handle for aligned data types
-  if (sizeof(DType) == 8 || sizeof(DType) == 4 || sizeof(DType) == 1) {
-    this->handle_.Init(ReducerAlign_<DType, freduce>, sizeof(DType));
-  } else {
-    this->handle_.Init(ReducerSafe_<DType, freduce>, sizeof(DType));
-  }
-}
-template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)
-inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
-                                               void (*prepare_fun)(void *arg),
-                                               void *prepare_arg) {
-  handle_.Allreduce(sendrecvbuf, sizeof(DType), count, prepare_fun, prepare_arg);
-}
-// function to perform reduction for SerializeReducer
-template<typename DType>
-inline void SerializeReducerFunc_(const void *src_, void *dst_,
-                                  int len_, const MPI::Datatype &dtype) {
-  int nbytes = engine::ReduceHandle::TypeSize(dtype);
-  // temp space
-  DType tsrc, tdst;
-  for (int i = 0; i < len_; ++i) {
-    utils::MemoryFixSizeBuffer fsrc((char*)(src_) + i * nbytes, nbytes); // NOLINT(*)
-    utils::MemoryFixSizeBuffer fdst((char*)(dst_) + i * nbytes, nbytes); // NOLINT(*)
-    tsrc.Load(fsrc);
-    tdst.Load(fdst);
-    // govern const check
-    tdst.Reduce(static_cast<const DType &>(tsrc), nbytes);
-    fdst.Seek(0);
-    tdst.Save(fdst);
-  }
-}
-template<typename DType>
-inline SerializeReducer<DType>::SerializeReducer(void) {
-  handle_.Init(SerializeReducerFunc_<DType>, sizeof(DType));
-}
-// closure to call Allreduce
-template<typename DType>
-struct SerializeReduceClosure {
-  DType *sendrecvobj;
-  size_t max_nbyte, count;
-  void (*prepare_fun)(void *arg);
-  void *prepare_arg;
-  std::string *p_buffer;
-  // invoke the closure
-  inline void Run(void) {
-    if (prepare_fun != NULL) prepare_fun(prepare_arg);
-    for (size_t i = 0; i < count; ++i) {
-      utils::MemoryFixSizeBuffer fs(BeginPtr(*p_buffer) + i * max_nbyte, max_nbyte);
-      sendrecvobj[i].Save(fs);
-    }
-  }
-  inline static void Invoke(void *c) {
-    static_cast<SerializeReduceClosure<DType>*>(c)->Run();
-  }
-};
-template<typename DType>
-inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
-                                               size_t max_nbyte, size_t count,
-                                               void (*prepare_fun)(void *arg),
-                                               void *prepare_arg) {
-  buffer_.resize(max_nbyte * count);
-  // setup closure
-  SerializeReduceClosure<DType> c;
-  c.sendrecvobj = sendrecvobj; c.max_nbyte = max_nbyte; c.count = count;
-  c.prepare_fun = prepare_fun; c.prepare_arg = prepare_arg; c.p_buffer = &buffer_;
-  // invoke here
-  handle_.Allreduce(BeginPtr(buffer_), max_nbyte, count,
-                    SerializeReduceClosure<DType>::Invoke, &c);
-  for (size_t i = 0; i < count; ++i) {
-    utils::MemoryFixSizeBuffer fs(BeginPtr(buffer_) + i * max_nbyte, max_nbyte);
-    sendrecvobj[i].Load(fs);
-  }
-}
-
-#if DMLC_USE_CXX11
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)g
-inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
-                                               std::function<void()> prepare_fun) {
-  this->Allreduce(sendrecvbuf, count, InvokeLambda_, &prepare_fun);
-}
-template<typename DType>
-inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
-                                               size_t max_nbytes, size_t count,
-                                               std::function<void()> prepare_fun) {
-  this->Allreduce(sendrecvobj, max_nbytes, count, InvokeLambda_, &prepare_fun);
-}
-#endif
-}  // namespace rabit
-#endif  // RABIT_RABIT_INL_H_
diff --git a/subtree/rabit/include/rabit/timer.h b/subtree/rabit/include/rabit/timer.h
deleted file mode 100644
index 1f135add6..000000000
--- a/subtree/rabit/include/rabit/timer.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*!
- * Copyright by Contributors
- * \file timer.h
- * \brief This file defines the utils for timing
- * \author Tianqi Chen, Nacho, Tianyi
- */
-#ifndef RABIT_TIMER_H_
-#define RABIT_TIMER_H_
-#include <time.h>
-#ifdef __MACH__
-#include <mach/clock.h>
-#include <mach/mach.h>
-#endif
-#include "./utils.h"
-
-namespace rabit {
-namespace utils {
-/*!
- * \brief return time in seconds, not cross platform, avoid to use this in most places
- */
-inline double GetTime(void) {
-  #ifdef __MACH__
-  clock_serv_t cclock;
-  mach_timespec_t mts;
-  host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
-  utils::Check(clock_get_time(cclock, &mts) == 0, "failed to get time");
-  mach_port_deallocate(mach_task_self(), cclock);
-  return static_cast<double>(mts.tv_sec) + static_cast<double>(mts.tv_nsec) * 1e-9;
-  #else
-  #if defined(__unix__) || defined(__linux__)
-  timespec ts;
-  utils::Check(clock_gettime(CLOCK_REALTIME, &ts) == 0, "failed to get time");
-  return static_cast<double>(ts.tv_sec) + static_cast<double>(ts.tv_nsec) * 1e-9;
-  #else
-  return static_cast<double>(time(NULL));
-  #endif
-  #endif
-}
-}  // namespace utils
-}  // namespace rabit
-#endif  // RABIT_TIMER_H_
diff --git a/subtree/rabit/include/rabit/utils.h b/subtree/rabit/include/rabit/utils.h
deleted file mode 100644
index 28709ee7d..000000000
--- a/subtree/rabit/include/rabit/utils.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file utils.h
- * \brief simple utils to support the code
- * \author Tianqi Chen
- */
-#ifndef RABIT_UTILS_H_
-#define RABIT_UTILS_H_
-#define _CRT_SECURE_NO_WARNINGS
-#include <cstdio>
-#include <string>
-#include <cstdlib>
-#include <vector>
-
-#ifndef RABIT_STRICT_CXX98_
-#include <cstdarg>
-#endif
-
-#if !defined(__GNUC__)
-#define fopen64 std::fopen
-#endif
-#ifdef _MSC_VER
-// NOTE: sprintf_s is not equivalent to snprintf,
-// they are equivalent when success, which is sufficient for our case
-#define snprintf sprintf_s
-#define vsnprintf vsprintf_s
-#else
-#ifdef _FILE_OFFSET_BITS
-#if _FILE_OFFSET_BITS == 32
-#pragma message("Warning: FILE OFFSET BITS defined to be 32 bit")
-#endif
-#endif
-
-#ifdef __APPLE__
-#define off64_t off_t
-#define fopen64 std::fopen
-#endif
-
-extern "C" {
-#include <sys/types.h>
-}
-#endif
-
-#ifdef _MSC_VER
-typedef unsigned char uint8_t;
-typedef unsigned __int16 uint16_t;
-typedef unsigned __int32 uint32_t;
-typedef unsigned __int64 uint64_t;
-typedef __int64 int64_t;
-#else
-#include <inttypes.h>
-#endif
-
-namespace rabit {
-/*! \brief namespace for helper utils of the project */
-namespace utils {
-
-/*! \brief error message buffer length */
-const int kPrintBuffer = 1 << 12;
-
-#ifndef RABIT_CUSTOMIZE_MSG_
-/*!
- * \brief handling of Assert error, caused by inappropriate input
- * \param msg error message
- */
-inline void HandleAssertError(const char *msg) {
-  fprintf(stderr, "AssertError:%s\n", msg);
-  exit(-1);
-}
-/*!
- * \brief handling of Check error, caused by inappropriate input
- * \param msg error message
- */
-inline void HandleCheckError(const char *msg) {
-  fprintf(stderr, "%s\n", msg);
-  exit(-1);
-}
-inline void HandlePrint(const char *msg) {
-  printf("%s", msg);
-}
-inline void HandleLogPrint(const char *msg) {
-  fprintf(stderr, "%s", msg);
-  fflush(stderr);
-}
-#else
-#ifndef RABIT_STRICT_CXX98_
-// include declarations, some one must implement this
-void HandleAssertError(const char *msg);
-void HandleCheckError(const char *msg);
-void HandlePrint(const char *msg);
-#endif
-#endif
-#ifdef RABIT_STRICT_CXX98_
-// these function pointers are to be assigned
-extern "C" void (*Printf)(const char *fmt, ...);
-extern "C" int (*SPrintf)(char *buf, size_t size, const char *fmt, ...);
-extern "C" void (*Assert)(int exp, const char *fmt, ...);
-extern "C" void (*Check)(int exp, const char *fmt, ...);
-extern "C" void (*Error)(const char *fmt, ...);
-#else
-/*! \brief printf, prints messages to the console */
-inline void Printf(const char *fmt, ...) {
-  std::string msg(kPrintBuffer, '\0');
-  va_list args;
-  va_start(args, fmt);
-  vsnprintf(&msg[0], kPrintBuffer, fmt, args);
-  va_end(args);
-  HandlePrint(msg.c_str());
-}
-/*! \brief portable version of snprintf */
-inline int SPrintf(char *buf, size_t size, const char *fmt, ...) {
-  va_list args;
-  va_start(args, fmt);
-  int ret = vsnprintf(buf, size, fmt, args);
-  va_end(args);
-  return ret;
-}
-
-/*! \brief assert a condition is true, use this to handle debug information */
-inline void Assert(bool exp, const char *fmt, ...) {
-  if (!exp) {
-    std::string msg(kPrintBuffer, '\0');
-    va_list args;
-    va_start(args, fmt);
-    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
-    va_end(args);
-    HandleAssertError(msg.c_str());
-  }
-}
-
-/*!\brief same as assert, but this is intended to be used as a message for users */
-inline void Check(bool exp, const char *fmt, ...) {
-  if (!exp) {
-    std::string msg(kPrintBuffer, '\0');
-    va_list args;
-    va_start(args, fmt);
-    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
-    va_end(args);
-    HandleCheckError(msg.c_str());
-  }
-}
-
-/*! \brief report error message, same as check */
-inline void Error(const char *fmt, ...) {
-  {
-    std::string msg(kPrintBuffer, '\0');
-    va_list args;
-    va_start(args, fmt);
-    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
-    va_end(args);
-    HandleCheckError(msg.c_str());
-  }
-}
-#endif
-
-/*! \brief replace fopen, report error when the file open fails */
-inline std::FILE *FopenCheck(const char *fname, const char *flag) {
-  std::FILE *fp = fopen64(fname, flag);
-  Check(fp != NULL, "can not open file \"%s\"\n", fname);
-  return fp;
-}
-}  // namespace utils
-// easy utils that can be directly accessed in xgboost
-/*! \brief get the beginning address of a vector */
-template<typename T>
-inline T *BeginPtr(std::vector<T> &vec) {  // NOLINT(*)
-  if (vec.size() == 0) {
-    return NULL;
-  } else {
-    return &vec[0];
-  }
-}
-/*! \brief get the beginning address of a vector */
-template<typename T>
-inline const T *BeginPtr(const std::vector<T> &vec) {  // NOLINT(*)
-  if (vec.size() == 0) {
-    return NULL;
-  } else {
-    return &vec[0];
-  }
-}
-inline char* BeginPtr(std::string &str) {  // NOLINT(*)
-  if (str.length() == 0) return NULL;
-  return &str[0];
-}
-inline const char* BeginPtr(const std::string &str) {
-  if (str.length() == 0) return NULL;
-  return &str[0];
-}
-}  // namespace rabit
-#endif  // RABIT_UTILS_H_
diff --git a/subtree/rabit/include/rabit_serializable.h b/subtree/rabit/include/rabit_serializable.h
deleted file mode 100644
index c9199bba1..000000000
--- a/subtree/rabit/include/rabit_serializable.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file rabit_serializable.h
- * \brief defines serializable interface of rabit
- * \author Tianqi Chen
- */
-#ifndef RABIT_SERIALIZABLE_H_
-#define RABIT_SERIALIZABLE_H_
-#include <vector>
-#include <string>
-#include "./rabit/utils.h"
-#include "./dmlc/io.h"
-
-namespace rabit {
-/*!
- * \brief defines stream used in rabit
- * see definition of Stream in dmlc/io.h
- */
-typedef dmlc::Stream Stream;
-/*!
- * \brief defines serializable objects used in rabit
- * see definition of Serializable in dmlc/io.h
- */
-typedef dmlc::Serializable Serializable;
-
-}  // namespace rabit
-#endif  // RABIT_SERIALIZABLE_H_
diff --git a/subtree/rabit/lib/README.md b/subtree/rabit/lib/README.md
deleted file mode 100644
index b6a5aa8b2..000000000
--- a/subtree/rabit/lib/README.md
+++ /dev/null
@@ -1,15 +0,0 @@
-Rabit Library
-=====
-This folder holds the library file generated by the compiler. To generate the library file, type ```make``` in the project root folder. If you want mpi compatible library, type ```make mpi```
-
-***List of Files***
-* rabit.a The rabit package library
-  - Normally you need to link with this one
-* rabit_mock.a The rabit package library with mock test
-  - This library allows additional mock-test
-* rabit_mpi.a The MPI backed library
-  - Link against this library makes the program use MPI Allreduce
-  - This library is not fault-tolerant
-* rabit_empty.a Dummy package implementation
-  - This is an empty library that does not provide anything
-  - Only introduced to minimize code dependency for projects that only need single machine code
diff --git a/subtree/rabit/scripts/travis_runtest.sh b/subtree/rabit/scripts/travis_runtest.sh
deleted file mode 100755
index f57141c6c..000000000
--- a/subtree/rabit/scripts/travis_runtest.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-make -f test.mk model_recover_10_10k || exit -1
-make -f test.mk model_recover_10_10k_die_same  || exit -1
-make -f test.mk local_recover_10_10k || exit -1
-make -f test.mk pylocal_recover_10_10k || exit -1
-make -f test.mk lazy_recover_10_10k_die_hard || exit -1
-make -f test.mk lazy_recover_10_10k_die_same || exit -1
-make -f test.mk ringallreduce_10_10k || exit -1
\ No newline at end of file
diff --git a/subtree/rabit/scripts/travis_script.sh b/subtree/rabit/scripts/travis_script.sh
deleted file mode 100755
index 664582906..000000000
--- a/subtree/rabit/scripts/travis_script.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-
-# main script of travis
-if [ ${TASK} == "lint" ]; then
-    make lint || exit -1
-fi
-
-if [ ${TASK} == "doc" ]; then
-    make doc 2>log.txt
-    (cat log.txt| grep -v ENABLE_PREPROCESSING |grep -v "unsupported tag" |grep warning) && exit -1
-fi
-
-if [ ${TASK} == "build" ]; then
-    make all || exit -1
-fi
-
-if [ ${TASK} == "test" ]; then
-    cd test
-    make all || exit -1
-    ../scripts/travis_runtest.sh || exit -1
-fi
-
diff --git a/subtree/rabit/src/README.md b/subtree/rabit/src/README.md
deleted file mode 100644
index 5e55d9210..000000000
--- a/subtree/rabit/src/README.md
+++ /dev/null
@@ -1,6 +0,0 @@
-Source Files of Rabit
-====
-* This folder contains the source files of rabit library
-* The library headers are in folder [include](../include)
-* The .h files in this folder are internal header files that are only used by rabit and will not be seen by users
-
diff --git a/subtree/rabit/src/allreduce_base.cc b/subtree/rabit/src/allreduce_base.cc
deleted file mode 100644
index d3b7502ff..000000000
--- a/subtree/rabit/src/allreduce_base.cc
+++ /dev/null
@@ -1,892 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file allreduce_base.cc
- * \brief Basic implementation of AllReduce
- *
- * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
- */
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <map>
-#include <cstdlib>
-#include <cstring>
-#include "./allreduce_base.h"
-
-namespace rabit {
-namespace engine {
-// constructor
-AllreduceBase::AllreduceBase(void) {
-  tracker_uri = "NULL";
-  tracker_port = 9000;
-  host_uri = "";
-  slave_port = 9010;
-  nport_trial = 1000;
-  rank = 0;
-  world_size = -1;
-  connect_retry = 5;
-  hadoop_mode = 0;
-  version_number = 0;
-  // 32 K items
-  reduce_ring_mincount = 32 << 10;
-  // tracker URL
-  task_id = "NULL";
-  err_link = NULL;
-  dmlc_role = "worker";
-  this->SetParam("rabit_reduce_buffer", "256MB");
-  // setup possible enviroment variable of intrest
-  env_vars.push_back("rabit_task_id");
-  env_vars.push_back("rabit_num_trial");
-  env_vars.push_back("rabit_reduce_buffer");
-  env_vars.push_back("rabit_reduce_ring_mincount");
-  env_vars.push_back("rabit_tracker_uri");
-  env_vars.push_back("rabit_tracker_port");
-  // also include dmlc support direct variables
-  env_vars.push_back("DMLC_TASK_ID");
-  env_vars.push_back("DMLC_ROLE");
-  env_vars.push_back("DMLC_NUM_ATTEMPT");
-  env_vars.push_back("DMLC_TRACKER_URI");
-  env_vars.push_back("DMLC_TRACKER_PORT");
-  env_vars.push_back("DMLC_WORKER_CONNECT_RETRY");
-}
-
-// initialization function
-void AllreduceBase::Init(void) {
-  // setup from enviroment variables
-  // handler to get variables from env
-  for (size_t i = 0; i < env_vars.size(); ++i) {
-    const char *value = getenv(env_vars[i].c_str());
-    if (value != NULL) {
-      this->SetParam(env_vars[i].c_str(), value);
-    }
-  }
-  {
-    // handling for hadoop
-    const char *task_id = getenv("mapred_tip_id");
-    if (task_id == NULL) {
-      task_id = getenv("mapreduce_task_id");
-    }
-    if (hadoop_mode != 0) {
-      utils::Check(task_id != NULL,
-                   "hadoop_mode is set but cannot find mapred_task_id");
-    }
-    if (task_id != NULL) {
-      this->SetParam("rabit_task_id", task_id);
-      this->SetParam("rabit_hadoop_mode", "1");
-    }
-    const char *attempt_id = getenv("mapred_task_id");
-    if (attempt_id != 0) {
-      const char *att = strrchr(attempt_id, '_');
-      int num_trial;
-      if (att != NULL && sscanf(att + 1, "%d", &num_trial) == 1) {
-        this->SetParam("rabit_num_trial", att + 1);
-      }
-    }
-    // handling for hadoop
-    const char *num_task = getenv("mapred_map_tasks");
-    if (num_task == NULL) {
-      num_task = getenv("mapreduce_job_maps");
-    }
-    if (hadoop_mode != 0) {
-      utils::Check(num_task != NULL,
-                   "hadoop_mode is set but cannot find mapred_map_tasks");
-    }
-    if (num_task != NULL) {
-      this->SetParam("rabit_world_size", num_task);
-    }
-  }
-  if (dmlc_role != "worker") {
-    fprintf(stderr, "Rabit Module currently only work with dmlc worker"\
-            ", quit this program by exit 0\n");
-    exit(0);
-  }
-  // clear the setting before start reconnection
-  this->rank = -1;
-  //---------------------
-  // start socket
-  utils::Socket::Startup();
-  utils::Assert(all_links.size() == 0, "can only call Init once");
-  this->host_uri = utils::SockAddr::GetHostName();
-  // get information from tracker
-  this->ReConnectLinks();
-}
-
-void AllreduceBase::Shutdown(void) {
-  for (size_t i = 0; i < all_links.size(); ++i) {
-    all_links[i].sock.Close();
-  }
-  all_links.clear();
-  tree_links.plinks.clear();
-
-  if (tracker_uri == "NULL") return;
-  // notify tracker rank i have shutdown
-  utils::TCPSocket tracker = this->ConnectTracker();
-  tracker.SendStr(std::string("shutdown"));
-  tracker.Close();
-  utils::TCPSocket::Finalize();
-}
-void AllreduceBase::TrackerPrint(const std::string &msg) {
-  if (tracker_uri == "NULL") {
-    utils::Printf("%s", msg.c_str()); return;
-  }
-  utils::TCPSocket tracker = this->ConnectTracker();
-  tracker.SendStr(std::string("print"));
-  tracker.SendStr(msg);
-  tracker.Close();
-}
-// util to parse data with unit suffix
-inline size_t ParseUnit(const char *name, const char *val) {
-  char unit;
-  unsigned long amt;  // NOLINT(*)
-  int n = sscanf(val, "%lu%c", &amt, &unit);
-  size_t amount = amt;
-  if (n == 2) {
-    switch (unit) {
-      case 'B': return amount;
-      case 'K': return amount << 10UL;
-      case 'M': return amount << 20UL;
-      case 'G': return amount << 30UL;
-      default: utils::Error("invalid format for %s", name); return 0;
-    }
-  } else if (n == 1) {
-    return amount;
-  } else {
-    utils::Error("invalid format for %s,"                               \
-                 "shhould be {integer}{unit}, unit can be {B, KB, MB, GB}", name);
-    return 0;
-  }
-}
-/*!
- * \brief set parameters to the engine
- * \param name parameter name
- * \param val parameter value
- */
-void AllreduceBase::SetParam(const char *name, const char *val) {
-  if (!strcmp(name, "rabit_tracker_uri")) tracker_uri = val;
-  if (!strcmp(name, "rabit_tracker_port")) tracker_port = atoi(val);
-  if (!strcmp(name, "rabit_task_id")) task_id = val;
-  if (!strcmp(name, "DMLC_TRACKER_URI")) tracker_uri = val;
-  if (!strcmp(name, "DMLC_TRACKER_PORT")) tracker_port = atoi(val);
-  if (!strcmp(name, "DMLC_TASK_ID")) task_id = val;
-  if (!strcmp(name, "DMLC_ROLE")) dmlc_role = val;
-  if (!strcmp(name, "rabit_world_size")) world_size = atoi(val);
-  if (!strcmp(name, "rabit_hadoop_mode")) hadoop_mode = atoi(val);
-  if (!strcmp(name, "rabit_reduce_ring_mincount")) {
-    reduce_ring_mincount = ParseUnit(name, val);
-  }
-  if (!strcmp(name, "rabit_reduce_buffer")) {
-    reduce_buffer_size = (ParseUnit(name, val) + 7) >> 3;
-  }
-  if (!strcmp(name, "DMLC_WORKER_CONNECT_RETRY")) {
-    connect_retry = atoi(val);
-  }
-}
-/*!
- * \brief initialize connection to the tracker
- * \return a socket that initializes the connection
- */
-utils::TCPSocket AllreduceBase::ConnectTracker(void) const {
-  int magic = kMagic;
-  // get information from tracker
-  utils::TCPSocket tracker;
-  tracker.Create();
-
-  int retry = 0;
-  do {
-    fprintf(stderr, "connect to ip: [%s]\n", tracker_uri.c_str());
-    if (!tracker.Connect(utils::SockAddr(tracker_uri.c_str(), tracker_port))) {
-      if (++retry >= connect_retry) {
-        fprintf(stderr, "connect to (failed): [%s]\n", tracker_uri.c_str());
-        utils::Socket::Error("Connect");
-      } else {
-        fprintf(stderr, "retry connect to ip(retry time %d): [%s]\n", retry, tracker_uri.c_str());
-        #ifdef _MSC_VER
-        Sleep(1);
-        #else
-        sleep(1);
-        #endif
-        continue;
-      }
-    }
-    break;
-  } while (1);
-
-  using utils::Assert;
-  Assert(tracker.SendAll(&magic, sizeof(magic)) == sizeof(magic),
-         "ReConnectLink failure 1");
-  Assert(tracker.RecvAll(&magic, sizeof(magic)) == sizeof(magic),
-         "ReConnectLink failure 2");
-  utils::Check(magic == kMagic, "sync::Invalid tracker message, init failure");
-  Assert(tracker.SendAll(&rank, sizeof(rank)) == sizeof(rank),
-                "ReConnectLink failure 3");
-  Assert(tracker.SendAll(&world_size, sizeof(world_size)) == sizeof(world_size),
-         "ReConnectLink failure 3");
-  tracker.SendStr(task_id);
-  return tracker;
-}
-/*!
- * \brief connect to the tracker to fix the the missing links
- *   this function is also used when the engine start up
- */
-void AllreduceBase::ReConnectLinks(const char *cmd) {
-  // single node mode
-  if (tracker_uri == "NULL") {
-    rank = 0; world_size = 1; return;
-  }
-  utils::TCPSocket tracker = this->ConnectTracker();
-  tracker.SendStr(std::string(cmd));
-
-  // the rank of previous link, next link in ring
-  int prev_rank, next_rank;
-  // the rank of neighbors
-  std::map<int, int> tree_neighbors;
-  using utils::Assert;
-  // get new ranks
-  int newrank, num_neighbors;
-  Assert(tracker.RecvAll(&newrank, sizeof(newrank)) == sizeof(newrank),
-           "ReConnectLink failure 4");
-  Assert(tracker.RecvAll(&parent_rank, sizeof(parent_rank)) ==\
-         sizeof(parent_rank), "ReConnectLink failure 4");
-  Assert(tracker.RecvAll(&world_size, sizeof(world_size)) == sizeof(world_size),
-         "ReConnectLink failure 4");
-  Assert(rank == -1 || newrank == rank,
-         "must keep rank to same if the node already have one");
-  rank = newrank;
-  Assert(tracker.RecvAll(&num_neighbors, sizeof(num_neighbors)) ==  \
-         sizeof(num_neighbors), "ReConnectLink failure 4");
-  for (int i = 0; i < num_neighbors; ++i) {
-    int nrank;
-    Assert(tracker.RecvAll(&nrank, sizeof(nrank)) == sizeof(nrank),
-           "ReConnectLink failure 4");
-    tree_neighbors[nrank] = 1;
-  }
-  Assert(tracker.RecvAll(&prev_rank, sizeof(prev_rank)) == sizeof(prev_rank),
-         "ReConnectLink failure 4");
-  Assert(tracker.RecvAll(&next_rank, sizeof(next_rank)) == sizeof(next_rank),
-         "ReConnectLink failure 4");
-  // create listening socket
-  utils::TCPSocket sock_listen;
-  sock_listen.Create();
-  int port = sock_listen.TryBindHost(slave_port, slave_port + nport_trial);
-  utils::Check(port != -1, "ReConnectLink fail to bind the ports specified");
-  sock_listen.Listen();
-
-  // get number of to connect and number of to accept nodes from tracker
-  int num_conn, num_accept, num_error = 1;
-  do {
-    // send over good links
-    std::vector<int> good_link;
-    for (size_t i = 0; i < all_links.size(); ++i) {
-      if (!all_links[i].sock.BadSocket()) {
-        good_link.push_back(static_cast<int>(all_links[i].rank));
-      } else {
-        if (!all_links[i].sock.IsClosed()) all_links[i].sock.Close();
-      }
-    }
-    int ngood = static_cast<int>(good_link.size());
-    Assert(tracker.SendAll(&ngood, sizeof(ngood)) == sizeof(ngood),
-           "ReConnectLink failure 5");
-    for (size_t i = 0; i < good_link.size(); ++i) {
-      Assert(tracker.SendAll(&good_link[i], sizeof(good_link[i])) == \
-             sizeof(good_link[i]), "ReConnectLink failure 6");
-    }
-    Assert(tracker.RecvAll(&num_conn, sizeof(num_conn)) == sizeof(num_conn),
-           "ReConnectLink failure 7");
-    Assert(tracker.RecvAll(&num_accept, sizeof(num_accept)) ==  \
-           sizeof(num_accept), "ReConnectLink failure 8");
-    num_error = 0;
-    for (int i = 0; i < num_conn; ++i) {
-      LinkRecord r;
-      int hport, hrank;
-      std::string hname;
-      tracker.RecvStr(&hname);
-      Assert(tracker.RecvAll(&hport, sizeof(hport)) == sizeof(hport),
-             "ReConnectLink failure 9");
-      Assert(tracker.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank),
-             "ReConnectLink failure 10");
-      r.sock.Create();
-      if (!r.sock.Connect(utils::SockAddr(hname.c_str(), hport))) {
-        num_error += 1; r.sock.Close(); continue;
-      }
-      Assert(r.sock.SendAll(&rank, sizeof(rank)) == sizeof(rank),
-             "ReConnectLink failure 12");
-      Assert(r.sock.RecvAll(&r.rank, sizeof(r.rank)) == sizeof(r.rank),
-             "ReConnectLink failure 13");
-      utils::Check(hrank == r.rank,
-                   "ReConnectLink failure, link rank inconsistent");
-      bool match = false;
-      for (size_t i = 0; i < all_links.size(); ++i) {
-        if (all_links[i].rank == hrank) {
-          Assert(all_links[i].sock.IsClosed(),
-                 "Override a link that is active");
-          all_links[i].sock = r.sock; match = true; break;
-        }
-      }
-      if (!match) all_links.push_back(r);
-    }
-    Assert(tracker.SendAll(&num_error, sizeof(num_error)) == sizeof(num_error),
-           "ReConnectLink failure 14");
-  } while (num_error != 0);
-  // send back socket listening port to tracker
-  Assert(tracker.SendAll(&port, sizeof(port)) == sizeof(port),
-         "ReConnectLink failure 14");
-  // close connection to tracker
-  tracker.Close();
-  // listen to incoming links
-  for (int i = 0; i < num_accept; ++i) {
-    LinkRecord r;
-    r.sock = sock_listen.Accept();
-    Assert(r.sock.SendAll(&rank, sizeof(rank)) == sizeof(rank),
-           "ReConnectLink failure 15");
-    Assert(r.sock.RecvAll(&r.rank, sizeof(r.rank)) == sizeof(r.rank),
-           "ReConnectLink failure 15");
-    bool match = false;
-    for (size_t i = 0; i < all_links.size(); ++i) {
-      if (all_links[i].rank == r.rank) {
-        utils::Assert(all_links[i].sock.IsClosed(),
-                      "Override a link that is active");
-        all_links[i].sock = r.sock; match = true; break;
-      }
-    }
-    if (!match) all_links.push_back(r);
-  }
-  // close listening sockets
-  sock_listen.Close();
-  this->parent_index = -1;
-  // setup tree links and ring structure
-  tree_links.plinks.clear();
-  for (size_t i = 0; i < all_links.size(); ++i) {
-    utils::Assert(!all_links[i].sock.BadSocket(), "ReConnectLink: bad socket");
-    // set the socket to non-blocking mode, enable TCP keepalive
-    all_links[i].sock.SetNonBlock(true);
-    all_links[i].sock.SetKeepAlive(true);
-    if (tree_neighbors.count(all_links[i].rank) != 0) {
-      if (all_links[i].rank == parent_rank) {
-        parent_index = static_cast<int>(tree_links.plinks.size());
-      }
-      tree_links.plinks.push_back(&all_links[i]);
-    }
-    if (all_links[i].rank == prev_rank) ring_prev = &all_links[i];
-    if (all_links[i].rank == next_rank) ring_next = &all_links[i];
-  }
-  Assert(parent_rank == -1 || parent_index != -1,
-         "cannot find parent in the link");
-  Assert(prev_rank == -1 || ring_prev != NULL,
-         "cannot find prev ring in the link");
-  Assert(next_rank == -1 || ring_next != NULL,
-         "cannot find next ring in the link");
-}
-/*!
- * \brief perform in-place allreduce, on sendrecvbuf, this function can fail, and will return the cause of failure
- *
- * NOTE on Allreduce:
- *    The kSuccess TryAllreduce does NOT mean every node have successfully finishes TryAllreduce.
- *    It only means the current node get the correct result of Allreduce.
- *    However, it means every node finishes LAST call(instead of this one) of Allreduce/Bcast
- *
- * \param sendrecvbuf_ buffer for both sending and recving data
- * \param type_nbytes the unit number of bytes the type have
- * \param count number of elements to be reduced
- * \param reducer reduce function
- * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
- * \sa ReturnType
- */
-AllreduceBase::ReturnType
-AllreduceBase::TryAllreduce(void *sendrecvbuf_,
-                            size_t type_nbytes,
-                            size_t count,
-                            ReduceFunction reducer) {
-  if (count > reduce_ring_mincount) {
-    return this->TryAllreduceRing(sendrecvbuf_, type_nbytes, count, reducer);
-  } else {
-    return this->TryAllreduceTree(sendrecvbuf_, type_nbytes, count, reducer);
-  }
-}
-/*!
- * \brief perform in-place allreduce, on sendrecvbuf,
- * this function implements tree-shape reduction
- *
- * \param sendrecvbuf_ buffer for both sending and recving data
- * \param type_nbytes the unit number of bytes the type have
- * \param count number of elements to be reduced
- * \param reducer reduce function
- * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
- * \sa ReturnType
- */
-AllreduceBase::ReturnType
-AllreduceBase::TryAllreduceTree(void *sendrecvbuf_,
-                                size_t type_nbytes,
-                                size_t count,
-                                ReduceFunction reducer) {
-  RefLinkVector &links = tree_links;
-  if (links.size() == 0 || count == 0) return kSuccess;
-  // total size of message
-  const size_t total_size = type_nbytes * count;
-  // number of links
-  const int nlink = static_cast<int>(links.size());
-  // send recv buffer
-  char *sendrecvbuf = reinterpret_cast<char*>(sendrecvbuf_);
-  // size of space that we already performs reduce in up pass
-  size_t size_up_reduce = 0;
-  // size of space that we have already passed to parent
-  size_t size_up_out = 0;
-  // size of message we received, and send in the down pass
-  size_t size_down_in = 0;
-  // initialize the link ring-buffer and pointer
-  for (int i = 0; i < nlink; ++i) {
-    if (i != parent_index) {
-      links[i].InitBuffer(type_nbytes, count, reduce_buffer_size);
-    }
-    links[i].ResetSize();
-  }
-  // if no childs, no need to reduce
-  if (nlink == static_cast<int>(parent_index != -1)) {
-    size_up_reduce = total_size;
-  }
-  // while we have not passed the messages out
-  while (true) {
-    // select helper
-    bool finished = true;
-    utils::SelectHelper selecter;
-    for (int i = 0; i < nlink; ++i) {
-      if (i == parent_index) {
-        if (size_down_in != total_size) {
-          selecter.WatchRead(links[i].sock);
-          // only watch for exception in live channels
-          selecter.WatchException(links[i].sock);
-          finished = false;
-        }
-        if (size_up_out != total_size && size_up_out < size_up_reduce) {
-          selecter.WatchWrite(links[i].sock);
-        }
-      } else {
-        if (links[i].size_read != total_size) {
-          selecter.WatchRead(links[i].sock);
-        }
-        // size_write <= size_read
-        if (links[i].size_write != total_size) {
-          if (links[i].size_write < size_down_in) {
-            selecter.WatchWrite(links[i].sock);
-          }
-          // only watch for exception in live channels
-          selecter.WatchException(links[i].sock);
-          finished = false;
-        }
-      }
-    }
-    // finish runing allreduce
-    if (finished) break;
-    // select must return
-    selecter.Select();
-    // exception handling
-    for (int i = 0; i < nlink; ++i) {
-      // recive OOB message from some link
-      if (selecter.CheckExcept(links[i].sock)) {
-        return ReportError(&links[i], kGetExcept);
-      }
-    }
-    // read data from childs
-    for (int i = 0; i < nlink; ++i) {
-      if (i != parent_index && selecter.CheckRead(links[i].sock)) {
-        ReturnType ret = links[i].ReadToRingBuffer(size_up_out, total_size);
-        if (ret != kSuccess) {
-          return ReportError(&links[i], ret);
-        }
-      }
-    }
-    // this node have childs, peform reduce
-    if (nlink > static_cast<int>(parent_index != -1)) {
-      size_t buffer_size = 0;
-      // do upstream reduce
-      size_t max_reduce = total_size;
-      for (int i = 0; i < nlink; ++i) {
-        if (i != parent_index) {
-          max_reduce = std::min(max_reduce, links[i].size_read);
-          utils::Assert(buffer_size == 0 || buffer_size == links[i].buffer_size,
-                        "buffer size inconsistent");
-          buffer_size = links[i].buffer_size;
-        }
-      }
-      utils::Assert(buffer_size != 0, "must assign buffer_size");
-      // round to type_n4bytes
-      max_reduce = (max_reduce / type_nbytes * type_nbytes);
-      // peform reduce, can be at most two rounds
-      while (size_up_reduce < max_reduce) {
-        // start position
-        size_t start = size_up_reduce % buffer_size;
-        // peform read till end of buffer
-        size_t nread = std::min(buffer_size - start,
-                                max_reduce - size_up_reduce);
-        utils::Assert(nread % type_nbytes == 0, "Allreduce: size check");
-        for (int i = 0; i < nlink; ++i) {
-          if (i != parent_index) {
-            reducer(links[i].buffer_head + start,
-                    sendrecvbuf + size_up_reduce,
-                    static_cast<int>(nread / type_nbytes),
-                    MPI::Datatype(type_nbytes));
-          }
-        }
-        size_up_reduce += nread;
-      }
-    }
-    if (parent_index != -1) {
-      // pass message up to parent, can pass data that are already been reduced
-      if (size_up_out < size_up_reduce) {
-        ssize_t len = links[parent_index].sock.
-            Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out);
-        if (len != -1) {
-          size_up_out += static_cast<size_t>(len);
-        } else {
-          ReturnType ret = Errno2Return();
-          if (ret != kSuccess) {
-            return ReportError(&links[parent_index], ret);
-          }
-        }
-      }
-      // read data from parent
-      if (selecter.CheckRead(links[parent_index].sock) &&
-          total_size > size_down_in) {
-        ssize_t len = links[parent_index].sock.
-            Recv(sendrecvbuf + size_down_in, total_size - size_down_in);
-        if (len == 0) {
-          links[parent_index].sock.Close();
-          return ReportError(&links[parent_index], kRecvZeroLen);
-        }
-        if (len != -1) {
-          size_down_in += static_cast<size_t>(len);
-          utils::Assert(size_down_in <= size_up_out,
-                        "Allreduce: boundary error");
-        } else {
-          ReturnType ret = Errno2Return();
-          if (ret != kSuccess) {
-            return ReportError(&links[parent_index], ret);
-          }
-        }
-      }
-    } else {
-      // this is root, can use reduce as most recent point
-      size_down_in = size_up_out = size_up_reduce;
-    }
-    // can pass message down to childs
-    for (int i = 0; i < nlink; ++i) {
-      if (i != parent_index && links[i].size_write < size_down_in) {
-        ReturnType ret = links[i].WriteFromArray(sendrecvbuf, size_down_in);
-        if (ret != kSuccess) {
-          return ReportError(&links[i], ret);
-        }
-      }
-    }
-  }
-  return kSuccess;
-}
-/*!
- * \brief broadcast data from root to all nodes, this function can fail,and will return the cause of failure
- * \param sendrecvbuf_ buffer for both sending and recving data
- * \param total_size the size of the data to be broadcasted
- * \param root the root worker id to broadcast the data
- * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
- * \sa ReturnType
- */
-AllreduceBase::ReturnType
-AllreduceBase::TryBroadcast(void *sendrecvbuf_, size_t total_size, int root) {
-  RefLinkVector &links = tree_links;
-  if (links.size() == 0 || total_size == 0) return kSuccess;
-  utils::Check(root < world_size,
-               "Broadcast: root should be smaller than world size");
-  // number of links
-  const int nlink = static_cast<int>(links.size());
-  // size of space already read from data
-  size_t size_in = 0;
-  // input link, -2 means unknown yet, -1 means this is root
-  int in_link = -2;
-
-  // initialize the link statistics
-  for (int i = 0; i < nlink; ++i) {
-    links[i].ResetSize();
-  }
-  // root have all the data
-  if (this->rank == root) {
-    size_in = total_size;
-    in_link = -1;
-  }
-  // while we have not passed the messages out
-  while (true) {
-    bool finished = true;
-    // select helper
-    utils::SelectHelper selecter;
-    for (int i = 0; i < nlink; ++i) {
-      if (in_link == -2) {
-        selecter.WatchRead(links[i].sock); finished = false;
-      }
-      if (i == in_link && links[i].size_read != total_size) {
-        selecter.WatchRead(links[i].sock); finished = false;
-      }
-      if (in_link != -2 && i != in_link && links[i].size_write != total_size) {
-        if (links[i].size_write < size_in) {
-          selecter.WatchWrite(links[i].sock);
-        }
-        finished = false;
-      }
-      selecter.WatchException(links[i].sock);
-    }
-    // finish running
-    if (finished) break;
-    // select
-    selecter.Select();
-    // exception handling
-    for (int i = 0; i < nlink; ++i) {
-      // recive OOB message from some link
-      if (selecter.CheckExcept(links[i].sock)) {
-        return ReportError(&links[i], kGetExcept);
-      }
-    }
-    if (in_link == -2) {
-      // probe in-link
-      for (int i = 0; i < nlink; ++i) {
-        if (selecter.CheckRead(links[i].sock)) {
-          ReturnType ret = links[i].ReadToArray(sendrecvbuf_, total_size);
-          if (ret != kSuccess) {
-            return ReportError(&links[i], ret);
-          }
-          size_in = links[i].size_read;
-          if (size_in != 0) {
-            in_link = i; break;
-          }
-        }
-      }
-    } else {
-      // read from in link
-      if (in_link >= 0 && selecter.CheckRead(links[in_link].sock)) {
-        ReturnType ret = links[in_link].ReadToArray(sendrecvbuf_, total_size);
-        if (ret != kSuccess) {
-          return ReportError(&links[in_link], ret);
-        }
-        size_in = links[in_link].size_read;
-      }
-    }
-    // send data to all out-link
-    for (int i = 0; i < nlink; ++i) {
-      if (i != in_link && links[i].size_write < size_in) {
-        ReturnType ret = links[i].WriteFromArray(sendrecvbuf_, size_in);
-        if (ret != kSuccess) {
-          return ReportError(&links[i], ret);
-        }
-      }
-    }
-  }
-  return kSuccess;
-}
-/*!
- * \brief internal Allgather function, each node have a segment of data in the ring of sendrecvbuf,
- *  the data provided by current node k is [slice_begin, slice_end),
- *  the next node's segment must start with slice_end
- *  after the call of Allgather, sendrecvbuf_ contains all the contents including all segments
- *  use a ring based algorithm
- *
- * \param sendrecvbuf_ buffer for both sending and receiving data, it is a ring conceptually
- * \param total_size total size of data to be gathered
- * \param slice_begin beginning of the current slice
- * \param slice_end end of the current slice
- * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
- */
-AllreduceBase::ReturnType
-AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size,
-                                size_t slice_begin,
-                                size_t slice_end,
-                                size_t size_prev_slice) {
-  // read from next link and send to prev one
-  LinkRecord &prev = *ring_prev, &next = *ring_next;
-  // need to reply on special rank structure
-  utils::Assert(next.rank == (rank + 1) % world_size &&
-                rank == (prev.rank + 1) % world_size,
-                "need to assume rank structure");
-  // send recv buffer
-  char *sendrecvbuf = reinterpret_cast<char*>(sendrecvbuf_);
-  const size_t stop_read = total_size + slice_begin;
-  const size_t stop_write = total_size + slice_begin - size_prev_slice;
-  size_t write_ptr = slice_begin;
-  size_t read_ptr = slice_end;
-
-  while (true) {
-    // select helper
-    bool finished = true;
-    utils::SelectHelper selecter;
-    if (read_ptr != stop_read) {
-      selecter.WatchRead(next.sock);
-      finished = false;
-    }
-    if (write_ptr != stop_write) {
-      if (write_ptr < read_ptr) {
-        selecter.WatchWrite(prev.sock);
-      }
-      finished  = false;
-    }
-    if (finished) break;
-    selecter.Select();
-    if (read_ptr != stop_read && selecter.CheckRead(next.sock)) {
-      size_t size = stop_read - read_ptr;
-      size_t start = read_ptr % total_size;
-      if (start + size > total_size) {
-        size = total_size - start;
-      }
-      ssize_t len = next.sock.Recv(sendrecvbuf + start, size);
-      if (len != -1) {
-        read_ptr += static_cast<size_t>(len);
-      } else {
-        ReturnType ret = Errno2Return();
-        if (ret != kSuccess) return ReportError(&next, ret);
-      }
-    }
-    if (write_ptr < read_ptr && write_ptr != stop_write) {
-      size_t size = std::min(read_ptr, stop_write) - write_ptr;
-      size_t start = write_ptr % total_size;
-      if (start + size > total_size) {
-        size = total_size - start;
-      }
-      ssize_t len = prev.sock.Send(sendrecvbuf + start, size);
-      if (len != -1) {
-        write_ptr += static_cast<size_t>(len);
-      } else {
-        ReturnType ret = Errno2Return();
-        if (ret != kSuccess) return ReportError(&prev, ret);
-      }
-    }
-  }
-  return kSuccess;
-}
-/*!
- * \brief perform in-place allreduce, on sendrecvbuf, this function can fail,
- *  and will return the cause of failure
- *
- *  Ring-based algorithm
- *
- * \param sendrecvbuf_ buffer for both sending and recving data
- * \param type_nbytes the unit number of bytes the type have
- * \param count number of elements to be reduced
- * \param reducer reduce function
- * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
- * \sa ReturnType, TryAllreduce
- */
-AllreduceBase::ReturnType
-AllreduceBase::TryReduceScatterRing(void *sendrecvbuf_,
-                                    size_t type_nbytes,
-                                    size_t count,
-                                    ReduceFunction reducer) {
-  // read from next link and send to prev one
-  LinkRecord &prev = *ring_prev, &next = *ring_next;
-  // need to reply on special rank structure
-  utils::Assert(next.rank == (rank + 1) % world_size &&
-                rank == (prev.rank + 1) % world_size,
-                "need to assume rank structure");
-  // total size of message
-  const size_t total_size = type_nbytes * count;
-  size_t n = static_cast<size_t>(world_size);
-  size_t step = (count + n - 1) / n;
-  size_t r = static_cast<size_t>(next.rank);
-  size_t write_ptr = std::min(r * step, count) * type_nbytes;
-  size_t read_ptr = std::min((r + 1) * step, count) * type_nbytes;
-  size_t reduce_ptr = read_ptr;
-  // send recv buffer
-  char *sendrecvbuf = reinterpret_cast<char*>(sendrecvbuf_);
-  // position to stop reading
-  const size_t stop_read = total_size + write_ptr;
-  // position to stop writing
-  size_t stop_write = total_size + std::min(rank * step, count) * type_nbytes;
-  if (stop_write > stop_read) {
-    stop_write -= total_size;
-    utils::Assert(write_ptr <= stop_write, "write ptr boundary check");
-  }
-  // use ring buffer in next position
-  next.InitBuffer(type_nbytes, step, reduce_buffer_size);
-  // set size_read to read pointer for ring buffer to work properly
-  next.size_read = read_ptr;
-
-  while (true) {
-    // select helper
-    bool finished = true;
-    utils::SelectHelper selecter;
-    if (read_ptr != stop_read) {
-      selecter.WatchRead(next.sock);
-      finished = false;
-    }
-    if (write_ptr != stop_write) {
-      if (write_ptr < reduce_ptr) {
-        selecter.WatchWrite(prev.sock);
-      }
-      finished = false;
-    }
-    if (finished) break;
-    selecter.Select();
-    if (read_ptr != stop_read && selecter.CheckRead(next.sock)) {
-      ReturnType ret = next.ReadToRingBuffer(reduce_ptr, stop_read);
-      if (ret != kSuccess) {
-        return ReportError(&next, ret);
-      }
-      // sync the rate
-      read_ptr = next.size_read;
-      utils::Assert(read_ptr <= stop_read, "[%d] read_ptr boundary check", rank);
-      const size_t buffer_size = next.buffer_size;
-      size_t max_reduce = (read_ptr  / type_nbytes) * type_nbytes;
-      while (reduce_ptr < max_reduce) {
-        size_t bstart = reduce_ptr % buffer_size;
-        size_t nread = std::min(buffer_size - bstart,
-                                max_reduce - reduce_ptr);
-        size_t rstart = reduce_ptr % total_size;
-        nread = std::min(nread, total_size - rstart);
-        reducer(next.buffer_head + bstart,
-                sendrecvbuf + rstart,
-                static_cast<int>(nread / type_nbytes),
-                MPI::Datatype(type_nbytes));
-        reduce_ptr += nread;
-      }
-    }
-    if (write_ptr < reduce_ptr && write_ptr != stop_write) {
-      size_t size = std::min(reduce_ptr, stop_write) - write_ptr;
-      size_t start = write_ptr % total_size;
-      if (start + size > total_size) {
-        size = total_size - start;
-      }
-      ssize_t len = prev.sock.Send(sendrecvbuf + start, size);
-      if (len != -1) {
-        write_ptr += static_cast<size_t>(len);
-      } else {
-        ReturnType ret = Errno2Return();
-        if (ret != kSuccess) return ReportError(&prev, ret);
-      }
-    }
-  }
-  return kSuccess;
-}
-/*!
- * \brief perform in-place allreduce, on sendrecvbuf
- *  use a ring based algorithm
- *
- * \param sendrecvbuf_ buffer for both sending and recving data
- * \param type_nbytes the unit number of bytes the type have
- * \param count number of elements to be reduced
- * \param reducer reduce function
- * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
- * \sa ReturnType
- */
-AllreduceBase::ReturnType
-AllreduceBase::TryAllreduceRing(void *sendrecvbuf_,
-                                size_t type_nbytes,
-                                size_t count,
-                                ReduceFunction reducer) {
-  ReturnType ret = TryReduceScatterRing(sendrecvbuf_, type_nbytes, count, reducer);
-  if (ret != kSuccess) return ret;
-  size_t n = static_cast<size_t>(world_size);
-  size_t step = (count + n - 1) / n;
-  size_t begin = std::min(rank * step, count) * type_nbytes;
-  size_t end = std::min((rank + 1) * step, count) * type_nbytes;
-  // previous rank
-  int prank = ring_prev->rank;
-  // get rank of previous
-  return TryAllgatherRing
-      (sendrecvbuf_, type_nbytes * count,
-       begin, end,
-       (std::min((prank + 1) * step, count) -
-        std::min(prank * step, count)) * type_nbytes);
-}
-}  // namespace engine
-}  // namespace rabit
diff --git a/subtree/rabit/src/allreduce_base.h b/subtree/rabit/src/allreduce_base.h
deleted file mode 100644
index 63acd75d5..000000000
--- a/subtree/rabit/src/allreduce_base.h
+++ /dev/null
@@ -1,527 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file allreduce_base.h
- * \brief Basic implementation of AllReduce
- *   using TCP non-block socket and tree-shape reduction.
- *
- *   This implementation provides basic utility of AllReduce and Broadcast
- *   without considering node failure
- *
- * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
- */
-#ifndef RABIT_ALLREDUCE_BASE_H_
-#define RABIT_ALLREDUCE_BASE_H_
-
-#include <vector>
-#include <string>
-#include <algorithm>
-#include "../include/rabit/utils.h"
-#include "../include/rabit/engine.h"
-#include "./socket.h"
-
-namespace MPI {
-// MPI data type to be compatible with existing MPI interface
-class Datatype {
- public:
-  size_t type_size;
-  explicit Datatype(size_t type_size) : type_size(type_size) {}
-};
-}
-namespace rabit {
-namespace engine {
-/*! \brief implementation of basic Allreduce engine */
-class AllreduceBase : public IEngine {
- public:
-  // magic number to verify server
-  static const int kMagic = 0xff99;
-  // constant one byte out of band message to indicate error happening
-  AllreduceBase(void);
-  virtual ~AllreduceBase(void) {}
-  // initialize the manager
-  virtual void Init(void);
-  // shutdown the engine
-  virtual void Shutdown(void);
-  /*!
-   * \brief set parameters to the engine
-   * \param name parameter name
-   * \param val parameter value
-   */
-  virtual void SetParam(const char *name, const char *val);
-  /*!
-   * \brief print the msg in the tracker,
-   *    this function can be used to communicate the information of the progress to
-   *    the user who monitors the tracker
-   * \param msg message to be printed in the tracker
-   */
-  virtual void TrackerPrint(const std::string &msg);
-  /*! \brief get rank */
-  virtual int GetRank(void) const {
-    return rank;
-  }
-  /*! \brief get rank */
-  virtual int GetWorldSize(void) const {
-    if (world_size == -1) return 1;
-    return world_size;
-  }
-  /*! \brief whether is distributed or not */
-  virtual bool IsDistributed(void) const {
-    return tracker_uri != "NULL";
-  }
-  /*! \brief get rank */
-  virtual std::string GetHost(void) const {
-    return host_uri;
-  }
-  /*!
-   * \brief perform in-place allreduce, on sendrecvbuf
-   *        this function is NOT thread-safe
-   * \param sendrecvbuf_ buffer for both sending and recving data
-   * \param type_nbytes the unit number of bytes the type have
-   * \param count number of elements to be reduced
-   * \param reducer reduce function
-   * \param prepare_func Lazy preprocessing function, lazy prepare_fun(prepare_arg)
-   *                     will be called by the function before performing Allreduce, to intialize the data in sendrecvbuf_.
-   *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
-   * \param prepare_arg argument used to passed into the lazy preprocessing function
-   */
-  virtual void Allreduce(void *sendrecvbuf_,
-                         size_t type_nbytes,
-                         size_t count,
-                         ReduceFunction reducer,
-                         PreprocFunction prepare_fun = NULL,
-                         void *prepare_arg = NULL) {
-    if (prepare_fun != NULL) prepare_fun(prepare_arg);
-    if (world_size == 1) return;
-    utils::Assert(TryAllreduce(sendrecvbuf_,
-                               type_nbytes, count, reducer) == kSuccess,
-                  "Allreduce failed");
-  }
-  /*!
-   * \brief broadcast data from root to all nodes
-   * \param sendrecvbuf_ buffer for both sending and recving data
-   * \param size the size of the data to be broadcasted
-   * \param root the root worker id to broadcast the data
-   */
-  virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root) {
-    if (world_size == 1) return;
-    utils::Assert(TryBroadcast(sendrecvbuf_, total_size, root) == kSuccess,
-                  "Broadcast failed");
-  }
-  /*!
-   * \brief load latest check point
-   * \param global_model pointer to the globally shared model/state
-   *   when calling this function, the caller need to gauranttees that global_model
-   *   is the same in all nodes
-   * \param local_model pointer to local model, that is specific to current node/rank
-   *   this can be NULL when no local model is needed
-   *
-   * \return the version number of check point loaded
-   *     if returned version == 0, this means no model has been CheckPointed
-   *     the p_model is not touched, user should do necessary initialization by themselves
-   *
-   *   Common usage example:
-   *      int iter = rabit::LoadCheckPoint(&model);
-   *      if (iter == 0) model.InitParameters();
-   *      for (i = iter; i < max_iter; ++i) {
-   *        do many things, include allreduce
-   *        rabit::CheckPoint(model);
-   *      }
-   *
-   * \sa CheckPoint, VersionNumber
-   */
-  virtual int LoadCheckPoint(Serializable *global_model,
-                             Serializable *local_model = NULL) {
-    return 0;
-  }
-  /*!
-   * \brief checkpoint the model, meaning we finished a stage of execution
-   *  every time we call check point, there is a version number which will increase by one
-   *
-   * \param global_model pointer to the globally shared model/state
-   *   when calling this function, the caller need to gauranttees that global_model
-   *   is the same in all nodes
-   * \param local_model pointer to local model, that is specific to current node/rank
-   *   this can be NULL when no local state is needed
-   *
-   * NOTE: local_model requires explicit replication of the model for fault-tolerance, which will
-   *       bring replication cost in CheckPoint function. global_model do not need explicit replication.
-   *       So only CheckPoint with global_model if possible
-   *
-   * \sa LoadCheckPoint, VersionNumber
-   */
-  virtual void CheckPoint(const Serializable *global_model,
-                          const Serializable *local_model = NULL) {
-    version_number += 1;
-  }
-  /*!
-   * \brief This function can be used to replace CheckPoint for global_model only,
-   *   when certain condition is met(see detailed expplaination).
-   *
-   *   This is a "lazy" checkpoint such that only the pointer to global_model is
-   *   remembered and no memory copy is taken. To use this function, the user MUST ensure that:
-   *   The global_model must remain unchanged util last call of Allreduce/Broadcast in current version finishs.
-   *   In another words, global_model model can be changed only between last call of
-   *   Allreduce/Broadcast and LazyCheckPoint in current version
-   *
-   *   For example, suppose the calling sequence is:
-   *   LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint
-   *
-   *   If user can only changes global_model in code3, then LazyCheckPoint can be used to
-   *   improve efficiency of the program.
-   * \param global_model pointer to the globally shared model/state
-   *   when calling this function, the caller need to gauranttees that global_model
-   *   is the same in all nodes
-   * \sa LoadCheckPoint, CheckPoint, VersionNumber
-   */
-  virtual void LazyCheckPoint(const Serializable *global_model) {
-    version_number += 1;
-  }
-  /*!
-   * \return version number of current stored model,
-   *         which means how many calls to CheckPoint we made so far
-   * \sa LoadCheckPoint, CheckPoint
-   */
-  virtual int VersionNumber(void) const {
-    return version_number;
-  }
-  /*!
-   * \brief explicitly re-init everything before calling LoadCheckPoint
-   *    call this function when IEngine throw an exception out,
-   *    this function is only used for test purpose
-   */
-  virtual void InitAfterException(void) {
-    utils::Error("InitAfterException: not implemented");
-  }
-  /*!
-   * \brief report current status to the job tracker
-   * depending on the job tracker we are in
-   */
-  inline void ReportStatus(void) const {
-    if (hadoop_mode != 0) {
-      fprintf(stderr, "reporter:status:Rabit Phase[%03d] Operation %03d\n",
-              version_number, seq_counter);
-    }
-  }
-
- protected:
-  /*! \brief enumeration of possible returning results from Try functions */
-  enum ReturnTypeEnum {
-    /*! \brief execution is successful */
-    kSuccess,
-    /*! \brief a link was reset by peer */
-    kConnReset,
-    /*! \brief received a zero length message */
-    kRecvZeroLen,
-    /*! \brief a neighbor node go down, the connection is dropped */
-    kSockError,
-    /*!
-     * \brief another node which is not my neighbor go down,
-     *   get Out-of-Band exception notification from my neighbor
-     */
-    kGetExcept
-  };
-  /*! \brief struct return type to avoid implicit conversion to int/bool */
-  struct ReturnType {
-    /*! \brief internal return type */
-    ReturnTypeEnum value;
-    // constructor
-    ReturnType() {}
-    ReturnType(ReturnTypeEnum value) : value(value) {}  // NOLINT(*)
-    inline bool operator==(const ReturnTypeEnum &v) const {
-      return value == v;
-    }
-    inline bool operator!=(const ReturnTypeEnum &v) const {
-      return value != v;
-    }
-  };
-  /*! \brief translate errno to return type */
-  inline static ReturnType Errno2Return() {
-    int errsv = utils::Socket::GetLastError();
-    if (errsv == EAGAIN || errsv == EWOULDBLOCK || errsv == 0) return kSuccess;
-#ifdef _WIN32
-    if (errsv == WSAEWOULDBLOCK) return kSuccess;
-    if (errsv == WSAECONNRESET) return kConnReset;
-#endif
-    if (errsv == ECONNRESET) return kConnReset;
-    return kSockError;
-  }
-  // link record to a neighbor
-  struct LinkRecord {
-   public:
-    // socket to get data from/to link
-    utils::TCPSocket sock;
-    // rank of the node in this link
-    int rank;
-    // size of data readed from link
-    size_t size_read;
-    // size of data sent to the link
-    size_t size_write;
-    // pointer to buffer head
-    char *buffer_head;
-    // buffer size, in bytes
-    size_t buffer_size;
-    // constructor
-    LinkRecord(void)
-        : buffer_head(NULL), buffer_size(0) {
-    }
-    // initialize buffer
-    inline void InitBuffer(size_t type_nbytes, size_t count,
-                           size_t reduce_buffer_size) {
-      size_t n = (type_nbytes * count + 7)/ 8;
-      buffer_.resize(std::min(reduce_buffer_size, n));
-      // make sure align to type_nbytes
-      buffer_size =
-          buffer_.size() * sizeof(uint64_t) / type_nbytes * type_nbytes;
-      utils::Assert(type_nbytes <= buffer_size,
-                    "too large type_nbytes=%lu, buffer_size=%lu",
-                    type_nbytes, buffer_size);
-      // set buffer head
-      buffer_head = reinterpret_cast<char*>(BeginPtr(buffer_));
-    }
-    // reset the recv and sent size
-    inline void ResetSize(void) {
-      size_write = size_read = 0;
-    }
-    /*!
-     * \brief read data into ring-buffer, with care not to existing useful override data
-     *  position after protect_start
-     * \param protect_start all data start from protect_start is still needed in buffer
-     *                      read shall not override this
-     * \param max_size_read maximum logical amount we can read, size_read cannot exceed this value
-     * \return the type of reading
-     */
-    inline ReturnType ReadToRingBuffer(size_t protect_start, size_t max_size_read) {
-      utils::Assert(buffer_head != NULL, "ReadToRingBuffer: buffer not allocated");
-      utils::Assert(size_read <= max_size_read, "ReadToRingBuffer: max_size_read check");
-      size_t ngap = size_read - protect_start;
-      utils::Assert(ngap <= buffer_size, "Allreduce: boundary check");
-      size_t offset = size_read % buffer_size;
-      size_t nmax = max_size_read - size_read;
-      nmax = std::min(nmax, buffer_size - ngap);
-      nmax = std::min(nmax, buffer_size - offset);
-      if (nmax == 0) return kSuccess;
-      ssize_t len = sock.Recv(buffer_head + offset, nmax);
-      // length equals 0, remote disconnected
-      if (len == 0) {
-        sock.Close(); return kRecvZeroLen;
-      }
-      if (len == -1) return Errno2Return();
-      size_read += static_cast<size_t>(len);
-      return kSuccess;
-    }
-    /*!
-     * \brief read data into array,
-     * this function can not be used together with ReadToRingBuffer
-     * a link can either read into the ring buffer, or existing array
-     * \param max_size maximum size of array
-     * \return true if it is an successful read, false if there is some error happens, check errno
-     */
-    inline ReturnType ReadToArray(void *recvbuf_, size_t max_size) {
-      if (max_size == size_read) return kSuccess;
-      char *p = static_cast<char*>(recvbuf_);
-      ssize_t len = sock.Recv(p + size_read, max_size - size_read);
-      // length equals 0, remote disconnected
-      if (len == 0) {
-        sock.Close(); return kRecvZeroLen;
-      }
-      if (len == -1) return Errno2Return();
-      size_read += static_cast<size_t>(len);
-      return kSuccess;
-    }
-    /*!
-     * \brief write data in array to sock
-     * \param sendbuf_ head of array
-     * \param max_size maximum size of array
-     * \return true if it is an successful write, false if there is some error happens, check errno
-     */
-    inline ReturnType WriteFromArray(const void *sendbuf_, size_t max_size) {
-      const char *p = static_cast<const char*>(sendbuf_);
-      ssize_t len = sock.Send(p + size_write, max_size - size_write);
-      if (len == -1) return Errno2Return();
-      size_write += static_cast<size_t>(len);
-      return kSuccess;
-    }
-
-   private:
-    // recv buffer to get data from child
-    // aligned with 64 bits, will be able to perform 64 bits operations freely
-    std::vector<uint64_t> buffer_;
-  };
-  /*!
-   * \brief simple data structure that works like a vector
-   *  but takes reference instead of space
-   */
-  struct RefLinkVector {
-    std::vector<LinkRecord*> plinks;
-    inline LinkRecord &operator[](size_t i) {
-      return *plinks[i];
-    }
-    inline size_t size(void) const {
-      return plinks.size();
-    }
-  };
-  /*!
-   * \brief initialize connection to the tracker
-   * \return a socket that initializes the connection
-   */
-  utils::TCPSocket ConnectTracker(void) const;
-  /*!
-   * \brief connect to the tracker to fix the the missing links
-   *   this function is also used when the engine start up
-   * \param cmd possible command to sent to tracker
-   */
-  void ReConnectLinks(const char *cmd = "start");
-  /*!
-   * \brief perform in-place allreduce, on sendrecvbuf, this function can fail, and will return the cause of failure
-   *
-   * NOTE on Allreduce:
-   *    The kSuccess TryAllreduce does NOT mean every node have successfully finishes TryAllreduce.
-   *    It only means the current node get the correct result of Allreduce.
-   *    However, it means every node finishes LAST call(instead of this one) of Allreduce/Bcast
-   *
-   * \param sendrecvbuf_ buffer for both sending and recving data
-   * \param type_nbytes the unit number of bytes the type have
-   * \param count number of elements to be reduced
-   * \param reducer reduce function
-   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
-   * \sa ReturnType
-   */
-  ReturnType TryAllreduce(void *sendrecvbuf_,
-                          size_t type_nbytes,
-                          size_t count,
-                          ReduceFunction reducer);
-  /*!
-   * \brief broadcast data from root to all nodes, this function can fail,and will return the cause of failure
-   * \param sendrecvbuf_ buffer for both sending and receiving data
-   * \param size the size of the data to be broadcasted
-   * \param root the root worker id to broadcast the data
-   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
-   * \sa ReturnType
-   */
-  ReturnType TryBroadcast(void *sendrecvbuf_, size_t size, int root);
-  /*!
-   * \brief perform in-place allreduce, on sendrecvbuf,
-   * this function implements tree-shape reduction
-   *
-   * \param sendrecvbuf_ buffer for both sending and recving data
-   * \param type_nbytes the unit number of bytes the type have
-   * \param count number of elements to be reduced
-   * \param reducer reduce function
-   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
-   * \sa ReturnType
-   */
-  ReturnType TryAllreduceTree(void *sendrecvbuf_,
-                              size_t type_nbytes,
-                              size_t count,
-                              ReduceFunction reducer);
-  /*!
-   * \brief internal Allgather function, each node have a segment of data in the ring of sendrecvbuf,
-   *  the data provided by current node k is [slice_begin, slice_end),
-   *  the next node's segment must start with slice_end
-   *  after the call of Allgather, sendrecvbuf_ contains all the contents including all segments
-   *  use a ring based algorithm
-   *
-   * \param sendrecvbuf_ buffer for both sending and receiving data, it is a ring conceptually
-   * \param total_size total size of data to be gathered
-   * \param slice_begin beginning of the current slice
-   * \param slice_end end of the current slice
-   * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
-   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
-   * \sa ReturnType
-   */
-  ReturnType TryAllgatherRing(void *sendrecvbuf_, size_t total_size,
-                              size_t slice_begin, size_t slice_end,
-                              size_t size_prev_slice);
-  /*!
-   * \brief perform in-place allreduce, reduce on the sendrecvbuf,
-   *
-   *  after the function, node k get k-th segment of the reduction result
-   *  the k-th segment is defined by [k * step, min((k + 1) * step,count) )
-   *  where step = ceil(count / world_size)
-   *
-   * \param sendrecvbuf_ buffer for both sending and recving data
-   * \param type_nbytes the unit number of bytes the type have
-   * \param count number of elements to be reduced
-   * \param reducer reduce function
-   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
-   * \sa ReturnType, TryAllreduce
-   */
-  ReturnType TryReduceScatterRing(void *sendrecvbuf_,
-                                  size_t type_nbytes,
-                                  size_t count,
-                                  ReduceFunction reducer);
-  /*!
-   * \brief perform in-place allreduce, on sendrecvbuf
-   *  use a ring based algorithm, reduce-scatter + allgather
-   *
-   * \param sendrecvbuf_ buffer for both sending and recving data
-   * \param type_nbytes the unit number of bytes the type have
-   * \param count number of elements to be reduced
-   * \param reducer reduce function
-   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
-   * \sa ReturnType
-   */
-  ReturnType TryAllreduceRing(void *sendrecvbuf_,
-                              size_t type_nbytes,
-                              size_t count,
-                              ReduceFunction reducer);
-  /*!
-   * \brief function used to report error when a link goes wrong
-   * \param link the pointer to the link who causes the error
-   * \param err the error type
-   */
-  inline ReturnType ReportError(LinkRecord *link, ReturnType err) {
-    err_link = link; return err;
-  }
-  //---- data structure related to model ----
-  // call sequence counter, records how many calls we made so far
-  // from last call to CheckPoint, LoadCheckPoint
-  int seq_counter;
-  // version number of model
-  int version_number;
-  // whether the job is running in hadoop
-  int hadoop_mode;
-  //---- local data related to link ----
-  // index of parent link, can be -1, meaning this is root of the tree
-  int parent_index;
-  // rank of parent node, can be -1
-  int parent_rank;
-  // sockets of all links this connects to
-  std::vector<LinkRecord> all_links;
-  // used to record the link where things goes wrong
-  LinkRecord *err_link;
-  // all the links in the reduction tree connection
-  RefLinkVector tree_links;
-  // pointer to links in the ring
-  LinkRecord *ring_prev, *ring_next;
-  //----- meta information-----
-  // list of enviroment variables that are of possible interest
-  std::vector<std::string> env_vars;
-  // unique identifier of the possible job this process is doing
-  // used to assign ranks, optional, default to NULL
-  std::string task_id;
-  // uri of current host, to be set by Init
-  std::string host_uri;
-  // uri of tracker
-  std::string tracker_uri;
-  // role in dmlc jobs
-  std::string dmlc_role;
-  // port of tracker address
-  int tracker_port;
-  // port of slave process
-  int slave_port, nport_trial;
-  // reduce buffer size
-  size_t reduce_buffer_size;
-  // reduction method
-  int reduce_method;
-  // mininum count of cells to use ring based method
-  size_t reduce_ring_mincount;
-  // current rank
-  int rank;
-  // world size
-  int world_size;
-  // connect retry time
-  int connect_retry;
-};
-}  // namespace engine
-}  // namespace rabit
-#endif  // RABIT_ALLREDUCE_BASE_H_
diff --git a/subtree/rabit/src/allreduce_mock.h b/subtree/rabit/src/allreduce_mock.h
deleted file mode 100644
index c3f9f4f1d..000000000
--- a/subtree/rabit/src/allreduce_mock.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/*!
- * Copyright by Contributors
- * \file allreduce_mock.h
- * \brief Mock test module of AllReduce engine,
- * insert failures in certain call point, to test if the engine is robust to failure
- *
- * \author Ignacio Cano, Tianqi Chen
- */
-#ifndef RABIT_ALLREDUCE_MOCK_H_
-#define RABIT_ALLREDUCE_MOCK_H_
-#include <vector>
-#include <map>
-#include <sstream>
-#include "../include/rabit/engine.h"
-#include "../include/rabit/timer.h"
-#include "./allreduce_robust.h"
-
-namespace rabit {
-namespace engine {
-class AllreduceMock : public AllreduceRobust {
- public:
-  // constructor
-  AllreduceMock(void) {
-    num_trial = 0;
-    force_local = 0;
-    report_stats = 0;
-    tsum_allreduce = 0.0;
-  }
-  // destructor
-  virtual ~AllreduceMock(void) {}
-  virtual void SetParam(const char *name, const char *val) {
-    AllreduceRobust::SetParam(name, val);
-    // additional parameters
-    if (!strcmp(name, "rabit_num_trial")) num_trial = atoi(val);
-    if (!strcmp(name, "DMLC_NUM_ATTEMPT")) num_trial = atoi(val);
-    if (!strcmp(name, "report_stats")) report_stats = atoi(val);
-    if (!strcmp(name, "force_local")) force_local = atoi(val);
-    if (!strcmp(name, "mock")) {
-      MockKey k;
-      utils::Check(sscanf(val, "%d,%d,%d,%d",
-                          &k.rank, &k.version, &k.seqno, &k.ntrial) == 4,
-                   "invalid mock parameter");
-      mock_map[k] = 1;
-    }
-  }
-  virtual void Allreduce(void *sendrecvbuf_,
-                         size_t type_nbytes,
-                         size_t count,
-                         ReduceFunction reducer,
-                         PreprocFunction prepare_fun,
-                         void *prepare_arg) {
-    this->Verify(MockKey(rank, version_number, seq_counter, num_trial), "AllReduce");
-    double tstart = utils::GetTime();
-    AllreduceRobust::Allreduce(sendrecvbuf_, type_nbytes,
-                               count, reducer, prepare_fun, prepare_arg);
-    tsum_allreduce += utils::GetTime() - tstart;
-  }
-  virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root) {
-    this->Verify(MockKey(rank, version_number, seq_counter, num_trial), "Broadcast");
-    AllreduceRobust::Broadcast(sendrecvbuf_, total_size, root);
-  }
-  virtual int LoadCheckPoint(Serializable *global_model,
-                             Serializable *local_model) {
-    tsum_allreduce = 0.0;
-    time_checkpoint = utils::GetTime();
-    if (force_local == 0) {
-      return AllreduceRobust::LoadCheckPoint(global_model, local_model);
-    } else {
-      DummySerializer dum;
-      ComboSerializer com(global_model, local_model);
-      return AllreduceRobust::LoadCheckPoint(&dum, &com);
-    }
-  }
-  virtual void CheckPoint(const Serializable *global_model,
-                          const Serializable *local_model) {
-    this->Verify(MockKey(rank, version_number, seq_counter, num_trial), "CheckPoint");
-    double tstart = utils::GetTime();
-    double tbet_chkpt = tstart - time_checkpoint;
-    if (force_local == 0) {
-      AllreduceRobust::CheckPoint(global_model, local_model);
-    } else {
-      DummySerializer dum;
-      ComboSerializer com(global_model, local_model);
-      AllreduceRobust::CheckPoint(&dum, &com);
-    }
-    time_checkpoint = utils::GetTime();
-    double tcost = utils::GetTime() - tstart;
-    if (report_stats != 0 && rank == 0) {
-      std::stringstream ss;
-      ss << "[v" << version_number << "] global_size=" << global_checkpoint.length()
-         << ",local_size=" << (local_chkpt[0].length() + local_chkpt[1].length())
-         << ",check_tcost="<< tcost <<" sec"
-         << ",allreduce_tcost=" << tsum_allreduce << " sec"
-         << ",between_chpt=" << tbet_chkpt << "sec\n";
-      this->TrackerPrint(ss.str());
-    }
-    tsum_allreduce = 0.0;
-  }
-
-  virtual void LazyCheckPoint(const Serializable *global_model) {
-    this->Verify(MockKey(rank, version_number, seq_counter, num_trial), "LazyCheckPoint");
-    AllreduceRobust::LazyCheckPoint(global_model);
-  }
-
- protected:
-  // force checkpoint to local
-  int force_local;
-  // whether report statistics
-  int report_stats;
-  // sum of allreduce
-  double tsum_allreduce;
-  double time_checkpoint;
-
- private:
-  struct DummySerializer : public Serializable {
-    virtual void Load(Stream *fi) {
-    }
-    virtual void Save(Stream *fo) const {
-    }
-  };
-  struct ComboSerializer : public Serializable {
-    Serializable *lhs;
-    Serializable *rhs;
-    const Serializable *c_lhs;
-    const Serializable *c_rhs;
-    ComboSerializer(Serializable *lhs, Serializable *rhs)
-        : lhs(lhs), rhs(rhs), c_lhs(lhs), c_rhs(rhs) {
-    }
-    ComboSerializer(const Serializable *lhs, const Serializable *rhs)
-        : lhs(NULL), rhs(NULL), c_lhs(lhs), c_rhs(rhs) {
-    }
-    virtual void Load(Stream *fi) {
-      if (lhs != NULL) lhs->Load(fi);
-      if (rhs != NULL) rhs->Load(fi);
-    }
-    virtual void Save(Stream *fo) const {
-      if (c_lhs != NULL) c_lhs->Save(fo);
-      if (c_rhs != NULL) c_rhs->Save(fo);
-    }
-  };
-  // key to identify the mock stage
-  struct MockKey {
-    int rank;
-    int version;
-    int seqno;
-    int ntrial;
-    MockKey(void) {}
-    MockKey(int rank, int version, int seqno, int ntrial)
-        : rank(rank), version(version), seqno(seqno), ntrial(ntrial) {}
-    inline bool operator==(const MockKey &b) const {
-      return rank == b.rank &&
-          version == b.version &&
-          seqno == b.seqno &&
-          ntrial == b.ntrial;
-    }
-    inline bool operator<(const MockKey &b) const {
-      if (rank != b.rank) return rank < b.rank;
-      if (version != b.version) return version < b.version;
-      if (seqno != b.seqno) return seqno < b.seqno;
-      return ntrial < b.ntrial;
-    }
-  };
-  // number of failure trials
-  int num_trial;
-  // record all mock actions
-  std::map<MockKey, int> mock_map;
-  // used to generate all kinds of exceptions
-  inline void Verify(const MockKey &key, const char *name) {
-    if (mock_map.count(key) != 0) {
-      num_trial += 1;
-      fprintf(stderr, "[%d]@@@Hit Mock Error:%s\n", rank, name);
-      exit(-2);
-    }
-  }
-};
-}  // namespace engine
-}  // namespace rabit
-#endif  // RABIT_ALLREDUCE_MOCK_H_
diff --git a/subtree/rabit/src/allreduce_robust-inl.h b/subtree/rabit/src/allreduce_robust-inl.h
deleted file mode 100644
index d3cbc0033..000000000
--- a/subtree/rabit/src/allreduce_robust-inl.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file allreduce_robust-inl.h
- * \brief implementation of inline template function in AllreduceRobust
- *
- * \author Tianqi Chen
- */
-#ifndef RABIT_ALLREDUCE_ROBUST_INL_H_
-#define RABIT_ALLREDUCE_ROBUST_INL_H_
-#include <vector>
-
-namespace rabit {
-namespace engine {
-/*!
- * \brief run message passing algorithm on the allreduce tree
- *        the result is edge message stored in p_edge_in and p_edge_out
- * \param node_value the value associated with current node
- * \param p_edge_in used to store input message from each of the edge
- * \param p_edge_out used to store output message from each of the edge
- * \param func a function that defines the message passing rule
- *        Parameters of func:
- *           - node_value same as node_value in the main function
- *           - edge_in the array of input messages from each edge,
- *                     this includes the output edge, which should be excluded
- *           - out_index array the index of output edge, the function should
- *                       exclude the output edge when compute the message passing value
- *        Return of func:
- *           the function returns the output message based on the input message and node_value
- *
- * \tparam EdgeType type of edge message, must be simple struct
- * \tparam NodeType type of node value
- */
-template<typename NodeType, typename EdgeType>
-inline AllreduceRobust::ReturnType
-AllreduceRobust::MsgPassing(const NodeType &node_value,
-                            std::vector<EdgeType> *p_edge_in,
-                            std::vector<EdgeType> *p_edge_out,
-                            EdgeType(*func)
-                            (const NodeType &node_value,
-                             const std::vector<EdgeType> &edge_in,
-                             size_t out_index)) {
-  RefLinkVector &links = tree_links;
-  if (links.size() == 0) return kSuccess;
-  // number of links
-  const int nlink = static_cast<int>(links.size());
-  // initialize the pointers
-  for (int i = 0; i < nlink; ++i) {
-    links[i].ResetSize();
-  }
-  std::vector<EdgeType> &edge_in = *p_edge_in;
-  std::vector<EdgeType> &edge_out = *p_edge_out;
-  edge_in.resize(nlink);
-  edge_out.resize(nlink);
-  // stages in the process
-  // 0: recv messages from childs
-  // 1: send message to parent
-  // 2: recv message from parent
-  // 3: send message to childs
-  int stage = 0;
-  // if no childs, no need to, directly start passing message
-  if (nlink == static_cast<int>(parent_index != -1)) {
-    utils::Assert(parent_index == 0, "parent must be 0");
-    edge_out[parent_index] = func(node_value, edge_in, parent_index);
-    stage = 1;
-  }
-  // while we have not passed the messages out
-  while (true) {
-    // for node with no parent, directly do stage 3
-    if (parent_index == -1) {
-      utils::Assert(stage != 2 && stage != 1, "invalie stage id");
-    }
-    // select helper
-    utils::SelectHelper selecter;
-    bool done = (stage == 3);
-    for (int i = 0; i < nlink; ++i) {
-      selecter.WatchException(links[i].sock);
-      switch (stage) {
-        case 0:
-          if (i != parent_index && links[i].size_read != sizeof(EdgeType)) {
-            selecter.WatchRead(links[i].sock);
-          }
-          break;
-        case 1:
-          if (i == parent_index) {
-            selecter.WatchWrite(links[i].sock);
-          }
-          break;
-        case 2:
-          if (i == parent_index) {
-            selecter.WatchRead(links[i].sock);
-          }
-          break;
-        case 3:
-          if (i != parent_index && links[i].size_write != sizeof(EdgeType)) {
-            selecter.WatchWrite(links[i].sock);
-            done = false;
-          }
-          break;
-        default: utils::Error("invalid stage");
-      }
-    }
-    // finish all the stages, and write out message
-    if (done) break;
-    selecter.Select();
-    // exception handling
-    for (int i = 0; i < nlink; ++i) {
-      // recive OOB message from some link
-      if (selecter.CheckExcept(links[i].sock)) {
-        return ReportError(&links[i], kGetExcept);
-      }
-    }
-    if (stage == 0) {
-      bool finished = true;
-      // read data from childs
-      for (int i = 0; i < nlink; ++i) {
-        if (i != parent_index) {
-          if (selecter.CheckRead(links[i].sock)) {
-            ReturnType ret = links[i].ReadToArray(&edge_in[i], sizeof(EdgeType));
-            if (ret != kSuccess) return ReportError(&links[i], ret);
-          }
-          if (links[i].size_read != sizeof(EdgeType)) finished = false;
-        }
-      }
-      // if no parent, jump to stage 3, otherwise do stage 1
-      if (finished) {
-        if (parent_index != -1) {
-          edge_out[parent_index] = func(node_value, edge_in, parent_index);
-          stage = 1;
-        } else {
-          for (int i = 0; i < nlink; ++i) {
-            edge_out[i] = func(node_value, edge_in, i);
-          }
-          stage = 3;
-        }
-      }
-    }
-    if (stage == 1) {
-      const int pid = this->parent_index;
-      utils::Assert(pid != -1, "MsgPassing invalid stage");
-      ReturnType ret = links[pid].WriteFromArray(&edge_out[pid], sizeof(EdgeType));
-      if (ret != kSuccess) return ReportError(&links[pid], ret);
-      if (links[pid].size_write == sizeof(EdgeType)) stage = 2;
-    }
-    if (stage == 2) {
-      const int pid = this->parent_index;
-      utils::Assert(pid != -1, "MsgPassing invalid stage");
-      ReturnType ret = links[pid].ReadToArray(&edge_in[pid], sizeof(EdgeType));
-      if (ret != kSuccess) return ReportError(&links[pid], ret);
-      if (links[pid].size_read == sizeof(EdgeType)) {
-        for (int i = 0; i < nlink; ++i) {
-          if (i != pid) edge_out[i] = func(node_value, edge_in, i);
-        }
-        stage = 3;
-      }
-    }
-    if (stage == 3) {
-      for (int i = 0; i < nlink; ++i) {
-        if (i != parent_index && links[i].size_write != sizeof(EdgeType)) {
-          ReturnType ret = links[i].WriteFromArray(&edge_out[i], sizeof(EdgeType));
-          if (ret != kSuccess) return ReportError(&links[i], ret);
-        }
-      }
-    }
-  }
-  return kSuccess;
-}
-}  // namespace engine
-}  // namespace rabit
-#endif  // RABIT_ALLREDUCE_ROBUST_INL_H_
diff --git a/subtree/rabit/src/allreduce_robust.cc b/subtree/rabit/src/allreduce_robust.cc
deleted file mode 100644
index 175751842..000000000
--- a/subtree/rabit/src/allreduce_robust.cc
+++ /dev/null
@@ -1,1183 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file allreduce_robust.cc
- * \brief Robust implementation of Allreduce
- *
- * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
- */
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <limits>
-#include <utility>
-#include "../include/rabit/io.h"
-#include "../include/rabit/utils.h"
-#include "../include/rabit/engine.h"
-#include "../include/rabit/rabit-inl.h"
-#include "./allreduce_robust.h"
-
-namespace rabit {
-namespace engine {
-AllreduceRobust::AllreduceRobust(void) {
-  num_local_replica = 0;
-  num_global_replica = 5;
-  default_local_replica = 2;
-  seq_counter = 0;
-  local_chkpt_version = 0;
-  result_buffer_round = 1;
-  global_lazycheck = NULL;
-  use_local_model = -1;
-  recover_counter = 0;
-  env_vars.push_back("rabit_global_replica");
-  env_vars.push_back("rabit_local_replica");
-}
-void AllreduceRobust::Init(void) {
-  AllreduceBase::Init();
-  result_buffer_round = std::max(world_size / num_global_replica, 1);
-}
-/*! \brief shutdown the engine */
-void AllreduceRobust::Shutdown(void) {
-  // need to sync the exec before we shutdown, do a pesudo check point
-  // execute checkpoint, note: when checkpoint existing, load will not happen
-  utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint, ActionSummary::kSpecialOp),
-                "Shutdown: check point must return true");
-  // reset result buffer
-  resbuf.Clear(); seq_counter = 0;
-  // execute check ack step, load happens here
-  utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp),
-                "Shutdown: check ack must return true");
-  AllreduceBase::Shutdown();
-}
-/*!
- * \brief set parameters to the engine
- * \param name parameter name
- * \param val parameter value
- */
-void AllreduceRobust::SetParam(const char *name, const char *val) {
-  AllreduceBase::SetParam(name, val);
-  if (!strcmp(name, "rabit_global_replica")) num_global_replica = atoi(val);
-  if (!strcmp(name, "rabit_local_replica")) {
-    num_local_replica = atoi(val);
-  }
-}
-/*!
- * \brief perform in-place allreduce, on sendrecvbuf
- *        this function is NOT thread-safe
- * \param sendrecvbuf_ buffer for both sending and recving data
- * \param type_nbytes the unit number of bytes the type have
- * \param count number of elements to be reduced
- * \param reducer reduce function
- * \param prepare_func Lazy preprocessing function, lazy prepare_fun(prepare_arg)
- *                     will be called by the function before performing Allreduce, to intialize the data in sendrecvbuf_.
- *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
- * \param prepare_arg argument used to passed into the lazy preprocessing function
- */
-void AllreduceRobust::Allreduce(void *sendrecvbuf_,
-                                size_t type_nbytes,
-                                size_t count,
-                                ReduceFunction reducer,
-                                PreprocFunction prepare_fun,
-                                void *prepare_arg) {
-  // skip action in single node
-  if (world_size == 1) {
-    if (prepare_fun != NULL) prepare_fun(prepare_arg);
-    return;
-  }
-  bool recovered = RecoverExec(sendrecvbuf_, type_nbytes * count, 0, seq_counter);
-  // now we are free to remove the last result, if any
-  if (resbuf.LastSeqNo() != -1 &&
-      (resbuf.LastSeqNo() % result_buffer_round != rank % result_buffer_round)) {
-    resbuf.DropLast();
-  }
-  if (!recovered && prepare_fun != NULL) prepare_fun(prepare_arg);
-  void *temp = resbuf.AllocTemp(type_nbytes, count);
-  while (true) {
-    if (recovered) {
-      std::memcpy(temp, sendrecvbuf_, type_nbytes * count); break;
-    } else {
-      std::memcpy(temp, sendrecvbuf_, type_nbytes * count);
-      if (CheckAndRecover(TryAllreduce(temp, type_nbytes, count, reducer))) {
-        std::memcpy(sendrecvbuf_, temp, type_nbytes * count); break;
-      } else {
-        recovered = RecoverExec(sendrecvbuf_, type_nbytes * count, 0, seq_counter);
-      }
-    }
-  }
-  resbuf.PushTemp(seq_counter, type_nbytes, count);
-  seq_counter += 1;
-}
-/*!
- * \brief broadcast data from root to all nodes
- * \param sendrecvbuf_ buffer for both sending and recving data
- * \param size the size of the data to be broadcasted
- * \param root the root worker id to broadcast the data
- */
-void AllreduceRobust::Broadcast(void *sendrecvbuf_, size_t total_size, int root) {
-  // skip action in single node
-  if (world_size == 1) return;
-  bool recovered = RecoverExec(sendrecvbuf_, total_size, 0, seq_counter);
-  // now we are free to remove the last result, if any
-  if (resbuf.LastSeqNo() != -1 &&
-      (resbuf.LastSeqNo() % result_buffer_round != rank % result_buffer_round)) {
-    resbuf.DropLast();
-  }
-  void *temp = resbuf.AllocTemp(1, total_size);
-  while (true) {
-    if (recovered) {
-      std::memcpy(temp, sendrecvbuf_, total_size); break;
-    } else {
-      if (CheckAndRecover(TryBroadcast(sendrecvbuf_, total_size, root))) {
-        std::memcpy(temp, sendrecvbuf_, total_size); break;
-      } else {
-        recovered = RecoverExec(sendrecvbuf_, total_size, 0, seq_counter);
-      }
-    }
-  }
-  resbuf.PushTemp(seq_counter, 1, total_size);
-  seq_counter += 1;
-}
-/*!
- * \brief load latest check point
- * \param global_model pointer to the globally shared model/state
- *   when calling this function, the caller need to gauranttees that global_model
- *   is the same in all nodes
- * \param local_model pointer to local model, that is specific to current node/rank
- *   this can be NULL when no local model is needed
- *
- * \return the version number of check point loaded
- *     if returned version == 0, this means no model has been CheckPointed
- *     the p_model is not touched, user should do necessary initialization by themselves
- *
- *   Common usage example:
- *      int iter = rabit::LoadCheckPoint(&model);
- *      if (iter == 0) model.InitParameters();
- *      for (i = iter; i < max_iter; ++i) {
- *        do many things, include allreduce
- *        rabit::CheckPoint(model);
- *      }
- *
- * \sa CheckPoint, VersionNumber
- */
-int AllreduceRobust::LoadCheckPoint(Serializable *global_model,
-                                    Serializable *local_model) {
-  // skip action in single node
-  if (world_size == 1) return 0;
-  this->LocalModelCheck(local_model != NULL);
-  if (num_local_replica == 0) {
-    utils::Check(local_model == NULL,
-                 "need to set rabit_local_replica larger than 1 to checkpoint local_model");
-  }
-  // check if we succesful
-  if (RecoverExec(NULL, 0, ActionSummary::kLoadCheck, ActionSummary::kSpecialOp)) {
-    int nlocal = std::max(static_cast<int>(local_rptr[local_chkpt_version].size()) - 1, 0);
-    if (local_model != NULL) {
-      if (nlocal == num_local_replica + 1) {
-        // load in local model
-        utils::MemoryFixSizeBuffer fs(BeginPtr(local_chkpt[local_chkpt_version]),
-                                      local_rptr[local_chkpt_version][1]);
-        local_model->Load(&fs);
-      } else {
-        utils::Assert(nlocal == 0, "[%d] local model inconsistent, nlocal=%d", rank, nlocal);
-      }
-    }
-    // reset result buffer
-    resbuf.Clear(); seq_counter = 0;
-    // load from buffer
-    utils::MemoryBufferStream fs(&global_checkpoint);
-    if (global_checkpoint.length() == 0) {
-      version_number = 0;
-    } else {
-      utils::Assert(fs.Read(&version_number, sizeof(version_number)) != 0,
-                    "read in version number");
-      global_model->Load(&fs);
-      utils::Assert(local_model == NULL || nlocal == num_local_replica + 1,
-                    "local model inconsistent, nlocal=%d", nlocal);
-    }
-    // run another phase of check ack, if recovered from data
-    utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp),
-                  "check ack must return true");
-    return version_number;
-  } else {
-    // reset result buffer
-    resbuf.Clear(); seq_counter = 0; version_number = 0;
-    // nothing loaded, a fresh start, everyone init model
-    return version_number;
-  }
-}
-/*!
- * \brief internal consistency check function,
- *  use check to ensure user always call CheckPoint/LoadCheckPoint
- *  with or without local but not both, this function will set the approperiate settings
- *  in the first call of LoadCheckPoint/CheckPoint
- *
- * \param with_local whether the user calls CheckPoint with local model
- */
-void AllreduceRobust::LocalModelCheck(bool with_local) {
-  if (use_local_model == -1) {
-    if (with_local) {
-      use_local_model = 1;
-      if (num_local_replica == 0) {
-        num_local_replica = default_local_replica;
-      }
-    } else {
-      use_local_model = 0;
-      num_local_replica = 0;
-    }
-  } else {
-    utils::Check(use_local_model == static_cast<int>(with_local),
-                 "Can only call Checkpoint/LoadCheckPoint always with"\
-                 "or without local_model, but not mixed case");
-  }
-}
-/*!
- * \brief internal implementation of checkpoint, support both lazy and normal way
- *
- * \param global_model pointer to the globally shared model/state
- *   when calling this function, the caller need to gauranttees that global_model
- *   is the same in all nodes
- * \param local_model pointer to local model, that is specific to current node/rank
- *   this can be NULL when no local state is needed
- * \param lazy_checkpt whether the action is lazy checkpoint
- *
- * \sa CheckPoint, LazyCheckPoint
- */
-void AllreduceRobust::CheckPoint_(const Serializable *global_model,
-                                  const Serializable *local_model,
-                                  bool lazy_checkpt) {
-  // never do check point in single machine mode
-  if (world_size == 1) {
-    version_number += 1; return;
-  }
-  this->LocalModelCheck(local_model != NULL);
-  if (num_local_replica == 0) {
-    utils::Check(local_model == NULL,
-                 "need to set rabit_local_replica larger than 1 to checkpoint local_model");
-  }
-  if (num_local_replica != 0) {
-    while (true) {
-      if (RecoverExec(NULL, 0, 0, ActionSummary::kLocalCheckPoint)) break;
-      // save model model to new version place
-      int new_version = !local_chkpt_version;
-      local_chkpt[new_version].clear();
-      utils::MemoryBufferStream fs(&local_chkpt[new_version]);
-      if (local_model != NULL) {
-        local_model->Save(&fs);
-      }
-      local_rptr[new_version].clear();
-      local_rptr[new_version].push_back(0);
-      local_rptr[new_version].push_back(local_chkpt[new_version].length());
-      if (CheckAndRecover(TryCheckinLocalState(&local_rptr[new_version],
-                                               &local_chkpt[new_version]))) break;
-    }
-    // run the ack phase, can be true or false
-    RecoverExec(NULL, 0, 0, ActionSummary::kLocalCheckAck);
-    // switch pointer to new version
-    local_chkpt_version = !local_chkpt_version;
-  }
-  // execute checkpoint, note: when checkpoint existing, load will not happen
-  utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint, ActionSummary::kSpecialOp),
-                "check point must return true");
-  // this is the critical region where we will change all the stored models
-  // increase version number
-  version_number += 1;
-  // save model
-  if (lazy_checkpt) {
-    global_lazycheck = global_model;
-  } else {
-    global_checkpoint.resize(0);
-    utils::MemoryBufferStream fs(&global_checkpoint);
-    fs.Write(&version_number, sizeof(version_number));
-    global_model->Save(&fs);
-    global_lazycheck = NULL;
-  }
-  // reset result buffer
-  resbuf.Clear(); seq_counter = 0;
-  // execute check ack step, load happens here
-  utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp),
-                "check ack must return true");
-}
-/*!
- * \brief reset the all the existing links by sending Out-of-Band message marker
- *  after this function finishes, all the messages received and sent before in all live links are discarded,
- *  This allows us to get a fresh start after error has happened
- *
- * \return this function can return kSuccess or kSockError
- *         when kSockError is returned, it simply means there are bad sockets in the links,
- *         and some link recovery proceduer is needed
- */
-AllreduceRobust::ReturnType AllreduceRobust::TryResetLinks(void) {
-  // number of links
-  const int nlink = static_cast<int>(all_links.size());
-  for (int i = 0; i < nlink; ++i) {
-    all_links[i].InitBuffer(sizeof(int), 1 << 10, reduce_buffer_size);
-    all_links[i].ResetSize();
-  }
-  // read and discard data from all channels until pass mark
-  while (true) {
-    for (int i = 0; i < nlink; ++i) {
-      if (all_links[i].sock.BadSocket()) continue;
-      if (all_links[i].size_write == 0) {
-        char sig = kOOBReset;
-        ssize_t len = all_links[i].sock.Send(&sig, sizeof(sig), MSG_OOB);
-        // error will be filtered in next loop
-        if (len == sizeof(sig)) all_links[i].size_write = 1;
-      }
-      if (all_links[i].size_write == 1) {
-        char sig = kResetMark;
-        ssize_t len = all_links[i].sock.Send(&sig, sizeof(sig));
-        if (len == sizeof(sig)) all_links[i].size_write = 2;
-      }
-    }
-    utils::SelectHelper rsel;
-    bool finished = true;
-    for (int i = 0; i < nlink; ++i) {
-      if (all_links[i].size_write != 2 && !all_links[i].sock.BadSocket()) {
-        rsel.WatchWrite(all_links[i].sock); finished = false;
-      }
-    }
-    if (finished) break;
-    // wait to read from the channels to discard data
-    rsel.Select();
-  }
-  for (int i = 0; i < nlink; ++i) {
-    if (!all_links[i].sock.BadSocket()) {
-      utils::SelectHelper::WaitExcept(all_links[i].sock);
-    }
-  }
-  while (true) {
-    utils::SelectHelper rsel;
-    bool finished = true;
-    for (int i = 0; i < nlink; ++i) {
-      if (all_links[i].size_read == 0 && !all_links[i].sock.BadSocket()) {
-        rsel.WatchRead(all_links[i].sock); finished = false;
-      }
-    }
-    if (finished) break;
-    rsel.Select();
-    for (int i = 0; i < nlink; ++i) {
-      if (all_links[i].sock.BadSocket()) continue;
-      if (all_links[i].size_read == 0) {
-        int atmark = all_links[i].sock.AtMark();
-        if (atmark < 0) {
-          utils::Assert(all_links[i].sock.BadSocket(), "must already gone bad");
-        } else if (atmark > 0) {
-          all_links[i].size_read = 1;
-        } else {
-          // no at mark, read and discard data
-          ssize_t len = all_links[i].sock.Recv(all_links[i].buffer_head, all_links[i].buffer_size);
-          if (all_links[i].sock.AtMark()) all_links[i].size_read = 1;
-          // zero length, remote closed the connection, close socket
-          if (len == 0) all_links[i].sock.Close();
-        }
-      }
-    }
-  }
-  // start synchronization, use blocking I/O to avoid select
-  for (int i = 0; i < nlink; ++i) {
-    if (!all_links[i].sock.BadSocket()) {
-      char oob_mark;
-      all_links[i].sock.SetNonBlock(false);
-      ssize_t len = all_links[i].sock.Recv(&oob_mark, sizeof(oob_mark), MSG_WAITALL);
-      if (len == 0) {
-        all_links[i].sock.Close(); continue;
-      } else if (len > 0) {
-        utils::Assert(oob_mark == kResetMark, "wrong oob msg");
-        utils::Assert(all_links[i].sock.AtMark() != 1, "should already read past mark");
-      } else {
-        utils::Assert(errno != EAGAIN|| errno != EWOULDBLOCK, "BUG");
-      }
-      // send out ack
-      char ack = kResetAck;
-      while (true) {
-        len = all_links[i].sock.Send(&ack, sizeof(ack));
-        if (len == sizeof(ack)) break;
-        if (len == -1) {
-          if (errno != EAGAIN && errno != EWOULDBLOCK) break;
-        }
-      }
-    }
-  }
-  // wait all ack
-  for (int i = 0; i < nlink; ++i) {
-    if (!all_links[i].sock.BadSocket()) {
-      char ack;
-      ssize_t len = all_links[i].sock.Recv(&ack, sizeof(ack), MSG_WAITALL);
-      if (len == 0) {
-        all_links[i].sock.Close(); continue;
-      } else if (len > 0) {
-        utils::Assert(ack == kResetAck, "wrong Ack MSG");
-      } else {
-        utils::Assert(errno != EAGAIN|| errno != EWOULDBLOCK, "BUG");
-      }
-      // set back to nonblock mode
-      all_links[i].sock.SetNonBlock(true);
-    }
-  }
-  for (int i = 0; i < nlink; ++i) {
-    if (all_links[i].sock.BadSocket()) return kSockError;
-  }
-  return kSuccess;
-}
-/*!
- * \brief if err_type indicates an error
- *         recover links according to the error type reported
- *        if there is no error, return true
- * \param err_type the type of error happening in the system
- * \return true if err_type is kSuccess, false otherwise
- */
-bool AllreduceRobust::CheckAndRecover(ReturnType err_type) {
-  if (err_type == kSuccess) return true;
-  utils::Assert(err_link != NULL, "must know the error source");
-  recover_counter += 1;
-  {
-    // simple way, shutdown all links
-    for (size_t i = 0; i < all_links.size(); ++i) {
-      if (!all_links[i].sock.BadSocket()) all_links[i].sock.Close();
-    }
-    ReConnectLinks("recover");
-    return false;
-  }
-  // this was old way
-  // TryResetLinks still causes possible errors, so not use this one
-  while (err_type != kSuccess) {
-    switch (err_type.value) {
-      case kGetExcept: err_type = TryResetLinks(); break;
-      case kSockError: {
-        TryResetLinks();
-        ReConnectLinks();
-        err_type = kSuccess;
-        break;
-      }
-      default: utils::Assert(false, "RecoverLinks: cannot reach here");
-    }
-  }
-  return false;
-}
-/*!
- * \brief message passing function, used to decide the
- *        shortest distance to the possible source of data
- * \param node_value a pair of have_data and size
- *           have_data whether current node have data
- *           size gives the size of data, if current node is kHaveData
- * \param dist_in the shorest to any data source distance in each direction
- * \param out_index the edge index of output link
- * \return the shorest distance result of out edge specified by out_index
- */
-inline std::pair<int, size_t>
-ShortestDist(const std::pair<bool, size_t> &node_value,
-             const std::vector< std::pair<int, size_t> > &dist_in,
-             size_t out_index) {
-  if (node_value.first) {
-    return std::make_pair(1, node_value.second);
-  }
-  size_t size = 0;
-  int res = std::numeric_limits<int>::max();
-  for (size_t i = 0; i < dist_in.size(); ++i) {
-    if (i == out_index) continue;
-    if (dist_in[i].first == std::numeric_limits<int>::max()) continue;
-    if (dist_in[i].first + 1 < res) {
-      res = dist_in[i].first + 1;
-      size = dist_in[i].second;
-    }
-  }
-  // add one hop
-
-  return std::make_pair(res, size);
-}
-/*!
- * \brief message passing function, used to decide the
- *    data request from each edge, whether need to request data from certain edge
- * \param node_value a pair of request_data and best_link
- *           request_data stores whether current node need to request data
- *           best_link gives the best edge index to fetch the data
- * \param req_in the data request from incoming edges
- * \param out_index the edge index of output link
- * \return the request to the output edge
- */
-inline char DataRequest(const std::pair<bool, int> &node_value,
-                        const std::vector<char> &req_in,
-                        size_t out_index) {
-  // whether current node need to request data
-  bool request_data = node_value.first;
-  // which edge index is the best link to request data
-  // can be -1, which means current node contains data
-  const int best_link = node_value.second;
-  if (static_cast<int>(out_index) == best_link) {
-    if (request_data) return 1;
-    for (size_t i = 0; i < req_in.size(); ++i) {
-      if (i == out_index) continue;
-      if (req_in[i] != 0) return 1;
-    }
-  }
-  return 0;
-}
-/*!
- * \brief try to decide the recovery message passing request
- * \param role the current role of the node
- * \param p_size used to store the size of the message, for node in state kHaveData,
- *               this size must be set correctly before calling the function
- *               for others, this surves as output parameter
- *
- * \param p_recvlink used to store the link current node should recv data from, if necessary
- *          this can be -1, which means current node have the data
- * \param p_req_in used to store the resulting vector, indicating which link we should send the data to
- *
- * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
- * \sa ReturnType
- */
-AllreduceRobust::ReturnType
-AllreduceRobust::TryDecideRouting(AllreduceRobust::RecoverType role,
-                                  size_t *p_size,
-                                  int *p_recvlink,
-                                  std::vector<bool> *p_req_in) {
-  int best_link = -2;
-  {
-    // get the shortest distance to the request point
-    std::vector<std::pair<int, size_t> > dist_in, dist_out;
-    ReturnType succ = MsgPassing(std::make_pair(role == kHaveData, *p_size),
-                                 &dist_in, &dist_out, ShortestDist);
-    if (succ != kSuccess) return succ;
-    if (role != kHaveData) {
-      for (size_t i = 0; i < dist_in.size(); ++i) {
-        if (dist_in[i].first != std::numeric_limits<int>::max()) {
-          utils::Check(best_link == -2 || *p_size == dist_in[i].second,
-                       "[%d] Allreduce size inconsistent, distin=%lu, size=%lu, reporting=%lu\n",
-                       rank, dist_in[i].first, *p_size, dist_in[i].second);
-          if (best_link == -2 || dist_in[i].first < dist_in[best_link].first) {
-            best_link = static_cast<int>(i);
-            *p_size = dist_in[i].second;
-          }
-        }
-      }
-      utils::Check(best_link != -2, "Too many nodes went down and we cannot recover..");
-    } else {
-      best_link = -1;
-    }
-  }
-  // get the node request
-  std::vector<char> req_in, req_out;
-  ReturnType succ = MsgPassing(std::make_pair(role == kRequestData, best_link),
-                               &req_in, &req_out, DataRequest);
-  if (succ != kSuccess) return succ;
-  // set p_req_in
-  p_req_in->resize(req_in.size());
-  for (size_t i = 0; i < req_in.size(); ++i) {
-    // set p_req_in
-    (*p_req_in)[i] = (req_in[i] != 0);
-    if (req_out[i] != 0) {
-      utils::Assert(req_in[i] == 0, "cannot get and receive request");
-      utils::Assert(static_cast<int>(i) == best_link, "request result inconsistent");
-    }
-  }
-  *p_recvlink = best_link;
-  return kSuccess;
-}
-/*!
- * \brief try to finish the data recovery request,
- *        this function is used together with TryDecideRouting
- * \param role the current role of the node
- * \param sendrecvbuf_ the buffer to store the data to be sent/recived
- *          - if the role is kHaveData, this stores the data to be sent
- *          - if the role is kRequestData, this is the buffer to store the result
- *          - if the role is kPassData, this will not be used, and can be NULL
- * \param size the size of the data, obtained from TryDecideRouting
- * \param recv_link the link index to receive data, if necessary, obtained from TryDecideRouting
- * \param req_in the request of each link to send data, obtained from TryDecideRouting
- *
- * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
- * \sa ReturnType, TryDecideRouting
- */
-AllreduceRobust::ReturnType
-AllreduceRobust::TryRecoverData(RecoverType role,
-                                void *sendrecvbuf_,
-                                size_t size,
-                                int recv_link,
-                                const std::vector<bool> &req_in) {
-  RefLinkVector &links = tree_links;
-  // no need to run recovery for zero size messages
-  if (links.size() == 0 || size == 0) return kSuccess;
-  utils::Assert(req_in.size() == links.size(), "TryRecoverData");
-  const int nlink = static_cast<int>(links.size());
-  {
-    bool req_data = role == kRequestData;
-    for (int i = 0; i < nlink; ++i) {
-      if (req_in[i]) {
-        utils::Assert(i != recv_link, "TryDecideRouting");
-        req_data = true;
-      }
-    }
-    // do not need to provide data or receive data, directly exit
-    if (!req_data) return kSuccess;
-  }
-  utils::Assert(recv_link >= 0 || role == kHaveData, "recv_link must be active");
-  if (role == kPassData) {
-    links[recv_link].InitBuffer(1, size, reduce_buffer_size);
-  }
-  for (int i = 0; i < nlink; ++i) {
-    links[i].ResetSize();
-  }
-  while (true) {
-    bool finished = true;
-    utils::SelectHelper selecter;
-    for (int i = 0; i < nlink; ++i) {
-      if (i == recv_link && links[i].size_read != size) {
-        selecter.WatchRead(links[i].sock);
-        finished = false;
-      }
-      if (req_in[i] && links[i].size_write != size) {
-        if (role == kHaveData ||
-            (links[recv_link].size_read != links[i].size_write)) {
-          selecter.WatchWrite(links[i].sock);
-        }
-        finished = false;
-      }
-      selecter.WatchException(links[i].sock);
-    }
-    if (finished) break;
-    selecter.Select();
-    // exception handling
-    for (int i = 0; i < nlink; ++i) {
-      if (selecter.CheckExcept(links[i].sock)) {
-        return ReportError(&links[i], kGetExcept);
-      }
-    }
-    if (role == kRequestData) {
-      const int pid = recv_link;
-      if (selecter.CheckRead(links[pid].sock)) {
-        ReturnType ret = links[pid].ReadToArray(sendrecvbuf_, size);
-        if (ret != kSuccess) {
-          return ReportError(&links[pid], ret);
-        }
-      }
-      for (int i = 0; i < nlink; ++i) {
-        if (req_in[i] && links[i].size_write != links[pid].size_read) {
-          ReturnType ret = links[i].WriteFromArray(sendrecvbuf_, links[pid].size_read);
-          if (ret != kSuccess) {
-            return ReportError(&links[i], ret);
-          }
-        }
-      }
-    }
-    if (role == kHaveData) {
-      for (int i = 0; i < nlink; ++i) {
-        if (req_in[i] && links[i].size_write != size) {
-          ReturnType ret = links[i].WriteFromArray(sendrecvbuf_, size);
-          if (ret != kSuccess) {
-            return ReportError(&links[i], ret);
-          }
-        }
-      }
-    }
-    if (role == kPassData) {
-      const int pid = recv_link;
-      const size_t buffer_size = links[pid].buffer_size;
-      if (selecter.CheckRead(links[pid].sock)) {
-        size_t min_write = size;
-        for (int i = 0; i < nlink; ++i) {
-          if (req_in[i]) min_write = std::min(links[i].size_write, min_write);
-        }
-        utils::Assert(min_write <= links[pid].size_read, "boundary check");
-        ReturnType ret = links[pid].ReadToRingBuffer(min_write, size);
-        if (ret != kSuccess) {
-          return ReportError(&links[pid], ret);
-        }
-      }
-      for (int i = 0; i < nlink; ++i) {
-        if (req_in[i] && links[pid].size_read != links[i].size_write) {
-          size_t start = links[i].size_write % buffer_size;
-          // send out data from ring buffer
-          size_t nwrite = std::min(buffer_size - start, links[pid].size_read - links[i].size_write);
-          ssize_t len = links[i].sock.Send(links[pid].buffer_head + start, nwrite);
-          if (len != -1) {
-            links[i].size_write += len;
-          } else {
-            ReturnType ret = Errno2Return();
-            if (ret != kSuccess) return ReportError(&links[i], ret);
-          }
-        }
-      }
-    }
-  }
-  return kSuccess;
-}
-/*!
- * \brief try to load check point
- *
- *        This is a collaborative function called by all nodes
- *        only the nodes with requester set to true really needs to load the check point
- *        other nodes acts as collaborative roles to complete this request
- *
- * \param requester whether current node is the requester
- * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
- * \sa ReturnType
- */
-AllreduceRobust::ReturnType AllreduceRobust::TryLoadCheckPoint(bool requester) {
-  // check in local data
-  RecoverType role =  requester ? kRequestData : kHaveData;
-  ReturnType succ;
-  if (num_local_replica != 0) {
-    if (requester) {
-      // clear existing history, if any, before load
-      local_rptr[local_chkpt_version].clear();
-      local_chkpt[local_chkpt_version].clear();
-    }
-    // recover local checkpoint
-    succ = TryRecoverLocalState(&local_rptr[local_chkpt_version],
-                                &local_chkpt[local_chkpt_version]);
-    if (succ != kSuccess) return succ;
-    int nlocal = std::max(static_cast<int>(local_rptr[local_chkpt_version].size()) - 1, 0);
-    // check if everyone is OK
-    unsigned state = 0;
-    if (nlocal == num_local_replica + 1) {
-      // complete recovery
-      state = 1;
-    } else if (nlocal == 0) {
-      // get nothing
-      state = 2;
-    } else {
-      // partially complete state
-      state = 4;
-    }
-    succ = TryAllreduce(&state, sizeof(state), 1, op::Reducer<op::BitOR, unsigned>);
-    if (succ != kSuccess) return succ;
-    utils::Check(state == 1 || state == 2,
-                 "LoadCheckPoint: too many nodes fails, cannot recover local state");
-  }
-  // do call save model if the checkpoint was lazy
-  if (role == kHaveData && global_lazycheck != NULL) {
-    global_checkpoint.resize(0);
-    utils::MemoryBufferStream fs(&global_checkpoint);
-    fs.Write(&version_number, sizeof(version_number));
-    global_lazycheck->Save(&fs);
-    global_lazycheck = NULL;
-  }
-  // recover global checkpoint
-  size_t size = this->global_checkpoint.length();
-  int recv_link;
-  std::vector<bool> req_in;
-  succ = TryDecideRouting(role, &size, &recv_link, &req_in);
-  if (succ != kSuccess) return succ;
-  if (role == kRequestData) {
-    global_checkpoint.resize(size);
-  }
-  if (size == 0) return kSuccess;
-  return TryRecoverData(role, BeginPtr(global_checkpoint), size, recv_link, req_in);
-}
-/*!
- * \brief try to get the result of operation specified by seqno
- *
- *        This is a collaborative function called by all nodes
- *        only the nodes with requester set to true really needs to get the result
- *        other nodes acts as collaborative roles to complete this request
- *
- * \param buf the buffer to store the result, this parameter is only used when current node is requester
- * \param size the total size of the buffer, this parameter is only used when current node is requester
- * \param seqno sequence number of the operation, this is unique index of a operation in current iteration
- * \param requester whether current node is the requester
- * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
- * \sa ReturnType
- */
-AllreduceRobust::ReturnType
-AllreduceRobust::TryGetResult(void *sendrecvbuf, size_t size, int seqno, bool requester) {
-  // if minimum sequence requested is local check point ack,
-  // this means all nodes have finished local check point, directly return
-  if (seqno == ActionSummary::kLocalCheckAck) return kSuccess;
-  if (seqno == ActionSummary::kLocalCheckPoint) {
-    // new version of local model
-    int new_version = !local_chkpt_version;
-    int nlocal = std::max(static_cast<int>(local_rptr[new_version].size()) - 1, 0);
-    // if we goes to this place, use must have already setup the state once
-    utils::Assert(nlocal == 1 || nlocal == num_local_replica + 1,
-                  "TryGetResult::Checkpoint");
-    return TryRecoverLocalState(&local_rptr[new_version], &local_chkpt[new_version]);
-  }
-  // handles normal data recovery
-  RecoverType role;
-  if (!requester) {
-    sendrecvbuf = resbuf.Query(seqno, &size);
-    role = sendrecvbuf != NULL ? kHaveData : kPassData;
-  } else {
-    role = kRequestData;
-  }
-  int recv_link;
-  std::vector<bool> req_in;
-  // size of data
-  size_t data_size = size;
-  ReturnType succ = TryDecideRouting(role, &data_size, &recv_link, &req_in);
-  if (succ != kSuccess) return succ;
-  utils::Check(data_size != 0, "zero size check point is not allowed");
-  if (role == kRequestData || role == kHaveData) {
-    utils::Check(data_size == size,
-                 "Allreduce Recovered data size do not match the specification of function call.\n"\
-                 "Please check if calling sequence of recovered program is the " \
-                 "same the original one in current VersionNumber");
-  }
-  return TryRecoverData(role, sendrecvbuf, data_size, recv_link, req_in);
-}
-/*!
- * \brief try to run recover execution for a request action described by flag and seqno,
- *        the function will keep blocking to run possible recovery operations before the specified action,
- *        until the requested result is received by a recovering procedure,
- *        or the function discovers that the requested action is not yet executed, and return false
- *
- * \param buf the buffer to store the result
- * \param size the total size of the buffer
- * \param flag flag information about the action \sa ActionSummary
- * \param seqno sequence number of the action, if it is special action with flag set,
- *              seqno needs to be set to ActionSummary::kSpecialOp
- *
- * \return if this function can return true or false
- *    - true means buf already set to the
- *           result by recovering procedure, the action is complete, no further action is needed
- *    - false means this is the lastest action that has not yet been executed, need to execute the action
- */
-bool AllreduceRobust::RecoverExec(void *buf, size_t size, int flag, int seqno) {
-  if (flag != 0) {
-    utils::Assert(seqno == ActionSummary::kSpecialOp, "must only set seqno for normal operations");
-  }
-  // request
-  ActionSummary req(flag, seqno);
-  while (true) {
-    this->ReportStatus();
-    // action
-    ActionSummary act = req;
-    // get the reduced action
-    if (!CheckAndRecover(TryAllreduce(&act, sizeof(act), 1, ActionSummary::Reducer))) continue;
-    if (act.check_ack()) {
-      if (act.check_point()) {
-        // if we also have check_point, do check point first
-        utils::Assert(!act.diff_seq(),
-                      "check ack & check pt  cannot occur together with normal ops");
-        // if we requested checkpoint, we are free to go
-        if (req.check_point()) return true;
-      } else if (act.load_check()) {
-        // if there is only check_ack and load_check, do load_check
-        if (!CheckAndRecover(TryLoadCheckPoint(req.load_check()))) continue;
-        // if requested load check, then misson complete
-        if (req.load_check()) return true;
-      } else {
-        // there is no check point and no load check, execute check ack
-        if (req.check_ack()) return true;
-      }
-      // if execute to this point
-      // this means the action requested has not been completed
-      // try next round
-    } else {
-      if (act.check_point()) {
-        if (act.diff_seq()) {
-          utils::Assert(act.min_seqno() != ActionSummary::kSpecialOp, "min seq bug");
-          bool requester = req.min_seqno() == act.min_seqno();
-          if (!CheckAndRecover(TryGetResult(buf, size, act.min_seqno(), requester))) continue;
-          if (requester) return true;
-        } else  {
-          // no difference in seq no, means we are free to check point
-          if (req.check_point()) return true;
-        }
-      } else {
-        // no check point
-        if (act.load_check()) {
-          // all the nodes called load_check, this is an incomplete action
-          if (!act.diff_seq()) return false;
-          // load check have higher priority, do load_check
-          if (!CheckAndRecover(TryLoadCheckPoint(req.load_check()))) continue;
-          // if requested load check, then misson complete
-          if (req.load_check()) return true;
-        } else {
-          // no special flags, no checkpoint, check ack, load_check
-          utils::Assert(act.min_seqno() != ActionSummary::kSpecialOp, "min seq bug");
-          if (act.diff_seq()) {
-            bool requester = req.min_seqno() == act.min_seqno();
-            if (!CheckAndRecover(TryGetResult(buf, size, act.min_seqno(), requester))) continue;
-            if (requester) return true;
-          } else {
-            // all the request is same,
-            // this is most recent command that is yet to be executed
-            return false;
-          }
-        }
-      }
-      // something is still incomplete try next round
-    }
-  }
-  utils::Assert(false, "RecoverExec: should not reach here");
-  return true;
-}
-/*!
- * \brief try to recover the local state, making each local state to be the result of itself
- *        plus replication of states in previous num_local_replica hops in the ring
- *
- * The input parameters must contain the valid local states available in current nodes,
- * This function try ist best to "complete" the missing parts of local_rptr and local_chkpt
- * If there is sufficient information in the ring, when the function returns, local_chkpt will
- * contain num_local_replica + 1 checkpoints (including the chkpt of this node)
- * If there is no sufficient information in the ring, this function the number of checkpoints
- * will be less than the specified value
- *
- * \param p_local_rptr the pointer to the segment pointers in the states array
- * \param p_local_chkpt the pointer to the storage of local check points
- * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
- * \sa ReturnType
- */
-AllreduceRobust::ReturnType
-AllreduceRobust::TryRecoverLocalState(std::vector<size_t> *p_local_rptr,
-                                      std::string *p_local_chkpt) {
-  // if there is no local replica, we can do nothing
-  if (num_local_replica == 0) return kSuccess;
-  std::vector<size_t> &rptr = *p_local_rptr;
-  std::string &chkpt = *p_local_chkpt;
-  if (rptr.size() == 0) {
-    rptr.push_back(0);
-    utils::Assert(chkpt.length() == 0, "local chkpt space inconsistent");
-  }
-  const int n = num_local_replica;
-  {
-    // backward passing, passing state in backward direction of the ring
-    const int nlocal = static_cast<int>(rptr.size() - 1);
-    utils::Assert(nlocal <= n + 1, "invalid local replica");
-    std::vector<int> msg_back(n + 1);
-    msg_back[0] = nlocal;
-    // backward passing one hop the request
-    ReturnType succ;
-    succ = RingPassing(BeginPtr(msg_back),
-                       1 * sizeof(int), (n+1) * sizeof(int),
-                       0 * sizeof(int), n * sizeof(int),
-                       ring_next, ring_prev);
-    if (succ != kSuccess) return succ;
-    int msg_forward[2];
-    msg_forward[0] = nlocal;
-    succ = RingPassing(msg_forward,
-                       1 * sizeof(int), 2 * sizeof(int),
-                       0 * sizeof(int), 1 * sizeof(int),
-                       ring_prev, ring_next);
-    if (succ != kSuccess) return succ;
-    // calculate the number of things we can read from next link
-    int nread_end = nlocal;
-    for (int i = 1; i <= n; ++i) {
-      nread_end = std::max(nread_end, msg_back[i] - i);
-    }
-    // gives the size of forward
-    int nwrite_start = std::min(msg_forward[1] + 1, nread_end);
-    // get the size of each segments
-    std::vector<size_t> sizes(nread_end);
-    for (int i = 0; i < nlocal; ++i) {
-      sizes[i] = rptr[i + 1] - rptr[i];
-    }
-    // pass size through the link
-    succ = RingPassing(BeginPtr(sizes),
-                       nlocal * sizeof(size_t),
-                       nread_end * sizeof(size_t),
-                       nwrite_start * sizeof(size_t),
-                       nread_end * sizeof(size_t),
-                       ring_next, ring_prev);
-    if (succ != kSuccess) return succ;
-    // update rptr
-    rptr.resize(nread_end + 1);
-    for (int i = nlocal; i < nread_end; ++i) {
-      rptr[i + 1] = rptr[i] + sizes[i];
-    }
-    chkpt.resize(rptr.back());
-    // pass data through the link
-    succ = RingPassing(BeginPtr(chkpt), rptr[nlocal], rptr[nread_end],
-                       rptr[nwrite_start], rptr[nread_end],
-                       ring_next, ring_prev);
-    if (succ != kSuccess) {
-      rptr.resize(nlocal + 1); chkpt.resize(rptr.back()); return succ;
-    }
-  }
-  {
-    // forward passing, passing state in forward direction of the ring
-    const int nlocal = static_cast<int>(rptr.size() - 1);
-    utils::Assert(nlocal <= n + 1, "invalid local replica");
-    std::vector<int> msg_forward(n + 1);
-    msg_forward[0] = nlocal;
-    // backward passing one hop the request
-    ReturnType succ;
-    succ = RingPassing(BeginPtr(msg_forward),
-                       1 * sizeof(int), (n+1) * sizeof(int),
-                       0 * sizeof(int), n * sizeof(int),
-                       ring_prev, ring_next);
-    if (succ != kSuccess) return succ;
-    int msg_back[2];
-    msg_back[0] = nlocal;
-    succ = RingPassing(msg_back,
-                       1 * sizeof(int), 2 * sizeof(int),
-                       0 * sizeof(int), 1 * sizeof(int),
-                       ring_next, ring_prev);
-    if (succ != kSuccess) return succ;
-    // calculate the number of things we can read from next link
-    int nread_end = nlocal, nwrite_end = 1;
-    // have to have itself in order to get other data from prev link
-    if (nlocal != 0) {
-      for (int i = 1; i <= n; ++i) {
-        if (msg_forward[i] == 0) break;
-        nread_end = std::max(nread_end, i + 1);
-        nwrite_end = i + 1;
-      }
-      if (nwrite_end > n) nwrite_end = n;
-    } else  {
-      nread_end = 0; nwrite_end = 0;
-    }
-    // gives the size of forward
-    int nwrite_start = std::min(msg_back[1] - 1, nwrite_end);
-    // next node miss the state of itself, cannot recover
-    if (nwrite_start < 0) nwrite_start = nwrite_end = 0;
-    // get the size of each segments
-    std::vector<size_t> sizes(nread_end);
-    for (int i = 0; i < nlocal; ++i) {
-      sizes[i] = rptr[i + 1] - rptr[i];
-    }
-    // pass size through the link, check consistency
-    succ = RingPassing(BeginPtr(sizes),
-                       nlocal * sizeof(size_t),
-                       nread_end * sizeof(size_t),
-                       nwrite_start * sizeof(size_t),
-                       nwrite_end * sizeof(size_t),
-                       ring_prev, ring_next);
-    if (succ != kSuccess) return succ;
-    // update rptr
-    rptr.resize(nread_end + 1);
-    for (int i = nlocal; i < nread_end; ++i) {
-      rptr[i + 1] = rptr[i] + sizes[i];
-    }
-    chkpt.resize(rptr.back());
-    // pass data through the link
-    succ = RingPassing(BeginPtr(chkpt), rptr[nlocal], rptr[nread_end],
-                       rptr[nwrite_start], rptr[nwrite_end],
-                       ring_prev, ring_next);
-    if (succ != kSuccess) {
-      rptr.resize(nlocal + 1); chkpt.resize(rptr.back()); return succ;
-    }
-  }
-  return kSuccess;
-}
-/*!
- * \brief try to checkpoint local state, this function is called in normal executation phase
- *    of checkpoint that contains local state
- *  the input state must exactly one saved state(local state of current node),
- *  after complete, this function will get local state from previous num_local_replica nodes and put them
- *  into local_chkpt and local_rptr
- *
- *  It is also OK to call TryRecoverLocalState instead,
- *  TryRecoverLocalState makes less assumption about the input, and requires more communications
- *
- * \param p_local_rptr the pointer to the segment pointers in the states array
- * \param p_local_chkpt the pointer to the storage of local check points
- * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
- * \sa ReturnType, TryRecoverLocalState
- */
-AllreduceRobust::ReturnType
-AllreduceRobust::TryCheckinLocalState(std::vector<size_t> *p_local_rptr,
-                                      std::string *p_local_chkpt) {
-  // if there is no local replica, we can do nothing
-  if (num_local_replica == 0) return kSuccess;
-  std::vector<size_t> &rptr = *p_local_rptr;
-  std::string &chkpt = *p_local_chkpt;
-  utils::Assert(rptr.size() == 2,
-                "TryCheckinLocalState must have exactly 1 state");
-  const int n = num_local_replica;
-  std::vector<size_t> sizes(n + 1);
-  sizes[0] = rptr[1] - rptr[0];
-  ReturnType succ;
-  // pass size through the link
-  succ = RingPassing(BeginPtr(sizes),
-                     1 * sizeof(size_t),
-                     (n + 1) * sizeof(size_t),
-                     0 * sizeof(size_t),
-                     n * sizeof(size_t),
-                     ring_prev, ring_next);
-  if (succ != kSuccess) return succ;
-  // update rptr
-  rptr.resize(n + 2);
-  for (int i = 1; i <= n; ++i) {
-    rptr[i + 1] = rptr[i] + sizes[i];
-  }
-  chkpt.resize(rptr.back());
-  // pass data through the link
-  succ = RingPassing(BeginPtr(chkpt),
-                     rptr[1], rptr[n + 1],
-                     rptr[0], rptr[n],
-                     ring_prev, ring_next);
-  if (succ != kSuccess) {
-    rptr.resize(2); chkpt.resize(rptr.back()); return succ;
-  }
-  return kSuccess;
-}
-/*!
- * \brief perform a ring passing to receive data from prev link, and sent data to next link
- *  this allows data to stream over a ring structure
- *  sendrecvbuf[0:read_ptr] are already provided by current node
- *  current node will recv sendrecvbuf[read_ptr:read_end] from prev link
- *  current node will send sendrecvbuf[write_ptr:write_end] to next link
- *  write_ptr will wait till the data is readed before sending the data
- *  this function requires read_end >= write_end
- *
- * \param sendrecvbuf_ the place to hold the incoming and outgoing data
- * \param read_ptr the initial read pointer
- * \param read_end the ending position to read
- * \param write_ptr the initial write pointer
- * \param write_end the ending position to write
- * \param read_link pointer to link to previous position in ring
- * \param write_link pointer to link of next position in ring
- */
-AllreduceRobust::ReturnType
-AllreduceRobust::RingPassing(void *sendrecvbuf_,
-                             size_t read_ptr,
-                             size_t read_end,
-                             size_t write_ptr,
-                             size_t write_end,
-                             LinkRecord *read_link,
-                             LinkRecord *write_link) {
-  if (read_link == NULL || write_link == NULL || read_end == 0) return kSuccess;
-  utils::Assert(write_end <= read_end,
-                "RingPassing: boundary check1");
-  utils::Assert(read_ptr <= read_end, "RingPassing: boundary check2");
-  utils::Assert(write_ptr <= write_end, "RingPassing: boundary check3");
-  // take reference
-  LinkRecord &prev = *read_link, &next = *write_link;
-  // send recv buffer
-  char *buf = reinterpret_cast<char*>(sendrecvbuf_);
-  while (true) {
-    bool finished = true;
-    utils::SelectHelper selecter;
-    if (read_ptr != read_end) {
-      selecter.WatchRead(prev.sock);
-      finished = false;
-    }
-    if (write_ptr < read_ptr && write_ptr != write_end) {
-      selecter.WatchWrite(next.sock);
-      finished = false;
-    }
-    selecter.WatchException(prev.sock);
-    selecter.WatchException(next.sock);
-    if (finished) break;
-    selecter.Select();
-    if (selecter.CheckExcept(prev.sock)) return ReportError(&prev, kGetExcept);
-    if (selecter.CheckExcept(next.sock)) return ReportError(&next, kGetExcept);
-    if (read_ptr != read_end && selecter.CheckRead(prev.sock)) {
-      ssize_t len = prev.sock.Recv(buf + read_ptr, read_end - read_ptr);
-      if (len == 0) {
-        prev.sock.Close(); return ReportError(&prev, kRecvZeroLen);
-      }
-      if (len != -1) {
-        read_ptr += static_cast<size_t>(len);
-      } else {
-        ReturnType ret = Errno2Return();
-        if (ret != kSuccess) return ReportError(&prev, ret);
-      }
-    }
-    if (write_ptr != write_end && write_ptr < read_ptr) {
-      size_t nsend = std::min(write_end - write_ptr, read_ptr - write_ptr);
-      ssize_t len = next.sock.Send(buf + write_ptr, nsend);
-      if (len != -1) {
-        write_ptr += static_cast<size_t>(len);
-      } else {
-        ReturnType ret = Errno2Return();
-        if (ret != kSuccess) return ReportError(&prev, ret);
-      }
-    }
-  }
-  return kSuccess;
-}
-}  // namespace engine
-}  // namespace rabit
-
diff --git a/subtree/rabit/src/allreduce_robust.h b/subtree/rabit/src/allreduce_robust.h
deleted file mode 100644
index caf2e57af..000000000
--- a/subtree/rabit/src/allreduce_robust.h
+++ /dev/null
@@ -1,553 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file allreduce_robust.h
- * \brief Robust implementation of Allreduce
- *   using TCP non-block socket and tree-shape reduction.
- *
- *   This implementation considers the failure of nodes
- *
- * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
- */
-#ifndef RABIT_ALLREDUCE_ROBUST_H_
-#define RABIT_ALLREDUCE_ROBUST_H_
-#include <vector>
-#include <string>
-#include <algorithm>
-#include "../include/rabit/engine.h"
-#include "./allreduce_base.h"
-
-namespace rabit {
-namespace engine {
-/*! \brief implementation of fault tolerant all reduce engine */
-class AllreduceRobust : public AllreduceBase {
- public:
-  AllreduceRobust(void);
-  virtual ~AllreduceRobust(void) {}
-  // initialize the manager
-  virtual void Init(void);
-  /*! \brief shutdown the engine */
-  virtual void Shutdown(void);
-  /*!
-   * \brief set parameters to the engine
-   * \param name parameter name
-   * \param val parameter value
-   */
-  virtual void SetParam(const char *name, const char *val);
-  /*!
-   * \brief perform in-place allreduce, on sendrecvbuf
-   *        this function is NOT thread-safe
-   * \param sendrecvbuf_ buffer for both sending and recving data
-   * \param type_nbytes the unit number of bytes the type have
-   * \param count number of elements to be reduced
-   * \param reducer reduce function
-   * \param prepare_func Lazy preprocessing function, lazy prepare_fun(prepare_arg)
-   *                     will be called by the function before performing Allreduce, to intialize the data in sendrecvbuf_.
-   *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
-   * \param prepare_arg argument used to passed into the lazy preprocessing function
-   */
-  virtual void Allreduce(void *sendrecvbuf_,
-                         size_t type_nbytes,
-                         size_t count,
-                         ReduceFunction reducer,
-                         PreprocFunction prepare_fun = NULL,
-                         void *prepare_arg = NULL);
-  /*!
-   * \brief broadcast data from root to all nodes
-   * \param sendrecvbuf_ buffer for both sending and recving data
-   * \param size the size of the data to be broadcasted
-   * \param root the root worker id to broadcast the data
-   */
-  virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root);
-  /*!
-   * \brief load latest check point
-   * \param global_model pointer to the globally shared model/state
-   *   when calling this function, the caller need to gauranttees that global_model
-   *   is the same in all nodes
-   * \param local_model pointer to local model, that is specific to current node/rank
-   *   this can be NULL when no local model is needed
-   *
-   * \return the version number of check point loaded
-   *     if returned version == 0, this means no model has been CheckPointed
-   *     the p_model is not touched, user should do necessary initialization by themselves
-   *
-   *   Common usage example:
-   *      int iter = rabit::LoadCheckPoint(&model);
-   *      if (iter == 0) model.InitParameters();
-   *      for (i = iter; i < max_iter; ++i) {
-   *        do many things, include allreduce
-   *        rabit::CheckPoint(model);
-   *      }
-   *
-   * \sa CheckPoint, VersionNumber
-   */
-  virtual int LoadCheckPoint(Serializable *global_model,
-                             Serializable *local_model = NULL);
-  /*!
-   * \brief checkpoint the model, meaning we finished a stage of execution
-   *  every time we call check point, there is a version number which will increase by one
-   *
-   * \param global_model pointer to the globally shared model/state
-   *   when calling this function, the caller need to gauranttees that global_model
-   *   is the same in all nodes
-   * \param local_model pointer to local model, that is specific to current node/rank
-   *   this can be NULL when no local state is needed
-   *
-   * NOTE: local_model requires explicit replication of the model for fault-tolerance, which will
-   *       bring replication cost in CheckPoint function. global_model do not need explicit replication.
-   *       So only CheckPoint with global_model if possible
-   *
-   * \sa LoadCheckPoint, VersionNumber
-   */
-  virtual void CheckPoint(const Serializable *global_model,
-                          const Serializable *local_model = NULL) {
-    this->CheckPoint_(global_model, local_model, false);
-  }
-  /*!
-   * \brief This function can be used to replace CheckPoint for global_model only,
-   *   when certain condition is met(see detailed expplaination).
-   *
-   *   This is a "lazy" checkpoint such that only the pointer to global_model is
-   *   remembered and no memory copy is taken. To use this function, the user MUST ensure that:
-   *   The global_model must remain unchanged util last call of Allreduce/Broadcast in current version finishs.
-   *   In another words, global_model model can be changed only between last call of
-   *   Allreduce/Broadcast and LazyCheckPoint in current version
-   *
-   *   For example, suppose the calling sequence is:
-   *   LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint
-   *
-   *   If user can only changes global_model in code3, then LazyCheckPoint can be used to
-   *   improve efficiency of the program.
-   * \param global_model pointer to the globally shared model/state
-   *   when calling this function, the caller need to gauranttees that global_model
-   *   is the same in all nodes
-   * \sa LoadCheckPoint, CheckPoint, VersionNumber
-   */
-  virtual void LazyCheckPoint(const Serializable *global_model) {
-    this->CheckPoint_(global_model, NULL, true);
-  }
-  /*!
-   * \brief explicitly re-init everything before calling LoadCheckPoint
-   *    call this function when IEngine throw an exception out,
-   *    this function is only used for test purpose
-   */
-  virtual void InitAfterException(void) {
-    // simple way, shutdown all links
-    for (size_t i = 0; i < all_links.size(); ++i) {
-      if (!all_links[i].sock.BadSocket()) all_links[i].sock.Close();
-    }
-    ReConnectLinks("recover");
-  }
-
- protected:
-  // constant one byte out of band message to indicate error happening
-  // and mark for channel cleanup
-  static const char kOOBReset = 95;
-  // and mark for channel cleanup, after OOB signal
-  static const char kResetMark = 97;
-  // and mark for channel cleanup
-  static const char kResetAck = 97;
-  /*! \brief type of roles each node can play during recovery */
-  enum RecoverType {
-    /*! \brief current node have data */
-    kHaveData = 0,
-    /*! \brief current node request data */
-    kRequestData = 1,
-    /*! \brief current node only helps to pass data around */
-    kPassData = 2
-  };
-  /*!
-   * \brief summary of actions proposed in all nodes
-   *  this data structure is used to make consensus decision
-   *  about next action to take in the recovery mode
-   */
-  struct ActionSummary {
-    // maximumly allowed sequence id
-    static const int kSpecialOp = (1 << 26);
-    // special sequence number for local state checkpoint
-    static const int kLocalCheckPoint = (1 << 26) - 2;
-    // special sequnce number for local state checkpoint ack signal
-    static const int kLocalCheckAck = (1 << 26) - 1;
-    //---------------------------------------------
-    // The following are bit mask of flag used in
-    //----------------------------------------------
-    // some node want to load check point
-    static const int kLoadCheck = 1;
-    // some node want to do check point
-    static const int kCheckPoint = 2;
-    // check point Ack, we use a two phase message in check point,
-    // this is the second phase of check pointing
-    static const int kCheckAck = 4;
-    // there are difference sequence number the nodes proposed
-    // this means we want to do recover execution of the lower sequence
-    // action instead of normal execution
-    static const int kDiffSeq = 8;
-    // constructor
-    ActionSummary(void) {}
-    // constructor of action
-    explicit ActionSummary(int flag, int minseqno = kSpecialOp) {
-      seqcode = (minseqno << 4) | flag;
-    }
-    // minimum number of all operations
-    inline int min_seqno(void) const {
-      return seqcode >> 4;
-    }
-    // whether the operation set contains a load_check
-    inline bool load_check(void) const {
-      return (seqcode & kLoadCheck) != 0;
-    }
-    // whether the operation set contains a check point
-    inline bool check_point(void) const {
-      return (seqcode & kCheckPoint) != 0;
-    }
-    // whether the operation set contains a check ack
-    inline bool check_ack(void) const {
-      return (seqcode & kCheckAck) != 0;
-    }
-    // whether the operation set contains different sequence number
-    inline bool diff_seq(void) const {
-      return (seqcode & kDiffSeq) != 0;
-    }
-    // returns the operation flag of the result
-    inline int flag(void) const {
-      return seqcode & 15;
-    }
-    // reducer for Allreduce, get the result ActionSummary from all nodes
-    inline static void Reducer(const void *src_, void *dst_,
-                               int len, const MPI::Datatype &dtype) {
-      const ActionSummary *src = (const ActionSummary*)src_;
-      ActionSummary *dst = reinterpret_cast<ActionSummary*>(dst_);
-      for (int i = 0; i < len; ++i) {
-        int src_seqno = src[i].min_seqno();
-        int dst_seqno = dst[i].min_seqno();
-        int flag = src[i].flag() | dst[i].flag();
-        if (src_seqno == dst_seqno) {
-          dst[i] = ActionSummary(flag, src_seqno);
-        } else {
-          dst[i] = ActionSummary(flag | kDiffSeq,
-                                 std::min(src_seqno, dst_seqno));
-        }
-      }
-    }
-
-   private:
-    // internel sequence code
-    int seqcode;
-  };
-  /*! \brief data structure to remember result of Bcast and Allreduce calls */
-  class ResultBuffer {
-   public:
-    // constructor
-    ResultBuffer(void) {
-      this->Clear();
-    }
-    // clear the existing record
-    inline void Clear(void) {
-      seqno_.clear(); size_.clear();
-      rptr_.clear(); rptr_.push_back(0);
-      data_.clear();
-    }
-    // allocate temporal space
-    inline void *AllocTemp(size_t type_nbytes, size_t count) {
-      size_t size = type_nbytes * count;
-      size_t nhop = (size + sizeof(uint64_t) - 1) / sizeof(uint64_t);
-      utils::Assert(nhop != 0, "cannot allocate 0 size memory");
-      data_.resize(rptr_.back() + nhop);
-      return BeginPtr(data_) + rptr_.back();
-    }
-    // push the result in temp to the
-    inline void PushTemp(int seqid, size_t type_nbytes, size_t count) {
-      size_t size = type_nbytes * count;
-      size_t nhop = (size + sizeof(uint64_t) - 1) / sizeof(uint64_t);
-      if (seqno_.size() != 0) {
-        utils::Assert(seqno_.back() < seqid, "PushTemp seqid inconsistent");
-      }
-      seqno_.push_back(seqid);
-      rptr_.push_back(rptr_.back() + nhop);
-      size_.push_back(size);
-      utils::Assert(data_.size() == rptr_.back(), "PushTemp inconsistent");
-    }
-    // return the stored result of seqid, if any
-    inline void* Query(int seqid, size_t *p_size) {
-      size_t idx = std::lower_bound(seqno_.begin(),
-                                    seqno_.end(), seqid) - seqno_.begin();
-      if (idx == seqno_.size() || seqno_[idx] != seqid) return NULL;
-      *p_size = size_[idx];
-      return BeginPtr(data_) + rptr_[idx];
-    }
-    // drop last stored result
-    inline void DropLast(void) {
-      utils::Assert(seqno_.size() != 0, "there is nothing to be dropped");
-      seqno_.pop_back();
-      rptr_.pop_back();
-      size_.pop_back();
-      data_.resize(rptr_.back());
-    }
-    // the sequence number of last stored result
-    inline int LastSeqNo(void) const {
-      if (seqno_.size() == 0) return -1;
-      return seqno_.back();
-    }
-
-   private:
-    // sequence number of each
-    std::vector<int> seqno_;
-    // pointer to the positions
-    std::vector<size_t> rptr_;
-    // actual size of each buffer
-    std::vector<size_t> size_;
-    // content of the buffer
-    std::vector<uint64_t> data_;
-  };
-  /*!
-   * \brief internal consistency check function,
-   *  use check to ensure user always call CheckPoint/LoadCheckPoint
-   *  with or without local but not both, this function will set the approperiate settings
-   *  in the first call of LoadCheckPoint/CheckPoint
-   *
-   * \param with_local whether the user calls CheckPoint with local model
-   */
-  void LocalModelCheck(bool with_local);
-  /*!
-   * \brief internal implementation of checkpoint, support both lazy and normal way
-   *
-   * \param global_model pointer to the globally shared model/state
-   *   when calling this function, the caller need to gauranttees that global_model
-   *   is the same in all nodes
-   * \param local_model pointer to local model, that is specific to current node/rank
-   *   this can be NULL when no local state is needed
-   * \param lazy_checkpt whether the action is lazy checkpoint
-   *
-   * \sa CheckPoint, LazyCheckPoint
-   */
-  void CheckPoint_(const Serializable *global_model,
-                   const Serializable *local_model,
-                   bool lazy_checkpt);
-  /*!
-   * \brief reset the all the existing links by sending Out-of-Band message marker
-   *  after this function finishes, all the messages received and sent
-   *  before in all live links are discarded,
-   *  This allows us to get a fresh start after error has happened
-   *
-   *  TODO(tqchen): this function is not yet functioning was not used by engine,
-   *   simple resetlink and reconnect strategy is used
-   *
-   * \return this function can return kSuccess or kSockError
-   *         when kSockError is returned, it simply means there are bad sockets in the links,
-   *         and some link recovery proceduer is needed
-   */
-  ReturnType TryResetLinks(void);
-  /*!
-   * \brief if err_type indicates an error
-   *         recover links according to the error type reported
-   *        if there is no error, return true
-   * \param err_type the type of error happening in the system
-   * \return true if err_type is kSuccess, false otherwise
-   */
-  bool CheckAndRecover(ReturnType err_type);
-  /*!
-   * \brief try to run recover execution for a request action described by flag and seqno,
-   *        the function will keep blocking to run possible recovery operations before the specified action,
-   *        until the requested result is received by a recovering procedure,
-   *        or the function discovers that the requested action is not yet executed, and return false
-   *
-   * \param buf the buffer to store the result
-   * \param size the total size of the buffer
-   * \param flag flag information about the action \sa ActionSummary
-   * \param seqno sequence number of the action, if it is special action with flag set,
-   *        seqno needs to be set to ActionSummary::kSpecialOp
-   *
-   * \return if this function can return true or false
-   *    - true means buf already set to the
-   *           result by recovering procedure, the action is complete, no further action is needed
-   *    - false means this is the lastest action that has not yet been executed, need to execute the action
-   */
-  bool RecoverExec(void *buf, size_t size, int flag,
-                   int seqno = ActionSummary::kSpecialOp);
-  /*!
-   * \brief try to load check point
-   *
-   *        This is a collaborative function called by all nodes
-   *        only the nodes with requester set to true really needs to load the check point
-   *        other nodes acts as collaborative roles to complete this request
-   *
-   * \param requester whether current node is the requester
-   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
-   * \sa ReturnType
-   */
-  ReturnType TryLoadCheckPoint(bool requester);
-  /*!
-   * \brief try to get the result of operation specified by seqno
-   *
-   *        This is a collaborative function called by all nodes
-   *        only the nodes with requester set to true really needs to get the result
-   *        other nodes acts as collaborative roles to complete this request
-   *
-   * \param buf the buffer to store the result, this parameter is only used when current node is requester
-   * \param size the total size of the buffer, this parameter is only used when current node is requester
-   * \param seqno sequence number of the operation, this is unique index of a operation in current iteration
-   * \param requester whether current node is the requester
-   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
-   * \sa ReturnType
-   */
-  ReturnType TryGetResult(void *buf, size_t size, int seqno, bool requester);
-  /*!
-   * \brief try to decide the routing strategy for recovery
-   * \param role the current role of the node
-   * \param p_size used to store the size of the message, for node in state kHaveData,
-   *               this size must be set correctly before calling the function
-   *               for others, this surves as output parameter
-
-   * \param p_recvlink used to store the link current node should recv data from, if necessary
-   *          this can be -1, which means current node have the data
-   * \param p_req_in used to store the resulting vector, indicating which link we should send the data to
-   *
-   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
-   * \sa ReturnType, TryRecoverData
-   */
-  ReturnType TryDecideRouting(RecoverType role,
-                              size_t *p_size,
-                              int *p_recvlink,
-                              std::vector<bool> *p_req_in);
-  /*!
-   * \brief try to finish the data recovery request,
-   *        this function is used together with TryDecideRouting
-   * \param role the current role of the node
-   * \param sendrecvbuf_ the buffer to store the data to be sent/recived
-   *          - if the role is kHaveData, this stores the data to be sent
-   *          - if the role is kRequestData, this is the buffer to store the result
-   *          - if the role is kPassData, this will not be used, and can be NULL
-   * \param size the size of the data, obtained from TryDecideRouting
-   * \param recv_link the link index to receive data, if necessary, obtained from TryDecideRouting
-   * \param req_in the request of each link to send data, obtained from TryDecideRouting
-   *
-   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
-   * \sa ReturnType, TryDecideRouting
-   */
-  ReturnType TryRecoverData(RecoverType role,
-                            void *sendrecvbuf_,
-                            size_t size,
-                            int recv_link,
-                            const std::vector<bool> &req_in);
-  /*!
-   * \brief try to recover the local state, making each local state to be the result of itself
-   *        plus replication of states in previous num_local_replica hops in the ring
-   *
-   * The input parameters must contain the valid local states available in current nodes,
-   * This function try ist best to "complete" the missing parts of local_rptr and local_chkpt
-   * If there is sufficient information in the ring, when the function returns, local_chkpt will
-   * contain num_local_replica + 1 checkpoints (including the chkpt of this node)
-   * If there is no sufficient information in the ring, this function the number of checkpoints
-   * will be less than the specified value
-   *
-   * \param p_local_rptr the pointer to the segment pointers in the states array
-   * \param p_local_chkpt the pointer to the storage of local check points
-   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
-   * \sa ReturnType
-   */
-  ReturnType TryRecoverLocalState(std::vector<size_t> *p_local_rptr,
-                                  std::string *p_local_chkpt);
-  /*!
-   * \brief try to checkpoint local state, this function is called in normal executation phase
-   *    of checkpoint that contains local state
-o   *  the input state must exactly one saved state(local state of current node),
-   *  after complete, this function will get local state from previous num_local_replica nodes and put them
-   *  into local_chkpt and local_rptr
-   *
-   *  It is also OK to call TryRecoverLocalState instead,
-   *  TryRecoverLocalState makes less assumption about the input, and requires more communications
-   *
-   * \param p_local_rptr the pointer to the segment pointers in the states array
-   * \param p_local_chkpt the pointer to the storage of local check points
-   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
-   * \sa ReturnType, TryRecoverLocalState
-   */
-  ReturnType TryCheckinLocalState(std::vector<size_t> *p_local_rptr,
-                                  std::string *p_local_chkpt);
-  /*!
-   * \brief perform a ring passing to receive data from prev link, and sent data to next link
-   *  this allows data to stream over a ring structure
-   *  sendrecvbuf[0:read_ptr] are already provided by current node
-   *  current node will recv sendrecvbuf[read_ptr:read_end] from prev link
-   *  current node will send sendrecvbuf[write_ptr:write_end] to next link
-   *  write_ptr will wait till the data is readed before sending the data
-   *  this function requires read_end >= write_end
-   *
-   * \param sendrecvbuf_ the place to hold the incoming and outgoing data
-   * \param read_ptr the initial read pointer
-   * \param read_end the ending position to read
-   * \param write_ptr the initial write pointer
-   * \param write_end the ending position to write
-   * \param read_link pointer to link to previous position in ring
-   * \param write_link pointer to link of next position in ring
-   */
-  ReturnType RingPassing(void *senrecvbuf_,
-                         size_t read_ptr,
-                         size_t read_end,
-                         size_t write_ptr,
-                         size_t write_end,
-                         LinkRecord *read_link,
-                         LinkRecord *write_link);
-  /*!
-   * \brief run message passing algorithm on the allreduce tree
-   *        the result is edge message stored in p_edge_in and p_edge_out
-   * \param node_value the value associated with current node
-   * \param p_edge_in used to store input message from each of the edge
-   * \param p_edge_out used to store output message from each of the edge
-   * \param func a function that defines the message passing rule
-   *        Parameters of func:
-   *           - node_value same as node_value in the main function
-   *           - edge_in the array of input messages from each edge,
-   *                     this includes the output edge, which should be excluded
-   *           - out_index array the index of output edge, the function should
-   *                       exclude the output edge when compute the message passing value
-   *        Return of func:
-   *           the function returns the output message based on the input message and node_value
-   *
-   * \tparam EdgeType type of edge message, must be simple struct
-   * \tparam NodeType type of node value
-   */
-  template<typename NodeType, typename EdgeType>
-  inline ReturnType MsgPassing(const NodeType &node_value,
-                               std::vector<EdgeType> *p_edge_in,
-                               std::vector<EdgeType> *p_edge_out,
-                               EdgeType(*func)
-                               (const NodeType &node_value,
-                                const std::vector<EdgeType> &edge_in,
-                                size_t out_index));
-  //---- recovery data structure ----
-  // the round of result buffer, used to mode the result
-  int result_buffer_round;
-  // result buffer of all reduce
-  ResultBuffer resbuf;
-  // last check point global model
-  std::string global_checkpoint;
-  // lazy checkpoint of global model
-  const Serializable *global_lazycheck;
-  // number of replica for local state/model
-  int num_local_replica;
-  // number of default local replica
-  int default_local_replica;
-  // flag to decide whether local model is used, -1: unknown, 0: no, 1:yes
-  int use_local_model;
-  // number of replica for global state/model
-  int num_global_replica;
-  // number of times recovery happens
-  int recover_counter;
-  // --- recovery data structure for local checkpoint
-  // there is two version of the data structure,
-  // at one time one version is valid and another is used as temp memory
-  // pointer to memory position in the local model
-  // local model is stored in CSR format(like a sparse matrices)
-  // local_model[rptr[0]:rptr[1]] stores the model of current node
-  // local_model[rptr[k]:rptr[k+1]] stores the model of node in previous k hops
-  std::vector<size_t> local_rptr[2];
-  // storage for local model replicas
-  std::string local_chkpt[2];
-  // version of local checkpoint can be 1 or 0
-  int local_chkpt_version;
-};
-}  // namespace engine
-}  // namespace rabit
-// implementation of inline template function
-#include "./allreduce_robust-inl.h"
-#endif  // RABIT_ALLREDUCE_ROBUST_H_
diff --git a/subtree/rabit/src/engine.cc b/subtree/rabit/src/engine.cc
deleted file mode 100644
index 0f4770fe2..000000000
--- a/subtree/rabit/src/engine.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file engine.cc
- * \brief this file governs which implementation of engine we are actually using
- *  provides an singleton of engine interface
- *
- * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
- */
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-
-#include "../include/rabit/engine.h"
-#include "./allreduce_base.h"
-#include "./allreduce_robust.h"
-
-namespace rabit {
-namespace engine {
-// singleton sync manager
-#ifndef RABIT_USE_BASE
-#ifndef RABIT_USE_MOCK
-AllreduceRobust manager;
-#else
-AllreduceMock manager;
-#endif
-#else
-AllreduceBase manager;
-#endif
-
-/*! \brief intiialize the synchronization module */
-void Init(int argc, char *argv[]) {
-  for (int i = 1; i < argc; ++i) {
-    char name[256], val[256];
-    if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
-      manager.SetParam(name, val);
-    }
-  }
-  manager.Init();
-}
-
-/*! \brief finalize syncrhonization module */
-void Finalize(void) {
-  manager.Shutdown();
-}
-/*! \brief singleton method to get engine */
-IEngine *GetEngine(void) {
-  return &manager;
-}
-// perform in-place allreduce, on sendrecvbuf
-void Allreduce_(void *sendrecvbuf,
-                size_t type_nbytes,
-                size_t count,
-                IEngine::ReduceFunction red,
-                mpi::DataType dtype,
-                mpi::OpType op,
-                IEngine::PreprocFunction prepare_fun,
-                void *prepare_arg) {
-  GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count,
-                         red, prepare_fun, prepare_arg);
-}
-
-// code for reduce handle
-ReduceHandle::ReduceHandle(void)
-  : handle_(NULL), redfunc_(NULL), htype_(NULL) {
-}
-ReduceHandle::~ReduceHandle(void) {}
-
-int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
-  return static_cast<int>(dtype.type_size);
-}
-void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {
-  utils::Assert(redfunc_ == NULL, "cannot initialize reduce handle twice");
-  redfunc_ = redfunc;
-}
-void ReduceHandle::Allreduce(void *sendrecvbuf,
-                             size_t type_nbytes, size_t count,
-                             IEngine::PreprocFunction prepare_fun,
-                             void *prepare_arg) {
-  utils::Assert(redfunc_ != NULL, "must intialize handle to call AllReduce");
-  GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count,
-                         redfunc_, prepare_fun, prepare_arg);
-}
-}  // namespace engine
-}  // namespace rabit
diff --git a/subtree/rabit/src/engine_base.cc b/subtree/rabit/src/engine_base.cc
deleted file mode 100644
index 62739536f..000000000
--- a/subtree/rabit/src/engine_base.cc
+++ /dev/null
@@ -1,15 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file engine_mock.cc
- * \brief this is an engine implementation that will 
- * insert failures in certain call point, to test if the engine is robust to failure
- * \author Tianqi Chen
- */
-// define use MOCK, os we will use mock Manager
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-// switch engine to AllreduceMock
-#define RABIT_USE_BASE
-#include "./engine.cc"
-
diff --git a/subtree/rabit/src/engine_empty.cc b/subtree/rabit/src/engine_empty.cc
deleted file mode 100644
index 5fc16d9f4..000000000
--- a/subtree/rabit/src/engine_empty.cc
+++ /dev/null
@@ -1,118 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file engine_empty.cc
- * \brief this file provides a dummy implementation of engine that does nothing
- *  this file provides a way to fall back to single node program without causing too many dependencies
- *  This is usually NOT needed, use engine_mpi or engine for real distributed version
- * \author Tianqi Chen
- */
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-
-#include "../include/rabit/engine.h"
-
-namespace rabit {
-namespace engine {
-/*! \brief EmptyEngine */
-class EmptyEngine : public IEngine {
- public:
-  EmptyEngine(void) {
-    version_number = 0;
-  }
-  virtual void Allreduce(void *sendrecvbuf_,
-                         size_t type_nbytes,
-                         size_t count,
-                         ReduceFunction reducer,
-                         PreprocFunction prepare_fun,
-                         void *prepare_arg) {
-    utils::Error("EmptyEngine:: Allreduce is not supported,"\
-                 "use Allreduce_ instead");
-  }
-  virtual void Broadcast(void *sendrecvbuf_, size_t size, int root) {
-  }
-  virtual void InitAfterException(void) {
-    utils::Error("EmptyEngine is not fault tolerant");
-  }
-  virtual int LoadCheckPoint(Serializable *global_model,
-                             Serializable *local_model = NULL) {
-    return 0;
-  }
-  virtual void CheckPoint(const Serializable *global_model,
-                          const Serializable *local_model = NULL) {
-    version_number += 1;
-  }
-  virtual void LazyCheckPoint(const Serializable *global_model) {
-    version_number += 1;
-  }
-  virtual int VersionNumber(void) const {
-    return version_number;
-  }
-  /*! \brief get rank of current node */
-  virtual int GetRank(void) const {
-    return 0;
-  }
-  /*! \brief get total number of */
-  virtual int GetWorldSize(void) const {
-    return 1;
-  }
-  /*! \brief whether it is distributed */
-  virtual bool IsDistributed(void) const {
-    return false;
-  }
-  /*! \brief get the host name of current node */
-  virtual std::string GetHost(void) const {
-    return std::string("");
-  }
-  virtual void TrackerPrint(const std::string &msg) {
-    // simply print information into the tracker
-    utils::Printf("%s", msg.c_str());
-  }
-
- private:
-  int version_number;
-};
-
-// singleton sync manager
-EmptyEngine manager;
-
-/*! \brief intiialize the synchronization module */
-void Init(int argc, char *argv[]) {
-}
-/*! \brief finalize syncrhonization module */
-void Finalize(void) {
-}
-
-/*! \brief singleton method to get engine */
-IEngine *GetEngine(void) {
-  return &manager;
-}
-// perform in-place allreduce, on sendrecvbuf
-void Allreduce_(void *sendrecvbuf,
-                size_t type_nbytes,
-                size_t count,
-                IEngine::ReduceFunction red,
-                mpi::DataType dtype,
-                mpi::OpType op,
-                IEngine::PreprocFunction prepare_fun,
-                void *prepare_arg) {
-  if (prepare_fun != NULL) prepare_fun(prepare_arg);
-}
-
-// code for reduce handle
-ReduceHandle::ReduceHandle(void) : handle_(NULL), htype_(NULL) {
-}
-ReduceHandle::~ReduceHandle(void) {}
-
-int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
-  return 0;
-}
-void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {}
-void ReduceHandle::Allreduce(void *sendrecvbuf,
-                             size_t type_nbytes, size_t count,
-                             IEngine::PreprocFunction prepare_fun,
-                             void *prepare_arg) {
-  if (prepare_fun != NULL) prepare_fun(prepare_arg);
-}
-}  // namespace engine
-}  // namespace rabit
diff --git a/subtree/rabit/src/engine_mock.cc b/subtree/rabit/src/engine_mock.cc
deleted file mode 100644
index 24415a1d5..000000000
--- a/subtree/rabit/src/engine_mock.cc
+++ /dev/null
@@ -1,16 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file engine_mock.cc
- * \brief this is an engine implementation that will 
- * insert failures in certain call point, to test if the engine is robust to failure
- * \author Tianqi Chen
- */
-// define use MOCK, os we will use mock Manager
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-// switch engine to AllreduceMock
-#define RABIT_USE_MOCK
-#include "./allreduce_mock.h"
-#include "./engine.cc"
-
diff --git a/subtree/rabit/src/engine_mpi.cc b/subtree/rabit/src/engine_mpi.cc
deleted file mode 100644
index 11e55335b..000000000
--- a/subtree/rabit/src/engine_mpi.cc
+++ /dev/null
@@ -1,211 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file engine_mpi.cc
- * \brief this file gives an implementation of engine interface using MPI,
- *   this will allow rabit program to run with MPI, but do not comes with fault tolerant
- *
- * \author Tianqi Chen
- */
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <mpi.h>
-#include <cstdio>
-#include "../include/rabit/engine.h"
-#include "../include/rabit/utils.h"
-
-namespace rabit {
-namespace engine {
-/*! \brief implementation of engine using MPI */
-class MPIEngine : public IEngine {
- public:
-  MPIEngine(void) {
-    version_number = 0;
-  }
-  virtual void Allreduce(void *sendrecvbuf_,
-                         size_t type_nbytes,
-                         size_t count,
-                         ReduceFunction reducer,
-                         PreprocFunction prepare_fun,
-                         void *prepare_arg) {
-    utils::Error("MPIEngine:: Allreduce is not supported,"\
-                 "use Allreduce_ instead");
-  }
-  virtual void Broadcast(void *sendrecvbuf_, size_t size, int root) {
-    MPI::COMM_WORLD.Bcast(sendrecvbuf_, size, MPI::CHAR, root);
-  }
-  virtual void InitAfterException(void) {
-    utils::Error("MPI is not fault tolerant");
-  }
-  virtual int LoadCheckPoint(Serializable *global_model,
-                             Serializable *local_model = NULL) {
-    return 0;
-  }
-  virtual void CheckPoint(const Serializable *global_model,
-                          const Serializable *local_model = NULL) {
-    version_number += 1;
-  }
-  virtual void LazyCheckPoint(const Serializable *global_model) {
-    version_number += 1;
-  }
-  virtual int VersionNumber(void) const {
-    return version_number;
-  }
-  /*! \brief get rank of current node */
-  virtual int GetRank(void) const {
-    return MPI::COMM_WORLD.Get_rank();
-  }
-  /*! \brief get total number of */
-  virtual int GetWorldSize(void) const {
-    return MPI::COMM_WORLD.Get_size();
-  }
-  /*! \brief whether it is distributed */
-  virtual bool IsDistributed(void) const {
-    return true;
-  }
-  /*! \brief get the host name of current node */
-  virtual std::string GetHost(void) const {
-    int len;
-    char name[MPI_MAX_PROCESSOR_NAME];
-    MPI::Get_processor_name(name, len);
-    name[len] = '\0';
-    return std::string(name);
-  }
-  virtual void TrackerPrint(const std::string &msg) {
-    // simply print information into the tracker
-    if (GetRank() == 0) {
-      utils::Printf("%s", msg.c_str());
-    }
-  }
-
- private:
-  int version_number;
-};
-
-// singleton sync manager
-MPIEngine manager;
-
-/*! \brief intiialize the synchronization module */
-void Init(int argc, char *argv[]) {
-  MPI::Init(argc, argv);
-}
-/*! \brief finalize syncrhonization module */
-void Finalize(void) {
-  MPI::Finalize();
-}
-
-/*! \brief singleton method to get engine */
-IEngine *GetEngine(void) {
-  return &manager;
-}
-// transform enum to MPI data type
-inline MPI::Datatype GetType(mpi::DataType dtype) {
-  using namespace mpi;
-  switch (dtype) {
-    case kChar: return MPI::CHAR;
-    case kUChar: return MPI::BYTE;
-    case kInt: return MPI::INT;
-    case kUInt: return MPI::UNSIGNED;
-    case kLong: return MPI::LONG;
-    case kULong: return MPI::UNSIGNED_LONG;
-    case kFloat: return MPI::FLOAT;
-    case kDouble: return MPI::DOUBLE;
-    case kLongLong: return MPI::LONG_LONG;
-    case kULongLong: return MPI::UNSIGNED_LONG_LONG;
-  }
-  utils::Error("unknown mpi::DataType");
-  return MPI::CHAR;
-}
-// transform enum to MPI OP
-inline MPI::Op GetOp(mpi::OpType otype) {
-  using namespace mpi;
-  switch (otype) {
-    case kMax: return MPI::MAX;
-    case kMin: return MPI::MIN;
-    case kSum: return MPI::SUM;
-    case kBitwiseOR: return MPI::BOR;
-  }
-  utils::Error("unknown mpi::OpType");
-  return MPI::MAX;
-}
-// perform in-place allreduce, on sendrecvbuf
-void Allreduce_(void *sendrecvbuf,
-                size_t type_nbytes,
-                size_t count,
-                IEngine::ReduceFunction red,
-                mpi::DataType dtype,
-                mpi::OpType op,
-                IEngine::PreprocFunction prepare_fun,
-                void *prepare_arg) {
-  if (prepare_fun != NULL) prepare_fun(prepare_arg);
-  MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf,
-                            count, GetType(dtype), GetOp(op));
-}
-
-// code for reduce handle
-ReduceHandle::ReduceHandle(void)
-    : handle_(NULL), redfunc_(NULL), htype_(NULL) {
-}
-ReduceHandle::~ReduceHandle(void) {
-  if (handle_ != NULL) {
-    MPI::Op *op = reinterpret_cast<MPI::Op*>(handle_);
-    op->Free();
-    delete op;
-  }
-  if (htype_ != NULL) {
-    MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype_);
-    dtype->Free();
-    delete dtype;
-  }
-}
-int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
-  return dtype.Get_size();
-}
-void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {
-  utils::Assert(handle_ == NULL, "cannot initialize reduce handle twice");
-  if (type_nbytes != 0) {
-    MPI::Datatype *dtype = new MPI::Datatype();
-    if (type_nbytes % 8 == 0) {
-      *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));  // NOLINT(*)
-    } else if (type_nbytes % 4 == 0) {
-      *dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
-    } else {
-      *dtype = MPI::CHAR.Create_contiguous(type_nbytes);
-    }
-    dtype->Commit();
-    created_type_nbytes_ = type_nbytes;
-    htype_ = dtype;
-  }
-  MPI::Op *op = new MPI::Op();
-  MPI::User_function *pf = redfunc;
-  op->Init(pf, true);
-  handle_ = op;
-}
-void ReduceHandle::Allreduce(void *sendrecvbuf,
-                             size_t type_nbytes, size_t count,
-                             IEngine::PreprocFunction prepare_fun,
-                             void *prepare_arg) {
-  utils::Assert(handle_ != NULL, "must intialize handle to call AllReduce");
-  MPI::Op *op = reinterpret_cast<MPI::Op*>(handle_);
-  MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype_);
-  if (created_type_nbytes_ != type_nbytes || dtype == NULL) {
-    if (dtype == NULL) {
-      dtype = new MPI::Datatype();
-    } else {
-      dtype->Free();
-    }
-    if (type_nbytes % 8 == 0) {
-      *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));  // NOLINT(*)
-    } else if (type_nbytes % 4 == 0) {
-      *dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
-    } else {
-      *dtype = MPI::CHAR.Create_contiguous(type_nbytes);
-    }
-    dtype->Commit();
-    created_type_nbytes_ = type_nbytes;
-  }
-  if (prepare_fun != NULL) prepare_fun(prepare_arg);
-  MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, *dtype, *op);
-}
-}  // namespace engine
-}  // namespace rabit
diff --git a/subtree/rabit/src/socket.h b/subtree/rabit/src/socket.h
deleted file mode 100644
index 6df7a7b78..000000000
--- a/subtree/rabit/src/socket.h
+++ /dev/null
@@ -1,523 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file socket.h
- * \brief this file aims to provide a wrapper of sockets
- * \author Tianqi Chen
- */
-#ifndef RABIT_SOCKET_H_
-#define RABIT_SOCKET_H_
-#if defined(_WIN32)
-#include <winsock2.h>
-#include <ws2tcpip.h>
-#ifdef _MSC_VER
-#pragma comment(lib, "Ws2_32.lib")
-#endif
-#else
-#include <fcntl.h>
-#include <netdb.h>
-#include <errno.h>
-#include <unistd.h>
-#include <arpa/inet.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <sys/select.h>
-#include <sys/ioctl.h>
-#endif
-#include <string>
-#include <cstring>
-#include "../include/rabit/utils.h"
-
-#if defined(_WIN32)
-typedef int ssize_t;
-typedef int sock_size_t;
-#else
-typedef int SOCKET;
-typedef size_t sock_size_t;
-const int INVALID_SOCKET = -1;
-#endif
-
-namespace rabit {
-namespace utils {
-/*! \brief data structure for network address */
-struct SockAddr {
-  sockaddr_in addr;
-  // constructor
-  SockAddr(void) {}
-  SockAddr(const char *url, int port) {
-    this->Set(url, port);
-  }
-  inline static std::string GetHostName(void) {
-    std::string buf; buf.resize(256);
-    utils::Check(gethostname(&buf[0], 256) != -1, "fail to get host name");
-    return std::string(buf.c_str());
-  }
-  /*!
-   * \brief set the address
-   * \param url the url of the address
-   * \param port the port of address
-   */
-  inline void Set(const char *host, int port) {
-    hostent *hp = gethostbyname(host);
-    Check(hp != NULL, "cannot obtain address of %s", host);
-    memset(&addr, 0, sizeof(addr));
-    addr.sin_family = AF_INET;
-    addr.sin_port = htons(port);
-    memcpy(&addr.sin_addr, hp->h_addr_list[0], hp->h_length);
-  }
-  /*! \brief return port of the address*/
-  inline int port(void) const {
-    return ntohs(addr.sin_port);
-  }
-  /*! \return a string representation of the address */
-  inline std::string AddrStr(void) const {
-    std::string buf; buf.resize(256);
-#ifdef _WIN32
-    const char *s = inet_ntop(AF_INET, (PVOID)&addr.sin_addr,
-                              &buf[0], buf.length());
-#else
-    const char *s = inet_ntop(AF_INET, &addr.sin_addr,
-                              &buf[0], buf.length());
-#endif
-    Assert(s != NULL, "cannot decode address");
-    return std::string(s);
-  }
-};
-
-/*!
- * \brief base class containing common operations of TCP and UDP sockets
- */
-class Socket {
- public:
-  /*! \brief the file descriptor of socket */
-  SOCKET sockfd;
-  // default conversion to int
-  inline operator SOCKET() const {
-    return sockfd;
-  }
-  /*!
-   * \return last error of socket operation
-   */
-  inline static int GetLastError(void) {
-#ifdef _WIN32
-    return WSAGetLastError();
-#else
-    return errno;
-#endif
-  }
-  /*! \return whether last error was would block */
-  inline static bool LastErrorWouldBlock(void) {
-    int errsv = GetLastError();
-#ifdef _WIN32
-    return errsv == WSAEWOULDBLOCK;
-#else
-    return errsv == EAGAIN || errsv == EWOULDBLOCK;
-#endif
-  }
-  /*!
-   * \brief start up the socket module
-   *   call this before using the sockets
-   */
-  inline static void Startup(void) {
-#ifdef _WIN32
-    WSADATA wsa_data;
-    if (WSAStartup(MAKEWORD(2, 2), &wsa_data) == -1) {
-      Socket::Error("Startup");
-    }
-    if (LOBYTE(wsa_data.wVersion) != 2 || HIBYTE(wsa_data.wVersion) != 2) {
-      WSACleanup();
-      utils::Error("Could not find a usable version of Winsock.dll\n");
-    }
-#endif
-  }
-  /*!
-   * \brief shutdown the socket module after use, all sockets need to be closed
-   */
-  inline static void Finalize(void) {
-#ifdef _WIN32
-    WSACleanup();
-#endif
-  }
-  /*!
-   * \brief set this socket to use non-blocking mode
-   * \param non_block whether set it to be non-block, if it is false
-   *        it will set it back to block mode
-   */
-  inline void SetNonBlock(bool non_block) {
-#ifdef _WIN32
-    u_long mode = non_block ? 1 : 0;
-    if (ioctlsocket(sockfd, FIONBIO, &mode) != NO_ERROR) {
-      Socket::Error("SetNonBlock");
-    }
-#else
-    int flag = fcntl(sockfd, F_GETFL, 0);
-    if (flag == -1) {
-      Socket::Error("SetNonBlock-1");
-    }
-    if (non_block) {
-      flag |= O_NONBLOCK;
-    } else {
-      flag &= ~O_NONBLOCK;
-    }
-    if (fcntl(sockfd, F_SETFL, flag) == -1) {
-      Socket::Error("SetNonBlock-2");
-    }
-#endif
-  }
-  /*!
-   * \brief bind the socket to an address
-   * \param addr
-   */
-  inline void Bind(const SockAddr &addr) {
-    if (bind(sockfd, reinterpret_cast<const sockaddr*>(&addr.addr),
-             sizeof(addr.addr)) == -1) {
-      Socket::Error("Bind");
-    }
-  }
-  /*!
-   * \brief try bind the socket to host, from start_port to end_port
-   * \param start_port starting port number to try
-   * \param end_port ending port number to try
-   * \return the port successfully bind to, return -1 if failed to bind any port
-   */
-  inline int TryBindHost(int start_port, int end_port) {
-    // TODO(tqchen) add prefix check
-    for (int port = start_port; port < end_port; ++port) {
-      SockAddr addr("0.0.0.0", port);
-      if (bind(sockfd, reinterpret_cast<sockaddr*>(&addr.addr),
-               sizeof(addr.addr)) == 0) {
-        return port;
-      }
-#if defined(_WIN32)
-      if (WSAGetLastError() != WSAEADDRINUSE) {
-        Socket::Error("TryBindHost");
-      }
-#else
-      if (errno != EADDRINUSE) {
-        Socket::Error("TryBindHost");
-      }
-#endif
-    }
-
-    return -1;
-  }
-  /*! \brief get last error code if any */
-  inline int GetSockError(void) const {
-    int error = 0;
-    socklen_t len = sizeof(error);
-    if (getsockopt(sockfd,  SOL_SOCKET, SO_ERROR, reinterpret_cast<char*>(&error), &len) != 0) {
-      Error("GetSockError");
-    }
-    return error;
-  }
-  /*! \brief check if anything bad happens */
-  inline bool BadSocket(void) const {
-    if (IsClosed()) return true;
-    int err = GetSockError();
-    if (err == EBADF || err == EINTR) return true;
-    return false;
-  }
-  /*! \brief check if socket is already closed */
-  inline bool IsClosed(void) const {
-    return sockfd == INVALID_SOCKET;
-  }
-  /*! \brief close the socket */
-  inline void Close(void) {
-    if (sockfd != INVALID_SOCKET) {
-#ifdef _WIN32
-      closesocket(sockfd);
-#else
-      close(sockfd);
-#endif
-      sockfd = INVALID_SOCKET;
-    } else {
-      Error("Socket::Close double close the socket or close without create");
-    }
-  }
-  // report an socket error
-  inline static void Error(const char *msg) {
-    int errsv = GetLastError();
-#ifdef _WIN32
-    utils::Error("Socket %s Error:WSAError-code=%d", msg, errsv);
-#else
-    utils::Error("Socket %s Error:%s", msg, strerror(errsv));
-#endif
-  }
-
- protected:
-  explicit Socket(SOCKET sockfd) : sockfd(sockfd) {
-  }
-};
-
-/*!
- * \brief a wrapper of TCP socket that hopefully be cross platform
- */
-class TCPSocket : public Socket{
- public:
-  // constructor
-  TCPSocket(void) : Socket(INVALID_SOCKET) {
-  }
-  explicit TCPSocket(SOCKET sockfd) : Socket(sockfd) {
-  }
-  /*!
-   * \brief enable/disable TCP keepalive
-   * \param keepalive whether to set the keep alive option on
-   */
-  inline void SetKeepAlive(bool keepalive) {
-    int opt = static_cast<int>(keepalive);
-    if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE,
-                   reinterpret_cast<char*>(&opt), sizeof(opt)) < 0) {
-      Socket::Error("SetKeepAlive");
-    }
-  }
-  /*!
-   * \brief create the socket, call this before using socket
-   * \param af domain
-   */
-  inline void Create(int af = PF_INET) {
-    sockfd = socket(PF_INET, SOCK_STREAM, 0);
-    if (sockfd == INVALID_SOCKET) {
-      Socket::Error("Create");
-    }
-  }
-  /*!
-   * \brief perform listen of the socket
-   * \param backlog backlog parameter
-   */
-  inline void Listen(int backlog = 16) {
-    listen(sockfd, backlog);
-  }
-  /*! \brief get a new connection */
-  TCPSocket Accept(void) {
-    SOCKET newfd = accept(sockfd, NULL, NULL);
-    if (newfd == INVALID_SOCKET) {
-      Socket::Error("Accept");
-    }
-    return TCPSocket(newfd);
-  }
-  /*!
-   * \brief decide whether the socket is at OOB mark
-   * \return 1 if at mark, 0 if not, -1 if an error occured
-   */
-  inline int AtMark(void) const {
-#ifdef _WIN32
-    unsigned long atmark;  // NOLINT(*)
-    if (ioctlsocket(sockfd, SIOCATMARK, &atmark) != NO_ERROR) return -1;
-#else
-    int atmark;
-    if (ioctl(sockfd, SIOCATMARK, &atmark) == -1) return -1;
-#endif
-    return static_cast<int>(atmark);
-  }
-  /*!
-   * \brief connect to an address
-   * \param addr the address to connect to
-   * \return whether connect is successful
-   */
-  inline bool Connect(const SockAddr &addr) {
-    return connect(sockfd, reinterpret_cast<const sockaddr*>(&addr.addr),
-                   sizeof(addr.addr)) == 0;
-  }
-  /*!
-   * \brief send data using the socket
-   * \param buf the pointer to the buffer
-   * \param len the size of the buffer
-   * \param flags extra flags
-   * \return size of data actually sent
-   *         return -1 if error occurs
-   */
-  inline ssize_t Send(const void *buf_, size_t len, int flag = 0) {
-    const char *buf = reinterpret_cast<const char*>(buf_);
-    return send(sockfd, buf, static_cast<sock_size_t>(len), flag);
-  }
-  /*!
-   * \brief receive data using the socket
-   * \param buf_ the pointer to the buffer
-   * \param len the size of the buffer
-   * \param flags extra flags
-   * \return size of data actually received
-   *         return -1 if error occurs
-   */
-  inline ssize_t Recv(void *buf_, size_t len, int flags = 0) {
-    char *buf = reinterpret_cast<char*>(buf_);
-    return recv(sockfd, buf, static_cast<sock_size_t>(len), flags);
-  }
-  /*!
-   * \brief peform block write that will attempt to send all data out
-   *    can still return smaller than request when error occurs
-   * \param buf the pointer to the buffer
-   * \param len the size of the buffer
-   * \return size of data actually sent
-   */
-  inline size_t SendAll(const void *buf_, size_t len) {
-    const char *buf = reinterpret_cast<const char*>(buf_);
-    size_t ndone = 0;
-    while (ndone <  len) {
-      ssize_t ret = send(sockfd, buf, static_cast<ssize_t>(len - ndone), 0);
-      if (ret == -1) {
-        if (LastErrorWouldBlock()) return ndone;
-        Socket::Error("SendAll");
-      }
-      buf += ret;
-      ndone += ret;
-    }
-    return ndone;
-  }
-  /*!
-   * \brief peforma block read that will attempt to read all data
-   *    can still return smaller than request when error occurs
-   * \param buf_ the buffer pointer
-   * \param len length of data to recv
-   * \return size of data actually sent
-   */
-  inline size_t RecvAll(void *buf_, size_t len) {
-    char *buf = reinterpret_cast<char*>(buf_);
-    size_t ndone = 0;
-    while (ndone <  len) {
-      ssize_t ret = recv(sockfd, buf,
-                         static_cast<sock_size_t>(len - ndone), MSG_WAITALL);
-      if (ret == -1) {
-        if (LastErrorWouldBlock()) return ndone;
-        Socket::Error("RecvAll");
-      }
-      if (ret == 0) return ndone;
-      buf += ret;
-      ndone += ret;
-    }
-    return ndone;
-  }
-  /*!
-   * \brief send a string over network
-   * \param str the string to be sent
-   */
-  inline void SendStr(const std::string &str) {
-    int len = static_cast<int>(str.length());
-    utils::Assert(this->SendAll(&len, sizeof(len)) == sizeof(len),
-                  "error during send SendStr");
-    if (len != 0) {
-      utils::Assert(this->SendAll(str.c_str(), str.length()) == str.length(),
-                    "error during send SendStr");
-    }
-  }
-  /*!
-   * \brief recv a string from network
-   * \param out_str the string to receive
-   */
-  inline void RecvStr(std::string *out_str) {
-    int len;
-    utils::Assert(this->RecvAll(&len, sizeof(len)) == sizeof(len),
-                  "error during send RecvStr");
-    out_str->resize(len);
-    if (len != 0) {
-      utils::Assert(this->RecvAll(&(*out_str)[0], len) == out_str->length(),
-                    "error during send SendStr");
-    }
-  }
-};
-
-/*! \brief helper data structure to perform select */
-struct SelectHelper {
- public:
-  SelectHelper(void) {
-    FD_ZERO(&read_set);
-    FD_ZERO(&write_set);
-    FD_ZERO(&except_set);
-    maxfd = 0;
-  }
-  /*!
-   * \brief add file descriptor to watch for read
-   * \param fd file descriptor to be watched
-   */
-  inline void WatchRead(SOCKET fd) {
-    FD_SET(fd, &read_set);
-    if (fd > maxfd) maxfd = fd;
-  }
-  /*!
-   * \brief add file descriptor to watch for write
-   * \param fd file descriptor to be watched
-   */
-  inline void WatchWrite(SOCKET fd) {
-    FD_SET(fd, &write_set);
-    if (fd > maxfd) maxfd = fd;
-  }
-  /*!
-   * \brief add file descriptor to watch for exception
-   * \param fd file descriptor to be watched
-   */
-  inline void WatchException(SOCKET fd) {
-    FD_SET(fd, &except_set);
-    if (fd > maxfd) maxfd = fd;
-  }
-  /*!
-   * \brief Check if the descriptor is ready for read
-   * \param fd file descriptor to check status
-   */
-  inline bool CheckRead(SOCKET fd) const {
-    return FD_ISSET(fd, &read_set) != 0;
-  }
-  /*!
-   * \brief Check if the descriptor is ready for write
-   * \param fd file descriptor to check status
-   */
-  inline bool CheckWrite(SOCKET fd) const {
-    return FD_ISSET(fd, &write_set) != 0;
-  }
-  /*!
-   * \brief Check if the descriptor has any exception
-   * \param fd file descriptor to check status
-   */
-  inline bool CheckExcept(SOCKET fd) const {
-    return FD_ISSET(fd, &except_set) != 0;
-  }
-  /*!
-   * \brief wait for exception event on a single descriptor
-   * \param fd the file descriptor to wait the event for
-   * \param timeout the timeout counter, can be 0, which means wait until the event happen
-   * \return 1 if success, 0 if timeout, and -1 if error occurs
-   */
-  inline static int WaitExcept(SOCKET fd, long timeout = 0) { // NOLINT(*)
-    fd_set wait_set;
-    FD_ZERO(&wait_set);
-    FD_SET(fd, &wait_set);
-    return Select_(static_cast<int>(fd + 1),
-                   NULL, NULL, &wait_set, timeout);
-  }
-  /*!
-   * \brief peform select on the set defined
-   * \param select_read whether to watch for read event
-   * \param select_write whether to watch for write event
-   * \param select_except whether to watch for exception event
-   * \param timeout specify timeout in micro-seconds(ms) if equals 0, means select will always block
-   * \return number of active descriptors selected,
-   *         return -1 if error occurs
-   */
-  inline int Select(long timeout = 0) {  // NOLINT(*)
-    int ret =  Select_(static_cast<int>(maxfd + 1),
-                       &read_set, &write_set, &except_set, timeout);
-    if (ret == -1) {
-      Socket::Error("Select");
-    }
-    return ret;
-  }
-
- private:
-  inline static int Select_(int maxfd, fd_set *rfds,
-                            fd_set *wfds, fd_set *efds, long timeout) { // NOLINT(*)
-#if !defined(_WIN32)
-    utils::Assert(maxfd < FD_SETSIZE, "maxdf must be smaller than FDSETSIZE");
-#endif
-    if (timeout == 0) {
-      return select(maxfd, rfds, wfds, efds, NULL);
-    } else {
-      timeval tm;
-      tm.tv_usec = (timeout % 1000) * 1000;
-      tm.tv_sec = timeout / 1000;
-      return select(maxfd, rfds, wfds, efds, &tm);
-    }
-  }
-
-  SOCKET maxfd;
-  fd_set read_set, write_set, except_set;
-};
-}  // namespace utils
-}  // namespace rabit
-#endif  // RABIT_SOCKET_H_
diff --git a/subtree/rabit/test/.gitignore b/subtree/rabit/test/.gitignore
deleted file mode 100644
index eb87d8f26..000000000
--- a/subtree/rabit/test/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-*.mpi
-test_*
-*_test
-*_recover
diff --git a/subtree/rabit/test/Makefile b/subtree/rabit/test/Makefile
deleted file mode 100644
index 62e4e17f0..000000000
--- a/subtree/rabit/test/Makefile
+++ /dev/null
@@ -1,41 +0,0 @@
-export CC  = gcc
-export CXX = g++
-export MPICXX = mpicxx
-export LDFLAGS=  -L../lib -pthread -lm  -lrt 
-export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fPIC -I../include  -std=c++0x
-
-# specify tensor path
-BIN = speed_test model_recover local_recover lazy_recover
-OBJ = $(RABIT_OBJ) speed_test.o model_recover.o local_recover.o lazy_recover.o
-MPIBIN = speed_test.mpi
-.PHONY: clean all lib mpi
-
-all: $(BIN) $(MPIBIN)
-lib:
-	cd ..;make;cd -
-mpi:
-	cd ..;make mpi;cd -
-# programs 
-speed_test.o: speed_test.cc ../include/*.h lib mpi
-model_recover.o: model_recover.cc ../include/*.h lib
-local_recover.o: local_recover.cc ../include/*.h lib
-lazy_recover.o: lazy_recover.cc ../include/*.h lib
-
-# we can link against MPI version to get use MPI
-speed_test: speed_test.o  $(RABIT_OBJ)
-speed_test.mpi: speed_test.o $(MPIOBJ)
-model_recover: model_recover.o  $(RABIT_OBJ)
-local_recover: local_recover.o  $(RABIT_OBJ)
-lazy_recover: lazy_recover.o  $(RABIT_OBJ)
-
-$(BIN) : 
-	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) -lrabit_mock $(LDFLAGS) 
-
-$(OBJ) : 
-	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
-
-$(MPIBIN) : 
-	$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) $(LDFLAGS) -lrabit_mpi
-
-clean:
-	$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) *~ ../src/*~
diff --git a/subtree/rabit/test/README.md b/subtree/rabit/test/README.md
deleted file mode 100644
index fb68112bf..000000000
--- a/subtree/rabit/test/README.md
+++ /dev/null
@@ -1,18 +0,0 @@
-Testcases of Rabit
-====
-This folder contains internal testcases to test correctness and efficiency of rabit API
-
-The example running scripts for testcases are given by test.mk
-* type ```make -f test.mk testcasename``` to run certain testcase
-
-
-Helper Scripts
-====
-* test.mk contains Makefile documentation of all testcases
-* keepalive.sh helper bash to restart a program when it dies abnormally
-
-List of Programs
-====
-* speed_test: test the running speed of rabit API
-* test_local_recover: test recovery of local state when error happens
-* test_model_recover: test recovery of global state when error happens
diff --git a/subtree/rabit/test/lazy_recover.cc b/subtree/rabit/test/lazy_recover.cc
deleted file mode 100644
index 610a20664..000000000
--- a/subtree/rabit/test/lazy_recover.cc
+++ /dev/null
@@ -1,126 +0,0 @@
-// this is a test case to test whether rabit can recover model when 
-// facing an exception
-#include <rabit.h>
-#include <rabit/utils.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cmath>
-using namespace rabit;
-
-// dummy model
-class Model : public rabit::Serializable {
- public:
-  // iterations
-  std::vector<float> data;
-  // load from stream
-  virtual void Load(rabit::Stream *fi) {
-    fi->Read(&data);
-  }
-  /*! \brief save the model to the stream */
-  virtual void Save(rabit::Stream *fo) const {
-    fo->Write(data);
-  }
-  virtual void InitModel(size_t n) {
-    data.clear();
-    data.resize(n, 1.0f);
-  }
-};
-
-inline void TestMax(Model *model, int ntrial, int iter) {
-  int rank = rabit::GetRank();
-  int nproc = rabit::GetWorldSize();
-  const int z = iter + 111;
-
-  std::vector<float> ndata(model->data.size());
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    ndata[i] = (i * (rank+1)) % z  + model->data[i];
-  }
-  rabit::Allreduce<op::Max>(&ndata[0], ndata.size());  
-
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    float rmax = (i * 1) % z + model->data[i];
-    for (int r = 0; r < nproc; ++r) {
-      rmax = std::max(rmax, (float)((i * (r+1)) % z) + model->data[i]);
-    }
-    utils::Check(rmax == ndata[i], "[%d] TestMax check failurem i=%lu, rmax=%f, ndata=%f", rank, i, rmax, ndata[i]);
-  }
-}
-
-inline void TestSum(Model *model, int ntrial, int iter) {
-  int rank = rabit::GetRank();
-  int nproc = rabit::GetWorldSize();
-  const int z = 131 + iter;
-
-  std::vector<float> ndata(model->data.size());
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    ndata[i] = (i * (rank+1)) % z + model->data[i];
-  }
-  Allreduce<op::Sum>(&ndata[0], ndata.size());
-
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    float rsum = model->data[i] * nproc;
-    for (int r = 0; r < nproc; ++r) {
-      rsum += (float)((i * (r+1)) % z);
-    }
-    utils::Check(fabsf(rsum - ndata[i]) < 1e-5 ,
-                 "[%d] TestSum check failure, local=%g, allreduce=%g", rank, rsum, ndata[i]);
-  }
-  model->data = ndata;
-}
-
-inline void TestBcast(size_t n, int root, int ntrial, int iter) {
-  int rank = rabit::GetRank();
-  std::string s; s.resize(n);      
-  for (size_t i = 0; i < n; ++i) {
-    s[i] = char(i % 126 + 1);
-  }
-  std::string res;
-  if (root == rank) {
-    res = s;
-    rabit::Broadcast(&res, root);
-  } else {
-    rabit::Broadcast(&res, root);
-  }
-  utils::Check(res == s, "[%d] TestBcast fail", rank);
-}
-
-int main(int argc, char *argv[]) {
-  if (argc < 3) {
-    printf("Usage: <ndata> <config>\n");
-    return 0;
-  }
-  int n = atoi(argv[1]);
-  rabit::Init(argc, argv);
-  int rank = rabit::GetRank();
-  int nproc = rabit::GetWorldSize();
-  std::string name = rabit::GetProcessorName();
-  Model model;  
-  srand(0);
-  int ntrial = 0;
-  for (int i = 1; i < argc; ++i) {
-    int n;
-    if (sscanf(argv[i], "rabit_num_trial=%d", &n) == 1) ntrial = n; 
-  } 
-  int iter = rabit::LoadCheckPoint(&model);
-  if (iter == 0) {
-    model.InitModel(n);
-    printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
-  } else {
-    printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
-  }
-  for (int r = iter; r < 3; ++r) { 
-    TestMax(&model, ntrial, r);
-    printf("[%d] !!!TestMax pass, iter=%d\n",  rank, r);  
-    int step = std::max(nproc / 3, 1);
-    for (int i = 0; i < nproc; i += step) {
-      TestBcast(n, i, ntrial, r);
-    }
-    printf("[%d] !!!TestBcast pass, iter=%d\n", rank, r);
-    TestSum(&model, ntrial, r);
-    printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
-    rabit::LazyCheckPoint(&model);
-    printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r);
-  }
-  rabit::Finalize();
-  return 0;
-}
diff --git a/subtree/rabit/test/local_recover.cc b/subtree/rabit/test/local_recover.cc
deleted file mode 100644
index 5162d5a2d..000000000
--- a/subtree/rabit/test/local_recover.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-// this is a test case to test whether rabit can recover model when 
-// facing an exception
-#include <rabit.h>
-#include <rabit/utils.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cmath>
-
-using namespace rabit;
-
-// dummy model
-class Model : public rabit::Serializable {
- public:
-  // iterations
-  std::vector<float> data;
-  // load from stream
-  virtual void Load(rabit::Stream *fi) {
-    fi->Read(&data);
-  }
-  /*! \brief save the model to the stream */
-  virtual void Save(rabit::Stream *fo) const {
-    fo->Write(data);
-  }
-  virtual void InitModel(size_t n, float v) {
-    data.clear();
-    data.resize(n, v);
-  }
-};
-
-inline void TestMax(Model *model, Model *local, int ntrial, int iter) {
-  int rank = rabit::GetRank();
-  int nproc = rabit::GetWorldSize();
-  const int z = iter + 111;  
-  std::vector<float> ndata(model->data.size());
-  rabit::Allreduce<op::Max>(&ndata[0], ndata.size(), 
-                            [&]() {
-                              // use lambda expression to prepare the data
-                              for (size_t i = 0; i < ndata.size(); ++i) {
-                                ndata[i] = (i * (rank+1)) % z  + local->data[i];
-                              }
-                            });  
-
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    float rmax = (i * 1) % z + model->data[i];
-    for (int r = 0; r < nproc; ++r) {
-      rmax = std::max(rmax, (float)((i * (r+1)) % z) + model->data[i] + r);
-    }
-    utils::Check(rmax == ndata[i], "[%d] TestMax check failure", rank);
-  }
-  model->data = ndata;
-  local->data = ndata;
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    local->data[i] = ndata[i] + rank;
-  }
-}
-
-inline void TestSum(Model *model, Model *local, int ntrial, int iter) {
-  int rank = rabit::GetRank();
-  int nproc = rabit::GetWorldSize();
-  const int z = 131 + iter;
-
-  std::vector<float> ndata(model->data.size());
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    ndata[i] = (i * (rank+1)) % z + local->data[i];
-  }
-  Allreduce<op::Sum>(&ndata[0], ndata.size());
-  
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    float rsum = 0.0f;
-    for (int r = 0; r < nproc; ++r) {
-      rsum += (float)((i * (r+1)) % z) + model->data[i] + r;
-    }
-    utils::Check(fabsf(rsum - ndata[i]) < 1e-5 ,
-                 "[%d] TestSum check failure, local=%g, allreduce=%g", rank, rsum, ndata[i]);
-  }
-  model->data = ndata;
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    local->data[i] = ndata[i] + rank;
-  }
-}
-
-inline void TestBcast(size_t n, int root, int ntrial, int iter) {
-  int rank = rabit::GetRank();
-  std::string s; s.resize(n);      
-  for (size_t i = 0; i < n; ++i) {
-    s[i] = char(i % 126 + 1);
-  }
-  std::string res;
-  if (root == rank) {
-    res = s;
-    rabit::Broadcast(&res, root);
-  } else {
-    rabit::Broadcast(&res, root);
-  }
-  utils::Check(res == s, "[%d] TestBcast fail", rank);
-}
-
-int main(int argc, char *argv[]) {
-  if (argc < 3) {
-    printf("Usage: <ndata>\n");
-    return 0;
-  }
-  int n = atoi(argv[1]);
-  rabit::Init(argc, argv);
-  int rank = rabit::GetRank();
-  int nproc = rabit::GetWorldSize();
-  std::string name = rabit::GetProcessorName();
-  Model model, local;  
-  srand(0);
-  int ntrial = 0;
-  for (int i = 1; i < argc; ++i) {
-    int n;
-    if (sscanf(argv[i], "repeat=%d", &n) == 1) ntrial = n; 
-  } 
-  int iter = rabit::LoadCheckPoint(&model, &local);
-  if (iter == 0) {
-    model.InitModel(n, 1.0f);
-    local.InitModel(n, 1.0f + rank);
-    printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
-  } else {
-    printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
-  }
-  for (int r = iter; r < 3; ++r) { 
-    TestMax(&model, &local, ntrial, r);
-    printf("[%d] !!!TestMax pass, iter=%d\n",  rank, r);  
-    int step = std::max(nproc / 3, 1);
-    for (int i = 0; i < nproc; i += step) {
-      TestBcast(n, i, ntrial, r);
-    }
-    printf("[%d] !!!TestBcast pass, iter=%d\n", rank, r);
-    TestSum(&model, &local, ntrial, r);
-    printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
-    rabit::CheckPoint(&model, &local);
-    printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r);
-  }
-  rabit::Finalize();
-  return 0;
-}
diff --git a/subtree/rabit/test/local_recover.py b/subtree/rabit/test/local_recover.py
deleted file mode 100755
index e35bd3177..000000000
--- a/subtree/rabit/test/local_recover.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/python
-import rabit
-import numpy as np
-
-rabit.init(lib='mock')
-rank = rabit.get_rank()
-n = 10
-nround = 3
-data = np.ones(n) * rank
-
-version, model, local = rabit.load_checkpoint(True)
-if version == 0:
-    model = np.zeros(n)
-    local = np.ones(n)
-else:
-    print '[%d] restart from version %d' % (rank, version)
-
-for i in xrange(version, nround):    
-    res = rabit.allreduce(data + model+local, rabit.SUM)
-    print '[%d] iter=%d: %s' % (rank, i, str(res))
-    model = res
-    local[:] = i
-    rabit.checkpoint(model, local)
-
-rabit.finalize()
diff --git a/subtree/rabit/test/model_recover.cc b/subtree/rabit/test/model_recover.cc
deleted file mode 100644
index f833ef295..000000000
--- a/subtree/rabit/test/model_recover.cc
+++ /dev/null
@@ -1,127 +0,0 @@
-// this is a test case to test whether rabit can recover model when 
-// facing an exception
-#include <rabit.h>
-#include <rabit/utils.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cmath>
-using namespace rabit;
-
-// dummy model
-class Model : public rabit::Serializable {
- public:
-  // iterations
-  std::vector<float> data;
-  // load from stream
-  virtual void Load(rabit::Stream *fi) {
-    fi->Read(&data);
-  }
-  /*! \brief save the model to the stream */
-  virtual void Save(rabit::Stream *fo) const {
-    fo->Write(data);
-  }
-  virtual void InitModel(size_t n) {
-    data.clear();
-    data.resize(n, 1.0f);
-  }
-};
-
-inline void TestMax(Model *model, int ntrial, int iter) {
-  int rank = rabit::GetRank();
-  int nproc = rabit::GetWorldSize();
-  const int z = iter + 111;
-
-  std::vector<float> ndata(model->data.size());
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    ndata[i] = (i * (rank+1)) % z  + model->data[i];
-  }
-  rabit::Allreduce<op::Max>(&ndata[0], ndata.size());  
-
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    float rmax = (i * 1) % z + model->data[i];
-    for (int r = 0; r < nproc; ++r) {
-      rmax = std::max(rmax, (float)((i * (r+1)) % z) + model->data[i]);
-    }
-    utils::Check(rmax == ndata[i], "[%d] TestMax check failurem i=%lu, rmax=%f, ndata=%f", rank, i, rmax, ndata[i]);
-  }
-  model->data = ndata;
-}
-
-inline void TestSum(Model *model, int ntrial, int iter) {
-  int rank = rabit::GetRank();
-  int nproc = rabit::GetWorldSize();
-  const int z = 131 + iter;
-
-  std::vector<float> ndata(model->data.size());
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    ndata[i] = (i * (rank+1)) % z + model->data[i];
-  }
-  Allreduce<op::Sum>(&ndata[0], ndata.size());
-
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    float rsum = model->data[i] * nproc;
-    for (int r = 0; r < nproc; ++r) {
-      rsum += (float)((i * (r+1)) % z);
-    }
-    utils::Check(fabsf(rsum - ndata[i]) < 1e-5 ,
-                 "[%d] TestSum check failure, local=%g, allreduce=%g", rank, rsum, ndata[i]);
-  }
-  model->data = ndata;
-}
-
-inline void TestBcast(size_t n, int root, int ntrial, int iter) {
-  int rank = rabit::GetRank();
-  std::string s; s.resize(n);      
-  for (size_t i = 0; i < n; ++i) {
-    s[i] = char(i % 126 + 1);
-  }
-  std::string res;
-  if (root == rank) {
-    res = s;
-    rabit::Broadcast(&res, root);
-  } else {
-    rabit::Broadcast(&res, root);
-  }
-  utils::Check(res == s, "[%d] TestBcast fail", rank);
-}
-
-int main(int argc, char *argv[]) {
-  if (argc < 3) {
-    printf("Usage: <ndata> <config>\n");
-    return 0;
-  }
-  int n = atoi(argv[1]);
-  rabit::Init(argc, argv);
-  int rank = rabit::GetRank();
-  int nproc = rabit::GetWorldSize();
-  std::string name = rabit::GetProcessorName();
-  Model model;  
-  srand(0);
-  int ntrial = 0;
-  for (int i = 1; i < argc; ++i) {
-    int n;
-    if (sscanf(argv[i], "rabit_num_trial=%d", &n) == 1) ntrial = n; 
-  } 
-  int iter = rabit::LoadCheckPoint(&model);
-  if (iter == 0) {
-    model.InitModel(n);
-    printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
-  } else {
-    printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
-  }
-  for (int r = iter; r < 3; ++r) { 
-    TestMax(&model, ntrial, r);
-    printf("[%d] !!!TestMax pass, iter=%d\n",  rank, r);  
-    int step = std::max(nproc / 3, 1);
-    for (int i = 0; i < nproc; i += step) {
-      TestBcast(n, i, ntrial, r);
-    }
-    printf("[%d] !!!TestBcast pass, iter=%d\n", rank, r);
-    TestSum(&model, ntrial, r);
-    printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
-    rabit::CheckPoint(&model);
-    printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r);
-  }
-  rabit::Finalize();
-  return 0;
-}
diff --git a/subtree/rabit/test/speed_runner.py b/subtree/rabit/test/speed_runner.py
deleted file mode 100644
index 1644bfe99..000000000
--- a/subtree/rabit/test/speed_runner.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-import argparse
-import sys
-
-def main():
-  parser = argparse.ArgumentParser(description='TODO')
-  parser.add_argument('-ho', '--host_dir', required=True)
-  parser.add_argument('-s', '--submit_script', required=True)
-  parser.add_argument('-rex', '--rabit_exec', required=True)
-  parser.add_argument('-mpi', '--mpi_exec', required=True)
-  args = parser.parse_args()
-
-  ndata = [10**4, 10**5, 10**6, 10**7]
-  nrepeat = [10**4, 10**3, 10**2, 10]
-
-  machines = [2,4,8,16,31]
-
-  executables = [args.rabit_exec, args.mpi_exec]
-
-  for executable in executables:
-    sys.stderr.write('Executable %s' % executable)
-    sys.stderr.flush()
-    for i, data in enumerate(ndata):
-      for machine in machines:
-        host_file = os.path.join(args.host_dir, 'hosts%d' % machine)
-        cmd = 'python %s %d %s %s %d %d' % (args.submit_script, machine, host_file, executable, data, nrepeat[i])
-        sys.stderr.write('data=%d, repeat=%d, machine=%d\n' % (data, nrepeat[i], machine))
-        sys.stderr.flush()
-        os.system(cmd)
-    sys.stderr.write('\n')
-    sys.stderr.flush()
-
-if __name__ == "__main__":
-  main()
diff --git a/subtree/rabit/test/speed_test.cc b/subtree/rabit/test/speed_test.cc
deleted file mode 100644
index 68891bd31..000000000
--- a/subtree/rabit/test/speed_test.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-// This program is used to test the speed of rabit API
-#include <rabit.h>
-#include <rabit/utils.h>
-#include <rabit/timer.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cmath>
-#include <time.h>
-
-using namespace rabit;
-
-double max_tdiff, sum_tdiff, bcast_tdiff, tot_tdiff;
-
-inline void TestMax(size_t n) {
-  int rank = rabit::GetRank();
-  std::vector<float> ndata(n);
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    ndata[i] = (i * (rank+1)) % 111;
-  }
-  double tstart = utils::GetTime();
-  rabit::Allreduce<op::Max>(&ndata[0], ndata.size());
-  max_tdiff += utils::GetTime() - tstart;
-}
-
-inline void TestSum(size_t n) {
-  int rank = rabit::GetRank();
-  const int z = 131;
-  std::vector<float> ndata(n);
-  for (size_t i = 0; i < ndata.size(); ++i) {
-    ndata[i] = (i * (rank+1)) % z;
-  }
-  double tstart = utils::GetTime();
-  rabit::Allreduce<op::Sum>(&ndata[0], ndata.size());  
-  sum_tdiff += utils::GetTime() - tstart;
-}
-
-inline void TestBcast(size_t n, int root) {
-  int rank = rabit::GetRank();
-  std::string s; s.resize(n);
-  for (size_t i = 0; i < n; ++i) {
-    s[i] = char(i % 126 + 1);
-  }
-  std::string res;
-  res.resize(n);
-  if (root == rank) {
-    res = s;
-  }
-  double tstart = utils::GetTime();  
-  rabit::Broadcast(&res[0], res.length(), root);
-  bcast_tdiff += utils::GetTime() - tstart;  
-}
-
-inline void PrintStats(const char *name, double tdiff, int n, int nrep, size_t size) {
-  int nproc = rabit::GetWorldSize();
-  double tsum = tdiff;
-  rabit::Allreduce<op::Sum>(&tsum, 1);
-  double tavg = tsum / nproc;
-  double tsqr = tdiff - tavg;
-  tsqr *= tsqr;
-  rabit::Allreduce<op::Sum>(&tsqr, 1);
-  double tstd = sqrt(tsqr / nproc);
-  if (rabit::GetRank() == 0) {
-    rabit::TrackerPrintf("%s: mean=%g, std=%g sec\n", name, tavg, tstd);
-    double ndata = n;
-    ndata *= nrep * size;
-    if (n != 0) {
-      rabit::TrackerPrintf("%s-speed: %g MB/sec\n", name, (ndata / tavg) / 1024 / 1024 );
-    }
-  }
-}
-
-int main(int argc, char *argv[]) {
-  if (argc < 3) {
-    printf("Usage: <ndata> <nrepeat>\n");
-    return 0;
-  }
-  srand(0);
-  int n = atoi(argv[1]);
-  int nrep = atoi(argv[2]);
-  utils::Check(nrep >= 1, "need to at least repeat running once");
-  rabit::Init(argc, argv);
-  //int rank = rabit::GetRank();
-  int nproc = rabit::GetWorldSize();
-  std::string name = rabit::GetProcessorName();
-  max_tdiff = sum_tdiff = bcast_tdiff = 0;
-  double tstart = utils::GetTime();
-  for (int i = 0; i < nrep; ++i) {
-    TestMax(n);
-    TestSum(n);
-    TestBcast(n, rand() % nproc);
-  }
-  tot_tdiff = utils::GetTime() - tstart;
-  // use allreduce to get the sum and std of time
-  PrintStats("max_tdiff", max_tdiff, n, nrep, sizeof(float));
-  PrintStats("sum_tdiff", sum_tdiff, n, nrep, sizeof(float));
-  PrintStats("bcast_tdiff", bcast_tdiff, n, nrep, sizeof(char));
-  PrintStats("tot_tdiff", tot_tdiff, 0, nrep, sizeof(float));
-  rabit::Finalize();
-  return 0;
-}
diff --git a/subtree/rabit/test/test.mk b/subtree/rabit/test/test.mk
deleted file mode 100644
index 282a82bc4..000000000
--- a/subtree/rabit/test/test.mk
+++ /dev/null
@@ -1,29 +0,0 @@
-# this is a makefile used to show testcases of rabit
-.PHONY: all
-
-all: model_recover_10_10k  model_recover_10_10k_die_same
-
-# this experiment test recovery with actually process exit, use keepalive to keep program alive
-model_recover_10_10k:
-	../tracker/rabit_demo.py -n 10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
-
-model_recover_10_10k_die_same:
-	../tracker/rabit_demo.py -n 10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
-
-model_recover_10_10k_die_hard:
-	../tracker/rabit_demo.py -n 10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
-
-local_recover_10_10k:
-	../tracker/rabit_demo.py -n 10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
-
-pylocal_recover_10_10k:
-	../tracker/rabit_demo.py -n 10 ./local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
-
-lazy_recover_10_10k_die_hard:
-	../tracker/rabit_demo.py -n 10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
-
-lazy_recover_10_10k_die_same:
-	../tracker/rabit_demo.py -n 10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
-
-ringallreduce_10_10k:
-	../tracker/rabit_demo.py -v  1 -n 10 model_recover 100 rabit_reduce_ring_mincount=10
diff --git a/subtree/rabit/tracker/README.md b/subtree/rabit/tracker/README.md
deleted file mode 100644
index 23d14b079..000000000
--- a/subtree/rabit/tracker/README.md
+++ /dev/null
@@ -1,12 +0,0 @@
-Trackers
-=====
-This folder contains tracker scripts that can be used to submit yarn jobs to different platforms,
-the example guidelines are in the script themselfs
-
-***Supported Platforms***
-* Local demo: [rabit_demo.py](rabit_demo.py)
-* MPI: [rabit_mpi.py](rabit_mpi.py)
-* Yarn (Hadoop): [rabit_yarn.py](rabit_yarn.py)
-  - It is also possible to submit via hadoop streaming with rabit_hadoop_streaming.py
-  - However, it is higly recommended to use rabit_yarn.py because this will allocate resources more precisely and fits machine learning scenarios
-* Sun Grid engine: [rabit_sge.py](rabit_sge.py)
diff --git a/subtree/rabit/tracker/rabit_demo.py b/subtree/rabit/tracker/rabit_demo.py
deleted file mode 100755
index 6008e0efc..000000000
--- a/subtree/rabit/tracker/rabit_demo.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#!/usr/bin/env python
-"""
-This is the demo submission script of rabit for submitting jobs in local machine
-"""
-import argparse
-import sys
-import os
-import subprocess
-from threading import Thread
-import rabit_tracker as tracker
-if os.name == 'nt':
-    WRAPPER_PATH = os.path.dirname(__file__) + '\\..\\wrapper'
-else:
-    WRAPPER_PATH = os.path.dirname(__file__) + '/../wrapper'
-
-parser = argparse.ArgumentParser(description='Rabit script to submit rabit job locally using python subprocess')
-parser.add_argument('-n', '--nworker', required=True, type=int,
-                    help = 'number of worker proccess to be launched')
-parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
-                    help = 'print more messages into the console')
-parser.add_argument('command', nargs='+',
-                    help = 'command for rabit program')
-args = parser.parse_args()
-
-# bash script for keepalive
-# use it so that python do not need to communicate with subprocess
-echo="echo %s rabit_num_trial=$nrep;"
-keepalive = """
-nrep=0
-rc=254
-while [ $rc -eq 254 ]; 
-do
-    export rabit_num_trial=$nrep
-    %s
-    %s 
-    rc=$?;
-    nrep=$((nrep+1));
-done
-"""
-
-def exec_cmd(cmd, taskid, worker_env):
-    if cmd[0].find('/') == -1 and os.path.exists(cmd[0]) and os.name != 'nt':
-        cmd[0] = './' + cmd[0]
-    cmd = ' '.join(cmd)
-    env = os.environ.copy()
-    for k, v in worker_env.items():
-        env[k] = str(v)        
-    env['rabit_task_id'] = str(taskid)
-    env['PYTHONPATH'] = WRAPPER_PATH
-
-    ntrial = 0
-    while True:
-        if os.name == 'nt':
-            env['rabit_num_trial'] = str(ntrial)
-            ret = subprocess.call(cmd, shell=True, env = env)
-            if ret == 254:
-                ntrial += 1
-                continue
-        else:
-            if args.verbose != 0: 
-                bash = keepalive % (echo % cmd, cmd)
-            else:
-                bash = keepalive % ('', cmd)
-            ret = subprocess.call(bash, shell=True, executable='bash', env = env)
-        if ret == 0:
-            if args.verbose != 0:        
-                print 'Thread %d exit with 0' % taskid
-            return
-        else:
-            if os.name == 'nt':
-                os.exit(-1)
-            else:
-                raise Exception('Get nonzero return code=%d' % ret)
-#
-#  Note: this submit script is only used for demo purpose
-#  submission script using pyhton multi-threading
-#
-def mthread_submit(nslave, worker_args, worker_envs):
-    """
-      customized submit script, that submit nslave jobs, each must contain args as parameter
-      note this can be a lambda function containing additional parameters in input
-      Parameters
-         nslave number of slave process to start up
-         args arguments to launch each job
-              this usually includes the parameters of master_uri and parameters passed into submit
-    """       
-    procs = {}
-    for i in range(nslave):
-        procs[i] = Thread(target = exec_cmd, args = (args.command + worker_args, i, worker_envs))
-        procs[i].daemon = True
-        procs[i].start()
-    for i in range(nslave):
-        procs[i].join()
-
-# call submit, with nslave, the commands to run each job and submit function
-tracker.submit(args.nworker, [], fun_submit = mthread_submit, verbose = args.verbose)
diff --git a/subtree/rabit/tracker/rabit_hadoop_streaming.py b/subtree/rabit/tracker/rabit_hadoop_streaming.py
deleted file mode 100755
index 22b534d79..000000000
--- a/subtree/rabit/tracker/rabit_hadoop_streaming.py
+++ /dev/null
@@ -1,165 +0,0 @@
-#!/usr/bin/env python
-"""
-Deprecated
-
-This is a script to submit rabit job using hadoop streaming.
-It will submit the rabit process as mappers of MapReduce.
-
-This script is deprecated, it is highly recommended to use rabit_yarn.py instead
-"""
-import argparse
-import sys
-import os
-import time
-import subprocess
-import warnings
-import rabit_tracker as tracker
-
-WRAPPER_PATH = os.path.dirname(__file__) + '/../wrapper'
-
-#!!! Set path to hadoop and hadoop streaming jar here
-hadoop_binary = 'hadoop'
-hadoop_streaming_jar = None
-
-# code 
-hadoop_home = os.getenv('HADOOP_HOME')
-if hadoop_home != None:
-    if hadoop_binary == None:
-        hadoop_binary = hadoop_home + '/bin/hadoop'
-        assert os.path.exists(hadoop_binary), "HADOOP_HOME does not contain the hadoop binary"
-    if hadoop_streaming_jar == None:
-        hadoop_streaming_jar = hadoop_home + '/lib/hadoop-streaming.jar'
-        assert os.path.exists(hadoop_streaming_jar), "HADOOP_HOME does not contain the hadoop streaming jar"
-
-if hadoop_binary == None or hadoop_streaming_jar == None:
-    warnings.warn('Warning: Cannot auto-detect path to hadoop or hadoop-streaming jar\n'\
-                      '\tneed to set them via arguments -hs and -hb\n'\
-                      '\tTo enable auto-detection, you can set enviroment variable HADOOP_HOME'\
-                      ', or modify rabit_hadoop.py line 16', stacklevel = 2)
-
-parser = argparse.ArgumentParser(description='Rabit script to submit rabit jobs using Hadoop Streaming.'\
-                                     'It is Highly recommended to use rabit_yarn.py instead')
-parser.add_argument('-n', '--nworker', required=True, type=int,
-                    help = 'number of worker proccess to be launched')
-parser.add_argument('-hip', '--host_ip', default='auto', type=str,
-                    help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
-parser.add_argument('-i', '--input', required=True,
-                    help = 'input path in HDFS')
-parser.add_argument('-o', '--output', required=True,
-                    help = 'output path in HDFS')
-parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
-                    help = 'print more messages into the console')
-parser.add_argument('-ac', '--auto_file_cache', default=1, choices=[0, 1], type=int,
-                    help = 'whether automatically cache the files in the command to hadoop localfile, this is on by default')
-parser.add_argument('-f', '--files', default = [], action='append',
-                    help = 'the cached file list in mapreduce,'\
-                        ' the submission script will automatically cache all the files which appears in command'\
-                        ' This will also cause rewritten of all the file names in the command to current path,'\
-                        ' for example `../../kmeans ../kmeans.conf` will be rewritten to `./kmeans kmeans.conf`'\
-                        ' because the two files are cached to running folder.'\
-                        ' You may need this option to cache additional files.'\
-                        ' You can also use it to manually cache files when auto_file_cache is off')
-parser.add_argument('--jobname', default='auto', help = 'customize jobname in tracker')
-parser.add_argument('--timeout', default=600000000, type=int,
-                    help = 'timeout (in million seconds) of each mapper job, automatically set to a very long time,'\
-                        'normally you do not need to set this ')
-parser.add_argument('--vcores', default = -1, type=int,
-                    help = 'number of vcpores to request in each mapper, set it if each rabit job is multi-threaded')
-parser.add_argument('-mem', '--memory_mb', default=-1, type=int,
-                    help = 'maximum memory used by the process. Guide: set it large (near mapred.cluster.max.map.memory.mb)'\
-                        'if you are running multi-threading rabit,'\
-                        'so that each node can occupy all the mapper slots in a machine for maximum performance')
-if hadoop_binary == None:
-    parser.add_argument('-hb', '--hadoop_binary', required = True,
-                        help="path to hadoop binary file")  
-else:
-    parser.add_argument('-hb', '--hadoop_binary', default = hadoop_binary, 
-                        help="path to hadoop binary file")  
-
-if hadoop_streaming_jar == None:
-    parser.add_argument('-hs', '--hadoop_streaming_jar', required = True,
-                        help='path to hadoop streamimg jar file')
-else:
-    parser.add_argument('-hs', '--hadoop_streaming_jar', default = hadoop_streaming_jar,
-                        help='path to hadoop streamimg jar file')
-parser.add_argument('command', nargs='+',
-                    help = 'command for rabit program')
-args = parser.parse_args()
-
-if args.jobname == 'auto':
-    args.jobname = ('Rabit[nworker=%d]:' % args.nworker) + args.command[0].split('/')[-1];
-
-# detech hadoop version
-(out, err) = subprocess.Popen('%s version' % args.hadoop_binary, shell = True, stdout=subprocess.PIPE).communicate()
-out = out.split('\n')[0].split()
-assert out[0] == 'Hadoop', 'cannot parse hadoop version string'
-hadoop_version = out[1].split('.')
-use_yarn = int(hadoop_version[0]) >= 2
-if use_yarn:
-    warnings.warn('It is highly recommended to use rabit_yarn.py to submit jobs to yarn instead', stacklevel = 2)
-
-print 'Current Hadoop Version is %s' % out[1]
-
-def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
-    worker_envs['CLASSPATH'] = '`$HADOOP_HOME/bin/hadoop classpath --glob` '
-    worker_envs['LD_LIBRARY_PATH'] = '{LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server'
-    fset = set()
-    if args.auto_file_cache:
-        for i in range(len(args.command)):
-            f = args.command[i]
-            if os.path.exists(f):
-                fset.add(f)
-                if i == 0:
-                    args.command[i] = './' + args.command[i].split('/')[-1]                    
-                else:
-                    args.command[i] = args.command[i].split('/')[-1]    
-    if args.command[0].endswith('.py'):
-        flst = [WRAPPER_PATH + '/rabit.py',
-                WRAPPER_PATH + '/librabit_wrapper.so',
-                WRAPPER_PATH + '/librabit_wrapper_mock.so']
-        for f in flst:
-            if os.path.exists(f):
-                fset.add(f)            
-    kmap = {}
-    kmap['env'] = 'mapred.child.env'
-    # setup keymaps
-    if use_yarn:
-        kmap['nworker'] = 'mapreduce.job.maps'
-        kmap['jobname'] = 'mapreduce.job.name'
-        kmap['nthread'] = 'mapreduce.map.cpu.vcores'
-        kmap['timeout'] = 'mapreduce.task.timeout'
-        kmap['memory_mb'] = 'mapreduce.map.memory.mb'
-    else:
-        kmap['nworker'] = 'mapred.map.tasks'
-        kmap['jobname'] = 'mapred.job.name'
-        kmap['nthread'] = None
-        kmap['timeout'] = 'mapred.task.timeout'
-        kmap['memory_mb'] = 'mapred.job.map.memory.mb'
-    cmd = '%s jar %s' % (args.hadoop_binary, args.hadoop_streaming_jar)
-    cmd += ' -D%s=%d' % (kmap['nworker'], nworker)
-    cmd += ' -D%s=%s' % (kmap['jobname'], args.jobname)
-    envstr = ','.join('%s=%s' % (k, str(v)) for k, v in worker_envs.items())
-    cmd += ' -D%s=\"%s\"' % (kmap['env'], envstr)
-    if args.vcores != -1:
-        if kmap['nthread'] is None:
-            warnings.warn('nthread can only be set in Yarn(Hadoop version greater than 2.0),'\
-                              'it is recommended to use Yarn to submit rabit jobs', stacklevel = 2)
-        else:
-            cmd += ' -D%s=%d' % (kmap['nthread'], args.vcores)
-    cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
-    if args.memory_mb != -1:
-        cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
-
-    cmd += ' -input %s -output %s' % (args.input, args.output)
-    cmd += ' -mapper \"%s\" -reducer \"/bin/cat\" ' % (' '.join(args.command + worker_args))
-    if args.files != None:
-        for flst in args.files:
-            for f in flst.split('#'):
-                fset.add(f)
-    for f in fset:
-        cmd += ' -file %s' % f
-    print cmd
-    subprocess.check_call(cmd, shell = True)
-
-fun_submit = lambda nworker, worker_args, worker_envs: hadoop_streaming(nworker, worker_args, worker_envs, int(hadoop_version[0]) >= 2)
-tracker.submit(args.nworker, [], fun_submit = fun_submit, verbose = args.verbose, hostIP = args.host_ip)
diff --git a/subtree/rabit/tracker/rabit_mpi.py b/subtree/rabit/tracker/rabit_mpi.py
deleted file mode 100755
index f62696050..000000000
--- a/subtree/rabit/tracker/rabit_mpi.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env python
-"""
-Submission script to submit rabit jobs using MPI
-"""
-import argparse
-import sys
-import os
-import subprocess
-import rabit_tracker as tracker
-
-parser = argparse.ArgumentParser(description='Rabit script to submit rabit job using MPI')
-parser.add_argument('-n', '--nworker', required=True, type=int,
-                    help = 'number of worker proccess to be launched')
-parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
-                    help = 'print more messages into the console')
-parser.add_argument('-H', '--hostfile', type=str,
-                    help = 'the hostfile of mpi server')
-parser.add_argument('command', nargs='+',
-                    help = 'command for rabit program')
-args = parser.parse_args()
-#
-# submission script using MPI
-#
-def mpi_submit(nslave, worker_args, worker_envs):
-    """
-      customized submit script, that submit nslave jobs, each must contain args as parameter
-      note this can be a lambda function containing additional parameters in input
-      Parameters
-         nslave number of slave process to start up
-         args arguments to launch each job
-              this usually includes the parameters of master_uri and parameters passed into submit
-    """
-    worker_args += ['%s=%s' % (k, str(v)) for k, v in worker_envs.items()]
-    sargs = ' '.join(args.command + worker_args)
-    if args.hostfile is None:
-        cmd = ' '.join(['mpirun -n %d' % (nslave)] + args.command + worker_args) 
-    else:
-        cmd = ' '.join(['mpirun -n %d --hostfile %s' % (nslave, args.hostfile)] + args.command + worker_args)
-    print cmd
-    subprocess.check_call(cmd, shell = True)
-
-# call submit, with nslave, the commands to run each job and submit function
-tracker.submit(args.nworker, [], fun_submit = mpi_submit, verbose = args.verbose)
diff --git a/subtree/rabit/tracker/rabit_sge.py b/subtree/rabit/tracker/rabit_sge.py
deleted file mode 100755
index 3026a4fcb..000000000
--- a/subtree/rabit/tracker/rabit_sge.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env python
-"""
-Submit rabit jobs to Sun Grid Engine
-"""
-import argparse
-import sys
-import os
-import subprocess
-import rabit_tracker as tracker
-
-parser = argparse.ArgumentParser(description='Rabit script to submit rabit job using MPI')
-parser.add_argument('-n', '--nworker', required=True, type=int,
-                    help = 'number of worker proccess to be launched')
-parser.add_argument('-q', '--queue', default='default', type=str,
-                    help = 'the queue we want to submit the job to')
-parser.add_argument('-hip', '--host_ip', default='auto', type=str,
-                    help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
-parser.add_argument('--vcores', default = 1, type=int,
-                    help = 'number of vcpores to request in each mapper, set it if each rabit job is multi-threaded')
-parser.add_argument('--jobname', default='auto', help = 'customize jobname in tracker')
-parser.add_argument('--logdir', default='auto', help = 'customize the directory to place the logs')
-parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
-                    help = 'print more messages into the console')
-parser.add_argument('command', nargs='+',
-                    help = 'command for rabit program')
-args = parser.parse_args()
-
-if args.jobname == 'auto':
-    args.jobname = ('rabit%d.' % args.nworker) + args.command[0].split('/')[-1];
-if args.logdir == 'auto':
-    args.logdir = args.jobname + '.log'
-
-if os.path.exists(args.logdir):
-    if not os.path.isdir(args.logdir):
-        raise RuntimeError('specified logdir %s is a file instead of directory' % args.logdir)
-else:
-    os.mkdir(args.logdir)
-    
-runscript = '%s/runrabit.sh' % args.logdir
-fo = open(runscript, 'w')
-fo.write('source ~/.bashrc\n')
-fo.write('\"$@\"\n')
-fo.close()
-#
-# submission script using MPI
-#
-def sge_submit(nslave, worker_args, worker_envs):
-    """
-      customized submit script, that submit nslave jobs, each must contain args as parameter
-      note this can be a lambda function containing additional parameters in input
-      Parameters
-         nslave number of slave process to start up
-         args arguments to launch each job
-              this usually includes the parameters of master_uri and parameters passed into submit
-    """
-    env_arg = ','.join(['%s=\"%s\"' % (k, str(v)) for k, v in worker_envs.items()])
-    cmd = 'qsub -cwd -t 1-%d -S /bin/bash' % nslave
-    if args.queue != 'default':
-        cmd += '-q %s' % args.queue
-    cmd += ' -N %s ' % args.jobname
-    cmd += ' -e %s -o %s' % (args.logdir, args.logdir)
-    cmd += ' -pe orte %d' % (args.vcores)
-    cmd += ' -v %s,PATH=${PATH}:.' % env_arg
-    cmd += ' %s %s' % (runscript, ' '.join(args.command + worker_args))
-    print cmd
-    subprocess.check_call(cmd, shell = True)
-    print 'Waiting for the jobs to get up...'
-
-# call submit, with nslave, the commands to run each job and submit function
-tracker.submit(args.nworker, [], fun_submit = sge_submit, verbose = args.verbose)
diff --git a/subtree/rabit/tracker/rabit_tracker.py b/subtree/rabit/tracker/rabit_tracker.py
deleted file mode 100644
index d8e6ae84d..000000000
--- a/subtree/rabit/tracker/rabit_tracker.py
+++ /dev/null
@@ -1,317 +0,0 @@
-"""
-Tracker script for rabit
-Implements the tracker control protocol
- - start rabit jobs
- - help nodes to establish links with each other
-
-Tianqi Chen
-"""
-
-import sys
-import os
-import socket
-import struct
-import subprocess
-import random
-import time
-from threading import Thread
-
-"""
-Extension of socket to handle recv and send of special data
-"""
-class ExSocket:
-    def __init__(self, sock):
-        self.sock = sock
-    def recvall(self, nbytes):
-        res = []
-        sock = self.sock
-        nread = 0
-        while nread < nbytes:
-            chunk = self.sock.recv(min(nbytes - nread, 1024))
-            nread += len(chunk)
-            res.append(chunk)
-        return ''.join(res)
-    def recvint(self):
-        return struct.unpack('@i', self.recvall(4))[0]
-    def sendint(self, n):
-        self.sock.sendall(struct.pack('@i', n))
-    def sendstr(self, s):
-        self.sendint(len(s))
-        self.sock.sendall(s)
-    def recvstr(self):
-        slen = self.recvint()
-        return self.recvall(slen)
-
-# magic number used to verify existence of data
-kMagic = 0xff99
-
-class SlaveEntry:
-    def __init__(self, sock, s_addr):
-        slave = ExSocket(sock)
-        self.sock = slave
-        self.host = socket.gethostbyname(s_addr[0])
-        magic = slave.recvint()
-        assert magic == kMagic, 'invalid magic number=%d from %s' % (magic, self.host)
-        slave.sendint(kMagic)
-        self.rank = slave.recvint()
-        self.world_size = slave.recvint()
-        self.jobid = slave.recvstr()
-        self.cmd = slave.recvstr()
-
-    def decide_rank(self, job_map):
-        if self.rank >= 0:
-            return self.rank
-        if self.jobid != 'NULL' and self.jobid in job_map:
-            return job_map[self.jobid]
-        return -1
-
-    def assign_rank(self, rank, wait_conn, tree_map, parent_map, ring_map):
-        self.rank = rank
-        nnset = set(tree_map[rank])
-        rprev, rnext = ring_map[rank]
-        self.sock.sendint(rank)
-        # send parent rank
-        self.sock.sendint(parent_map[rank])
-        # send world size
-        self.sock.sendint(len(tree_map))
-        self.sock.sendint(len(nnset))
-        # send the rprev and next link
-        for r in nnset:
-            self.sock.sendint(r)
-        # send prev link
-        if rprev != -1 and rprev != rank:
-            nnset.add(rprev)
-            self.sock.sendint(rprev)
-        else:
-            self.sock.sendint(-1)
-        # send next link
-        if rnext != -1 and rnext != rank:
-            nnset.add(rnext)
-            self.sock.sendint(rnext)
-        else:
-            self.sock.sendint(-1)
-        while True:
-            ngood = self.sock.recvint()
-            goodset = set([])
-            for i in xrange(ngood):
-                goodset.add(self.sock.recvint())
-            assert goodset.issubset(nnset)
-            badset = nnset - goodset
-            conset = []
-            for r in badset:
-                if r in wait_conn:
-                    conset.append(r)
-            self.sock.sendint(len(conset))
-            self.sock.sendint(len(badset) - len(conset))
-            for r in conset:
-                self.sock.sendstr(wait_conn[r].host)
-                self.sock.sendint(wait_conn[r].port)
-                self.sock.sendint(r)
-            nerr = self.sock.recvint()
-            if nerr != 0:
-                continue
-            self.port = self.sock.recvint()
-            rmset = []
-            # all connection was successuly setup
-            for r in conset:
-                wait_conn[r].wait_accept -= 1
-                if wait_conn[r].wait_accept == 0:
-                    rmset.append(r)
-            for r in rmset:
-                wait_conn.pop(r, None)
-            self.wait_accept = len(badset) - len(conset)
-            return rmset
-
-class Tracker:
-    def __init__(self, port = 9091, port_end = 9999, verbose = True, hostIP = 'auto'):
-        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-        for port in range(port, port_end):
-            try:
-                sock.bind(('', port))
-                self.port = port
-                break
-            except socket.error:
-                continue
-        sock.listen(128)
-        self.sock = sock
-        self.verbose = verbose
-        if hostIP == 'auto':
-            hostIP = 'ip'
-        self.hostIP = hostIP
-        self.log_print('start listen on %s:%d' % (socket.gethostname(), self.port), 1)
-    def __del__(self):
-        self.sock.close()
-    def slave_envs(self):
-        """
-        get enviroment variables for slaves
-        can be passed in as args or envs
-        """
-        if self.hostIP == 'dns':
-            host = socket.gethostname()
-        elif self.hostIP == 'ip':
-            host = socket.gethostbyname(socket.getfqdn())
-        else:
-            host = self.hostIP
-        return {'rabit_tracker_uri': host,
-                'rabit_tracker_port': self.port}
-    def get_neighbor(self, rank, nslave):
-        rank = rank + 1
-        ret = []
-        if rank > 1:
-            ret.append(rank / 2 - 1)
-        if rank * 2 - 1  < nslave:
-            ret.append(rank * 2 - 1)
-        if rank * 2 < nslave:
-            ret.append(rank * 2)
-        return ret
-    def get_tree(self, nslave):
-        tree_map = {}
-        parent_map = {}
-        for r in range(nslave):
-            tree_map[r] = self.get_neighbor(r, nslave)
-            parent_map[r] = (r + 1) / 2 - 1
-        return tree_map, parent_map
-    def find_share_ring(self, tree_map, parent_map, r):
-        """
-        get a ring structure that tends to share nodes with the tree
-        return a list starting from r
-        """
-        nset = set(tree_map[r])
-        cset = nset - set([parent_map[r]])
-        if len(cset) == 0:
-            return [r]
-        rlst = [r]
-        cnt = 0
-        for v in cset:
-            vlst = self.find_share_ring(tree_map, parent_map, v)
-            cnt += 1
-            if cnt == len(cset):
-                vlst.reverse()
-            rlst += vlst
-        return rlst
-
-    def get_ring(self, tree_map, parent_map):
-        """
-        get a ring connection used to recover local data
-        """
-        assert parent_map[0] == -1
-        rlst = self.find_share_ring(tree_map, parent_map, 0)
-        assert len(rlst) == len(tree_map)
-        ring_map = {}
-        nslave = len(tree_map)
-        for r in range(nslave):
-            rprev = (r + nslave - 1) % nslave
-            rnext = (r + 1) % nslave
-            ring_map[rlst[r]] = (rlst[rprev], rlst[rnext])
-        return ring_map
-
-    def get_link_map(self, nslave):
-        """
-        get the link map, this is a bit hacky, call for better algorithm
-        to place similar nodes together
-        """
-        tree_map, parent_map = self.get_tree(nslave)
-        ring_map = self.get_ring(tree_map, parent_map)
-        rmap = {0 : 0}
-        k = 0
-        for i in range(nslave - 1):
-            k = ring_map[k][1]
-            rmap[k] = i + 1
-
-        ring_map_ = {}
-        tree_map_ = {}
-        parent_map_ ={}
-        for k, v in ring_map.items():
-            ring_map_[rmap[k]] = (rmap[v[0]], rmap[v[1]])
-        for k, v in tree_map.items():
-            tree_map_[rmap[k]] = [rmap[x] for x in v]
-        for k, v in parent_map.items():
-            if k != 0:
-                parent_map_[rmap[k]] = rmap[v]
-            else:
-                parent_map_[rmap[k]] = -1
-        return tree_map_, parent_map_, ring_map_
-
-    def handle_print(self,slave, msg):
-        sys.stdout.write(msg)
-
-    def log_print(self, msg, level):
-        if level == 1:
-            if self.verbose:
-                sys.stderr.write(msg + '\n')
-        else:
-            sys.stderr.write(msg + '\n')
-
-    def accept_slaves(self, nslave):
-        # set of nodes that finishs the job
-        shutdown = {}
-        # set of nodes that is waiting for connections
-        wait_conn = {}
-        # maps job id to rank
-        job_map = {}
-        # list of workers that is pending to be assigned rank
-        pending = []
-        # lazy initialize tree_map
-        tree_map = None
-
-        while len(shutdown) != nslave:
-            fd, s_addr = self.sock.accept()
-            s = SlaveEntry(fd, s_addr)
-            if s.cmd == 'print':
-                msg = s.sock.recvstr()
-                self.handle_print(s, msg)
-                continue
-            if s.cmd == 'shutdown':
-                assert s.rank >= 0 and s.rank not in shutdown
-                assert s.rank not in wait_conn
-                shutdown[s.rank] = s
-                self.log_print('Recieve %s signal from %d' % (s.cmd, s.rank), 1)
-                continue
-            assert s.cmd == 'start' or s.cmd == 'recover'
-            # lazily initialize the slaves
-            if tree_map == None:
-                assert s.cmd == 'start'
-                if s.world_size > 0:
-                    nslave = s.world_size
-                tree_map, parent_map, ring_map = self.get_link_map(nslave)
-                # set of nodes that is pending for getting up
-                todo_nodes = range(nslave)
-            else:
-                assert s.world_size == -1 or s.world_size == nslave
-            if s.cmd == 'recover':
-                assert s.rank >= 0
-
-            rank = s.decide_rank(job_map)
-            # batch assignment of ranks
-            if rank == -1:
-                assert len(todo_nodes) != 0
-                pending.append(s)
-                if len(pending) == len(todo_nodes):
-                    pending.sort(key = lambda x : x.host)
-                    for s in pending:
-                        rank = todo_nodes.pop(0)
-                        if s.jobid != 'NULL':
-                            job_map[s.jobid] = rank
-                        s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
-                        if s.wait_accept > 0:
-                            wait_conn[rank] = s
-                        self.log_print('Recieve %s signal from %s; assign rank %d' % (s.cmd, s.host, s.rank), 1)
-                if len(todo_nodes) == 0:
-                    self.log_print('@tracker All of %d nodes getting started' % nslave, 2)
-                    self.start_time = time.time()
-            else:
-                s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
-                self.log_print('Recieve %s signal from %d' % (s.cmd, s.rank), 1)
-                if s.wait_accept > 0:
-                    wait_conn[rank] = s
-        self.log_print('@tracker All nodes finishes job', 2)
-        self.end_time = time.time()
-        self.log_print('@tracker %s secs between node start and job finish' % str(self.end_time - self.start_time), 2)
-
-def submit(nslave, args, fun_submit, verbose, hostIP = 'auto'):
-    master = Tracker(verbose = verbose, hostIP = hostIP)
-    submit_thread = Thread(target = fun_submit, args = (nslave, args, master.slave_envs()))
-    submit_thread.daemon = True
-    submit_thread.start()
-    master.accept_slaves(nslave)
-    submit_thread.join()
diff --git a/subtree/rabit/tracker/rabit_yarn.py b/subtree/rabit/tracker/rabit_yarn.py
deleted file mode 100755
index 56b9d1e71..000000000
--- a/subtree/rabit/tracker/rabit_yarn.py
+++ /dev/null
@@ -1,140 +0,0 @@
-#!/usr/bin/env python
-"""
-This is a script to submit rabit job via Yarn
-rabit will run as a Yarn application
-"""
-import argparse
-import sys
-import os
-import time
-import subprocess
-import warnings
-import rabit_tracker as tracker
-
-WRAPPER_PATH = os.path.dirname(__file__) + '/../wrapper'
-YARN_JAR_PATH = os.path.dirname(__file__) + '/../yarn/rabit-yarn.jar'
-YARN_BOOT_PY = os.path.dirname(__file__) + '/../yarn/run_hdfs_prog.py'
-
-if not os.path.exists(YARN_JAR_PATH):
-    warnings.warn("cannot find \"%s\", I will try to run build" % YARN_JAR_PATH)
-    cmd = 'cd %s;./build.sh' % (os.path.dirname(__file__) + '/../yarn/')
-    print cmd
-    subprocess.check_call(cmd, shell = True, env = os.environ) 
-    assert os.path.exists(YARN_JAR_PATH), "failed to build rabit-yarn.jar, try it manually"
-
-hadoop_binary  = None
-# code 
-hadoop_home = os.getenv('HADOOP_HOME')
-
-if hadoop_home != None:
-    if hadoop_binary == None:
-        hadoop_binary = hadoop_home + '/bin/hadoop'
-        assert os.path.exists(hadoop_binary), "HADOOP_HOME does not contain the hadoop binary"
-
-
-parser = argparse.ArgumentParser(description='Rabit script to submit rabit jobs to Yarn.')
-parser.add_argument('-n', '--nworker', required=True, type=int,
-                    help = 'number of worker proccess to be launched')
-parser.add_argument('-hip', '--host_ip', default='auto', type=str,
-                    help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
-parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
-                    help = 'print more messages into the console')
-parser.add_argument('-q', '--queue', default='default', type=str,
-                    help = 'the queue we want to submit the job to')
-parser.add_argument('-ac', '--auto_file_cache', default=1, choices=[0, 1], type=int,
-                    help = 'whether automatically cache the files in the command to hadoop localfile, this is on by default')
-parser.add_argument('-f', '--files', default = [], action='append',
-                    help = 'the cached file list in mapreduce,'\
-                        ' the submission script will automatically cache all the files which appears in command'\
-                        ' This will also cause rewritten of all the file names in the command to current path,'\
-                        ' for example `../../kmeans ../kmeans.conf` will be rewritten to `./kmeans kmeans.conf`'\
-                        ' because the two files are cached to running folder.'\
-                        ' You may need this option to cache additional files.'\
-                        ' You can also use it to manually cache files when auto_file_cache is off')
-parser.add_argument('--jobname', default='auto', help = 'customize jobname in tracker')
-parser.add_argument('--tempdir', default='/tmp', help = 'temporary directory in HDFS that can be used to store intermediate results')
-parser.add_argument('--vcores', default = 1, type=int,
-                    help = 'number of vcpores to request in each mapper, set it if each rabit job is multi-threaded')
-parser.add_argument('-mem', '--memory_mb', default=1024, type=int,
-                    help = 'maximum memory used by the process. Guide: set it large (near mapred.cluster.max.map.memory.mb)'\
-                        'if you are running multi-threading rabit,'\
-                        'so that each node can occupy all the mapper slots in a machine for maximum performance')
-parser.add_argument('--libhdfs-opts', default='-Xmx128m', type=str,
-                    help = 'setting to be passed to libhdfs')
-parser.add_argument('--name-node', default='default', type=str,
-                    help = 'the namenode address of hdfs, libhdfs should connect to, normally leave it as default')
-
-parser.add_argument('command', nargs='+',
-                    help = 'command for rabit program')
-args = parser.parse_args()
-
-if args.jobname == 'auto':
-    args.jobname = ('Rabit[nworker=%d]:' % args.nworker) + args.command[0].split('/')[-1];
-
-if hadoop_binary == None:
-    parser.add_argument('-hb', '--hadoop_binary', required = True,
-                        help="path to hadoop binary file")  
-else:
-    parser.add_argument('-hb', '--hadoop_binary', default = hadoop_binary, 
-                        help="path to hadoop binary file")  
-
-args = parser.parse_args()
-
-if args.jobname == 'auto':
-    args.jobname = ('Rabit[nworker=%d]:' % args.nworker) + args.command[0].split('/')[-1];
-
-# detech hadoop version
-(out, err) = subprocess.Popen('%s version' % args.hadoop_binary, shell = True, stdout=subprocess.PIPE).communicate()
-out = out.split('\n')[0].split()
-assert out[0] == 'Hadoop', 'cannot parse hadoop version string'
-hadoop_version = out[1].split('.')
-
-(classpath, err) = subprocess.Popen('%s classpath --glob' % args.hadoop_binary, shell = True, stdout=subprocess.PIPE).communicate()
-
-if hadoop_version < 2:    
-    print 'Current Hadoop Version is %s, rabit_yarn will need Yarn(Hadoop 2.0)' % out[1]
-
-def submit_yarn(nworker, worker_args, worker_env):
-    fset = set([YARN_JAR_PATH, YARN_BOOT_PY]) 
-    if args.auto_file_cache != 0:
-        for i in range(len(args.command)):
-            f = args.command[i]
-            if os.path.exists(f):
-                fset.add(f)
-                if i == 0:
-                    args.command[i] = './' + args.command[i].split('/')[-1]
-                else:
-                    args.command[i] = './' + args.command[i].split('/')[-1]
-    if args.command[0].endswith('.py'):
-        flst = [WRAPPER_PATH + '/rabit.py',
-                WRAPPER_PATH + '/librabit_wrapper.so',
-                WRAPPER_PATH + '/librabit_wrapper_mock.so']
-        for f in flst:
-            if os.path.exists(f):
-                fset.add(f)            
-    
-    cmd = 'java -cp `%s classpath`:%s org.apache.hadoop.yarn.rabit.Client ' % (args.hadoop_binary, YARN_JAR_PATH)
-    env = os.environ.copy()
-    for k, v in worker_env.items():
-        env[k] = str(v)
-    env['rabit_cpu_vcores'] = str(args.vcores)
-    env['rabit_memory_mb'] = str(args.memory_mb)
-    env['rabit_world_size'] = str(args.nworker)
-    env['rabit_hdfs_opts'] = str(args.libhdfs_opts)
-    env['rabit_hdfs_namenode'] = str(args.name_node)
-
-    if args.files != None:
-        for flst in args.files:
-            for f in flst.split('#'):
-                fset.add(f)
-    for f in fset:
-        cmd += ' -file %s' % f
-    cmd += ' -jobname %s ' % args.jobname
-    cmd += ' -tempdir %s ' % args.tempdir
-    cmd += ' -queue %s ' % args.queue
-    cmd += (' '.join(['./run_hdfs_prog.py'] + args.command + worker_args))
-    if args.verbose != 0:
-        print cmd
-    subprocess.check_call(cmd, shell = True, env = env)
-
-tracker.submit(args.nworker, [], fun_submit = submit_yarn, verbose = args.verbose, hostIP = args.host_ip)
diff --git a/subtree/rabit/windows/.gitignore b/subtree/rabit/windows/.gitignore
deleted file mode 100644
index 3bc83e45f..000000000
--- a/subtree/rabit/windows/.gitignore
+++ /dev/null
@@ -1,9 +0,0 @@
-*.suo
-*.exp
-*sdf
-*.exe
-ipch
-x64
-*.filters
-Release
-*.user
diff --git a/subtree/rabit/windows/README.md b/subtree/rabit/windows/README.md
deleted file mode 100644
index 9bdeb7988..000000000
--- a/subtree/rabit/windows/README.md
+++ /dev/null
@@ -1,12 +0,0 @@
-The solution has been created with Visual Studio Express 2010.
-Make sure to compile the Release version
-
-Build
-====
-* Build the project ```rabit``` , this will give you ```rabit.lib``` in ```x64\Release```
-
-Build Your code with rabit
-====
-* Add include to the dependency path of your project
-* Add ```rabit.lib``` to the linker dependency  
-* The project basic is an example to show you how to build rabit with basic.cc
diff --git a/subtree/rabit/windows/basic/basic.vcxproj b/subtree/rabit/windows/basic/basic.vcxproj
deleted file mode 100644
index 109c405ef..000000000
--- a/subtree/rabit/windows/basic/basic.vcxproj
+++ /dev/null
@@ -1,118 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{A6A95246-EB0A-46BA-9471-5939CB6B0006}</ProjectGuid>
-    <RootNamespace>basic</RootNamespace>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup />
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <AdditionalIncludeDirectories>..\..\include</AdditionalIncludeDirectories>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>$(OutDir)\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemGroup>
-    <ClCompile Include="..\..\guide\basic.cc" />
-  </ItemGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-  </ImportGroup>
-</Project>
\ No newline at end of file
diff --git a/subtree/rabit/windows/rabit.sln b/subtree/rabit/windows/rabit.sln
deleted file mode 100644
index bf61256d6..000000000
--- a/subtree/rabit/windows/rabit.sln
+++ /dev/null
@@ -1,50 +0,0 @@
-﻿
-Microsoft Visual Studio Solution File, Format Version 11.00
-# Visual Studio 2010
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rabit", "rabit\rabit.vcxproj", "{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "basic", "basic\basic.vcxproj", "{A6A95246-EB0A-46BA-9471-5939CB6B0006}"
-	ProjectSection(ProjectDependencies) = postProject
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F} = {D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rabit_wrapper", "rabit_wrapper\rabit_wrapper.vcxproj", "{2F89A7C5-CA4F-4D77-A728-6702D9F33F9F}"
-	ProjectSection(ProjectDependencies) = postProject
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F} = {D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}
-	EndProjectSection
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Win32 = Debug|Win32
-		Debug|x64 = Debug|x64
-		Release|Win32 = Release|Win32
-		Release|x64 = Release|x64
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Debug|Win32.ActiveCfg = Debug|Win32
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Debug|Win32.Build.0 = Debug|Win32
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Debug|x64.ActiveCfg = Debug|x64
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Debug|x64.Build.0 = Debug|x64
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Release|Win32.ActiveCfg = Release|Win32
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Release|Win32.Build.0 = Release|Win32
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Release|x64.ActiveCfg = Release|x64
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Release|x64.Build.0 = Release|x64
-		{A6A95246-EB0A-46BA-9471-5939CB6B0006}.Debug|Win32.ActiveCfg = Debug|Win32
-		{A6A95246-EB0A-46BA-9471-5939CB6B0006}.Debug|Win32.Build.0 = Debug|Win32
-		{A6A95246-EB0A-46BA-9471-5939CB6B0006}.Debug|x64.ActiveCfg = Debug|Win32
-		{A6A95246-EB0A-46BA-9471-5939CB6B0006}.Release|Win32.ActiveCfg = Release|Win32
-		{A6A95246-EB0A-46BA-9471-5939CB6B0006}.Release|Win32.Build.0 = Release|Win32
-		{A6A95246-EB0A-46BA-9471-5939CB6B0006}.Release|x64.ActiveCfg = Release|x64
-		{A6A95246-EB0A-46BA-9471-5939CB6B0006}.Release|x64.Build.0 = Release|x64
-		{2F89A7C5-CA4F-4D77-A728-6702D9F33F9F}.Debug|Win32.ActiveCfg = Debug|Win32
-		{2F89A7C5-CA4F-4D77-A728-6702D9F33F9F}.Debug|Win32.Build.0 = Debug|Win32
-		{2F89A7C5-CA4F-4D77-A728-6702D9F33F9F}.Debug|x64.ActiveCfg = Debug|Win32
-		{2F89A7C5-CA4F-4D77-A728-6702D9F33F9F}.Release|Win32.ActiveCfg = Release|Win32
-		{2F89A7C5-CA4F-4D77-A728-6702D9F33F9F}.Release|Win32.Build.0 = Release|Win32
-		{2F89A7C5-CA4F-4D77-A728-6702D9F33F9F}.Release|x64.ActiveCfg = Release|x64
-		{2F89A7C5-CA4F-4D77-A728-6702D9F33F9F}.Release|x64.Build.0 = Release|x64
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-EndGlobal
diff --git a/subtree/rabit/windows/rabit/rabit.vcxproj b/subtree/rabit/windows/rabit/rabit.vcxproj
deleted file mode 100644
index c670484d2..000000000
--- a/subtree/rabit/windows/rabit/rabit.vcxproj
+++ /dev/null
@@ -1,133 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}</ProjectGuid>
-    <RootNamespace>rabit</RootNamespace>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>StaticLibrary</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>StaticLibrary</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>StaticLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>StaticLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup />
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <AdditionalIncludeDirectories>..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemGroup>
-    <ClCompile Include="..\..\src\allreduce_base.cc" />
-    <ClCompile Include="..\..\src\allreduce_robust.cc" />
-    <ClCompile Include="..\..\src\engine.cc" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="..\..\include\rabit.h" />
-    <ClInclude Include="..\..\include\rabit\engine.h" />
-    <ClInclude Include="..\..\include\rabit\io.h" />
-    <ClInclude Include="..\..\include\rabit\rabit-inl.h" />
-    <ClInclude Include="..\..\include\rabit\timer.h" />
-    <ClInclude Include="..\..\include\rabit\utils.h" />
-    <ClInclude Include="..\..\include\rabit_serializable.h" />
-    <ClInclude Include="..\..\src\allreduce_base.h" />
-    <ClInclude Include="..\..\src\allreduce_mock.h" />
-    <ClInclude Include="..\..\src\allreduce_robust.h" />
-    <ClInclude Include="..\..\src\socket.h" />
-  </ItemGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-  </ImportGroup>
-</Project>
\ No newline at end of file
diff --git a/subtree/rabit/windows/rabit_wrapper/rabit_wrapper.vcxproj b/subtree/rabit/windows/rabit_wrapper/rabit_wrapper.vcxproj
deleted file mode 100644
index 73eb5abb4..000000000
--- a/subtree/rabit/windows/rabit_wrapper/rabit_wrapper.vcxproj
+++ /dev/null
@@ -1,121 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{2F89A7C5-CA4F-4D77-A728-6702D9F33F9F}</ProjectGuid>
-    <RootNamespace>rabit_wrapper</RootNamespace>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup />
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>..\..\x64\Release\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <AdditionalIncludeDirectories>..\..\include</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>$(OutDir)\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemGroup>
-    <ClCompile Include="..\..\wrapper\rabit_wrapper.cc" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="..\..\wrapper\rabit_wrapper.h" />
-  </ItemGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-  </ImportGroup>
-</Project>
\ No newline at end of file
diff --git a/subtree/rabit/wrapper/rabit.py b/subtree/rabit/wrapper/rabit.py
deleted file mode 100644
index 91ce3e6ae..000000000
--- a/subtree/rabit/wrapper/rabit.py
+++ /dev/null
@@ -1,327 +0,0 @@
-"""
-Reliable Allreduce and Broadcast Library.
-
-Author: Tianqi Chen
-"""
-# pylint: disable=unused-argument,invalid-name,global-statement,dangerous-default-value,
-import cPickle as pickle
-import ctypes
-import os
-import sys
-import warnings
-import numpy as np
-
-# version information about the doc
-__version__ = '1.0'
-
-if os.name == 'nt':
-    WRAPPER_PATH = os.path.dirname(__file__) + '\\..\\windows\\x64\\Release\\rabit_wrapper%s.dll'
-else:
-    WRAPPER_PATH = os.path.dirname(__file__) + '/librabit_wrapper%s.so'
-
-_LIB = None
-
-# load in xgboost library
-def _loadlib(lib='standard'):
-    """Load rabit library."""
-    global _LIB
-    if _LIB != None:
-        warnings.warn('rabit.int call was ignored because it has'\
-                          ' already been initialized', level=2)
-        return
-    if lib == 'standard':
-        _LIB = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '')
-    elif lib == 'mock':
-        _LIB = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '_mock')
-    elif lib == 'mpi':
-        _LIB = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '_mpi')
-    else:
-        raise Exception('unknown rabit lib %s, can be standard, mock, mpi' % lib)
-    _LIB.RabitGetRank.restype = ctypes.c_int
-    _LIB.RabitGetWorldSize.restype = ctypes.c_int
-    _LIB.RabitVersionNumber.restype = ctypes.c_int
-
-def _unloadlib():
-    """Unload rabit library."""
-    global _LIB
-    del _LIB
-    _LIB = None
-
-# reduction operators
-MAX = 0
-MIN = 1
-SUM = 2
-BITOR = 3
-
-def init(args=None, lib='standard'):
-    """Intialize the rabit module, call this once before using anything.
-
-    Parameters
-    ----------
-    args: list of str, optional
-        The list of arguments used to initialized the rabit
-        usually you need to pass in sys.argv.
-        Defaults to sys.argv when it is None.
-    lib: {'standard', 'mock', 'mpi'}
-        Type of library we want to load
-    """
-    if args is None:
-        args = sys.argv
-    _loadlib(lib)
-    arr = (ctypes.c_char_p * len(args))()
-    arr[:] = args
-    _LIB.RabitInit(len(args), arr)
-
-def finalize():
-    """Finalize the rabit engine.
-
-    Call this function after you finished all jobs.
-    """
-    _LIB.RabitFinalize()
-    _unloadlib()
-
-def get_rank():
-    """Get rank of current process.
-
-    Returns
-    -------
-    rank : int
-        Rank of current process.
-    """
-    ret = _LIB.RabitGetRank()
-    return ret
-
-def get_world_size():
-    """Get total number workers.
-
-    Returns
-    -------
-    n : int
-        Total number of process.
-    """
-    ret = _LIB.RabitGetWorldSize()
-    return ret
-
-def tracker_print(msg):
-    """Print message to the tracker.
-
-    This function can be used to communicate the information of
-    the progress to the tracker
-
-    Parameters
-    ----------
-    msg : str
-        The message to be printed to tracker.
-    """
-    if not isinstance(msg, str):
-        msg = str(msg)
-    _LIB.RabitTrackerPrint(ctypes.c_char_p(msg).encode('utf-8'))
-
-def get_processor_name():
-    """Get the processor name.
-
-    Returns
-    -------
-    name : str
-        the name of processor(host)
-    """
-    mxlen = 256
-    length = ctypes.c_ulong()
-    buf = ctypes.create_string_buffer(mxlen)
-    _LIB.RabitGetProcessorName(buf, ctypes.byref(length), mxlen)
-    return buf.value
-
-def broadcast(data, root):
-    """Broadcast object from one node to all other nodes.
-
-    Parameters
-    ----------
-    data : any type that can be pickled
-        Input data, if current rank does not equal root, this can be None
-    root : int
-        Rank of the node to broadcast data from.
-
-    Returns
-    -------
-    object : int
-        the result of broadcast.
-    """
-    rank = get_rank()
-    length = ctypes.c_ulong()
-    if root == rank:
-        assert data is not None, 'need to pass in data when broadcasting'
-        s = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
-        length.value = len(s)
-    # run first broadcast
-    _LIB.RabitBroadcast(ctypes.byref(length),
-                        ctypes.sizeof(ctypes.c_ulong), root)
-    if root != rank:
-        dptr = (ctypes.c_char * length.value)()
-        # run second
-        _LIB.RabitBroadcast(ctypes.cast(dptr, ctypes.c_void_p),
-                            length.value, root)
-        data = pickle.loads(dptr.raw)
-        del dptr
-    else:
-        _LIB.RabitBroadcast(ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p),
-                            length.value, root)
-        del s
-    return data
-
-# enumeration of dtypes
-DTYPE_ENUM__ = {
-    np.dtype('int8') : 0,
-    np.dtype('uint8') : 1,
-    np.dtype('int32') : 2,
-    np.dtype('uint32') : 3,
-    np.dtype('int64') : 4,
-    np.dtype('uint64') : 5,
-    np.dtype('float32') : 6,
-    np.dtype('float64') : 7
-}
-
-def allreduce(data, op, prepare_fun=None):
-    """Perform allreduce, return the result.
-
-    Parameters
-    ----------
-    data: numpy array
-        Input data.
-    op: int
-        Reduction operators, can be MIN, MAX, SUM, BITOR
-    prepare_fun: function
-        Lazy preprocessing function, if it is not None, prepare_fun(data)
-        will be called by the function before performing allreduce, to intialize the data
-        If the result of Allreduce can be recovered directly,
-        then prepare_fun will NOT be called
-
-    Returns
-    -------
-    result : array_like
-        The result of allreduce, have same shape as data
-
-    Notes
-    -----
-    This function is not thread-safe.
-    """
-    if not isinstance(data, np.ndarray):
-        raise Exception('allreduce only takes in numpy.ndarray')
-    buf = data.ravel()
-    if buf.base is data.base:
-        buf = buf.copy()
-    if buf.dtype not in DTYPE_ENUM__:
-        raise Exception('data type %s not supported' % str(buf.dtype))
-    if prepare_fun is None:
-        _LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
-                            buf.size, DTYPE_ENUM__[buf.dtype],
-                            op, None, None)
-    else:
-        func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
-        def pfunc(args):
-            """prepare function."""
-            prepare_fun(data)
-        _LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
-                            buf.size, DTYPE_ENUM__[buf.dtype],
-                            op, func_ptr(pfunc), None)
-    return buf
-
-
-def _load_model(ptr, length):
-    """
-    Internal function used by the module,
-    unpickle a model from a buffer specified by ptr, length
-    Arguments:
-        ptr: ctypes.POINTER(ctypes._char)
-            pointer to the memory region of buffer
-        length: int
-            the length of buffer
-    """
-    data = (ctypes.c_char * length).from_address(ctypes.addressof(ptr.contents))
-    return pickle.loads(data.raw)
-
-def load_checkpoint(with_local=False):
-    """Load latest check point.
-
-    Parameters
-    ----------
-    with_local: bool, optional
-        whether the checkpoint contains local model
-
-    Returns
-    -------
-    tuple : tuple
-        if with_local: return (version, gobal_model, local_model)
-        else return (version, gobal_model)
-        if returned version == 0, this means no model has been CheckPointed
-        and global_model, local_model returned will be None
-    """
-    gptr = ctypes.POINTER(ctypes.c_char)()
-    global_len = ctypes.c_ulong()
-    if with_local:
-        lptr = ctypes.POINTER(ctypes.c_char)()
-        local_len = ctypes.c_ulong()
-        version = _LIB.RabitLoadCheckPoint(
-            ctypes.byref(gptr),
-            ctypes.byref(global_len),
-            ctypes.byref(lptr),
-            ctypes.byref(local_len))
-        if version == 0:
-            return (version, None, None)
-        return (version,
-                _load_model(gptr, global_len.value),
-                _load_model(lptr, local_len.value))
-    else:
-        version = _LIB.RabitLoadCheckPoint(
-            ctypes.byref(gptr),
-            ctypes.byref(global_len),
-            None, None)
-        if version == 0:
-            return (version, None)
-        return (version,
-                _load_model(gptr, global_len.value))
-
-def checkpoint(global_model, local_model=None):
-    """Checkpoint the model.
-
-    This means we finished a stage of execution.
-    Every time we call check point, there is a version number which will increase by one.
-
-    Parameters
-    ----------
-    global_model: anytype that can be pickled
-        globally shared model/state when calling this function,
-        the caller need to gauranttees that global_model is the same in all nodes
-
-    local_model: anytype that can be pickled
-       Local model, that is specific to current node/rank.
-       This can be None when no local state is needed.
-
-    Notes
-    -----
-    local_model requires explicit replication of the model for fault-tolerance.
-    This will bring replication cost in checkpoint function.
-    while global_model do not need explicit replication.
-    It is recommended to use global_model if possible.
-    """
-    sglobal = pickle.dumps(global_model)
-    if local_model is None:
-        _LIB.RabitCheckPoint(sglobal, len(sglobal), None, 0)
-        del sglobal
-    else:
-        slocal = pickle.dumps(local_model)
-        _LIB.RabitCheckPoint(sglobal, len(sglobal), slocal, len(slocal))
-        del slocal
-        del sglobal
-
-def version_number():
-    """Returns version number of current stored model.
-
-    This means how many calls to CheckPoint we made so far.
-
-    Returns
-    -------
-    version : int
-        Version number of currently stored model
-    """
-    ret = _LIB.RabitVersionNumber()
-    return ret
diff --git a/subtree/rabit/wrapper/rabit_wrapper.cc b/subtree/rabit/wrapper/rabit_wrapper.cc
deleted file mode 100644
index 7025b3ffe..000000000
--- a/subtree/rabit/wrapper/rabit_wrapper.cc
+++ /dev/null
@@ -1,240 +0,0 @@
-// Copyright by Contributors
-// implementations in ctypes
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-
-#include <cstring>
-#include <string>
-#include "../include/rabit.h"
-#include "./rabit_wrapper.h"
-namespace rabit {
-namespace wrapper {
-// helper use to avoid BitOR operator
-template<typename OP, typename DType>
-struct FHelper {
-  inline static void
-  Allreduce(DType *senrecvbuf_,
-            size_t count,
-            void (*prepare_fun)(void *arg),
-            void *prepare_arg) {
-    rabit::Allreduce<OP>(senrecvbuf_, count,
-                         prepare_fun, prepare_arg);
-  }
-};
-template<typename DType>
-struct FHelper<op::BitOR, DType> {
-  inline static void
-  Allreduce(DType *senrecvbuf_,
-            size_t count,
-            void (*prepare_fun)(void *arg),
-            void *prepare_arg) {
-    utils::Error("DataType does not support bitwise or operation");
-  }
-};
-template<typename OP>
-inline void Allreduce_(void *sendrecvbuf_,
-                       size_t count,
-                       engine::mpi::DataType enum_dtype,
-                       void (*prepare_fun)(void *arg),
-                       void *prepare_arg) {
-  using namespace engine::mpi;
-  switch (enum_dtype) {
-    case kChar:
-      rabit::Allreduce<OP>
-          (static_cast<char*>(sendrecvbuf_),
-           count, prepare_fun, prepare_arg);
-      return;
-    case kUChar:
-      rabit::Allreduce<OP>
-          (static_cast<unsigned char*>(sendrecvbuf_),
-           count, prepare_fun, prepare_arg);
-      return;
-    case kInt:
-      rabit::Allreduce<OP>
-          (static_cast<int*>(sendrecvbuf_),
-           count, prepare_fun, prepare_arg);
-      return;
-    case kUInt:
-      rabit::Allreduce<OP>
-          (static_cast<unsigned*>(sendrecvbuf_),
-           count, prepare_fun, prepare_arg);
-      return;
-    case kLong:
-      rabit::Allreduce<OP>
-          (static_cast<long*>(sendrecvbuf_),  // NOLINT(*)
-           count, prepare_fun, prepare_arg);
-      return;
-    case kULong:
-      rabit::Allreduce<OP>
-          (static_cast<unsigned long*>(sendrecvbuf_),  // NOLINT(*)
-           count, prepare_fun, prepare_arg);
-      return;
-    case kFloat:
-      FHelper<OP, float>::Allreduce
-          (static_cast<float*>(sendrecvbuf_),
-           count, prepare_fun, prepare_arg);
-      return;
-    case kDouble:
-      FHelper<OP, double>::Allreduce
-          (static_cast<double*>(sendrecvbuf_),
-           count, prepare_fun, prepare_arg);
-      return;
-    default: utils::Error("unknown data_type");
-  }
-}
-inline void Allreduce(void *sendrecvbuf,
-                      size_t count,
-                      engine::mpi::DataType enum_dtype,
-                      engine::mpi::OpType enum_op,
-                      void (*prepare_fun)(void *arg),
-                      void *prepare_arg) {
-  using namespace engine::mpi;
-  switch (enum_op) {
-    case kMax:
-      Allreduce_<op::Max>
-          (sendrecvbuf,
-           count, enum_dtype,
-           prepare_fun, prepare_arg);
-      return;
-    case kMin:
-      Allreduce_<op::Min>
-          (sendrecvbuf,
-           count, enum_dtype,
-           prepare_fun, prepare_arg);
-      return;
-    case kSum:
-      Allreduce_<op::Sum>
-          (sendrecvbuf,
-           count, enum_dtype,
-           prepare_fun, prepare_arg);
-      return;
-    case kBitwiseOR:
-      Allreduce_<op::BitOR>
-          (sendrecvbuf,
-           count, enum_dtype,
-           prepare_fun, prepare_arg);
-      return;
-    default: utils::Error("unknown enum_op");
-  }
-}
-// temporal memory for global and local model
-std::string global_buffer, local_buffer;
-// wrapper for serialization
-struct ReadWrapper : public Serializable {
-  std::string *p_str;
-  explicit ReadWrapper(std::string *p_str)
-      : p_str(p_str) {}
-  virtual void Load(Stream *fi) {
-    uint64_t sz;
-    utils::Assert(fi->Read(&sz, sizeof(sz)) != 0,
-                 "Read pickle string");
-    p_str->resize(sz);
-    if (sz != 0) {
-      utils::Assert(fi->Read(&(*p_str)[0], sizeof(char) * sz) != 0,
-                    "Read pickle string");
-    }
-  }
-  virtual void Save(Stream *fo) const {
-    utils::Error("not implemented");
-  }
-};
-struct WriteWrapper : public Serializable {
-  const char *data;
-  size_t length;
-  explicit WriteWrapper(const char *data,
-                        size_t length)
-      : data(data), length(length) {
-  }
-  virtual void Load(Stream *fi) {
-    utils::Error("not implemented");
-  }
-  virtual void Save(Stream *fo) const {
-    uint64_t sz = static_cast<uint16_t>(length);
-    fo->Write(&sz, sizeof(sz));
-    fo->Write(data, length * sizeof(char));
-  }
-};
-}  // namespace wrapper
-}  // namespace rabit
-extern "C" {
-  void RabitInit(int argc, char *argv[]) {
-    rabit::Init(argc, argv);
-  }
-  void RabitFinalize(void) {
-    rabit::Finalize();
-  }
-  int RabitGetRank(void) {
-    return rabit::GetRank();
-  }
-  int RabitGetWorldSize(void) {
-    return rabit::GetWorldSize();
-  }
-  void RabitTrackerPrint(const char *msg) {
-    std::string m(msg);
-    rabit::TrackerPrint(m);
-  }
-  void RabitGetProcessorName(char *out_name,
-                             rbt_ulong *out_len,
-                             rbt_ulong max_len) {
-    std::string s = rabit::GetProcessorName();
-    if (s.length() > max_len) {
-      s.resize(max_len - 1);
-    }
-    strcpy(out_name, s.c_str()); // NOLINT(*)
-    *out_len = static_cast<rbt_ulong>(s.length());
-  }
-  void RabitBroadcast(void *sendrecv_data,
-                      rbt_ulong size, int root) {
-    rabit::Broadcast(sendrecv_data, size, root);
-  }
-  void RabitAllreduce(void *sendrecvbuf,
-                      size_t count,
-                      int enum_dtype,
-                      int enum_op,
-                      void (*prepare_fun)(void *arg),
-                      void *prepare_arg) {
-    rabit::wrapper::Allreduce
-        (sendrecvbuf, count,
-         static_cast<rabit::engine::mpi::DataType>(enum_dtype),
-         static_cast<rabit::engine::mpi::OpType>(enum_op),
-         prepare_fun, prepare_arg);
-  }
-  int RabitLoadCheckPoint(char **out_global_model,
-                          rbt_ulong *out_global_len,
-                          char **out_local_model,
-                          rbt_ulong *out_local_len) {
-    using rabit::BeginPtr;
-    using namespace rabit::wrapper;
-    ReadWrapper sg(&global_buffer);
-    ReadWrapper sl(&local_buffer);
-    int version;
-    if (out_local_model == NULL) {
-      version = rabit::LoadCheckPoint(&sg, NULL);
-      *out_global_model = BeginPtr(global_buffer);
-      *out_global_len = static_cast<rbt_ulong>(global_buffer.length());
-    } else {
-      version = rabit::LoadCheckPoint(&sg, &sl);
-      *out_global_model = BeginPtr(global_buffer);
-      *out_global_len = static_cast<rbt_ulong>(global_buffer.length());
-      *out_local_model = BeginPtr(local_buffer);
-      *out_local_len = static_cast<rbt_ulong>(local_buffer.length());
-    }
-    return version;
-  }
-  void RabitCheckPoint(const char *global_model,
-                       rbt_ulong global_len,
-                       const char *local_model,
-                       rbt_ulong local_len) {
-    using namespace rabit::wrapper;
-    WriteWrapper sg(global_model, global_len);
-    WriteWrapper sl(local_model, local_len);
-    if (local_model == NULL) {
-      rabit::CheckPoint(&sg, NULL);
-    } else {
-      rabit::CheckPoint(&sg, &sl);
-    }
-  }
-  int RabitVersionNumber(void) {
-    return rabit::VersionNumber();
-  }
-}
diff --git a/subtree/rabit/wrapper/rabit_wrapper.h b/subtree/rabit/wrapper/rabit_wrapper.h
deleted file mode 100644
index d00a31fda..000000000
--- a/subtree/rabit/wrapper/rabit_wrapper.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*!
- * Copyright by Contributors
- * \file rabit_wrapper.h
- * \author Tianqi Chen
- * \brief a C style wrapper of rabit
- *  can be used to create wrapper of other languages
- */
-#ifndef RABIT_WRAPPER_H_
-#define RABIT_WRAPPER_H_
-#ifdef _MSC_VER
-#define RABIT_DLL __declspec(dllexport)
-#else
-#define RABIT_DLL
-#endif
-// manually define unsign long
-typedef unsigned long rbt_ulong;  // NOLINT(*)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-/*!
- * \brief intialize the rabit module, call this once before using anything
- * \param argc number of arguments in argv
- * \param argv the array of input arguments
- */
-  RABIT_DLL void RabitInit(int argc, char *argv[]);
-  /*!
-   * \brief finalize the rabit engine, call this function after you finished all jobs
-   */
-  RABIT_DLL void RabitFinalize(void);
-  /*! \brief get rank of current process */
-  RABIT_DLL int RabitGetRank(void);
-  /*! \brief get total number of process */
-  RABIT_DLL int RabitGetWorldSize(void);
-  /*!
-   * \brief print the msg to the tracker,
-   *    this function can be used to communicate the information of the progress to
-   *    the user who monitors the tracker
-   * \param msg the message to be printed
-   */
-  RABIT_DLL void RabitTrackerPrint(const char *msg);
-  /*!
-   * \brief get name of processor
-   * \param out_name hold output string
-   * \param out_len hold length of output string
-   * \param max_len maximum buffer length of input
-   */
-  RABIT_DLL void RabitGetProcessorName(char *out_name,
-                                       rbt_ulong *out_len,
-                                       rbt_ulong max_len);
-  /*!
-   * \brief broadcast an memory region to all others from root
-   *
-   *     Example: int a = 1; Broadcast(&a, sizeof(a), root);
-   * \param sendrecv_data the pointer to send or recive buffer,
-   * \param size the size of the data
-   * \param root the root of process
-   */
-  RABIT_DLL void RabitBroadcast(void *sendrecv_data,
-                                rbt_ulong size, int root);
-  /*!
-   * \brief perform in-place allreduce, on sendrecvbuf
-   *        this function is NOT thread-safe
-   *
-   * Example Usage: the following code gives sum of the result
-   *     vector<int> data(10);
-   *     ...
-   *     Allreduce<op::Sum>(&data[0], data.size());
-   *     ...
-   * \param sendrecvbuf buffer for both sending and recving data
-   * \param count number of elements to be reduced
-   * \param enum_dtype the enumeration of data type, see rabit::engine::mpi::DataType in engine.h of rabit include
-   * \param enum_op the enumeration of operation type, see rabit::engine::mpi::OpType in engine.h of rabit
-   * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
-   *                    will be called by the function before performing Allreduce, to intialize the data in sendrecvbuf_.
-   *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
-   * \param prepare_arg argument used to passed into the lazy preprocessing function
-   */
-  RABIT_DLL void RabitAllreduce(void *sendrecvbuf,
-                                size_t count,
-                                int enum_dtype,
-                                int enum_op,
-                                void (*prepare_fun)(void *arg),
-                                void *prepare_arg);
-
-  /*!
-   * \brief load latest check point
-   * \param out_global_model hold output of serialized global_model
-   * \param out_global_len the output length of serialized global model
-   * \param out_local_model hold output of serialized local_model, can be NULL
-   * \param out_local_len the output length of serialized local model, can be NULL
-   *
-   * \return the version number of check point loaded
-   *     if returned version == 0, this means no model has been CheckPointed
-   *     nothing will be touched
-   */
-  RABIT_DLL int RabitLoadCheckPoint(char **out_global_model,
-                                    rbt_ulong *out_global_len,
-                                    char **out_local_model,
-                                    rbt_ulong *out_local_len);
-  /*!
-   * \brief checkpoint the model, meaning we finished a stage of execution
-   *  every time we call check point, there is a version number which will increase by one
-   *
-   * \param global_model hold content of serialized global_model
-   * \param global_len the content length of serialized global model
-   * \param local_model hold content of serialized local_model, can be NULL
-   * \param local_len the content length of serialized local model, can be NULL
-   *
-   * NOTE: local_model requires explicit replication of the model for fault-tolerance, which will
-   *       bring replication cost in CheckPoint function. global_model do not need explicit replication.
-   *       So only CheckPoint with global_model if possible
-   */
-  RABIT_DLL void RabitCheckPoint(const char *global_model,
-                                 rbt_ulong global_len,
-                                 const char *local_model,
-                                 rbt_ulong local_len);
-  /*!
-   * \return version number of current stored model,
-   * which means how many calls to CheckPoint we made so far
-   */
-  RABIT_DLL int RabitVersionNumber(void);
-#ifdef __cplusplus
-}  // C
-#endif
-#endif  // RABIT_WRAPPER_H_
diff --git a/subtree/rabit/yarn/.gitignore b/subtree/rabit/yarn/.gitignore
deleted file mode 100644
index 1162c62ea..000000000
--- a/subtree/rabit/yarn/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-bin
-.classpath
-.project
-*.jar
diff --git a/subtree/rabit/yarn/README.md b/subtree/rabit/yarn/README.md
deleted file mode 100644
index a1f924fd9..000000000
--- a/subtree/rabit/yarn/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-rabit-yarn
-=====
-* This folder contains Application code to allow rabit run on Yarn.
-* You can use [../tracker/rabit_yarn.py](../tracker/rabit_yarn.py) to submit the job
-  - run ```./build.sh``` to build the jar, before using the script
diff --git a/subtree/rabit/yarn/build.sh b/subtree/rabit/yarn/build.sh
deleted file mode 100755
index 8908cafdd..000000000
--- a/subtree/rabit/yarn/build.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-if [ ! -d bin ]; then
-    mkdir bin
-fi
-
-CPATH=`${HADOOP_HOME}/bin/hadoop classpath`
-javac -cp $CPATH -d bin src/org/apache/hadoop/yarn/rabit/*
-jar cf rabit-yarn.jar -C bin . 
diff --git a/subtree/rabit/yarn/run_hdfs_prog.py b/subtree/rabit/yarn/run_hdfs_prog.py
deleted file mode 100755
index d3962bfa6..000000000
--- a/subtree/rabit/yarn/run_hdfs_prog.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python
-"""
-this script helps setup classpath env for HDFS, before running program
-that links with libhdfs
-"""
-import glob
-import sys
-import os
-import subprocess
-
-if len(sys.argv) < 2:
-    print 'Usage: the command you want to run'
-
-hadoop_home = os.getenv('HADOOP_HOME')
-hdfs_home = os.getenv('HADOOP_HDFS_HOME')
-java_home = os.getenv('JAVA_HOME')
-if hadoop_home is None:
-    hadoop_home = os.getenv('HADOOP_PREFIX')
-assert hadoop_home is not None, 'need to set HADOOP_HOME'
-assert hdfs_home is not None, 'need to set HADOOP_HDFS_HOME'
-assert java_home is not None, 'need to set JAVA_HOME'
-
-(classpath, err) = subprocess.Popen('%s/bin/hadoop classpath' % hadoop_home,
-                                    stdout=subprocess.PIPE, shell = True,
-                                    env = os.environ).communicate()
-cpath = []
-for f in classpath.split(':'):
-    cpath += glob.glob(f)
-
-lpath = []
-lpath.append('%s/lib/native' % hdfs_home)
-lpath.append('%s/jre/lib/amd64/server' % java_home) 
-
-env = os.environ.copy()
-env['CLASSPATH'] = '${CLASSPATH}:' + (':'.join(cpath))
-
-# setup hdfs options
-if 'rabit_hdfs_opts' in env:
-    env['LIBHDFS_OPTS'] = env['rabit_hdfs_opts']
-elif 'LIBHDFS_OPTS' not in env:
-    env['LIBHDFS_OPTS'] = '--Xmx128m'
-
-env['LD_LIBRARY_PATH'] = '${LD_LIBRARY_PATH}:' + (':'.join(lpath)) 
-ret = subprocess.call(args = sys.argv[1:], env = env)
-sys.exit(ret)
diff --git a/subtree/rabit/yarn/src/org/apache/hadoop/yarn/rabit/ApplicationMaster.java b/subtree/rabit/yarn/src/org/apache/hadoop/yarn/rabit/ApplicationMaster.java
deleted file mode 100644
index 47432aa26..000000000
--- a/subtree/rabit/yarn/src/org/apache/hadoop/yarn/rabit/ApplicationMaster.java
+++ /dev/null
@@ -1,570 +0,0 @@
-package org.apache.hadoop.yarn.rabit;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.List;
-import java.util.Map;
-import java.util.Queue;
-import java.util.Collection;
-import java.util.Collections;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.DataOutputBuffer;
-import org.apache.hadoop.yarn.util.ConverterUtils;
-import org.apache.hadoop.yarn.util.Records;
-import org.apache.hadoop.yarn.conf.YarnConfiguration;
-import org.apache.hadoop.yarn.api.ApplicationConstants;
-import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
-import org.apache.hadoop.yarn.api.records.Container;
-import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
-import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
-import org.apache.hadoop.yarn.api.records.ContainerState;
-import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
-import org.apache.hadoop.yarn.api.records.LocalResource;
-import org.apache.hadoop.yarn.api.records.LocalResourceType;
-import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
-import org.apache.hadoop.yarn.api.records.Priority;
-import org.apache.hadoop.yarn.api.records.Resource;
-import org.apache.hadoop.yarn.api.records.ContainerId;
-import org.apache.hadoop.yarn.api.records.ContainerStatus;
-import org.apache.hadoop.yarn.api.records.NodeReport;
-import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
-import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
-import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
-import org.apache.hadoop.security.Credentials;
-import org.apache.hadoop.security.UserGroupInformation;
-
-/**
- * application master for allocating resources of rabit client
- * 
- * @author Tianqi Chen
- */
-public class ApplicationMaster {
-    // logger
-    private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);
-    // configuration
-    private Configuration conf = new YarnConfiguration();
-    // hdfs handler
-    private FileSystem dfs;
-
-    // number of cores allocated for each task
-    private int numVCores = 1;
-    // memory needed requested for the task
-    private int numMemoryMB = 10;
-    // priority of the app master
-    private int appPriority = 0;
-    // total number of tasks
-    private int numTasks = 1;
-    // maximum number of attempts to try in each task
-    private int maxNumAttempt = 3;
-    // command to launch
-    private String command = "";
-
-    // username
-    private String userName = "";
-    // user credentials
-    private Credentials credentials = null;
-    // security tokens
-    private ByteBuffer securityTokens = null;
-    // application tracker hostname
-    private String appHostName = "";
-    // tracker URL to do
-    private String appTrackerUrl = "";
-    // tracker port
-    private int appTrackerPort = 0;
-
-    // whether we start to abort the application, due to whatever fatal reasons
-    private boolean startAbort = false;
-    // worker resources
-    private Map<String, LocalResource> workerResources = new java.util.HashMap<String, LocalResource>();
-    // record the aborting reason
-    private String abortDiagnosis = "";
-    // resource manager
-    private AMRMClientAsync<ContainerRequest> rmClient = null;
-    // node manager
-    private NMClientAsync nmClient = null;
-
-    // list of tasks that pending for resources to be allocated
-    private final Queue<TaskRecord> pendingTasks = new java.util.LinkedList<TaskRecord>();
-    // map containerId->task record of tasks that was running
-    private final Map<ContainerId, TaskRecord> runningTasks = new java.util.HashMap<ContainerId, TaskRecord>();
-    // collection of tasks
-    private final Collection<TaskRecord> finishedTasks = new java.util.LinkedList<TaskRecord>();
-    // collection of killed tasks
-    private final Collection<TaskRecord> killedTasks = new java.util.LinkedList<TaskRecord>();
-
-    public static void main(String[] args) throws Exception {
-        new ApplicationMaster().run(args);
-    }
-
-    private ApplicationMaster() throws IOException {
-        dfs = FileSystem.get(conf);
-        userName = UserGroupInformation.getCurrentUser().getShortUserName();
-        credentials = UserGroupInformation.getCurrentUser().getCredentials();
-        DataOutputBuffer buffer = new DataOutputBuffer();
-        this.credentials.writeTokenStorageToStream(buffer);
-        this.securityTokens = ByteBuffer.wrap(buffer.getData());
-    }
-    /**
-     * get integer argument from environment variable
-     * 
-     * @param name
-     *            name of key
-     * @param required
-     *            whether this is required
-     * @param defv
-     *            default value
-     * @return the requested result
-     */
-    private int getEnvInteger(String name, boolean required, int defv)
-            throws IOException {
-        String value = System.getenv(name);
-        if (value == null) {
-            if (required) {
-                throw new IOException("environment variable " + name
-                        + " not set");
-            } else {
-                return defv;
-            }
-        }
-        return Integer.valueOf(value);
-    }
-
-    /**
-     * initialize from arguments and command lines
-     * 
-     * @param args
-     */
-    private void initArgs(String args[]) throws IOException {
-        LOG.info("Start AM as user=" + this.userName);
-        // get user name
-        userName = UserGroupInformation.getCurrentUser().getShortUserName();
-        // cached maps
-        Map<String, Path> cacheFiles = new java.util.HashMap<String, Path>();
-        for (int i = 0; i < args.length; ++i) {
-            if (args[i].equals("-file")) {
-                String[] arr = args[++i].split("#");
-                Path path = new Path(arr[0]);
-                if (arr.length == 1) {
-                    cacheFiles.put(path.getName(), path);
-                } else {
-                    cacheFiles.put(arr[1], path);
-                }
-            } else {
-                this.command += args[i] + " ";
-            }
-        }
-        for (Map.Entry<String, Path> e : cacheFiles.entrySet()) {
-            LocalResource r = Records.newRecord(LocalResource.class);
-            FileStatus status = dfs.getFileStatus(e.getValue());
-            r.setResource(ConverterUtils.getYarnUrlFromPath(e.getValue()));
-            r.setSize(status.getLen());
-            r.setTimestamp(status.getModificationTime());
-            r.setType(LocalResourceType.FILE);
-            r.setVisibility(LocalResourceVisibility.APPLICATION);
-            workerResources.put(e.getKey(), r);
-        }
-        numVCores = this.getEnvInteger("rabit_cpu_vcores", true, numVCores);
-        numMemoryMB = this.getEnvInteger("rabit_memory_mb", true, numMemoryMB);
-        numTasks = this.getEnvInteger("rabit_world_size", true, numTasks);
-        maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false,
-                maxNumAttempt);
-    }
-
-    /**
-     * called to start the application
-     */
-    private void run(String args[]) throws Exception {
-        this.initArgs(args);
-        this.rmClient = AMRMClientAsync.createAMRMClientAsync(1000,
-                new RMCallbackHandler());
-        this.nmClient = NMClientAsync
-                .createNMClientAsync(new NMCallbackHandler());
-        this.rmClient.init(conf);
-        this.rmClient.start();
-        this.nmClient.init(conf);
-        this.nmClient.start();
-        RegisterApplicationMasterResponse response = this.rmClient
-                .registerApplicationMaster(this.appHostName,
-                        this.appTrackerPort, this.appTrackerUrl);
-      
-        boolean success = false;
-        String diagnostics = "";
-        try {
-            // list of tasks that waits to be submit
-            java.util.Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
-            // add waiting tasks
-            for (int i = 0; i < this.numTasks; ++i) {
-                tasks.add(new TaskRecord(i));
-            }
-            Resource maxResource = response.getMaximumResourceCapability();
-
-            if (maxResource.getMemory() < this.numMemoryMB) {
-                LOG.warn("[Rabit] memory requested exceed bound "
-                        + maxResource.getMemory());
-                this.numMemoryMB = maxResource.getMemory();
-            }
-            if (maxResource.getVirtualCores() < this.numVCores) {
-                LOG.warn("[Rabit] memory requested exceed bound "
-                        + maxResource.getVirtualCores());
-                this.numVCores = maxResource.getVirtualCores();
-            }
-            this.submitTasks(tasks);
-            LOG.info("[Rabit] ApplicationMaster started");
-            while (!this.doneAllJobs()) {
-                try {
-                    Thread.sleep(100);
-                } catch (InterruptedException e) {
-                }
-            }
-            assert (killedTasks.size() + finishedTasks.size() == numTasks);
-            success = finishedTasks.size() == numTasks;
-            LOG.info("Application completed. Stopping running containers");
-            diagnostics = "Diagnostics." + ", num_tasks" + this.numTasks
-                + ", finished=" + this.finishedTasks.size() + ", failed="
-                + this.killedTasks.size() + "\n" + this.abortDiagnosis;
-            nmClient.stop();
-            LOG.info(diagnostics);
-        } catch (Exception e) {
-            diagnostics = e.toString();
-        } 
-        rmClient.unregisterApplicationMaster(
-                success ? FinalApplicationStatus.SUCCEEDED
-                        : FinalApplicationStatus.FAILED, diagnostics,
-                appTrackerUrl);
-        if (!success)
-            throw new Exception("Application not successful");
-    }
-
-    /**
-     * check if the job finishes
-     * 
-     * @return whether we finished all the jobs
-     */
-    private synchronized boolean doneAllJobs() {
-        return pendingTasks.size() == 0 && runningTasks.size() == 0;
-    }
-
-    /**
-     * submit tasks to request containers for the tasks
-     * 
-     * @param tasks
-     *            a collection of tasks we want to ask container for
-     */
-    private synchronized void submitTasks(Collection<TaskRecord> tasks) {
-        for (TaskRecord r : tasks) {
-            Resource resource = Records.newRecord(Resource.class);
-            resource.setMemory(numMemoryMB);
-            resource.setVirtualCores(numVCores);
-            Priority priority = Records.newRecord(Priority.class);
-            priority.setPriority(this.appPriority);
-            r.containerRequest = new ContainerRequest(resource, null, null,
-                    priority);
-            rmClient.addContainerRequest(r.containerRequest);
-            pendingTasks.add(r);
-        }
-    }
-
-    /**
-     * launch the task on container
-     * 
-     * @param container
-     *            container to run the task
-     * @param task
-     *            the task
-     */
-    private void launchTask(Container container, TaskRecord task) {
-        task.container = container;
-        task.containerRequest = null;
-        ContainerLaunchContext ctx = Records
-                .newRecord(ContainerLaunchContext.class);
-        String cmd =
-        // use this to setup CLASSPATH correctly for libhdfs
-             this.command + " 1>"
-            + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
-            + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
-            + "/stderr";
-        ctx.setCommands(Collections.singletonList(cmd));
-        ctx.setTokens(this.securityTokens);
-        LOG.info(workerResources);
-        ctx.setLocalResources(this.workerResources);
-        // setup environment variables
-        Map<String, String> env = new java.util.HashMap<String, String>();
-
-        // setup class path, this is kind of duplicated, ignoring
-        StringBuilder cpath = new StringBuilder("${CLASSPATH}:./*");
-        for (String c : conf.getStrings(
-                YarnConfiguration.YARN_APPLICATION_CLASSPATH,
-                YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
-            String[] arrPath = c.split(":");
-            for (String ps : arrPath) {
-                if (ps.endsWith("*.jar") || ps.endsWith("*")) {
-                    ps = ps.substring(0, ps.lastIndexOf('*'));
-                    String prefix = ps.substring(0, ps.lastIndexOf('/'));
-                    if (ps.startsWith("$")) {
-                        String[] arr =ps.split("/", 2);
-                        if (arr.length != 2) continue;
-                        try {
-                            ps = System.getenv(arr[0].substring(1)) + '/' + arr[1];
-                        } catch (Exception e){
-                            continue;
-                        }
-                    }
-                    File dir = new File(ps);
-                    if (dir.isDirectory()) {
-                        for (File f: dir.listFiles()) {
-                            if (f.isFile() && f.getPath().endsWith(".jar")) {
-                                cpath.append(":");
-                                cpath.append(prefix + '/' + f.getName());
-                            }
-                        }
-                    }
-                } else {
-                    cpath.append(':');
-                    cpath.append(ps.trim());
-                }
-            }
-        }
-        // already use hadoop command to get class path in worker, maybe a
-        // better solution in future
-        env.put("CLASSPATH", cpath.toString());
-        //LOG.info("CLASSPATH =" + cpath.toString());
-        // setup LD_LIBARY_PATH path for libhdfs
-        env.put("LD_LIBRARY_PATH",
-                "${LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server");
-        env.put("PYTHONPATH", "${PYTHONPATH}:.");
-        // inherit all rabit variables
-        for (Map.Entry<String, String> e : System.getenv().entrySet()) {
-            if (e.getKey().startsWith("rabit_")) {
-                env.put(e.getKey(), e.getValue());
-            }
-            if (e.getKey() == "LIBHDFS_OPTS") {
-                env.put(e.getKey(), e.getValue());
-            }
-        }
-        env.put("rabit_task_id", String.valueOf(task.taskId));
-        env.put("rabit_num_trial", String.valueOf(task.attemptCounter));
-        // ctx.setUser(userName);
-        ctx.setEnvironment(env);
-        synchronized (this) {
-            assert (!this.runningTasks.containsKey(container.getId()));
-            this.runningTasks.put(container.getId(), task);
-            this.nmClient.startContainerAsync(container, ctx);
-        }
-    }
-
-    /**
-     * free the containers that have not yet been launched
-     * 
-     * @param containers
-     */
-    private synchronized void freeUnusedContainers(
-            Collection<Container> containers) {
-    }
-
-    /**
-     * handle method for AMRMClientAsync.CallbackHandler container allocation
-     * 
-     * @param containers
-     */
-    private synchronized void onContainersAllocated(List<Container> containers) {
-        if (this.startAbort) {
-            this.freeUnusedContainers(containers);
-            return;
-        }
-        Collection<Container> freelist = new java.util.LinkedList<Container>();
-        for (Container c : containers) {
-            TaskRecord task;
-            task = pendingTasks.poll();
-            if (task == null) {
-                freelist.add(c);
-                continue;
-            }
-            this.launchTask(c, task);
-        }
-        this.freeUnusedContainers(freelist);
-    }
-
-    /**
-     * start aborting the job
-     * 
-     * @param msg
-     *            the fatal message
-     */
-    private synchronized void abortJob(String msg) {
-        if (!this.startAbort)
-            this.abortDiagnosis = msg;
-        this.startAbort = true;
-        for (TaskRecord r : this.runningTasks.values()) {
-            if (!r.abortRequested) {
-                nmClient.stopContainerAsync(r.container.getId(),
-                        r.container.getNodeId());
-                r.abortRequested = true;
-            }
-        }
-        this.killedTasks.addAll(this.pendingTasks);
-        for (TaskRecord r : this.pendingTasks) {
-            rmClient.removeContainerRequest(r.containerRequest);
-        }
-        this.pendingTasks.clear();
-        LOG.info(msg);
-    }
-
-    /**
-     * handle non fatal failures
-     * 
-     * @param cid
-     */
-    private synchronized void handleFailure(Collection<ContainerId> failed) {
-        Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
-        for (ContainerId cid : failed) {
-            TaskRecord r = runningTasks.remove(cid);
-            if (r == null) {
-                continue;
-            }
-            LOG.info("Task "
-                    + r.taskId
-                    + "failed on "
-                    + r.container.getId()
-                    + ". See LOG at : "
-                    + String.format("http://%s/node/containerlogs/%s/"
-                            + userName, r.container.getNodeHttpAddress(),
-                            r.container.getId()));
-            r.attemptCounter += 1;
-            r.container = null;
-            tasks.add(r);
-            if (r.attemptCounter >= this.maxNumAttempt) {
-                this.abortJob("[Rabit] Task " + r.taskId + " failed more than "
-                        + r.attemptCounter + "times");
-            }
-        }
-        if (this.startAbort) {
-            this.killedTasks.addAll(tasks);
-        } else {
-            this.submitTasks(tasks);
-        }
-    }
-
-    /**
-     * handle method for AMRMClientAsync.CallbackHandler container allocation
-     * 
-     * @param status
-     *            list of status
-     */
-    private synchronized void onContainersCompleted(List<ContainerStatus> status) {
-        Collection<ContainerId> failed = new java.util.LinkedList<ContainerId>();
-        for (ContainerStatus s : status) {
-            assert (s.getState().equals(ContainerState.COMPLETE));
-            int exstatus = s.getExitStatus();
-            TaskRecord r = runningTasks.get(s.getContainerId());
-            if (r == null)
-                continue;
-            if (exstatus == ContainerExitStatus.SUCCESS) {
-                finishedTasks.add(r);
-                runningTasks.remove(s.getContainerId());
-            } else {
-                try {
-                    if (exstatus == ContainerExitStatus.class.getField(
-                            "KILLED_EXCEEDED_PMEM").getInt(null)) {
-                        this.abortJob("[Rabit] Task "
-                                + r.taskId
-                                + " killed because of exceeding allocated physical memory");
-                        continue;
-                    }
-                    if (exstatus == ContainerExitStatus.class.getField(
-                            "KILLED_EXCEEDED_VMEM").getInt(null)) {
-                        this.abortJob("[Rabit] Task "
-                                + r.taskId
-                                + " killed because of exceeding allocated virtual memory");
-                        continue;
-                    }
-                } catch (Exception e) {
-                }
-                LOG.info("[Rabit] Task " + r.taskId + " exited with status "
-                         + exstatus + " Diagnostics:"+ s.getDiagnostics());
-                failed.add(s.getContainerId());
-            }
-        }
-        this.handleFailure(failed);
-    }
-
-    /**
-     * callback handler for resource manager
-     */
-    private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler {
-        @Override
-        public float getProgress() {
-            return 1.0f - (float) (pendingTasks.size()) / numTasks;
-        }
-
-        @Override
-        public void onContainersAllocated(List<Container> containers) {
-            ApplicationMaster.this.onContainersAllocated(containers);
-        }
-
-        @Override
-        public void onContainersCompleted(List<ContainerStatus> status) {
-            ApplicationMaster.this.onContainersCompleted(status);
-        }
-
-        @Override
-        public void onError(Throwable ex) {
-            ApplicationMaster.this.abortJob("[Rabit] Resource manager Error "
-                    + ex.toString());
-        }
-
-        @Override
-        public void onNodesUpdated(List<NodeReport> nodereport) {
-        }
-
-        @Override
-        public void onShutdownRequest() {
-            ApplicationMaster.this
-                    .abortJob("[Rabit] Get shutdown request, start to shutdown...");
-        }
-    }
-
-    private class NMCallbackHandler implements NMClientAsync.CallbackHandler {
-        @Override
-        public void onContainerStarted(ContainerId cid,
-                Map<String, ByteBuffer> services) {
-            LOG.debug("onContainerStarted Invoked");
-        }
-
-        @Override
-        public void onContainerStatusReceived(ContainerId cid,
-                ContainerStatus status) {
-            LOG.debug("onContainerStatusReceived Invoked");
-        }
-
-        @Override
-        public void onContainerStopped(ContainerId cid) {
-            LOG.debug("onContainerStopped Invoked");
-        }
-
-        @Override
-        public void onGetContainerStatusError(ContainerId cid, Throwable ex) {
-            LOG.debug("onGetContainerStatusError Invoked: " + ex.toString());
-            ApplicationMaster.this
-                    .handleFailure(Collections.singletonList(cid));
-        }
-
-        @Override
-        public void onStartContainerError(ContainerId cid, Throwable ex) {
-            LOG.debug("onStartContainerError Invoked: " + ex.toString());
-            ApplicationMaster.this
-                    .handleFailure(Collections.singletonList(cid));
-        }
-
-        @Override
-        public void onStopContainerError(ContainerId cid, Throwable ex) {
-            LOG.info("onStopContainerError Invoked: " + ex.toString());
-        }
-    }
-}
diff --git a/subtree/rabit/yarn/src/org/apache/hadoop/yarn/rabit/Client.java b/subtree/rabit/yarn/src/org/apache/hadoop/yarn/rabit/Client.java
deleted file mode 100644
index 9dbdc2619..000000000
--- a/subtree/rabit/yarn/src/org/apache/hadoop/yarn/rabit/Client.java
+++ /dev/null
@@ -1,269 +0,0 @@
-package org.apache.hadoop.yarn.rabit;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.Map;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.permission.FsPermission;
-import org.apache.hadoop.io.DataOutputBuffer;
-import org.apache.hadoop.security.UserGroupInformation;
-import org.apache.hadoop.security.Credentials;
-import org.apache.hadoop.yarn.api.ApplicationConstants;
-import org.apache.hadoop.yarn.api.records.ApplicationId;
-import org.apache.hadoop.yarn.api.records.ApplicationReport;
-import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
-import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
-import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
-import org.apache.hadoop.yarn.api.records.LocalResource;
-import org.apache.hadoop.yarn.api.records.LocalResourceType;
-import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
-import org.apache.hadoop.yarn.api.records.Resource;
-import org.apache.hadoop.yarn.api.records.QueueInfo;
-import org.apache.hadoop.yarn.api.records.YarnApplicationState;
-import org.apache.hadoop.yarn.client.api.YarnClient;
-import org.apache.hadoop.yarn.client.api.YarnClientApplication;
-import org.apache.hadoop.yarn.conf.YarnConfiguration;
-import org.apache.hadoop.yarn.util.ConverterUtils;
-import org.apache.hadoop.yarn.util.Records;
-
-public class Client {
-    // logger
-    private static final Log LOG = LogFactory.getLog(Client.class);
-    // permission for temp file
-    private static final FsPermission permTemp = new FsPermission("777");
-    // configuration
-    private YarnConfiguration conf = new YarnConfiguration();
-    // hdfs handler
-    private FileSystem dfs;
-    // cached maps
-    private Map<String, String> cacheFiles = new java.util.HashMap<String, String>();
-    // enviroment variable to setup cachefiles
-    private String cacheFileArg = "";
-    // args to pass to application master
-    private String appArgs = "";
-    // HDFS Path to store temporal result
-    private String tempdir = "/tmp";
-    // user name
-    private String userName = "";
-    // user credentials
-    private Credentials credentials = null;
-    // job name
-    private String jobName = "";
-    // queue
-    private String queue = "default";
-    /**
-     * constructor
-     * @throws IOException
-     */
-    private Client() throws IOException {
-        conf.addResource(new Path(System.getenv("HADOOP_CONF_DIR") +"/core-site.xml"));
-        conf.addResource(new Path(System.getenv("HADOOP_CONF_DIR") +"/hdfs-site.xml"));
-        dfs = FileSystem.get(conf);
-        userName = UserGroupInformation.getCurrentUser().getShortUserName();
-        credentials = UserGroupInformation.getCurrentUser().getCredentials();
-    }
-    
-    /**
-     * setup security token given current user
-     * @return the ByeBuffer containing the security tokens
-     * @throws IOException
-     */
-    private ByteBuffer setupTokens() throws IOException {
-        DataOutputBuffer buffer = new DataOutputBuffer();
-        this.credentials.writeTokenStorageToStream(buffer);
-        return ByteBuffer.wrap(buffer.getData());
-    }
-    
-    /**
-     * setup all the cached files
-     * 
-     * @param fmaps
-     *            the file maps
-     * @return the resource map
-     * @throws IOException
-     */
-    private Map<String, LocalResource> setupCacheFiles(ApplicationId appId) throws IOException {
-        // create temporary rabit directory
-        Path tmpPath = new Path(this.tempdir);
-        if (!dfs.exists(tmpPath)) {
-            dfs.mkdirs(tmpPath, permTemp);
-            LOG.info("HDFS temp directory do not exist, creating.. " + tmpPath);
-        }
-        tmpPath = new Path(tmpPath + "/temp-rabit-yarn-" + appId);
-        if (dfs.exists(tmpPath)) {
-            dfs.delete(tmpPath, true);
-        }
-        // create temporary directory
-        FileSystem.mkdirs(dfs, tmpPath, permTemp);
-        
-        StringBuilder cstr = new StringBuilder();
-        Map<String, LocalResource> rmap = new java.util.HashMap<String, LocalResource>();
-        for (Map.Entry<String, String> e : cacheFiles.entrySet()) {
-            LocalResource r = Records.newRecord(LocalResource.class);
-            Path path = new Path(e.getValue());
-            // copy local data to temporary folder in HDFS
-            if (!e.getValue().startsWith("hdfs://")) {
-                Path dst = new Path("hdfs://" + tmpPath + "/"+  path.getName());
-                dfs.copyFromLocalFile(false, true, path, dst);
-                dfs.setPermission(dst, permTemp);
-                dfs.deleteOnExit(dst);
-                path = dst;
-            }
-            FileStatus status = dfs.getFileStatus(path);
-            r.setResource(ConverterUtils.getYarnUrlFromPath(path));
-            r.setSize(status.getLen());
-            r.setTimestamp(status.getModificationTime());
-            r.setType(LocalResourceType.FILE);
-            r.setVisibility(LocalResourceVisibility.APPLICATION);
-            rmap.put(e.getKey(), r);
-            cstr.append(" -file \"");
-            cstr.append(path.toString());
-            cstr.append('#');
-            cstr.append(e.getKey());
-            cstr.append("\"");
-        }
-        
-        dfs.deleteOnExit(tmpPath);
-        this.cacheFileArg = cstr.toString();
-        return rmap;
-    }
-
-    /**
-     * get the environment variables for container
-     * 
-     * @return the env variable for child class
-     */
-    private Map<String, String> getEnvironment() {
-        // Setup environment variables
-        Map<String, String> env = new java.util.HashMap<String, String>();
-        String cpath = "${CLASSPATH}:./*";
-        for (String c : conf.getStrings(
-                YarnConfiguration.YARN_APPLICATION_CLASSPATH,
-                YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
-            cpath += ':';
-            cpath += c.trim();
-        }
-        env.put("CLASSPATH", cpath);
-        for (Map.Entry<String, String> e : System.getenv().entrySet()) {
-            if (e.getKey().startsWith("rabit_")) {
-                env.put(e.getKey(), e.getValue());
-            }
-            if (e.getKey() == "LIBHDFS_OPTS") {
-                env.put(e.getKey(), e.getValue());
-            }
-        }
-        LOG.debug(env);
-        return env;
-    }
-
-    /**
-     * initialize the settings
-     * 
-     * @param args
-     */
-    private void initArgs(String[] args) {
-        // directly pass all arguments except args0
-        StringBuilder sargs = new StringBuilder("");
-        for (int i = 0; i < args.length; ++i) {
-            if (args[i].equals("-file")) {
-                String[] arr = args[++i].split("#");
-                if (arr.length == 1) {
-                    cacheFiles.put(new Path(arr[0]).getName(), arr[0]);
-                } else {
-                    cacheFiles.put(arr[1], arr[0]);
-                }
-            } else if(args[i].equals("-jobname")) {
-                this.jobName = args[++i];
-            } else if(args[i].equals("-tempdir")) {
-                this.tempdir = args[++i];
-            } else if(args[i].equals("-queue")) {
-                this.queue = args[++i];
-            } else {
-                sargs.append(" ");
-                sargs.append(args[i]);
-            }
-        }
-        this.appArgs = sargs.toString();
-    }
-
-    private void run(String[] args) throws Exception {
-        if (args.length == 0) {
-            System.out.println("Usage: [options] [commands..]");
-            System.out.println("options: [-file filename]");
-            return;
-        }
-        this.initArgs(args);
-        // Create yarnClient
-        YarnClient yarnClient = YarnClient.createYarnClient();
-        yarnClient.init(conf);
-        yarnClient.start();
-
-        // Create application via yarnClient
-        YarnClientApplication app = yarnClient.createApplication();
-
-        // Set up the container launch context for the application master
-        ContainerLaunchContext amContainer = Records
-                .newRecord(ContainerLaunchContext.class);
-        ApplicationSubmissionContext appContext = app
-                .getApplicationSubmissionContext();
-        // Submit application
-        ApplicationId appId = appContext.getApplicationId();
-        // setup security token
-        amContainer.setTokens(this.setupTokens());
-        // setup cache-files and environment variables
-        amContainer.setLocalResources(this.setupCacheFiles(appId));
-        amContainer.setEnvironment(this.getEnvironment());
-        String cmd = "$JAVA_HOME/bin/java"
-                + " -Xmx900M"
-                + " org.apache.hadoop.yarn.rabit.ApplicationMaster"
-                + this.cacheFileArg + ' ' + this.appArgs + " 1>"
-                + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
-                + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr";
-        LOG.debug(cmd);
-        amContainer.setCommands(Collections.singletonList(cmd));
-
-        // Set up resource type requirements for ApplicationMaster
-        Resource capability = Records.newRecord(Resource.class);
-        capability.setMemory(1024);
-        capability.setVirtualCores(1);
-        LOG.info("jobname=" + this.jobName + ",username=" + this.userName);
-        
-        appContext.setApplicationName(jobName + ":RABIT-YARN");
-        appContext.setAMContainerSpec(amContainer);
-        appContext.setResource(capability);
-        appContext.setQueue(queue);
-        //appContext.setUser(userName);
-        LOG.info("Submitting application " + appId);      
-        yarnClient.submitApplication(appContext);
-
-        ApplicationReport appReport = yarnClient.getApplicationReport(appId);
-        YarnApplicationState appState = appReport.getYarnApplicationState();
-        while (appState != YarnApplicationState.FINISHED
-                && appState != YarnApplicationState.KILLED
-                && appState != YarnApplicationState.FAILED) {
-            Thread.sleep(100);
-            appReport = yarnClient.getApplicationReport(appId);
-            appState = appReport.getYarnApplicationState();
-        }
-        
-        System.out.println("Application " + appId + " finished with"
-                + " state " + appState + " at " + appReport.getFinishTime());
-        if (!appReport.getFinalApplicationStatus().equals(
-                FinalApplicationStatus.SUCCEEDED)) {
-            System.err.println(appReport.getDiagnostics());
-            System.out.println("Available queues:");
-            for (QueueInfo q : yarnClient.getAllQueues()) {
-              System.out.println(q.getQueueName());
-            }
-        }
-    }
-
-    public static void main(String[] args) throws Exception {
-        new Client().run(args);
-    }
-}
diff --git a/subtree/rabit/yarn/src/org/apache/hadoop/yarn/rabit/TaskRecord.java b/subtree/rabit/yarn/src/org/apache/hadoop/yarn/rabit/TaskRecord.java
deleted file mode 100644
index c1b70d320..000000000
--- a/subtree/rabit/yarn/src/org/apache/hadoop/yarn/rabit/TaskRecord.java
+++ /dev/null
@@ -1,24 +0,0 @@
-package org.apache.hadoop.yarn.rabit;
-
-import org.apache.hadoop.yarn.api.records.Container;
-import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
-
-/**
- * data structure to hold the task information
- */
-public class TaskRecord {
-    // task id of the task
-    public int taskId = 0;
-    // number of failed attempts to run the task
-    public int attemptCounter = 0;
-    // container request, can be null if task is already running
-    public ContainerRequest containerRequest = null;
-    // running container, can be null if the task is not launched
-    public Container container = null;
-    // whether we have requested abortion of this task
-    public boolean abortRequested = false;
-
-    public TaskRecord(int taskId) {
-        this.taskId = taskId;
-    }
-}
diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh
new file mode 100755
index 000000000..0e8b25e8a
--- /dev/null
+++ b/tests/travis/run_test.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+#
+# Travis CI driver: runs the job selected by ${TASK}
+# (lint, python_test, r_test or java_test). ${TRAVIS_OS_NAME} is used to
+# adjust the build configuration on OS X.
+
+if [ "${TASK}" == "lint" ]; then
+    make lint || exit 1
+    echo "Check documentations..."
+    make doxygen 2>log.txt
+    # Filter out the two doxygen message kinds that are not treated as failures.
+    grep -v ENABLE_PREPROCESSING log.txt | grep -v "unsupported tag" > logclean.txt
+    echo "---------Error Log----------"
+    cat logclean.txt
+    echo "----------------------------"
+    # Any remaining line mentioning a warning or an error fails the lint job.
+    if grep warning logclean.txt; then exit 1; fi
+    if grep error logclean.txt; then exit 1; fi
+    exit 0
+fi
+
+# The remaining tasks use make/travis.mk as the build configuration.
+cp make/travis.mk config.mk
+make -f dmlc-core/scripts/packages.mk lz4
+
+
+if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
+    # Append USE_OPENMP=0 to the build configuration on OS X.
+    echo "USE_OPENMP=0" >> config.mk
+fi
+
+if [ "${TASK}" == "python_test" ]; then
+    make all || exit 1
+    echo "-------------------------------"
+    source activate python3
+    python --version
+    python -m nose tests/python || exit 1
+    source activate python2
+    echo "-------------------------------"
+    python --version
+    python -m nose tests/python || exit 1
+    exit 0
+fi
+
+if [ "${TASK}" == "r_test" ]; then
+    set -e
+    export _R_CHECK_TIMINGS_=0
+    export R_BUILD_ARGS="--no-build-vignettes --no-manual"
+    export R_CHECK_ARGS="--no-vignettes --no-manual"
+
+    # Download the r-travis helper over HTTPS, since it is executed right after.
+    curl -OL https://raw.githubusercontent.com/craigcitro/r-travis/master/scripts/travis-tool.sh
+    chmod 755 ./travis-tool.sh
+    ./travis-tool.sh bootstrap
+    make Rpack
+    cd ./xgboost
+    ../travis-tool.sh install_deps
+    ../travis-tool.sh run_tests
+    exit 0
+fi
+
+if [ "${TASK}" == "java_test" ]; then
+    set -e
+    make java
+    cd java
+    ./create_wrap.sh
+    cd xgboost4j
+    mvn clean install -DskipTests=true
+    mvn test
+fi
diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh
new file mode 100755
index 000000000..03d71c90e
--- /dev/null
+++ b/tests/travis/setup.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#
+# Travis CI setup: installs the tools needed by the job selected by ${TASK}.
+
+if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
+    brew update
+    brew install graphviz
+fi
+
+if [ "${TASK}" == "lint" ]; then
+    # --user is a bare flag; any word after it is treated as another package.
+    pip install cpplint 'pylint==1.4.4' 'astroid==1.3.6' --user
+fi
+
+
+if [ "${TASK}" == "python_test" ]; then
+    # Install Miniconda, then create one conda environment per Python version.
+    if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
+        wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+    else
+        wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+    fi
+    bash conda.sh -b -p "$HOME/miniconda"
+    export PATH="$HOME/miniconda/bin:$PATH"
+    hash -r
+    conda config --set always_yes yes --set changeps1 no
+    conda update -q conda
+    # Useful for debugging any issues with conda
+    conda info -a
+    conda create -n python3 python=3.5
+    conda create -n python2 python=2.7
+    source activate python3
+    conda install numpy scipy pandas matplotlib nose scikit-learn
+    python -m pip install graphviz
+    source activate python2
+    conda install numpy scipy pandas matplotlib nose scikit-learn
+    python -m pip install graphviz
+fi
diff --git a/tests/travis/travis_after_failure.sh b/tests/travis/travis_after_failure.sh
new file mode 100755
index 000000000..553cc979e
--- /dev/null
+++ b/tests/travis/travis_after_failure.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+#
+# For the r_test job, print the *.log and *.out files found under
+# xgboost/xgboost.Rcheck so they show up in the CI output.
+
+if [ "${TASK}" == "r_test" ]; then
+    cat xgboost/xgboost.Rcheck/*.log
+    echo "--------------------------"
+    cat xgboost/xgboost.Rcheck/*.out
+fi
diff --git a/windows/README.md b/windows/README.md
deleted file mode 100644
index 564c97d25..000000000
--- a/windows/README.md
+++ /dev/null
@@ -1,28 +0,0 @@
-The solution has been created with Visual Studio Express 2010.
-
-How to Build Windows Version
-=====
-* Open the solution file with Visual Studio
-* Select x64 and Release in build
-	- For 32bit windows or python, try win32 and Release (not fully tested)
-* Rebuild all
-
-This should give you xgboost.exe for CLI version and xgboost_wrapper.dll for python
-
-Use Python Module
-=====
-* After you build the dll, you can install the Python package from the [../python-package](../python-package) folder
-
-```
-python setup.py install
-```
-
-And import it as usual
-
-```
-import xgboost as xgb
-```
-
-R Package
-====
-* see [R-package](../R-package) instead
diff --git a/windows/xgboost.sln b/windows/xgboost.sln
deleted file mode 100644
index d94c14932..000000000
--- a/windows/xgboost.sln
+++ /dev/null
@@ -1,60 +0,0 @@
-﻿
-Microsoft Visual Studio Solution File, Format Version 11.00
-# Visual Studio 2010
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost", "xgboost\xgboost.vcxproj", "{19766C3F-7508-49D0-BAAC-0988FCC9970C}"
-	ProjectSection(ProjectDependencies) = postProject
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F} = {D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost_wrapper", "xgboost_wrapper\xgboost_wrapper.vcxproj", "{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}"
-	ProjectSection(ProjectDependencies) = postProject
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F} = {D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rabit", "..\subtree\rabit\windows\rabit\rabit.vcxproj", "{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboostjavawrapper", "xgboostjavawrapper\xgboostjavawrapper.vcxproj", "{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Win32 = Debug|Win32
-		Debug|x64 = Debug|x64
-		Release|Win32 = Release|Win32
-		Release|x64 = Release|x64
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.ActiveCfg = Debug|Win32
-		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.Build.0 = Debug|Win32
-		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.ActiveCfg = Debug|x64
-		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.ActiveCfg = Release|Win32
-		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.Build.0 = Release|Win32
-		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.ActiveCfg = Release|x64
-		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.Build.0 = Release|x64
-		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.ActiveCfg = Debug|Win32
-		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.Build.0 = Debug|Win32
-		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|x64.ActiveCfg = Debug|x64
-		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|x64.Build.0 = Debug|x64
-		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.ActiveCfg = Release|Win32
-		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.Build.0 = Release|Win32
-		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|x64.ActiveCfg = Release|x64
-		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|x64.Build.0 = Release|x64
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Debug|Win32.ActiveCfg = Debug|Win32
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Debug|Win32.Build.0 = Debug|Win32
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Debug|x64.ActiveCfg = Debug|x64
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Debug|x64.Build.0 = Debug|x64
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Release|Win32.ActiveCfg = Release|Win32
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Release|Win32.Build.0 = Release|Win32
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Release|x64.ActiveCfg = Release|x64
-		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}.Release|x64.Build.0 = Release|x64
-		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|Win32.ActiveCfg = Debug|Win32
-		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|Win32.Build.0 = Debug|Win32
-		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|x64.ActiveCfg = Debug|x64
-		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|Win32.ActiveCfg = Release|Win32
-		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|Win32.Build.0 = Release|Win32
-		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|x64.ActiveCfg = Release|x64
-		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|x64.Build.0 = Release|x64
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-EndGlobal
diff --git a/windows/xgboost/xgboost.vcxproj b/windows/xgboost/xgboost.vcxproj
deleted file mode 100644
index 8a15eaf61..000000000
--- a/windows/xgboost/xgboost.vcxproj
+++ /dev/null
@@ -1,131 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="..\..\src\gbm\gbm.cpp" />
-    <ClCompile Include="..\..\src\io\dmlc_simple.cpp" />
-    <ClCompile Include="..\..\src\io\io.cpp" />
-    <ClCompile Include="..\..\src\tree\updater.cpp" />
-    <ClCompile Include="..\..\src\xgboost_main.cpp" />
-  </ItemGroup>
-  <ItemGroup>
-    <ProjectReference Include="..\..\subtree\rabit\windows\rabit\rabit.vcxproj">
-      <Project>{d7b77d06-4f5f-4bd7-b81e-7cc8ebbe684f}</Project>
-    </ProjectReference>
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{19766C3F-7508-49D0-BAAC-0988FCC9970C}</ProjectGuid>
-    <RootNamespace>xgboost</RootNamespace>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup />
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>$(OutDir)\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <OpenMPSupport>true</OpenMPSupport>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>$(OutDir)\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <OpenMPSupport>true</OpenMPSupport>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>$(OutDir)\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-  </ImportGroup>
-</Project>
\ No newline at end of file
diff --git a/windows/xgboost4j/xgboost4j.vcxproj b/windows/xgboost4j/xgboost4j.vcxproj
deleted file mode 100644
index 060ab399a..000000000
--- a/windows/xgboost4j/xgboost4j.vcxproj
+++ /dev/null
@@ -1,130 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="..\..\java\xgboost4j_wrapper.cpp" />
-    <ClCompile Include="..\..\src\gbm\gbm.cpp" />
-    <ClCompile Include="..\..\src\io\dmlc_simple.cpp" />
-    <ClCompile Include="..\..\src\io\io.cpp" />
-    <ClCompile Include="..\..\src\tree\updater.cpp" />
-    <ClCompile Include="..\..\subtree\rabit\src\engine_empty.cc" />
-    <ClCompile Include="..\..\wrapper\xgboost_wrapper.cpp" />
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}</ProjectGuid>
-    <RootNamespace>xgboost_wrapper</RootNamespace>
-    <ProjectName>xgboost4j</ProjectName>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir>
-  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <OpenMPSupport>true</OpenMPSupport>
-      <AdditionalIncludeDirectories>$(JAVA_HOME)\include;$(JAVA_HOME)\include\win32;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <OpenMPSupport>true</OpenMPSupport>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-      <AdditionalIncludeDirectories>$(JAVA_HOME)\include\win32;$(JAVA_HOME)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>ws2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-  </ImportGroup>
-</Project>
\ No newline at end of file
diff --git a/windows/xgboost_wrapper/xgboost_wrapper.vcxproj b/windows/xgboost_wrapper/xgboost_wrapper.vcxproj
deleted file mode 100644
index cff3cde65..000000000
--- a/windows/xgboost_wrapper/xgboost_wrapper.vcxproj
+++ /dev/null
@@ -1,127 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="..\..\src\gbm\gbm.cpp" />
-    <ClCompile Include="..\..\src\io\dmlc_simple.cpp" />
-    <ClCompile Include="..\..\src\io\io.cpp" />
-    <ClCompile Include="..\..\src\tree\updater.cpp" />
-    <ClCompile Include="..\..\subtree\rabit\src\engine_empty.cc" />
-    <ClCompile Include="..\..\wrapper\xgboost_wrapper.cpp" />
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}</ProjectGuid>
-    <RootNamespace>xgboost_wrapper</RootNamespace>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup />
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>$(OutDir)\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <OpenMPSupport>true</OpenMPSupport>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>$(OutDir)\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <OpenMPSupport>true</OpenMPSupport>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>ws2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-  </ImportGroup>
-</Project>
\ No newline at end of file
diff --git a/wrapper/.gitignore b/wrapper/.gitignore
deleted file mode 100644
index 2ebc5b00b..000000000
--- a/wrapper/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-build
-dist
-*.egg*
diff --git a/wrapper/README.md b/wrapper/README.md
deleted file mode 100644
index 77316e15c..000000000
--- a/wrapper/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-XGBoost Wrappers
-================
-This folder provides wrapper to create xgboost packages to other languages.
-
-***Supported Language Packages***
-* [Python package](../python-package)
-* [R-package](../R-package)
-* [Java Package](../java)
-* [Julia Package](https://github.com/antinucleon/XGBoost.jl)
diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp
deleted file mode 100644
index 6d547fe18..000000000
--- a/wrapper/xgboost_wrapper.cpp
+++ /dev/null
@@ -1,599 +0,0 @@
-// Copyright (c) 2014 by Contributors
-// implementations in ctypes
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#include <cstdio>
-#include <vector>
-#include <string>
-#include <cstring>
-#include <cmath>
-#include <algorithm>
-#include <exception>
-// include all std functions
-using namespace std;
-#include "./xgboost_wrapper.h"
-#include "../src/data.h"
-#include "../src/learner/learner-inl.hpp"
-#include "../src/io/io.h"
-#include "../src/utils/utils.h"
-#include "../src/utils/math.h"
-#include "../src/utils/group_data.h"
-#include "../src/io/simple_dmatrix-inl.hpp"
-
-using namespace xgboost;
-using namespace xgboost::io;
-
-namespace xgboost {
-namespace wrapper {
-// booster wrapper class
-class Booster: public learner::BoostLearner {
- public:
-  explicit Booster(const std::vector<DataMatrix*>& mats) {
-    this->silent = 1;
-    this->init_model = false;
-    this->SetCacheData(mats);
-  }
-  inline const float *Pred(const DataMatrix &dmat, int option_mask,
-                           unsigned ntree_limit, bst_ulong *len) {
-    this->CheckInitModel();
-    this->Predict(dmat, (option_mask&1) != 0, &this->preds_,
-                  ntree_limit, (option_mask&2) != 0);
-    *len = static_cast<bst_ulong>(this->preds_.size());
-    return BeginPtr(this->preds_);
-  }
-  inline void BoostOneIter(const DataMatrix &train,
-                           float *grad, float *hess, bst_ulong len) {
-    this->gpair_.resize(len);
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(len);
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint j = 0; j < ndata; ++j) {
-      gpair_[j] = bst_gpair(grad[j], hess[j]);
-    }
-    gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
-  }
-  inline void CheckInitModel(void) {
-    if (!init_model) {
-      this->InitModel(); init_model = true;
-    }
-  }
-  inline void LoadModel(const char *fname) {
-    learner::BoostLearner::LoadModel(fname);
-    this->init_model = true;
-  }
-  inline void LoadModelFromBuffer(const void *buf, size_t size) {
-    utils::MemoryFixSizeBuffer fs((void*)buf, size);  // NOLINT(*)
-    learner::BoostLearner::LoadModel(fs, true);
-    this->init_model = true;
-  }
-  inline const char *GetModelRaw(bst_ulong *out_len) {
-    this->CheckInitModel();
-    model_str.resize(0);
-    utils::MemoryBufferStream fs(&model_str);
-    learner::BoostLearner::SaveModel(fs, false);
-    *out_len = static_cast<bst_ulong>(model_str.length());
-    if (*out_len == 0) {
-      return NULL;
-    } else {
-      return &model_str[0];
-    }
-  }
-  inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, bst_ulong *len) {
-    model_dump = this->DumpModel(fmap, with_stats);
-    model_dump_cptr.resize(model_dump.size());
-    for (size_t i = 0; i < model_dump.size(); ++i) {
-      model_dump_cptr[i] = model_dump[i].c_str();
-    }
-    *len = static_cast<bst_ulong>(model_dump.size());
-    return BeginPtr(model_dump_cptr);
-  }
-  // temporal fields
-  // temporal data to save evaluation dump
-  std::string eval_str;
-  // temporal data to save model dump
-  std::string model_str;
-  // temporal space to save model dump
-  std::vector<std::string> model_dump;
-  std::vector<const char*> model_dump_cptr;
-
- private:
-  bool init_model;
-};
-}  // namespace wrapper
-}  // namespace xgboost
-
-using namespace xgboost::wrapper;
-
-#ifndef XGBOOST_STRICT_CXX98_
-namespace xgboost {
-namespace wrapper {
-// helper to support threadlocal
-struct ThreadLocalStore {
-  std::vector<std::string*> data;
-  // allocate a string
-  inline std::string *Alloc() {
-    mutex.Lock();
-    data.push_back(new std::string());
-    std::string *ret = data.back();
-    mutex.Unlock();
-    return ret;
-  }
-  ThreadLocalStore() {
-    mutex.Init();
-  }
-  ~ThreadLocalStore() {
-    for (size_t i = 0; i < data.size(); ++i) {
-      delete data[i];
-    }
-    mutex.Destroy();
-  }
-  utils::Mutex mutex;
-};
-
-static ThreadLocalStore thread_local_store;
-}  // namespace wrapper
-}  // namespace xgboost
-
-/*! \brief  macro to guard beginning and end section of all functions */
-#define API_BEGIN() try {
-/*!
- * \brief every function starts with API_BEGIN(); and finishes with API_END();
- * \param Finalize optionally put in a finalizer
- */
-#define API_END_FINALIZE(Finalize) } catch(std::exception &e) {  \
-    Finalize; return XGBHandleException(e);             \
-  } return 0;
-/*! \brief API End with no finalization */
-#define API_END() API_END_FINALIZE(;)
-
-// do not use threadlocal on OSX since it is not always available
-#ifndef DISABLE_THREAD_LOCAL
-#ifdef __GNUC__
-  #define XGB_TREAD_LOCAL __thread
-#elif __STDC_VERSION__ >= 201112L
-  #define XGB_TREAD_LOCAL _Thread_local
-#elif defined(_MSC_VER)
-  #define XGB_TREAD_LOCAL __declspec(thread)
-#endif
-#endif
-
-#ifndef XGB_TREAD_LOCAL
-#pragma message("Warning: Threadlocal not enabled, used single thread error handling")
-#define XGB_TREAD_LOCAL
-#endif
-
-/*!
- * \brief a helper function for error handling
- *  will set the last error to be str_set when it is not NULL
- * \param str_set the error to set
- * \return a pointer message to last error
- */
-const char *XGBSetGetLastError_(const char *str_set) {
-  // use last_error to record last error
-  static XGB_TREAD_LOCAL std::string *last_error = NULL;
-  if (last_error == NULL) {
-    last_error = thread_local_store.Alloc();
-  }
-  if (str_set != NULL) {
-    *last_error = str_set;
-  }
-  return last_error->c_str();
-}
-#else
-// crippled implementation for solaris case
-// exception handling is not needed for R, so it is OK.
-#define API_BEGIN()
-#define API_END_FINALIZE(Finalize) return 0
-#define API_END() return 0
-
-const char *XGBSetGetLastError_(const char *str_set) {
-  return NULL;
-}
-#endif  // XGBOOST_STRICT_CXX98_
-
-/*! \brief return str message of the last error */
-const char *XGBGetLastError() {
-  return XGBSetGetLastError_(NULL);
-}
-
-/*!
- * \brief handle exception throwed out
- * \param e the exception
- * \return the return value of API after exception is handled
- */
-int XGBHandleException(const std::exception &e) {
-  XGBSetGetLastError_(e.what());
-  return -1;
-}
-
-int XGDMatrixCreateFromFile(const char *fname,
-                            int silent,
-                            DMatrixHandle *out) {
-  API_BEGIN();
-  *out = LoadDataMatrix(fname, silent != 0, false, false);
-  API_END();
-}
-
-int XGDMatrixCreateFromCSR(const bst_ulong *indptr,
-                           const unsigned *indices,
-                           const float *data,
-                           bst_ulong nindptr,
-                           bst_ulong nelem,
-                           DMatrixHandle *out) {
-  DMatrixSimple *p_mat = NULL;
-  API_BEGIN();
-  p_mat = new DMatrixSimple();
-  DMatrixSimple &mat = *p_mat;
-  mat.row_ptr_.resize(nindptr);
-  for (bst_ulong i = 0; i < nindptr; ++i) {
-    mat.row_ptr_[i] = static_cast<size_t>(indptr[i]);
-  }
-  mat.row_data_.resize(nelem);
-  for (bst_ulong i = 0; i < nelem; ++i) {
-    mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
-    mat.info.info.num_col = std::max(mat.info.info.num_col,
-                                     static_cast<size_t>(indices[i]+1));
-  }
-  mat.info.info.num_row = nindptr - 1;
-  *out = p_mat;
-  API_END_FINALIZE(delete p_mat);
-}
-
-int XGDMatrixCreateFromCSC(const bst_ulong *col_ptr,
-                           const unsigned *indices,
-                           const float *data,
-                           bst_ulong nindptr,
-                           bst_ulong nelem,
-                           DMatrixHandle *out) {
-  DMatrixSimple *p_mat = NULL;
-  API_BEGIN();
-  int nthread;
-  #pragma omp parallel
-  {
-    nthread = omp_get_num_threads();
-  }
-  p_mat = new DMatrixSimple();
-  DMatrixSimple &mat = *p_mat;
-  utils::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_);
-  builder.InitBudget(0, nthread);
-  long ncol = static_cast<long>(nindptr - 1);  // NOLINT(*)
-  #pragma omp parallel for schedule(static)
-  for (long i = 0; i < ncol; ++i) {  // NOLINT(*)
-    int tid = omp_get_thread_num();
-    for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
-      builder.AddBudget(indices[j], tid);
-    }
-  }
-  builder.InitStorage();
-  #pragma omp parallel for schedule(static)
-  for (long i = 0; i < ncol; ++i) {  // NOLINT(*)
-    int tid = omp_get_thread_num();
-    for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
-      builder.Push(indices[j],
-                   RowBatch::Entry(static_cast<bst_uint>(i), data[j]),
-                   tid);
-    }
-  }
-  mat.info.info.num_row = mat.row_ptr_.size() - 1;
-  mat.info.info.num_col = static_cast<size_t>(ncol);
-  *out = p_mat;
-  API_END_FINALIZE(delete p_mat);
-}
-
-int XGDMatrixCreateFromMat(const float *data,
-                           bst_ulong nrow,
-                           bst_ulong ncol,
-                           float  missing,
-                           DMatrixHandle *out) {
-  DMatrixSimple *p_mat = NULL;
-  API_BEGIN();
-  p_mat = new DMatrixSimple();
-  bool nan_missing = utils::CheckNAN(missing);
-  DMatrixSimple &mat = *p_mat;
-  mat.info.info.num_row = nrow;
-  mat.info.info.num_col = ncol;
-  for (bst_ulong i = 0; i < nrow; ++i, data += ncol) {
-    bst_ulong nelem = 0;
-    for (bst_ulong j = 0; j < ncol; ++j) {
-      if (utils::CheckNAN(data[j])) {
-        utils::Check(nan_missing,
-                     "There are NAN in the matrix, however, you did not set missing=NAN");
-      } else {
-        if (nan_missing || data[j] != missing) {
-          mat.row_data_.push_back(RowBatch::Entry(j, data[j]));
-          ++nelem;
-        }
-      }
-    }
-    mat.row_ptr_.push_back(mat.row_ptr_.back() + nelem);
-  }
-  *out = p_mat;
-  API_END_FINALIZE(delete p_mat);
-}
-
-int XGDMatrixSliceDMatrix(DMatrixHandle handle,
-                          const int *idxset,
-                          bst_ulong len,
-                          DMatrixHandle *out) {
-  DMatrixSimple *p_ret = NULL;
-  API_BEGIN();
-  DMatrixSimple tmp;
-  DataMatrix &dsrc = *static_cast<DataMatrix*>(handle);
-  if (dsrc.magic != DMatrixSimple::kMagic) {
-    tmp.CopyFrom(dsrc);
-  }
-  DataMatrix &src = (dsrc.magic == DMatrixSimple::kMagic ?
-                     *static_cast<DMatrixSimple*>(handle): tmp);
-  p_ret = new DMatrixSimple();
-  DMatrixSimple &ret = *p_ret;
-
-  utils::Check(src.info.group_ptr.size() == 0,
-               "slice does not support group structure");
-  ret.Clear();
-  ret.info.info.num_row = len;
-  ret.info.info.num_col = src.info.num_col();
-
-  utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
-  iter->BeforeFirst();
-  utils::Assert(iter->Next(), "slice");
-  const RowBatch &batch = iter->Value();
-  for (bst_ulong i = 0; i < len; ++i) {
-    const int ridx = idxset[i];
-    RowBatch::Inst inst = batch[ridx];
-    utils::Check(static_cast<bst_ulong>(ridx) < batch.size, "slice index exceed number of rows");
-    ret.row_data_.resize(ret.row_data_.size() + inst.length);
-    memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data,
-           sizeof(RowBatch::Entry) * inst.length);
-    ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length);
-    if (src.info.labels.size() != 0) {
-      ret.info.labels.push_back(src.info.labels[ridx]);
-    }
-    if (src.info.weights.size() != 0) {
-      ret.info.weights.push_back(src.info.weights[ridx]);
-    }
-    if (src.info.info.root_index.size() != 0) {
-      ret.info.info.root_index.push_back(src.info.info.root_index[ridx]);
-    }
-    if (src.info.info.fold_index.size() != 0) {
-      ret.info.info.fold_index.push_back(src.info.info.fold_index[ridx]);
-    }
-  }
-  *out = p_ret;
-  API_END_FINALIZE(delete p_ret);
-}
-
-int XGDMatrixFree(DMatrixHandle handle) {
-  API_BEGIN();
-  delete static_cast<DataMatrix*>(handle);
-  API_END();
-}
-
-int XGDMatrixSaveBinary(DMatrixHandle handle,
-                        const char *fname,
-                        int silent) {
-  API_BEGIN();
-  SaveDataMatrix(*static_cast<DataMatrix*>(handle), fname, silent != 0);
-  API_END();
-}
-
-int XGDMatrixSetFloatInfo(DMatrixHandle handle,
-                          const char *field,
-                          const float *info,
-                          bst_ulong len) {
-  API_BEGIN();
-  std::vector<float> &vec =
-      static_cast<DataMatrix*>(handle)->info.GetFloatInfo(field);
-  vec.resize(len);
-  memcpy(BeginPtr(vec), info, sizeof(float) * len);
-  API_END();
-}
-
-int XGDMatrixSetUIntInfo(DMatrixHandle handle,
-                         const char *field,
-                         const unsigned *info,
-                         bst_ulong len) {
-  API_BEGIN();
-  std::vector<unsigned> &vec =
-      static_cast<DataMatrix*>(handle)->info.GetUIntInfo(field);
-  vec.resize(len);
-  memcpy(BeginPtr(vec), info, sizeof(unsigned) * len);
-  API_END();
-}
-
-int XGDMatrixSetGroup(DMatrixHandle handle,
-                      const unsigned *group,
-                      bst_ulong len) {
-  API_BEGIN();
-  DataMatrix *pmat = static_cast<DataMatrix*>(handle);
-  pmat->info.group_ptr.resize(len + 1);
-  pmat->info.group_ptr[0] = 0;
-  for (uint64_t i = 0; i < len; ++i) {
-    pmat->info.group_ptr[i+1] = pmat->info.group_ptr[i] + group[i];
-  }
-  API_END();
-}
-
-int XGDMatrixGetFloatInfo(const DMatrixHandle handle,
-                          const char *field,
-                          bst_ulong *out_len,
-                          const float **out_dptr) {
-  API_BEGIN();
-  const std::vector<float> &vec =
-      static_cast<const DataMatrix*>(handle)->info.GetFloatInfo(field);
-  *out_len = static_cast<bst_ulong>(vec.size());
-  *out_dptr = BeginPtr(vec);
-  API_END();
-}
-
-int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
-                         const char *field,
-                         bst_ulong *out_len,
-                         const unsigned **out_dptr) {
-  API_BEGIN();
-  const std::vector<unsigned> &vec =
-      static_cast<const DataMatrix*>(handle)->info.GetUIntInfo(field);
-  *out_len = static_cast<bst_ulong>(vec.size());
-  *out_dptr = BeginPtr(vec);
-  API_END();
-}
-
-int XGDMatrixNumRow(const DMatrixHandle handle,
-                    bst_ulong *out) {
-  API_BEGIN();
-  *out = static_cast<bst_ulong>(static_cast<const DataMatrix*>(handle)->info.num_row());
-  API_END();
-}
-
-int XGDMatrixNumCol(const DMatrixHandle handle,
-                    bst_ulong *out) {
-  API_BEGIN();
-  *out = static_cast<size_t>(static_cast<const DataMatrix*>(handle)->info.num_col());
-  API_END();
-}
-
-// xgboost implementation
-int XGBoosterCreate(DMatrixHandle dmats[],
-                    bst_ulong len,
-                    BoosterHandle *out) {
-  API_BEGIN();
-  std::vector<DataMatrix*> mats;
-  for (bst_ulong i = 0; i < len; ++i) {
-    DataMatrix *dtr = static_cast<DataMatrix*>(dmats[i]);
-    mats.push_back(dtr);
-  }
-  *out = new Booster(mats);
-  API_END();
-}
-
-int XGBoosterFree(BoosterHandle handle) {
-  API_BEGIN();
-  delete static_cast<Booster*>(handle);
-  API_END();
-}
-
-int XGBoosterSetParam(BoosterHandle handle,
-                      const char *name, const char *value) {
-  API_BEGIN();
-  static_cast<Booster*>(handle)->SetParam(name, value);
-  API_END();
-}
-
-int XGBoosterUpdateOneIter(BoosterHandle handle,
-                           int iter,
-                           DMatrixHandle dtrain) {
-  API_BEGIN();
-  Booster *bst = static_cast<Booster*>(handle);
-  DataMatrix *dtr = static_cast<DataMatrix*>(dtrain);
-  bst->CheckInitModel();
-  bst->CheckInit(dtr);
-  bst->UpdateOneIter(iter, *dtr);
-  API_END();
-}
-
-int XGBoosterBoostOneIter(BoosterHandle handle,
-                          DMatrixHandle dtrain,
-                          float *grad,
-                          float *hess,
-                          bst_ulong len) {
-  API_BEGIN();
-  Booster *bst = static_cast<Booster*>(handle);
-  DataMatrix *dtr = static_cast<DataMatrix*>(dtrain);
-  bst->CheckInitModel();
-  bst->CheckInit(dtr);
-  bst->BoostOneIter(*dtr, grad, hess, len);
-  API_END();
-}
-
-int XGBoosterEvalOneIter(BoosterHandle handle,
-                         int iter,
-                         DMatrixHandle dmats[],
-                         const char *evnames[],
-                         bst_ulong len,
-                         const char **out_str) {
-  API_BEGIN();
-  Booster *bst = static_cast<Booster*>(handle);
-  std::vector<std::string> names;
-  std::vector<const DataMatrix*> mats;
-  for (bst_ulong i = 0; i < len; ++i) {
-    mats.push_back(static_cast<DataMatrix*>(dmats[i]));
-    names.push_back(std::string(evnames[i]));
-  }
-  bst->CheckInitModel();
-  bst->eval_str = bst->EvalOneIter(iter, mats, names);
-  *out_str = bst->eval_str.c_str();
-  API_END();
-}
-
-int XGBoosterPredict(BoosterHandle handle,
-                     DMatrixHandle dmat,
-                     int option_mask,
-                     unsigned ntree_limit,
-                     bst_ulong *len,
-                     const float **out_result) {
-  API_BEGIN();
-  *out_result = static_cast<Booster*>(handle)->
-      Pred(*static_cast<DataMatrix*>(dmat),
-           option_mask, ntree_limit, len);
-  API_END();
-}
-
-int XGBoosterLoadModel(BoosterHandle handle, const char *fname) {
-  API_BEGIN();
-  static_cast<Booster*>(handle)->LoadModel(fname);
-  API_END();
-}
-
-int XGBoosterSaveModel(BoosterHandle handle, const char *fname) {
-  API_BEGIN();
-  Booster *bst = static_cast<Booster*>(handle);
-  bst->CheckInitModel();
-  bst->SaveModel(fname, false);
-  API_END();
-}
-
-int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
-                                 const void *buf,
-                                 bst_ulong len) {
-  API_BEGIN();
-  static_cast<Booster*>(handle)->LoadModelFromBuffer(buf, len);
-  API_END();
-}
-
-int XGBoosterGetModelRaw(BoosterHandle handle,
-                         bst_ulong *out_len,
-                         const char **out_dptr) {
-  API_BEGIN();
-  *out_dptr = static_cast<Booster*>(handle)->GetModelRaw(out_len);
-  API_END();
-}
-
-int XGBoosterDumpModel(BoosterHandle handle,
-                       const char *fmap,
-                       int with_stats,
-                       bst_ulong *len,
-                       const char ***out_models) {
-  API_BEGIN();
-  utils::FeatMap featmap;
-  if (strlen(fmap) != 0) {
-    featmap.LoadText(fmap);
-  }
-  *out_models = static_cast<Booster*>(handle)->GetModelDump(
-      featmap, with_stats != 0, len);
-  API_END();
-}
-
-int XGBoosterDumpModelWithFeatures(BoosterHandle handle,
-                                   int fnum,
-                                   const char **fname,
-                                   const char **ftype,
-                                   int with_stats,
-                                   bst_ulong *len,
-                                   const char ***out_models) {
-  API_BEGIN();
-  utils::FeatMap featmap;
-  for (int i = 0; i < fnum; ++i) {
-      featmap.PushBack(i, fname[i], ftype[i]);
-  }
-  *out_models = static_cast<Booster*>(handle)->GetModelDump(
-      featmap, with_stats != 0, len);
-  API_END();
-}