Release version 0.71 (#3200 )

[jvm-packages] add back libsvm notes (#3232 )
* add back train method but mark as deprecated * add back train method but mark as deprecated * fix scalastyle error * fix scalastyle error * add back libsvm notes
2018-04-11 21:43:32 +09:00 · 2018-04-10 09:00:58 -07:00 · 2018-04-04 15:08:22 +12:00 · 2018-04-04 14:21:48 +12:00 · 2018-03-28 10:32:52 -07:00 · 2018-03-28 10:05:47 -07:00
423 changed files with 33085 additions and 4914 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -15,7 +15,7 @@
 *.Rcheck
 *.rds
 *.tar.gz
-*txt*
+#*txt*
 *conf
 *buffer
 *model
@@ -79,3 +79,18 @@ tags
 *.class
 target
 *.swp
+
+# cpp tests and gcov generated files
+*.gcov
+*.gcda
+*.gcno
+build_tests
+/tests/cpp/xgboost_test
+
+.DS_Store
+lib/
+
+# spark
+metastore_db
+
+plugin/updater_gpu/test/cpp/data
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,9 @@
 [submodule "rabit"]
 	path = rabit
 	url = https://github.com/dmlc/rabit
+[submodule "nccl"]
+	path = nccl
+	url = https://github.com/dmlc/nccl
+[submodule "cub"]
+	path = cub
+	url = https://github.com/NVlabs/cub
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,11 +1,15 @@
 # disable sudo for container build.
-sudo: false
+sudo: required

 # Enabling test on Linux and OS X
 os:
  - linux
  - osx

+osx_image: xcode8
+
+group: deprecated-2017Q4
+
 # Use Build Matrix to do lint and build seperately
 env:
  matrix:
@@ -20,36 +24,43 @@ env:
    - TASK=java_test
    # cmake test
    - TASK=cmake_test
-
-os:
-  - linux
-  - osx
+    # c++ test
+    - TASK=cpp_test

 matrix:
  exclude:
    - os: osx
      env: TASK=lint
+    - os: osx
+      env: TASK=cmake_test
    - os: linux
      env: TASK=r_test
-    - os: osx
-      env: TASK=java_test
    - os: osx
      env: TASK=python_lightweight_test
+    - os: osx
+      env: TASK=cpp_test

 # dependent apt packages
 addons:
  apt:
+    sources:
+      - ubuntu-toolchain-r-test
+      - george-edison55-precise-backports
    packages:
+      - cmake
+      - cmake-data
      - doxygen
      - wget
      - libcurl4-openssl-dev
      - unzip
      - graphviz
+      - gcc-4.8
+      - g++-4.8

 before_install:
  - source dmlc-core/scripts/travis/travis_setup_env.sh
  - export PYTHONPATH=${PYTHONPATH}:${PWD}/python-package
-  - echo "MAVEN_OPTS='-Xmx2048m -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=512m'" > ~/.mavenrc 
+  - echo "MAVEN_OPTS='-Xmx2g -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=512m -Dorg.slf4j.simpleLogger.defaultLogLevel=error'" > ~/.mavenrc

 install:
  - source tests/travis/setup.sh
@@ -68,6 +79,10 @@ before_cache:
 after_failure:
  - tests/travis/travis_after_failure.sh

+after_success:
+  - tree build
+  - bash <(curl -s https://codecov.io/bash) -a '-o src/ src/*.c'
+
 notifications:
  email:
    on_success: change
--- a/18
+++ b/18
@@ -0,0 +1,18 @@
+@inproceedings{Chen:2016:XST:2939672.2939785,
+ author = {Chen, Tianqi and Guestrin, Carlos},
+ title = {{XGBoost}: A Scalable Tree Boosting System},
+ booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
+ series = {KDD '16},
+ year = {2016},
+ isbn = {978-1-4503-4232-2},
+ location = {San Francisco, California, USA},
+ pages = {785--794},
+ numpages = {10},
+ url = {http://doi.acm.org/10.1145/2939672.2939785},
+ doi = {10.1145/2939672.2939785},
+ acmid = {2939785},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {large-scale machine learning},
+}
+
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,79 +1,250 @@
-cmake_minimum_required (VERSION 2.6)
-project (xgboost)
+cmake_minimum_required (VERSION 3.2)
+project(xgboost)
+include(cmake/Utils.cmake)
+list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/modules")
 find_package(OpenMP)

-set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} -fPIC")
+set_default_configuration_release()
+msvc_use_static_runtime()

-# Make sure we are using C++11
-# Visual Studio 12.0 and newer supports enough c++11 to make this work
-if(MSVC AND MSVC_VERSION LESS 1800)
-    message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.")
+# Options
+option(USE_CUDA  "Build with GPU acceleration") 
+option(USE_AVX  "Build with AVX instructions. May not produce identical results due to approximate math." OFF) 
+option(USE_NCCL "Build using NCCL for multi-GPU. Also requires USE_CUDA") 
+option(JVM_BINDINGS "Build JVM bindings" OFF)
+option(GOOGLE_TEST "Build google tests" OFF)
+option(R_LIB "Build shared library for R package" OFF)
+set(GPU_COMPUTE_VER 35;50;52;60;61 CACHE STRING
+  "Semicolon separated list of compute versions to be built against")
+
+# Deprecation warning
+if(PLUGIN_UPDATER_GPU)
+  set(USE_CUDA ON)
+  message(WARNING "The option 'PLUGIN_UPDATER_GPU' is deprecated. Set 'USE_CUDA' instead.")
+endif()
+
+# Compiler flags
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+if(OpenMP_CXX_FOUND OR OPENMP_FOUND)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+if(MSVC)
+  # Multithreaded compilation
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
 else()
-  # GCC 4.6 with c++0x supports enough to make this work
-  include(CheckCXXCompilerFlag)
-  CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11)
-  CHECK_CXX_COMPILER_FLAG("-std=c++0x" COMPILER_SUPPORTS_CXX0X)
+  # Correct error for GCC 5 and cuda
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
+  # Performance
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funroll-loops")
+endif()

-  if(COMPILER_SUPPORTS_CXX11)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-  elseif(COMPILER_SUPPORTS_CXX0X)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x")
+# AVX
+if(USE_AVX)
+  if(MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
  else()
-    message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
  endif()
+  add_definitions(-DXGBOOST_USE_AVX)
 endif()


-#Make sure we are using the static runtime
-if(MSVC)
-	set(variables
-		CMAKE_C_FLAGS_DEBUG
-		CMAKE_C_FLAGS_MINSIZEREL
-		CMAKE_C_FLAGS_RELEASE
-		CMAKE_C_FLAGS_RELWITHDEBINFO
-		CMAKE_CXX_FLAGS_DEBUG
-		CMAKE_CXX_FLAGS_MINSIZEREL
-		CMAKE_CXX_FLAGS_RELEASE
-		CMAKE_CXX_FLAGS_RELWITHDEBINFO
-	)
-	foreach(variable ${variables})
-		if(${variable} MATCHES "/MD")
-			string(REGEX REPLACE "/MD" "/MT" ${variable} "${${variable}}")
-		endif()
-	endforeach()
+# compiled code customizations for R package
+if(R_LIB)
+  add_definitions(
+    -DXGBOOST_STRICT_R_MODE=1
+    -DXGBOOST_CUSTOMIZE_GLOBAL_PRNG=1
+    -DDMLC_LOG_BEFORE_THROW=0
+    -DDMLC_DISABLE_STDIN=1
+    -DDMLC_LOG_CUSTOMIZE=1
+    -DRABIT_CUSTOMIZE_MSG_
+    -DRABIT_STRICT_CXX98_
+  )
 endif()

 include_directories (
-	${PROJECT_SOURCE_DIR}/include 
-	${PROJECT_SOURCE_DIR}/dmlc-core/include
-	${PROJECT_SOURCE_DIR}/rabit/include 
-	)
-
-file(GLOB SOURCES 
-	src/c_api/*.cc
-	src/common/*.cc
-	src/data/*.cc
-	src/gbm/*.cc
-	src/metric/*.cc
-	src/objective/*.cc
-	src/tree/*.cc
-	src/*.cc
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_SOURCE_DIR}/dmlc-core/include
+    ${PROJECT_SOURCE_DIR}/rabit/include
 )

+file(GLOB_RECURSE SOURCES 
+    src/*.cc
+    src/*.h
+    include/*.h
+)
+
+# Only add main function for executable target
+list(REMOVE_ITEM SOURCES ${PROJECT_SOURCE_DIR}/src/cli_main.cc)
+
+file(GLOB_RECURSE CUDA_SOURCES
+    src/*.cu
+    src/*.cuh
+)
+
+# rabit
+# TODO: Create rabit cmakelists.txt
 set(RABIT_SOURCES
-	rabit/src/allreduce_base.cc
-	rabit/src/allreduce_robust.cc
-	rabit/src/engine.cc
-	rabit/src/c_api.cc
+    rabit/src/allreduce_base.cc
+    rabit/src/allreduce_robust.cc
+    rabit/src/engine.cc
+    rabit/src/c_api.cc
 )
+set(RABIT_EMPTY_SOURCES
+    rabit/src/engine_empty.cc
+    rabit/src/c_api.cc
+)
+if(MINGW OR R_LIB)
+  # build a dummy rabit library
+  add_library(rabit STATIC ${RABIT_EMPTY_SOURCES})
+else()
+  add_library(rabit STATIC ${RABIT_SOURCES})
+endif()


-add_subdirectory(dmlc-core) 
+# dmlc-core
+add_subdirectory(dmlc-core)
+set(LINK_LIBRARIES dmlccore rabit)

-add_library(rabit STATIC ${RABIT_SOURCES})

-add_executable(xgboost ${SOURCES})
-add_library(libxgboost SHARED ${SOURCES})
+if(USE_CUDA)
+  find_package(CUDA 8.0 REQUIRED)
+  cmake_minimum_required(VERSION 3.5)

-target_link_libraries(xgboost dmlccore rabit)
-target_link_libraries(libxgboost dmlccore rabit)
+  add_definitions(-DXGBOOST_USE_CUDA)
+  
+  include_directories(cub)
+
+  if(USE_NCCL)
+    include_directories(nccl/src)
+    add_definitions(-DXGBOOST_USE_NCCL)
+  endif()
+
+  if((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9))
+    message("CUDA 9.0 detected, adding Volta compute capability (7.0).")
+    set(GPU_COMPUTE_VER "${GPU_COMPUTE_VER};70")
+  endif()
+  
+  set(GENCODE_FLAGS "")
+  format_gencode_flags("${GPU_COMPUTE_VER}" GENCODE_FLAGS)
+  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--expt-extended-lambda;--expt-relaxed-constexpr;${GENCODE_FLAGS};-lineinfo;")
+  if(NOT MSVC)
+    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -std=c++11")
+  endif()
+
+  if(USE_NCCL)
+    add_subdirectory(nccl)
+  endif()
+
+  cuda_add_library(gpuxgboost ${CUDA_SOURCES} STATIC)
+  
+  if(USE_NCCL)
+    target_link_libraries(gpuxgboost nccl)
+  endif()
+  list(APPEND LINK_LIBRARIES gpuxgboost) 
+endif()
+
+
+# flags and sources for R-package
+if(R_LIB)
+  file(GLOB_RECURSE R_SOURCES
+    R-package/src/*.h
+    R-package/src/*.c
+    R-package/src/*.cc
+  )
+  list(APPEND SOURCES ${R_SOURCES})
+endif()
+
+add_library(objxgboost OBJECT ${SOURCES})
+
+
+# building shared library for R package
+if(R_LIB)
+  find_package(LibR REQUIRED)
+
+  list(APPEND LINK_LIBRARIES "${LIBR_CORE_LIBRARY}")
+  MESSAGE(STATUS "LIBR_CORE_LIBRARY " ${LIBR_CORE_LIBRARY})
+
+  include_directories(
+    "${LIBR_INCLUDE_DIRS}"
+    "${PROJECT_SOURCE_DIR}"
+  )
+
+  # Shared library target for the R package
+  add_library(xgboost SHARED $<TARGET_OBJECTS:objxgboost>)
+  target_link_libraries(xgboost ${LINK_LIBRARIES})
+  # R uses no lib prefix in shared library names of its packages
+  set_target_properties(xgboost PROPERTIES PREFIX "")
+
+  setup_rpackage_install_target(xgboost ${CMAKE_CURRENT_BINARY_DIR})
+  # use a dummy location for any other remaining installs
+  set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/dummy_inst")
+
+# main targets: shared library & exe
+else()
+  # Executable
+  add_executable(runxgboost $<TARGET_OBJECTS:objxgboost> src/cli_main.cc)
+  set_target_properties(runxgboost PROPERTIES
+    OUTPUT_NAME xgboost
+  )
+  set_output_directory(runxgboost ${PROJECT_SOURCE_DIR})
+  target_link_libraries(runxgboost ${LINK_LIBRARIES})
+
+  # Shared library
+  add_library(xgboost SHARED $<TARGET_OBJECTS:objxgboost>)
+  target_link_libraries(xgboost ${LINK_LIBRARIES})
+  set_output_directory(xgboost ${PROJECT_SOURCE_DIR}/lib)
+  if(MINGW)
+    # remove the 'lib' prefix to conform to windows convention for shared library names
+    set_target_properties(xgboost PROPERTIES PREFIX "")
+  endif()
+
+  #Ensure these two targets do not build simultaneously, as they produce outputs with conflicting names
+  add_dependencies(xgboost runxgboost)
+endif()
+
+
+# JVM
+if(JVM_BINDINGS)
+    find_package(JNI QUIET REQUIRED)
+
+    include_directories(${JNI_INCLUDE_DIRS} jvm-packages/xgboost4j/src/native)
+
+    add_library(xgboost4j SHARED
+        $<TARGET_OBJECTS:objxgboost>
+        jvm-packages/xgboost4j/src/native/xgboost4j.cpp)
+    set_output_directory(xgboost4j ${PROJECT_SOURCE_DIR}/lib)
+    target_link_libraries(xgboost4j
+        ${LINK_LIBRARIES}
+        ${JAVA_JVM_LIBRARY})
+endif()
+
+
+# Test
+if(GOOGLE_TEST)
+  find_package(GTest REQUIRED)
+  enable_testing()
+
+  file(GLOB_RECURSE TEST_SOURCES "tests/cpp/*.cc")
+  auto_source_group("${TEST_SOURCES}")
+  include_directories(${GTEST_INCLUDE_DIRS})
+
+  if(USE_CUDA)
+    file(GLOB_RECURSE CUDA_TEST_SOURCES "tests/cpp/*.cu")
+    cuda_compile(CUDA_TEST_OBJS ${CUDA_TEST_SOURCES})
+  else()
+    set(CUDA_TEST_OBJS "")
+  endif()
+
+  add_executable(testxgboost ${TEST_SOURCES} ${CUDA_TEST_OBJS} $<TARGET_OBJECTS:objxgboost>)
+  set_output_directory(testxgboost ${PROJECT_SOURCE_DIR})
+  target_link_libraries(testxgboost ${GTEST_LIBRARIES} ${LINK_LIBRARIES})
+
+  add_test(TestXGBoost testxgboost)
+endif()
+
+
+# Group sources
+auto_source_group("${SOURCES}")
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -2,28 +2,34 @@ Contributors of DMLC/XGBoost
 ============================
 XGBoost has been developed and used by a group of active community. Everyone is more than welcomed to is a great way to make the project better and more accessible to more users.

-Comitters
---------
+Committers
+----------
 Committers are people who have made substantial contribution to the project and granted write access to the project.
 * [Tianqi Chen](https://github.com/tqchen), University of Washington
  - Tianqi is a PhD working on large-scale machine learning, he is the creator of the project.
 * [Tong He](https://github.com/hetong007), Simon Fraser University
  - Tong is a master student working on data mining, he is the maintainer of xgboost R package.
+* [Vadim Khotilovich](https://github.com/khotilov)
+  - Vadim contributes many improvements in R and core packages.
 * [Bing Xu](https://github.com/antinucleon)
  - Bing is the original creator of xgboost python package and currently the maintainer of [XGBoost.jl](https://github.com/antinucleon/XGBoost.jl).
 * [Michael Benesty](https://github.com/pommedeterresautee)
  - Micheal is a lawyer, data scientist in France, he is the creator of xgboost interactive analysis module in R.
 * [Yuan Tang](https://github.com/terrytangyuan)
-  - Yuan is a data scientist in Chicago, US. He contributed mostly in R and Python packages. 
+  - Yuan is a data scientist in Chicago, US. He contributed mostly in R and Python packages.
+* [Nan Zhu](https://github.com/CodingCat)
+  - Nan is a software engineer in Microsoft. He contributed mostly in JVM packages.
+* [Sergei Lebedev](https://github.com/superbobry)
+  - Sergei is a software engineer in Criteo. He contributed mostly in JVM packages.

-Become a Comitter
-----------------
-XGBoost is a opensource project and we are actively looking for new comitters who are willing to help maintaining and lead the project.
+Become a Committer
+------------------
+XGBoost is an open source project and we are actively looking for new committers who are willing to help maintain and lead the project.
 Committers comes from contributors who:
 * Made substantial contribution to the project.
 * Willing to spent time on maintaining and lead the project.

-New committers will be proposed by current comitter memembers, with support from more than two of current comitters.
+New committers will be proposed by current committer members, with support from more than two of current committers.

 List of Contributors
 --------------------
@@ -37,14 +43,13 @@ List of Contributors
  - Zygmunt is the master behind the early stopping feature frequently used by kagglers.
 * [Ajinkya Kale](https://github.com/ajkl)
 * [Boliang Chen](https://github.com/cblsjtu)
-* [Vadim Khotilovich](https://github.com/khotilov)
 * [Yangqing Men](https://github.com/yanqingmen)
  - Yangqing is the creator of xgboost java package.
 * [Engpeng Yao](https://github.com/yepyao)
 * [Giulio](https://github.com/giuliohome)
  - Giulio is the creator of windows project of xgboost
 * [Jamie Hall](https://github.com/nerdcha)
-  - Jamie is the initial creator of xgboost sklearn modue.
+  - Jamie is the initial creator of xgboost sklearn module.
 * [Yen-Ying Lee](https://github.com/white1033)
 * [Masaaki Horikoshi](https://github.com/sinhrks)
  - Masaaki is the initial creator of xgboost python plotting module.
@@ -60,3 +65,10 @@ List of Contributors
 * [ganesh-krishnan](https://github.com/ganesh-krishnan)
 * [Damien Carol](https://github.com/damiencarol)
 * [Alex Bain](https://github.com/convexquad)
+* [Baltazar Bieniek](https://github.com/bbieniek)
+* [Adam Pocock](https://github.com/Craigacp)
+* [Rory Mitchell](https://github.com/RAMitchell)
+  - Rory is the author of the GPU plugin and also contributed the cmake build system and windows continuous integration
+* [Gideon Whitehead](https://github.com/gaw89)
+* [Yi-Lin Juang](https://github.com/frankyjuang)
+* [Andrew Hannigan](https://github.com/andrewhannigan)
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -0,0 +1,44 @@
+For bugs or installation issues, please provide the following information.
+The more information you provide, the more easily we will be able to offer
+help and advice.
+
+## Environment info
+Operating System:
+
+Compiler:
+
+Package used (python/R/jvm/C++):
+
+`xgboost` version used:
+
+If installing from source, please provide 
+
+1. The commit hash (`git rev-parse HEAD`)
+2. Logs will be helpful (If logs are large, please upload as attachment).
+
+If you are using jvm package, please
+
+1. add [jvm-packages] in the title so that the issue can be quickly identified
+2. provide the gcc version and distribution
+
+If you are using python package, please provide
+
+1. The python version and distribution
+2. The command to install `xgboost` if you are not installing from source
+
+If you are using R package, please provide
+
+1. The R `sessionInfo()`
+2. The command to install `xgboost` if you are not installing from source
+
+## Steps to reproduce
+
+1.
+2.
+3.
+
+## What have you tried?
+
+1.
+2.
+3.
--- a/151
+++ b/151
@@ -0,0 +1,151 @@
+#!/usr/bin/groovy
+// -*- mode: groovy -*-
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// Wrapper script used to run a command inside a docker container
+dockerRun = 'tests/ci_build/ci_build.sh'
+
+def buildMatrix = [
+    [ "enabled": true,  "os" : "linux", "withGpu": true,  "withOmp": true, "pythonVersion": "2.7" ],
+    [ "enabled": true,  "os" : "linux", "withGpu": false, "withOmp": true, "pythonVersion": "2.7" ],
+    [ "enabled": false, "os" : "osx",   "withGpu": false, "withOmp": false, "pythonVersion": "2.7" ],
+]
+
+pipeline {
+    // Each stage specifies its own agent
+    agent none
+
+    // Setup common job properties
+    options {
+        ansiColor('xterm')
+        timestamps()
+        timeout(time: 120, unit: 'MINUTES')
+        buildDiscarder(logRotator(numToKeepStr: '10'))
+    }
+
+    // Build stages
+    stages {
+        stage('Get sources') {
+            agent any
+            steps {
+                checkoutSrcs()
+                stash name: 'srcs', excludes: '.git/'
+                milestone label: 'Sources ready', ordinal: 1
+            }
+        }
+        stage('Build & Test') {
+            steps {
+                script {
+                    parallel (buildMatrix.findAll{it['enabled']}.collectEntries{ c ->
+                        def buildName = getBuildName(c)
+                        buildFactory(buildName, c)
+                    })
+                }
+            }
+        }
+    }
+}
+
+// initialize source codes
+def checkoutSrcs() {
+  retry(5) {
+    try {
+      timeout(time: 2, unit: 'MINUTES') {
+        checkout scm
+        sh 'git submodule update --init'
+      }
+    } catch (exc) {
+      deleteDir()
+      error "Failed to fetch source codes"
+    }
+  }
+}
+
+/**
+ * Creates cmake and make builds
+ */
+def buildFactory(buildName, conf) {
+    def os = conf["os"]
+    def nodeReq = conf["withGpu"] ? "${os} && gpu" : "${os}"
+    def dockerTarget = conf["withGpu"] ? "gpu" : "cpu"
+    [ ("cmake_${buildName}") : { buildPlatformCmake("cmake_${buildName}", conf, nodeReq, dockerTarget) },
+      ("make_${buildName}") : { buildPlatformMake("make_${buildName}", conf, nodeReq, dockerTarget) }
+    ]
+}
+
+/**
+ * Build platform and test it via cmake.
+ */
+def buildPlatformCmake(buildName, conf, nodeReq, dockerTarget) {
+    def opts = cmakeOptions(conf)
+    // Destination dir for artifacts
+    def distDir = "dist/${buildName}"
+    // Build node - this is returned result
+    node(nodeReq) {
+        unstash name: 'srcs'
+        echo """
+        |===== XGBoost CMake build =====
+        |  dockerTarget: ${dockerTarget}
+        |  cmakeOpts   : ${opts}
+        |=========================
+        """.stripMargin('|')
+        // Invoke command inside docker
+        sh """
+        ${dockerRun} ${dockerTarget} tests/ci_build/build_via_cmake.sh ${opts}
+        ${dockerRun} ${dockerTarget} tests/ci_build/test_${dockerTarget}.sh
+        ${dockerRun} ${dockerTarget} bash -c "cd python-package; python setup.py bdist_wheel"
+        rm -rf "${distDir}"; mkdir -p "${distDir}/py"
+        cp xgboost "${distDir}"
+        cp -r lib "${distDir}"
+        cp -r python-package/dist "${distDir}/py"
+        """
+        archiveArtifacts artifacts: "${distDir}/**/*.*", allowEmptyArchive: true
+    }
+}
+
+/**
+ * Build platform via make
+ */
+def buildPlatformMake(buildName, conf, nodeReq, dockerTarget) {
+    def opts = makeOptions(conf)
+    // Destination dir for artifacts
+    def distDir = "dist/${buildName}"
+    // Build node
+    node(nodeReq) {
+        unstash name: 'srcs'
+        echo """
+        |===== XGBoost Make build =====
+        |  dockerTarget: ${dockerTarget}
+        |  makeOpts    : ${opts}
+        |=========================
+        """.stripMargin('|')
+        // Invoke command inside docker
+        sh """
+        ${dockerRun} ${dockerTarget} tests/ci_build/build_via_make.sh ${opts}
+        """
+    }
+}
+
+def makeOptions(conf) {
+    return ([
+        conf["withGpu"] ? 'PLUGIN_UPDATER_GPU=ON' : 'PLUGIN_UPDATER_GPU=OFF',
+        conf["withOmp"] ? 'USE_OPENMP=1' : 'USE_OPENMP=0']
+        ).join(" ")
+}
+
+
+def cmakeOptions(conf) {
+    return ([
+        conf["withGpu"] ? '-DUSE_CUDA:BOOL=ON' : '', // PLUGIN_UPDATER_GPU is deprecated in CMakeLists.txt in favour of USE_CUDA
+        conf["withOmp"] ? '-DOPEN_MP:BOOL=ON' : '']
+        ).join(" ")
+}
+
+def getBuildName(conf) {
+    def gpuLabel = conf['withGpu'] ? "_gpu" : "_cpu"
+    def ompLabel = conf['withOmp'] ? "_omp" : ""
+    def pyLabel = "_py${conf['pythonVersion']}"
+    return "${conf['os']}${gpuLabel}${ompLabel}${pyLabel}"
+}
+
--- a/146
+++ b/146
@@ -16,6 +16,20 @@ endif

 ROOTDIR = $(CURDIR)

+# workarounds for some buggy old make & msys2 versions seen in windows
+ifeq (NA, $(shell test ! -d "$(ROOTDIR)" && echo NA ))
+        $(warning Attempting to fix non-existing ROOTDIR [$(ROOTDIR)])
+        ROOTDIR := $(shell pwd)
+        $(warning New ROOTDIR [$(ROOTDIR)] $(shell test -d "$(ROOTDIR)" && echo " is OK" ))
+endif
+MAKE_OK := $(shell "$(MAKE)" -v 2> /dev/null)
+ifndef MAKE_OK
+        $(warning Attempting to recover non-functional MAKE [$(MAKE)])
+        MAKE := $(shell which make 2> /dev/null)
+        MAKE_OK := $(shell "$(MAKE)" -v 2> /dev/null)
+endif
+$(warning MAKE [$(MAKE)] - $(if $(MAKE_OK),checked OK,PROBLEM))
+
 ifeq ($(OS), Windows_NT)
 	UNAME="Windows"
 else
@@ -29,31 +43,60 @@ endif
 include $(DMLC_CORE)/make/dmlc.mk

 # include the plugins
+ifdef XGB_PLUGINS
 include $(XGB_PLUGINS)
+endif

-# use customized config file
+# set compiler defaults for OSX versus *nix
+# let people override either
+OS := $(shell uname)
+ifeq ($(OS), Darwin)
 ifndef CC
-export CC  = $(if $(shell which gcc-5),gcc-5,gcc)
+export CC = $(if $(shell which clang), clang, gcc)
 endif
 ifndef CXX
-export CXX = $(if $(shell which g++-5),g++-5,g++)
+export CXX = $(if $(shell which clang++), clang++, g++)
+endif
+else
+# linux defaults
+ifndef CC
+export CC = gcc
+endif
+ifndef CXX
+export CXX = g++
+endif
 endif

 export LDFLAGS= -pthread -lm $(ADD_LDFLAGS) $(DMLC_LDFLAGS) $(PLUGIN_LDFLAGS)
-export CFLAGS=  -std=c++0x -Wall -O3 -msse2  -Wno-unknown-pragmas -funroll-loops -Iinclude $(ADD_CFLAGS) $(PLUGIN_CFLAGS)
-CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include
+export CFLAGS=  -std=c++11 -Wall -Wno-unknown-pragmas -Iinclude $(ADD_CFLAGS) $(PLUGIN_CFLAGS)
+CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include -I$(GTEST_PATH)/include
 #java include path
 export JAVAINCFLAGS = -I${JAVA_HOME}/include -I./java

+ifeq ($(TEST_COVER), 1)
+	CFLAGS += -g -O0 -fprofile-arcs -ftest-coverage
+else
+	CFLAGS += -O3 -funroll-loops
+ifeq ($(USE_SSE), 1)
+	CFLAGS += -msse2
+endif
+endif
+
 ifndef LINT_LANG
 	LINT_LANG= "all"
 endif

-ifneq ($(UNAME), Windows)
-	CFLAGS += -fPIC
-	XGBOOST_DYLIB = lib/libxgboost.so
+ifeq ($(UNAME), Windows)
+	XGBOOST_DYLIB = lib/xgboost.dll
+	JAVAINCFLAGS += -I${JAVA_HOME}/include/win32
 else
-	XGBOOST_DYLIB = lib/libxgboost.dll
+ifeq ($(UNAME), Darwin)
+	XGBOOST_DYLIB = lib/libxgboost.dylib
+	CFLAGS += -fPIC
+else
+	XGBOOST_DYLIB = lib/libxgboost.so
+	CFLAGS += -fPIC
+endif
 endif

 ifeq ($(UNAME), Linux)
@@ -65,24 +108,24 @@ ifeq ($(UNAME), Darwin)
 	JAVAINCFLAGS += -I${JAVA_HOME}/include/darwin
 endif

+OPENMP_FLAGS =
 ifeq ($(USE_OPENMP), 1)
-	CFLAGS += -fopenmp
+	OPENMP_FLAGS = -fopenmp
 else
-	CFLAGS += -DDISABLE_OPENMP
+	OPENMP_FLAGS = -DDISABLE_OPENMP
 endif
-
+CFLAGS += $(OPENMP_FLAGS)

 # specify tensor path
 .PHONY: clean all lint clean_all doxygen rcpplint pypack Rpack Rbuild Rcheck java pylint

-
 all: lib/libxgboost.a $(XGBOOST_DYLIB) xgboost

 $(DMLC_CORE)/libdmlc.a: $(wildcard $(DMLC_CORE)/src/*.cc $(DMLC_CORE)/src/*/*.cc)
-	+ cd $(DMLC_CORE); $(MAKE) libdmlc.a config=$(ROOTDIR)/$(config); cd $(ROOTDIR)
+	+ cd $(DMLC_CORE); "$(MAKE)" libdmlc.a config=$(ROOTDIR)/$(config); cd $(ROOTDIR)

 $(RABIT)/lib/$(LIB_RABIT): $(wildcard $(RABIT)/src/*.cc)
-	+ cd $(RABIT); $(MAKE) lib/$(LIB_RABIT); cd $(ROOTDIR)
+	+ cd $(RABIT); "$(MAKE)" lib/$(LIB_RABIT) USE_SSE=$(USE_SSE); cd $(ROOTDIR)

 jvm: jvm-packages/lib/libxgboost4j.so

@@ -92,20 +135,21 @@ AMALGA_OBJ = amalgamation/xgboost-all0.o
 LIB_DEP = $(DMLC_CORE)/libdmlc.a $(RABIT)/lib/$(LIB_RABIT)
 ALL_DEP = $(filter-out build/cli_main.o, $(ALL_OBJ)) $(LIB_DEP)
 CLI_OBJ = build/cli_main.o
+include tests/cpp/xgboost_test.mk

 build/%.o: src/%.cc
 	@mkdir -p $(@D)
 	$(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
-	$(CXX) -c $(CFLAGS) -c $< -o $@
+	$(CXX) -c $(CFLAGS) $< -o $@

 build_plugin/%.o: plugin/%.cc
 	@mkdir -p $(@D)
 	$(CXX) $(CFLAGS) -MM -MT build_plugin/$*.o $< >build_plugin/$*.d
-	$(CXX) -c $(CFLAGS) -c $< -o $@
+	$(CXX) -c $(CFLAGS) $< -o $@

 # The should be equivalent to $(ALL_OBJ)  except for build/cli_main.o
 amalgamation/xgboost-all0.o: amalgamation/xgboost-all0.cc
-	$(CXX) -c $(CFLAGS) -c $< -o $@
+	$(CXX) -c $(CFLAGS) $< -o $@

 # Equivalent to lib/libxgboost_all.so
 lib/libxgboost_all.so: $(AMALGA_OBJ) $(LIB_DEP)
@@ -116,7 +160,7 @@ lib/libxgboost.a: $(ALL_DEP)
 	@mkdir -p $(@D)
 	ar crv $@ $(filter %.o, $?)

-lib/libxgboost.dll lib/libxgboost.so: $(ALL_DEP)
+lib/xgboost.dll lib/libxgboost.so lib/libxgboost.dylib: $(ALL_DEP)
 	@mkdir -p $(@D)
 	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %a,  $^) $(LDFLAGS)

@@ -124,6 +168,7 @@ jvm-packages/lib/libxgboost4j.so: jvm-packages/xgboost4j/src/native/xgboost4j.cp
 	@mkdir -p $(@D)
 	$(CXX) $(CFLAGS) $(JAVAINCFLAGS) -shared -o $@ $(filter %.cpp %.o %.a, $^) $(LDFLAGS)

+
 xgboost: $(CLI_OBJ) $(ALL_DEP)
 	$(CXX) $(CFLAGS) -o $@  $(filter %.o %.a, $^)  $(LDFLAGS)

@@ -136,12 +181,32 @@ lint: rcpplint
 pylint:
 	flake8 --ignore E501 python-package
 	flake8 --ignore E501 tests/python
+
+test: $(ALL_TEST)
+	$(ALL_TEST)
+
+check: test
+	./tests/cpp/xgboost_test
+
+ifeq ($(TEST_COVER), 1)
+cover: check
+	@- $(foreach COV_OBJ, $(COVER_OBJ), \
+		gcov -pbcul -o $(shell dirname $(COV_OBJ)) $(COV_OBJ) > gcov.log || cat gcov.log; \
+	)
+endif
+
 clean:
-	$(RM) -rf build build_plugin lib bin *~ */*~ */*/*~ */*/*/*~ */*.o */*/*.o */*/*/*.o xgboost
+	$(RM) -rf build build_plugin lib bin *~ */*~ */*/*~ */*/*/*~ */*.o */*/*.o */*/*/*.o #xgboost
+	$(RM) -rf build_tests *.gcov tests/cpp/xgboost_test
+	if [ -d "R-package/src" ]; then \
+		cd R-package/src; \
+		$(RM) -rf rabit src include dmlc-core amalgamation *.so *.dll; \
+		cd $(ROOTDIR); \
+	fi

 clean_all: clean
-	cd $(DMLC_CORE); $(MAKE) clean; cd $(ROODIR)
-	cd $(RABIT); $(MAKE) clean; cd $(ROODIR)
+	cd $(DMLC_CORE); "$(MAKE)" clean; cd $(ROOTDIR)
+	cd $(RABIT); "$(MAKE)" clean; cd $(ROOTDIR)

 doxygen:
 	doxygen doc/Doxyfile
@@ -151,9 +216,31 @@ pypack: ${XGBOOST_DYLIB}
 	cp ${XGBOOST_DYLIB} python-package/xgboost
 	cd python-package; tar cf xgboost.tar xgboost; cd ..

+# create pip source dist (sdist) pack for PyPI
+pippack: clean_all
+	rm -rf xgboost-python
+# remove symlinked directories in python-package/xgboost
+	rm -rf python-package/xgboost/lib
+	rm -rf python-package/xgboost/dmlc-core
+	rm -rf python-package/xgboost/include
+	rm -rf python-package/xgboost/make
+	rm -rf python-package/xgboost/rabit
+	rm -rf python-package/xgboost/src
+	cp -r python-package xgboost-python
+	cp -r Makefile xgboost-python/xgboost/
+	cp -r make xgboost-python/xgboost/
+	cp -r src xgboost-python/xgboost/
+	cp -r tests xgboost-python/xgboost/
+	cp -r include xgboost-python/xgboost/
+	cp -r dmlc-core xgboost-python/xgboost/
+	cp -r rabit xgboost-python/xgboost/
+# Use setup_pip.py instead of setup.py
+	mv xgboost-python/setup_pip.py xgboost-python/setup.py
+# Build sdist tarball
+	cd xgboost-python; python setup.py sdist; mv dist/*.tar.gz ..; cd ..
+
 # Script to make a clean installable R package.
-Rpack:
-	$(MAKE) clean_all
+Rpack: clean_all
 	rm -rf xgboost xgboost*.tar.gz
 	cp -r R-package xgboost
 	rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
@@ -171,16 +258,15 @@ Rpack:
 	cp -r dmlc-core/include xgboost/src/dmlc-core/include
 	cp -r dmlc-core/src xgboost/src/dmlc-core/src
 	cp ./LICENSE xgboost
-	cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' | sed '3s/.*/ENABLE_STD_THREAD=0/' > xgboost/src/Makevars
-	cp xgboost/src/Makevars xgboost/src/Makevars.win
+	cat R-package/src/Makevars.in|sed '2s/.*/PKGROOT=./' | sed '3s/.*/ENABLE_STD_THREAD=0/' > xgboost/src/Makevars.in
+	cp xgboost/src/Makevars.in xgboost/src/Makevars.win
+	sed -i -e 's/@OPENMP_CXXFLAGS@/$$\(SHLIB_OPENMP_CFLAGS\)/g' xgboost/src/Makevars.win

-Rbuild:
-	$(MAKE) Rpack
+Rbuild: Rpack
 	R CMD build --no-build-vignettes xgboost
 	rm -rf xgboost

-Rcheck:
-	$(MAKE) Rbuild
+Rcheck: Rbuild
 	R CMD check  xgboost*.tar.gz

 -include build/*.d
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,6 +3,151 @@ XGBoost Change Log

 This file records the changes in xgboost library in reverse chronological order.

+## v0.71 (2018.04.11)
+* This is a minor release, mainly motivated by issues concerning `pip install`, e.g. #2426, #3189, #3118, and #3194.
+  With this release, users of Linux and MacOS will be able to run `pip install` for the most part.
+* Refactored linear booster class (`gblinear`), so as to support multiple coordinate descent updaters (#3103, #3134). See BREAKING CHANGES below.
+* Fix slow training for multiclass classification with high number of classes (#3109)
+* Fix a corner case in approximate quantile sketch (#3167). Applicable for 'hist' and 'gpu_hist' algorithms
+* Fix memory leak in DMatrix (#3182)
+* New functionality
+  - Better linear booster class (#3103, #3134)
+  - Pairwise SHAP interaction effects (#3043)
+  - Cox loss (#3043)
+  - AUC-PR metric for ranking task (#3172)
+  - Monotonic constraints for 'hist' algorithm (#3085)
+* GPU support
+  - Create an abstract 1D vector class that moves data seamlessly between the main and GPU memory (#2935, #3116, #3068). This eliminates unnecessary PCIe data transfer during training time.
+  - Fix minor bugs (#3051, #3217)
+  - Fix compatibility error for CUDA 9.1 (#3218)
+* Python package:
+  - Correctly handle parameter `verbose_eval=0` (#3115)
+* R package:
+  - Eliminate segmentation fault on 32-bit Windows platform (#2994)
+* JVM packages
+  - Fix a memory bug involving double-freeing Booster objects (#3005, #3011)
+  - Handle empty partition in predict (#3014)
+  - Update docs and unify terminology (#3024)
+  - Delete cache files after job finishes (#3022)
+  - Compatibility fixes for latest Spark versions (#3062, #3093)
+* BREAKING CHANGES: Updated linear modelling algorithms. In particular L1/L2 regularisation penalties are now normalised to number of training examples. This makes the implementation consistent with sklearn/glmnet. L2 regularisation has also been removed from the intercept. To produce linear models with the old regularisation behaviour, the alpha/lambda regularisation parameters can be manually scaled by dividing them by the number of training examples.
+
+## v0.7 (2017.12.30)
+* **This version represents a major change from the last release (v0.6), which was released one and a half years ago.**
+* Updated Sklearn API
+  - Add compatibility layer for scikit-learn v0.18: `sklearn.cross_validation` now deprecated
+  - Updated to allow use of all XGBoost parameters via `**kwargs`.
+  - Updated `nthread` to `n_jobs` and `seed` to `random_state` (as per Sklearn convention); `nthread` and `seed` are now marked as deprecated
+  - Updated to allow choice of Booster (`gbtree`, `gblinear`, or `dart`)
+  - `XGBRegressor` now supports instance weights (specify `sample_weight` parameter)
+  - Pass `n_jobs` parameter to the `DMatrix` constructor
+  - Add `xgb_model` parameter to `fit` method, to allow continuation of training
+* Refactored gbm to allow more friendly cache strategy
+  - Specialized some prediction routine
+* Robust `DMatrix` construction from a sparse matrix
+* Faster construction of `DMatrix` from 2D NumPy matrices: elide copies, use of multiple threads
+* Automatically remove nan from input data when it is sparse.
+  - This can solve some of the user-reported problems of istart != hist.size
+* Fix the single-instance prediction function to obtain correct predictions
+* Minor fixes
+  - Thread local variable is upgraded so it is automatically freed at thread exit.
+  - Fix saving and loading `count::poisson` models
+  - Fix CalcDCG to use base-2 logarithm
+  - Messages are now written to stderr instead of stdout
+  - Keep built-in evaluations while using customized evaluation functions
+  - Use `bst_float` consistently to minimize type conversion
+  - Copy the base margin when slicing `DMatrix`
+  - Evaluation metrics are now saved to the model file
+  - Use `int32_t` explicitly when serializing version
+  - In distributed training, synchronize the number of features after loading a data matrix.
+* Migrate to C++11
+  - The current master version now requires a C++11-enabled compiler (g++ 4.8 or higher)
+* Predictor interface was factored out (in a manner similar to the updater interface).
+* Makefile support for Solaris and ARM
+* Test code coverage using Codecov
+* Add CPP tests
+* Add `Dockerfile` and `Jenkinsfile` to support continuous integration for GPU code
+* New functionality
+  - Ability to adjust tree model's statistics to a new dataset without changing tree structures.
+  - Ability to extract feature contributions from individual predictions, as described in [here](http://blog.datadive.net/interpreting-random-forests/) and [here](https://arxiv.org/abs/1706.06060).
+  - Faster, histogram-based tree algorithm (`tree_method='hist'`) .
+  - GPU/CUDA accelerated tree algorithms (`tree_method='gpu_hist'` or `'gpu_exact'`), including the GPU-based predictor.
+  - Monotonic constraints: when other features are fixed, force the prediction to be monotonic increasing with respect to a certain specified feature.
+  - Faster gradient calculation using AVX SIMD
+  - Ability to export models in JSON format
+  - Support for Tweedie regression
+  - Additional dropout options for DART: binomial+1, epsilon
+  - Ability to update an existing model in-place: this is useful for many applications, such as determining feature importance
+* Python package:
+  - New parameters:
+    - `learning_rates` in `cv()`
+    - `shuffle` in `mknfold()`
+    - `max_features` and `show_values` in `plot_importance()`
+    - `sample_weight` in `XGBRegressor.fit()`
+  - Support binary wheel builds
+  - Fix `MultiIndex` detection to support Pandas 0.21.0 and higher
+  - Support metrics and evaluation sets whose names contain `-`
+  - Support feature maps when plotting trees
+  - Compatibility fix for Python 2.6
+  - Call `print_evaluation` callback at last iteration
+  - Use appropriate integer types when calling native code, to prevent truncation and memory error
+  - Fix shared library loading on Mac OS X 
+* R package:
+  - New parameters:
+    - `silent` in `xgb.DMatrix()`
+    - `use_int_id` in `xgb.model.dt.tree()`
+    - `predcontrib` in `predict()`
+    - `monotone_constraints` in `xgb.train()`
+  - Default value of the `save_period` parameter in `xgboost()` changed to NULL (consistent with `xgb.train()`).
+  - It's possible to custom-build the R package with GPU acceleration support.
+  - Enable JVM build for Mac OS X and Windows
+  - Integration with AppVeyor CI
+  - Improved safety for garbage collection
+  - Store numeric attributes with higher precision
+  - Easier installation for devel version
+  - Improved `xgb.plot.tree()`
+  - Various minor fixes to improve user experience and robustness
+  - Register native code to pass CRAN check
+  - Updated CRAN submission
+* JVM packages
+  - Add Spark pipeline persistence API
+  - Fix data persistence: loss evaluation on test data had wrongly used caches for training data.
+  - Clean external cache after training
+  - Implement early stopping
+  - Enable training of multiple models by distinguishing stage IDs
+  - Better Spark integration: support RDD / dataframe / dataset, integrate with Spark ML package
+  - XGBoost4j now supports ranking task
+  - Support training with missing data
+  - Refactor JVM package to separate regression and classification models to be consistent with other machine learning libraries
+  - Support XGBoost4j compilation on Windows
+  - Parameter tuning tool
+  - Publish source code for XGBoost4j to maven local repo
+  - Scala implementation of the Rabit tracker (drop-in replacement for the Java implementation)
+  - Better exception handling for the Rabit tracker
+  - Persist `num_class`, number of classes (for classification task)
+  - `XGBoostModel` now holds `BoosterParams`
+  - libxgboost4j is now part of CMake build
+  - Release `DMatrix` when no longer needed, to conserve memory
+  - Expose `baseMargin`, to allow initialization of boosting with predictions from an external model
+  - Support instance weights
+  - Use `SparkParallelismTracker` to prevent jobs from hanging forever
+  - Expose train-time evaluation metrics via `XGBoostModel.summary`
+  - Option to specify `host-ip` explicitly in the Rabit tracker 
+* Documentation
+  - Better math notation for gradient boosting
+  - Updated build instructions for Mac OS X
+  - Template for GitHub issues
+  - Add `CITATION` file for citing XGBoost in scientific writing
+  - Fix dropdown menu in xgboost.readthedocs.io
+  - Document `updater_seq` parameter
+  - Style fixes for Python documentation
+  - Links to additional examples and tutorials
+  - Clarify installation requirements
+* Changes that break backward compatibility
+  - [#1519](https://github.com/dmlc/xgboost/pull/1519) XGBoost-spark no longer contains APIs for DMatrix; use the public booster interface instead.
+  - [#2476](https://github.com/dmlc/xgboost/pull/2476) `XGBoostModel.predict()` now has a different signature
+
+
 ## v0.6 (2016.07.29)
 * Version 0.5 is skipped due to major improvements in the core
 * Major refactor of core library.
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,17 +1,27 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 0.6-0
-Date: 2015-08-01
-Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>,
-    Michael Benesty <michael@benesty.fr>, Vadim Khotilovich <khotilovich@gmail.com>,
-    Yuan Tang <terrytangyuan@gmail.com>
-Maintainer: Tong He <hetong007@gmail.com>
+Version: 0.71.1
+Date: 2018-04-11
+Authors@R: c(
+  person("Tianqi", "Chen", role = c("aut"),
+         email = "tianqi.tchen@gmail.com"),
+  person("Tong", "He", role = c("aut", "cre"),
+         email = "hetong007@gmail.com"),
+  person("Michael", "Benesty", role = c("aut"),
+         email = "michael@benesty.fr"),
+  person("Vadim", "Khotilovich", role = c("aut"),
+         email = "khotilovich@gmail.com"),
+  person("Yuan", "Tang", role = c("aut"),
+         email = "terrytangyuan@gmail.com",
+         comment = c(ORCID = "0000-0001-5243-233X"))
+  )
 Description: Extreme Gradient Boosting, which is an efficient implementation
-    of gradient boosting framework. This package is its R interface. The package
-    includes efficient linear model solver and tree learning algorithms. The package
-    can automatically do parallel computation on a single machine which could be
-    more than 10 times faster than existing gradient boosting packages. It supports
+    of the gradient boosting framework from Chen & Guestrin (2016) <doi:10.1145/2939672.2939785>.
+    This package is its R interface. The package includes efficient linear 
+    model solver and tree learning algorithms. The package can automatically 
+    do parallel computation on a single machine which could be more than 10 
+    times faster than existing gradient boosting packages. It supports
    various objective functions, including regression, classification and ranking.
    The package is made to be extensible, so that users are also allowed to define
    their own objectives easily.
@@ -23,17 +33,18 @@ Suggests:
    knitr,
    rmarkdown,
    ggplot2 (>= 1.0.1),
-    DiagrammeR (>= 0.8.1),
+    DiagrammeR (>= 0.9.0),
    Ckmeans.1d.dp (>= 3.3.1),
    vcd (>= 1.3),
    testthat,
    igraph (>= 1.0.1)
 Depends:
-    R (>= 2.10)
+    R (>= 3.3.0)
 Imports:
    Matrix (>= 1.1-0),
    methods,
    data.table (>= 1.9.6),
    magrittr (>= 1.5),
    stringi (>= 0.5.2)
-RoxygenNote: 5.0.1
+RoxygenNote: 6.0.1
+SystemRequirements: GNU make, C++11
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -18,12 +18,14 @@ export("xgb.parameters<-")
 export(cb.cv.predict)
 export(cb.early.stop)
 export(cb.evaluation.log)
+export(cb.gblinear.history)
 export(cb.print.evaluation)
 export(cb.reset.parameters)
 export(cb.save.model)
 export(getinfo)
 export(setinfo)
 export(slice)
+export(xgb.Booster.complete)
 export(xgb.DMatrix)
 export(xgb.DMatrix.save)
 export(xgb.attr)
@@ -31,6 +33,7 @@ export(xgb.attributes)
 export(xgb.create.features)
 export(xgb.cv)
 export(xgb.dump)
+export(xgb.gblinear.history)
 export(xgb.ggplot.deepness)
 export(xgb.ggplot.importance)
 export(xgb.importance)
@@ -39,6 +42,7 @@ export(xgb.model.dt.tree)
 export(xgb.plot.deepness)
 export(xgb.plot.importance)
 export(xgb.plot.multi.trees)
+export(xgb.plot.shap)
 export(xgb.plot.tree)
 export(xgb.save)
 export(xgb.save.raw)
@@ -47,25 +51,36 @@ export(xgboost)
 import(methods)
 importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgeMatrix)
-importFrom(Matrix,cBind)
 importFrom(Matrix,colSums)
 importFrom(Matrix,sparse.model.matrix)
+importFrom(Matrix,sparseMatrix)
 importFrom(Matrix,sparseVector)
+importFrom(Matrix,t)
 importFrom(data.table,":=")
 importFrom(data.table,as.data.table)
 importFrom(data.table,data.table)
+importFrom(data.table,is.data.table)
 importFrom(data.table,rbindlist)
 importFrom(data.table,setkey)
 importFrom(data.table,setkeyv)
 importFrom(data.table,setnames)
+importFrom(grDevices,rgb)
+importFrom(graphics,barplot)
+importFrom(graphics,grid)
+importFrom(graphics,lines)
+importFrom(graphics,par)
+importFrom(graphics,points)
+importFrom(graphics,title)
 importFrom(magrittr,"%>%")
+importFrom(stats,median)
 importFrom(stats,predict)
 importFrom(stringi,stri_detect_regex)
 importFrom(stringi,stri_match_first_regex)
 importFrom(stringi,stri_replace_all_regex)
 importFrom(stringi,stri_replace_first_regex)
 importFrom(stringi,stri_split_regex)
+importFrom(utils,head)
 importFrom(utils,object.size)
 importFrom(utils,str)
 importFrom(utils,tail)
-useDynLib(xgboost)
+useDynLib(xgboost, .registration = TRUE)
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@@ -41,6 +41,7 @@ NULL
 #' Callback closure for printing the result of evaluation
 #' 
 #' @param period  results would be printed every number of periods
+#' @param showsd  whether standard deviations should be printed (when available)
 #' 
 #' @details
 #' The callback function prints the result of evaluation at every \code{period} iterations.
@@ -56,7 +57,7 @@ NULL
 #' \code{\link{callbacks}}
 #' 
 #' @export
-cb.print.evaluation <- function(period=1) {
+cb.print.evaluation <- function(period = 1, showsd = TRUE) {
  
  callback <- function(env = parent.frame()) {
    if (length(env$bst_evaluation) == 0 ||
@@ -68,7 +69,8 @@ cb.print.evaluation <- function(period=1) {
    if ((i-1) %% period == 0 ||
        i == env$begin_iteration ||
        i == env$end_iteration) {
-      msg <- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err)
+      stdev <- if (showsd) env$bst_evaluation_err else NULL
+      msg <- format.eval.string(i, env$bst_evaluation, stdev)
      cat(msg, '\n')
    }
  }
@@ -125,12 +127,12 @@ cb.evaluation.log <- function() {
      # rearrange col order from _mean,_mean,...,_std,_std,...
      # to be _mean,_std,_mean,_std,...
      len <- length(mnames)
-      means <- mnames[1:(len/2)]
+      means <- mnames[seq_len(len/2)]
      stds <- mnames[(len/2 + 1):len]
      cnames <- numeric(len)
      cnames[c(TRUE, FALSE)] <- means
      cnames[c(FALSE, TRUE)] <- stds
-      env$evaluation_log <- env$evaluation_log[, c('iter', cnames), with=FALSE]
+      env$evaluation_log <- env$evaluation_log[, c('iter', cnames), with = FALSE]
    }
  }
  
@@ -229,7 +231,7 @@ cb.reset.parameters <- function(new_params) {
      xgb.parameters(env$bst$handle) <- pars
    } else {
      for (fd in env$bst_folds)
-        xgb.parameters(fd$bst$handle) <- pars
+        xgb.parameters(fd$bst) <- pars
    }
  }
  attr(callback, 'is_pre_iteration') <- TRUE
@@ -288,8 +290,8 @@ cb.reset.parameters <- function(new_params) {
 #' \code{\link{xgb.attr}}
 #' 
 #' @export
-cb.early.stop <- function(stopping_rounds, maximize=FALSE, 
-                          metric_name=NULL, verbose=TRUE) {
+cb.early.stop <- function(stopping_rounds, maximize = FALSE, 
+                          metric_name = NULL, verbose = TRUE) {
  # state variables
  best_iteration <- -1
  best_ntreelimit <- -1
@@ -306,7 +308,7 @@ cb.early.stop <- function(stopping_rounds, maximize=FALSE,
      metric_idx <<- which(gsub('-', '_', metric_name) == eval_names)
      if (length(metric_idx) == 0)
        stop("'metric_name' for early stopping is not one of the following:\n",
-             paste(eval_names, collapse=' '), '\n')
+             paste(eval_names, collapse = ' '), '\n')
    }
    if (is.null(metric_name) &&
        length(env$bst_evaluation) > 1) {
@@ -318,9 +320,9 @@ cb.early.stop <- function(stopping_rounds, maximize=FALSE,
    
    metric_name <<- eval_names[metric_idx]
    
-    # maximixe is usually NULL when not set in xgb.train and built-in metrics
+    # maximize is usually NULL when not set in xgb.train and built-in metrics
    if (is.null(maximize))
-      maximize <<- ifelse(grepl('(_auc|_map|_ndcg)', metric_name), TRUE, FALSE)
+      maximize <<- grepl('(_auc|_map|_ndcg)', metric_name)

    if (verbose && NVL(env$rank, 0) == 0)
      cat("Will train until ", metric_name, " hasn't improved in ", 
@@ -332,7 +334,7 @@ cb.early.stop <- function(stopping_rounds, maximize=FALSE,
    env$stop_condition <- FALSE
    
    if (!is.null(env$bst)) {
-      if (class(env$bst) != 'xgb.Booster')
+      if (!inherits(env$bst, 'xgb.Booster'))
        stop("'bst' in the parent frame must be an 'xgb.Booster'")
      if (!is.null(best_score <- xgb.attr(env$bst$handle, 'best_score'))) {
        best_score <<- as.numeric(best_score)
@@ -458,6 +460,7 @@ cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
 #' \code{basket},
 #' \code{data},
 #' \code{end_iteration},
+#' \code{params},
 #' \code{num_parallel_tree},
 #' \code{num_class}.
 #' 
@@ -491,6 +494,9 @@ cb.cv.predict <- function(save_models = FALSE) {

    ntreelimit <- NVL(env$basket$best_ntreelimit, 
                      env$end_iteration * env$num_parallel_tree)
+    if (NVL(env$params[['booster']], '') == 'gblinear') {
+      ntreelimit <- 0 # must be 0 for gblinear
+    }
    for (fd in env$bst_folds) {
      pr <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit, reshape = TRUE)
      if (is.matrix(pred)) {
@@ -503,7 +509,7 @@ cb.cv.predict <- function(save_models = FALSE) {
    if (save_models) {
      env$basket$models <- lapply(env$bst_folds, function(fd) {
        xgb.attr(fd$bst, 'niter') <- env$end_iteration - 1
-        xgb.Booster.check(xgb.handleToBooster(fd$bst), saveraw = TRUE)
+        xgb.Booster.complete(xgb.handleToBooster(fd$bst), saveraw = TRUE)
      })
    }
  }
@@ -518,12 +524,234 @@ cb.cv.predict <- function(save_models = FALSE) {
 }


+#' Callback closure for collecting the model coefficients history of a gblinear booster
+#' during its training.
+#'
+#' @param sparse when set to FALSE/TRUE, a dense/sparse matrix is used to store the result.
+#'       Sparse format is useful when one expects only a subset of coefficients to be non-zero,
+#'       when using the "thrifty" feature selector with fairly small number of top features
+#'       selected per iteration.
+#'
+#' @details
+#' To keep things fast and simple, gblinear booster does not internally store the history of linear
+#' model coefficients at each boosting iteration. This callback provides a workaround for storing
+#' the coefficients' path, by extracting them after each training iteration.
+#'
+#' Callback function expects the following values to be set in its calling frame:
+#' \code{bst} (or \code{bst_folds}).
+#'
+#' @return
+#' Results are stored in the \code{coefs} element of the closure.
+#' The \code{\link{xgb.gblinear.history}} convenience function provides an easy way to access it.
+#' With \code{xgb.train}, it is either a dense or a sparse matrix.
+#' While with \code{xgb.cv}, it is a list (an element per each fold) of such matrices.
+#'
+#' @seealso
+#' \code{\link{callbacks}}, \code{\link{xgb.gblinear.history}}.
+#'
+#' @examples
+#' #### Binary classification:
+#' #
+#' # In the iris dataset, it is hard to linearly separate Versicolor class from the rest
+#' # without considering the 2nd order interactions:
+#' require(magrittr)
+#' x <- model.matrix(Species ~ .^2, iris)[,-1]
+#' colnames(x)
+#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"))
+#' param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc",
+#'               lambda = 0.0003, alpha = 0.0003, nthread = 2)
+#' # For 'shotgun', which is a default linear updater, using high eta values may result in
+#' # unstable behaviour in some datasets. With this simple dataset, however, the high learning
+#' # rate does not break the convergence, but allows us to illustrate the typical pattern of
+#' # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations.
+#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1.,
+#'                  callbacks = list(cb.gblinear.history()))
+#' # Extract the coefficients' path and plot them vs boosting iteration number:
+#' coef_path <- xgb.gblinear.history(bst)
+#' matplot(coef_path, type = 'l')
+#' 
+#' # With the deterministic coordinate descent updater, it is safer to use higher learning rates.
+#' # Will try the classical componentwise boosting which selects a single best feature per round:
+#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8,
+#'                  updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1,
+#'                  callbacks = list(cb.gblinear.history()))
+#' xgb.gblinear.history(bst) %>% matplot(type = 'l')
+#' #  Componentwise boosting is known to have similar effect to Lasso regularization.
+#' # Try experimenting with various values of top_k, eta, nrounds,
+#' # as well as different feature_selectors.
+#'
+#' # For xgb.cv:
+#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8,
+#'              callbacks = list(cb.gblinear.history()))
+#' # coefficients in the CV fold #3
+#' xgb.gblinear.history(bst)[[3]] %>% matplot(type = 'l')
+#'
+#' 
+#' #### Multiclass classification:
+#' #
+#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1)
+#' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
+#'               lambda = 0.0003, alpha = 0.0003, nthread = 2)
+#' # For the default linear updater 'shotgun' it sometimes is helpful
+#' # to use smaller eta to reduce instability
+#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 70, eta = 0.5,
+#'                  callbacks = list(cb.gblinear.history()))
+#' # Will plot the coefficient paths separately for each class:
+#' xgb.gblinear.history(bst, class_index = 0) %>% matplot(type = 'l')
+#' xgb.gblinear.history(bst, class_index = 1) %>% matplot(type = 'l')
+#' xgb.gblinear.history(bst, class_index = 2) %>% matplot(type = 'l')
+#'
+#' # CV:
+#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5,
+#'               callbacks = list(cb.gblinear.history(FALSE)))
+#' # 1st fold of 1st class
+#' xgb.gblinear.history(bst, class_index = 0)[[1]] %>% matplot(type = 'l')
+#'
+#' @export
+cb.gblinear.history <- function(sparse=FALSE) {
+  # Closure state: list of per-iteration coefficients while training runs;
+  # replaced by a matrix (xgb.train) or a list of matrices (xgb.cv) once the
+  # callback is invoked with finalize = TRUE.
+  coefs <- NULL
+
+  # Validate the calling frame on first use: it must provide either
+  # 'bst' (xgb.train) or 'bst_folds' (xgb.cv).
+  init <- function(env) {
+    if (is.null(env$bst) && is.null(env$bst_folds))
+      stop("Parent frame has neither 'bst' nor 'bst_folds'")
+  }
+
+  # convert from list to (sparse) matrix
+  list2mat <- function(coef_list) {
+    if (sparse) {
+      # Assemble a column-compressed matrix with one column per iteration
+      # from the sparse vectors' slots, then transpose so rows are iterations.
+      coef_mat <- sparseMatrix(x = unlist(lapply(coef_list, slot, "x")),
+                               i = unlist(lapply(coef_list, slot, "i")),
+                               p = c(0, cumsum(sapply(coef_list, function(x) length(x@x)))),
+                               dims = c(length(coef_list[[1]]), length(coef_list)))
+      return(t(coef_mat))
+    } else {
+      return(do.call(rbind, coef_list))
+    }
+  }
+
+  # Turn the accumulated list(s) of coefficient vectors into matrices.
+  finalizer <- function(env) {
+    if (length(coefs) == 0)
+      return()
+    if (!is.null(env$bst)) { # xgb.train:
+      coefs <<- list2mat(coefs)
+    } else { # xgb.cv:
+      # first lapply transposes the list
+      coefs <<- lapply(seq_along(coefs[[1]]), function(i) lapply(coefs, "[[", i)) %>%
+                lapply(function(x) list2mat(x))
+    }
+  }
+
+  # Read the current coefficients from the text dump of the model(s):
+  # lines matching 'booster', 'bias' or 'weigh' are dropped and the remaining
+  # lines are parsed as numbers.
+  extract.coef <- function(env) {
+    if (!is.null(env$bst)) { # xgb.train:
+      cf <- as.numeric(grep('(booster|bias|weigh)', xgb.dump(env$bst), invert = TRUE, value = TRUE))
+      if (sparse) cf <- as(cf, "sparseVector")
+    } else { # xgb.cv:
+      cf <- vector("list", length(env$bst_folds))
+      for (i in seq_along(env$bst_folds)) {
+        dmp <- xgb.dump(xgb.handleToBooster(env$bst_folds[[i]]$bst))
+        cf[[i]] <- as.numeric(grep('(booster|bias|weigh)', dmp, invert = TRUE, value = TRUE))
+        if (sparse) cf[[i]] <- as(cf[[i]], "sparseVector")
+      }
+    }
+    cf
+  }
+
+  # The callback proper: append this iteration's coefficients, or convert the
+  # accumulated history when called with finalize = TRUE.
+  callback <- function(env = parent.frame(), finalize = FALSE) {
+    if (is.null(coefs)) init(env)
+    if (finalize) return(finalizer(env))
+    cf <- extract.coef(env)
+    coefs <<- c(coefs, list(cf))
+  }
+
+  attr(callback, 'call') <- match.call()
+  attr(callback, 'name') <- 'cb.gblinear.history'
+  callback
+}
+
+#' Extract gblinear coefficients history.
+#'
+#' A helper function to extract the matrix of linear coefficients' history
+#' from a gblinear model created while using the \code{cb.gblinear.history()}
+#' callback.
+#'
+#' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
+#'        using the \code{cb.gblinear.history()} callback.
+#' @param class_index zero-based class index to extract the coefficients for only that
+#'        specific class in a multinomial multiclass model. When it is NULL, all the
+#'        coefficients are returned. Has no effect in non-multiclass models.
+#'
+#' @return 
+#' For an \code{xgb.train} result, a matrix (either dense or sparse) with the columns
+#' corresponding to iteration's coefficients (in the order as \code{xgb.dump()} would
+#' return) and the rows corresponding to boosting iterations.
+#'
+#' For an \code{xgb.cv} result, a list of such matrices is returned with the elements
+#' corresponding to CV folds.
+#'
+#' @examples
+#' \dontrun{
+#' See \code{\link{cb.gblinear.history}}
+#' }
+#'
+#' @export
+xgb.gblinear.history <- function(model, class_index = NULL) {
+
+  if (!(inherits(model, "xgb.Booster") ||
+        inherits(model, "xgb.cv.synchronous")))
+    stop("model must be an object of either xgb.Booster or xgb.cv.synchronous class")
+  is_cv <- inherits(model, "xgb.cv.synchronous")
+
+  if (is.null(model[["callbacks"]]) || is.null(model$callbacks[["cb.gblinear.history"]]))
+    stop("model must be trained while using the cb.gblinear.history() callback")
+
+  if (!is_cv) {
+    # extract num_class & num_feat from the internal model
+    dmp <- xgb.dump(model)
+    if (length(dmp) < 2 || dmp[2] != "bias:")
+      stop("It does not appear to be a gblinear model")
+    dmp <- dmp[-c(1,2)]
+    n <- which(dmp == 'weight:')
+    if (length(n) != 1)
+      stop("It does not appear to be a gblinear model")
+    num_class <- n - 1
+    num_feat <- (length(dmp) - 4) / num_class
+  } else {
+    # in case of CV, the object is expected to have this info
+    # NVL guards against 'booster' being absent from params: comparing NULL
+    # would give a zero-length condition and a cryptic error from `if`.
+    if (NVL(model$params$booster, "") != "gblinear")
+      stop("It does not appear to be a gblinear model")
+    num_class <- NVL(model$params$num_class, 1)
+    num_feat <- model$nfeatures
+    if (is.null(num_feat))
+      stop("This xgb.cv result does not have nfeatures info")
+  }
+
+  if (!is.null(class_index) &&
+      num_class > 1 &&
+      (class_index[1] < 0 || class_index[1] >= num_class))
+    stop("class_index has to be within [0,", num_class - 1, "]")
+
+  # The history is kept in the 'coefs' variable of the callback's closure.
+  coef_path <- environment(model$callbacks$cb.gblinear.history)[["coefs"]]
+  if (!is.null(class_index) && num_class > 1) {
+    # Columns are selected with a stride of num_class, starting at the
+    # requested class.
+    idx <- seq(1 + class_index, by = num_class, length.out = num_feat)
+    coef_path <- if (is.list(coef_path)) {
+      lapply(coef_path, function(x) x[, idx])
+    } else {
+      coef_path[, idx]
+    }
+  }
+  coef_path
+}
+
+
 #
 # Internal utility functions for callbacks ------------------------------------
 # 

 # Format the evaluation metric string
-format.eval.string <- function(iter, eval_res, eval_err=NULL) {
+format.eval.string <- function(iter, eval_res, eval_err = NULL) {
  if (length(eval_res) == 0)
    stop('no evaluation results')
  enames <- names(eval_res)
@@ -533,9 +761,9 @@ format.eval.string <- function(iter, eval_res, eval_err=NULL) {
  if (!is.null(eval_err)) {
    if (length(eval_res) != length(eval_err))
      stop('eval_res & eval_err lengths mismatch')
-    res <- paste0(sprintf("%s:%f+%f", enames, eval_res, eval_err), collapse='\t')
+    res <- paste0(sprintf("%s:%f+%f", enames, eval_res, eval_err), collapse = '\t')
  } else {
-    res <- paste0(sprintf("%s:%f", enames, eval_res), collapse='\t')
+    res <- paste0(sprintf("%s:%f", enames, eval_res), collapse = '\t')
  }
  return(paste0(iter, res))
 }
@@ -591,7 +819,7 @@ has.callbacks <- function(cb_list, query_names) {
    return(FALSE)
  if (!is.list(cb_list) ||
      any(sapply(cb_list, class) != 'function')) {
-    stop('`cb_list`` must be a list of callback functions')
+    stop('`cb_list` must be a list of callback functions')
  }
  cb_names <- callback.names(cb_list)
  if (!is.character(cb_names) ||
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -17,7 +17,7 @@ NVL <- function(x, val) {
  }
  if (typeof(x) == 'closure')
    return(x)
-  stop('x of unsupported for NVL type')
+  stop("typeof(x) == ", typeof(x), " is not supported by NVL")
 }


@@ -42,15 +42,15 @@ check.booster.params <- function(params, ...) {
    stop("Same parameters in 'params' and in the call are not allowed. Please check your 'params' list.")
  params <- c(params, dot_params)
  
-  # providing a parameter multiple times only makes sense for 'eval_metric'
+  # providing a parameter multiple times makes sense only for 'eval_metric'
  name_freqs <- table(names(params))
  multi_names <- setdiff(names(name_freqs[name_freqs > 1]), 'eval_metric')
  if (length(multi_names) > 0) {
    warning("The following parameters were provided multiple times:\n\t",
-            paste(multi_names, collapse=', '), "\n  Only the last value for each of them will be used.\n")
-    # While xgboost itself would choose the last value for a multi-parameter, 
-    # will do some clean-up here b/c multi-parameters could be used further in R code, and R would 
-    # pick the 1st (not the last) value when multiple elements with the same name are present in a list.
+            paste(multi_names, collapse = ', '), "\n  Only the last value for each of them will be used.\n")
+    # While xgboost internals would choose the last value for a multiple-times parameter, 
+    # enforce it here in R as well (b/c multi-parameters might be used further in R code, 
+    # and R takes the 1st value when multiple elements with the same name are present in a list).
    for (n in multi_names) {
      del_idx <- which(n == names(params))
      del_idx <- del_idx[-length(del_idx)]
@@ -60,9 +60,18 @@ check.booster.params <- function(params, ...) {
  
  # for multiclass, expect num_class to be set
  if (typeof(params[['objective']]) == "character" &&
-    substr(NVL(params[['objective']], 'x'), 1, 6) == 'multi:') {
-    if (as.numeric(NVL(params[['num_class']], 0)) < 2)
-      stop("'num_class' > 1 parameter must be set for multiclass classification")
+      substr(NVL(params[['objective']], 'x'), 1, 6) == 'multi:' &&
+      as.numeric(NVL(params[['num_class']], 0)) < 2) {
+        stop("'num_class' > 1 parameter must be set for multiclass classification")
+  }
+  
+  # monotone_constraints parser
+  
+  if (!is.null(params[['monotone_constraints']]) &&
+      typeof(params[['monotone_constraints']]) != "character") {
+        vec2str = paste(params[['monotone_constraints']], collapse = ',')
+        vec2str = paste0('(', vec2str, ')')
+        params[['monotone_constraints']] = vec2str
  }
  
  return(params)
@@ -82,9 +91,7 @@ check.custom.obj <- function(env = parent.frame()) {
  if (!is.null(env$params[['objective']]) &&
      typeof(env$params$objective) == 'closure') {
    env$obj <- env$params$objective
-    p <- env$params
-    p$objective <- NULL
-    env$params <- p
+    env$params$objective <- NULL
  }
 }

@@ -97,36 +104,37 @@ check.custom.eval <- function(env = parent.frame()) {
  if (!is.null(env$feval) && typeof(env$feval) != 'closure')
    stop("'feval' must be a function")
  
-  if (!is.null(env$feval) && is.null(env$maximize))
-    stop("Please set 'maximize' to indicate whether the metric needs to be maximized or not")
-  
  # handle a situation when custom eval function was provided through params
  if (!is.null(env$params[['eval_metric']]) &&
      typeof(env$params$eval_metric) == 'closure') {
    env$feval <- env$params$eval_metric
-    p <- env$params
-    p[ which(names(p) == 'eval_metric') ] <- NULL
-    env$params <- p
+    env$params$eval_metric <- NULL
  }
+  
+  # require maximize to be set when custom feval and early stopping are used together
+  if (!is.null(env$feval) &&
+      is.null(env$maximize) && (
+        !is.null(env$early_stopping_rounds) || 
+        has.callbacks(env$callbacks, 'cb.early.stop')))
+    stop("Please set 'maximize' to indicate whether the evaluation metric needs to be maximized or not")
 }


-# Update booster with dtrain for an iteration
-xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
-  if (class(booster) != "xgb.Booster.handle") {
-    stop("first argument type must be xgb.Booster.handle")
+# Update a booster handle for an iteration with dtrain data
+xgb.iter.update <- function(booster_handle, dtrain, iter, obj = NULL) {
+  if (!identical(class(booster_handle), "xgb.Booster.handle")) {
+    stop("booster_handle must be of xgb.Booster.handle class")
  }
-  if (class(dtrain) != "xgb.DMatrix") {
-    stop("second argument type must be xgb.DMatrix")
+  if (!inherits(dtrain, "xgb.DMatrix")) {
+    stop("dtrain must be of xgb.DMatrix class")
  }

  if (is.null(obj)) {
-    .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
-          PACKAGE = "xgboost")
+    .Call(XGBoosterUpdateOneIter_R, booster_handle, as.integer(iter), dtrain)
  } else {
-    pred <- predict(booster, dtrain)
+    pred <- predict(booster_handle, dtrain)
    gpair <- obj(pred, dtrain)
-    .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE = "xgboost")
+    .Call(XGBoosterBoostOneIter_R, booster_handle, dtrain, gpair$grad, gpair$hess)
  }
  return(TRUE)
 }
@@ -135,24 +143,23 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
 # Evaluate one iteration.
 # Returns a named vector of evaluation metrics 
 # with the names in a 'datasetname-metricname' format.
-xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
-  if (class(booster) != "xgb.Booster.handle")
-    stop("first argument type must be xgb.Booster.handle")
-  
+xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) {
+  if (!identical(class(booster_handle), "xgb.Booster.handle"))
+    stop("class of booster_handle must be xgb.Booster.handle")
+
  if (length(watchlist) == 0) 
    return(NULL)
  
  evnames <- names(watchlist)
  if (is.null(feval)) {
-    msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist,
-                 as.list(evnames), PACKAGE = "xgboost")
+    msg <- .Call(XGBoosterEvalOneIter_R, booster_handle, as.integer(iter), watchlist, as.list(evnames))
    msg <- stri_split_regex(msg, '(\\s+|:|\\s+)')[[1]][-1]
    res <- as.numeric(msg[c(FALSE,TRUE)]) # even indices are the values
    names(res) <- msg[c(TRUE,FALSE)]      # odds are the names
  } else {
    res <- sapply(seq_along(watchlist), function(j) {
      w <- watchlist[[j]]
-      preds <- predict(booster, w) # predict using all trees
+      preds <- predict(booster_handle, w) # predict using all trees
      eval_res <- feval(preds, w)
      out <- eval_res$value
      names(out) <- paste0(evnames[j], "-", eval_res$metric)
@@ -171,14 +178,14 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
 generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
  
  # cannot do it for rank
-  if (exists('objective', where=params) &&
+  if (exists('objective', where = params) &&
      is.character(params$objective) &&
      strtrim(params$objective, 5) == 'rank:') {
    stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n",
         "\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n")
  }
  # shuffle
-  rnd_idx <- sample(1:nrows)
+  rnd_idx <- sample.int(nrows)
  if (stratified &&
      length(label) == length(rnd_idx)) {
    y <- label[rnd_idx]
@@ -186,7 +193,7 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
    #  - For classification, need to convert y labels to factor before making the folds,
    #    and then do stratification by factor levels.
    #  - For regression, leave y numeric and do stratification by quantiles.
-    if (exists('objective', where=params) &&
+    if (exists('objective', where = params) &&
        is.character(params$objective)) {
      # If 'objective' provided in params, assume that y is a classification label
      # unless objective is reg:linear
@@ -204,9 +211,9 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
    # make simple non-stratified folds
    kstep <- length(rnd_idx) %/% nfold
    folds <- list()
-    for (i in 1:(nfold - 1)) {
-      folds[[i]] <- rnd_idx[1:kstep]
-      rnd_idx <- rnd_idx[-(1:kstep)]
+    for (i in seq_len(nfold - 1)) {
+      folds[[i]] <- rnd_idx[seq_len(kstep)]
+      rnd_idx <- rnd_idx[-seq_len(kstep)]
    }
    folds[[nfold]] <- rnd_idx
  }
@@ -247,15 +254,15 @@ xgb.createFolds <- function(y, k = 10)
    ## For each class, balance the fold allocation as far
    ## as possible, then resample the remainder.
    ## The final assignment of folds is also randomized.
-    for (i in 1:length(numInClass)) {
+    for (i in seq_along(numInClass)) {
      ## create a vector of integers from 1:k as many times as possible without
      ## going over the number of samples in the class. Note that if the number
      ## of samples in a class is less than k, nothing is producd here.
-      seqVector <- rep(1:k, numInClass[i] %/% k)
+      seqVector <- rep(seq_len(k), numInClass[i] %/% k)
      ## add enough random integers to get  length(seqVector) == numInClass[i]
-      if (numInClass[i] %% k > 0) seqVector <- c(seqVector, sample(1:k, numInClass[i] %% k))
+      if (numInClass[i] %% k > 0) seqVector <- c(seqVector, sample.int(k, numInClass[i] %% k))
      ## shuffle the integers for fold assignment and assign to this classes's data
-      foldVector[which(y == dimnames(numInClass)$y[i])] <- sample(seqVector)
+      foldVector[y == dimnames(numInClass)$y[i]] <- sample(seqVector)
    }
  } else {
    foldVector <- seq(along = y)
@@ -295,8 +302,9 @@ depr_par_lut <- matrix(c(
  'features.keep', 'features_keep',
  'plot.height','plot_height',
  'plot.width','plot_width',
+  'n_first_tree', 'trees',
  'dummy', 'DUMMY'
-), ncol=2, byrow = TRUE)
+), ncol = 2, byrow = TRUE)
 colnames(depr_par_lut) <- c('old', 'new')

 # Checks the dot-parameters for deprecated names
@@ -321,7 +329,7 @@ check.deprecation <- function(..., env = parent.frame()) {
    if (!ex_match[i]) {
      warning("'", pars_par, "' was partially matched to '", old_par,"'")
    }
-    .Deprecated(new_par, old=old_par, package = 'xgboost')
+    .Deprecated(new_par, old = old_par, package = 'xgboost')
    if (new_par != 'NULL') {
      eval(parse(text = paste(new_par, '<-', pars[[pars_par]])), envir = env)
    }
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -1,20 +1,20 @@
-# Construct a Booster from cachelist
+# Construct an internal xgboost Booster and return a handle to it.
 # internal utility function
-xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
+xgb.Booster.handle <- function(params = list(), cachelist = list(), modelfile = NULL) {
  if (typeof(cachelist) != "list" ||
-      any(sapply(cachelist, class) != 'xgb.DMatrix')) {
-    stop("xgb.Booster only accepts list of DMatrix as cachelist")
+      !all(vapply(cachelist, inherits, logical(1), what = 'xgb.DMatrix'))) {
+    stop("cachelist must be a list of xgb.DMatrix objects")
  }

-  handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost")
+  handle <- .Call(XGBoosterCreate_R, cachelist)
  if (!is.null(modelfile)) {
    if (typeof(modelfile) == "character") {
-      .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost")
+      .Call(XGBoosterLoadModel_R, handle, modelfile[1])
    } else if (typeof(modelfile) == "raw") {
-      .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost")
-    } else if (class(modelfile) == "xgb.Booster") {
-      modelfile <- xgb.Booster.check(modelfile, saveraw=TRUE)
-      .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile$raw, PACKAGE = "xgboost")
+      .Call(XGBoosterLoadModelFromRaw_R, handle, modelfile)
+    } else if (inherits(modelfile, "xgb.Booster")) {
+      bst <- xgb.Booster.complete(modelfile, saveraw = TRUE)
+      .Call(XGBoosterLoadModelFromRaw_R, handle, bst$raw)
    } else {
      stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object")
    }
@@ -34,6 +34,17 @@ xgb.handleToBooster <- function(handle, raw = NULL) {
  return(bst)
 }

+# Check whether xgb.Booster.handle is null; a NULL object counts as a null handle.
+# internal utility function
+is.null.handle <- function(handle) {
+  # NULL must be tested first: class(NULL) is "NULL", so the class check would stop()
+  if (is.null(handle)) return(TRUE)
+  if (!identical(class(handle), "xgb.Booster.handle"))
+    stop("argument type must be xgb.Booster.handle")
+  if (.Call(XGCheckNullPtr_R, handle)) return(TRUE)
+  return(FALSE)
+}
+
 # Return a verified to be valid handle out of either xgb.Booster.handle or xgb.Booster
 # internal utility function
 xgb.get.handle <- function(object) {
@@ -42,94 +53,171 @@ xgb.get.handle <- function(object) {
    xgb.Booster.handle = object,
    stop("argument must be of either xgb.Booster or xgb.Booster.handle class")
  )
-  if (is.null(handle) || .Call("XGCheckNullPtr_R", handle, PACKAGE="xgboost")) {
+  if (is.null.handle(handle)) {
    stop("invalid xgb.Booster.handle")
  }
  handle
 }

-# Check whether an xgb.Booster object is complete
-# internal utility function
-xgb.Booster.check <- function(bst, saveraw = TRUE) {
-  if (class(bst) != "xgb.Booster")
+#' Restore missing parts of an incomplete xgb.Booster object.
+#'
+#' It attempts to complete an \code{xgb.Booster} object by restoring either its missing
+#' raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid)
+#' or its missing internal handle (when its \code{xgb.Booster.handle} is not valid
+#' but it has a raw Booster memory dump).
+#'
+#' @param object object of class \code{xgb.Booster}
+#' @param saveraw a flag indicating whether to append \code{raw} Booster memory dump data
+#'                when it doesn't already exist.
+#'
+#' @details
+#'
+#' While this method is primarily for internal use, it might be useful in some practical situations.
+#'
+#' E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object,
+#' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
+#' should still work for such a model object since those methods would be using
+#' \code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the
+#' \code{xgb.Booster.complete} function explicitly once after loading a model as an R-object.
+#' That would prevent further repeated implicit reconstruction of an internal booster model.
+#'
+#' @return
+#' An object of \code{xgb.Booster} class.
+#'
+#' @examples
+#'
+#' data(agaricus.train, package='xgboost')
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
+#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+#' saveRDS(bst, "xgb.model.rds")
+#'
+#' bst1 <- readRDS("xgb.model.rds")
+#' # the handle is invalid:
+#' print(bst1$handle)
+#'
+#' bst1 <- xgb.Booster.complete(bst1)
+#' # now the handle points to a valid internal booster model:
+#' print(bst1$handle)
+#'
+#' @export
+xgb.Booster.complete <- function(object, saveraw = TRUE) {
+  if (!inherits(object, "xgb.Booster"))
    stop("argument type must be xgb.Booster")
-  
-  isnull <- is.null(bst$handle)
-  if (!isnull) {
-    isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost")
-  }
-  if (isnull) {
-    bst$handle <- xgb.Booster(modelfile = bst$raw)
+
+  if (is.null.handle(object$handle)) {
+    object$handle <- xgb.Booster.handle(modelfile = object$raw)
  } else {
-    if (is.null(bst$raw) && saveraw)
-      bst$raw <- xgb.save.raw(bst$handle)
+    if (is.null(object$raw) && saveraw)
+      object$raw <- xgb.save.raw(object$handle)
  }
-  return(bst)
+  return(object)
 }

-
 #' Predict method for eXtreme Gradient Boosting model
-#' 
+#'
 #' Predicted values based on either xgboost model or model handle object.
-#' 
+#'
 #' @param object Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}
 #' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.
 #' @param missing Missing is only used when input is dense matrix. Pick a float value that represents
 #'        missing values in data (e.g., sometimes 0 or some other extreme value is used).
-#' @param outputmargin whether the prediction should be returned in the for of original untransformed 
-#'        sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for 
+#' @param outputmargin whether the prediction should be returned in the form of original untransformed
+#'        sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
 #'        logistic regression would result in predictions for log-odds instead of probabilities.
 #' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details).
 #'        It will use all the trees by default (\code{NULL} value).
-#' @param predleaf whether predict leaf index instead. 
-#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several 
+#' @param predleaf whether to predict leaf index instead.
+#' @param predcontrib whether to return feature contributions to individual predictions instead (see Details).
+#' @param approxcontrib whether to use a fast approximation for feature contributions (see Details).
+#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several
 #'        prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.
 #' @param ... Parameters passed to \code{predict.xgb.Booster}
-#' 
-#' @details  
-#' Note that \code{ntreelimit} is not necesserily equal to the number of boosting iterations
-#' and it is not necesserily equal to the number of trees in a model.
+#'
+#' @details
+#' Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
+#' and it is not necessarily equal to the number of trees in a model.
 #' E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
-#' But for multiclass classification, there are multiple trees per iteration, 
-#' but \code{ntreelimit} limits the number of boosting iterations.
-#' 
-#' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear, 
-#' since gblinear doesn't keep its boosting history. 
-#' 
-#' One possible practical applications of the \code{predleaf} option is to use the model 
-#' as a generator of new features which capture non-linearity and interactions, 
-#' e.g., as implemented in \code{\link{xgb.create.features}}. 
-#' 
-#' @return 
+#' But for multiclass classification, while there are multiple trees per iteration,
+#' \code{ntreelimit} limits the number of boosting iterations.
+#'
+#' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
+#' since gblinear doesn't keep its boosting history.
+#'
+#' One possible practical application of the \code{predleaf} option is to use the model
+#' as a generator of new features which capture non-linearity and interactions,
+#' e.g., as implemented in \code{\link{xgb.create.features}}.
+#'
+#' Setting \code{predcontrib = TRUE} allows calculating contributions of each feature to
+#' individual predictions. For "gblinear" booster, feature contributions are simply linear terms
+#' (feature_beta * feature_value). For "gbtree" booster, feature contributions are SHAP
+#' values (Lundberg 2017) that sum to the difference between the expected output
+#' of the model and the current prediction (where the hessian weights are used to compute the expectations).
+#' Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
+#' in \url{http://blog.datadive.net/interpreting-random-forests/}.
+#'
+#' @return
 #' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
-#' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or 
-#' a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on 
+#' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
+#' a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
 #' the \code{reshape} value.
-#' 
-#' When \code{predleaf = TRUE}, the output is a matrix object with the 
+#'
+#' When \code{predleaf = TRUE}, the output is a matrix object with the
 #' number of columns corresponding to the number of trees.
-#' 
+#'
+#' When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is a matrix object with
+#' \code{num_features + 1} columns. The last "+ 1" column in a matrix corresponds to bias.
+#' For a multiclass case, a list of \code{num_class} elements is returned, where each element is
+#' such a matrix. The contribution values are on the scale of untransformed margin
+#' (e.g., for binary classification this means that the contributions are log-odds deviations from bias).
+#'
 #' @seealso
 #' \code{\link{xgb.train}}.
 #' 
+#' @references
+#'
+#' Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
+#'
+#' Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
+#'
 #' @examples
 #' ## binary classification:
-#' 
+#'
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #' train <- agaricus.train
 #' test <- agaricus.test
-#' 
-#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, 
-#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+#'
+#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
+#'                eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")
 #' # use all trees by default
 #' pred <- predict(bst, test$data)
 #' # use only the 1st tree
-#' pred <- predict(bst, test$data, ntreelimit = 1)
-#' 
-#' 
+#' pred1 <- predict(bst, test$data, ntreelimit = 1)
+#'
+#' # Predicting tree leafs:
+#' # the result is an nsamples X ntrees matrix
+#' pred_leaf <- predict(bst, test$data, predleaf = TRUE)
+#' str(pred_leaf)
+#'
+#' # Predicting feature contributions to predictions:
+#' # the result is an nsamples X (nfeatures + 1) matrix
+#' pred_contr <- predict(bst, test$data, predcontrib = TRUE)
+#' str(pred_contr)
+#' # verify that contributions' sums are equal to log-odds of predictions (up to float precision):
+#' summary(rowSums(pred_contr) - qlogis(pred))
+#' # for the 1st record, let's inspect its features that had non-zero contribution to prediction:
+#' contr1 <- pred_contr[1,]
+#' contr1 <- contr1[-length(contr1)]    # drop BIAS
+#' contr1 <- contr1[contr1 != 0]        # drop non-contributing features
+#' contr1 <- contr1[order(abs(contr1))] # order by contribution magnitude
+#' old_mar <- par("mar")
+#' par(mar = old_mar + c(0,7,0,0))
+#' barplot(contr1, horiz = TRUE, las = 2, xlab = "contribution to prediction in log-odds")
+#' par(mar = old_mar)
+#'
+#'
 #' ## multiclass classification in iris dataset:
-#' 
+#'
 #' lb <- as.numeric(iris$Species) - 1
 #' num_class <- 3
 #' set.seed(11)
@@ -145,7 +233,7 @@ xgb.Booster.check <- function(bst, saveraw = TRUE) {
 #' pred_labels <- max.col(pred) - 1
 #' # the following should result in the same error as seen in the last iteration
 #' sum(pred_labels != lb)/length(lb)
-#' 
+#'
 #' # compare that to the predictions from softmax:
 #' set.seed(11)
 #' bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
@@ -154,14 +242,14 @@ xgb.Booster.check <- function(bst, saveraw = TRUE) {
 #' pred <- predict(bst, as.matrix(iris[, -5]))
 #' str(pred)
 #' all.equal(pred, pred_labels)
-#' # prediction from using only 5 iterations should result 
+#' # prediction from using only 5 iterations should result
 #' # in the same error as seen in iteration 5:
 #' pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
 #' sum(pred5 != lb)/length(lb)
-#' 
-#' 
+#'
+#'
 #' ## random forest-like model of 25 trees for binary classification:
-#' 
+#'
 #' set.seed(11)
 #' bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
 #'                nthread = 2, nrounds = 1, objective = "binary:logistic",
@@ -177,35 +265,57 @@ xgb.Booster.check <- function(bst, saveraw = TRUE) {
 #'
 #' @rdname predict.xgb.Booster
 #' @export
-predict.xgb.Booster <- function(object, newdata, missing = NA,
-    outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE, reshape = FALSE, ...) {
+predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
+                                predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...) {

-  object <- xgb.Booster.check(object, saveraw = FALSE)
-  if (class(newdata) != "xgb.DMatrix")
+  object <- xgb.Booster.complete(object, saveraw = FALSE)
+  if (!inherits(newdata, "xgb.DMatrix"))
    newdata <- xgb.DMatrix(newdata, missing = missing)
+  if (!is.null(object[["feature_names"]]) &&
+      !is.null(colnames(newdata)) &&
+      !identical(object[["feature_names"]], colnames(newdata)))
+    stop("Feature names stored in `object` and `newdata` are different!")
  if (is.null(ntreelimit))
    ntreelimit <- NVL(object$best_ntreelimit, 0)
+  if (NVL(object$params[['booster']], '') == 'gblinear')
+    ntreelimit <- 0
  if (ntreelimit < 0)
    stop("ntreelimit cannot be negative")
-  
-  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf)
-  
-  ret <- .Call("XGBoosterPredict_R", object$handle, newdata, option[1],
-               as.integer(ntreelimit), PACKAGE = "xgboost")
-  
-  if (length(ret) %% nrow(newdata) != 0)
-    stop("prediction length ", length(ret)," is not multiple of nrows(newdata) ", nrow(newdata))
-  npred_per_case <- length(ret) / nrow(newdata)

-  if (predleaf){
-    len <- nrow(newdata)
-    ret <- if (length(ret) == len) {
+  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) + 8L * as.logical(approxcontrib)
+
+  ret <- .Call(XGBoosterPredict_R, object$handle, newdata, option[1], as.integer(ntreelimit))
+
+  n_ret <- length(ret)
+  n_row <- nrow(newdata)
+  npred_per_case <- n_ret / n_row
+
+  if (n_ret %% n_row != 0)
+    stop("prediction length ", n_ret, " is not multiple of nrows(newdata) ", n_row)
+
+  if (predleaf) {
+    ret <- if (n_ret == n_row) {
      matrix(ret, ncol = 1)
    } else {
-      t(matrix(ret, ncol = len))
+      matrix(ret, nrow = n_row, byrow = TRUE)
+    }
+  } else if (predcontrib) {
+    n_col1 <- ncol(newdata) + 1
+    n_group <- npred_per_case / n_col1
+    dnames <- if (!is.null(colnames(newdata))) list(NULL, c(colnames(newdata), "BIAS")) else NULL
+    ret <- if (n_ret == n_row) {
+      matrix(ret, ncol = 1, dimnames = dnames)
+    } else if (n_group == 1) {
+      matrix(ret, nrow = n_row, byrow = TRUE, dimnames = dnames)
+    } else {
+      grp_mask <- rep(seq_len(n_col1), n_row) +
+        rep((seq_len(n_row) - 1) * n_col1 * n_group, each = n_col1)
+      lapply(seq_len(n_group), function(g) {
+        matrix(ret[grp_mask + n_col1 * (g - 1)], nrow = n_row, byrow = TRUE, dimnames = dnames)
+      })
    }
  } else if (reshape && npred_per_case > 1) {
-    ret <- matrix(ret, ncol = length(ret) / nrow(newdata), byrow = TRUE)
+    ret <- matrix(ret, nrow = n_row, byrow = TRUE)
  }
  return(ret)
 }
@@ -227,9 +337,9 @@ predict.xgb.Booster.handle <- function(object, ...) {
 #'
 #' @param object Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}.
 #' @param name a non-empty character string specifying which attribute is to be accessed.
-#' @param value a value of an attribute for \code{xgb.attr<-}; for \code{xgb.attributes<-} 
-#'        it's a list (or an object coercible to a list) with the names of attributes to set 
-#'        and the elements corresponding to attribute values. 
+#' @param value a value of an attribute for \code{xgb.attr<-}; for \code{xgb.attributes<-}
+#'        it's a list (or an object coercible to a list) with the names of attributes to set
+#'        and the elements corresponding to attribute values.
 #'        Non-character values are converted to character.
 #'        When attribute value is not a scalar, only the first index is used.
 #'        Use \code{NULL} to remove an attribute.
@@ -238,32 +348,32 @@ predict.xgb.Booster.handle <- function(object, ...) {
 #' The primary purpose of xgboost model attributes is to store some meta-data about the model.
 #' Note that they are a separate concept from the object attributes in R.
 #' Specifically, they refer to key-value strings that can be attached to an xgboost model,
-#' stored together with the model's binary representation, and accessed later 
+#' stored together with the model's binary representation, and accessed later
 #' (from R or any other interface).
 #' In contrast, any R-attribute assigned to an R-object of \code{xgb.Booster} class
 #' would not be saved by \code{xgb.save} because an xgboost model is an external memory object
-#' and its serialization is handled extrnally.
-#' Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't 
-#' change the value of that parameter for a model. 
+#' and its serialization is handled externally.
+#' Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't
+#' change the value of that parameter for a model.
 #' Use \code{\link{xgb.parameters<-}} to set or change model parameters.
-#' 
+#'
 #' The attribute setters would usually work more efficiently for \code{xgb.Booster.handle}
 #' than for \code{xgb.Booster}, since only just a handle (pointer) would need to be copied.
 #' That would only matter if attributes need to be set many times.
 #' Note, however, that when feeding a handle of an \code{xgb.Booster} object to the attribute setters,
-#' the raw model cache of an \code{xgb.Booster} object would not be automatically updated, 
+#' the raw model cache of an \code{xgb.Booster} object would not be automatically updated,
 #' and it would be user's responsibility to call \code{xgb.save.raw} to update it.
-#' 
-#' The \code{xgb.attributes<-} setter either updates the existing or adds one or several attributes, 
+#'
+#' The \code{xgb.attributes<-} setter either updates the existing or adds one or several attributes,
 #' but it doesn't delete the other existing attributes.
-#' 
+#'
 #' @return
-#' \code{xgb.attr} returns either a string value of an attribute 
+#' \code{xgb.attr} returns either a string value of an attribute
 #' or \code{NULL} if an attribute wasn't stored in a model.
-#' 
-#' \code{xgb.attributes} returns a list of all attribute stored in a model 
+#'
+#' \code{xgb.attributes} returns a list of all attributes stored in a model
 #' or \code{NULL} if a model has no stored attributes.
-#' 
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' train <- agaricus.train
@@ -279,19 +389,19 @@ predict.xgb.Booster.handle <- function(object, ...) {
 #' bst1 <- xgb.load('xgb.model')
 #' print(xgb.attr(bst1, "my_attribute"))
 #' print(xgb.attributes(bst1))
-#' 
+#'
 #' # deletion:
 #' xgb.attr(bst1, "my_attribute") <- NULL
 #' print(xgb.attributes(bst1))
 #' xgb.attributes(bst1) <- list(a = NULL, b = NULL)
 #' print(xgb.attributes(bst1))
-#' 
+#'
 #' @rdname xgb.attr
 #' @export
 xgb.attr <- function(object, name) {
  if (is.null(name) || nchar(as.character(name[1])) == 0) stop("invalid attribute name")
  handle <- xgb.get.handle(object)
-  .Call("XGBoosterGetAttr_R", handle, as.character(name[1]), PACKAGE="xgboost")
+  .Call(XGBoosterGetAttr_R, handle, as.character(name[1]))
 }

 #' @rdname xgb.attr
@@ -302,9 +412,13 @@ xgb.attr <- function(object, name) {
  if (!is.null(value)) {
    # Coerce the elements to be scalar strings.
    # Q: should we warn user about non-scalar elements?
-    value <- as.character(value[1])
+    if (is.numeric(value[1])) {
+      value <- format(value[1], digits = 17)
+    } else {
+      value <- as.character(value[1])
+    }
  }
-  .Call("XGBoosterSetAttr_R", handle, as.character(name[1]), value, PACKAGE="xgboost")
+  .Call(XGBoosterSetAttr_R, handle, as.character(name[1]), value)
  if (is(object, 'xgb.Booster') && !is.null(object$raw)) {
    object$raw <- xgb.save.raw(object$handle)
  }
@@ -315,10 +429,10 @@ xgb.attr <- function(object, name) {
 #' @export
 xgb.attributes <- function(object) {
  handle <- xgb.get.handle(object)
-  attr_names <- .Call("XGBoosterGetAttrNames_R", handle, PACKAGE="xgboost")
+  attr_names <- .Call(XGBoosterGetAttrNames_R, handle)
  if (is.null(attr_names)) return(NULL)
  res <- lapply(attr_names, function(x) {
-    .Call("XGBoosterGetAttr_R", handle, x, PACKAGE="xgboost")
+    .Call(XGBoosterGetAttr_R, handle, x)
  })
  names(res) <- attr_names
  res
@@ -335,11 +449,15 @@ xgb.attributes <- function(object) {
  # Q: should we warn a user about non-scalar elements?
  a <- lapply(a, function(x) {
    if (is.null(x)) return(NULL)
-    as.character(x[1])
+    if (is.numeric(x[1])) {
+      format(x[1], digits = 17)
+    } else {
+      as.character(x[1])
+    }
  })
  handle <- xgb.get.handle(object)
  for (i in seq_along(a)) {
-    .Call("XGBoosterSetAttr_R", handle, names(a[i]), a[[i]], PACKAGE="xgboost")
+    .Call(XGBoosterSetAttr_R, handle, names(a[i]), a[[i]])
  }
  if (is(object, 'xgb.Booster') && !is.null(object$raw)) {
    object$raw <- xgb.save.raw(object$handle)
@@ -358,7 +476,7 @@ xgb.attributes <- function(object) {
 #' @details
 #' Note that the setter would usually work more efficiently for \code{xgb.Booster.handle}
 #' than for \code{xgb.Booster}, since only just a handle would need to be copied.
-#' 
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' train <- agaricus.train
@@ -367,7 +485,7 @@ xgb.attributes <- function(object) {
 #'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 #'
 #' xgb.parameters(bst) <- list(eta = 0.1)
-#' 
+#'
 #' @rdname xgb.parameters
 #' @export
 `xgb.parameters<-` <- function(object, value) {
@@ -380,7 +498,7 @@ xgb.attributes <- function(object) {
  p <- lapply(p, function(x) as.character(x)[1])
  handle <- xgb.get.handle(object)
  for (i in seq_along(p)) {
-    .Call("XGBoosterSetParam_R", handle, names(p[i]), p[[i]], PACKAGE = "xgboost")
+    .Call(XGBoosterSetParam_R, handle, names(p[i]), p[[i]])
  }
  if (is(object, 'xgb.Booster') && !is.null(object$raw)) {
    object$raw <- xgb.save.raw(object$handle)
@@ -388,8 +506,8 @@ xgb.attributes <- function(object) {
  object
 }

-# Extract # of trees in a model
-# TODO: either add a getter to C-interface, or simply set an 'ntree' attribute after each iteration
+# Extract the number of trees in a model.
+# TODO: either add a getter to C-interface, or simply set an 'ntree' attribute after each iteration.
 # internal utility function
 xgb.ntree <- function(bst) {
  length(grep('^booster', xgb.dump(bst)))
@@ -397,36 +515,35 @@ xgb.ntree <- function(bst) {


 #' Print xgb.Booster
-#' 
+#'
 #' Print information about xgb.Booster.
-#' 
+#'
 #' @param x an xgb.Booster object
 #' @param verbose whether to print detailed data (e.g., attribute values)
 #' @param ... not currently used
-#' 
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' train <- agaricus.train
 #' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
 #'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 #' attr(bst, 'myattr') <- 'memo'
-#' 
+#'
 #' print(bst)
 #' print(bst, verbose=TRUE)
 #'
-#' @method print xgb.Booster 
+#' @method print xgb.Booster
 #' @export
-print.xgb.Booster <- function(x, verbose=FALSE, ...) {
+print.xgb.Booster <- function(x, verbose = FALSE, ...) {
  cat('##### xgb.Booster\n')
-  
-  if (is.null(x$handle) || .Call("XGCheckNullPtr_R", x$handle, PACKAGE="xgboost")) {
-    cat("handle is invalid\n")
-    return(x)
-  }
-  
+
+  valid_handle <- !is.null.handle(x$handle)  # is.null.handle() is TRUE for a NULL/dead handle, so negate it
+  if (!valid_handle)
+    cat("Handle is invalid! Suggest using xgb.Booster.complete\n")
+
  cat('raw: ')
  if (!is.null(x$raw)) {
-    cat(format(object.size(x$raw), units="auto"), '\n')
+    cat(format(object.size(x$raw), units = "auto"), '\n')
  } else {
    cat('NULL\n')
  }
@@ -434,28 +551,30 @@ print.xgb.Booster <- function(x, verbose=FALSE, ...) {
    cat('call:\n  ')
    print(x$call)
  }
-  
+
  if (!is.null(x$params)) {
    cat('params (as set within xgb.train):\n')
-    cat( '  ', 
+    cat( '  ',
         paste(names(x$params),
               paste0('"', unlist(x$params), '"'),
-               sep=' = ', collapse=', '), '\n', sep='')
+               sep = ' = ', collapse = ', '), '\n', sep = '')
  }
  # TODO: need an interface to access all the xgboosts parameters

-  attrs <- xgb.attributes(x)
+  attrs <- character(0)
+  if (valid_handle)
+    attrs <- xgb.attributes(x)
  if (length(attrs) > 0) {
    cat('xgb.attributes:\n')
    if (verbose) {
      cat( paste(paste0('  ',names(attrs)),
                 paste0('"', unlist(attrs), '"'),
-                 sep=' = ', collapse='\n'), '\n', sep='')
+                 sep = ' = ', collapse = '\n'), '\n', sep = '')
    } else {
-      cat('  ', paste(names(attrs), collapse=', '), '\n', sep='')
+      cat('  ', paste(names(attrs), collapse = ', '), '\n', sep = '')
    }
  }
-  
+
  if (!is.null(x$callbacks) && length(x$callbacks) > 0) {
    cat('callbacks:\n')
    lapply(callback.calls(x$callbacks), function(x) {
@@ -463,24 +582,28 @@ print.xgb.Booster <- function(x, verbose=FALSE, ...) {
      print(x)
    })
  }
-  
-  cat('niter: ', x$niter, '\n', sep='')
+
+  if (!is.null(x$feature_names))
+    cat('# of features:', length(x$feature_names), '\n')
+
+  cat('niter: ', x$niter, '\n', sep = '')
  # TODO: uncomment when faster xgb.ntree is implemented
  #cat('ntree: ', xgb.ntree(x), '\n', sep='')
-  
-  for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks','evaluation_log','niter'))) {
+
+  for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks',
+                                'evaluation_log','niter','feature_names'))) {
    if (is.atomic(x[[n]])) {
-      cat(n, ': ', x[[n]], '\n', sep='')
+      cat(n, ':', x[[n]], '\n', sep = ' ')
    } else {
-      cat(n, ':\n\t', sep='')
+      cat(n, ':\n\t', sep = ' ')
      print(x[[n]])
    }
  }
-  
+
  if (!is.null(x$evaluation_log)) {
    cat('evaluation_log:\n')
    print(x$evaluation_log, row.names = FALSE, topn = 2)
  }
-  
+
  invisible(x)
 }
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -1,14 +1,17 @@
-#' Contruct xgb.DMatrix object
+#' Construct xgb.DMatrix object
 #' 
-#' Contruct xgb.DMatrix object from dense matrix, sparse matrix 
-#' or local file (that was created previously by saving an \code{xgb.DMatrix}).
+#' Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
+#' Supported input file formats are either a libsvm text file or a binary file that was created previously by
+#' \code{\link{xgb.DMatrix.save}}.
 #' 
-#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
-#' @param info a list of information of the xgb.DMatrix object
-#' @param missing Missing is only used when input is dense matrix, pick a float
-#'     value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
-#
-#' @param ... other information to pass to \code{info}.
+#' @param data a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, or a character 
+#'        string representing a filename.
+#' @param info a named list of additional information to store in the \code{xgb.DMatrix} object.
+#'        See \code{\link{setinfo}} for the specific allowed kinds of information.
+#' @param missing a float value that represents missing values in data (used only when input is a dense matrix).
+#'        It is useful when a 0 or some other extreme value represents missing values in data.
+#' @param silent whether to suppress printing an informational message after loading from a file.
+#' @param ... the \code{info} data could be passed directly as parameters, without creating an \code{info} list.
 #' 
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -17,32 +20,27 @@
 #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 #' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 #' @export
-xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
+xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, ...) {
  cnames <- NULL
  if (typeof(data) == "character") {
-    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE),
-                    PACKAGE = "xgboost")
+    if (length(data) > 1)
+      stop("'data' has class 'character' and length ", length(data),
+           ".\n  'data' accepts either a numeric matrix or a single filename.")
+    handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
  } else if (is.matrix(data)) {
-    handle <- .Call("XGDMatrixCreateFromMat_R", data, missing,
-                    PACKAGE = "xgboost")
+    handle <- .Call(XGDMatrixCreateFromMat_R, data, missing)
    cnames <- colnames(data)
-  } else if (class(data) == "dgCMatrix") {
-    handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x,
-                    PACKAGE = "xgboost")
+  } else if (inherits(data, "dgCMatrix")) {
+    handle <- .Call(XGDMatrixCreateFromCSC_R, data@p, data@i, data@x, nrow(data))
    cnames <- colnames(data)
  } else {
-    stop(paste("xgb.DMatrix: does not support to construct from ",
-               typeof(data)))
+    stop("xgb.DMatrix does not support construction from ", typeof(data))
  }
  dmat <- handle
  attributes(dmat) <- list(.Dimnames = list(NULL, cnames), class = "xgb.DMatrix")
-  #dmat <- list(handle = handle, colnames = cnames)
-  #attr(dmat, 'class') <- "xgb.DMatrix"

  info <- append(info, list(...))
-  if (length(info) == 0)
-    return(dmat)
-  for (i in 1:length(info)) {
+  for (i in seq_along(info)) {
    p <- info[i]
    setinfo(dmat, names(p), p[[1]])
  }
@@ -53,10 +51,9 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
 # get dmatrix from data, label
 # internal helper method
 xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
-  inClass <- class(data)
-  if ("dgCMatrix" %in% inClass || "matrix" %in% inClass ) {
+  if (inherits(data, "dgCMatrix") || is.matrix(data)) {
    if (is.null(label)) {
-      stop("xgboost: need label when data is a matrix")
+      stop("label must be provided when data is a matrix")
    }
    dtrain <- xgb.DMatrix(data, label = label, missing = missing)
    if (!is.null(weight)){
@@ -66,15 +63,14 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
    if (!is.null(label)) {
      warning("xgboost: label will be ignored.")
    }
-    if (inClass == "character") {
-      dtrain <- xgb.DMatrix(data)
-    } else if (inClass == "xgb.DMatrix") {
+    if (is.character(data)) {
+      dtrain <- xgb.DMatrix(data[1])
+    } else if (inherits(data, "xgb.DMatrix")) {
      dtrain <- data
-    } else if (inClass == "data.frame") {
-      stop("xgboost only support numerical matrix input,
-           use 'data.matrix' to transform the data.")
+    } else if (inherits(data, "data.frame")) {
+      stop("xgboost doesn't support data.frame as input. Convert it to matrix first.")
    } else {
-      stop("xgboost: Invalid input of data")
+      stop("xgboost: invalid input data")
    }
  }
  return (dtrain)
@@ -101,8 +97,7 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
 #' 
 #' @export
 dim.xgb.DMatrix <- function(x) {
-  c(.Call("XGDMatrixNumRow_R", x, PACKAGE="xgboost"),
-    .Call("XGDMatrixNumCol_R", x, PACKAGE="xgboost"))
+  c(.Call(XGDMatrixNumRow_R, x), .Call(XGDMatrixNumCol_R, x))
 }


@@ -168,8 +163,11 @@ dimnames.xgb.DMatrix <- function(x) {
 #'     \item \code{weight}: to do a weight rescale ;
 #'     \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
 #'     \item \code{nrow}: number of rows of the \code{xgb.DMatrix}.
+#'     
 #' }
 #' 
+#' \code{group} can be set up by \code{setinfo} but can't be retrieved by \code{getinfo}.
+#' 
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' train <- agaricus.train
@@ -190,11 +188,11 @@ getinfo.xgb.DMatrix <- function(object, name, ...) {
  if (typeof(name) != "character" ||
      length(name) != 1 ||
      !name %in% c('label', 'weight', 'base_margin', 'nrow')) {
-    stop("getinfo: name must one of the following\n",
+    stop("getinfo: name must be one of the following\n",
         "    'label', 'weight', 'base_margin', 'nrow'")
  }
  if (name != "nrow"){
-    ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
+    ret <- .Call(XGDMatrixGetInfo_R, object, name)
  } else {
    ret <- nrow(object)
  }
@@ -219,7 +217,7 @@ getinfo.xgb.DMatrix <- function(object, name, ...) {
 #'     \item \code{label}: label Xgboost learn from ;
 #'     \item \code{weight}: to do a weight rescale ;
 #'     \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
-#'     \item \code{group}.
+#'     \item \code{group}: number of rows in each group (to use with \code{rank:pairwise} objective).
 #' }
 #' 
 #' @examples
@@ -241,32 +239,28 @@ setinfo.xgb.DMatrix <- function(object, name, info, ...) {
  if (name == "label") {
    if (length(info) != nrow(object))
      stop("The length of labels must equal to the number of rows in the input data")
-    .Call("XGDMatrixSetInfo_R", object, name, as.numeric(info),
-          PACKAGE = "xgboost")
+    .Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
    return(TRUE)
  }
  if (name == "weight") {
    if (length(info) != nrow(object))
      stop("The length of weights must equal to the number of rows in the input data")
-    .Call("XGDMatrixSetInfo_R", object, name, as.numeric(info),
-          PACKAGE = "xgboost")
+    .Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
    return(TRUE)
  }
  if (name == "base_margin") {
    # if (length(info)!=nrow(object))
    #   stop("The length of base margin must equal to the number of rows in the input data")
-    .Call("XGDMatrixSetInfo_R", object, name, as.numeric(info),
-          PACKAGE = "xgboost")
+    .Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
    return(TRUE)
  }
  if (name == "group") {
    if (sum(info) != nrow(object))
      stop("The sum of groups must equal to the number of rows in the input data")
-    .Call("XGDMatrixSetInfo_R", object, name, as.integer(info),
-          PACKAGE = "xgboost")
+    .Call(XGDMatrixSetInfo_R, object, name, as.integer(info))
    return(TRUE)
  }
-  stop(paste("setinfo: unknown info name", name))
+  stop("setinfo: unknown info name ", name)
  return(FALSE)
 }

@@ -300,10 +294,10 @@ slice <- function(object, ...) UseMethod("slice")
 #' @rdname slice.xgb.DMatrix
 #' @export
 slice.xgb.DMatrix <- function(object, idxset, ...) {
-  if (class(object) != "xgb.DMatrix") {
-    stop("slice: first argument dtrain must be xgb.DMatrix")
+  if (!inherits(object, "xgb.DMatrix")) {
+    stop("object must be xgb.DMatrix")
  }
-  ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
+  ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset)

  attr_list <- attributes(object)
  nr <- nrow(object)
@@ -311,7 +305,7 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {
  ind <- which(len == nr)
  if (length(ind) > 0) {
    nms <- names(attr_list)[ind]
-    for (i in 1:length(ind)) {
+    for (i in seq_along(ind)) {
      attr(ret, nms[i]) <- attr(object, nms[i])[idxset]
    }
  }
@@ -320,7 +314,7 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {

 #' @rdname slice.xgb.DMatrix
 #' @export
-`[.xgb.DMatrix` <- function(object, idxset, colset=NULL) {
+`[.xgb.DMatrix` <- function(object, idxset, colset = NULL) {
  slice(object, idxset)
 }

@@ -344,7 +338,7 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {
 #' 
 #' @method print xgb.DMatrix
 #' @export
-print.xgb.DMatrix <- function(x, verbose=FALSE, ...) {
+print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
  cat('xgb.DMatrix  dim:', nrow(x), 'x', ncol(x), ' info: ')
  infos <- c()
  if(length(getinfo(x, 'label')) > 0) infos <- 'label'
@@ -356,7 +350,7 @@ print.xgb.DMatrix <- function(x, verbose=FALSE, ...) {
  cat('  colnames:')
  if (verbose & !is.null(cnames)) {
    cat("\n'")
-    cat(cnames, sep="','")
+    cat(cnames, sep = "','")
    cat("'")
  } else {
    if (is.null(cnames)) cat(' no')
--- a/R-package/R/xgb.DMatrix.save.R
+++ b/R-package/R/xgb.DMatrix.save.R
@@ -15,9 +15,9 @@
 xgb.DMatrix.save <- function(dmatrix, fname) {
  if (typeof(fname) != "character")
    stop("fname must be character")
-  if (class(dmatrix) != "xgb.DMatrix")
-    stop("the input data must be xgb.DMatrix")
+  if (!inherits(dmatrix, "xgb.DMatrix"))
+    stop("dmatrix must be xgb.DMatrix")
  
-  .Call("XGDMatrixSaveBinary_R", dmatrix, fname, 0L, PACKAGE = "xgboost")
+  .Call(XGDMatrixSaveBinary_R, dmatrix, fname[1], 0L)
  return(TRUE)
 }
--- a/R-package/R/xgb.create.features.R
+++ b/R-package/R/xgb.create.features.R
@@ -18,7 +18,7 @@
 #'  
 #' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
 #' 
-#' \url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
+#' \url{https://research.fb.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
 #' 
 #' Extract explaining the method:
 #' 
@@ -57,7 +57,8 @@
 #' bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
 #' 
 #' # Model accuracy without new features
-#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) /
+#'                    length(agaricus.test$label)
 #' 
 #' # Convert previous features to one hot encoding
 #' new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
@@ -70,15 +71,17 @@
 #' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
 #' 
 #' # Model accuracy with new features
-#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) /
+#'                   length(agaricus.test$label)
 #' 
 #' # Here the accuracy was already good and is now perfect.
-#' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
+#' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now",
+#'           accuracy.after, "!\n"))
 #' 
 #' @export
 xgb.create.features <- function(model, data, ...){
  check.deprecation(...)
  pred_with_leaf <- predict(model, data, predleaf = TRUE)
  cols <- lapply(as.data.frame(pred_with_leaf), factor)
-  cBind(data, sparse.model.matrix( ~ . -1, cols))
+  cbind(data, sparse.model.matrix( ~ . -1, cols))
 }
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -1,6 +1,6 @@
 #' Cross Validation
 #' 
-#' The cross valudation function of xgboost
+#' The cross validation function of xgboost
 #' 
 #' @param params the list of parameters. Commonly used ones are:
 #' \itemize{
@@ -16,10 +16,10 @@
 #'
 #'   See \code{\link{xgb.train}} for further details.
 #'   See also demo/ for walkthrough example in R.
-#' @param data takes an \code{xgb.DMatrix} or \code{Matrix} as the input.
+#' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.
 #' @param nrounds the max number of iterations
 #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. 
-#' @param label vector of response values. Should be provided only when data is \code{DMatrix}.
+#' @param label vector of response values. Should be provided only when data is an R-matrix.
 #' @param missing is only used when input is a dense matrix. By default is set to NA, which means 
 #'        that NA values should be considered as 'missing' by the algorithm. 
 #'        Sometimes, 0 or other extreme value might be used to represent missing values.
@@ -34,6 +34,7 @@
 #'   \item \code{rmse} Rooted mean square error
 #'   \item \code{logloss} negative log-likelihood function
 #'   \item \code{auc} Area under curve
+#'   \item \code{aucpr} Area under PR curve
 #'   \item \code{merror} Exact matching error, used to evaluate multi-class classification
 #' }
 #' @param obj customized objective function. Returns gradient and second order 
@@ -88,6 +89,7 @@
 #'         CV-based evaluation means and standard deviations for the training and test CV-sets.
 #'         It is created by the \code{\link{cb.evaluation.log}} callback.
 #'   \item \code{niter} number of boosting iterations.
+#'   \item \code{nfeatures} number of features in training data.
 #'   \item \code{folds} the list of CV folds' indices - either those passed through the \code{folds} 
 #'         parameter or randomly generated.
 #'   \item \code{best_iteration} iteration number with the best evaluation metric value
@@ -129,15 +131,14 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
  #if (is.null(params[['eval_metric']]) && is.null(feval))
  #  stop("Either 'eval_metric' or 'feval' must be provided for CV")
  
-  # Labels
-  if (class(data) == 'xgb.DMatrix')
-    labels <- getinfo(data, 'label')
-  if (is.null(labels))
+  # Check the labels
+  if ( (inherits(data, 'xgb.DMatrix') && is.null(getinfo(data, 'label'))) ||
+       (!inherits(data, 'xgb.DMatrix') && is.null(label)))
    stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
  
  # CV folds
  if(!is.null(folds)) {
-    if(class(folds) != "list" || length(folds) < 2)
+    if(!is.list(folds) || length(folds) < 2)
      stop("'folds' must be a list with 2 or more elements that are vectors of indices for each CV-fold")
    nfold <- length(folds)
  } else {
@@ -154,7 +155,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
  params <- c(params, list(silent = 1))
  print_every_n <- max( as.integer(print_every_n), 1L)
  if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) {
-    callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n))
+    callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n, showsd = showsd))
  }
  # evaluation log callback: always is on in CV
  evaluation_log <- list()
@@ -166,12 +167,12 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
  if (!is.null(early_stopping_rounds) &&
      !has.callbacks(callbacks, 'cb.early.stop')) {
    callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, 
-                                                 maximize=maximize, verbose=verbose))
+                                                 maximize = maximize, verbose = verbose))
  }
  # CV-predictions callback
  if (prediction &&
      !has.callbacks(callbacks, 'cb.cv.predict')) {
-    callbacks <- add.cb(callbacks, cb.cv.predict(save_models=FALSE))
+    callbacks <- add.cb(callbacks, cb.cv.predict(save_models = FALSE))
  }
  # Sort the callbacks into categories
  cb <- categorize.callbacks(callbacks)
@@ -179,12 +180,13 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
  
  # create the booster-folds
  dall <- xgb.get.DMatrix(data, label, missing)
-  bst_folds <- lapply(1:length(folds), function(k) {
+  bst_folds <- lapply(seq_along(folds), function(k) {
    dtest  <- slice(dall, folds[[k]])
    dtrain <- slice(dall, unlist(folds[-k]))
-    bst <- xgb.Booster(params, list(dtrain, dtest))
-    list(dtrain=dtrain, bst=bst, watchlist=list(train=dtrain, test=dtest), index=folds[[k]])
+    handle <- xgb.Booster.handle(params, list(dtrain, dtest))
+    list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test=dtest), index = folds[[k]])
  })
+  rm(dall)
  # a "basket" to collect some results from callbacks
  basket <- list()

@@ -213,7 +215,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
    
    if (stop_condition) break
  }
-  for (f in cb$finalize) f(finalize=TRUE)
+  for (f in cb$finalize) f(finalize = TRUE)

  # the CV result
  ret <- list(
@@ -222,6 +224,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
    callbacks = callbacks,
    evaluation_log = evaluation_log,
    niter = end_iteration,
+    nfeatures = ncol(data),
    folds = folds
  )
  ret <- c(ret, basket)
@@ -255,8 +258,8 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
 #' @rdname print.xgb.cv
 #' @method print xgb.cv.synchronous
 #' @export
-print.xgb.cv.synchronous <- function(x, verbose=FALSE, ...) {
-  cat('##### xgb.cv ', length(x$folds), '-folds\n', sep='')
+print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) {
+  cat('##### xgb.cv ', length(x$folds), '-folds\n', sep = '')
  
  if (verbose) {
    if (!is.null(x$call)) {
@@ -268,7 +271,7 @@ print.xgb.cv.synchronous <- function(x, verbose=FALSE, ...) {
      cat( '  ', 
           paste(names(x$params), 
                 paste0('"', unlist(x$params), '"'),
-                 sep=' = ', collapse=', '), '\n', sep='')
+                 sep = ' = ', collapse = ', '), '\n', sep = '')
    }
    if (!is.null(x$callbacks) && length(x$callbacks) > 0) {
      cat('callbacks:\n')
@@ -281,7 +284,7 @@ print.xgb.cv.synchronous <- function(x, verbose=FALSE, ...) {
    for (n in c('niter', 'best_iteration', 'best_ntreelimit')) {
      if (is.null(x[[n]])) 
        next
-      cat(n, ': ', x[[n]], '\n', sep='')
+      cat(n, ': ', x[[n]], '\n', sep = '')
    }

    if (!is.null(x$pred)) {
--- a/R-package/R/xgb.dump.R
+++ b/R-package/R/xgb.dump.R
@@ -1,23 +1,26 @@
-#' Save xgboost model to text file
+#' Dump an xgboost model in text format.
 #' 
-#' Save a xgboost model to text file. Could be parsed later.
+#' Dump an xgboost model in text format.
 #' 
 #' @param model the model object.
-#' @param fname the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.
-#' @param fmap feature map file representing the type of feature. 
+#' @param fname the name of the text file where to save the model text dump. 
+#'        If not provided or set to \code{NULL}, the model is returned as a \code{character} vector.
+#' @param fmap feature map file representing feature types.
 #'        Detailed description could be found at 
 #'        \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
 #'        See demo/ for walkthrough example in R, and
 #'        \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} 
 #'        for example Format.
-#' @param with_stats whether dump statistics of splits 
-#'        When this option is on, the model dump comes with two additional statistics:
+#' @param with_stats whether to dump some additional statistics about the splits.
+#'        When this option is on, the model dump contains two additional values:
 #'        gain is the approximate loss function gain we get in each split;
 #'        cover is the sum of second order gradient in each node.
+#' @param dump_format either 'text' or 'json' format could be specified.
 #' @param ... currently not used
 #'
 #' @return
-#' if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
+#' If fname is not provided or set to \code{NULL} the function will return the model
+#' as a \code{character} vector. Otherwise it will return \code{TRUE}.
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -30,30 +33,39 @@
 #' xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE)
 #' 
 #' # print the model without saving it to a file
-#' print(xgb.dump(bst))
+#' print(xgb.dump(bst, with_stats = TRUE))
+#' 
+#' # print in JSON format:
+#' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
+#' 
 #' @export
-xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, ...) {
+xgb.dump <- function(model, fname = NULL, fmap = "", with_stats=FALSE,
+                     dump_format = c("text", "json"), ...) {
  check.deprecation(...)
-  if (class(model) != "xgb.Booster")
+  dump_format <- match.arg(dump_format)
+  if (!inherits(model, "xgb.Booster"))
    stop("model: argument must be of type xgb.Booster")
-  if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1))
-    stop("fname: argument must be of type character (when provided)")
-  if (!(class(fmap) %in% c("character", "NULL") && length(fmap) <= 1))
-    stop("fmap: argument must be of type character (when provided)")
+  if (!(is.null(fname) || is.character(fname)))
+    stop("fname: argument must be a character string (when provided)")
+  if (!(is.null(fmap) || is.character(fmap)))
+    stop("fmap: argument must be a character string (when provided)")
  
-  model <- xgb.Booster.check(model)
-  model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats), PACKAGE = "xgboost")
+  model <- xgb.Booster.complete(model)
+  model_dump <- .Call(XGBoosterDumpModel_R, model$handle, NVL(fmap, "")[1], as.integer(with_stats),
+                      as.character(dump_format))

  if (is.null(fname)) 
    model_dump <- stri_replace_all_regex(model_dump, '\t', '')
  
-  model_dump <- unlist(stri_split_regex(model_dump, '\n'))
+  if (dump_format == "text")
+    model_dump <- unlist(stri_split_regex(model_dump, '\n'))
+  
  model_dump <- grep('^\\s*$', model_dump, invert = TRUE, value = TRUE)
  
  if (is.null(fname)) {
    return(model_dump)
  } else {
-    writeLines(model_dump, fname)
+    writeLines(model_dump, fname[1])
    return(TRUE)
  }
 }
--- a/R-package/R/xgb.ggplot.R
+++ b/R-package/R/xgb.ggplot.R
@@ -131,5 +131,5 @@ multiplot <- function(..., cols = 1) {

 globalVariables(c(
  "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme",
-  "element_blank", "element_text"
+  "element_blank", "element_text", "V1", "Weight"
 ))
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -1,101 +1,134 @@
-#' Show importance of features in a model
+#' Importance of features in a model.
 #' 
-#' Create a \code{data.table} of the most important features of a model. 
+#' Creates a \code{data.table} of feature importances in a model.
 #' 
-#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
-#' @param model generated by the \code{xgb.train} function.
-#' @param data the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
-#' @param label the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
-#' @param target a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.
-#'
-#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
+#' @param feature_names character vector of feature names. If the model already
+#'       contains feature names, those would be used when \code{feature_names=NULL} (default value).
+#'       Non-null \code{feature_names} could be provided to override those in the model.
+#' @param model object of class \code{xgb.Booster}.
+#' @param trees (only for the gbtree booster) an integer vector of tree indices that should be included
+#'          into the importance calculation. If set to \code{NULL}, all trees of the model are parsed.
+#'          It could be useful, e.g., in multiclass classification to get feature importances 
+#'          for each class separately. IMPORTANT: the tree index in xgboost models
+#'          is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).
+#' @param data deprecated.
+#' @param label deprecated.
+#' @param target deprecated.
 #'
 #' @details 
-#' This function is for both linear and tree models.
 #' 
-#' \code{data.table} is returned by the function. 
-#' The columns are :
+#' This function works for both linear and tree models.
+#' 
+#' For linear models, the importance is the absolute magnitude of linear coefficients. 
+#' For that reason, in order to obtain a meaningful ranking by importance for a linear model, 
+#' the features need to be on the same scale (which you also would want to do when using either 
+#' L1 or L2 regularization).
+#' 
+#' @return
+#' 
+#' For a tree model, a \code{data.table} with the following columns:
 #' \itemize{
-#'   \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump;
-#'   \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training (only available for tree models);
-#'   \item \code{Cover} metric of the number of observation related to this feature (only available for tree models);
-#'   \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees.
+#'   \item \code{Features} names of the features used in the model;
+#'   \item \code{Gain} represents fractional contribution of each feature to the model based on
+#'        the total gain of this feature's splits. Higher percentage means a more important 
+#'        predictive feature.
+#'   \item \code{Cover} metric of the number of observations related to this feature;
+#'   \item \code{Frequency} percentage representing the relative number of times
+#'        a feature has been used in trees.
 #' }
 #' 
-#' If you don't provide \code{feature_names}, index of the features will be used instead.
+#' A linear model's importance \code{data.table} has the following columns:
+#' \itemize{
+#'   \item \code{Features} names of the features used in the model;
+#'   \item \code{Weight} the linear coefficient of this feature;
+#'   \item \code{Class} (only for multiclass models) class label.
+#' }
 #' 
-#' Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R).
-#' 
-#' Co-occurence count
-#' ------------------
-#' 
-#' The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. However, with this information only, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
-#' 
-#' Co-occurence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
-#' 
-#' If you need to remember one thing only: until you want to leave us early, don't eat a mushroom which has no odor :-)
+#' If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names}, 
+#' index of the features will be used instead. Because the index is extracted from the model dump
+#' (based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R).
 #' 
 #' @examples
+#' 
+#' # binomial classification using gbtree:
 #' data(agaricus.train, package='xgboost')
-#' 
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, 
-#'                eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+#' xgb.importance(model = bst)
 #' 
-#' xgb.importance(colnames(agaricus.train$data), model = bst)
+#' # binomial classification using gblinear:
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear", 
+#'                eta = 0.3, nthread = 1, nrounds = 20, objective = "binary:logistic")
+#' xgb.importance(model = bst)
 #' 
-#' # Same thing with co-occurence computation this time
-#' xgb.importance(colnames(agaricus.train$data), model = bst, data = agaricus.train$data, label = agaricus.train$label)
+#' # multiclass classification using gbtree:
+#' nclass <- 3
+#' nrounds <- 10
+#' mbst <- xgboost(data = as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1,
+#'                max_depth = 3, eta = 0.2, nthread = 2, nrounds = nrounds,
+#'                objective = "multi:softprob", num_class = nclass)
+#' # all classes clumped together:
+#' xgb.importance(model = mbst)
+#' # inspect importances separately for each class:
+#' xgb.importance(model = mbst, trees = seq(from=0, by=nclass, length.out=nrounds))
+#' xgb.importance(model = mbst, trees = seq(from=1, by=nclass, length.out=nrounds))
+#' xgb.importance(model = mbst, trees = seq(from=2, by=nclass, length.out=nrounds))
 #' 
+#' # multiclass classification using gblinear:
+#' mbst <- xgboost(data = scale(as.matrix(iris[, -5])), label = as.numeric(iris$Species) - 1,
+#'                booster = "gblinear", eta = 0.2, nthread = 1, nrounds = 15,
+#'                objective = "multi:softprob", num_class = nclass)
+#' xgb.importance(model = mbst)
+#'
 #' @export
-xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
-  if (!class(feature_names) %in% c("character", "NULL")) {
-    stop("feature_names: Has to be a vector of character or NULL if the model already contains feature name. Look at this function documentation to see where to get feature names.")
-  }
-
-  if (class(model) != "xgb.Booster") {
-    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
-  }
-
-  if((is.null(data) & !is.null(label)) | (!is.null(data) & is.null(label))) {
-    stop("data/label: Provide the two arguments if you want co-occurence computation or none of them if you are not interested but not one of them only.")
-  }
-
-  if(class(label) == "numeric"){
-    if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector")
-  }
+xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
+                           data = NULL, label = NULL, target = NULL){
  
-  treeDump <- function(feature_names, text, keepDetail){
-    if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature"
-    xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo" := Missing == No ][Feature != "Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequency = .N), by = groupBy, with = T][,`:=`(Gain = Gain / sum(Gain), Cover = Cover / sum(Cover), Frequency = Frequency / sum(Frequency))][order(Gain, decreasing = T)]
-  }
+  if (!(is.null(data) && is.null(label) && is.null(target)))
+    warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated")
  
-  linearDump <- function(feature_names, text){
-    weights <- which(text == "weight:") %>% {a =. + 1; text[a:length(text)]} %>% as.numeric
-    if(is.null(feature_names)) feature_names <- seq(to = length(weights))
-    data.table(Feature = feature_names, Weight = weights)
-  }
-
-  model.text.dump <- xgb.dump(model = model, with_stats = T)
+  if (!inherits(model, "xgb.Booster"))
+    stop("model: must be an object of class xgb.Booster")
  
-  if(model.text.dump[2] == "bias:"){
-    result <- model.text.dump %>% linearDump(feature_names, .)
-    if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.")
-  }  else {
-    result <- treeDump(feature_names, text = model.text.dump, keepDetail = !is.null(data))
+  if (is.null(feature_names) && !is.null(model$feature_names))
+    feature_names <- model$feature_names
+  
+  if (!(is.null(feature_names) || is.character(feature_names)))
+    stop("feature_names: Has to be a character vector")

-    # Co-occurence computation
-    if(!is.null(data) & !is.null(label) & nrow(result) > 0) {
-      # Take care of missing column
-      a <- data[, result[MissingNo == T,Feature], drop=FALSE] != 0
-      # Bind the two Matrix and reorder columns
-      c <- data[, result[MissingNo == F,Feature], drop=FALSE] %>% cBind(a,.) %>% .[,result[,Feature]]
-      rm(a)
-      # Apply split
-      d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split])
-      apply(c & d, 2, . %>% target %>% sum) -> vec
-
-      result <- result[, "RealCover" := as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo := NULL]
+  model_text_dump <- xgb.dump(model = model, with_stats = TRUE)
+  
+  # linear model
+  if(model_text_dump[2] == "bias:"){
+    weights <- which(model_text_dump == "weight:") %>%
+               {model_text_dump[(. + 1):length(model_text_dump)]} %>%
+               as.numeric
+    
+    num_class <- NVL(model$params$num_class, 1)
+    if(is.null(feature_names)) 
+      feature_names <- seq(to = length(weights) / num_class) - 1
+    if (length(feature_names) * num_class != length(weights))
+      stop("feature_names length does not match the number of features used in the model")
+    
+    result <- if (num_class == 1) {
+      data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))]
+    } else {
+      data.table(Feature = rep(feature_names, each = num_class),
+                 Weight = weights,
+                 Class = seq_len(num_class) - 1)[order(Class, -abs(Weight))]
    }
+  } else { 
+  # tree model
+    result <- xgb.model.dt.tree(feature_names = feature_names,
+                                text = model_text_dump,
+                                trees = trees)[
+      Feature != "Leaf", .(Gain = sum(Quality), 
+                           Cover = sum(Cover), 
+                           Frequency = .N), by = Feature][
+      ,`:=`(Gain = Gain / sum(Gain), 
+            Cover = Cover / sum(Cover),
+            Frequency = Frequency / sum(Frequency))][
+      order(Gain, decreasing = TRUE)]
  }
  result
 }
@@ -103,4 +136,4 @@ xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, labe
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c(".", ".N", "Gain", "Frequency", "Feature", "Split", "No", "Missing", "MissingNo", "RealCover"))
+globalVariables(c(".", ".N", "Gain", "Cover", "Frequency", "Feature", "Class"))
--- a/R-package/R/xgb.load.R
+++ b/R-package/R/xgb.load.R
@@ -1,8 +1,23 @@
 #' Load xgboost model from binary file
 #' 
-#' Load xgboost model from the binary model file
+#' Load xgboost model from the binary model file. 
 #' 
-#' @param modelfile the name of the binary file.
+#' @param modelfile the name of the binary input file.
+#' 
+#' @details 
+#' The input file is expected to contain a model saved in an xgboost-internal binary format
+#' using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some 
+#' appropriate methods from other xgboost interfaces. E.g., a model trained in Python and 
+#' saved from there in xgboost format, could be loaded from R.
+#' 
+#' Note: a model saved as an R object has to be loaded using the corresponding R methods,
+#' not \code{xgb.load}.
+#' 
+#' @return 
+#' An object of \code{xgb.Booster} class.
+#' 
+#' @seealso 
+#' \code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}. 
 #' 
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -19,13 +34,13 @@ xgb.load <- function(modelfile) {
  if (is.null(modelfile))
    stop("xgb.load: modelfile cannot be NULL")

-  handle <- xgb.Booster(modelfile = modelfile)
-  # re-use modelfile if it is raw so we donot need to serialize
+  handle <- xgb.Booster.handle(modelfile = modelfile)
+  # re-use modelfile if it is raw so we do not need to serialize
  if (typeof(modelfile) == "raw") {
    bst <- xgb.handleToBooster(handle, modelfile)
  } else {
    bst <- xgb.handleToBooster(handle, NULL)
  }
-  bst <- xgb.Booster.check(bst, saveraw = TRUE)
+  bst <- xgb.Booster.complete(bst, saveraw = TRUE)
  return(bst)
 }
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -3,12 +3,20 @@
 #' Parse a boosted tree model text dump into a \code{data.table} structure.
 #' 
 #' @param feature_names character vector of feature names. If the model already
-#'          contains feature names, this argument should be \code{NULL} (default value)
+#'          contains feature names, those would be used when \code{feature_names=NULL} (default value).
+#'          Non-null \code{feature_names} could be provided to override those in the model.
 #' @param model object of class \code{xgb.Booster}
 #' @param text \code{character} vector previously generated by the \code{xgb.dump} 
 #'          function  (where parameter \code{with_stats = TRUE} should have been set).
-#' @param n_first_tree limit the parsing to the \code{n} first trees. 
+#'          \code{text} takes precedence over \code{model}.
+#' @param trees an integer vector of tree indices that should be parsed.
 #'          If set to \code{NULL}, all trees of the model are parsed.
+#'          It could be useful, e.g., in multiclass classification to get only
+#'          the trees of one certain class. IMPORTANT: the tree index in xgboost models
+#'          is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).
+#' @param use_int_id a logical flag indicating whether nodes in columns "Yes", "No", "Missing" should be
+#'          represented as integers (when TRUE) or as "Tree-Node" character strings (when FALSE).
+#' @param ... currently not used.
 #'
 #' @return 
 #' A \code{data.table} with detailed information about model trees' nodes.
@@ -16,9 +24,9 @@
 #' The columns of the \code{data.table} are:
 #' 
 #' \itemize{
-#'  \item \code{Tree}: ID of a tree in a model
-#'  \item \code{Node}: ID of a node in a tree
-#'  \item \code{ID}: unique identifier of a node in a model
+#'  \item \code{Tree}: integer ID of a tree in a model (zero-based index)
+#'  \item \code{Node}: integer ID of a node in a tree (zero-based index)
+#'  \item \code{ID}: character identifier of a node in a model (only when \code{use_int_id=FALSE})
 #'  \item \code{Feature}: for a branch node, it's a feature id or name (when available);
 #'              for a leaf note, it simply labels it as \code{'Leaf'}
 #'  \item \code{Split}: location of the split for a branch node (split condition is always "less than")
@@ -30,6 +38,10 @@
 #'                      or collected by a leaf during training.
 #' } 
 #' 
+#' When \code{use_int_id=FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers
+#' in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from 
+#' the corresponding trees in the "Node" column.
+#' 
 #' @examples
 #' # Basic use:
 #' 
@@ -40,6 +52,9 @@
 #' 
 #' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
 #' 
+#' # This bst model already has feature_names stored with it, so those would be used when 
+#' # feature_names is not set:
+#' (dt <- xgb.model.dt.tree(model = bst))
 #' 
 #' # How to match feature names of splits that are following a current 'Yes' branch:
 #' 
@@ -47,68 +62,90 @@
 #'  
 #' @export
 xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
-                              n_first_tree = NULL){
+                              trees = NULL, use_int_id = FALSE, ...){
+  check.deprecation(...)
  
-  if (!class(feature_names) %in% c("character", "NULL")) {
-    stop("feature_names: Has to be a vector of character\n",
-         "  or NULL if the model dump already contains feature names.\n",
-         "  Look at this function documentation to see where to get feature names.")
+  if (!inherits(model, "xgb.Booster") && !is.character(text)) {
+    stop("Either 'model' must be an object of class xgb.Booster\n",
+         "  or 'text' must be a character vector with the result of xgb.dump\n",
+         "  (or NULL if 'model' was provided).")
  }
  
-  if (class(model) != "xgb.Booster" & class(text) != "character") {
-    stop("Either 'model' has to be an object of class xgb.Booster\n",
-         "  or 'text' has to be a character vector with the result of xgb.dump\n",
-         "  (or NULL if the model was provided).")
+  if (is.null(feature_names) && !is.null(model) && !is.null(model$feature_names))
+    feature_names <- model$feature_names
+  
+  if (!(is.null(feature_names) || is.character(feature_names))) {
+    stop("feature_names: must be a character vector")
  }
  
-  if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
-    stop("n_first_tree: Has to be a numeric vector of size 1.")
+  if (!(is.null(trees) || is.numeric(trees))) {
+    stop("trees: must be a vector of integers.")
  }
  
-  if(is.null(text)){
-    text <- xgb.dump(model = model, with_stats = T)
+  if (is.null(text)){
+    text <- xgb.dump(model = model, with_stats = TRUE)
+  }
+  
+  if (length(text) < 2 ||
+      sum(stri_detect_regex(text, 'yes=(\\d+),no=(\\d+)')) < 1) {
+    stop("Non-tree model detected! This function can only be used with tree models.")
  }
  
  position <- which(!is.na(stri_match_first_regex(text, "booster")))
  
-  add.tree.id <- function(x, i) paste(i, x, sep = "-")
+  add.tree.id <- function(node, tree) if (use_int_id) node else paste(tree, node, sep = "-")
  
  anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
  
-  td <- data.table(t=text)
+  td <- data.table(t = text)
  td[position, Tree := 1L]
  td[, Tree := cumsum(ifelse(is.na(Tree), 0L, Tree)) - 1L]
  
-  n_first_tree <- min(max(td$Tree), n_first_tree)
-  td <- td[Tree <= n_first_tree & !grepl('^booster', t)]
+  if (is.null(trees)) {
+    trees <- 0:max(td$Tree)
+  } else {
+    trees <- trees[trees >= 0 & trees <= max(td$Tree)]
+  }
+  td <- td[Tree %in% trees & !grepl('^booster', t)]
  
-  td[, Node := stri_match_first_regex(t, "(\\d+):")[,2] %>% as.numeric ]
-  td[, ID := add.tree.id(Node, Tree)]
+  td[, Node := stri_match_first_regex(t, "(\\d+):")[,2] %>% as.integer ]
+  if (!use_int_id) td[, ID := add.tree.id(Node, Tree)]
  td[, isLeaf := !is.na(stri_match_first_regex(t, "leaf"))]

  # parse branch lines
-  td[isLeaf==FALSE, c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") := {
-    rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
-                 "gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
-    # skip some indices with spurious capture groups from anynumber_regex
-    xtr <- stri_match_first_regex(t, rx)[, c(2,3,5,6,7,8,10)]
-    xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree)
-    lapply(1:ncol(xtr), function(i) xtr[,i])
-  }]
+  branch_rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
+                      "gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
+  branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
+  td[isLeaf == FALSE, 
+     (branch_cols) := {
+      # skip some indices with spurious capture groups from anynumber_regex
+      xtr <- stri_match_first_regex(t, branch_rx)[, c(2,3,5,6,7,8,10), drop = FALSE]
+      xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree)
+      lapply(seq_len(ncol(xtr)), function(i) xtr[,i])
+    }]
  # assign feature_names when available
-  td[isLeaf==FALSE & !is.null(feature_names), 
-     Feature := feature_names[as.numeric(Feature) + 1] ]
+  if (!is.null(feature_names)) {
+    if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE))
+      stop("feature_names has less elements than there are features used in the model")
+    td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1] ]
+  }
  
  # parse leaf lines
-  td[isLeaf==TRUE, c("Feature", "Quality", "Cover") := {
-    rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
-    xtr <- stri_match_first_regex(t, rx)[, c(2,4)]
-    c("Leaf", lapply(1:ncol(xtr), function(i) xtr[,i]))
-  }]
+  leaf_rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
+  leaf_cols <- c("Feature", "Quality", "Cover")
+  td[isLeaf == TRUE,
+     (leaf_cols) := {
+      xtr <- stri_match_first_regex(t, leaf_rx)[, c(2,4)]
+      c("Leaf", lapply(seq_len(ncol(xtr)), function(i) xtr[,i]))
+    }]
  
  # convert some columns to numeric
-  numeric_cols <- c("Quality", "Cover")
-  td[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols=numeric_cols]
+  numeric_cols <- c("Split", "Quality", "Cover")
+  td[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols = numeric_cols]
+  if (use_int_id) {
+    int_cols <- c("Yes", "No", "Missing")
+    td[, (int_cols) := lapply(.SD, as.integer), .SDcols = int_cols]
+  }
  
  td[, t := NULL]
  td[, isLeaf := NULL]
--- a/R-package/R/xgb.plot.deepness.R
+++ b/R-package/R/xgb.plot.deepness.R
@@ -46,7 +46,8 @@
 #' 
 #' data(agaricus.train, package='xgboost')
 #'
-#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
+#' # Change max_depth to a higher number to get a more significant result
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6,
 #'                eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic",
 #'                subsample = 0.5, min_child_weight = 2)
 #'
@@ -62,7 +63,7 @@
 xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.depth", "med.weight"),
                              plot = TRUE, ...) {
  
-  if (!(class(model) == "xgb.Booster" || is.data.table(model)))
+  if (!(inherits(model, "xgb.Booster") || is.data.table(model)))
    stop("model: Has to be either an xgb.Booster model generaged by the xgb.train function\n",
         "or a data.table result of the xgb.importance function")

@@ -72,14 +73,14 @@ xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.d
  which <- match.arg(which)
  
  dt_tree <- model
-  if (class(model) == "xgb.Booster")
+  if (inherits(model, "xgb.Booster"))
    dt_tree <- xgb.model.dt.tree(model = model)
  
  if (!all(c("Feature", "Tree", "ID", "Yes", "No", "Cover") %in% colnames(dt_tree)))
    stop("Model tree columns are not as expected!\n",
         "  Note that this function works only for tree models.")
  
-  dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight=Quality)], by = "ID")
+  dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight = Quality)], by = "ID")
  setkeyv(dt_depths, c("Tree", "ID"))
  # count by depth levels, and also calculate average cover at a depth
  dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth]
@@ -88,13 +89,13 @@ xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.d
  if (plot) {
    if (which == "2x1") {
      op <- par(no.readonly = TRUE)
-      par(mfrow=c(2,1),
+      par(mfrow = c(2,1),
          oma = c(3,1,3,1) + 0.1,
          mar = c(1,4,1,0) + 0.1)

-      dt_summaries[, barplot(N, border=NA, ylab = 'Number of leafs', ...)]
+      dt_summaries[, barplot(N, border = NA, ylab = 'Number of leafs', ...)]

-      dt_summaries[, barplot(Cover, border=NA, ylab = "Weighted cover", names.arg=Depth, ...)]
+      dt_summaries[, barplot(Cover, border = NA, ylab = "Weighted cover", names.arg = Depth, ...)]
    
      title("Model complexity", xlab = "Leaf depth", outer = TRUE, line = 1)
      par(op)
@@ -118,8 +119,8 @@ xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.d
 get.leaf.depth <- function(dt_tree) {
  # extract tree graph's edges
  dt_edges <- rbindlist(list(
-      dt_tree[Feature != "Leaf", .(ID, To=Yes, Tree)],
-      dt_tree[Feature != "Leaf", .(ID, To=No, Tree)]
+      dt_tree[Feature != "Leaf", .(ID, To = Yes, Tree)],
+      dt_tree[Feature != "Leaf", .(ID, To = No, Tree)]
    ))
  # whether "To" is a leaf:
  dt_edges <- 
@@ -144,6 +145,6 @@ get.leaf.depth <- function(dt_tree) {
 # They are mainly column names inferred by Data.table...
 globalVariables(
  c(
-    ".N", "N", "Depth", "Quality", "Cover", "Tree", "ID", "Yes", "No", "Feature"
+    ".N", "N", "Depth", "Quality", "Cover", "Tree", "ID", "Yes", "No", "Feature", "Leaf", "Weight"
  )
 )
--- a/R-package/R/xgb.plot.importance.R
+++ b/R-package/R/xgb.plot.importance.R
@@ -61,8 +61,8 @@
 xgb.plot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL, 
                                rel_to_first = FALSE, left_margin = 10, cex = NULL, plot = TRUE, ...) {
  check.deprecation(...)
-  if (!"data.table" %in% class(importance_matrix))  {
-    stop("importance_matrix: Should be a data.table.")
+  if (!is.data.table(importance_matrix))  {
+    stop("importance_matrix: must be a data.table")
  }

  imp_names <- colnames(importance_matrix)
@@ -107,12 +107,12 @@ xgb.plot.importance <- function(importance_matrix = NULL, top_n = NULL, measure
    
    # reverse the order of rows to have the highest ranked at the top
    importance_matrix[nrow(importance_matrix):1,
-                      barplot(Importance, horiz=TRUE, border=NA, cex.names=cex,
-                              names.arg=Feature, las=1, ...)]
+                      barplot(Importance, horiz = TRUE, border = NA, cex.names = cex,
+                              names.arg = Feature, las = 1, ...)]
    grid(NULL, NA)
    # redraw over the grid
    importance_matrix[nrow(importance_matrix):1,
-                      barplot(Importance, horiz=TRUE, border=NA, add=TRUE)]
+                      barplot(Importance, horiz = TRUE, border = NA, add = TRUE)]
    par(op)
  }
  
--- a/R-package/R/xgb.plot.multi.trees.R
+++ b/R-package/R/xgb.plot.multi.trees.R
@@ -1,59 +1,77 @@
 #' Project all trees on one tree and plot it
-#' 
+#'
 #' Visualization of the ensemble of trees as a single collective unit.
 #'
-#' @param model dump generated by the \code{xgb.train} function.
-#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
+#' @param model produced by the \code{xgb.train} function.
+#' @param feature_names names of each feature as a \code{character} vector.
 #' @param features_keep number of features to keep in each position of the multi trees.
 #' @param plot_width width in pixels of the graph to produce
 #' @param plot_height height in pixels of the graph to produce
+#' @param render a logical flag for whether the graph should be rendered (see Value).
 #' @param ... currently not used
 #' 
-#' @return Two graphs showing the distribution of the model deepness.
-#' 
 #' @details
-#' 
-#' This function tries to capture the complexity of gradient boosted tree ensemble 
-#' in a cohesive way. 
-#' 
-#' The goal is to improve the interpretability of the model generally seen as black box.
-#' The function is dedicated to boosting applied to decision trees only.
-#' 
-#' The purpose is to move from an ensemble of trees to a single tree only.
-#' 
-#' It takes advantage of the fact that the shape of a binary tree is only defined by 
-#' its deepness (therefore in a boosting model, all trees have the same shape). 
-#' 
+#'
+#' This function tries to capture the complexity of a gradient boosted tree model
+#' in a cohesive way by compressing an ensemble of trees into a single tree-graph representation.
+#' The goal is to improve the interpretability of a model generally seen as black box.
+#'
+#' Note: this function is applicable to tree booster-based models only.
+#'
+#' It takes advantage of the fact that the shape of a binary tree is only defined by
+#' its depth (therefore, in a boosting model, all trees have similar shape).
+#'
 #' Moreover, the trees tend to reuse the same features.
-#' 
-#' The function will project each tree on one, and keep for each position the 
-#' \code{features_keep} first features (based on Gain per feature measure).
-#' 
+#'
+#' The function projects each tree onto one, and keeps for each position the
+#' \code{features_keep} first features (based on the Gain per feature measure).
+#'
 #' This function is inspired by this blog post:
 #' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
 #'
+#' @return
+#'
+#' When \code{render = TRUE}:
+#' returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
+#' Similar to ggplot objects, it needs to be printed to see it when not running from command line.
+#'
+#' When \code{render = FALSE}:
+#' silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
+#' This could be useful if one wants to modify some of the graph attributes
+#' before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
+#'
 #' @examples
+#'
 #' data(agaricus.train, package='xgboost')
 #'
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
-#'                  eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
-#'                  min_child_weight = 50)
+#'                eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
+#'                min_child_weight = 50, verbose = 0)
 #'
-#' p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data), features_keep = 3)
+#' p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
 #' print(p)
 #'
+#' \dontrun{
+#' # Below is an example of how to save this plot to a file.
+#' # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
+#' library(DiagrammeR)
+#' gr <- xgb.plot.multi.trees(model=bst, features_keep = 3, render=FALSE)
+#' export_graph(gr, 'tree.pdf', width=1500, height=600)
+#' }
+#'
 #' @export
-xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, plot_width = NULL, plot_height = NULL, ...){
+xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, plot_width = NULL, plot_height = NULL,
+                                 render = TRUE, ...){
  check.deprecation(...)
  tree.matrix <- xgb.model.dt.tree(feature_names = feature_names, model = model)
-  
+
  # first number of the path represents the tree, then the following numbers are related to the path to follow
  # root init
  root.nodes <- tree.matrix[stri_detect_regex(ID, "\\d+-0"), ID]
-  tree.matrix[ID %in% root.nodes, abs.node.position:=root.nodes]
-  
+  tree.matrix[ID %in% root.nodes, abs.node.position := root.nodes]
+
  precedent.nodes <- root.nodes
-  
+
  while(tree.matrix[,sum(is.na(abs.node.position))] > 0) {
    yes.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(Yes)]
    no.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(No)]
@@ -65,44 +83,66 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
    precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos)
  }
  
-  tree.matrix[!is.na(Yes),Yes:= paste0(abs.node.position, "_0")]
-  tree.matrix[!is.na(No),No:= paste0(abs.node.position, "_1")]
-  
-  
+  tree.matrix[!is.na(Yes), Yes := paste0(abs.node.position, "_0")]
+  tree.matrix[!is.na(No), No := paste0(abs.node.position, "_1")]
  
  remove.tree <- . %>% stri_replace_first_regex(pattern = "^\\d+-", replacement = "")
  
-  tree.matrix[,`:=`(abs.node.position=remove.tree(abs.node.position), Yes=remove.tree(Yes), No=remove.tree(No))]
+  tree.matrix[,`:=`(abs.node.position = remove.tree(abs.node.position),
+                    Yes = remove.tree(Yes),
+                    No = remove.tree(No))]
  
-  nodes.dt <- tree.matrix[,.(Quality = sum(Quality)),by = .(abs.node.position, Feature)][,.(Text =paste0(Feature[1:min(length(Feature), features_keep)], " (", Quality[1:min(length(Quality), features_keep)], ")") %>% paste0(collapse = "\n")), by=abs.node.position]
-  edges.dt <- tree.matrix[Feature != "Leaf",.(abs.node.position, Yes)] %>% list(tree.matrix[Feature != "Leaf",.(abs.node.position, No)]) %>% rbindlist() %>% setnames(c("From", "To")) %>% .[,.N,.(From, To)] %>% .[,N:=NULL]
+  nodes.dt <- tree.matrix[
+        , .(Quality = sum(Quality))
+        , by = .(abs.node.position, Feature)
+      ][, .(Text = paste0(Feature[1:min(length(Feature), features_keep)],
+                          " (",
+                          format(Quality[1:min(length(Quality), features_keep)], digits=5),
+                          ")") %>%
+                   paste0(collapse = "\n"))
+        , by = abs.node.position]
  
-  nodes <- DiagrammeR::create_nodes(nodes = nodes.dt[,abs.node.position],
-                                    label = nodes.dt[,Text],
-                                    style = "filled",
-                                    color = "DimGray",
-                                    fillcolor= "Beige",
-                                    shape = "oval",
-                                    fontname = "Helvetica"
+  edges.dt <- tree.matrix[Feature != "Leaf", .(abs.node.position, Yes)] %>%
+    list(tree.matrix[Feature != "Leaf",.(abs.node.position, No)]) %>%
+    rbindlist() %>%
+    setnames(c("From", "To")) %>%
+    .[, .N, .(From, To)] %>%
+    .[, N:=NULL]
+  
+  nodes <- DiagrammeR::create_node_df(
+    n = nrow(nodes.dt),
+    label = nodes.dt[,Text]
  )
  
-  edges <- DiagrammeR::create_edges(from = edges.dt[,From],
-                                    to = edges.dt[,To],
-                                    color = "DimGray", 
-                                    arrowsize = "1.5", 
-                                    arrowhead = "vee",
-                                    fontname = "Helvetica",
-                                    rel = "leading_to")
+  edges <- DiagrammeR::create_edge_df(
+    from = match(edges.dt[,From], nodes.dt[,abs.node.position]),
+    to = match(edges.dt[,To], nodes.dt[,abs.node.position]),
+    rel = "leading_to")
  
-  graph <- DiagrammeR::create_graph(nodes_df = nodes,
-                                    edges_df = edges,
-                                    graph_attrs = "rankdir = LR")
-  
-  DiagrammeR::render_graph(graph, width = plot_width, height = plot_height)  
+  graph <- DiagrammeR::create_graph(
+      nodes_df = nodes,
+      edges_df = edges,
+      attr_theme = NULL
+      ) %>%
+    DiagrammeR::add_global_graph_attrs(
+      attr_type = "graph",
+      attr  = c("layout", "rankdir"),
+      value = c("dot", "LR")
+      ) %>%
+    DiagrammeR::add_global_graph_attrs(
+      attr_type = "node",
+      attr  = c("color", "fillcolor", "style", "shape", "fontname"),
+      value = c("DimGray", "beige", "filled", "rectangle", "Helvetica")
+      ) %>%
+    DiagrammeR::add_global_graph_attrs(
+      attr_type = "edge",
+      attr  = c("color", "arrowsize", "arrowhead", "fontname"),
+      value = c("DimGray", "1.5", "vee", "Helvetica"))
+
+  if (!render) return(invisible(graph))
+
+  DiagrammeR::render_graph(graph, width = plot_width, height = plot_height)
 }

-globalVariables(
-  c(
-    ".N", "N", "From", "To", "Text", "Feature", "no.nodes.abs.pos", "ID", "Yes", "No", "Tree", "yes.nodes.abs.pos", "abs.node.position"
-  )
-)
+globalVariables(c(".N", "N", "From", "To", "Text", "Feature", "no.nodes.abs.pos",
+                  "ID", "Yes", "No", "Tree", "yes.nodes.abs.pos", "abs.node.position"))
--- a/R-package/R/xgb.plot.shap.R
+++ b/R-package/R/xgb.plot.shap.R
@@ -0,0 +1,217 @@
+#' SHAP contribution dependency plots
+#'
+#' Visualizing the SHAP feature contribution to prediction dependencies on feature value.
+#' 
+#' @param data data as a \code{matrix} or \code{dgCMatrix}.
+#' @param shap_contrib a matrix of SHAP contributions that was computed earlier for the above 
+#'          \code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.
+#' @param features a vector of either column indices or of feature names to plot. When it is NULL,
+#'          feature importance is calculated, and \code{top_n} high ranked features are taken.
+#' @param top_n when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.
+#' @param model an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
+#'          or \code{features} is missing.
+#' @param trees passed to \code{\link{xgb.importance}} when \code{features = NULL}.
+#' @param target_class is only relevant for multiclass models. When it is set to a 0-based class index,
+#'          only SHAP contributions for that specific class are used.
+#'          If it is not set, SHAP importances are averaged over all classes.
+#' @param approxcontrib passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.
+#' @param subsample a random fraction of data points to use for plotting. When it is NULL,
+#'          it is set so that up to 100K data points are used.
+#' @param n_col a number of columns in a grid of plots.
+#' @param col color of the scatterplot markers.
+#' @param pch scatterplot marker.
+#' @param discrete_n_uniq a maximal number of unique values in a feature to consider it as discrete.
+#' @param discrete_jitter an \code{amount} parameter of jitter added to discrete features' positions.
+#' @param ylab a y-axis label in 1D plots.
+#' @param plot_NA whether the contributions of cases with missing values should also be plotted.
+#' @param col_NA a color of marker for missing value contributions.
+#' @param pch_NA a marker type for NA values.
+#' @param pos_NA a relative position of the x-location where NA values are shown:
+#'          \code{min(x) + (max(x) - min(x)) * pos_NA}.
+#' @param plot_loess whether to plot loess-smoothed curves. The smoothing is only done for features with
+#'          more than 5 distinct values.
+#' @param col_loess a color to use for the loess curves.
+#' @param span_loess the \code{span} parameter in \code{\link[stats]{loess}}'s call.
+#' @param which whether to do univariate or bivariate plotting. NOTE: only 1D is implemented so far.
+#' @param plot whether a plot should be drawn. If FALSE, only a list of matrices is returned.
+#' @param ... other parameters passed to \code{plot}.
+#' 
+#' @details
+#' 
+#' These scatterplots represent how SHAP feature contributions depend on feature values.
+#' The similarity to partial dependency plots is that they also give an idea for how feature values
+#' affect predictions. However, in partial dependency plots, we usually see marginal dependencies
+#' of model prediction on feature value, while SHAP contribution dependency plots display the estimated
+#' contributions of a feature to model prediction for each individual case.
+#' 
+#' When \code{plot_loess = TRUE} is set, feature values are rounded to 3 significant digits and
+#' weighted LOESS is computed and plotted, where weights are the numbers of data points
+#' at each rounded value.
+#' 
+#' Note: SHAP contributions are shown on the scale of model margin. E.g., for a logistic binomial objective,
+#' the margin is prediction before a sigmoidal transform into probability-like values.
+#' Also, since SHAP stands for "SHapley Additive exPlanation" (model prediction = sum of SHAP
+#' contributions for all features + bias), depending on the objective used, transforming SHAP
+#' contributions for a feature from the marginal to the prediction space is not necessarily
+#' a meaningful thing to do.
+#' 
+#' @return
+#' 
+#' In addition to producing plots (when \code{plot=TRUE}), it silently returns a list of two matrices:
+#' \itemize{
+#'  \item \code{data} the values of selected features;
+#'  \item \code{shap_contrib} the contributions of selected features.
+#' }
+#'
+#' @references
+#'
+#' Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
+#'
+#' Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
+#'
+#' @examples
+#' 
+#' data(agaricus.train, package='xgboost')
+#' data(agaricus.test, package='xgboost')
+#'
+#' bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50, 
+#'                eta = 0.1, max_depth = 3, subsample = .5,
+#'                method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0)
+#'
+#' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
+#' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
+#' xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
+#'
+#' # multiclass example - plots for each class separately:
+#' nclass <- 3
+#' nrounds <- 20
+#' x <- as.matrix(iris[, -5])
+#' set.seed(123)
+#' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
+#' mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
+#'                 max_depth = 2, eta = 0.3, subsample = .5, nthread = 2,
+#'                 objective = "multi:softprob", num_class = nclass, verbose = 0)
+#' trees0 <- seq(from=0, by=nclass, length.out=nrounds)
+#' col <- rgb(0, 0, 1, 0.5)
+#' xgb.plot.shap(x, model = mbst, trees = trees0, target_class = 0, top_n = 4,
+#'               n_col = 2, col = col, pch = 16, pch_NA = 17)
+#' xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4,
+#'               n_col = 2, col = col, pch = 16, pch_NA = 17)
+#' xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4,
+#'               n_col = 2, col = col, pch = 16, pch_NA = 17)
+#' 
+#' @rdname xgb.plot.shap
+#' @export
+xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, model = NULL,
+                          trees = NULL, target_class = NULL, approxcontrib = FALSE,
+                          subsample = NULL, n_col = 1, col = rgb(0, 0, 1, 0.2), pch = '.',
+                          discrete_n_uniq = 5, discrete_jitter = 0.01, ylab = "SHAP",
+                          plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6), pch_NA = '.', pos_NA = 1.07,
+                          plot_loess = TRUE, col_loess = 2, span_loess = 0.5,
+                          which = c("1d", "2d"), plot = TRUE, ...) {
+  # Validate the inputs before doing any work.
+  if (!is.matrix(data) && !inherits(data, "dgCMatrix"))
+    stop("data: must be either matrix or dgCMatrix")
+
+  if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster")))
+    stop("when shap_contrib is not provided, one must provide an xgb.Booster model")
+
+  if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster")))
+    stop("when features are not provided, one must provide an xgb.Booster model to rank the features")
+
+  if (!is.null(shap_contrib) &&
+      (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1))
+    stop("shap_contrib is not compatible with the provided data")
+  # Subsample rows; drop = FALSE keeps a 2-d object even when a single row or column remains.
+  nsample <- if (is.null(subsample)) min(100000, nrow(data)) else as.integer(subsample * nrow(data))
+  idx <- sample(1:nrow(data), nsample)
+  data <- data[idx, , drop = FALSE]
+
+  if (is.null(shap_contrib)) {
+    shap_contrib <- predict(model, data, predcontrib = TRUE, approxcontrib = approxcontrib)
+  } else {
+    shap_contrib <- shap_contrib[idx, , drop = FALSE]
+  }
+
+  which <- match.arg(which)
+  if (which == "2d")
+    stop("2D plots are not implemented yet")
+
+  if (is.null(features)) {
+    imp <- xgb.importance(model = model, trees = trees)
+    top_n <- as.integer(top_n[1])
+    if (is.na(top_n) || top_n < 1 || top_n > 100)  # reject anything outside the documented range
+      stop("top_n: must be an integer within [1, 100]")
+    features <- imp$Feature[1:min(top_n, NROW(imp))]
+  }
+  # Translate feature names into column indices of `data`.
+  if (is.character(features)) {
+    if (is.null(colnames(data)))
+      stop("Either provide `data` with column names or provide `features` as column indices")
+    features <- match(features, colnames(data))
+  }
+  if (anyNA(features)) stop("features: some of the requested features are not present in data")
+  if (n_col > length(features)) n_col <- length(features)
+
+  if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
+    shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]]
+                    else Reduce("+", lapply(shap_contrib, abs))
+  }
+
+  shap_contrib <- shap_contrib[, features, drop = FALSE]
+  data <- data[, features, drop = FALSE]
+  cols <- colnames(data)
+  if (is.null(cols)) cols <- colnames(shap_contrib)
+  if (is.null(cols)) cols <- paste0('X', 1:ncol(data))
+  colnames(data) <- cols
+  colnames(shap_contrib) <- cols
+  # One scatterplot panel per selected feature, laid out in an n_col-wide grid.
+  if (plot && which == "1d") {
+    op <- par(mfrow = c(ceiling(length(features) / n_col), n_col),
+              oma = c(0,0,0,0) + 0.2,
+              mar = c(3.5,3.5,0,0) + 0.1,
+              mgp = c(1.7, 0.6, 0))
+    for (f in cols) {
+      ord <- order(data[, f])
+      x <- data[, f][ord]
+      y <- shap_contrib[, f][ord]
+      x_lim <- range(x, na.rm = TRUE)
+      y_lim <- range(y, na.rm = TRUE)
+      do_na <- plot_NA && any(is.na(x))
+      if (do_na) {
+        x_range <- diff(x_lim)
+        loc_na <- min(x, na.rm = TRUE) + x_range * pos_NA
+        x_lim <- range(c(x_lim, loc_na))
+      }
+      x_uniq <- unique(x)
+      x2plot <- x
+      # add small jitter for discrete features (at most discrete_n_uniq distinct values)
+      if (length(x_uniq) <= discrete_n_uniq)
+        x2plot <- jitter(x, amount = discrete_jitter * min(diff(x_uniq), na.rm = TRUE))
+      plot(x2plot, y, pch = pch, xlab = f, col = col, xlim = x_lim, ylim = y_lim, ylab = ylab, ...)
+      grid()
+      if (plot_loess) {
+        # compress x to 3 significant digits, and mean-aggregate y
+        zz <- data.table(x = signif(x, 3), y)[, .(.N, y=mean(y)), x]
+        if (nrow(zz) <= 5) {
+          lines(zz$x, zz$y, col = col_loess)
+        } else {
+          lo <- stats::loess(y ~ x, data = zz, weights = zz$N, span = span_loess)
+          zz$y_lo <- predict(lo, zz, type = "link")
+          lines(zz$x, zz$y_lo, col = col_loess)
+        }
+      }
+      if (do_na) {
+        i_na <- which(is.na(x))
+        x_na <- rep(loc_na, length(i_na))
+        x_na <- jitter(x_na, amount = x_range * 0.01)
+        points(x_na, y[i_na], pch = pch_NA, col = col_NA)
+      }
+    }
+    par(op)
+  }
+  if (plot && which == "2d") {
+    # TODO
+  }
+  invisible(list(data = data, shap_contrib = shap_contrib))
+}
--- a/R-package/R/xgb.plot.tree.R
+++ b/R-package/R/xgb.plot.tree.R
@@ -2,75 +2,132 @@
 #' 
 #' Read a tree model text dump and plot the model. 
 #' 
-#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
-#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
-#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
+#' @param feature_names names of each feature as a \code{character} vector.
+#' @param model produced by the \code{xgb.train} function.
+#' @param trees an integer vector of tree indices that should be visualized.
+#'          If set to \code{NULL}, all trees of the model are included.
+#'          IMPORTANT: the tree index in xgboost model is zero-based
+#'          (e.g., use \code{trees = 0:2} for the first 3 trees in a model).
 #' @param plot_width  the width of the diagram in pixels.
 #' @param plot_height	the height of the diagram in pixels.
+#' @param render a logical flag for whether the graph should be rendered (see Value).
+#' @param show_node_id a logical flag for whether to show node id's in the graph.
 #' @param ... currently not used.
 #'
-#' @return A \code{DiagrammeR} of the model.
-#'
 #' @details 
 #' 
 #' The content of each node is organised that way:
 #' 
 #' \itemize{
-#'  \item \code{feature} value;
-#'  \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be;
-#'  \item \code{gain}: metric the importance of the node in the model.
+#'  \item Feature name.
+#'  \item \code{Cover}: The sum of second order gradient of training data classified to the leaf.
+#'        If it is square loss, this simply corresponds to the number of instances seen by a split
+#'        or collected by a leaf during training.
+#'        The deeper in the tree a node is, the lower this metric will be.
+#'  \item \code{Gain} (for split nodes): the information gain metric of a split
+#'        (corresponds to the importance of the node in the model).
+#'  \item \code{Value} (for leafs): the margin value that the leaf may contribute to prediction.
 #' } 
+#' The tree root nodes also indicate the Tree index (0-based).
 #' 
-#' The function uses \href{http://www.graphviz.org/}{GraphViz} library for that purpose.
+#' The "Yes" branches are marked by the "< split_value" label.
+#' The branches that are also used for missing values are marked as bold
+#' (as in "carrying extra capacity").
+#' 
+#' This function uses \href{http://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
 #'  
+#' @return
+#' 
+#' When \code{render = TRUE}:
+#' returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
+#' Similar to ggplot objects, it needs to be printed to see it when not running from command line.
+#' 
+#' When \code{render = FALSE}:
+#' silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
+#' This could be useful if one wants to modify some of the graph attributes
+#' before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' 
-#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, 
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
 #'                eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+#' # plot all the trees
+#' xgb.plot.tree(model = bst)
+#' # plot only the first tree and display the node ID:
+#' xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE)
 #' 
-#' xgb.plot.tree(feature_names = colnames(agaricus.train$data), model = bst)
+#' \dontrun{
+#' # Below is an example of how to save this plot to a file. 
+#' # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
+#' library(DiagrammeR)
+#' gr <- xgb.plot.tree(model=bst, trees=0:1, render=FALSE)
+#' export_graph(gr, 'tree.pdf', width=1500, height=1900)
+#' export_graph(gr, 'tree.png', width=1500, height=1900)
+#' }
 #' 
 #' @export
-xgb.plot.tree <- function(feature_names = NULL, model = NULL, n_first_tree = NULL, plot_width = NULL, plot_height = NULL, ...){
+xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot_width = NULL, plot_height = NULL,
+                          render = TRUE, show_node_id = FALSE, ...){
  check.deprecation(...)
-  if (class(model) != "xgb.Booster") {
-    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
+  if (!inherits(model, "xgb.Booster")) {
+    stop("model: Has to be an object of class xgb.Booster")
  }

  if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
    stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE)
  }
  
-  allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree)
-  
-  allTrees[, label:= paste0(Feature, "\nCover: ", Cover, "\nGain: ", Quality)]
-  allTrees[, shape:= "rectangle"][Feature == "Leaf", shape:= "oval"]
-  allTrees[, filledcolor:= "Beige"][Feature == "Leaf", filledcolor:= "Khaki"]
-  
-  # rev is used to put the first tree on top.
-  nodes <- DiagrammeR::create_nodes(nodes = allTrees[,ID] %>% rev,
-                 label = allTrees[,label] %>% rev,
-                 style = "filled",
-                 color = "DimGray",
-                 fillcolor= allTrees[,filledcolor] %>% rev,
-                 shape = allTrees[,shape] %>% rev,
-                 data = allTrees[,Feature] %>% rev,
-                 fontname = "Helvetica"
-                 )
-  
-  edges <- DiagrammeR::create_edges(from = allTrees[Feature != "Leaf", c(ID)] %>% rep(2),
-                        to = allTrees[Feature != "Leaf", c(Yes, No)],
-                        label = allTrees[Feature != "Leaf", paste("<",Split)] %>% c(rep("",nrow(allTrees[Feature != "Leaf"]))),
-                        color = "DimGray", 
-                        arrowsize = "1.5", 
-                        arrowhead = "vee",
-                        fontname = "Helvetica",
-                        rel = "leading_to")
+  dt <- xgb.model.dt.tree(feature_names = feature_names, model = model, trees = trees)

-  graph <- DiagrammeR::create_graph(nodes_df = nodes,
-                        edges_df = edges,
-                        graph_attrs = "rankdir = LR")
+  dt[, label:= paste0(Feature, "\nCover: ", Cover, ifelse(Feature == "Leaf", "\nValue: ", "\nGain: "), Quality)]
+  if (show_node_id)
+    dt[, label := paste0(ID, ": ", label)]
+  dt[Node == 0, label := paste0("Tree ", Tree, "\n", label)]
+  dt[, shape:= "rectangle"][Feature == "Leaf", shape:= "oval"]
+  dt[, filledcolor:= "Beige"][Feature == "Leaf", filledcolor:= "Khaki"]
+  # in order to draw the first tree on top:
+  dt <- dt[order(-Tree)]
+  
+  nodes <- DiagrammeR::create_node_df(
+    n         = nrow(dt),
+    ID        = dt$ID,
+    label     = dt$label,
+    fillcolor = dt$filledcolor,
+    shape     = dt$shape,
+    data      = dt$Feature,
+    fontcolor = "black")
+  
+  edges <- DiagrammeR::create_edge_df(
+    from  = match(dt[Feature != "Leaf", c(ID)] %>% rep(2), dt$ID),
+    to    = match(dt[Feature != "Leaf", c(Yes, No)], dt$ID),
+    label = dt[Feature != "Leaf", paste("<", Split)] %>%
+            c(rep("", nrow(dt[Feature != "Leaf"]))),
+    style = dt[Feature != "Leaf", ifelse(Missing == Yes, "bold", "solid")] %>%
+            c(dt[Feature != "Leaf", ifelse(Missing == No, "bold", "solid")]),
+    rel   = "leading_to")
+
+  graph <- DiagrammeR::create_graph(
+      nodes_df = nodes,
+      edges_df = edges,
+      attr_theme = NULL
+      ) %>%
+    DiagrammeR::add_global_graph_attrs(
+      attr_type = "graph",
+      attr  = c("layout", "rankdir"),
+      value = c("dot", "LR")
+      ) %>%
+    DiagrammeR::add_global_graph_attrs(
+      attr_type = "node",
+      attr  = c("color", "style", "fontname"),
+      value = c("DimGray", "filled", "Helvetica")
+      ) %>%
+    DiagrammeR::add_global_graph_attrs(
+      attr_type = "edge",
+      attr  = c("color", "arrowsize", "arrowhead", "fontname"),
+      value = c("DimGray", "1.5", "vee", "Helvetica"))
+  
+  if (!render) return(invisible(graph))
  
  DiagrammeR::render_graph(graph, width = plot_width, height = plot_height)
 }
@@ -78,4 +135,4 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, n_first_tree = NUL
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", ".", "shape", "filledcolor", "label"))
+globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", "Missing", ".", "shape", "filledcolor", "label"))
--- a/R-package/R/xgb.save.R
+++ b/R-package/R/xgb.save.R
@@ -1,9 +1,22 @@
 #' Save xgboost model to binary file
 #' 
-#' Save xgboost model from xgboost or xgb.train
+#' Save xgboost model to a file in binary format.
 #' 
-#' @param model the model object.
-#' @param fname the name of the file to write.
+#' @param model model object of \code{xgb.Booster} class.
+#' @param fname name of the file to write.
+#' 
+#' @details 
+#' This method allows saving a model in an xgboost-internal binary format which is universal 
+#' among the various xgboost interfaces. In R, the saved model file could be read-in later
+#' using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter 
+#' of \code{\link{xgb.train}}.
+#' 
+#' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base:readRDS]{saveRDS}} 
+#' or \code{\link[base]{save}}). However, it would then only be compatible with R, and 
+#' corresponding R-methods would need to be used to load it.
+#' 
+#' @seealso 
+#' \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}. 
 #' 
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -19,9 +32,11 @@
 xgb.save <- function(model, fname) {
  if (typeof(fname) != "character")
    stop("fname must be character")
-  if (class(model) != "xgb.Booster")
-    stop("the input must be xgb.Booster. Use xgb.DMatrix.save to save xgb.DMatrix object.")
-  
-  .Call("XGBoosterSaveModel_R", model$handle, fname, PACKAGE = "xgboost")
+  if (!inherits(model, "xgb.Booster")) {
+    stop("model must be xgb.Booster.",
+         if (inherits(model, "xgb.DMatrix")) " Use xgb.DMatrix.save to save an xgb.DMatrix object." else "")
+  }
+  model <- xgb.Booster.complete(model, saveraw = FALSE)
+  .Call(XGBoosterSaveModel_R, model$handle, fname[1])
  return(TRUE)
 }
--- a/R-package/R/xgb.save.raw.R
+++ b/R-package/R/xgb.save.raw.R
@@ -19,5 +19,5 @@
 #' @export
 xgb.save.raw <- function(model) {
  model <- xgb.get.handle(model)
-  .Call("XGBoosterModelToRaw_R", model, PACKAGE = "xgboost")
+  .Call(XGBoosterModelToRaw_R, model)
 }
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -1,6 +1,7 @@
 #' eXtreme Gradient Boosting Training
 #' 
-#' \code{xgb.train} is an advanced interface for training an xgboost model. The \code{xgboost} function provides a simpler interface.
+#' \code{xgb.train} is an advanced interface for training an xgboost model.
+#' The \code{xgboost} function is a simpler wrapper for \code{xgb.train}.
 #'
 #' @param params the list of parameters. 
 #'        The complete list of parameters is available at \url{http://xgboost.readthedocs.io/en/latest/parameter.html}.
@@ -9,8 +10,7 @@
 #' 1. General Parameters
 #' 
 #' \itemize{
-#'   \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}
-#'   \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0
+#'   \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}.
 #' }
 #'  
 #' 2. Booster Parameters
@@ -25,6 +25,7 @@
 #'   \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1 
 #'   \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
 #'   \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample  < 1}  and \code{round = 1}) accordingly. Default: 1
+#'   \item \code{monotone_constraints} A numerical vector consisting of \code{1}, \code{0} and \code{-1}, whose length equals the number of features in the training data. \code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint.
 #' }
 #' 
 #' 2.2. Parameter for Linear Booster
@@ -53,24 +54,26 @@
 #'   \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
 #' }
 #' 
-#' @param data input dataset. \code{xgb.train} takes only an \code{xgb.DMatrix} as the input.
-#'        \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or local data file.
-#' @param nrounds the max number of iterations
-#' @param watchlist what information should be printed when \code{verbose=1} or
-#'        \code{verbose=2}. Watchlist is used to specify validation set monitoring
-#'        during training. For example user can specify
-#'        watchlist=list(validation1=mat1, validation2=mat2) to watch
-#'        the performance of each round's model on mat1 and mat2
-#'
+#' @param data training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
+#'        \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.
+#' @param nrounds max number of boosting iterations.
+#' @param watchlist named list of xgb.DMatrix datasets to use for evaluating model performance.
+#'        Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
+#'        of these datasets during each boosting iteration, and stored in the end as a field named 
+#'        \code{evaluation_log} in the resulting object. When either \code{verbose>=1} or 
+#'        \code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously
+#'        printed out during the training. 
+#'        E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows tracking
+#'        the performance of each round's model on mat1 and mat2.
 #' @param obj customized objective function. Returns gradient and second order 
 #'        gradient with given prediction and dtrain.
 #' @param feval custimized evaluation function. Returns 
 #'        \code{list(metric='metric-name', value='metric-value')} with given 
 #'        prediction and dtrain.
-#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print 
-#'        information of performance. If 2, xgboost will print some additional information.
-#'        Setting \code{verbose > 0} automatically engages the \code{\link{cb.evaluation.log}} and 
-#'        \code{\link{cb.print.evaluation}} callback functions.
+#' @param verbose If 0, xgboost will stay silent. If 1, it will print information about performance.
+#'        If 2, some additional information will be printed out.
+#'        Note that setting \code{verbose > 0} automatically engages the 
+#'        \code{cb.print.evaluation(period=1)} callback function.
 #' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}.
 #'        Default is 1 which means all messages are printed. This parameter is passed to the 
 #'        \code{\link{cb.print.evaluation}} callback.
@@ -85,7 +88,7 @@
 #' @param save_period when it is non-NULL, model is saved to disk after every \code{save_period} rounds,
 #'        0 means save at the end. The saving is handled by the \code{\link{cb.save.model}} callback.
 #' @param save_name the name or path for periodically saved model file.
-#' @param xgb_model a previously built model to continue the trainig from.
+#' @param xgb_model a previously built model to continue the training from.
 #'        Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a 
 #'        file with a previously saved model.
 #' @param callbacks a list of callback functions to perform various task during boosting.
@@ -105,7 +108,7 @@
 #' 
 #' The \code{xgb.train} interface supports advanced features such as \code{watchlist}, 
 #' customized objective and evaluation metric functions, therefore it is more flexible 
-#' than the \code{\link{xgboost}} interface.
+#' than the \code{xgboost} interface.
 #'
 #' Parallelization is automatically enabled if \code{OpenMP} is present. 
 #' Number of threads can also be manually specified via \code{nthread} parameter.
@@ -118,12 +121,13 @@
 #'   \itemize{
 #'      \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
 #'      \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
-#'      \item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss}
+#'      \item \code{mlogloss} multiclass logloss. \url{http://wiki.fast.ai/index.php/Log_Loss}
 #'      \item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
 #'            By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
 #'            Different threshold (e.g., 0.) could be specified as "error@0."
 #'      \item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
 #'      \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
+#'      \item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
 #'      \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
 #'   }
 #' 
@@ -131,7 +135,7 @@
 #' \itemize{
 #'   \item \code{cb.print.evaluation} is turned on when \code{verbose > 0};
 #'         and the \code{print_every_n} parameter is passed to it.
-#'   \item \code{cb.evaluation.log} is on when \code{verbose > 0} and \code{watchlist} is present.
+#'   \item \code{cb.evaluation.log} is on when \code{watchlist} is present.
 #'   \item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
 #'   \item \code{cb.save.model}: when \code{save_period > 0} is set.
 #' }
@@ -157,6 +161,9 @@
 #'         (only available with early stopping).
 #'   \item \code{best_score} the best evaluation metric value during early stopping.
 #'         (only available with early stopping).
+#'   \item \code{feature_names} names of the training dataset features
+#'         (only when column names were defined in training data).
+#'   \item \code{nfeatures} number of features in training data.
 #' }
 #' 
 #' @seealso
@@ -164,18 +171,24 @@
 #' \code{\link{predict.xgb.Booster}},
 #' \code{\link{xgb.cv}}
 #' 
+#' @references
+#'
+#' Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System",
+#' 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754}
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #' 
 #' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 #' dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
-#' watchlist <- list(eval = dtest, train = dtrain)
+#' watchlist <- list(train = dtrain, eval = dtest)
 #' 
 #' ## A simple xgb.train example:
-#' param <- list(max_depth = 2, eta = 1, silent = 1, 
+#' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2, 
 #'               objective = "binary:logistic", eval_metric = "auc")
-#' bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist)
+#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
+#' 
 #' 
 #' ## An xgb.train example where custom objective and evaluation metric are used:
 #' logregobj <- function(preds, dtrain) {
@@ -190,18 +203,33 @@
 #'   err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
 #'   return(list(metric = "error", value = err))
 #' }
-#' bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist)
+#' 
+#' # These functions could be used by passing them either:
+#' #  as 'objective' and 'eval_metric' parameters in the params list:
+#' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2, 
+#'               objective = logregobj, eval_metric = evalerror)
+#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
+#' 
+#' #  or through the ... arguments:
+#' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2)
+#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+#'                  objective = logregobj, eval_metric = evalerror)
+#' 
+#' #  or as dedicated 'obj' and 'feval' parameters of xgb.train:
+#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+#'                  obj = logregobj, feval = evalerror)
+#' 
 #' 
 #' ## An xgb.train example of using variable learning rates at each iteration:
+#' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
+#'               objective = "binary:logistic", eval_metric = "auc")
 #' my_etas <- list(eta = c(0.5, 0.1))
-#' bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist,
+#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
 #'                  callbacks = list(cb.reset.parameters(my_etas)))
 #' 
-#' ## Explicit use of the cb.evaluation.log callback allows to run 
-#' ## xgb.train silently but still store the evaluation results:
-#' bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist,
-#'                  verbose = 0, callbacks = list(cb.evaluation.log()))
-#' print(bst$evaluation_log)
+#' ## Early stopping:
+#' bst <- xgb.train(param, dtrain, nrounds = 25, watchlist,
+#'                  early_stopping_rounds = 3)
 #' 
 #' ## An 'xgboost' interface example:
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, 
@@ -212,7 +240,7 @@
 #' @rdname xgb.train
 #' @export
 xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
-                      obj = NULL, feval = NULL, verbose = 1, print_every_n=1L,
+                      obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L,
                      early_stopping_rounds = NULL, maximize = NULL,
                      save_period = NULL, save_name = "xgboost.model", 
                      xgb_model = NULL, callbacks = list(), ...) {
@@ -226,11 +254,11 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
  
  # data & watchlist checks
  dtrain <- data
-  if (class(dtrain) != "xgb.DMatrix") 
+  if (!inherits(dtrain, "xgb.DMatrix")) 
    stop("second argument dtrain must be xgb.DMatrix")
  if (length(watchlist) > 0) {
    if (typeof(watchlist) != "list" ||
-        !all(sapply(watchlist, class) == "xgb.DMatrix"))
+        !all(vapply(watchlist, inherits, logical(1), what = 'xgb.DMatrix')))
      stop("watchlist must be a list of xgb.DMatrix elements")
    evnames <- names(watchlist)
    if (is.null(evnames) || any(evnames == ""))
@@ -240,13 +268,13 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
  # evaluation printing callback
  params <- c(params, list(silent = ifelse(verbose > 1, 0, 1)))
  print_every_n <- max( as.integer(print_every_n), 1L)
-  if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) {
+  if (!has.callbacks(callbacks, 'cb.print.evaluation') &&
+      verbose) {
    callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n))
  }
-  # evaluation log callback:  it is automatically enabled only when verbose > 0
+  # evaluation log callback:  it is automatically enabled when watchlist is provided
  evaluation_log <- list()
-  if (verbose > 0 &&
-      !has.callbacks(callbacks, 'cb.evaluation.log') &&
+  if (!has.callbacks(callbacks, 'cb.evaluation.log') &&
      length(watchlist) > 0) {
    callbacks <- add.cb(callbacks, cb.evaluation.log())
  }
@@ -260,14 +288,16 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
  if (!is.null(early_stopping_rounds) &&
      !has.callbacks(callbacks, 'cb.early.stop')) {
    callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, 
-                                                 maximize=maximize, verbose=verbose))
+                                                 maximize = maximize, verbose = verbose))
  }
  # Sort the callbacks into categories
  cb <- categorize.callbacks(callbacks)

-  
+  # The tree updating process would need slightly different handling
+  is_update <- NVL(params[['process_type']], '.') == 'update'
+
  # Construct a booster (either a new one or load from xgb_model)
-  handle <- xgb.Booster(params, append(watchlist, dtrain), xgb_model)
+  handle <- xgb.Booster.handle(params, append(watchlist, dtrain), xgb_model)
  bst <- xgb.handleToBooster(handle)

  # extract parameters that can affect the relationship b/w #trees and #iterations
@@ -275,17 +305,20 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
  num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)

  # When the 'xgb_model' was set, find out how many boosting iterations it has
-  niter_skip <- 0
+  niter_init <- 0
  if (!is.null(xgb_model)) {
-    niter_skip <- as.numeric(xgb.attr(bst, 'niter')) + 1
-    if (length(niter_skip) == 0) {
-      niter_skip <- xgb.ntree(bst) %/% (num_parallel_tree * num_class)
+    niter_init <- as.numeric(xgb.attr(bst, 'niter')) + 1
+    if (length(niter_init) == 0) {
+      niter_init <- xgb.ntree(bst) %/% (num_parallel_tree * num_class)
    }
  }
+  if(is_update && nrounds > niter_init)
+    stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)")

  # TODO: distributed code
  rank <- 0
  
+  niter_skip <- ifelse(is_update, 0, niter_init)
  begin_iteration <- niter_skip + 1
  end_iteration <- niter_skip + nrounds
  
@@ -306,9 +339,9 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),

    if (stop_condition) break
  }
-  for (f in cb$finalize) f(finalize=TRUE)
+  for (f in cb$finalize) f(finalize = TRUE)
  
-  bst <- xgb.Booster.check(bst, saveraw = TRUE)
+  bst <- xgb.Booster.complete(bst, saveraw = TRUE)
  
  # store the total number of boosting iterations
  bst$niter = end_iteration
@@ -317,10 +350,11 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
  if (length(evaluation_log) > 0 &&
      nrow(evaluation_log) > 0) {
    # include the previous compatible history when available
-    if (class(xgb_model) == 'xgb.Booster' &&
+    if (inherits(xgb_model, 'xgb.Booster') &&
+        !is_update &&
        !is.null(xgb_model$evaluation_log) &&
-        all.equal(colnames(evaluation_log),
-                  colnames(xgb_model$evaluation_log))) {
+        isTRUE(all.equal(colnames(evaluation_log),
+                         colnames(xgb_model$evaluation_log)))) {
      evaluation_log <- rbindlist(list(xgb_model$evaluation_log, evaluation_log))
    }
    bst$evaluation_log <- evaluation_log
@@ -329,6 +363,9 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
  bst$call <- match.call()
  bst$params <- params
  bst$callbacks <- callbacks
-  
+  if (!is.null(colnames(dtrain)))
+    bst$feature_names <- colnames(dtrain)
+  bst$nfeatures <- ncol(dtrain)
+
  return(bst)
 }
--- a/R-package/R/xgboost.R
+++ b/R-package/R/xgboost.R
@@ -1,4 +1,4 @@
-# Simple interface for training an xgboost model.
+# Simple interface for training an xgboost model that wraps \code{xgb.train}.
 # Its documentation is combined with xgb.train.
 #
 #' @rdname xgb.train
@@ -7,16 +7,14 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
                    params = list(), nrounds,
                    verbose = 1, print_every_n = 1L, 
                    early_stopping_rounds = NULL, maximize = NULL, 
-                    save_period = 0, save_name = "xgboost.model",
+                    save_period = NULL, save_name = "xgboost.model",
                    xgb_model = NULL, callbacks = list(), ...) {

  dtrain <- xgb.get.DMatrix(data, label, missing, weight)

-  watchlist <- list()
-  if (verbose > 0)
-    watchlist$train = dtrain
+  watchlist <- list(train = dtrain)

-  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print_every_n=print_every_n,
+  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print_every_n = print_every_n,
                   early_stopping_rounds = early_stopping_rounds, maximize = maximize,
                   save_period = save_period, save_name = save_name,
                   xgb_model = xgb_model, callbacks = callbacks, ...)
@@ -79,11 +77,13 @@ NULL

 # Various imports
 #' @importClassesFrom Matrix dgCMatrix dgeMatrix
-#' @importFrom Matrix cBind
 #' @importFrom Matrix colSums
 #' @importFrom Matrix sparse.model.matrix
 #' @importFrom Matrix sparseVector
+#' @importFrom Matrix sparseMatrix
+#' @importFrom Matrix t
 #' @importFrom data.table data.table
+#' @importFrom data.table is.data.table
 #' @importFrom data.table as.data.table
 #' @importFrom data.table :=
 #' @importFrom data.table rbindlist
@@ -98,7 +98,16 @@ NULL
 #' @importFrom stringi stri_split_regex
 #' @importFrom utils object.size str tail
 #' @importFrom stats predict
+#' @importFrom stats median
+#' @importFrom utils head
+#' @importFrom graphics barplot
+#' @importFrom graphics lines
+#' @importFrom graphics points
+#' @importFrom graphics grid
+#' @importFrom graphics par
+#' @importFrom graphics title
+#' @importFrom grDevices rgb
 #' 
 #' @import methods
-#' @useDynLib xgboost
+#' @useDynLib xgboost, .registration = TRUE
 NULL
--- a/R-package/README.md
+++ b/R-package/README.md
@@ -1,8 +1,8 @@
 XGBoost R Package for Scalable GBM
 ==================================

-[![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost)
-[![CRAN Downloads](http://cranlogs.r-pkg.org/badges/xgboost)](http://cran.rstudio.com/web/packages/xgboost/index.html)
+[![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](https://cran.r-project.org/web/packages/xgboost)
+[![CRAN Downloads](http://cranlogs.r-pkg.org/badges/xgboost)](https://cran.rstudio.com/web/packages/xgboost/index.html)
 [![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](http://xgboost.readthedocs.org/en/latest/R-package/index.html)

 Resources
@@ -19,51 +19,6 @@ We are [on CRAN](https://cran.r-project.org/web/packages/xgboost/index.html) now
 install.packages('xgboost')
 ```

-You can also install from our weekly updated drat repo:
-```r
-install.packages("drat", repos="https://cran.rstudio.com")
-drat:::addRepo("dmlc")
-install.packages("xgboost", repos="http://dmlc.ml/drat/", type="source")
-```
-
-***Important*** Due to the usage of submodule, `install_github` is no longer support to install the
-latest version of R package. 
-For up-to-date version, please install from github.
-
-Windows users will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first. They also need to download [MinGW-W64](http://iweb.dl.sourceforge.net/project/mingw-w64/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/installer/mingw-w64-install.exe) using x86_64 architecture during installation.
-
-Run the following command to add MinGW to PATH in Windows if not already added.
-
-```cmd
-PATH %PATH%;C:\Program Files\mingw-w64\x86_64-5.3.0-posix-seh-rt_v4-rev0\mingw64\bin
-```
-
-To compile xgboost at the root of your storage, run the following bash script.
-
-```bash
-git clone --recursive https://github.com/dmlc/xgboost
-cd xgboost
-git submodule init
-git submodule update
-alias make='mingw32-make'
-cd dmlc-core
-make -j4
-cd ../rabit
-make lib/librabit_empty.a -j4
-cd ..
-cp make/mingw64.mk config.mk
-make -j4
-```
-
-Run the following R script to install xgboost package from the root directory.
-
-```r
-install.package('devtools') # if not installed
-setwd('C:/xgboost/')
-library(devtools)
-install('R-package')
-```
-
 For more detailed installation instructions, please see [here](http://xgboost.readthedocs.org/en/latest/build.html#r-package-installation).

 Examples
@@ -71,3 +26,8 @@ Examples

 * Please visit [walk through example](demo).
 * See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.R) on this dataset and the one related to [Otto challenge](../demo/kaggle-otto), including a [RMarkdown documentation](../demo/kaggle-otto/understandingXGBoostModel.Rmd).
+
+Development
+-----------
+
+* See the [R Package section](https://xgboost.readthedocs.io/en/latest/how_to/contribute.html#r-package) of the contributors guide.
--- a/R-package/cleanup
+++ b/R-package/cleanup
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+rm -f src/Makevars
--- a/R-package/configure
+++ b/R-package/configure
--- a/R-package/configure.ac
+++ b/R-package/configure.ac
@@ -0,0 +1,31 @@
+### configure.ac					-*- Autoconf -*-
+
+AC_PREREQ(2.62)
+
+AC_INIT([xgboost],[0.6-3],[],[xgboost],[])
+
+OPENMP_CXXFLAGS=""
+
+if test `uname -s` = "Linux"
+then
+  OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CFLAGS)"
+fi
+
+if test `uname -s` = "Darwin"
+then
+  OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CFLAGS)"
+  ac_pkg_openmp=no
+  AC_MSG_CHECKING([whether OpenMP will work in a package])
+  AC_LANG_CONFTEST(
+  [AC_LANG_PROGRAM([[#include <omp.h>]], [[ return omp_get_num_threads (); ]])])
+  PKG_CFLAGS="${OPENMP_CFLAGS}" PKG_LIBS="${OPENMP_CFLAGS}" "$RBIN" CMD SHLIB conftest.c 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD && "$RBIN" --vanilla -q -e "dyn.load(paste('conftest',.Platform\$dynlib.ext,sep=''))" 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD && ac_pkg_openmp=yes
+  AC_MSG_RESULT([${ac_pkg_openmp}])
+  if test "${ac_pkg_openmp}" = no; then
+    OPENMP_CXXFLAGS=''
+  fi
+fi
+
+AC_SUBST(OPENMP_CXXFLAGS)
+AC_CONFIG_FILES([src/Makevars])
+AC_OUTPUT
+
--- a/R-package/configure.win
+++ b/R-package/configure.win
--- a/R-package/demo/00Index
+++ b/R-package/demo/00Index
@@ -9,3 +9,6 @@ create_sparse_matrix            Create Sparse Matrix
 predict_leaf_indices            Predicting the corresponding leaves
 early_stopping                  Early Stop in training
 poisson_regression              Poisson Regression on count data
+tweedie_regression              Tweedie Regression
+gpu_accelerated                 GPU-accelerated tree building algorithms
+
--- a/R-package/demo/README.md
+++ b/R-package/demo/README.md
@@ -8,6 +8,7 @@ XGBoost R Feature Walkthrough
 * [Generalized Linear Model](generalized_linear_model.R)
 * [Cross validation](cross_validation.R)
 * [Create a sparse matrix from a dense one](create_sparse_matrix.R)
+* [Use GPU-accelerated tree building algorithms](gpu_accelerated.R)

 Benchmarks
 ====
--- a/R-package/demo/caret_wrapper.R
+++ b/R-package/demo/caret_wrapper.R
@@ -24,7 +24,7 @@ df[,ID:=NULL]
 #-------------Basic Training using XGBoost in caret Library-----------------
 # Set up control parameters for caret::train
 # Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters.
-fitControl <- trainControl(method = "cv", number = 10, repeats = 2, search = "random")
+fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2, search = "random")
 # train a xgbTree model using caret::train
 model <- train(factor(Improved)~., data = df, method = "xgbTree", trControl = fitControl)

--- a/R-package/demo/gpu_accelerated.R
+++ b/R-package/demo/gpu_accelerated.R
@@ -0,0 +1,45 @@
+# An example of using GPU-accelerated tree building algorithms
+# 
+# NOTE: it can only run if you have a CUDA-enabled GPU and the package was 
+#       specially compiled with GPU support.
+#
+# For the current functionality, see 
+# https://xgboost.readthedocs.io/en/latest/gpu/index.html
+#
+
+library('xgboost')
+
+# Simulate N x p random matrix with some binomial response dependent on pp columns
+set.seed(111)
+N <- 1000000
+p <- 50
+pp <- 25
+X <- matrix(runif(N * p), ncol = p)
+betas <- 2 * runif(pp) - 1
+sel <- sort(sample(p, pp))
+m <- X[, sel] %*% betas - 1 + rnorm(N)
+y <- rbinom(N, 1, plogis(m))
+
+tr <- sample.int(N, N * 0.75)
+dtrain <- xgb.DMatrix(X[tr,], label = y[tr])
+dtest <- xgb.DMatrix(X[-tr,], label = y[-tr])
+wl <- list(train = dtrain, test = dtest)
+
+# An example of running 'gpu_hist' algorithm
+# which is
+# - similar to the 'hist'
+# - the fastest option for moderately large datasets
+# - current limitations: max_depth < 16, does not implement guided loss
+# You can use tree_method = 'gpu_exact' for another GPU accelerated algorithm,
+# which is slower, more memory-hungry, but does not use binning.
+param <- list(objective = 'reg:logistic', eval_metric = 'auc', subsample = 0.5, nthread = 4,
+              max_bin = 64, tree_method = 'gpu_hist')
+pt <- proc.time()
+bst_gpu <- xgb.train(param, dtrain, watchlist = wl, nrounds = 50)
+proc.time() - pt
+
+# Compare to the 'hist' algorithm:
+param$tree_method <- 'hist'
+pt <- proc.time()
+bst_hist <- xgb.train(param, dtrain, watchlist = wl, nrounds = 50)
+proc.time() - pt
--- a/R-package/demo/predict_leaf_indices.R
+++ b/R-package/demo/predict_leaf_indices.R
@@ -27,12 +27,12 @@ head(pred_with_leaf)
 create.new.tree.features <- function(model, original.features){
  pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
  cols <- list()
-  for(i in 1:length(trees)){
+  for(i in 1:model$niter){
    # max is not the real max but it s not important for the purpose of adding features
    leaf.id <- sort(unique(pred_with_leaf[,i]))
    cols[[i]] <- factor(x = pred_with_leaf[,i], level = leaf.id)
  }
-  cBind(original.features, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
+  cbind(original.features, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
 }

 # Convert previous features to one hot encoding
--- a/R-package/demo/runall.R
+++ b/R-package/demo/runall.R
@@ -10,3 +10,5 @@ demo(predict_leaf_indices)
 demo(early_stopping)
 demo(poisson_regression)
 demo(caret_wrapper)
+demo(tweedie_regression)
+#demo(gpu_accelerated) # can only run when built with GPU support
--- a/R-package/demo/tweedie_regression.R
+++ b/R-package/demo/tweedie_regression.R
@@ -0,0 +1,49 @@
+library(xgboost)
+library(data.table)
+library(cplm)
+
+data(AutoClaim)
+
+# auto insurance dataset analyzed by Yip and Yau (2005)
+dt <- data.table(AutoClaim)
+
+# exclude these columns from the model matrix
+exclude <-  c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY')
+
+# retains the missing values
+# NOTE: this dataset comes ready out of the box
+options(na.action = 'na.pass')
+x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = F])
+options(na.action = 'na.omit')
+
+# response
+y <- dt[, CLM_AMT5]
+
+d_train <- xgb.DMatrix(data = x, label = y, missing = NA)
+
+# the tweedie_variance_power parameter determines the shape of 
+# distribution
+# - closer to 1 is more poisson like and the mass
+#   is more concentrated near zero 
+# - closer to 2 is more gamma like and the mass spreads to the 
+#   right with less concentration near zero
+
+params <- list(
+  objective = 'reg:tweedie',
+  eval_metric = 'rmse', 
+  tweedie_variance_power = 1.4,
+  max_depth = 6,
+  eta = 1)
+
+bst <- xgb.train(
+  data = d_train, 
+  params = params, 
+  maximize = FALSE,
+  watchlist = list(train = d_train), 
+  nrounds = 20)
+
+var_imp <- xgb.importance(attr(x, 'Dimnames')[[2]], model = bst)
+
+preds <- predict(bst, d_train)
+
+rmse <- sqrt(sum(mean((y - preds)^2)))
--- a/R-package/man/agaricus.test.Rd
+++ b/R-package/man/agaricus.test.Rd
@@ -29,4 +29,3 @@ Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
 School of Information and Computer Science.
 }
 \keyword{datasets}
-
--- a/R-package/man/agaricus.train.Rd
+++ b/R-package/man/agaricus.train.Rd
@@ -29,4 +29,3 @@ Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
 School of Information and Computer Science.
 }
 \keyword{datasets}
-
--- a/R-package/man/callbacks.Rd
+++ b/R-package/man/callbacks.Rd
@@ -35,4 +35,3 @@ with the objects available inside of the \code{xgb.train} and \code{xgb.cv} inte
 \code{\link{xgb.train}},
 \code{\link{xgb.cv}}
 }
-
--- a/R-package/man/cb.cv.predict.Rd
+++ b/R-package/man/cb.cv.predict.Rd
@@ -34,10 +34,10 @@ Callback function expects the following values to be set in its calling frame:
 \code{basket},
 \code{data},
 \code{end_iteration},
+\code{params},
 \code{num_parallel_tree},
 \code{num_class}.
 }
 \seealso{
 \code{\link{callbacks}}
 }
-
--- a/R-package/man/cb.early.stop.Rd
+++ b/R-package/man/cb.early.stop.Rd
@@ -60,4 +60,3 @@ Callback function expects the following values to be set in its calling frame:
 \code{\link{callbacks}},
 \code{\link{xgb.attr}}
 }
-
--- a/R-package/man/cb.evaluation.log.Rd
+++ b/R-package/man/cb.evaluation.log.Rd
@@ -29,4 +29,3 @@ Callback function expects the following values to be set in its calling frame:
 \seealso{
 \code{\link{callbacks}}
 }
-
--- a/R-package/man/cb.gblinear.history.Rd
+++ b/R-package/man/cb.gblinear.history.Rd
@@ -0,0 +1,95 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/callbacks.R
+\name{cb.gblinear.history}
+\alias{cb.gblinear.history}
+\title{Callback closure for collecting the model coefficients history of a gblinear booster
+during its training.}
+\usage{
+cb.gblinear.history(sparse = FALSE)
+}
+\arguments{
+\item{sparse}{when set to FALSE/TRUE, a dense/sparse matrix is used to store the result.
+Sparse format is useful when one expects only a subset of coefficients to be non-zero,
+when using the "thrifty" feature selector with fairly small number of top features
+selected per iteration.}
+}
+\value{
+Results are stored in the \code{coefs} element of the closure.
+The \code{\link{xgb.gblinear.history}} convenience function provides an easy way to access it.
+With \code{xgb.train}, it is either a dense or a sparse matrix.
+While with \code{xgb.cv}, it is a list (an element per each fold) of such matrices.
+}
+\description{
+Callback closure for collecting the model coefficients history of a gblinear booster
+during its training.
+}
+\details{
+To keep things fast and simple, gblinear booster does not internally store the history of linear
+model coefficients at each boosting iteration. This callback provides a workaround for storing
+the coefficients' path, by extracting them after each training iteration.
+
+Callback function expects the following values to be set in its calling frame:
+\code{bst} (or \code{bst_folds}).
+}
+\examples{
+#### Binary classification:
+#
+# In the iris dataset, it is hard to linearly separate Versicolor class from the rest
+# without considering the 2nd order interactions:
+require(magrittr)
+x <- model.matrix(Species ~ .^2, iris)[,-1]
+colnames(x)
+dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"))
+param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc",
+              lambda = 0.0003, alpha = 0.0003, nthread = 2)
+# For 'shotgun', which is a default linear updater, using high eta values may result in
+# unstable behaviour in some datasets. With this simple dataset, however, the high learning
+# rate does not break the convergence, but allows us to illustrate the typical pattern of
+# "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations.
+bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1.,
+                 callbacks = list(cb.gblinear.history()))
+# Extract the coefficients' path and plot them vs boosting iteration number:
+coef_path <- xgb.gblinear.history(bst)
+matplot(coef_path, type = 'l')
+
+# With the deterministic coordinate descent updater, it is safer to use higher learning rates.
+# Will try the classical componentwise boosting which selects a single best feature per round:
+bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8,
+                 updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1,
+                 callbacks = list(cb.gblinear.history()))
+xgb.gblinear.history(bst) \%>\% matplot(type = 'l')
+#  Componentwise boosting is known to have similar effect to Lasso regularization.
+# Try experimenting with various values of top_k, eta, nrounds,
+# as well as different feature_selectors.
+
+# For xgb.cv:
+bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8,
+             callbacks = list(cb.gblinear.history()))
+# coefficients in the CV fold #3
+xgb.gblinear.history(bst)[[3]] \%>\% matplot(type = 'l')
+
+
+#### Multiclass classification:
+#
+dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1)
+param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
+              lambda = 0.0003, alpha = 0.0003, nthread = 2)
+# For the default linear updater 'shotgun' it sometimes is helpful
+# to use smaller eta to reduce instability
+bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 70, eta = 0.5,
+                 callbacks = list(cb.gblinear.history()))
+# Will plot the coefficient paths separately for each class:
+xgb.gblinear.history(bst, class_index = 0) \%>\% matplot(type = 'l')
+xgb.gblinear.history(bst, class_index = 1) \%>\% matplot(type = 'l')
+xgb.gblinear.history(bst, class_index = 2) \%>\% matplot(type = 'l')
+
+# CV:
+bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5,
+              callbacks = list(cb.gblinear.history(FALSE)))
+# 1st fold of 1st class
+xgb.gblinear.history(bst, class_index = 0)[[1]] \%>\% matplot(type = 'l')
+
+}
+\seealso{
+\code{\link{callbacks}}, \code{\link{xgb.gblinear.history}}.
+}
--- a/R-package/man/cb.print.evaluation.Rd
+++ b/R-package/man/cb.print.evaluation.Rd
@@ -4,10 +4,12 @@
 \alias{cb.print.evaluation}
 \title{Callback closure for printing the result of evaluation}
 \usage{
-cb.print.evaluation(period = 1)
+cb.print.evaluation(period = 1, showsd = TRUE)
 }
 \arguments{
 \item{period}{results would be printed every number of periods}
+
+\item{showsd}{whether standard deviations should be printed (when available)}
 }
 \description{
 Callback closure for printing the result of evaluation
@@ -25,4 +27,3 @@ Callback function expects the following values to be set in its calling frame:
 \seealso{
 \code{\link{callbacks}}
 }
-
--- a/R-package/man/cb.reset.parameters.Rd
+++ b/R-package/man/cb.reset.parameters.Rd
@@ -34,4 +34,3 @@ Callback function expects the following values to be set in its calling frame:
 \seealso{
 \code{\link{callbacks}}
 }
-
--- a/R-package/man/cb.save.model.Rd
+++ b/R-package/man/cb.save.model.Rd
@@ -31,4 +31,3 @@ Callback function expects the following values to be set in its calling frame:
 \seealso{
 \code{\link{callbacks}}
 }
-
--- a/R-package/man/dim.xgb.DMatrix.Rd
+++ b/R-package/man/dim.xgb.DMatrix.Rd
@@ -26,4 +26,3 @@ stopifnot(ncol(dtrain) == ncol(train$data))
 stopifnot(all(dim(dtrain) == dim(train$data)))

 }
-
--- a/R-package/man/dimnames.xgb.DMatrix.Rd
+++ b/R-package/man/dimnames.xgb.DMatrix.Rd
@@ -33,4 +33,3 @@ colnames(dtrain) <- make.names(1:ncol(train$data))
 print(dtrain, verbose=TRUE)

 }
-
--- a/R-package/man/getinfo.Rd
+++ b/R-package/man/getinfo.Rd
@@ -27,7 +27,10 @@ The \code{name} field can be one of the following:
    \item \code{weight}: to do a weight rescale ;
    \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
    \item \code{nrow}: number of rows of the \code{xgb.DMatrix}.
+    
 }
+
+\code{group} can be setup by \code{setinfo} but can't be retrieved by \code{getinfo}.
 }
 \examples{
 data(agaricus.train, package='xgboost')
@@ -40,4 +43,3 @@ setinfo(dtrain, 'label', 1-labels)
 labels2 <- getinfo(dtrain, 'label')
 stopifnot(all(labels2 == 1-labels))
 }
-
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -7,7 +7,7 @@
 \usage{
 \method{predict}{xgb.Booster}(object, newdata, missing = NA,
  outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE,
-  reshape = FALSE, ...)
+  predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...)

 \method{predict}{xgb.Booster.handle}(object, ...)
 }
@@ -19,8 +19,8 @@
 \item{missing}{Missing is only used when input is dense matrix. Pick a float value that represents
 missing values in data (e.g., sometimes 0 or some other extreme value is used).}

-\item{outputmargin}{whether the prediction should be returned in the for of original untransformed 
-sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for 
+\item{outputmargin}{whether the prediction should be returned in the form of original untransformed
+sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
 logistic regression would result in predictions for log-odds instead of probabilities.}

 \item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
@@ -28,36 +28,54 @@ It will use all the trees by default (\code{NULL} value).}

 \item{predleaf}{whether predict leaf index instead.}

-\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several 
+\item{predcontrib}{whether to return feature contributions to individual predictions instead (see Details).}
+
+\item{approxcontrib}{whether to use a fast approximation for feature contributions (see Details).}
+
+\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
 prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}

 \item{...}{Parameters passed to \code{predict.xgb.Booster}}
 }
 \value{
 For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
-For multiclass classification, either a \code{num_class * nrows(newdata)} vector or 
-a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on 
+For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
+a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
 the \code{reshape} value.

-When \code{predleaf = TRUE}, the output is a matrix object with the 
+When \code{predleaf = TRUE}, the output is a matrix object with the
 number of columns corresponding to the number of trees.
+
+When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is a matrix object with
+\code{num_features + 1} columns. The last "+ 1" column in a matrix corresponds to bias.
+For a multiclass case, a list of \code{num_class} elements is returned, where each element is
+such a matrix. The contribution values are on the scale of untransformed margin
+(e.g., for binary classification would mean that the contributions are log-odds deviations from bias).
 }
 \description{
 Predicted values based on either xgboost model or model handle object.
 }
 \details{
-Note that \code{ntreelimit} is not necesserily equal to the number of boosting iterations
-and it is not necesserily equal to the number of trees in a model.
+Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
+and it is not necessarily equal to the number of trees in a model.
 E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
-But for multiclass classification, there are multiple trees per iteration, 
-but \code{ntreelimit} limits the number of boosting iterations.
+But for multiclass classification, while there are multiple trees per iteration,
+\code{ntreelimit} limits the number of boosting iterations.

-Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear, 
-since gblinear doesn't keep its boosting history. 
+Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
+since gblinear doesn't keep its boosting history.

-One possible practical applications of the \code{predleaf} option is to use the model 
-as a generator of new features which capture non-linearity and interactions, 
+One possible practical application of the \code{predleaf} option is to use the model
+as a generator of new features which capture non-linearity and interactions,
 e.g., as implemented in \code{\link{xgb.create.features}}.
+
+Setting \code{predcontrib = TRUE} allows calculating contributions of each feature to
+individual predictions. For "gblinear" booster, feature contributions are simply linear terms
+(feature_beta * feature_value). For "gbtree" booster, feature contributions are SHAP
+values (Lundberg 2017) that sum to the difference between the expected output
+of the model and the current prediction (where the hessian weights are used to compute the expectations).
+Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
+in \url{http://blog.datadive.net/interpreting-random-forests/}.
 }
 \examples{
 ## binary classification:
@@ -67,12 +85,33 @@ data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test

-bst <- xgboost(data = train$data, label = train$label, max_depth = 2, 
-               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
+               eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")
 # use all trees by default
 pred <- predict(bst, test$data)
 # use only the 1st tree
-pred <- predict(bst, test$data, ntreelimit = 1)
+pred1 <- predict(bst, test$data, ntreelimit = 1)
+
+# Predicting tree leafs:
+# the result is an nsamples X ntrees matrix
+pred_leaf <- predict(bst, test$data, predleaf = TRUE)
+str(pred_leaf)
+
+# Predicting feature contributions to predictions:
+# the result is an nsamples X (nfeatures + 1) matrix
+pred_contr <- predict(bst, test$data, predcontrib = TRUE)
+str(pred_contr)
+# verify that contributions' sums are equal to log-odds of predictions (up to float precision):
+summary(rowSums(pred_contr) - qlogis(pred))
+# for the 1st record, let's inspect its features that had non-zero contribution to prediction:
+contr1 <- pred_contr[1,]
+contr1 <- contr1[-length(contr1)]    # drop BIAS
+contr1 <- contr1[contr1 != 0]        # drop non-contributing features
+contr1 <- contr1[order(abs(contr1))] # order by contribution magnitude
+old_mar <- par("mar")
+par(mar = old_mar + c(0,7,0,0))
+barplot(contr1, horiz = TRUE, las = 2, xlab = "contribution to prediction in log-odds")
+par(mar = old_mar)


 ## multiclass classification in iris dataset:
@@ -101,7 +140,7 @@ bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
 pred <- predict(bst, as.matrix(iris[, -5]))
 str(pred)
 all.equal(pred, pred_labels)
-# prediction from using only 5 iterations should result 
+# prediction from using only 5 iterations should result
 # in the same error as seen in iteration 5:
 pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
 sum(pred5 != lb)/length(lb)
@@ -122,8 +161,12 @@ err <- sapply(1:25, function(n) {
 })
 plot(err, type='l', ylim=c(0,0.1), xlab='#trees')

+}
+\references{
+Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
+
+Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
 }
 \seealso{
 \code{\link{xgb.train}}.
 }
-
--- a/R-package/man/print.xgb.Booster.Rd
+++ b/R-package/man/print.xgb.Booster.Rd
@@ -27,4 +27,3 @@ print(bst)
 print(bst, verbose=TRUE)

 }
-
--- a/R-package/man/print.xgb.DMatrix.Rd
+++ b/R-package/man/print.xgb.DMatrix.Rd
@@ -26,4 +26,3 @@ dtrain
 print(dtrain, verbose=TRUE)

 }
-
--- a/R-package/man/print.xgb.cv.Rd
+++ b/R-package/man/print.xgb.cv.Rd
@@ -29,4 +29,3 @@ print(cv)
 print(cv, verbose=TRUE)

 }
-
--- a/R-package/man/setinfo.Rd
+++ b/R-package/man/setinfo.Rd
@@ -28,7 +28,7 @@ The \code{name} field can be one of the following:
    \item \code{label}: label Xgboost learn from ;
    \item \code{weight}: to do a weight rescale ;
    \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
-    \item \code{group}.
+    \item \code{group}: number of rows in each group (to use with \code{rank:pairwise} objective).
 }
 }
 \examples{
@@ -41,4 +41,3 @@ setinfo(dtrain, 'label', 1-labels)
 labels2 <- getinfo(dtrain, 'label')
 stopifnot(all.equal(labels2, 1-labels))
 }
-
--- a/R-package/man/slice.xgb.DMatrix.Rd
+++ b/R-package/man/slice.xgb.DMatrix.Rd
@@ -1,9 +1,9 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgb.DMatrix.R
 \name{slice}
-\alias{[.xgb.DMatrix}
 \alias{slice}
 \alias{slice.xgb.DMatrix}
+\alias{[.xgb.DMatrix}
 \title{Get a new DMatrix containing the specified rows of
 orginal xgb.DMatrix object}
 \usage{
@@ -38,4 +38,3 @@ labels2 <- getinfo(dsub, 'label')
 all.equal(labels1, labels2)

 }
-
--- a/R-package/man/xgb.Booster.complete.Rd
+++ b/R-package/man/xgb.Booster.complete.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.Booster.R
+\name{xgb.Booster.complete}
+\alias{xgb.Booster.complete}
+\title{Restore missing parts of an incomplete xgb.Booster object.}
+\usage{
+xgb.Booster.complete(object, saveraw = TRUE)
+}
+\arguments{
+\item{object}{object of class \code{xgb.Booster}}
+
+\item{saveraw}{a flag indicating whether to append \code{raw} Booster memory dump data
+when it doesn't already exist.}
+}
+\value{
+An object of \code{xgb.Booster} class.
+}
+\description{
+It attempts to complete an \code{xgb.Booster} object by restoring either its missing
+raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid)
+or its missing internal handle (when its \code{xgb.Booster.handle} is not valid
+but it has a raw Booster memory dump).
+}
+\details{
+While this method is primarily for internal use, it might be useful in some practical situations.
+
+E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object,
+its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
+should still work for such a model object since those methods would be using
+\code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the
+\code{xgb.Booster.complete} function explicitly once after loading a model as an R-object.
+That would prevent further repeated implicit reconstruction of an internal booster model.
+}
+\examples{
+
+data(agaricus.train, package='xgboost')
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
+               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+saveRDS(bst, "xgb.model.rds")
+
+bst1 <- readRDS("xgb.model.rds")
+# the handle is invalid:
+print(bst1$handle)
+
+bst1 <- xgb.Booster.complete(bst1)
+# now the handle points to a valid internal booster model:
+print(bst1$handle)
+
+}
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -2,23 +2,28 @@
 % Please edit documentation in R/xgb.DMatrix.R
 \name{xgb.DMatrix}
 \alias{xgb.DMatrix}
-\title{Contruct xgb.DMatrix object}
+\title{Construct xgb.DMatrix object}
 \usage{
-xgb.DMatrix(data, info = list(), missing = NA, ...)
+xgb.DMatrix(data, info = list(), missing = NA, silent = FALSE, ...)
 }
 \arguments{
-\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}
+\item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, or a character 
+string representing a filename.}

-\item{info}{a list of information of the xgb.DMatrix object}
+\item{info}{a named list of additional information to store in the \code{xgb.DMatrix} object.
+See \code{\link{setinfo}} for the specific allowed kinds of information.}

-\item{missing}{Missing is only used when input is dense matrix, pick a float
-value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
+\item{missing}{a float value to represent missing values in data (used only when input is a dense matrix).
+It is useful when a 0 or some other extreme value represents missing values in data.}

-\item{...}{other information to pass to \code{info}.}
+\item{silent}{whether to suppress printing an informational message after loading from a file.}
+
+\item{...}{the \code{info} data could be passed directly as parameters, without creating an \code{info} list.}
 }
 \description{
-Contruct xgb.DMatrix object from dense matrix, sparse matrix 
-or local file (that was created previously by saving an \code{xgb.DMatrix}).
+Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
+Supported input file formats are either a libsvm text file or a binary file that was created previously by
+\code{\link{xgb.DMatrix.save}}.
 }
 \examples{
 data(agaricus.train, package='xgboost')
@@ -27,4 +32,3 @@ dtrain <- xgb.DMatrix(train$data, label=train$label)
 xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 }
-
--- a/R-package/man/xgb.DMatrix.save.Rd
+++ b/R-package/man/xgb.DMatrix.save.Rd
@@ -21,4 +21,3 @@ dtrain <- xgb.DMatrix(train$data, label=train$label)
 xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 }
-
--- a/R-package/man/xgb.attr.Rd
+++ b/R-package/man/xgb.attr.Rd
@@ -20,18 +20,18 @@ xgb.attributes(object) <- value

 \item{name}{a non-empty character string specifying which attribute is to be accessed.}

-\item{value}{a value of an attribute for \code{xgb.attr<-}; for \code{xgb.attributes<-} 
-it's a list (or an object coercible to a list) with the names of attributes to set 
-and the elements corresponding to attribute values. 
+\item{value}{a value of an attribute for \code{xgb.attr<-}; for \code{xgb.attributes<-}
+it's a list (or an object coercible to a list) with the names of attributes to set
+and the elements corresponding to attribute values.
 Non-character values are converted to character.
 When attribute value is not a scalar, only the first index is used.
 Use \code{NULL} to remove an attribute.}
 }
 \value{
-\code{xgb.attr} returns either a string value of an attribute 
+\code{xgb.attr} returns either a string value of an attribute
 or \code{NULL} if an attribute wasn't stored in a model.

-\code{xgb.attributes} returns a list of all attribute stored in a model 
+\code{xgb.attributes} returns a list of all attributes stored in a model
 or \code{NULL} if a model has no stored attributes.
 }
 \description{
@@ -41,23 +41,23 @@ These methods allow to manipulate the key-value attribute strings of an xgboost
 The primary purpose of xgboost model attributes is to store some meta-data about the model.
 Note that they are a separate concept from the object attributes in R.
 Specifically, they refer to key-value strings that can be attached to an xgboost model,
-stored together with the model's binary representation, and accessed later 
+stored together with the model's binary representation, and accessed later
 (from R or any other interface).
 In contrast, any R-attribute assigned to an R-object of \code{xgb.Booster} class
 would not be saved by \code{xgb.save} because an xgboost model is an external memory object
-and its serialization is handled extrnally.
-Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't 
-change the value of that parameter for a model. 
+and its serialization is handled externally.
+Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't
+change the value of that parameter for a model.
 Use \code{\link{xgb.parameters<-}} to set or change model parameters.

 The attribute setters would usually work more efficiently for \code{xgb.Booster.handle}
 than for \code{xgb.Booster}, since only just a handle (pointer) would need to be copied.
 That would only matter if attributes need to be set many times.
 Note, however, that when feeding a handle of an \code{xgb.Booster} object to the attribute setters,
-the raw model cache of an \code{xgb.Booster} object would not be automatically updated, 
+the raw model cache of an \code{xgb.Booster} object would not be automatically updated,
 and it would be user's responsibility to call \code{xgb.save.raw} to update it.

-The \code{xgb.attributes<-} setter either updates the existing or adds one or several attributes, 
+The \code{xgb.attributes<-} setter either updates the existing or adds one or several attributes,
 but it doesn't delete the other existing attributes.
 }
 \examples{
@@ -83,4 +83,3 @@ xgb.attributes(bst1) <- list(a = NULL, b = NULL)
 print(xgb.attributes(bst1))

 }
-
--- a/R-package/man/xgb.create.features.Rd
+++ b/R-package/man/xgb.create.features.Rd
@@ -29,7 +29,7 @@ Joaquin Quinonero Candela)}
 
 International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014

-\url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
+\url{https://research.fb.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.

 Extract explaining the method:

@@ -68,7 +68,8 @@ nround = 4
 bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)

 # Model accuracy without new features
-accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) /
+                   length(agaricus.test$label)

 # Convert previous features to one hot encoding
 new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
@@ -81,10 +82,11 @@ watchlist <- list(train = new.dtrain)
 bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)

 # Model accuracy with new features
-accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) /
+                  length(agaricus.test$label)

 # Here the accuracy was already good and is now perfect.
-cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\\n"))
+cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now",
+          accuracy.after, "!\\n"))

 }
-
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -26,13 +26,13 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
  See \code{\link{xgb.train}} for further details.
  See also demo/ for walkthrough example in R.}

-\item{data}{takes an \code{xgb.DMatrix} or \code{Matrix} as the input.}
+\item{data}{takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.}

 \item{nrounds}{the max number of iterations}

 \item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}

-\item{label}{vector of response values. Should be provided only when data is \code{DMatrix}.}
+\item{label}{vector of response values. Should be provided only when data is an R-matrix.}

 \item{missing}{is only used when input is a dense matrix. By default is set to NA, which means 
 that NA values should be considered as 'missing' by the algorithm. 
@@ -51,6 +51,7 @@ from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callb
  \item \code{rmse} Rooted mean square error
  \item \code{logloss} negative log-likelihood function
  \item \code{auc} Area under curve
+  \item \code{aucpr} Area under PR curve
  \item \code{merror} Exact matching error, used to evaluate multi-class classification
 }}

@@ -104,6 +105,7 @@ An object of class \code{xgb.cv.synchronous} with the following elements:
        CV-based evaluation means and standard deviations for the training and test CV-sets.
        It is created by the \code{\link{cb.evaluation.log}} callback.
  \item \code{niter} number of boosting iterations.
+  \item \code{nfeatures} number of features in training data.
  \item \code{folds} the list of CV folds' indices - either those passed through the \code{folds} 
        parameter or randomly generated.
  \item \code{best_iteration} iteration number with the best evaluation metric value
@@ -118,7 +120,7 @@ An object of class \code{xgb.cv.synchronous} with the following elements:
 }
 }
 \description{
-The cross valudation function of xgboost
+The cross validation function of xgboost
 }
 \details{
 The original sample is randomly partitioned into \code{nfold} equal size subsamples. 
@@ -140,4 +142,3 @@ print(cv)
 print(cv, verbose=TRUE)

 }
-
--- a/R-package/man/xgb.dump.Rd
+++ b/R-package/man/xgb.dump.Rd
@@ -2,34 +2,39 @@
 % Please edit documentation in R/xgb.dump.R
 \name{xgb.dump}
 \alias{xgb.dump}
-\title{Save xgboost model to text file}
+\title{Dump an xgboost model in text format.}
 \usage{
-xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE, ...)
+xgb.dump(model, fname = NULL, fmap = "", with_stats = FALSE,
+  dump_format = c("text", "json"), ...)
 }
 \arguments{
 \item{model}{the model object.}

-\item{fname}{the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.}
+\item{fname}{the name of the text file where to save the model text dump. 
+If not provided or set to \code{NULL}, the model is returned as a \code{character} vector.}

-\item{fmap}{feature map file representing the type of feature. 
+\item{fmap}{feature map file representing feature types.
 Detailed description could be found at 
 \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
 See demo/ for walkthrough example in R, and
 \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} 
 for example Format.}

-\item{with_stats}{whether dump statistics of splits 
-When this option is on, the model dump comes with two additional statistics:
+\item{with_stats}{whether to dump some additional statistics about the splits.
+When this option is on, the model dump contains two additional values:
 gain is the approximate loss function gain we get in each split;
 cover is the sum of second order gradient in each node.}

+\item{dump_format}{either 'text' or 'json' format could be specified.}
+
 \item{...}{currently not used}
 }
 \value{
-if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
+If fname is not provided or set to \code{NULL} the function will return the model
+as a \code{character} vector. Otherwise it will return \code{TRUE}.
 }
 \description{
-Save a xgboost model to text file. Could be parsed later.
+Dump an xgboost model in text format.
 }
 \examples{
 data(agaricus.train, package='xgboost')
@@ -42,6 +47,9 @@ bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
 xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE)

 # print the model without saving it to a file
-print(xgb.dump(bst))
-}
+print(xgb.dump(bst, with_stats = TRUE))

+# print in JSON format:
+cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
+
+}
--- a/R-package/man/xgb.gblinear.history.Rd
+++ b/R-package/man/xgb.gblinear.history.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/callbacks.R
+\name{xgb.gblinear.history}
+\alias{xgb.gblinear.history}
+\title{Extract gblinear coefficients history.}
+\usage{
+xgb.gblinear.history(model, class_index = NULL)
+}
+\arguments{
+\item{model}{either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
+using the \code{cb.gblinear.history()} callback.}
+
+\item{class_index}{zero-based class index to extract the coefficients for only that
+specific class in a multinomial multiclass model. When it is NULL, all the
+coefficients are returned. Has no effect in non-multiclass models.}
+}
+\value{
+For an \code{xgb.train} result, a matrix (either dense or sparse) with the columns
+corresponding to iteration's coefficients (in the order as \code{xgb.dump()} would
+return) and the rows corresponding to boosting iterations.
+
+For an \code{xgb.cv} result, a list of such matrices is returned with the elements
+corresponding to CV folds.
+}
+\description{
+A helper function to extract the matrix of linear coefficients' history
+from a gblinear model created while using the \code{cb.gblinear.history()}
+callback.
+}
+\examples{
+\dontrun{
+See \\code{\\link{cb.gblinear.history}}
+}
+
+}
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -2,63 +2,94 @@
 % Please edit documentation in R/xgb.importance.R
 \name{xgb.importance}
 \alias{xgb.importance}
-\title{Show importance of features in a model}
+\title{Importance of features in a model.}
 \usage{
-xgb.importance(feature_names = NULL, model = NULL, data = NULL,
-  label = NULL, target = function(x) ((x + label) == 2))
+xgb.importance(feature_names = NULL, model = NULL, trees = NULL,
+  data = NULL, label = NULL, target = NULL)
 }
 \arguments{
-\item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
+\item{feature_names}{character vector of feature names. If the model already
+contains feature names, those would be used when \code{feature_names=NULL} (default value).
+Non-null \code{feature_names} could be provided to override those in the model.}

-\item{model}{generated by the \code{xgb.train} function.}
+\item{model}{object of class \code{xgb.Booster}.}

-\item{data}{the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.}
+\item{trees}{(only for the gbtree booster) an integer vector of tree indices that should be included
+into the importance calculation. If set to \code{NULL}, all trees of the model are parsed.
+It could be useful, e.g., in multiclass classification to get feature importances 
+for each class separately. IMPORTANT: the tree index in xgboost models
+is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).}

-\item{label}{the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.}
+\item{data}{deprecated.}

-\item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.}
+\item{label}{deprecated.}
+
+\item{target}{deprecated.}
 }
 \value{
-A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
+For a tree model, a \code{data.table} with the following columns:
+\itemize{
+  \item \code{Features} names of the features used in the model;
+  \item \code{Gain} represents fractional contribution of each feature to the model based on
+       the total gain of this feature's splits. Higher percentage means a more important 
+       predictive feature.
+  \item \code{Cover} metric of the number of observations related to this feature;
+  \item \code{Frequency} percentage representing the relative number of times
+       a feature has been used in trees.
+}
+
+A linear model's importance \code{data.table} has the following columns:
+\itemize{
+  \item \code{Features} names of the features used in the model;
+  \item \code{Weight} the linear coefficient of this feature;
+  \item \code{Class} (only for multiclass models) class label.
+}
+
+If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names}, 
+index of the features will be used instead. Because the index is extracted from the model dump
+(based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R).
 }
 \description{
-Create a \code{data.table} of the most important features of a model.
+Creates a \code{data.table} of feature importances in a model.
 }
 \details{
-This function is for both linear and tree models.
+This function works for both linear and tree models.

-\code{data.table} is returned by the function. 
-The columns are :
-\itemize{
-  \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump;
-  \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training (only available for tree models);
-  \item \code{Cover} metric of the number of observation related to this feature (only available for tree models);
-  \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees.
-}
-
-If you don't provide \code{feature_names}, index of the features will be used instead.
-
-Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R).
-
-Co-occurence count
------------------
-
-The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. However, with this information only, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
-
-Co-occurence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
-
-If you need to remember one thing only: until you want to leave us early, don't eat a mushroom which has no odor :-)
+For linear models, the importance is the absolute magnitude of linear coefficients. 
+For that reason, in order to obtain a meaningful ranking by importance for a linear model, 
+the features need to be on the same scale (which you also would want to do when using either 
+L1 or L2 regularization).
 }
 \examples{
+
+# binomial classification using gbtree:
 data(agaricus.train, package='xgboost')
-
 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, 
-               eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+xgb.importance(model = bst)

-xgb.importance(colnames(agaricus.train$data), model = bst)
+# binomial classification using gblinear:
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear", 
+               eta = 0.3, nthread = 1, nrounds = 20, objective = "binary:logistic")
+xgb.importance(model = bst)

-# Same thing with co-occurence computation this time
-xgb.importance(colnames(agaricus.train$data), model = bst, data = agaricus.train$data, label = agaricus.train$label)
+# multiclass classification using gbtree:
+nclass <- 3
+nrounds <- 10
+mbst <- xgboost(data = as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1,
+               max_depth = 3, eta = 0.2, nthread = 2, nrounds = nrounds,
+               objective = "multi:softprob", num_class = nclass)
+# all classes clumped together:
+xgb.importance(model = mbst)
+# inspect importances separately for each class:
+xgb.importance(model = mbst, trees = seq(from=0, by=nclass, length.out=nrounds))
+xgb.importance(model = mbst, trees = seq(from=1, by=nclass, length.out=nrounds))
+xgb.importance(model = mbst, trees = seq(from=2, by=nclass, length.out=nrounds))
+
+# multiclass classification using gblinear:
+mbst <- xgboost(data = scale(as.matrix(iris[, -5])), label = as.numeric(iris$Species) - 1,
+               booster = "gblinear", eta = 0.2, nthread = 1, nrounds = 15,
+               objective = "multi:softprob", num_class = nclass)
+xgb.importance(model = mbst)

 }
-
--- a/R-package/man/xgb.load.Rd
+++ b/R-package/man/xgb.load.Rd
@@ -7,10 +7,22 @@
 xgb.load(modelfile)
 }
 \arguments{
-\item{modelfile}{the name of the binary file.}
+\item{modelfile}{the name of the binary input file.}
+}
+\value{
+An object of \code{xgb.Booster} class.
 }
 \description{
-Load xgboost model from the binary model file
+Load xgboost model from the binary model file.
+}
+\details{
+The input file is expected to contain a model saved in an xgboost-internal binary format
+using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some 
+appropriate methods from other xgboost interfaces. E.g., a model trained in Python and 
+saved from there in xgboost format, could be loaded from R.
+
+Note: a model saved as an R-object, has to be loaded using corresponding R-methods,
+not \code{xgb.load}.
 }
 \examples{
 data(agaricus.train, package='xgboost')
@@ -23,4 +35,6 @@ xgb.save(bst, 'xgb.model')
 bst <- xgb.load('xgb.model')
 pred <- predict(bst, test$data)
 }
-
+\seealso{
+\code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}.
+}
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -5,19 +5,29 @@
 \title{Parse a boosted tree model text dump}
 \usage{
 xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL,
-  n_first_tree = NULL)
+  trees = NULL, use_int_id = FALSE, ...)
 }
 \arguments{
 \item{feature_names}{character vector of feature names. If the model already
-contains feature names, this argument should be \code{NULL} (default value)}
+contains feature names, those would be used when \code{feature_names=NULL} (default value).
+Non-null \code{feature_names} could be provided to override those in the model.}

 \item{model}{object of class \code{xgb.Booster}}

 \item{text}{\code{character} vector previously generated by the \code{xgb.dump} 
-function  (where parameter \code{with_stats = TRUE} should have been set).}
+function  (where parameter \code{with_stats = TRUE} should have been set).
+\code{text} takes precedence over \code{model}.}

-\item{n_first_tree}{limit the parsing to the \code{n} first trees. 
-If set to \code{NULL}, all trees of the model are parsed.}
+\item{trees}{an integer vector of tree indices that should be parsed.
+If set to \code{NULL}, all trees of the model are parsed.
+It could be useful, e.g., in multiclass classification to get only
+the trees of one certain class. IMPORTANT: the tree index in xgboost models
+is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).}
+
+\item{use_int_id}{a logical flag indicating whether nodes in columns "Yes", "No", "Missing" should be
+represented as integers (when TRUE) or as "Tree-Node" character strings (when FALSE).}
+
+\item{...}{currently not used.}
 }
 \value{
 A \code{data.table} with detailed information about model trees' nodes.
@@ -25,9 +35,9 @@ A \code{data.table} with detailed information about model trees' nodes.
 The columns of the \code{data.table} are:

 \itemize{
- \item \code{Tree}: ID of a tree in a model
- \item \code{Node}: ID of a node in a tree
- \item \code{ID}: unique identifier of a node in a model
+ \item \code{Tree}: integer ID of a tree in a model (zero-based index)
+ \item \code{Node}: integer ID of a node in a tree (zero-based index)
+ \item \code{ID}: character identifier of a node in a model (only when \code{use_int_id=FALSE})
 \item \code{Feature}: for a branch node, it's a feature id or name (when available);
             for a leaf note, it simply labels it as \code{'Leaf'}
 \item \code{Split}: location of the split for a branch node (split condition is always "less than")
@@ -37,7 +47,11 @@ The columns of the \code{data.table} are:
 \item \code{Quality}: either the split gain (change in loss) or the leaf value
 \item \code{Cover}: metric related to the number of observation either seen by a split
                     or collected by a leaf during training.
-}
+} 
+
+When \code{use_int_id=FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers
+in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from 
+the corresponding trees in the "Node" column.
 }
 \description{
 Parse a boosted tree model text dump into a \code{data.table} structure.
@@ -52,10 +66,12 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_dep

 (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))

+# This bst model already has feature_names stored with it, so those would be used when 
+# feature_names is not set:
+(dt <- xgb.model.dt.tree(model = bst))

 # How to match feature names of splits that are following a current 'Yes' branch:

 merge(dt, dt[, .(ID, Y.Feature=Feature)], by.x='Yes', by.y='ID', all.x=TRUE)[order(Tree,Node)]
 
 }
-
--- a/R-package/man/xgb.parameters.Rd
+++ b/R-package/man/xgb.parameters.Rd
@@ -29,4 +29,3 @@ bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
 xgb.parameters(bst) <- list(eta = 0.1)

 }
-
--- a/R-package/man/xgb.plot.deepness.Rd
+++ b/R-package/man/xgb.plot.deepness.Rd
@@ -56,7 +56,8 @@ This function was inspired by the blog post

 data(agaricus.train, package='xgboost')

-bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
+# Change max_depth to a higher number to get a more significant result
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6,
               eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic",
               subsample = 0.5, min_child_weight = 2)

@@ -71,4 +72,3 @@ xgb.plot.deepness(bst, which='med.weight', pch=16, col=rgb(0,0,1,0.3), cex=2)
 \seealso{
 \code{\link{xgb.train}}, \code{\link{xgb.model.dt.tree}}.
 }
-
--- a/R-package/man/xgb.plot.importance.Rd
+++ b/R-package/man/xgb.plot.importance.Rd
@@ -79,4 +79,3 @@ gg + ggplot2::ylab("Frequency")
 \seealso{
 \code{\link[graphics]{barplot}}.
 }
-
--- a/R-package/man/xgb.plot.multi.trees.Rd
+++ b/R-package/man/xgb.plot.multi.trees.Rd
@@ -5,12 +5,12 @@
 \title{Project all trees on one tree and plot it}
 \usage{
 xgb.plot.multi.trees(model, feature_names = NULL, features_keep = 5,
-  plot_width = NULL, plot_height = NULL, ...)
+  plot_width = NULL, plot_height = NULL, render = TRUE, ...)
 }
 \arguments{
-\item{model}{dump generated by the \code{xgb.train} function.}
+\item{model}{produced by the \code{xgb.train} function.}

-\item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
+\item{feature_names}{names of each feature as a \code{character} vector.}

 \item{features_keep}{number of features to keep in each position of the multi trees.}

@@ -18,43 +18,58 @@ xgb.plot.multi.trees(model, feature_names = NULL, features_keep = 5,

 \item{plot_height}{height in pixels of the graph to produce}

+\item{render}{a logical flag for whether the graph should be rendered (see Value).}
+
 \item{...}{currently not used}
 }
 \value{
-Two graphs showing the distribution of the model deepness.
+When \code{render = TRUE}:
+returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
+Similar to ggplot objects, it needs to be printed to see it when not running from command line.
+
+When \code{render = FALSE}:
+silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
+This could be useful if one wants to modify some of the graph attributes
+before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
 }
 \description{
 Visualization of the ensemble of trees as a single collective unit.
 }
 \details{
-This function tries to capture the complexity of gradient boosted tree ensemble 
-in a cohesive way. 
+This function tries to capture the complexity of a gradient boosted tree model
+in a cohesive way by compressing an ensemble of trees into a single tree-graph representation.
+The goal is to improve the interpretability of a model generally seen as black box.

-The goal is to improve the interpretability of the model generally seen as black box.
-The function is dedicated to boosting applied to decision trees only.
+Note: this function is applicable to tree booster-based models only.

-The purpose is to move from an ensemble of trees to a single tree only.
-
-It takes advantage of the fact that the shape of a binary tree is only defined by 
-its deepness (therefore in a boosting model, all trees have the same shape). 
+It takes advantage of the fact that the shape of a binary tree is only defined by
+its depth (therefore, in a boosting model, all trees have similar shape).

 Moreover, the trees tend to reuse the same features.

-The function will project each tree on one, and keep for each position the 
-\code{features_keep} first features (based on Gain per feature measure).
+The function projects each tree onto one, and keeps for each position the
+\code{features_keep} first features (based on the Gain per feature measure).

 This function is inspired by this blog post:
 \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
 }
 \examples{
+
 data(agaricus.train, package='xgboost')

 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
-                 eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
-                 min_child_weight = 50)
+               eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
+               min_child_weight = 50, verbose = 0)

-p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data), features_keep = 3)
+p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
 print(p)

+\dontrun{
+# Below is an example of how to save this plot to a file.
+# Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
+library(DiagrammeR)
+gr <- xgb.plot.multi.trees(model=bst, features_keep = 3, render=FALSE)
+export_graph(gr, 'tree.pdf', width=1500, height=600)
 }

+}
--- a/R-package/man/xgb.plot.shap.Rd
+++ b/R-package/man/xgb.plot.shap.Rd
@@ -0,0 +1,138 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.shap.R
+\name{xgb.plot.shap}
+\alias{xgb.plot.shap}
+\title{SHAP contribution dependency plots}
+\usage{
+xgb.plot.shap(data, shap_contrib = NULL, features = NULL, top_n = 1,
+  model = NULL, trees = NULL, target_class = NULL,
+  approxcontrib = FALSE, subsample = NULL, n_col = 1, col = rgb(0, 0, 1,
+  0.2), pch = ".", discrete_n_uniq = 5, discrete_jitter = 0.01,
+  ylab = "SHAP", plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6),
+  pch_NA = ".", pos_NA = 1.07, plot_loess = TRUE, col_loess = 2,
+  span_loess = 0.5, which = c("1d", "2d"), plot = TRUE, ...)
+}
+\arguments{
+\item{data}{data as a \code{matrix} or \code{dgCMatrix}.}
+
+\item{shap_contrib}{a matrix of SHAP contributions that was computed earlier for the above 
+\code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.}
+
+\item{features}{a vector of either column indices or of feature names to plot. When it is NULL,
+feature importance is calculated, and \code{top_n} high ranked features are taken.}
+
+\item{top_n}{when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.}
+
+\item{model}{an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
+or \code{features} is missing.}
+
+\item{trees}{passed to \code{\link{xgb.importance}} when \code{features = NULL}.}
+
+\item{target_class}{is only relevant for multiclass models. When it is set to a 0-based class index,
+only SHAP contributions for that specific class are used.
+If it is not set, SHAP importances are averaged over all classes.}
+
+\item{approxcontrib}{passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.}
+
+\item{subsample}{a random fraction of data points to use for plotting. When it is NULL,
+it is set so that up to 100K data points are used.}
+
+\item{n_col}{a number of columns in a grid of plots.}
+
+\item{col}{color of the scatterplot markers.}
+
+\item{pch}{scatterplot marker.}
+
+\item{discrete_n_uniq}{a maximal number of unique values in a feature to consider it as discrete.}
+
+\item{discrete_jitter}{an \code{amount} parameter of jitter added to discrete features' positions.}
+
+\item{ylab}{a y-axis label in 1D plots.}
+
+\item{plot_NA}{whether the contributions of cases with missing values should also be plotted.}
+
+\item{col_NA}{a color of marker for missing value contributions.}
+
+\item{pch_NA}{a marker type for NA values.}
+
+\item{pos_NA}{a relative position of the x-location where NA values are shown:
+\code{min(x) + (max(x) - min(x)) * pos_NA}.}
+
+\item{plot_loess}{whether to plot loess-smoothed curves. The smoothing is only done for features with
+more than 5 distinct values.}
+
+\item{col_loess}{a color to use for the loess curves.}
+
+\item{span_loess}{the \code{span} parameter in \code{\link[stats]{loess}}'s call.}
+
+\item{which}{whether to do univariate or bivariate plotting. NOTE: only 1D is implemented so far.}
+
+\item{plot}{whether a plot should be drawn. If FALSE, only a list of matrices is returned.}
+
+\item{...}{other parameters passed to \code{plot}.}
+}
+\value{
+In addition to producing plots (when \code{plot=TRUE}), it silently returns a list of two matrices:
+\itemize{
+ \item \code{data} the values of selected features;
+ \item \code{shap_contrib} the contributions of selected features.
+}
+}
+\description{
+Visualizes how SHAP feature contributions to predictions depend on feature values.
+}
+\details{
+These scatterplots represent how SHAP feature contributions depend on feature values.
+The similarity to partial dependency plots is that they also give an idea for how feature values
+affect predictions. However, in partial dependency plots, we usually see marginal dependencies
+of model prediction on feature value, while SHAP contribution dependency plots display the estimated
+contributions of a feature to model prediction for each individual case.
+
+When \code{plot_loess = TRUE} is set, feature values are rounded to 3 significant digits and
+weighted LOESS is computed and plotted, where weights are the numbers of data points
+at each rounded value.
+
+Note: SHAP contributions are shown on the scale of model margin. E.g., for a logistic binomial objective,
+the margin is prediction before a sigmoidal transform into probability-like values.
+Also, since SHAP stands for "SHapley Additive exPlanation" (model prediction = sum of SHAP
+contributions for all features + bias), depending on the objective used, transforming SHAP
+contributions for a feature from the marginal to the prediction space is not necessarily
+a meaningful thing to do.
+}
+\examples{
+
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+
+bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50, 
+               eta = 0.1, max_depth = 3, subsample = .5,
+               method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0)
+
+xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
+contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
+xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
+
+# multiclass example - plots for each class separately:
+nclass <- 3
+nrounds <- 20
+x <- as.matrix(iris[, -5])
+set.seed(123)
+is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
+mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
+                max_depth = 2, eta = 0.3, subsample = .5, nthread = 2,
+                objective = "multi:softprob", num_class = nclass, verbose = 0)
+trees0 <- seq(from=0, by=nclass, length.out=nrounds)
+col <- rgb(0, 0, 1, 0.5)
+xgb.plot.shap(x, model = mbst, trees = trees0, target_class = 0, top_n = 4,
+              n_col = 2, col = col, pch = 16, pch_NA = 17)
+xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4,
+              n_col = 2, col = col, pch = 16, pch_NA = 17)
+xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4,
+              n_col = 2, col = col, pch = 16, pch_NA = 17)
+
+}
+\references{
+Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
+
+Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
+}
--- a/R-package/man/xgb.plot.tree.Rd
+++ b/R-package/man/xgb.plot.tree.Rd
@@ -4,24 +4,39 @@
 \alias{xgb.plot.tree}
 \title{Plot a boosted tree model}
 \usage{
-xgb.plot.tree(feature_names = NULL, model = NULL, n_first_tree = NULL,
-  plot_width = NULL, plot_height = NULL, ...)
+xgb.plot.tree(feature_names = NULL, model = NULL, trees = NULL,
+  plot_width = NULL, plot_height = NULL, render = TRUE,
+  show_node_id = FALSE, ...)
 }
 \arguments{
-\item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
+\item{feature_names}{names of each feature as a \code{character} vector.}

-\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
+\item{model}{produced by the \code{xgb.train} function.}

-\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
+\item{trees}{an integer vector of tree indices that should be visualized.
+If set to \code{NULL}, all trees of the model are included.
+IMPORTANT: the tree index in xgboost model is zero-based
+(e.g., use \code{trees = 0:2} for the first 3 trees in a model).}

 \item{plot_width}{the width of the diagram in pixels.}

 \item{plot_height}{the height of the diagram in pixels.}

+\item{render}{a logical flag for whether the graph should be rendered (see Value).}
+
+\item{show_node_id}{a logical flag for whether to show node id's in the graph.}
+
 \item{...}{currently not used.}
 }
 \value{
-A \code{DiagrammeR} of the model.
+When \code{render = TRUE}:
+returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
+Similar to ggplot objects, it needs to be printed to see it when not running from command line.
+
+When \code{render = FALSE}:
+silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
+This could be useful if one wants to modify some of the graph attributes
+before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
 }
 \description{
 Read a tree model text dump and plot the model.
@@ -30,20 +45,40 @@ Read a tree model text dump and plot the model.
 The content of each node is organised that way:

 \itemize{
- \item \code{feature} value;
- \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be;
- \item \code{gain}: metric the importance of the node in the model.
+ \item Feature name.
+ \item \code{Cover}: The sum of second order gradient of training data classified to the leaf.
+       If it is square loss, this simply corresponds to the number of instances seen by a split
+       or collected by a leaf during training.
+       The deeper in the tree a node is, the lower this metric will be.
+ \item \code{Gain} (for split nodes): the information gain metric of a split
+       (corresponds to the importance of the node in the model).
+ \item \code{Value} (for leaves): the margin value that the leaf may contribute to prediction.
 } 
+The tree root nodes also indicate the Tree index (0-based).

-The function uses \href{http://www.graphviz.org/}{GraphViz} library for that purpose.
+The "Yes" branches are marked by the "< split_value" label.
+The branches that are also used for missing values are marked as bold
+(as in "carrying extra capacity").
+
+This function uses \href{http://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
 }
 \examples{
 data(agaricus.train, package='xgboost')

-bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, 
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
               eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+# plot all the trees
+xgb.plot.tree(model = bst)
+# plot only the first tree and display the node ID:
+xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE)

-xgb.plot.tree(feature_names = colnames(agaricus.train$data), model = bst)
-
+\dontrun{
+# Below is an example of how to save this plot to a file. 
+# Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
+library(DiagrammeR)
+gr <- xgb.plot.tree(model=bst, trees=0:1, render=FALSE)
+export_graph(gr, 'tree.pdf', width=1500, height=1900)
+export_graph(gr, 'tree.png', width=1500, height=1900)
 }

+}
--- a/R-package/man/xgb.save.Rd
+++ b/R-package/man/xgb.save.Rd
@@ -7,12 +7,22 @@
 xgb.save(model, fname)
 }
 \arguments{
-\item{model}{the model object.}
+\item{model}{model object of \code{xgb.Booster} class.}

-\item{fname}{the name of the file to write.}
+\item{fname}{name of the file to write.}
 }
 \description{
-Save xgboost model from xgboost or xgb.train
+Save xgboost model to a file in binary format.
+}
+\details{
+This method allows saving a model in an xgboost-internal binary format which is universal
+among the various xgboost interfaces. In R, the saved model file could be read-in later
+using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter 
+of \code{\link{xgb.train}}.
+
+Note: a model can also be saved as an R-object (e.g., by using \code{\link[base:readRDS]{saveRDS}}
+or \code{\link[base]{save}}). However, it would then only be compatible with R, and 
+corresponding R-methods would need to be used to load it.
 }
 \examples{
 data(agaricus.train, package='xgboost')
@@ -25,4 +35,6 @@ xgb.save(bst, 'xgb.model')
 bst <- xgb.load('xgb.model')
 pred <- predict(bst, test$data)
 }
-
+\seealso{
+\code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
+}
--- a/R-package/man/xgb.save.raw.Rd
+++ b/R-package/man/xgb.save.raw.Rd
@@ -25,4 +25,3 @@ bst <- xgb.load(raw)
 pred <- predict(bst, test$data)

 }
-
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -12,7 +12,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,

 xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
  params = list(), nrounds, verbose = 1, print_every_n = 1L,
-  early_stopping_rounds = NULL, maximize = NULL, save_period = 0,
+  early_stopping_rounds = NULL, maximize = NULL, save_period = NULL,
  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...)
 }
 \arguments{
@@ -23,8 +23,7 @@ xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
 1. General Parameters

 \itemize{
-  \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}
-  \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0
+  \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}.
 }
 
 2. Booster Parameters
@@ -39,6 +38,7 @@ xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
  \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1 
  \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
  \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample  < 1}  and \code{round = 1}) accordingly. Default: 1
+  \item \code{monotone_constraints} A numerical vector consisting of \code{1}, \code{0} and \code{-1} with its length equal to the number of features in the training data. \code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint.
 }

 2.2. Parameter for Linear Booster
@@ -67,16 +67,19 @@ xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
  \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
 }}

-\item{data}{input dataset. \code{xgb.train} takes only an \code{xgb.DMatrix} as the input.
-\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or local data file.}
+\item{data}{training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
+\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.}

-\item{nrounds}{the max number of iterations}
+\item{nrounds}{max number of boosting iterations.}

-\item{watchlist}{what information should be printed when \code{verbose=1} or
-\code{verbose=2}. Watchlist is used to specify validation set monitoring
-during training. For example user can specify
-watchlist=list(validation1=mat1, validation2=mat2) to watch
-the performance of each round's model on mat1 and mat2}
+\item{watchlist}{named list of xgb.DMatrix datasets to use for evaluating model performance.
+Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
+of these datasets during each boosting iteration, and stored in the end as a field named 
+\code{evaluation_log} in the resulting object. When either \code{verbose>=1} or 
+\code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously
+printed out during the training. 
+E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows tracking
+the performance of each round's model on mat1 and mat2.}

 \item{obj}{customized objective function. Returns gradient and second order 
 gradient with given prediction and dtrain.}
@@ -85,10 +88,10 @@ gradient with given prediction and dtrain.}
 \code{list(metric='metric-name', value='metric-value')} with given 
 prediction and dtrain.}

-\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print 
-information of performance. If 2, xgboost will print some additional information.
-Setting \code{verbose > 0} automatically engages the \code{\link{cb.evaluation.log}} and 
-\code{\link{cb.print.evaluation}} callback functions.}
+\item{verbose}{If 0, xgboost will stay silent. If 1, it will print information about performance.
+If 2, some additional information will be printed out.
+Note that setting \code{verbose > 0} automatically engages the 
+\code{cb.print.evaluation(period=1)} callback function.}

 \item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}.
 Default is 1 which means all messages are printed. This parameter is passed to the 
@@ -109,7 +112,7 @@ This parameter is passed to the \code{\link{cb.early.stop}} callback.}

 \item{save_name}{the name or path for periodically saved model file.}

-\item{xgb_model}{a previously built model to continue the trainig from.
+\item{xgb_model}{a previously built model to continue the training from.
 Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a 
 file with a previously saved model.}

@@ -150,17 +153,21 @@ An object of class \code{xgb.Booster} with the following elements:
        (only available with early stopping).
  \item \code{best_score} the best evaluation metric value during early stopping.
        (only available with early stopping).
+  \item \code{feature_names} names of the training dataset features
+        (only when column names were defined in training data).
+  \item \code{nfeatures} number of features in training data.
 }
 }
 \description{
-\code{xgb.train} is an advanced interface for training an xgboost model. The \code{xgboost} function provides a simpler interface.
+\code{xgb.train} is an advanced interface for training an xgboost model.
+The \code{xgboost} function is a simpler wrapper for \code{xgb.train}.
 }
 \details{
 These are the training functions for \code{xgboost}. 

 The \code{xgb.train} interface supports advanced features such as \code{watchlist}, 
 customized objective and evaluation metric functions, therefore it is more flexible 
-than the \code{\link{xgboost}} interface.
+than the \code{xgboost} interface.

 Parallelization is automatically enabled if \code{OpenMP} is present. 
 Number of threads can also be manually specified via \code{nthread} parameter.
@@ -173,12 +180,13 @@ The folloiwing is the list of built-in metrics for which Xgboost provides optimi
  \itemize{
     \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
     \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
-     \item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss}
+     \item \code{mlogloss} multiclass logloss. \url{http://wiki.fast.ai/index.php/Log_Loss}
     \item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
           By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
           Different threshold (e.g., 0.) could be specified as "error@0."
     \item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
     \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
+     \item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
     \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
  }

@@ -186,7 +194,7 @@ The following callbacks are automatically created when certain parameters are se
 \itemize{
  \item \code{cb.print.evaluation} is turned on when \code{verbose > 0};
        and the \code{print_every_n} parameter is passed to it.
-  \item \code{cb.evaluation.log} is on when \code{verbose > 0} and \code{watchlist} is present.
+  \item \code{cb.evaluation.log} is on when \code{watchlist} is present.
  \item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
  \item \code{cb.save.model}: when \code{save_period > 0} is set.
 }
@@ -197,12 +205,13 @@ data(agaricus.test, package='xgboost')

 dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
-watchlist <- list(eval = dtest, train = dtrain)
+watchlist <- list(train = dtrain, eval = dtest)

 ## A simple xgb.train example:
-param <- list(max_depth = 2, eta = 1, silent = 1, 
+param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2, 
              objective = "binary:logistic", eval_metric = "auc")
-bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist)
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
+

 ## An xgb.train example where custom objective and evaluation metric are used:
 logregobj <- function(preds, dtrain) {
@@ -217,18 +226,33 @@ evalerror <- function(preds, dtrain) {
  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
  return(list(metric = "error", value = err))
 }
-bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist)
+
+# These functions could be used by passing them either:
+#  as 'objective' and 'eval_metric' parameters in the params list:
+param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2, 
+              objective = logregobj, eval_metric = evalerror)
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
+
+#  or through the ... arguments:
+param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2)
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+                 objective = logregobj, eval_metric = evalerror)
+
+#  or as dedicated 'obj' and 'feval' parameters of xgb.train:
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+                 obj = logregobj, feval = evalerror)
+

 ## An xgb.train example of using variable learning rates at each iteration:
+param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
+              objective = "binary:logistic", eval_metric = "auc")
 my_etas <- list(eta = c(0.5, 0.1))
-bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist,
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
                 callbacks = list(cb.reset.parameters(my_etas)))

-## Explicit use of the cb.evaluation.log callback allows to run 
-## xgb.train silently but still store the evaluation results:
-bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist,
-                 verbose = 0, callbacks = list(cb.evaluation.log()))
-print(bst$evaluation_log)
+## Early stopping:
+bst <- xgb.train(param, dtrain, nrounds = 25, watchlist,
+                 early_stopping_rounds = 3)

 ## An 'xgboost' interface example:
 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, 
@@ -236,10 +260,13 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               objective = "binary:logistic")
 pred <- predict(bst, agaricus.test$data)

+}
+\references{
+Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System",
+22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754}
 }
 \seealso{
 \code{\link{callbacks}},
 \code{\link{predict.xgb.Booster}},
 \code{\link{xgb.cv}}
 }
-
--- a/R-package/man/xgboost-deprecated.Rd
+++ b/R-package/man/xgboost-deprecated.Rd
@@ -14,4 +14,3 @@ A deprecation warning is shown when any of the deprecated parameters is used in
 An additional warning is shown when there was a partial match to a deprecated parameter 
 (as R is able to partially match parameter names).
 }
-
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -10,9 +10,15 @@ XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
           -DDMLC_LOG_CUSTOMIZE=1 -DXGBOOST_CUSTOMIZE_LOGGER=1\
           -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_

+# disable the use of thread_local for 32 bit windows:
+ifeq ($(R_OSTYPE)$(WIN),windows)
+    XGB_RFLAGS += -DDMLC_CXX11_THREAD_LOCAL=0
+endif
+$(foreach v, $(XGB_RFLAGS), $(warning $(v)))
+
 PKG_CPPFLAGS=  -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS)
-PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
-PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
-OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o\
+PKG_CXXFLAGS= @OPENMP_CXXFLAGS@ $(SHLIB_PTHREAD_FLAGS)
+PKG_LIBS = @OPENMP_CXXFLAGS@ $(SHLIB_PTHREAD_FLAGS)
+OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o ./init.o\
         $(PKGROOT)/amalgamation/xgboost-all0.o $(PKGROOT)/amalgamation/dmlc-minimum0.o\
         $(PKGROOT)/rabit/src/engine_empty.o $(PKGROOT)/rabit/src/c_api.o
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -4,7 +4,7 @@ ENABLE_STD_THREAD=0
 # _*_ mode: Makefile; _*_

 # This file is only used for windows compilation from github
-# It will be replaced by Makevars in CRAN version
+# It will be replaced with Makevars.in for the CRAN version
 .PHONY: all xgblib
 all: $(SHLIB)
 $(SHLIB): xgblib
@@ -22,10 +22,16 @@ XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
           -DDMLC_LOG_CUSTOMIZE=1 -DXGBOOST_CUSTOMIZE_LOGGER=1\
           -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_

+# disable the use of thread_local for 32 bit windows:
+ifeq ($(R_OSTYPE)$(WIN),windows)
+    XGB_RFLAGS += -DDMLC_CXX11_THREAD_LOCAL=0
+endif
+$(foreach v, $(XGB_RFLAGS), $(warning $(v)))
+
 PKG_CPPFLAGS=  -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS)
 PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
 PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
-OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o\
+OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o ./init.o\
         $(PKGROOT)/amalgamation/xgboost-all0.o $(PKGROOT)/amalgamation/dmlc-minimum0.o\
         $(PKGROOT)/rabit/src/engine_empty.o $(PKGROOT)/rabit/src/c_api.o

--- a/R-package/src/init.c
+++ b/R-package/src/init.c
@@ -0,0 +1,77 @@
+/* Copyright (c) 2015 by Contributors
+ * 
+ * This file was initially generated using the following R command:
+ * tools::package_native_routine_registration_skeleton('.', con = 'src/init.c', character_only = F)
+ * and edited to conform to xgboost C linter requirements. For details, see
+ * https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Registering-native-routines
+ */
+#include <R.h>
+#include <Rinternals.h>
+#include <stdlib.h>
+#include <R_ext/Rdynload.h>
+
+/* FIXME: 
+Check these declarations against the C/Fortran source code.
+*/
+
+/* .Call calls */
+extern SEXP XGBoosterBoostOneIter_R(SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGBoosterCreate_R(SEXP);
+extern SEXP XGBoosterDumpModel_R(SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGBoosterEvalOneIter_R(SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGBoosterGetAttrNames_R(SEXP);
+extern SEXP XGBoosterGetAttr_R(SEXP, SEXP);
+extern SEXP XGBoosterLoadModelFromRaw_R(SEXP, SEXP);
+extern SEXP XGBoosterLoadModel_R(SEXP, SEXP);
+extern SEXP XGBoosterModelToRaw_R(SEXP);
+extern SEXP XGBoosterPredict_R(SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGBoosterSaveModel_R(SEXP, SEXP);
+extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP);
+extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP);
+extern SEXP XGBoosterUpdateOneIter_R(SEXP, SEXP, SEXP);
+extern SEXP XGCheckNullPtr_R(SEXP);
+extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP);
+extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP);
+extern SEXP XGDMatrixGetInfo_R(SEXP, SEXP);
+extern SEXP XGDMatrixNumCol_R(SEXP);
+extern SEXP XGDMatrixNumRow_R(SEXP);
+extern SEXP XGDMatrixSaveBinary_R(SEXP, SEXP, SEXP);
+extern SEXP XGDMatrixSetInfo_R(SEXP, SEXP, SEXP);
+extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP);
+
+static const R_CallMethodDef CallEntries[] = {
+  {"XGBoosterBoostOneIter_R",     (DL_FUNC) &XGBoosterBoostOneIter_R,     4},
+  {"XGBoosterCreate_R",           (DL_FUNC) &XGBoosterCreate_R,           1},
+  {"XGBoosterDumpModel_R",        (DL_FUNC) &XGBoosterDumpModel_R,        4},
+  {"XGBoosterEvalOneIter_R",      (DL_FUNC) &XGBoosterEvalOneIter_R,      4},
+  {"XGBoosterGetAttrNames_R",     (DL_FUNC) &XGBoosterGetAttrNames_R,     1},
+  {"XGBoosterGetAttr_R",          (DL_FUNC) &XGBoosterGetAttr_R,          2},
+  {"XGBoosterLoadModelFromRaw_R", (DL_FUNC) &XGBoosterLoadModelFromRaw_R, 2},
+  {"XGBoosterLoadModel_R",        (DL_FUNC) &XGBoosterLoadModel_R,        2},
+  {"XGBoosterModelToRaw_R",       (DL_FUNC) &XGBoosterModelToRaw_R,       1},
+  {"XGBoosterPredict_R",          (DL_FUNC) &XGBoosterPredict_R,          4},
+  {"XGBoosterSaveModel_R",        (DL_FUNC) &XGBoosterSaveModel_R,        2},
+  {"XGBoosterSetAttr_R",          (DL_FUNC) &XGBoosterSetAttr_R,          3},
+  {"XGBoosterSetParam_R",         (DL_FUNC) &XGBoosterSetParam_R,         3},
+  {"XGBoosterUpdateOneIter_R",    (DL_FUNC) &XGBoosterUpdateOneIter_R,    3},
+  {"XGCheckNullPtr_R",            (DL_FUNC) &XGCheckNullPtr_R,            1},
+  {"XGDMatrixCreateFromCSC_R",    (DL_FUNC) &XGDMatrixCreateFromCSC_R,    4},
+  {"XGDMatrixCreateFromFile_R",   (DL_FUNC) &XGDMatrixCreateFromFile_R,   2},
+  {"XGDMatrixCreateFromMat_R",    (DL_FUNC) &XGDMatrixCreateFromMat_R,    2},
+  {"XGDMatrixGetInfo_R",          (DL_FUNC) &XGDMatrixGetInfo_R,          2},
+  {"XGDMatrixNumCol_R",           (DL_FUNC) &XGDMatrixNumCol_R,           1},
+  {"XGDMatrixNumRow_R",           (DL_FUNC) &XGDMatrixNumRow_R,           1},
+  {"XGDMatrixSaveBinary_R",       (DL_FUNC) &XGDMatrixSaveBinary_R,       3},
+  {"XGDMatrixSetInfo_R",          (DL_FUNC) &XGDMatrixSetInfo_R,          3},
+  {"XGDMatrixSliceDMatrix_R",     (DL_FUNC) &XGDMatrixSliceDMatrix_R,     2},
+  {NULL, NULL, 0}
+};
+
+#if defined(_WIN32)
+__declspec(dllexport)
+#endif
+void R_init_xgboost(DllInfo *dll) {
+  R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+  R_useDynamicSymbols(dll, FALSE);
+}
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@@ -56,8 +56,8 @@ SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
  CHECK_CALL(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle));
  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
-  UNPROTECT(1);
  R_API_END();
+  UNPROTECT(1);
  return ret;
 }

@@ -68,53 +68,62 @@ SEXP XGDMatrixCreateFromMat_R(SEXP mat,
  SEXP dim = getAttrib(mat, R_DimSymbol);
  size_t nrow = static_cast<size_t>(INTEGER(dim)[0]);
  size_t ncol = static_cast<size_t>(INTEGER(dim)[1]);
-  double *din = REAL(mat);
+  const bool is_int = TYPEOF(mat) == INTSXP;
+  double *din;
+  int *iin;
+  if (is_int) {
+    iin = INTEGER(mat);
+  } else {
+    din = REAL(mat);
+  }
  std::vector<float> data(nrow * ncol);
  #pragma omp parallel for schedule(static)
  for (omp_ulong i = 0; i < nrow; ++i) {
    for (size_t j = 0; j < ncol; ++j) {
-      data[i * ncol +j] = din[i + nrow * j];
+      data[i * ncol +j] = is_int ? static_cast<float>(iin[i + nrow * j]) : din[i + nrow * j];
    }
  }
  DMatrixHandle handle;
  CHECK_CALL(XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing), &handle));
  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
-  UNPROTECT(1);
  R_API_END();
+  UNPROTECT(1);
  return ret;
 }

 SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
                              SEXP indices,
-                              SEXP data) {
+                              SEXP data,
+                              SEXP num_row) {
  SEXP ret;
  R_API_BEGIN();
  const int *p_indptr = INTEGER(indptr);
  const int *p_indices = INTEGER(indices);
  const double *p_data = REAL(data);
-  int nindptr = length(indptr);
-  int ndata = length(data);
-  std::vector<bst_ulong> col_ptr_(nindptr);
+  size_t nindptr = static_cast<size_t>(length(indptr));
+  size_t ndata = static_cast<size_t>(length(data));
+  size_t nrow = static_cast<size_t>(INTEGER(num_row)[0]);
+  std::vector<size_t> col_ptr_(nindptr);
  std::vector<unsigned> indices_(ndata);
  std::vector<float> data_(ndata);

-  for (int i = 0; i < nindptr; ++i) {
-    col_ptr_[i] = static_cast<bst_ulong>(p_indptr[i]);
+  for (size_t i = 0; i < nindptr; ++i) {
+    col_ptr_[i] = static_cast<size_t>(p_indptr[i]);
  }
  #pragma omp parallel for schedule(static)
-  for (int i = 0; i < ndata; ++i) {
+  for (int64_t i = 0; i < static_cast<int64_t>(ndata); ++i) {
    indices_[i] = static_cast<unsigned>(p_indices[i]);
    data_[i] = static_cast<float>(p_data[i]);
  }
  DMatrixHandle handle;
-  CHECK_CALL(XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
-                                    BeginPtr(data_), nindptr, ndata,
-                                    &handle));
+  CHECK_CALL(XGDMatrixCreateFromCSCEx(BeginPtr(col_ptr_), BeginPtr(indices_),
+                                      BeginPtr(data_), nindptr, ndata,
+                                      nrow, &handle));
  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
-  UNPROTECT(1);
  R_API_END();
+  UNPROTECT(1);
  return ret;
 }

@@ -132,8 +141,8 @@ SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
                                   &res));
  ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue));
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
-  UNPROTECT(1);
  R_API_END();
+  UNPROTECT(1);
  return ret;
 }

@@ -184,8 +193,8 @@ SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
  for (size_t i = 0; i < olen; ++i) {
    REAL(ret)[i] = res[i];
  }
-  UNPROTECT(1);
  R_API_END();
+  UNPROTECT(1);
  return ret;
 }

@@ -224,8 +233,8 @@ SEXP XGBoosterCreate_R(SEXP dmats) {
  CHECK_CALL(XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle));
  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
  R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
-  UNPROTECT(1);
  R_API_END();
+  UNPROTECT(1);
  return ret;
 }

@@ -305,8 +314,8 @@ SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_lim
  for (size_t i = 0; i < olen; ++i) {
    REAL(ret)[i] = res[i];
  }
-  UNPROTECT(1);
  R_API_END();
+  UNPROTECT(1);
  return ret;
 }

@@ -343,28 +352,45 @@ SEXP XGBoosterModelToRaw_R(SEXP handle) {
  if (olen != 0) {
    memcpy(RAW(ret), raw, olen);
  }
-  UNPROTECT(1);
  R_API_END();
+  UNPROTECT(1);
  return ret;
 }

-SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
+SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format) {
  SEXP out;
  R_API_BEGIN();
  bst_ulong olen;
  const char **res;
-  CHECK_CALL(XGBoosterDumpModel(R_ExternalPtrAddr(handle),
+  const char *fmt = CHAR(asChar(dump_format));
+  CHECK_CALL(XGBoosterDumpModelEx(R_ExternalPtrAddr(handle),
                                CHAR(asChar(fmap)),
                                asInteger(with_stats),
+                                fmt,
                                &olen, &res));
  out = PROTECT(allocVector(STRSXP, olen));
-  for (size_t i = 0; i < olen; ++i) {
+  if (!strcmp("json", fmt)) {
    std::stringstream stream;
-    stream <<  "booster[" << i <<"]\n" << res[i];
-    SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
+    stream <<  "[\n";
+    for (size_t i = 0; i < olen; ++i) {
+      stream << res[i];
+      if (i < olen - 1) {
+        stream << ",\n";
+      } else {
+        stream << "\n";
+      }
+    }
+    stream <<  "]";
+    SET_STRING_ELT(out, 0, mkChar(stream.str().c_str()));
+  } else {
+    for (size_t i = 0; i < olen; ++i) {
+      std::stringstream stream;
+      stream <<  "booster[" << i <<"]\n" << res[i];
+      SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
+    }
  }
-  UNPROTECT(1);
  R_API_END();
+  UNPROTECT(1);
  return out;
 }

@@ -383,8 +409,8 @@ SEXP XGBoosterGetAttr_R(SEXP handle, SEXP name) {
  } else {
    out = PROTECT(R_NilValue);
  }
-  UNPROTECT(1);
  R_API_END();
+  UNPROTECT(1);
  return out;
 }

@@ -412,8 +438,7 @@ SEXP XGBoosterGetAttrNames_R(SEXP handle) {
  } else {
    out = PROTECT(R_NilValue);
  }
-  UNPROTECT(1);
  R_API_END();
+  UNPROTECT(1);
  return out;
 }
-
--- a/R-package/src/xgboost_R.h
+++ b/R-package/src/xgboost_R.h
@@ -43,11 +43,13 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat,
 * \param indptr pointer to column headers
 * \param indices row indices
 * \param data content of the data
+ * \param num_row number of rows (when it's set to 0, then guess from data)
 * \return created dmatrix
 */
 XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
                                      SEXP indices,
-                                      SEXP data);
+                                      SEXP data,
+                                      SEXP num_row);

 /*!
 * \brief create a new dmatrix from sliced content of existing matrix
@@ -183,8 +185,9 @@ XGB_DLL SEXP XGBoosterModelToRaw_R(SEXP handle);
 * \param handle handle
 * \param fmap  name to fmap can be empty string
 * \param with_stats whether dump statistics of splits
+ * \param dump_format the format to dump the model in
 */
-XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats);
+XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format);

 /*!
 * \brief get learner attribute value
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -8,6 +8,11 @@ train <- agaricus.train
 test <- agaricus.test
 set.seed(1994)

+# disable some tests for Win32
+windows_flag = .Platform$OS.type == "windows" &&
+               .Machine$sizeof.pointer != 8
+solaris_flag = (Sys.info()['sysname'] == "SunOS")
+
 test_that("train and predict binary classification", {
  nrounds = 2
  expect_output(
@@ -107,7 +112,7 @@ test_that("train and predict RF with softprob", {
  set.seed(11)
  bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
                 max_depth = 3, eta = 0.9, nthread = 2, nrounds = nrounds,
-                 objective = "multi:softprob", num_class=3,
+                 objective = "multi:softprob", num_class=3, verbose = 0,
                 num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5)
  expect_equal(bst$niter, 15)
  expect_equal(xgb.ntree(bst), 15*3*4)
@@ -142,33 +147,38 @@ test_that("training continuation works", {

  # for the reference, use 4 iterations at once:
  set.seed(11)
-  bst <- xgb.train(param, dtrain, nrounds = 4, watchlist)
+  bst <- xgb.train(param, dtrain, nrounds = 4, watchlist, verbose = 0)
  # first two iterations:
  set.seed(11)
-  bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist)
+  bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
  # continue for two more:
-  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, xgb_model = bst1)
-  expect_equal(bst$raw, bst2$raw)
+  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
+  if (!windows_flag && !solaris_flag)
+    expect_equal(bst$raw, bst2$raw)
  expect_false(is.null(bst2$evaluation_log))
  expect_equal(dim(bst2$evaluation_log), c(4, 2))
  expect_equal(bst2$evaluation_log, bst$evaluation_log)
  # test continuing from raw model data
-  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, xgb_model = bst1$raw)
-  expect_equal(bst$raw, bst2$raw)
+  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw)
+  if (!windows_flag && !solaris_flag)
+    expect_equal(bst$raw, bst2$raw)
  expect_equal(dim(bst2$evaluation_log), c(2, 2))
  # test continuing from a model in file
  xgb.save(bst1, "xgboost.model")
-  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, xgb_model = "xgboost.model")
-  expect_equal(bst$raw, bst2$raw)
+  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.model")
+  if (!windows_flag && !solaris_flag)
+    expect_equal(bst$raw, bst2$raw)
  expect_equal(dim(bst2$evaluation_log), c(2, 2))
 })


 test_that("xgb.cv works", {
  set.seed(11)
-  cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5,
-               eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
-               verbose=TRUE)
+  expect_output(
+    cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5,
+                 eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
+                 verbose=TRUE)
+  , "train-error:")
  expect_is(cv, 'xgb.cv.synchronous')
  expect_false(is.null(cv$evaluation_log))
  expect_lt(cv$evaluation_log[, min(test_error_mean)], 0.03)
@@ -180,3 +190,36 @@ test_that("xgb.cv works", {
  expect_false(is.null(cv$callbacks))
  expect_false(is.null(cv$call))
 })
+
+test_that("train and predict with non-strict classes", {
+  # standard dense matrix input
+  train_dense <- as.matrix(train$data)
+  bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
+                 eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
+  pr0 <- predict(bst, train_dense)
+  
+  # dense matrix-like input of non-matrix class
+  class(train_dense) <- 'shmatrix'
+  expect_true(is.matrix(train_dense))
+  expect_error(
+    bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
+                   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
+    , regexp = NA)
+  expect_error(pr <- predict(bst, train_dense), regexp = NA)
+  expect_equal(pr0, pr)
+  
+  # dense matrix-like input of non-matrix class with some inheritance
+  class(train_dense) <- c('pphmatrix','shmatrix')
+  expect_true(is.matrix(train_dense))
+  expect_error(
+    bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
+                   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
+    , regexp = NA)
+  expect_error(pr <- predict(bst, train_dense), regexp = NA)
+  expect_equal(pr0, pr)
+  
+  # when someone inherits from xgb.Booster, it should still be possible to use it as xgb.Booster
+  class(bst) <- c('super.Booster', 'xgb.Booster')
+  expect_error(pr <- predict(bst, train_dense), regexp = NA)
+  expect_equal(pr0, pr)
+})
--- a/R-package/tests/testthat/test_callbacks.R
+++ b/R-package/tests/testthat/test_callbacks.R
@@ -107,18 +107,27 @@ test_that("cb.evaluation.log works as expected", {

 param <- list(objective = "binary:logistic", max_depth = 4, nthread = 2)

+test_that("can store evaluation_log without printing", {
+  expect_silent(
+    bst <- xgb.train(param, dtrain, nrounds = 10, watchlist, eta = 1, verbose = 0)
+  )
+  expect_false(is.null(bst$evaluation_log))
+  expect_false(is.null(bst$evaluation_log$train_error))
+  expect_lt(bst$evaluation_log[, min(train_error)], 0.2)
+})
+
 test_that("cb.reset.parameters works as expected", {

  # fixed eta
  set.seed(111)
-  bst0 <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 0.9)
+  bst0 <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 0.9, verbose = 0)
  expect_false(is.null(bst0$evaluation_log))
  expect_false(is.null(bst0$evaluation_log$train_error))

  # same eta but re-set as a vector parameter in the callback
  set.seed(111)
  my_par <- list(eta = c(0.9, 0.9))
-  bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+  bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
                    callbacks = list(cb.reset.parameters(my_par)))
  expect_false(is.null(bst1$evaluation_log$train_error))
  expect_equal(bst0$evaluation_log$train_error, 
@@ -127,7 +136,7 @@ test_that("cb.reset.parameters works as expected", {
  # same eta but re-set via a function in the callback
  set.seed(111)
  my_par <- list(eta = function(itr, itr_end) 0.9)
-  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
                    callbacks = list(cb.reset.parameters(my_par)))
  expect_false(is.null(bst2$evaluation_log$train_error))
  expect_equal(bst0$evaluation_log$train_error, 
@@ -136,7 +145,7 @@ test_that("cb.reset.parameters works as expected", {
  # different eta re-set as a vector parameter in the callback
  set.seed(111)
  my_par <- list(eta = c(0.6, 0.5))
-  bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+  bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
                    callbacks = list(cb.reset.parameters(my_par)))
  expect_false(is.null(bst3$evaluation_log$train_error))
  expect_false(all(bst0$evaluation_log$train_error == bst3$evaluation_log$train_error))
@@ -144,13 +153,18 @@ test_that("cb.reset.parameters works as expected", {
  # resetting multiple parameters at the same time runs with no error
  my_par <- list(eta = c(1., 0.5), gamma = c(1, 2), max_depth = c(4, 8))
  expect_error(
-    bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+    bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
                      callbacks = list(cb.reset.parameters(my_par)))
  , NA) # NA = no error
+  # CV works as well
+  expect_error(
+    bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2, verbose = 0,
+                   callbacks = list(cb.reset.parameters(my_par)))
+  , NA) # NA = no error

  # expect no learning with 0 learning rate
  my_par <- list(eta = c(0., 0.))
-  bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist, 
+  bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
                    callbacks = list(cb.reset.parameters(my_par)))
  expect_false(is.null(bstX$evaluation_log$train_error))
  er <- unique(bstX$evaluation_log$train_error)
@@ -162,7 +176,7 @@ test_that("cb.save.model works as expected", {
  files <- c('xgboost_01.model', 'xgboost_02.model', 'xgboost.model')
  for (f in files) if (file.exists(f)) file.remove(f)
  
-  bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1,
+  bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,
                   save_period = 1, save_name = "xgboost_%02d.model")
  expect_true(file.exists('xgboost_01.model'))
  expect_true(file.exists('xgboost_02.model'))
@@ -173,7 +187,8 @@ test_that("cb.save.model works as expected", {
  expect_equal(bst$raw, b2$raw)

  # save_period = 0 saves the last iteration's model
-  bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, save_period = 0)
+  bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,
+                   save_period = 0)
  expect_true(file.exists('xgboost.model'))
  b2 <- xgb.load('xgboost.model')
  expect_equal(bst$raw, b2$raw)
@@ -181,16 +196,6 @@ test_that("cb.save.model works as expected", {
  for (f in files) if (file.exists(f)) file.remove(f)
 })

-test_that("can store evaluation_log without printing", {
-  expect_silent(
-    bst <- xgb.train(param, dtrain, nrounds = 10, watchlist, eta = 1,
-                     verbose = 0, callbacks = list(cb.evaluation.log()))
-  )
-  expect_false(is.null(bst$evaluation_log))
-  expect_false(is.null(bst$evaluation_log$train_error))
-  expect_lt(bst$evaluation_log[, min(train_error)], 0.2)
-})
-
 test_that("early stopping xgb.train works", {
  set.seed(11)
  expect_output(
@@ -206,6 +211,13 @@ test_that("early stopping xgb.train works", {
  err_pred <- err(ltest, pred)
  err_log <- bst$evaluation_log[bst$best_iteration, test_error]
  expect_equal(err_log, err_pred, tolerance = 5e-6)
+  
+  set.seed(11)
+  expect_silent(
+    bst0 <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3,
+                      early_stopping_rounds = 3, maximize = FALSE, verbose = 0)
+  )
+  expect_equal(bst$evaluation_log, bst0$evaluation_log)
 })

 test_that("early stopping using a specific metric works", {
@@ -243,7 +255,7 @@ test_that("early stopping xgb.cv works", {
 test_that("prediction in xgb.cv works", {
  set.seed(11)
  nrounds = 4
-  cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE)
+  cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0)
  expect_false(is.null(cv$evaluation_log))
  expect_false(is.null(cv$pred))
  expect_length(cv$pred, nrow(train$data))
@@ -253,13 +265,22 @@ test_that("prediction in xgb.cv works", {

  # save CV models
  set.seed(11)
-  cvx <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE,
+  cvx <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0,
                callbacks = list(cb.cv.predict(save_models = TRUE)))
  expect_equal(cv$evaluation_log, cvx$evaluation_log)
  expect_length(cvx$models, 5)
  expect_true(all(sapply(cvx$models, class) == 'xgb.Booster'))
 })

+test_that("prediction in xgb.cv works for gblinear too", {
+  set.seed(11)
+  p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = 2)
+  cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE, verbose = 0)
+  expect_false(is.null(cv$evaluation_log))
+  expect_false(is.null(cv$pred))
+  expect_length(cv$pred, nrow(train$data))
+})
+
 test_that("prediction in early-stopping xgb.cv works", {
  set.seed(1)
  expect_output(
@@ -286,7 +307,7 @@ test_that("prediction in xgb.cv for softprob works", {
  expect_warning(
    cv <- xgb.cv(data = as.matrix(iris[, -5]), label = lb, nfold = 4,
                 eta = 0.5, nrounds = 5, max_depth = 3, nthread = 2,
-                 subsample = 0.8, gamma = 2,
+                 subsample = 0.8, gamma = 2, verbose = 0,
                 prediction = TRUE, objective = "multi:softprob", num_class = 3)
  , NA)
  expect_false(is.null(cv$pred))
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@@ -1,4 +1,5 @@
 require(xgboost)
+require(Matrix)

 context("testing xgb.DMatrix functionality")

@@ -6,20 +7,41 @@ data(agaricus.test, package='xgboost')
 test_data <- agaricus.test$data[1:100,]
 test_label <- agaricus.test$label[1:100]

-test_that("xgb.DMatrix: basic construction, saving, loading", {
+test_that("xgb.DMatrix: basic construction", {
  # from sparse matrix
  dtest1 <- xgb.DMatrix(test_data, label=test_label)
+  
  # from dense matrix 
  dtest2 <- xgb.DMatrix(as.matrix(test_data), label=test_label)
  expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label'))
+  expect_equal(dim(dtest1), dim(dtest2))
  
+  # from dense integer matrix
+  int_data <- as.matrix(test_data)
+  storage.mode(int_data) <- "integer"
+  dtest3 <- xgb.DMatrix(int_data, label=test_label)
+  expect_equal(dim(dtest1), dim(dtest3))
+})
+
+test_that("xgb.DMatrix: saving, loading", {
  # save to a local file
+  dtest1 <- xgb.DMatrix(test_data, label=test_label)
  tmp_file <- tempfile('xgb.DMatrix_')
  expect_true(xgb.DMatrix.save(dtest1, tmp_file))
  # read from a local file
-  dtest3 <- xgb.DMatrix(tmp_file)
+  expect_output(dtest3 <- xgb.DMatrix(tmp_file), "entries loaded from")
+  expect_output(dtest3 <- xgb.DMatrix(tmp_file, silent = TRUE), NA)
  unlink(tmp_file)
  expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label'))
+  
+  # from a libsvm text file
+  tmp <- c("0 1:1 2:1","1 3:1","0 1:1")
+  tmp_file <- 'tmp.libsvm'
+  writeLines(tmp, tmp_file)
+  dtest4 <- xgb.DMatrix(tmp_file, silent = TRUE)
+  expect_equal(dim(dtest4), c(3, 4))
+  expect_equal(getinfo(dtest4, 'label'), c(0,1,0))
+  unlink(tmp_file)
 })

 test_that("xgb.DMatrix: getinfo & setinfo", {
@@ -65,3 +87,13 @@ test_that("xgb.DMatrix: colnames", {
  expect_silent(colnames(dtest) <- NULL)
  expect_null(colnames(dtest))
 })
+
+test_that("xgb.DMatrix: nrow is correct for a very sparse matrix", {
+  set.seed(123)
+  nr <- 1000
+  x <- rsparsematrix(nr, 100, density=0.0005)
+  # we want it very sparse, so that last rows are empty
+  expect_lt(max(x@i), nr)
+  dtest <- xgb.DMatrix(x)
+  expect_equal(dim(dtest), dim(x))
+})
--- a/R-package/tests/testthat/test_gc_safety.R
+++ b/R-package/tests/testthat/test_gc_safety.R
@@ -0,0 +1,15 @@
+require(xgboost)
+
+context("Garbage Collection Safety Check")
+
+test_that("train and prediction when gctorture is on", {
+  data(agaricus.train, package='xgboost')
+  data(agaricus.test, package='xgboost')
+  train <- agaricus.train
+  test <- agaricus.test
+  gctorture(TRUE)
+  bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+  eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+  pred <- predict(bst, test$data)
+  gctorture(FALSE)
+})
--- a/R-package/tests/testthat/test_glm.R
+++ b/R-package/tests/testthat/test_glm.R
@@ -2,18 +2,47 @@ context('Test generalized linear models')

 require(xgboost)

-test_that("glm works", {
+test_that("gblinear works", {
  data(agaricus.train, package='xgboost')
  data(agaricus.test, package='xgboost')
  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
  dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
-  expect_equal(class(dtrain), "xgb.DMatrix")
-  expect_equal(class(dtest), "xgb.DMatrix")
+
  param <- list(objective = "binary:logistic", booster = "gblinear",
-                nthread = 2, alpha = 0.0001, lambda = 1)
+                nthread = 2, eta = 0.8, alpha = 0.0001, lambda = 0.0001)
  watchlist <- list(eval = dtest, train = dtrain)
-  num_round <- 2
-  bst <- xgb.train(param, dtrain, num_round, watchlist)
+
+  n <- 5         # iterations
+  ERR_UL <- 0.005 # upper limit for the test set error
+  VERB <- 0      # chatterbox switch
+
+  param$updater = 'shotgun'
+  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle')
  ypred <- predict(bst, dtest)
  expect_equal(length(getinfo(dtest, 'label')), 1611)
+  expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
+
+  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic',
+                   callbacks = list(cb.gblinear.history()))
+  expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
+  h <- xgb.gblinear.history(bst)
+  expect_equal(dim(h), c(n, ncol(dtrain) + 1))
+  expect_is(h, "matrix")
+
+  param$updater = 'coord_descent'
+  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic')
+  expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
+
+  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle')
+  expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
+
+  bst <- xgb.train(param, dtrain, 2, watchlist, verbose = VERB, feature_selector = 'greedy')
+  expect_lt(bst$evaluation_log$eval_error[2], ERR_UL)
+
+  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'thrifty',
+                   top_n = 50, callbacks = list(cb.gblinear.history(sparse = TRUE)))
+  expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
+  h <- xgb.gblinear.history(bst)
+  expect_equal(dim(h), c(n, ncol(dtrain) + 1))
+  expect_s4_class(h, "dgCMatrix")
 })
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -3,7 +3,9 @@ context('Test helper functions')
 require(xgboost)
 require(data.table)
 require(Matrix)
-require(vcd)
+require(vcd, quietly = TRUE)
+
+float_tolerance = 5e-6

 set.seed(1982)
 data(Arthritis)
@@ -14,30 +16,124 @@ df[,ID := NULL]
 sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
 label <- df[, ifelse(Improved == "Marked", 1, 0)]

+# binary
+nrounds <- 12
 bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9,
-               eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic", booster = "gbtree")
+                    eta = 1, nthread = 2, nrounds = nrounds, verbose = 0,
+                    objective = "binary:logistic", booster = "gbtree")

 bst.GLM <- xgboost(data = sparse_matrix, label = label,
-                   eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic", booster = "gblinear")
+                   eta = 1, nthread = 1, nrounds = nrounds, verbose = 0,
+                   objective = "binary:logistic", booster = "gblinear")

 feature.names <- colnames(sparse_matrix)

+# multiclass
+mlabel <- as.numeric(iris$Species) - 1
+nclass <- 3
+mbst.Tree <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
+                     max_depth = 3, eta = 0.5, nthread = 2, nrounds = nrounds,
+                     objective = "multi:softprob", num_class = nclass, base_score = 0)
+
+mbst.GLM <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
+                    booster = "gblinear", eta = 0.1, nthread = 1, nrounds = nrounds,
+                    objective = "multi:softprob", num_class = nclass, base_score = 0)
+
+
 test_that("xgb.dump works", {
-  expect_length(xgb.dump(bst.Tree), 172)
+  expect_length(xgb.dump(bst.Tree), 200)
  expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
  expect_true(file.exists('xgb.model.dump'))
  expect_gt(file.size('xgb.model.dump'), 8000)
+
+  # JSON format
+  dmp <- xgb.dump(bst.Tree, dump_format = "json")
+  expect_length(dmp, 1)
+  expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 188)
 })

 test_that("xgb.dump works for gblinear", {
  expect_length(xgb.dump(bst.GLM), 14)
-  # also make sure that it works properly for a sparse model where some coefficients 
+  # also make sure that it works properly for a sparse model where some coefficients
  # are 0 from setting large L1 regularization:
-  bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1, 
+  bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1,
                        alpha=2, objective = "binary:logistic", booster = "gblinear")
  d.sp <- xgb.dump(bst.GLM.sp)
  expect_length(d.sp, 14)
  expect_gt(sum(d.sp == "0"), 0)
+
+  # JSON format
+  dmp <- xgb.dump(bst.GLM.sp, dump_format = "json")
+  expect_length(dmp, 1)
+  expect_length(grep('\\d', strsplit(dmp, '\n')[[1]]), 11)
+})
+
+test_that("predict leafs works", {
+  # no error for gbtree
+  expect_error(pred_leaf <- predict(bst.Tree, sparse_matrix, predleaf = TRUE), regexp = NA)
+  expect_equal(dim(pred_leaf), c(nrow(sparse_matrix), nrounds))
+  # error for gblinear
+  expect_error(predict(bst.GLM, sparse_matrix, predleaf = TRUE))
+})
+
+test_that("predict feature contributions works", {
+  # gbtree binary classifier
+  expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE), regexp = NA)
+  expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
+  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
+  pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
+  expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
+  # must work with data that has no column names
+  X <- sparse_matrix
+  colnames(X) <- NULL
+  expect_error(pred_contr_ <- predict(bst.Tree, X, predcontrib = TRUE), regexp = NA)
+  expect_equal(pred_contr, pred_contr_, check.attributes = FALSE,
+               tolerance = float_tolerance)
+
+  # gbtree binary classifier (approximate method)
+  expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE, approxcontrib = TRUE), regexp = NA)
+  expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
+  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
+  pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
+  expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
+
+  # gblinear binary classifier
+  expect_error(pred_contr <- predict(bst.GLM, sparse_matrix, predcontrib = TRUE), regexp = NA)
+  expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
+  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
+  pred <- predict(bst.GLM, sparse_matrix, outputmargin = TRUE)
+  expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
+  # manual calculation of linear terms
+  coefs <- xgb.dump(bst.GLM)[-c(1,2,4)] %>% as.numeric
+  coefs <- c(coefs[-1], coefs[1]) # intercept must be the last
+  pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN="*")
+  expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual),
+               tolerance = float_tolerance)
+
+  # gbtree multiclass
+  pred <- predict(mbst.Tree, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
+  pred_contr <- predict(mbst.Tree, as.matrix(iris[, -5]), predcontrib = TRUE)
+  expect_is(pred_contr, "list")
+  expect_length(pred_contr, 3)
+  for (g in seq_along(pred_contr)) {
+    expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
+    expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 1e-5)
+  }
+
+  # gblinear multiclass (set base_score = 0, which is base margin in multiclass)
+  pred <- predict(mbst.GLM, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
+  pred_contr <- predict(mbst.GLM, as.matrix(iris[, -5]), predcontrib = TRUE)
+  expect_length(pred_contr, 3)
+  coefs_all <- xgb.dump(mbst.GLM)[-c(1,2,6)] %>% as.numeric %>% matrix(ncol = 3, byrow = TRUE)
+  for (g in seq_along(pred_contr)) {
+    expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
+    expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), float_tolerance)
+    # manual calculation of linear terms
+    coefs <- c(coefs_all[-1, g], coefs_all[1, g]) # intercept needs to be the last
+    pred_contr_manual <- sweep(as.matrix(cbind(iris[,-5], 1)), 2, coefs, FUN="*")
+    expect_equal(as.numeric(pred_contr[[g]]), as.numeric(pred_contr_manual),
+                 tolerance = float_tolerance)
+  }
 })

 test_that("xgb-attribute functionality", {
@@ -46,7 +142,7 @@ test_that("xgb-attribute functionality", {
  list.ch <- list.val[order(names(list.val))]
  list.ch <- lapply(list.ch, as.character)
  # note: iter is 0-index in xgb attributes
-  list.default <- list(niter = "9")
+  list.default <- list(niter = as.character(nrounds - 1))
  list.ch <- c(list.ch, list.default)
  # proper input:
  expect_error(xgb.attr(bst.Tree, NULL))
@@ -73,22 +169,94 @@ test_that("xgb-attribute functionality", {
  expect_null(xgb.attributes(bst))
 })

+# Only exercised on the mainstream platforms (Windows, Linux, macOS),
+# which are recognised by a substring match on the reported OS name.
+if (grepl('Windows|Linux|Darwin', Sys.info()[['sysname']])) {
+    test_that("xgb-attribute numeric precision", {
+      # numeric -> character -> numeric must round-trip losslessly
+      # when the values are formatted with 17 significant digits
+      vals <- 10^runif(100, -20, 20)
+      if (capabilities('long.double')) {
+          roundtrip <- as.numeric(format(vals, digits = 17))
+          expect_identical(vals, roundtrip)
+      }
+      # an attribute read back from the booster must match the value written
+      for (v in vals) {
+        xgb.attr(bst.Tree, "x") <- v
+        expect_equal(as.numeric(xgb.attr(bst.Tree, "x")), v, tolerance = float_tolerance)
+        xgb.attributes(bst.Tree) <- list(a = "A", b = v)
+        expect_equal(as.numeric(xgb.attr(bst.Tree, "b")), v, tolerance = float_tolerance)
+      }
+    })
+}
+
+test_that("xgb.Booster serializing as R object works", {
+  saveRDS(bst.Tree, 'xgb.model.rds')
+  bst <- readRDS('xgb.model.rds')
+  dtrain <- xgb.DMatrix(sparse_matrix, label = label)
+  expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
+  expect_equal(xgb.dump(bst.Tree), xgb.dump(bst))
+  xgb.save(bst, 'xgb.model')
+  nil_ptr <- new("externalptr")
+  class(nil_ptr) <- "xgb.Booster.handle"
+  expect_true(identical(bst$handle, nil_ptr))
+  bst <- xgb.Booster.complete(bst)
+  expect_true(!identical(bst$handle, nil_ptr))
+  expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
+})
+
 test_that("xgb.model.dt.tree works with and without feature names", {
  names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
  dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
  expect_equal(names.dt.trees, names(dt.tree))
-  expect_equal(dim(dt.tree), c(162, 10))
-  expect_output(str(xgb.model.dt.tree(model = bst.Tree)), 'Feature.*\\"3\\"')
+  expect_equal(dim(dt.tree), c(188, 10))
+  expect_output(str(dt.tree), 'Feature.*\\"Age\\"')
+
+  dt.tree.0 <- xgb.model.dt.tree(model = bst.Tree)
+  expect_equal(dt.tree, dt.tree.0)
+
+  # when model contains no feature names:
+  bst.Tree.x <- bst.Tree
+  bst.Tree.x$feature_names <- NULL
+  dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x)
+  expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
+  expect_equal(dt.tree[, -4, with=FALSE], dt.tree.x[, -4, with=FALSE])
+
+  # using integer node ID instead of character
+  dt.tree.int <- xgb.model.dt.tree(model = bst.Tree, use_int_id = TRUE)
+  expect_equal(as.integer(tstrsplit(dt.tree$Yes, '-')[[2]]), dt.tree.int$Yes)
+  expect_equal(as.integer(tstrsplit(dt.tree$No, '-')[[2]]), dt.tree.int$No)
+  expect_equal(as.integer(tstrsplit(dt.tree$Missing, '-')[[2]]), dt.tree.int$Missing)
+})
+
+test_that("xgb.model.dt.tree throws error for gblinear", {
+  expect_error(xgb.model.dt.tree(model = bst.GLM))
 })

 test_that("xgb.importance works with and without feature names", {
  importance.Tree <- xgb.importance(feature_names = feature.names, model = bst.Tree)
  expect_equal(dim(importance.Tree), c(7, 4))
  expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
-  expect_output(str(xgb.importance(model = bst.Tree)), 'Feature.*\\"3\\"')
+  expect_output(str(importance.Tree), 'Feature.*\\"Age\\"')
+
+  importance.Tree.0 <- xgb.importance(model = bst.Tree)
+  expect_equal(importance.Tree, importance.Tree.0, tolerance = float_tolerance)
+
+  # when the model contains no feature names, all columns except Feature must still agree:
+  bst.Tree.x <- bst.Tree
+  bst.Tree.x$feature_names <- NULL
+  importance.Tree.x <- xgb.importance(model = bst.Tree.x)
+  expect_equal(importance.Tree[, -1, with=FALSE], importance.Tree.x[, -1, with=FALSE],
+               tolerance = float_tolerance)
+
  imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree)
  expect_equal(colnames(imp2plot), c("Feature", "Gain", "Cover", "Frequency", "Importance"))
  xgb.ggplot.importance(importance_matrix = importance.Tree)
+
+  # for multiclass: importance over all trees, then restricted to every nclass-th tree
+  imp.Tree <- xgb.importance(model = mbst.Tree)
+  expect_equal(dim(imp.Tree), c(4, 4))
+  xgb.importance(model = mbst.Tree, trees = seq(from=0, by=nclass, length.out=nrounds))
 })

 test_that("xgb.importance works with GLM model", {
@@ -99,6 +267,22 @@ test_that("xgb.importance works with GLM model", {
  imp2plot <- xgb.plot.importance(importance.GLM)
  expect_equal(colnames(imp2plot), c("Feature", "Weight", "Importance"))
  xgb.ggplot.importance(importance.GLM)
+
+  # for multiclass
+  imp.GLM <- xgb.importance(model = mbst.GLM)
+  expect_equal(dim(imp.GLM), c(12, 3))
+  expect_equal(imp.GLM$Class, rep(0:2, each=4))
+})
+
+test_that("xgb.model.dt.tree and xgb.importance work with a single split model", {
+  bst1 <- xgboost(data = sparse_matrix, label = label, max_depth = 1,
+                  eta = 1, nthread = 2, nrounds = 1, verbose = 0,
+                  objective = "binary:logistic")
+  expect_error(dt <- xgb.model.dt.tree(model = bst1), regexp = NA) # no error
+  expect_equal(nrow(dt), 3)
+  expect_error(imp <- xgb.importance(model = bst1), regexp = NA) # no error
+  expect_equal(nrow(imp), 1)
+  expect_equal(imp$Gain, 1)
 })

 test_that("xgb.plot.tree works with and without feature names", {
@@ -118,6 +302,13 @@ test_that("xgb.plot.deepness works", {
  xgb.ggplot.deepness(model = bst.Tree)
 })

+test_that("xgb.plot.shap works", {
+  sh <- xgb.plot.shap(data = sparse_matrix, model = bst.Tree, top_n = 2, col = 4)
+  expect_equal(names(sh), c("data", "shap_contrib"))
+  expect_equal(NCOL(sh$data), 2)
+  expect_equal(NCOL(sh$shap_contrib), 2)
+})
+
 test_that("check.deprecation works", {
  ttt <- function(a = NNULL, DUMMY=NULL, ...) {
    check.deprecation(...)
--- a/R-package/tests/testthat/test_monotone.R
+++ b/R-package/tests/testthat/test_monotone.R
@@ -0,0 +1,24 @@
+require(xgboost)
+
+context("monotone constraints")
+
+set.seed(1024)
+x = rnorm(1000, 10)
+y = -1*x + rnorm(1000, 0.001) + 3*sin(x)
+train = matrix(x, ncol = 1)
+
+
+test_that("monotone constraints for regression", {
+  bst = xgboost(data = train, label = y, max_depth = 2,
+                eta = 0.1, nthread = 2, nrounds = 100, verbose = 0,
+                monotone_constraints = -1)
+  # a constraint of -1 asks for predictions that never increase with the single feature
+  pred = predict(bst, train)
+  # order the predictions by the feature value before checking successive differences
+  ind = order(train[,1])
+  pred.ord = pred[ind]
+  expect_true({
+    !any(diff(pred.ord) > 0)
+  }, "Monotone Constraint Satisfied")
+  
+})
--- a/R-package/tests/testthat/test_update.R
+++ b/R-package/tests/testthat/test_update.R
@@ -0,0 +1,97 @@
+require(xgboost)
+
+context("update trees in an existing model")
+
+data(agaricus.train, package = 'xgboost')
+data(agaricus.test, package = 'xgboost')
+dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+
+test_that("updating the model works", {
+  watchlist = list(train = dtrain, test = dtest)
+
+  # no-subsampling
+  p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
+  set.seed(11)
+  bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0)
+  tr1 <- xgb.model.dt.tree(model = bst1)
+  
+  # with subsampling
+  p2 <- modifyList(p1, list(subsample = 0.1))
+  set.seed(11)
+  bst2 <- xgb.train(p2, dtrain, nrounds = 10, watchlist, verbose = 0)
+  tr2 <- xgb.model.dt.tree(model = bst2)
+  
+  # the same no-subsampling boosting with an extra 'refresh' updater:
+  p1r <- modifyList(p1, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE))
+  set.seed(11)
+  bst1r <- xgb.train(p1r, dtrain, nrounds = 10, watchlist, verbose = 0)
+  tr1r <- xgb.model.dt.tree(model = bst1r)
+  # all should be the same when no subsampling
+  expect_equal(bst1$evaluation_log, bst1r$evaluation_log)
+  expect_equal(tr1, tr1r, tolerance = 0.00001, check.attributes = FALSE)
+
+  # the same boosting with subsampling with an extra 'refresh' updater:
+  p2r <- modifyList(p2, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE))
+  set.seed(11)
+  bst2r <- xgb.train(p2r, dtrain, nrounds = 10, watchlist, verbose = 0)
+  tr2r <- xgb.model.dt.tree(model = bst2r)
+  # should be the same evaluation but different gains and larger cover
+  expect_equal(bst2$evaluation_log, bst2r$evaluation_log)
+  expect_equal(tr2[Feature == 'Leaf']$Quality, tr2r[Feature == 'Leaf']$Quality)
+  expect_gt(sum(abs(tr2[Feature != 'Leaf']$Quality - tr2r[Feature != 'Leaf']$Quality)), 100)
+  expect_gt(sum(tr2r$Cover) / sum(tr2$Cover), 1.5)
+
+  # process type 'update' for no-subsampling model, refreshing the tree stats AND leaves from training data:
+  p1u <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = TRUE))
+  bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1)
+  tr1u <- xgb.model.dt.tree(model = bst1u)
+  # all should be the same when no subsampling
+  expect_equal(bst1$evaluation_log, bst1u$evaluation_log)
+  expect_equal(tr1, tr1u, tolerance = 0.00001, check.attributes = FALSE)
+  
+  # process type 'update' for model with subsampling, refreshing only the tree stats from training data:
+  p2u <- modifyList(p2, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
+  bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst2)
+  tr2u <- xgb.model.dt.tree(model = bst2u)
+  # should be the same evaluation but different gains and larger cover
+  expect_equal(bst2$evaluation_log, bst2u$evaluation_log)
+  expect_equal(tr2[Feature == 'Leaf']$Quality, tr2u[Feature == 'Leaf']$Quality)
+  expect_gt(sum(abs(tr2[Feature != 'Leaf']$Quality - tr2u[Feature != 'Leaf']$Quality)), 100)
+  expect_gt(sum(tr2u$Cover) / sum(tr2$Cover), 1.5)
+  # the results should be the same as for the model with an extra 'refresh' updater
+  expect_equal(bst2r$evaluation_log, bst2u$evaluation_log)
+  expect_equal(tr2r, tr2u, tolerance = 0.00001, check.attributes = FALSE)
+  
+  # process type 'update' for no-subsampling model, refreshing only the tree stats from TEST data:
+  p1ut <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
+  bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1)
+  tr1ut <- xgb.model.dt.tree(model = bst1ut)
+  # should be the same evaluations but different gains and smaller cover (test data is smaller)
+  expect_equal(bst1$evaluation_log, bst1ut$evaluation_log)
+  expect_equal(tr1[Feature == 'Leaf']$Quality, tr1ut[Feature == 'Leaf']$Quality)
+  expect_gt(sum(abs(tr1[Feature != 'Leaf']$Quality - tr1ut[Feature != 'Leaf']$Quality)), 100)
+  expect_lt(sum(tr1ut$Cover) / sum(tr1$Cover), 0.5)
+})
+
+test_that("updating works for multiclass & multitree", {
+  dtr <- xgb.DMatrix(as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1)
+  watchlist <- list(train = dtr)
+  p0 <- list(max_depth = 2, eta = 0.5, nthread = 2, subsample = 0.6,
+             objective = "multi:softprob", num_class = 3, num_parallel_tree = 2,
+             base_score = 0)
+  set.seed(121)
+  bst0 <- xgb.train(p0, dtr, 5, watchlist, verbose = 0)
+  tr0 <- xgb.model.dt.tree(model = bst0)
+  
+  # run update process for an original model with subsampling
+  p0u <- modifyList(p0, list(process_type='update', updater='refresh', refresh_leaf=FALSE))
+  bst0u <- xgb.train(p0u, dtr, nrounds = bst0$niter, watchlist, xgb_model = bst0, verbose = 0)
+  tr0u <- xgb.model.dt.tree(model = bst0u)
+  
+  # should be the same evaluation but different gains and larger cover
+  expect_equal(bst0$evaluation_log, bst0u$evaluation_log)
+  expect_equal(tr0[Feature == 'Leaf']$Quality, tr0u[Feature == 'Leaf']$Quality)
+  expect_gt(sum(abs(tr0[Feature != 'Leaf']$Quality - tr0u[Feature != 'Leaf']$Quality)), 100)
+  expect_gt(sum(tr0u$Cover) / sum(tr0$Cover), 1.5)
+})
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -5,7 +5,7 @@ output:
    css: vignette.css
    number_sections: yes
    toc: yes
-author: Tianqi Chen, Tong He, Michaël Benesty
+author: Tianqi Chen, Tong He, Michaël Benesty, Yuan Tang
 vignette: >
  %\VignetteIndexEntry{Discover your data}
  %\VignetteEngine{knitr::rmarkdown}
@@ -18,11 +18,11 @@ Understand your dataset with XGBoost
 Introduction
 ------------

-The purpose of this Vignette is to show you how to use **Xgboost** to discover and understand your own dataset better.
+The purpose of this vignette is to show you how to use **Xgboost** to discover and understand your own dataset better.

-This Vignette is not about predicting anything (see [Xgboost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). We will explain how to use **Xgboost** to highlight the *link* between the *features* of your data and the *outcome*.
+This vignette is not about predicting anything (see [Xgboost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). We will explain how to use **Xgboost** to highlight the *link* between the *features* of your data and the *outcome*.

-Pacakge loading:
+Package loading:

 ```{r libLoading, results='hold', message=F, warning=F}
 require(xgboost)
@@ -36,7 +36,7 @@ if (!require('vcd')) install.packages('vcd')
 Preparation of the dataset
 --------------------------

-### Numeric VS categorical variables
+### Numeric vs. categorical variables


 **Xgboost** manages only `numeric` vectors.
@@ -68,7 +68,7 @@ df <- data.table(Arthritis, keep.rownames = F)

 > `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](http://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **Xgboost** **R** package use `data.table`.

-The first thing we want to do is to have a look to the first lines of the `data.table`:
+The first thing we want to do is to have a look at the first few lines of the `data.table`:

 ```{r}
 head(df)
@@ -103,9 +103,9 @@ Therefore, 20 is not closer to 30 than 60. To make it short, the distance betwee
 head(df[,AgeDiscret := as.factor(round(Age/10,0))])
 ```

-##### Random split in two groups
+##### Random split into two groups

-Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...).
+Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. We choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...).

 ```{r}
 head(df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))])
@@ -134,23 +134,24 @@ levels(df[,Treatment])
 ```


-#### One-hot encoding
+#### Encoding categorical features

 Next step, we will transform the categorical data to dummy variables.
-This is the [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) step.
+Several encoding methods exist, e.g., [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) is a common approach.
+We will use the [dummy contrast coding](http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm#dummy) which is popular because it produces a "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)).

-The purpose is to transform each value of each *categorical* feature in a *binary* feature `{0, 1}`.
+The purpose is to transform each value of each *categorical* feature into a *binary* feature `{0, 1}`.

-For example, the column `Treatment` will be replaced by two columns, `Placebo`, and `Treated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have after the transformation the value `1` in the new column `Placebo` and the value `0` in the new column `Treated`. The column `Treatment` will disappear during the one-hot encoding.
+For example, the column `Treatment` will be replaced by two columns, `TreatmentPlacebo`, and `TreatmentTreated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have after the transformation the value `1` in the new column `TreatmentPlacebo` and the value `0` in the new column `TreatmentTreated`. The column `TreatmentPlacebo` will disappear during the contrast encoding, as it would be absorbed into a common constant intercept column.

 Column `Improved` is excluded because it will be our `label` column, the one we want to predict.

 ```{r, warning=FALSE,message=FALSE}
-sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
+sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[,-1]
 head(sparse_matrix)
 ```

-> Formulae `Improved~.-1` used above means transform all *categorical* features but column `Improved` to binary values. The `-1` is here to remove the first column which is full of `1` (this column is generated by the conversion). For more information, you can type `?sparse.model.matrix` in the console.
+> Formula `Improved ~ .` used above means transform all *categorical* features but column `Improved` to binary values. The `-1` column selection removes the intercept column which is full of `1` (this column is generated by the conversion). For more information, you can type `?sparse.model.matrix` in the console.

 Create the output `numeric` vector (not as a sparse `Matrix`):

@@ -317,8 +318,6 @@ In boosting, when a specific link between feature and outcome have been learned

 If you want to try Random Forests™ algorithm, you can tweak Xgboost parameters!

-**Warning**: this is still an experimental parameter.
-
 For instance, to compute a model with 1000 trees, with a 0.5 factor on sampling rows and columns:

 ```{r, warning=FALSE, message=FALSE}
--- a/R-package/vignettes/xgboost.Rnw
+++ b/R-package/vignettes/xgboost.Rnw
@@ -21,7 +21,8 @@ if (require('knitr')) opts_chunk$set(fig.width = 5, fig.height = 5, fig.align =

 %
 <<prelim,echo=FALSE>>=
-xgboost.version = '0.4-2'
+xgboost.version <- packageDescription("xgboost")$Version
+
@
 %

--- a/Show More
+++ b/Show More