Compare commits

..

25 Commits

Author SHA1 Message Date
Nan Zhu
15419d3fd9 [jvm-packages] update version of 1.1.0 branch (#6435) 2020-11-25 20:15:54 +08:00
Philip Hyunsu Cho
4e1fba261d [CI] Fix cuDF install; merge 'gpu' and 'cudf' test suite (#5814) 2020-07-15 12:13:27 -07:00
Hyunsu Cho
76d1b69664 Uninstall cuPy 2020-07-14 20:34:23 -07:00
Philip Hyunsu Cho
1bf42c817a [CI] Reduce load on Windows CI pipeline (#5892) 2020-07-14 19:56:38 -07:00
Philip Hyunsu Cho
9ac614fe61 [CI] Enforce daily budget in Jenkins CI (#5884)
* [CI] Throttle Jenkins CI

* Don't use Jenkins master instance
2020-07-14 19:56:18 -07:00
Philip Hyunsu Cho
d6d8be6519 Add explicit cast to pass 32-bit CRAN check (#5777) 2020-06-14 02:36:13 -07:00
Philip Hyunsu Cho
34408a7fdc Release patch release 1.1.1 with faster CPU performance (#5732)
* Fix release degradation (#5720)

* fix release degradation, related to 5666

* less resizes

Co-authored-by: SHVETS, KIRILL <kirill.shvets@intel.com>

* Make 1.1.1 patch release

* Disable too-many-function-args pylint warning for predict()

* Fix Windows CI

* Remove cpplint

Co-authored-by: ShvetsKS <33296480+ShvetsKS@users.noreply.github.com>
Co-authored-by: SHVETS, KIRILL <kirill.shvets@intel.com>
2020-06-04 10:56:07 -07:00
Philip Hyunsu Cho
f9b246f5ee Add pkgconfig to cmake (#5744) (#5748)
* Add pkgconfig to cmake

* Move xgboost.pc.in to cmake/

Co-authored-by: Peter Jung <peter@jung.ninja>
Co-authored-by: Peter Jung <peter.jung@heureka.cz>
Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
2020-06-02 12:04:27 -07:00
Jiaming Yuan
8467880aeb Fix loading old model. (#5724) (#5737)
* Add test.
2020-06-01 04:32:24 +08:00
Jiaming Yuan
e74560c86a [CI] Backport Remove CUDA 9.0 from Windows CI. (#5674) (#5738)
* [CI] Remove CUDA 9.0 from Windows CI. (#5674)

* Remove CUDA 9.0 on Windows CI.

* Require cuda10 tag, to differentiate

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>

* Pylint.

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
2020-05-31 21:35:00 +08:00
Hyunsu Cho
882b966536 [Doc] Fix typos in AFT tutorial 2020-05-27 03:09:46 -07:00
Philip Hyunsu Cho
115e4c3360 [R] Fix duplicated libomp.dylib error on Mac OSX (#5701) 2020-05-24 23:38:42 -07:00
Hyunsu Cho
f5d4fddafe Release 1.1.0 2020-05-17 00:26:22 -07:00
Jiaming Yuan
66690f3d07 Add JSON schema to model dump. (#5660) 2020-05-15 12:26:49 +08:00
Rory Mitchell
c42f533ae9 Resolve vector<bool>::iterator crash (#5642) 2020-05-11 18:14:41 +08:00
Philip Hyunsu Cho
751160b69c Upgrade to CUDA 10.0 (#5649)
Co-authored-by: fis <jm.yuan@outlook.com>
2020-05-11 18:04:47 +08:00
Hyunsu Cho
8aaabce7c9 Make RC2 2020-05-04 09:11:38 -07:00
Philip Hyunsu Cho
14543176d1 Fix build on big endian CPUs (#5617)
* Fix build on big endian CPUs

* Clang-tidy
2020-05-04 09:09:22 -07:00
Jason E. Aten, Ph.D
afa6e086cc Clarify meaning of training parameter in XGBoosterPredict() (#5604)
Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
2020-05-04 09:08:57 -07:00
Philip Hyunsu Cho
636ab6b522 Instruct Mac users to install libomp (#5606) 2020-05-04 09:08:25 -07:00
Philip Hyunsu Cho
6daa6ee4e0 [R] Address warnings to comply with CRAN submission policy (#5600)
* [R] Address warnings to comply with CRAN submission policy

* Include <xgboost/logging.h>
2020-05-04 09:08:16 -07:00
Philip Hyunsu Cho
4979991d5b [CI] Grant public read access to Mac OSX wheels (#5602) 2020-05-04 09:07:56 -07:00
Philip Hyunsu Cho
02faddc5f3 Fix compilation on Mac OSX High Sierra (10.13) (#5597)
* Fix compilation on Mac OSX High Sierra

* [CI] Build Mac OSX binary wheel using Travis CI
2020-05-04 09:07:29 -07:00
Jiaming Yuan
844d7c1d5b Set device in device dmatrix. (#5596) 2020-04-25 13:44:30 +08:00
Hyunsu Cho
3728855ce9 Make RC1 2020-04-24 13:56:54 -07:00
670 changed files with 14577 additions and 48176 deletions

1
.github/FUNDING.yml vendored
View File

@@ -1,2 +1 @@
open_collective: xgboost open_collective: xgboost
custom: https://xgboost.ai/sponsors

View File

@@ -1,316 +0,0 @@
# This is a basic workflow to help you get started with Actions
name: XGBoost-CI
# Controls when the action will run. Triggers the workflow on every push and pull request
# event (no branch filter is applied)
on: [push, pull_request]
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
gtest-cpu:
name: Test Google C++ test (CPU)
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [macos-10.15]
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: Install system packages
run: |
brew install lz4 ninja libomp
- name: Build gtest binary
run: |
mkdir build
cd build
cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_LZ4=ON -DPLUGIN_DENSE_PARSER=ON -GNinja
ninja -v
- name: Run gtest binary
run: |
cd build
# libomp internal error:
# OMP: Error #131: Thread identifier invalid.
./testxgboost --gtest_filter="-HistIndexCreationWithExternalMemory.Test"
ctest -R TestXGBoostCLI --extra-verbose
gtest-cpu-nonomp:
name: Test Google C++ unittest (CPU Non-OMP)
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: Install system packages
run: |
sudo apt-get install -y --no-install-recommends ninja-build
- name: Build and install XGBoost
shell: bash -l {0}
run: |
mkdir build
cd build
cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF
ninja -v
- name: Run gtest binary
run: |
cd build
ctest --extra-verbose
python-sdist-test:
name: Test installing XGBoost Python source package
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-10.15, windows-latest]
python-version: ["3.8"]
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: Install osx system dependencies
if: matrix.os == 'macos-10.15'
run: |
brew install ninja libomp
- name: Install Ubuntu system dependencies
if: matrix.os == 'ubuntu-latest'
run: |
sudo apt-get install -y --no-install-recommends ninja-build
- uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
python-version: ${{ matrix.python-version }}
activate-environment: test
- name: Display Conda env
shell: bash -l {0}
run: |
conda info
conda list
- name: Build and install XGBoost
shell: bash -l {0}
run: |
cd python-package
python --version
python setup.py sdist
pip install -v ./dist/xgboost-*.tar.gz
cd ..
python -c 'import xgboost'
c-api-demo:
name: Test installing XGBoost lib + building the C API demo
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: ["ubuntu-latest"]
python-version: ["3.8"]
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: Install system packages
run: |
sudo apt-get install -y --no-install-recommends ninja-build
- uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
python-version: ${{ matrix.python-version }}
activate-environment: test
- name: Display Conda env
shell: bash -l {0}
run: |
conda info
conda list
- name: Build and install XGBoost
shell: bash -l {0}
run: |
mkdir build
cd build
cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja
ninja -v install
- name: Build and run C API demo
shell: bash -l {0}
run: |
cd demo/c-api/
mkdir build
cd build
cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
ninja -v
cd ..
./build/api-demo
test-with-jvm:
name: Test JVM on OS ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [windows-latest, ubuntu-latest]
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- uses: actions/setup-python@v2
with:
python-version: '3.8'
architecture: 'x64'
- uses: actions/setup-java@v1
with:
java-version: 1.8
- name: Install Python packages
run: |
python -m pip install wheel setuptools
python -m pip install awscli
- name: Cache Maven packages
uses: actions/cache@v2
with:
path: ~/.m2
key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
restore-keys: ${{ runner.os }}-m2
- name: Test XGBoost4J
run: |
cd jvm-packages
mvn test -B -pl :xgboost4j_2.12
- name: Extract branch name
shell: bash
run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
id: extract_branch
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
matrix.os == 'windows-latest'
- name: Publish artifact xgboost4j.dll to S3
run: |
cd lib/
Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll
dir
python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
matrix.os == 'windows-latest'
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
- name: Test XGBoost4J-Spark
run: |
rm -rfv build/
cd jvm-packages
mvn -B test
if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows
env:
RABIT_MOCK: ON
lint:
runs-on: ubuntu-latest
name: Code linting for Python and C++
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- uses: actions/setup-python@v2
with:
python-version: '3.7'
architecture: 'x64'
- name: Install Python packages
run: |
python -m pip install wheel setuptools
python -m pip install pylint cpplint numpy scipy scikit-learn
- name: Run lint
run: |
make lint
mypy:
runs-on: ubuntu-latest
name: Type checking for Python
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- uses: actions/setup-python@v2
with:
python-version: '3.7'
architecture: 'x64'
- name: Install Python packages
run: |
python -m pip install wheel setuptools mypy dask[complete] distributed
- name: Run mypy
run: |
make mypy
doxygen:
runs-on: ubuntu-latest
name: Generate C/C++ API doc using Doxygen
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- uses: actions/setup-python@v2
with:
python-version: '3.7'
architecture: 'x64'
- name: Install system packages
run: |
sudo apt-get install -y --no-install-recommends doxygen graphviz ninja-build
python -m pip install wheel setuptools
python -m pip install awscli
- name: Run Doxygen
run: |
mkdir build
cd build
cmake .. -DBUILD_C_DOC=ON -GNinja
ninja -v doc_doxygen
- name: Extract branch name
shell: bash
run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
id: extract_branch
if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
- name: Publish
run: |
cd build/
tar cvjf ${{ steps.extract_branch.outputs.branch }}.tar.bz2 doc_doxygen/
python -m awscli s3 cp ./${{ steps.extract_branch.outputs.branch }}.tar.bz2 s3://xgboost-docs/doxygen/ --acl public-read
if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
sphinx:
runs-on: ubuntu-latest
name: Build docs using Sphinx
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- uses: actions/setup-python@v2
with:
python-version: '3.8'
architecture: 'x64'
- name: Install system packages
run: |
sudo apt-get install -y --no-install-recommends graphviz
python -m pip install wheel setuptools
python -m pip install -r doc/requirements.txt
- name: Extract branch name
shell: bash
run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
id: extract_branch
if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
- name: Run Sphinx
run: |
make -C doc html
env:
SPHINX_GIT_BRANCH: ${{ steps.extract_branch.outputs.branch }}

View File

@@ -1,44 +0,0 @@
# Run R tests with noLD R. Only triggered by a pull request review
# See discussion at https://github.com/dmlc/xgboost/pull/6378
name: XGBoost-R-noLD
on:
pull_request_review_comment:
types: [created]
env:
R_PACKAGES: c('XML', 'igraph', 'data.table', 'magrittr', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm', 'devtools', 'float', 'titanic')
jobs:
test-R-noLD:
if: github.event.comment.body == '/gha run r-nold-test' && contains('OWNER,MEMBER,COLLABORATOR', github.event.comment.author_association)
timeout-minutes: 120
runs-on: ubuntu-latest
container: rhub/debian-gcc-devel-nold
steps:
- name: Install git and system packages
shell: bash
run: |
apt-get update && apt-get install -y git libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libxml2-dev
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: Install dependencies
shell: bash
run: |
cat > install_libs.R <<EOT
install.packages(${{ env.R_PACKAGES }},
repos = 'http://cloud.r-project.org',
dependencies = c('Depends', 'Imports', 'LinkingTo'))
EOT
/tmp/R-devel/bin/Rscript install_libs.R
- name: Run R tests
shell: bash
run: |
cd R-package && \
/tmp/R-devel/bin/R CMD INSTALL . && \
/tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')"

View File

@@ -1,116 +0,0 @@
name: XGBoost-R-Tests
on: [push, pull_request]
env:
R_PACKAGES: c('XML', 'igraph', 'data.table', 'magrittr', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm', 'devtools', 'float', 'titanic')
jobs:
lintr:
runs-on: ${{ matrix.config.os }}
name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
strategy:
matrix:
config:
- {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
RSPM: ${{ matrix.config.rspm }}
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- uses: r-lib/actions/setup-r@master
with:
r-version: ${{ matrix.config.r }}
- name: Install dependencies
shell: Rscript {0}
run: |
install.packages(${{ env.R_PACKAGES }},
repos = 'http://cloud.r-project.org',
dependencies = c('Depends', 'Imports', 'LinkingTo'))
- name: Run lintr
run: |
cd R-package
R.exe CMD INSTALL .
Rscript.exe tests/helper_scripts/run_lint.R
test-with-R:
runs-on: ${{ matrix.config.os }}
name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
strategy:
fail-fast: false
matrix:
config:
- {os: windows-2016, r: 'release', compiler: 'mingw', build: 'autotools'}
- {os: windows-2016, r: 'release', compiler: 'msvc', build: 'cmake'}
- {os: windows-2016, r: 'release', compiler: 'mingw', build: 'cmake'}
env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
RSPM: ${{ matrix.config.rspm }}
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- uses: r-lib/actions/setup-r@master
with:
r-version: ${{ matrix.config.r }}
- name: Install dependencies
shell: Rscript {0}
run: |
install.packages(${{ env.R_PACKAGES }},
repos = 'http://cloud.r-project.org',
dependencies = c('Depends', 'Imports', 'LinkingTo'))
- uses: actions/setup-python@v2
with:
python-version: '3.7'
architecture: 'x64'
- name: Test R
run: |
python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool='${{ matrix.config.build }}'
test-R-CRAN:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
config:
- {r: 'release'}
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- uses: r-lib/actions/setup-r@master
with:
r-version: ${{ matrix.config.r }}
- uses: r-lib/actions/setup-tinytex@master
- name: Install system packages
run: |
sudo apt-get update && sudo apt-get install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev pandoc pandoc-citeproc
- name: Install dependencies
shell: Rscript {0}
run: |
install.packages(${{ env.R_PACKAGES }},
repos = 'http://cloud.r-project.org',
dependencies = c('Depends', 'Imports', 'LinkingTo'))
- name: Check R Package
run: |
# Print stacktrace upon success or failure
make Rcheck || tests/ci_build/print_r_stacktrace.sh fail
tests/ci_build/print_r_stacktrace.sh success

24
.gitignore vendored
View File

@@ -51,7 +51,6 @@ Debug
#.Rbuildignore #.Rbuildignore
R-package.Rproj R-package.Rproj
*.cache* *.cache*
.mypy_cache/
# java # java
java/xgboost4j/target java/xgboost4j/target
java/xgboost4j/tmp java/xgboost4j/tmp
@@ -71,7 +70,6 @@ build
build_plugin build_plugin
recommonmark/ recommonmark/
tags tags
TAGS
*.class *.class
target target
*.swp *.swp
@@ -94,7 +92,6 @@ metastore_db
# files from R-package source install # files from R-package source install
**/config.status **/config.status
R-package/src/Makevars R-package/src/Makevars
*.lib
# Visual Studio Code # Visual Studio Code
/.vscode/ /.vscode/
@@ -105,23 +102,4 @@ R-package/src/Makevars
/cmake-build-debug/ /cmake-build-debug/
# GDB # GDB
.gdb_history .gdb_history
# Python joblib.Memory used in pytest.
cachedir/
# Files from local Dask work
dask-worker-space/
# Jupyter notebook checkpoints
.ipynb_checkpoints/
# credentials and key material
config
credentials
credentials.csv
*.env
*.pem
*.pub
*.rdp
*_rsa

7
.gitmodules vendored
View File

@@ -1,10 +1,9 @@
[submodule "dmlc-core"] [submodule "dmlc-core"]
path = dmlc-core path = dmlc-core
url = https://github.com/dmlc/dmlc-core url = https://github.com/dmlc/dmlc-core
branch = main [submodule "rabit"]
path = rabit
url = https://github.com/dmlc/rabit
[submodule "cub"] [submodule "cub"]
path = cub path = cub
url = https://github.com/NVlabs/cub url = https://github.com/NVlabs/cub
[submodule "gputreeshap"]
path = gputreeshap
url = https://github.com/rapidsai/gputreeshap.git

View File

@@ -1,32 +1,51 @@
# disable sudo for container build.
sudo: required sudo: required
# Enabling test OS X
os:
- linux
- osx
osx_image: xcode10.1
dist: bionic dist: bionic
# Use Build Matrix to do lint and build separately
env: env:
matrix:
# python package test
- TASK=python_test
# test installation of Python source distribution
- TASK=python_sdist_test
# java package test
- TASK=java_test
# cmake test
- TASK=cmake_test
global: global:
- secure: "PR16i9F8QtNwn99C5NDp8nptAS+97xwDtXEJJfEiEVhxPaaRkOp0MPWhogCaK0Eclxk1TqkgWbdXFknwGycX620AzZWa/A1K3gAs+GrpzqhnPMuoBJ0Z9qxXTbSJvCyvMbYwVrjaxc/zWqdMU8waWz8A7iqKGKs/SqbQ3rO6v7c=" - secure: "PR16i9F8QtNwn99C5NDp8nptAS+97xwDtXEJJfEiEVhxPaaRkOp0MPWhogCaK0Eclxk1TqkgWbdXFknwGycX620AzZWa/A1K3gAs+GrpzqhnPMuoBJ0Z9qxXTbSJvCyvMbYwVrjaxc/zWqdMU8waWz8A7iqKGKs/SqbQ3rO6v7c="
- secure: "dAGAjBokqm/0nVoLMofQni/fWIBcYSmdq4XvCBX1ZAMDsWnuOfz/4XCY6h2lEI1rVHZQ+UdZkc9PioOHGPZh5BnvE49/xVVWr9c4/61lrDOlkD01ZjSAeoV0fAZq+93V/wPl4QV+MM+Sem9hNNzFSbN5VsQLAiWCSapWsLdKzqA=" - secure: "dAGAjBokqm/0nVoLMofQni/fWIBcYSmdq4XvCBX1ZAMDsWnuOfz/4XCY6h2lEI1rVHZQ+UdZkc9PioOHGPZh5BnvE49/xVVWr9c4/61lrDOlkD01ZjSAeoV0fAZq+93V/wPl4QV+MM+Sem9hNNzFSbN5VsQLAiWCSapWsLdKzqA="
jobs: matrix:
include: exclude:
- os: osx - os: linux
arch: amd64
osx_image: xcode10.2
env: TASK=python_test env: TASK=python_test
- os: osx - os: linux
arch: amd64
osx_image: xcode10.2
env: TASK=java_test env: TASK=java_test
- os: linux
env: TASK=cmake_test
# dependent brew packages # dependent brew packages
# the dependencies from homebrew are installed manually from the setup script due to the outdated image from travis.
addons: addons:
homebrew: homebrew:
update: false
apt:
packages: packages:
- snapd - cmake
- unzip - libomp
- graphviz
- openssl
- libgit2
- wget
- r
update: true
before_install: before_install:
- source tests/travis/travis_setup_env.sh - source tests/travis/travis_setup_env.sh

View File

@@ -1,10 +1,9 @@
cmake_minimum_required(VERSION 3.13) cmake_minimum_required(VERSION 3.13)
project(xgboost LANGUAGES CXX C VERSION 1.4.2) project(xgboost LANGUAGES CXX C VERSION 1.1.1)
include(cmake/Utils.cmake) include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules") list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0022 NEW)
cmake_policy(SET CMP0079 NEW) cmake_policy(SET CMP0079 NEW)
set(CMAKE_POLICY_DEFAULT_CMP0063 NEW)
cmake_policy(SET CMP0063 NEW) cmake_policy(SET CMP0063 NEW)
if ((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUAL 3.13)) if ((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUAL 3.13))
@@ -24,23 +23,17 @@ write_version()
set_default_configuration_release() set_default_configuration_release()
#-- Options #-- Options
## User options
option(BUILD_C_DOC "Build documentation for C APIs using Doxygen." OFF) option(BUILD_C_DOC "Build documentation for C APIs using Doxygen." OFF)
option(USE_OPENMP "Build with OpenMP support." ON) option(USE_OPENMP "Build with OpenMP support." ON)
option(BUILD_STATIC_LIB "Build static library" OFF) option(BUILD_STATIC_LIB "Build static library" OFF)
option(RABIT_BUILD_MPI "Build MPI" OFF)
## Bindings ## Bindings
option(JVM_BINDINGS "Build JVM bindings" OFF) option(JVM_BINDINGS "Build JVM bindings" OFF)
option(R_LIB "Build shared library for R package" OFF) option(R_LIB "Build shared library for R package" OFF)
## Dev ## Dev
option(USE_DEBUG_OUTPUT "Dump internal training results like gradients and predictions to stdout. option(USE_DEBUG_OUTPUT "Dump internal training results like gradients and predictions to stdout.
Should only be used for debugging." OFF) Should only be used for debugging." OFF)
option(FORCE_COLORED_OUTPUT "Force colored output from compilers, useful when ninja is used instead of make." OFF)
option(ENABLE_ALL_WARNINGS "Enable all compiler warnings. Only effective for GCC/Clang" OFF)
option(LOG_CAPI_INVOCATION "Log all C API invocations for debugging" OFF)
option(GOOGLE_TEST "Build google tests" OFF) option(GOOGLE_TEST "Build google tests" OFF)
option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule" OFF) option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule" OFF)
option(USE_DEVICE_DEBUG "Generate CUDA device debug info." OFF)
option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF) option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header") set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
option(RABIT_MOCK "Build rabit with mock" OFF) option(RABIT_MOCK "Build rabit with mock" OFF)
@@ -64,9 +57,6 @@ address, leak, undefined and thread.")
## Plugins ## Plugins
option(PLUGIN_LZ4 "Build lz4 plugin" OFF) option(PLUGIN_LZ4 "Build lz4 plugin" OFF)
option(PLUGIN_DENSE_PARSER "Build dense parser plugin" OFF) option(PLUGIN_DENSE_PARSER "Build dense parser plugin" OFF)
option(PLUGIN_RMM "Build with RAPIDS Memory Manager (RMM)" OFF)
## TODO: 1. Add check if DPC++ compiler is used for building
option(PLUGIN_UPDATER_ONEAPI "DPC++ updater" OFF)
option(ADD_PKGCONFIG "Add xgboost.pc into system." ON) option(ADD_PKGCONFIG "Add xgboost.pc into system." ON)
#-- Checks for building XGBoost #-- Checks for building XGBoost
@@ -76,9 +66,6 @@ endif (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
if (USE_NCCL AND NOT (USE_CUDA)) if (USE_NCCL AND NOT (USE_CUDA))
message(SEND_ERROR "`USE_NCCL` must be enabled with `USE_CUDA` flag.") message(SEND_ERROR "`USE_NCCL` must be enabled with `USE_CUDA` flag.")
endif (USE_NCCL AND NOT (USE_CUDA)) endif (USE_NCCL AND NOT (USE_CUDA))
if (USE_DEVICE_DEBUG AND NOT (USE_CUDA))
message(SEND_ERROR "`USE_DEVICE_DEBUG` must be enabled with `USE_CUDA` flag.")
endif (USE_DEVICE_DEBUG AND NOT (USE_CUDA))
if (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL)) if (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.") message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.")
endif (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL)) endif (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
@@ -92,23 +79,6 @@ endif (R_LIB AND GOOGLE_TEST)
if (USE_AVX) if (USE_AVX)
message(SEND_ERROR "The option 'USE_AVX' is deprecated as experimental AVX features have been removed from XGBoost.") message(SEND_ERROR "The option 'USE_AVX' is deprecated as experimental AVX features have been removed from XGBoost.")
endif (USE_AVX) endif (USE_AVX)
if (PLUGIN_RMM AND NOT (USE_CUDA))
message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.")
endif (PLUGIN_RMM AND NOT (USE_CUDA))
if (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
message(SEND_ERROR "`PLUGIN_RMM` must be used with GCC or Clang compiler.")
endif (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
if (PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux"))
message(SEND_ERROR "`PLUGIN_RMM` must be used with Linux.")
endif (PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux"))
if (ENABLE_ALL_WARNINGS)
if ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
message(SEND_ERROR "ENABLE_ALL_WARNINGS is only available for Clang and GCC.")
endif ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
endif (ENABLE_ALL_WARNINGS)
if (BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS))
message(SEND_ERROR "Cannot build a static library libxgboost.a when R or JVM packages are enabled.")
endif (BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS))
#-- Sanitizer #-- Sanitizer
if (USE_SANITIZER) if (USE_SANITIZER)
@@ -123,20 +93,11 @@ if (USE_CUDA)
message(STATUS "Configured CUDA host compiler: ${CMAKE_CUDA_HOST_COMPILER}") message(STATUS "Configured CUDA host compiler: ${CMAKE_CUDA_HOST_COMPILER}")
enable_language(CUDA) enable_language(CUDA)
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.0)
message(FATAL_ERROR "CUDA version must be at least 10.0!")
endif()
set(GEN_CODE "") set(GEN_CODE "")
format_gencode_flags("${GPU_COMPUTE_VER}" GEN_CODE) format_gencode_flags("${GPU_COMPUTE_VER}" GEN_CODE)
add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap) message(STATUS "CUDA GEN_CODE: ${GEN_CODE}")
endif (USE_CUDA) endif (USE_CUDA)
if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always")
endif()
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
if (USE_OPENMP) if (USE_OPENMP)
@@ -152,59 +113,63 @@ endif (USE_OPENMP)
msvc_use_static_runtime() msvc_use_static_runtime()
add_subdirectory(${xgboost_SOURCE_DIR}/dmlc-core) add_subdirectory(${xgboost_SOURCE_DIR}/dmlc-core)
set_target_properties(dmlc PROPERTIES set_target_properties(dmlc PROPERTIES
CXX_STANDARD 14 CXX_STANDARD 11
CXX_STANDARD_REQUIRED ON CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON) POSITION_INDEPENDENT_CODE ON)
if (MSVC) list(APPEND LINKED_LIBRARIES_PRIVATE dmlc)
target_compile_options(dmlc PRIVATE
-D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE)
if (TARGET dmlc_unit_tests)
target_compile_options(dmlc_unit_tests PRIVATE
-D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE)
endif (TARGET dmlc_unit_tests)
endif (MSVC)
if (ENABLE_ALL_WARNINGS)
target_compile_options(dmlc PRIVATE -Wall -Wextra)
endif (ENABLE_ALL_WARNINGS)
# rabit # rabit
set(RABIT_BUILD_DMLC OFF)
set(DMLC_ROOT ${xgboost_SOURCE_DIR}/dmlc-core)
set(RABIT_WITH_R_LIB ${R_LIB})
add_subdirectory(rabit) add_subdirectory(rabit)
# core xgboost if (RABIT_MOCK)
add_subdirectory(${xgboost_SOURCE_DIR}/src) list(APPEND LINKED_LIBRARIES_PRIVATE rabit_mock_static)
target_link_libraries(objxgboost PUBLIC dmlc) else()
list(APPEND LINKED_LIBRARIES_PRIVATE rabit)
endif(RABIT_MOCK)
foreach(lib rabit rabit_base rabit_empty rabit_mock rabit_mock_static)
# Explicitly link dmlc to rabit, so that configured header (build_config.h)
# from dmlc is correctly applied to rabit.
if (TARGET ${lib})
target_link_libraries(${lib} dmlc ${CMAKE_THREAD_LIBS_INIT})
if (HIDE_CXX_SYMBOLS) # Hide all C++ symbols from Rabit
set_target_properties(${lib} PROPERTIES CXX_VISIBILITY_PRESET hidden)
endif (HIDE_CXX_SYMBOLS)
endif (TARGET ${lib})
endforeach()
# Exports some R specific definitions and objects # Exports some R specific definitions and objects
if (R_LIB) if (R_LIB)
add_subdirectory(${xgboost_SOURCE_DIR}/R-package) add_subdirectory(${xgboost_SOURCE_DIR}/R-package)
endif (R_LIB) endif (R_LIB)
# Plugin # core xgboost
list(APPEND LINKED_LIBRARIES_PRIVATE Threads::Threads ${CMAKE_THREAD_LIBS_INIT})
add_subdirectory(${xgboost_SOURCE_DIR}/plugin) add_subdirectory(${xgboost_SOURCE_DIR}/plugin)
add_subdirectory(${xgboost_SOURCE_DIR}/src)
target_link_libraries(objxgboost PUBLIC dmlc)
set(XGBOOST_OBJ_SOURCES "${XGBOOST_OBJ_SOURCES};$<TARGET_OBJECTS:objxgboost>")
#-- library #-- library
if (BUILD_STATIC_LIB) if (BUILD_STATIC_LIB)
add_library(xgboost STATIC) add_library(xgboost STATIC ${XGBOOST_OBJ_SOURCES})
else (BUILD_STATIC_LIB) else (BUILD_STATIC_LIB)
add_library(xgboost SHARED) add_library(xgboost SHARED ${XGBOOST_OBJ_SOURCES})
endif (BUILD_STATIC_LIB) endif (BUILD_STATIC_LIB)
target_link_libraries(xgboost PRIVATE objxgboost)
if (USE_CUDA)
xgboost_set_cuda_flags(xgboost)
endif (USE_CUDA)
#-- Hide all C++ symbols #-- Hide all C++ symbols
if (HIDE_CXX_SYMBOLS) if (HIDE_CXX_SYMBOLS)
foreach(target objxgboost xgboost dmlc) set_target_properties(objxgboost PROPERTIES CXX_VISIBILITY_PRESET hidden)
set_target_properties(${target} PROPERTIES CXX_VISIBILITY_PRESET hidden) set_target_properties(xgboost PROPERTIES CXX_VISIBILITY_PRESET hidden)
endforeach()
endif (HIDE_CXX_SYMBOLS) endif (HIDE_CXX_SYMBOLS)
target_include_directories(xgboost target_include_directories(xgboost
INTERFACE INTERFACE
$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include> $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include>) $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include>)
target_link_libraries(xgboost PRIVATE ${LINKED_LIBRARIES_PRIVATE})
# This creates its own shared library `xgboost4j'. # This creates its own shared library `xgboost4j'.
if (JVM_BINDINGS) if (JVM_BINDINGS)
@@ -213,21 +178,18 @@ endif (JVM_BINDINGS)
#-- End shared library #-- End shared library
#-- CLI for xgboost #-- CLI for xgboost
add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc) add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc ${XGBOOST_OBJ_SOURCES})
target_link_libraries(runxgboost PRIVATE objxgboost)
if (USE_NVTX)
enable_nvtx(runxgboost)
endif (USE_NVTX)
target_include_directories(runxgboost target_include_directories(runxgboost
PRIVATE PRIVATE
${xgboost_SOURCE_DIR}/include ${xgboost_SOURCE_DIR}/include
${xgboost_SOURCE_DIR}/dmlc-core/include ${xgboost_SOURCE_DIR}/dmlc-core/include
${xgboost_SOURCE_DIR}/rabit/include) ${xgboost_SOURCE_DIR}/rabit/include)
target_link_libraries(runxgboost PRIVATE ${LINKED_LIBRARIES_PRIVATE})
set_target_properties( set_target_properties(
runxgboost PROPERTIES runxgboost PROPERTIES
OUTPUT_NAME xgboost OUTPUT_NAME xgboost
CXX_STANDARD 14 CXX_STANDARD 11
CXX_STANDARD_REQUIRED ON) CXX_STANDARD_REQUIRED ON)
#-- End CLI for xgboost #-- End CLI for xgboost
@@ -238,12 +200,11 @@ add_dependencies(xgboost runxgboost)
#-- Installing XGBoost #-- Installing XGBoost
if (R_LIB) if (R_LIB)
include(cmake/RPackageInstallTargetSetup.cmake)
set_target_properties(xgboost PROPERTIES PREFIX "") set_target_properties(xgboost PROPERTIES PREFIX "")
if (APPLE) if (APPLE)
set_target_properties(xgboost PROPERTIES SUFFIX ".so") set_target_properties(xgboost PROPERTIES SUFFIX ".so")
endif (APPLE) endif (APPLE)
setup_rpackage_install_target(xgboost "${CMAKE_CURRENT_BINARY_DIR}/R-package-install") setup_rpackage_install_target(xgboost ${CMAKE_CURRENT_BINARY_DIR})
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/dummy_inst") set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/dummy_inst")
endif (R_LIB) endif (R_LIB)
if (MINGW) if (MINGW)
@@ -260,20 +221,7 @@ include(GNUInstallDirs)
install(DIRECTORY ${xgboost_SOURCE_DIR}/include/xgboost install(DIRECTORY ${xgboost_SOURCE_DIR}/include/xgboost
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
# Install libraries. If `xgboost` is a static lib, specify `objxgboost` also, to avoid the install(TARGETS xgboost runxgboost
# following error:
#
# > install(EXPORT ...) includes target "xgboost" which requires target "objxgboost" that is not
# > in any export set.
#
# https://github.com/dmlc/xgboost/issues/6085
if (BUILD_STATIC_LIB)
set(INSTALL_TARGETS xgboost runxgboost objxgboost dmlc)
else (BUILD_STATIC_LIB)
set(INSTALL_TARGETS xgboost runxgboost)
endif (BUILD_STATIC_LIB)
install(TARGETS ${INSTALL_TARGETS}
EXPORT XGBoostTargets EXPORT XGBoostTargets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}

View File

@@ -10,8 +10,8 @@ The Project Management Committee(PMC) consists group of active committers that m
- Tianqi is a Ph.D. student working on large-scale machine learning. He is the creator of the project. - Tianqi is a Ph.D. student working on large-scale machine learning. He is the creator of the project.
* [Michael Benesty](https://github.com/pommedeterresautee) * [Michael Benesty](https://github.com/pommedeterresautee)
- Michael is a lawyer and data scientist in France. He is the creator of XGBoost interactive analysis module in R. - Michael is a lawyer and data scientist in France. He is the creator of XGBoost interactive analysis module in R.
* [Yuan Tang](https://github.com/terrytangyuan), Ant Group * [Yuan Tang](https://github.com/terrytangyuan), Ant Financial
- Yuan is a software engineer in Ant Group. He contributed mostly in R and Python packages. - Yuan is a software engineer in Ant Financial. He contributed mostly in R and Python packages.
* [Nan Zhu](https://github.com/CodingCat), Uber * [Nan Zhu](https://github.com/CodingCat), Uber
- Nan is a software engineer in Uber. He contributed mostly in JVM packages. - Nan is a software engineer in Uber. He contributed mostly in JVM packages.
* [Jiaming Yuan](https://github.com/trivialfis) * [Jiaming Yuan](https://github.com/trivialfis)
@@ -37,8 +37,6 @@ Committers are people who have made substantial contribution to the project and
- Sergei is a software engineer in Criteo. He contributed mostly in JVM packages. - Sergei is a software engineer in Criteo. He contributed mostly in JVM packages.
* [Scott Lundberg](http://scottlundberg.com/), University of Washington * [Scott Lundberg](http://scottlundberg.com/), University of Washington
- Scott is a Ph.D. student at University of Washington. He is the creator of SHAP, a unified approach to explain the output of machine learning models such as decision tree ensembles. He also helps maintain the XGBoost Julia package. - Scott is a Ph.D. student at University of Washington. He is the creator of SHAP, a unified approach to explain the output of machine learning models such as decision tree ensembles. He also helps maintain the XGBoost Julia package.
* [Egor Smirnov](https://github.com/SmirnovEgorRu), Intel
- Egor has led a major effort to improve the performance of XGBoost on multi-core CPUs.
Become a Committer Become a Committer

363
Jenkinsfile vendored
View File

@@ -6,9 +6,6 @@
// Command to run command inside a docker container // Command to run command inside a docker container
dockerRun = 'tests/ci_build/ci_build.sh' dockerRun = 'tests/ci_build/ci_build.sh'
// Which CUDA version to use when building reference distribution wheel
ref_cuda_ver = '10.0'
import groovy.transform.Field import groovy.transform.Field
@Field @Field
@@ -38,15 +35,25 @@ pipeline {
agent { label 'job_initializer' } agent { label 'job_initializer' }
steps { steps {
script { script {
def buildNumber = env.BUILD_NUMBER as int
if (buildNumber > 1) milestone(buildNumber - 1)
milestone(buildNumber)
checkoutSrcs() checkoutSrcs()
commit_id = "${GIT_COMMIT}" commit_id = "${GIT_COMMIT}"
} }
sh 'python3 tests/jenkins_get_approval.py' sh 'python3 tests/jenkins_get_approval.py'
stash name: 'srcs' stash name: 'srcs'
milestone ordinal: 1
}
}
stage('Jenkins Linux: Formatting Check') {
agent none
steps {
script {
parallel ([
'clang-tidy': { ClangTidy() },
'sphinx-doc': { SphinxDoc() },
'doxygen': { Doxygen() }
])
}
milestone ordinal: 2
} }
} }
stage('Jenkins Linux: Build') { stage('Jenkins Linux: Build') {
@@ -54,23 +61,16 @@ pipeline {
steps { steps {
script { script {
parallel ([ parallel ([
'clang-tidy': { ClangTidy() },
'build-cpu': { BuildCPU() }, 'build-cpu': { BuildCPU() },
'build-cpu-arm64': { BuildCPUARM64() },
'build-cpu-rabit-mock': { BuildCPUMock() }, 'build-cpu-rabit-mock': { BuildCPUMock() },
// Build reference, distribution-ready Python wheel with CUDA 10.0 'build-cpu-non-omp': { BuildCPUNonOmp() },
// using CentOS 6 image
'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') }, 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') },
// The build-gpu-* builds below use Ubuntu image
'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') }, 'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2', build_rmm: true) }, 'build-jvm-packages': { BuildJVMPackages(spark_version: '2.4.3') },
'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0') },
'build-gpu-rpkg': { BuildRPackageWithCUDA(cuda_version: '10.0') },
'build-jvm-packages-gpu-cuda10.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.0', cuda_version: '10.0') },
'build-jvm-packages': { BuildJVMPackages(spark_version: '3.0.0') },
'build-jvm-doc': { BuildJVMDoc() } 'build-jvm-doc': { BuildJVMDoc() }
]) ])
} }
milestone ordinal: 3
} }
} }
stage('Jenkins Linux: Test') { stage('Jenkins Linux: Test') {
@@ -79,19 +79,19 @@ pipeline {
script { script {
parallel ([ parallel ([
'test-python-cpu': { TestPythonCPU() }, 'test-python-cpu': { TestPythonCPU() },
'test-python-cpu-arm64': { TestPythonCPUARM64() }, 'test-python-gpu-cuda9.0': { TestPythonGPU(cuda_version: '9.0') },
// artifact_cuda_version doesn't apply to RMM tests; RMM tests will always match CUDA version between artifact and host env 'test-python-gpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0') },
'test-python-gpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.2', test_rmm: true) }, 'test-python-gpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1') },
'test-python-gpu-cuda11.0-cross': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '11.0') }, 'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) },
'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, 'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') },
'test-python-mgpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.2', multi_gpu: true, test_rmm: true) }, 'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) },
'test-cpp-gpu-cuda10.2': { TestCppGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', test_rmm: true) }, 'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') },
'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') },
'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '3.0.0') },
'test-jvm-jdk11': { CrossTestJVMwithJDK(jdk_version: '11') }, 'test-jvm-jdk11': { CrossTestJVMwithJDK(jdk_version: '11') },
'test-jvm-jdk12': { CrossTestJVMwithJDK(jdk_version: '12') } 'test-jvm-jdk12': { CrossTestJVMwithJDK(jdk_version: '12') },
'test-r-3.5.3': { TestR(use_r35: true) }
]) ])
} }
milestone ordinal: 4
} }
} }
stage('Jenkins Linux: Deploy') { stage('Jenkins Linux: Deploy') {
@@ -99,9 +99,10 @@ pipeline {
steps { steps {
script { script {
parallel ([ parallel ([
'deploy-jvm-packages': { DeployJVMPackages(spark_version: '3.0.0') } 'deploy-jvm-packages': { DeployJVMPackages(spark_version: '2.4.3') }
]) ])
} }
milestone ordinal: 5
} }
} }
} }
@@ -122,17 +123,13 @@ def checkoutSrcs() {
} }
} }
def GetCUDABuildContainerType(cuda_version) {
return (cuda_version == ref_cuda_ver) ? 'gpu_build_centos6' : 'gpu_build'
}
def ClangTidy() { def ClangTidy() {
node('linux && cpu_build') { node('linux && cpu') {
unstash name: 'srcs' unstash name: 'srcs'
echo "Running clang-tidy job..." echo "Running clang-tidy job..."
def container_type = "clang_tidy" def container_type = "clang_tidy"
def docker_binary = "docker" def docker_binary = "docker"
def dockerArgs = "--build-arg CUDA_VERSION_ARG=10.1" def dockerArgs = "--build-arg CUDA_VERSION=10.1"
sh """ sh """
${dockerRun} ${container_type} ${docker_binary} ${dockerArgs} python3 tests/ci_build/tidy.py ${dockerRun} ${container_type} ${docker_binary} ${dockerArgs} python3 tests/ci_build/tidy.py
""" """
@@ -140,6 +137,48 @@ def ClangTidy() {
} }
} }
def Lint() {
node('linux && cpu') {
unstash name: 'srcs'
echo "Running lint..."
def container_type = "cpu"
def docker_binary = "docker"
sh """
${dockerRun} ${container_type} ${docker_binary} make lint
"""
deleteDir()
}
}
def SphinxDoc() {
node('linux && cpu') {
unstash name: 'srcs'
echo "Running sphinx-doc..."
def container_type = "cpu"
def docker_binary = "docker"
def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e SPHINX_GIT_BRANCH=${BRANCH_NAME}'"
sh """#!/bin/bash
${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} make -C doc html
"""
deleteDir()
}
}
def Doxygen() {
node('linux && cpu') {
unstash name: 'srcs'
echo "Running doxygen..."
def container_type = "cpu"
def docker_binary = "docker"
sh """
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/doxygen.sh ${BRANCH_NAME}
"""
echo 'Uploading doc...'
s3Upload file: "build/${BRANCH_NAME}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "doxygen/${BRANCH_NAME}.tar.bz2"
deleteDir()
}
}
def BuildCPU() { def BuildCPU() {
node('linux && cpu') { node('linux && cpu') {
unstash name: 'srcs' unstash name: 'srcs'
@@ -151,15 +190,15 @@ def BuildCPU() {
# This step is not necessary, but here we include it, to ensure that DMLC_CORE_USE_CMAKE flag is correctly propagated # This step is not necessary, but here we include it, to ensure that DMLC_CORE_USE_CMAKE flag is correctly propagated
# We want to make sure that we use the configured header build/dmlc/build_config.h instead of include/dmlc/build_config_default.h. # We want to make sure that we use the configured header build/dmlc/build_config.h instead of include/dmlc/build_config_default.h.
# See discussion at https://github.com/dmlc/xgboost/issues/5510 # See discussion at https://github.com/dmlc/xgboost/issues/5510
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh -DPLUGIN_LZ4=ON -DPLUGIN_DENSE_PARSER=ON ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh
${dockerRun} ${container_type} ${docker_binary} bash -c "cd build && ctest --extra-verbose" ${dockerRun} ${container_type} ${docker_binary} build/testxgboost
""" """
// Sanitizer test // Sanitizer test
def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer -e ASAN_OPTIONS=symbolize=1 -e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log --cap-add SYS_PTRACE'" def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer -e ASAN_OPTIONS=symbolize=1 -e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log --cap-add SYS_PTRACE'"
sh """ sh """
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;leak;undefined" \ ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;leak;undefined" \
-DCMAKE_BUILD_TYPE=Debug -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ -DCMAKE_BUILD_TYPE=Debug -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/
${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} bash -c "cd build && ctest --exclude-regex AllTestsInDMLCUnitTests --extra-verbose" ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} build/testxgboost
""" """
stash name: 'xgboost_cli', includes: 'xgboost' stash name: 'xgboost_cli', includes: 'xgboost'
@@ -167,35 +206,6 @@ def BuildCPU() {
} }
} }
def BuildCPUARM64() {
node('linux && arm64') {
unstash name: 'srcs'
echo "Build CPU ARM64"
def container_type = "aarch64"
def docker_binary = "docker"
def wheel_tag = "manylinux2014_aarch64"
sh """
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh --conda-env=aarch64_test -DOPEN_MP:BOOL=ON -DHIDE_CXX_SYMBOL=ON
${dockerRun} ${container_type} ${docker_binary} bash -c "cd build && ctest --extra-verbose"
${dockerRun} ${container_type} ${docker_binary} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
${dockerRun} ${container_type} ${docker_binary} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} ${wheel_tag}
${dockerRun} ${container_type} ${docker_binary} bash -c "auditwheel repair --plat ${wheel_tag} python-package/dist/*.whl && python tests/ci_build/rename_whl.py wheelhouse/*.whl ${commit_id} ${wheel_tag}"
mv -v wheelhouse/*.whl python-package/dist/
# Make sure that libgomp.so is vendored in the wheel
${dockerRun} ${container_type} ${docker_binary} bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1"
"""
echo 'Stashing Python wheel...'
stash name: "xgboost_whl_arm64_cpu", includes: 'python-package/dist/*.whl'
if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
echo 'Uploading Python wheel...'
path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/"
s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', workingDir: 'python-package/dist', includePathPattern:'**/*.whl'
}
stash name: 'xgboost_cli_arm64', includes: 'xgboost'
deleteDir()
}
}
def BuildCPUMock() { def BuildCPUMock() {
node('linux && cpu') { node('linux && cpu') {
unstash name: 'srcs' unstash name: 'srcs'
@@ -211,101 +221,48 @@ def BuildCPUMock() {
} }
} }
def BuildCPUNonOmp() {
node('linux && cpu') {
unstash name: 'srcs'
echo "Build CPU without OpenMP"
def container_type = "cpu"
def docker_binary = "docker"
sh """
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh -DUSE_OPENMP=OFF
"""
echo "Running Non-OpenMP C++ test..."
sh """
${dockerRun} ${container_type} ${docker_binary} build/testxgboost
"""
deleteDir()
}
}
def BuildCUDA(args) { def BuildCUDA(args) {
node('linux && cpu_build') { node('linux && cpu') {
unstash name: 'srcs' unstash name: 'srcs'
echo "Build with CUDA ${args.cuda_version}" echo "Build with CUDA ${args.cuda_version}"
def container_type = GetCUDABuildContainerType(args.cuda_version) def container_type = "gpu_build"
def docker_binary = "docker" def docker_binary = "docker"
def docker_args = "--build-arg CUDA_VERSION_ARG=${args.cuda_version}" def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}"
def arch_flag = ""
if (env.BRANCH_NAME != 'master' && !(env.BRANCH_NAME.startsWith('release'))) {
arch_flag = "-DGPU_COMPUTE_VER=75"
}
def wheel_tag = "manylinux2010_x86_64"
sh """ sh """
${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON -DOPEN_MP:BOOL=ON -DHIDE_CXX_SYMBOLS=ON ${arch_flag} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON -DOPEN_MP:BOOL=ON -DHIDE_CXX_SYMBOLS=ON
${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal" ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} ${wheel_tag} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} python3 tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} manylinux2010_x86_64
""" """
if (args.cuda_version == ref_cuda_ver) { // Stash wheel for CUDA 10.0 target
sh """ if (args.cuda_version == '10.0') {
${dockerRun} auditwheel_x86_64 ${docker_binary} auditwheel repair --plat ${wheel_tag} python-package/dist/*.whl echo 'Stashing Python wheel...'
${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py wheelhouse/*.whl ${commit_id} ${wheel_tag} stash name: 'xgboost_whl_cuda10', includes: 'python-package/dist/*.whl'
mv -v wheelhouse/*.whl python-package/dist/
# Make sure that libgomp.so is vendored in the wheel
${dockerRun} auditwheel_x86_64 ${docker_binary} bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1"
"""
}
echo 'Stashing Python wheel...'
stash name: "xgboost_whl_cuda${args.cuda_version}", includes: 'python-package/dist/*.whl'
if (args.cuda_version == ref_cuda_ver && (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release'))) {
echo 'Uploading Python wheel...'
path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/" path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/"
s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', workingDir: 'python-package/dist', includePathPattern:'**/*.whl' s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', workingDir: 'python-package/dist', includePathPattern:'**/*.whl'
}
echo 'Stashing C++ test executable (testxgboost)...'
stash name: "xgboost_cpp_tests_cuda${args.cuda_version}", includes: 'build/testxgboost'
if (args.build_rmm) {
echo "Build with CUDA ${args.cuda_version} and RMM"
container_type = "rmm"
docker_binary = "docker"
docker_args = "--build-arg CUDA_VERSION_ARG=${args.cuda_version}"
sh """
rm -rf build/
${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON ${arch_flag}
${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} manylinux2010_x86_64
"""
echo 'Stashing Python wheel...'
stash name: "xgboost_whl_rmm_cuda${args.cuda_version}", includes: 'python-package/dist/*.whl'
echo 'Stashing C++ test executable (testxgboost)...' echo 'Stashing C++ test executable (testxgboost)...'
stash name: "xgboost_cpp_tests_rmm_cuda${args.cuda_version}", includes: 'build/testxgboost' stash name: 'xgboost_cpp_tests', includes: 'build/testxgboost'
} }
deleteDir() deleteDir()
} }
} }
def BuildRPackageWithCUDA(args) {
node('linux && cpu_build') {
unstash name: 'srcs'
def container_type = 'gpu_build_r_centos6'
def docker_binary = "docker"
def docker_args = "--build-arg CUDA_VERSION_ARG=10.0"
if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
sh """
${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_r_pkg_with_cuda.sh ${commit_id}
"""
echo 'Uploading R tarball...'
path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/"
s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', includePathPattern:'xgboost_r_gpu_linux_*.tar.gz'
}
deleteDir()
}
}
def BuildJVMPackagesWithCUDA(args) {
node('linux && mgpu') {
unstash name: 'srcs'
echo "Build XGBoost4J-Spark with Spark ${args.spark_version}, CUDA ${args.cuda_version}"
def container_type = "jvm_gpu_build"
def docker_binary = "nvidia-docker"
def docker_args = "--build-arg CUDA_VERSION_ARG=${args.cuda_version}"
def arch_flag = ""
if (env.BRANCH_NAME != 'master' && !(env.BRANCH_NAME.startsWith('release'))) {
arch_flag = "-DGPU_COMPUTE_VER=75"
}
// Use only 4 CPU cores
def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='--cpuset-cpus 0-3'"
sh """
${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_jvm_packages.sh ${args.spark_version} -Duse.cuda=ON $arch_flag
"""
echo "Stashing XGBoost4J JAR with CUDA ${args.cuda_version} ..."
stash name: 'xgboost4j_jar_gpu', includes: "jvm-packages/xgboost4j-gpu/target/*.jar,jvm-packages/xgboost4j-spark-gpu/target/*.jar"
deleteDir()
}
}
def BuildJVMPackages(args) { def BuildJVMPackages(args) {
node('linux && cpu') { node('linux && cpu') {
unstash name: 'srcs' unstash name: 'srcs'
@@ -332,17 +289,15 @@ def BuildJVMDoc() {
sh """ sh """
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_jvm_doc.sh ${BRANCH_NAME} ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_jvm_doc.sh ${BRANCH_NAME}
""" """
if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) { echo 'Uploading doc...'
echo 'Uploading doc...' s3Upload file: "jvm-packages/${BRANCH_NAME}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "${BRANCH_NAME}.tar.bz2"
s3Upload file: "jvm-packages/${BRANCH_NAME}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "${BRANCH_NAME}.tar.bz2"
}
deleteDir() deleteDir()
} }
} }
def TestPythonCPU() { def TestPythonCPU() {
node('linux && cpu') { node('linux && cpu') {
unstash name: "xgboost_whl_cuda${ref_cuda_ver}" unstash name: 'xgboost_whl_cuda10'
unstash name: 'srcs' unstash name: 'srcs'
unstash name: 'xgboost_cli' unstash name: 'xgboost_cli'
echo "Test Python CPU" echo "Test Python CPU"
@@ -350,72 +305,65 @@ def TestPythonCPU() {
def docker_binary = "docker" def docker_binary = "docker"
sh """ sh """
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/test_python.sh cpu ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/test_python.sh cpu
""" ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/test_python.sh cpu-py35
deleteDir()
}
}
def TestPythonCPUARM64() {
node('linux && arm64') {
unstash name: "xgboost_whl_arm64_cpu"
unstash name: 'srcs'
unstash name: 'xgboost_cli_arm64'
echo "Test Python CPU ARM64"
def container_type = "aarch64"
def docker_binary = "docker"
sh """
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/test_python.sh cpu-arm64
""" """
deleteDir() deleteDir()
} }
} }
def TestPythonGPU(args) { def TestPythonGPU(args) {
def nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu' nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
def artifact_cuda_version = (args.artifact_cuda_version) ?: ref_cuda_ver
node(nodeReq) { node(nodeReq) {
unstash name: "xgboost_whl_cuda${artifact_cuda_version}" unstash name: 'xgboost_whl_cuda10'
unstash name: "xgboost_cpp_tests_cuda${artifact_cuda_version}"
unstash name: 'srcs' unstash name: 'srcs'
echo "Test Python GPU: CUDA ${args.host_cuda_version}" echo "Test Python GPU: CUDA ${args.cuda_version}"
def container_type = "gpu" def container_type = "gpu"
def docker_binary = "nvidia-docker" def docker_binary = "nvidia-docker"
def docker_args = "--build-arg CUDA_VERSION_ARG=${args.host_cuda_version}" def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}"
def mgpu_indicator = (args.multi_gpu) ? 'mgpu' : 'gpu' if (args.multi_gpu) {
// Allocate extra space in /dev/shm to enable NCCL echo "Using multiple GPUs"
def docker_extra_params = (args.multi_gpu) ? "CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'" : '' sh """
sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh mgpu
if (args.test_rmm) { """
sh "rm -rfv build/ python-package/dist/" } else {
unstash name: "xgboost_whl_rmm_cuda${args.host_cuda_version}" echo "Using a single GPU"
unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" sh """
sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator} --use-rmm-pool" ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh gpu
"""
} }
deleteDir() deleteDir()
} }
} }
def TestCppGPU(args) { def TestCppRabit() {
def nodeReq = 'linux && mgpu'
def artifact_cuda_version = (args.artifact_cuda_version) ?: ref_cuda_ver
node(nodeReq) { node(nodeReq) {
unstash name: "xgboost_cpp_tests_cuda${artifact_cuda_version}" unstash name: 'xgboost_rabit_tests'
unstash name: 'srcs' unstash name: 'srcs'
echo "Test C++, CUDA ${args.host_cuda_version}" echo "Test C++, rabit mock on"
def container_type = "cpu"
def docker_binary = "docker"
sh """
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/runxgb.sh xgboost tests/ci_build/approx.conf.in
"""
deleteDir()
}
}
def TestCppGPU(args) {
nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
node(nodeReq) {
unstash name: 'xgboost_cpp_tests'
unstash name: 'srcs'
echo "Test C++, CUDA ${args.cuda_version}"
def container_type = "gpu" def container_type = "gpu"
def docker_binary = "nvidia-docker" def docker_binary = "nvidia-docker"
def docker_args = "--build-arg CUDA_VERSION_ARG=${args.host_cuda_version}" def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}"
sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost" if (args.multi_gpu) {
if (args.test_rmm) { echo "Using multiple GPUs"
sh "rm -rfv build/" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost --gtest_filter=*.MGPU_*"
unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" } else {
echo "Test C++, CUDA ${args.host_cuda_version} with RMM" echo "Using a single GPU"
container_type = "rmm" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost --gtest_filter=-*.MGPU_*"
docker_binary = "nvidia-docker"
docker_args = "--build-arg CUDA_VERSION_ARG=${args.host_cuda_version}"
sh """
${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --use-rmm-pool --gtest_filter=-*DeathTest.*"
"""
} }
deleteDir() deleteDir()
} }
@@ -443,13 +391,30 @@ def CrossTestJVMwithJDK(args) {
} }
} }
def TestR(args) {
node('linux && cpu') {
unstash name: 'srcs'
echo "Test R package"
def container_type = "rproject"
def docker_binary = "docker"
def use_r35_flag = (args.use_r35) ? "1" : "0"
def docker_args = "--build-arg USE_R35=${use_r35_flag}"
sh """
${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_test_rpkg.sh || tests/ci_build/print_r_stacktrace.sh
"""
deleteDir()
}
}
def DeployJVMPackages(args) { def DeployJVMPackages(args) {
node('linux && cpu') { node('linux && cpu') {
unstash name: 'srcs' unstash name: 'srcs'
if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) { if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
echo 'Deploying to xgboost-maven-repo S3 repo...' echo 'Deploying to xgboost-maven-repo S3 repo...'
def container_type = "jvm"
def docker_binary = "docker"
sh """ sh """
${dockerRun} jvm_gpu_build docker --build-arg CUDA_VERSION_ARG=10.0 tests/ci_build/deploy_jvm_packages.sh ${args.spark_version} ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/deploy_jvm_packages.sh ${args.spark_version}
""" """
} }
deleteDir() deleteDir()

View File

@@ -10,29 +10,18 @@ def commit_id // necessary to pass a variable from one stage to another
pipeline { pipeline {
agent none agent none
// Setup common job properties
options {
timestamps()
timeout(time: 240, unit: 'MINUTES')
buildDiscarder(logRotator(numToKeepStr: '10'))
preserveStashes()
}
// Build stages // Build stages
stages { stages {
stage('Jenkins Win64: Initialize') { stage('Jenkins Win64: Initialize') {
agent { label 'job_initializer' } agent { label 'job_initializer' }
steps { steps {
script { script {
def buildNumber = env.BUILD_NUMBER as int
if (buildNumber > 1) milestone(buildNumber - 1)
milestone(buildNumber)
checkoutSrcs() checkoutSrcs()
commit_id = "${GIT_COMMIT}" commit_id = "${GIT_COMMIT}"
} }
sh 'python3 tests/jenkins_get_approval.py' sh 'python3 tests/jenkins_get_approval.py'
stash name: 'srcs' stash name: 'srcs'
milestone ordinal: 1
} }
} }
stage('Jenkins Win64: Build') { stage('Jenkins Win64: Build') {
@@ -40,9 +29,10 @@ pipeline {
steps { steps {
script { script {
parallel ([ parallel ([
'build-win64-cuda10.1': { BuildWin64() } 'build-win64-cuda10.0': { BuildWin64() }
]) ])
} }
milestone ordinal: 2
} }
} }
stage('Jenkins Win64: Test') { stage('Jenkins Win64: Test') {
@@ -50,9 +40,11 @@ pipeline {
steps { steps {
script { script {
parallel ([ parallel ([
'test-win64-cuda10.1': { TestWin64() }, 'test-win64-cpu': { TestWin64CPU() },
'test-win64-gpu-cuda10.1': { TestWin64GPU(cuda_target: 'cuda10_1') }
]) ])
} }
milestone ordinal: 3
} }
} }
} }
@@ -74,18 +66,14 @@ def checkoutSrcs() {
} }
def BuildWin64() { def BuildWin64() {
node('win64 && cuda10_unified') { node('win64 && build && cuda10') {
unstash name: 'srcs' unstash name: 'srcs'
echo "Building XGBoost for Windows AMD64 target..." echo "Building XGBoost for Windows AMD64 target..."
bat "nvcc --version" bat "nvcc --version"
def arch_flag = ""
if (env.BRANCH_NAME != 'master' && !(env.BRANCH_NAME.startsWith('release'))) {
arch_flag = "-DGPU_COMPUTE_VER=75"
}
bat """ bat """
mkdir build mkdir build
cd build cd build
cmake .. -G"Visual Studio 15 2017 Win64" -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON ${arch_flag} -DCMAKE_UNITY_BUILD=ON cmake .. -G"Visual Studio 15 2017 Win64" -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON
""" """
bat """ bat """
cd build cd build
@@ -103,11 +91,8 @@ def BuildWin64() {
""" """
echo 'Stashing Python wheel...' echo 'Stashing Python wheel...'
stash name: 'xgboost_whl', includes: 'python-package/dist/*.whl' stash name: 'xgboost_whl', includes: 'python-package/dist/*.whl'
if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) { path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/"
echo 'Uploading Python wheel...' s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', workingDir: 'python-package/dist', includePathPattern:'**/*.whl'
path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/"
s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', workingDir: 'python-package/dist', includePathPattern:'**/*.whl'
}
echo 'Stashing C++ test executable (testxgboost)...' echo 'Stashing C++ test executable (testxgboost)...'
stash name: 'xgboost_cpp_tests', includes: 'build/testxgboost.exe' stash name: 'xgboost_cpp_tests', includes: 'build/testxgboost.exe'
stash name: 'xgboost_cli', includes: 'xgboost.exe' stash name: 'xgboost_cli', includes: 'xgboost.exe'
@@ -115,29 +100,51 @@ def BuildWin64() {
} }
} }
def TestWin64() { def TestWin64CPU() {
node('win64 && cuda10_unified') { node('win64 && cpu') {
unstash name: 'srcs' unstash name: 'srcs'
unstash name: 'xgboost_whl' unstash name: 'xgboost_whl'
unstash name: 'xgboost_cli' unstash name: 'xgboost_cli'
unstash name: 'xgboost_cpp_tests' echo "Test Win64 CPU"
echo "Test Win64"
bat "nvcc --version"
echo "Running C++ tests..."
bat "build\\testxgboost.exe"
echo "Installing Python dependencies..."
def env_name = 'win64_' + UUID.randomUUID().toString().replaceAll('-', '')
bat "conda env create -n ${env_name} --file=tests/ci_build/conda_env/win64_test.yml"
echo "Installing Python wheel..." echo "Installing Python wheel..."
bat "conda activate && (python -m pip uninstall -y xgboost || cd .)"
bat """ bat """
conda activate ${env_name} && for /R %%i in (python-package\\dist\\*.whl) DO python -m pip install "%%i" conda activate && for /R %%i in (python-package\\dist\\*.whl) DO python -m pip install "%%i"
"""
echo "Installing Python dependencies..."
bat """
conda activate && conda upgrade scikit-learn pandas numpy
""" """
echo "Running Python tests..." echo "Running Python tests..."
bat "conda activate ${env_name} && python -m pytest -v -s -rxXs --fulltrace tests\\python" bat "conda activate && python -m pytest -v -s --fulltrace tests\\python"
bat """ bat "conda activate && python -m pip uninstall -y xgboost"
conda activate ${env_name} && python -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)" tests\\python-gpu deleteDir()
""" }
bat "conda env remove --name ${env_name}" }
def TestWin64GPU(args) {
node("win64 && gpu && ${args.cuda_target}") {
unstash name: 'srcs'
unstash name: 'xgboost_whl'
unstash name: 'xgboost_cpp_tests'
echo "Test Win64 GPU (${args.cuda_target})"
bat "nvcc --version"
echo "Running C++ tests..."
bat "build\\testxgboost.exe"
echo "Installing Python wheel..."
bat "conda activate && (python -m pip uninstall -y xgboost || cd .)"
bat """
conda activate && for /R %%i in (python-package\\dist\\*.whl) DO python -m pip install "%%i"
"""
echo "Installing Python dependencies..."
bat """
conda activate && conda upgrade scikit-learn pandas numpy && python -m pip uninstall -y cupy-cuda100 cupy-cuda101 cupy-cuda102
"""
echo "Running Python tests..."
bat """
conda activate && python -m pytest -v -s --fulltrace -m "(not slow) and (not mgpu)" tests\\python-gpu
"""
bat "conda activate && python -m pip uninstall -y xgboost"
deleteDir() deleteDir()
} }
} }

View File

@@ -44,7 +44,7 @@ export CXX = g++
endif endif
endif endif
export CFLAGS= -DDMLC_LOG_CUSTOMIZE=1 -std=c++14 -Wall -Wno-unknown-pragmas -Iinclude $(ADD_CFLAGS) export CFLAGS= -DDMLC_LOG_CUSTOMIZE=1 -std=c++11 -Wall -Wno-unknown-pragmas -Iinclude $(ADD_CFLAGS)
CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include -I$(GTEST_PATH)/include CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include -I$(GTEST_PATH)/include
ifeq ($(TEST_COVER), 1) ifeq ($(TEST_COVER), 1)
@@ -86,15 +86,6 @@ cover: check
) )
endif endif
# dask is required to pass, others are not
# If any of the dask tests failed, contributor won't see the other error.
mypy:
cd python-package; \
mypy ./xgboost/dask.py ../tests/python/test_with_dask.py --follow-imports=silent; \
mypy ../tests/python-gpu/test_gpu_with_dask.py --follow-imports=silent; \
mypy . || true ;
clean: clean:
$(RM) -rf build lib bin *~ */*~ */*/*~ */*/*/*~ */*.o */*/*.o */*/*/*.o #xgboost $(RM) -rf build lib bin *~ */*~ */*/*~ */*/*/*~ */*.o */*/*.o */*/*/*.o #xgboost
$(RM) -rf build_tests *.gcov tests/cpp/xgboost_test $(RM) -rf build_tests *.gcov tests/cpp/xgboost_test
@@ -143,18 +134,14 @@ Rpack: clean_all
sed -i -e 's/@OPENMP_LIB@//g' xgboost/src/Makevars.win sed -i -e 's/@OPENMP_LIB@//g' xgboost/src/Makevars.win
rm -f xgboost/src/Makevars.win-e # OSX sed create this extra file; remove it rm -f xgboost/src/Makevars.win-e # OSX sed create this extra file; remove it
bash R-package/remove_warning_suppression_pragma.sh bash R-package/remove_warning_suppression_pragma.sh
bash xgboost/remove_warning_suppression_pragma.sh
rm xgboost/remove_warning_suppression_pragma.sh rm xgboost/remove_warning_suppression_pragma.sh
rm -rfv xgboost/tests/helper_scripts/
R ?= R
Rbuild: Rpack Rbuild: Rpack
$(R) CMD build xgboost R CMD build --no-build-vignettes xgboost
rm -rf xgboost rm -rf xgboost
Rcheck: Rbuild Rcheck: Rbuild
$(R) CMD check --as-cran xgboost*.tar.gz R CMD check xgboost*.tar.gz
-include build/*.d -include build/*.d
-include build/*/*.d -include build/*/*.d

633
NEWS.md
View File

@@ -3,639 +3,6 @@ XGBoost Change Log
This file records the changes in xgboost library in reverse chronological order. This file records the changes in xgboost library in reverse chronological order.
## v1.3.0 (2020.12.08)
### XGBoost4J-Spark: Exceptions should cancel jobs gracefully instead of killing SparkContext (#6019).
* By default, exceptions in XGBoost4J-Spark causes the whole SparkContext to shut down, necessitating the restart of the Spark cluster. This behavior is often a major inconvenience.
* Starting from 1.3.0 release, XGBoost adds a new parameter `killSparkContextOnWorkerFailure` to optionally prevent killing SparkContext. If this parameter is set, exceptions will gracefully cancel training jobs instead of killing SparkContext.
### GPUTreeSHAP: GPU acceleration of the TreeSHAP algorithm (#6038, #6064, #6087, #6099, #6163, #6281, #6332)
* [SHAP (SHapley Additive exPlanations)](https://github.com/slundberg/shap) is a game theoretic approach to explain predictions of machine learning models. It computes feature importance scores for individual examples, establishing how each feature influences a particular prediction. TreeSHAP is an optimized SHAP algorithm specifically designed for decision tree ensembles.
* Starting with 1.3.0 release, it is now possible to leverage CUDA-capable GPUs to accelerate the TreeSHAP algorithm. Check out [the demo notebook](https://github.com/dmlc/xgboost/blob/master/demo/gpu_acceleration/shap.ipynb).
* The CUDA implementation of the TreeSHAP algorithm is hosted at [rapidsai/GPUTreeSHAP](https://github.com/rapidsai/gputreeshap). XGBoost imports it as a Git submodule.
### New style Python callback API (#6199, #6270, #6320, #6348, #6376, #6399, #6441)
* The XGBoost Python package now offers a re-designed callback API. The new callback API lets you design various extensions of training in idiomatic Python. In addition, the new callback API allows you to use early stopping with the native Dask API (`xgboost.dask`). Check out [the tutorial](https://xgboost.readthedocs.io/en/release_1.3.0/python/callbacks.html) and [the demo](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/callbacks.py).
### Enable the use of `DeviceQuantileDMatrix` / `DaskDeviceQuantileDMatrix` with large data (#6201, #6229, #6234).
* `DeviceQuantileDMatrix` can achieve memory saving by avoiding extra copies of the training data, and the saving is bigger for large data. Unfortunately, large data with more than 2^31 elements was triggering integer overflow bugs in CUB and Thrust. Tracking issue: #6228.
* This release contains a series of work-arounds to allow the use of `DeviceQuantileDMatrix` with large data:
- Loop over `copy_if` (#6201)
- Loop over `thrust::reduce` (#6229)
- Implement the inclusive scan algorithm in-house, to handle large offsets (#6234)
### Support slicing of tree models (#6302)
* Accessing the best iteration of a model after the application of early stopping used to be error-prone, as users needed to manually pass the `ntree_limit` argument to the `predict()` function.
* Now we provide a simple interface to slice tree models by specifying a range of boosting rounds. The tree ensemble can be split into multiple sub-ensembles via the slicing interface. Check out [an example](https://xgboost.readthedocs.io/en/release_1.3.0/python/model.html).
* In addition, the early stopping callback now supports `save_best` option. When enabled, XGBoost will save (persist) the model at the best boosting round and discard the trees that were fit subsequent to the best round.
### Weighted subsampling of features (columns) (#5962)
* It is now possible to sample features (columns) via weighted subsampling, in which features with higher weights are more likely to be selected in the sample. Weighted subsampling allows you to encode domain knowledge by emphasizing a particular set of features in the choice of tree splits. In addition, you can prevent particular features from being used in any splits, by assigning them zero weights.
* Check out [the demo](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/feature_weights.py).
### Improved integration with Dask
* Support reverse-proxy environment such as Google Kubernetes Engine (#6343, #6475)
* An XGBoost training job will no longer use all available workers. Instead, it will only use the workers that contain input data (#6343).
* The new callback API works well with the Dask training API.
* The `predict()` and `fit()` function of `DaskXGBClassifier` and `DaskXGBRegressor` now accept a base margin (#6155).
* Support more meta data in the Dask API (#6130, #6132, #6333).
* Allow passing extra keyword arguments as `kwargs` in `predict()` (#6117)
* Fix typo in dask interface: `sample_weights` -> `sample_weight` (#6240)
* Allow empty data matrix in AFT survival, as Dask may produce empty partitions (#6379)
* Speed up prediction by overlapping prediction jobs in all workers (#6412)
### Experimental support for direct splits with categorical features (#6028, #6128, #6137, #6140, #6164, #6165, #6166, #6179, #6194, #6219)
* Currently, XGBoost requires users to one-hot-encode categorical variables. This has adverse performance implications, as the creation of many dummy variables results in higher memory consumption and may require fitting deeper trees to achieve equivalent model accuracy.
* The 1.3.0 release of XGBoost contains an experimental support for direct handling of categorical variables in test nodes. Each test node will have the condition of form `feature_value \in match_set`, where the `match_set` on the right hand side contains one or more matching categories. The matching categories in `match_set` represent the condition for traversing to the right child node. Currently, XGBoost will only generate categorical splits with only a single matching category ("one-vs-rest split"). In a future release, we plan to remove this restriction and produce splits with multiple matching categories in `match_set`.
* The categorical split requires the use of JSON model serialization. The legacy binary serialization method cannot be used to save (persist) models with categorical splits.
* Note. This feature is currently highly experimental. Use it at your own risk. See the detailed list of limitations at [#5949](https://github.com/dmlc/xgboost/pull/5949).
### Experimental plugin for RAPIDS Memory Manager (#5873, #6131, #6146, #6150, #6182)
* RAPIDS Memory Manager library ([rapidsai/rmm](https://github.com/rapidsai/rmm)) provides a collection of efficient memory allocators for NVIDIA GPUs. It is now possible to use XGBoost with memory allocators provided by RMM, by enabling the RMM integration plugin. With this plugin, XGBoost is now able to share a common GPU memory pool with other applications using RMM, such as the RAPIDS data science packages.
* See [the demo](https://github.com/dmlc/xgboost/blob/master/demo/rmm_plugin/README.md) for a working example, as well as directions for building XGBoost with the RMM plugin.
* The plugin will be soon considered non-experimental, once #6297 is resolved.
### Experimental plugin for oneAPI programming model (#5825)
* oneAPI is a programming interface developed by Intel aimed at providing one programming model for many types of hardware such as CPU, GPU, FPGA and other hardware accelerators.
* XGBoost now includes an experimental plugin for using oneAPI for the predictor and objective functions. The plugin is hosted in the directory `plugin/updater_oneapi`.
* Roadmap: #5442
### Pickling the XGBoost model will now trigger JSON serialization (#6027)
* The pickle will now contain the JSON string representation of the XGBoost model, as well as related configuration.
### Performance improvements
* Various performance improvement on multi-core CPUs
- Optimize DMatrix build time by up to 3.7x. (#5877)
- CPU predict performance improvement, by up to 3.6x. (#6127)
- Optimize CPU sketch allreduce for sparse data (#6009)
- Thread local memory allocation for BuildHist, leading to speedup up to 1.7x. (#6358)
- Disable hyperthreading for DMatrix creation (#6386). This speeds up DMatrix creation by up to 2x.
- Simple fix for static schedule in predict (#6357)
* Unify thread configuration, to make it easy to utilize all CPU cores (#6186)
* [jvm-packages] Clean the way deterministic partitioning is computed (#6033)
* Speed up JSON serialization by implementing an intrusive pointer class (#6129). It leads to 1.5x-2x performance boost.
### API additions
* [R] Add SHAP summary plot using ggplot2 (#5882)
* Modin DataFrame can now be used as input (#6055)
* [jvm-packages] Add `getNumFeature` method (#6075)
* Add MAPE metric (#6119)
* Implement GPU predict leaf. (#6187)
* Enable cuDF/cuPy inputs in `XGBClassifier` (#6269)
* Document tree method for feature weights. (#6312)
* Add `fail_on_invalid_gpu_id` parameter, which will cause XGBoost to terminate upon seeing an invalid value of `gpu_id` (#6342)
### Breaking: the default evaluation metric for classification is changed to `logloss` / `mlogloss` (#6183)
* The default metric used to be accuracy, and it is not statistically consistent to perform early stopping with the accuracy metric when we are really optimizing the log loss for the `binary:logistic` objective.
* For statistical consistency, the default metric for classification has been changed to `logloss`. Users may choose to preserve the old behavior by explicitly specifying `eval_metric`.
### Breaking: `skmaker` is now removed (#5971)
* The `skmaker` updater has not been documented nor tested.
### Breaking: the JSON model format no longer stores the leaf child count (#6094).
* The leaf child count field has been deprecated and is not used anywhere in the XGBoost codebase.
### Breaking: XGBoost now requires MacOS 10.14 (Mojave) and later.
* Homebrew has dropped support for MacOS 10.13 (High Sierra), so we are not able to install the OpenMP runtime (`libomp`) from Homebrew on MacOS 10.13. Please use MacOS 10.14 (Mojave) or later.
### Deprecation notices
* The use of `LabelEncoder` in `XGBClassifier` is now deprecated and will be removed in the next minor release (#6269). The deprecation is necessary to support multiple types of inputs, such as cuDF data frames or cuPy arrays.
* The use of certain positional arguments in the Python interface is deprecated (#6365). Users will see deprecation warnings for the use of positional arguments for certain function parameters. New code should use keyword arguments as much as possible. We have not yet decided when we will fully require the use of keyword arguments.
### Bug-fixes
* On big-endian arch, swap the byte order in the binary serializer to enable loading models that were produced by a little-endian machine (#5813).
* [jvm-packages] Fix deterministic partitioning with dataset containing Double.NaN (#5996)
* Limit tree depth for GPU hist to 31 to prevent integer overflow (#6045)
* [jvm-packages] Set `maxBins` to 256 to align with the default value in the C++ code (#6066)
* [R] Fix CRAN check (#6077)
* Add back support for `scipy.sparse.coo_matrix` (#6162)
* Handle duplicated values in sketching. (#6178)
* Catch all standard exceptions in C API. (#6220)
* Fix linear GPU input (#6255)
* Fix inplace prediction interval. (#6259)
* [R] allow `xgb.plot.importance()` calls to fill a grid (#6294)
* Lazy import dask libraries. (#6309)
* Deterministic data partitioning for external memory (#6317)
* Avoid resetting seed for every configuration. (#6349)
* Fix label errors in graph visualization (#6369)
* [jvm-packages] fix potential unit test suites aborted issue due to race condition (#6373)
* [R] Fix warnings from `R check --as-cran` (#6374)
* [R] Fix a crash that occurs with noLD R (#6378)
* [R] Do not convert continuous labels to factors (#6380)
* [R] remove uses of `exists()` (#6387)
* Propagate parameters to the underlying `Booster` handle from `XGBClassifier.set_param` / `XGBRegressor.set_param`. (#6416)
* [R] Fix R package installation via CMake (#6423)
* Enforce row-major order in cuPy array (#6459)
* Fix filtering callable objects in the parameters passed to the scikit-learn API. (#6466)
### Maintenance: Testing, continuous integration, build system
* [CI] Improve JVM test in GitHub Actions (#5930)
* Refactor plotting test so that it can run independently (#6040)
* [CI] Cancel builds on subsequent pushes (#6011)
* Fix Dask Pytest fixture (#6024)
* [CI] Migrate linters to GitHub Actions (#6035)
* [CI] Remove win2016 JVM test from GitHub Actions (#6042)
* Fix CMake build with `BUILD_STATIC_LIB` option (#6090)
* Don't link imported target in CMake (#6093)
* Work around a compiler bug in MacOS AppleClang 11 (#6103)
* [CI] Fix CTest by running it in a correct directory (#6104)
* [R] Check warnings explicitly for model compatibility tests (#6114)
* [jvm-packages] add xgboost4j-gpu/xgboost4j-spark-gpu module to facilitate release (#6136)
* [CI] Time GPU tests. (#6141)
* [R] remove warning in configure.ac (#6152)
* [CI] Upgrade cuDF and RMM to 0.16 nightlies; upgrade to Ubuntu 18.04 (#6157)
* [CI] Test C API demo (#6159)
* Option for generating device debug info. (#6168)
* Update `.gitignore` (#6175, #6193, #6346)
* Hide C++ symbols from dmlc-core (#6188)
* [CI] Added arm64 job in Travis-CI (#6200)
* [CI] Fix Docker build for CUDA 11 (#6202)
* [CI] Move non-OpenMP gtest to GitHub Actions (#6210)
* [jvm-packages] Fix up build for xgboost4j-gpu, xgboost4j-spark-gpu (#6216)
* Add more tests for categorical data support (#6219)
* [dask] Test for data initialization. (#6226)
* Bump junit from 4.11 to 4.13.1 in /jvm-packages/xgboost4j (#6230)
* Bump junit from 4.11 to 4.13.1 in /jvm-packages/xgboost4j-gpu (#6233)
* [CI] Reduce testing load with RMM (#6249)
* [CI] Build a Python wheel for aarch64 platform (#6253)
* [CI] Time the CPU tests on Jenkins. (#6257)
* [CI] Skip Dask tests on ARM. (#6267)
* Fix a typo in `is_arm()` in testing.py (#6271)
* [CI] replace `egrep` with `grep -E` (#6287)
* Support unity build. (#6295)
* [CI] Mark flaky tests as XFAIL (#6299)
* [CI] Use separate Docker cache for each CUDA version (#6305)
* Added `USE_NCCL_LIB_PATH` option to enable user to set `NCCL_LIBRARY` during build (#6310)
* Fix flaky data initialization test. (#6318)
* Add a badge for GitHub Actions (#6321)
* Optional `find_package` for sanitizers. (#6329)
* Use pytest conventions consistently in Python tests (#6337)
* Fix missing space in warning message (#6340)
* Update `custom_metric_obj.rst` (#6367)
* [CI] Run R check with `--as-cran` flag on GitHub Actions (#6371)
* [CI] Remove R check from Jenkins (#6372)
* Mark GPU external memory test as XFAIL. (#6381)
* [CI] Add noLD R test (#6382)
* Fix MPI build. (#6403)
* [CI] Upgrade to MacOS Mojave image (#6406)
* Fix flaky sparse page dmatrix test. (#6417)
* [CI] Upgrade cuDF and RMM to 0.17 nightlies (#6434)
* [CI] Fix CentOS 6 Docker images (#6467)
* [CI] Vendor libgomp in the manylinux Python wheel (#6461)
* [CI] Hot fix for libgomp vendoring (#6482)
### Maintenance: Clean up and merge the Rabit submodule (#6023, #6095, #6096, #6105, #6110, #6262, #6275, #6290)
* The Rabit submodule is now maintained as part of the XGBoost codebase.
* Tests for Rabit are now part of the test suites of XGBoost.
* Rabit can now be built on the Windows platform.
* We made various code re-formatting for the C++ code with clang-tidy.
* Public headers of XGBoost no longer depend on Rabit headers.
* Unused CMake targets for Rabit were removed.
* Single-point model recovery has been dropped and removed from Rabit, simplifying the Rabit code greatly. The single-point model recovery feature has not been adequately maintained over the years.
* We removed the parts of Rabit that were not useful for XGBoost.
### Maintenance: Refactor code for legibility and maintainability
* Unify CPU hist sketching (#5880)
* [R] fix uses of 1:length(x) and other small things (#5992)
* Unify evaluation functions. (#6037)
* Make binary bin search reusable. (#6058)
* Unify set index data. (#6062)
* [R] Remove `stringi` dependency (#6109)
* Merge extract cuts into QuantileContainer. (#6125)
* Reduce C++ compiler warnings (#6197, #6198, #6213, #6286, #6325)
* Cleanup Python code. (#6223)
* Small cleanup to evaluator. (#6400)
### Usability Improvements, Documentation
* [jvm-packages] add example to handle missing value other than 0 (#5677)
* Add DMatrix usage examples to the C API demo (#5854)
* List `DaskDeviceQuantileDMatrix` in the doc. (#5975)
* Update Python custom objective demo. (#5981)
* Update the JSON model schema to document more objective functions. (#5982)
* [Python] Fix warning when `missing` field is not used. (#5969)
* Fix typo in tracker logging (#5994)
* Move a warning about empty dataset, so that it's shown for all objectives and metrics (#5998)
* Fix the instructions for installing the nightly build. (#6004)
* [Doc] Add dtreeviz as a showcase example of integration with 3rd-party software (#6013)
* [jvm-packages] [doc] Update install doc for JVM packages (#6051)
* Fix typo in `xgboost.callback.early_stop` docstring (#6071)
* Add cache suffix to the files used in the external memory demo. (#6088)
* [Doc] Document the parameter `kill_spark_context_on_worker_failure` (#6097)
* Fix link to the demo for custom objectives (#6100)
* Update Dask doc. (#6108)
* Validate weights are positive values. (#6115)
* Document the updated CMake version requirement. (#6123)
* Add demo for `DaskDeviceQuantileDMatrix`. (#6156)
* Cosmetic fixes in `faq.rst` (#6161)
* Fix error message. (#6176)
* [Doc] Add list of winning solutions in data science competitions using XGBoost (#6177)
* Fix a comment in demo to use correct reference (#6190)
* Update the list of winning solutions using XGBoost (#6192)
* Consistent style for build status badge (#6203)
* [Doc] Add info on GPU compiler (#6204)
* Update the list of winning solutions (#6222, #6254)
* Add link to XGBoost's Twitter handle (#6244)
* Fix minor typos in XGBClassifier methods' docstrings (#6247)
* Add sponsors link to FUNDING.yml (#6252)
* Group CLI demo into subdirectory. (#6258)
* Reduce warning messages from `gbtree`. (#6273)
* Create a tutorial for using the C API in a C/C++ application (#6285)
* Update plugin instructions for CMake build (#6289)
* [doc] make Dask distributed example copy-pastable (#6345)
* [Python] Add option to use `libxgboost.so` from the system path (#6362)
* Fixed few grammatical mistakes in doc (#6393)
* Fix broken link in CLI doc (#6396)
* Improve documentation for the Dask API (#6413)
* Revise misleading exception information: no such param of `allow_non_zero_missing` (#6418)
* Fix CLI ranking demo. (#6439)
* Fix broken links. (#6455)
### Acknowledgement
**Contributors**: Nan Zhu (@CodingCat), @FelixYBW, Jack Dunn (@JackDunnNZ), Jean Lescut-Muller (@JeanLescut), Boris Feld (@Lothiraldan), Nikhil Choudhary (@Nikhil1O1), Rory Mitchell (@RAMitchell), @ShvetsKS, Anthony D'Amato (@Totoketchup), @Wittty-Panda, neko (@akiyamaneko), Alexander Gugel (@alexanderGugel), @dependabot[bot], DIVYA CHAUHAN (@divya661), Daniel Steinberg (@dstein64), Akira Funahashi (@funasoul), Philip Hyunsu Cho (@hcho3), Tong He (@hetong007), Hristo Iliev (@hiliev), Honza Sterba (@honzasterba), @hzy001, Igor Moura (@igormp), @jameskrach, James Lamb (@jameslamb), Naveed Ahmed Saleem Janvekar (@janvekarnaveed), Kyle Nicholson (@kylejn27), lacrosse91 (@lacrosse91), Christian Lorentzen (@lorentzenchr), Manikya Bardhan (@manikyabard), @nabokovas, John Quitto-Graham (@nvidia-johnq), @odidev, Qi Zhang (@qzhang90), Sergio Gavilán (@sgavil), Tanuja Kirthi Doddapaneni (@tanuja3), Cuong Duong (@tcuongd), Yuan Tang (@terrytangyuan), Jiaming Yuan (@trivialfis), vcarpani (@vcarpani), Vladislav Epifanov (@vepifanov), Vitalie Spinu (@vspinu), Bobby Wang (@wbo4958), Zeno Gantner (@zenogantner), zhang_jf (@zuston)
**Reviewers**: Nan Zhu (@CodingCat), John Zedlewski (@JohnZed), Rory Mitchell (@RAMitchell), @ShvetsKS, Egor Smirnov (@SmirnovEgorRu), Anthony D'Amato (@Totoketchup), @Wittty-Panda, Alexander Gugel (@alexanderGugel), Codecov Comments Bot (@codecov-commenter), Codecov (@codecov-io), DIVYA CHAUHAN (@divya661), Devin Robison (@drobison00), Geoffrey Blake (@geoffreyblake), Mark Harris (@harrism), Philip Hyunsu Cho (@hcho3), Honza Sterba (@honzasterba), Igor Moura (@igormp), @jakirkham, @jameskrach, James Lamb (@jameslamb), Janakarajan Natarajan (@janaknat), Jake Hemstad (@jrhemstad), Keith Kraus (@kkraus14), Kyle Nicholson (@kylejn27), Christian Lorentzen (@lorentzenchr), Michael Mayer (@mayer79), Nikolay Petrov (@napetrov), @odidev, PSEUDOTENSOR / Jonathan McKinney (@pseudotensor), Qi Zhang (@qzhang90), Sergio Gavilán (@sgavil), Scott Lundberg (@slundberg), Cuong Duong (@tcuongd), Yuan Tang (@terrytangyuan), Jiaming Yuan (@trivialfis), vcarpani (@vcarpani), Vladislav Epifanov (@vepifanov), Vincent Nijs (@vnijs), Vitalie Spinu (@vspinu), Bobby Wang (@wbo4958), William Hicks (@wphicks)
## v1.2.0 (2020.08.22)
### XGBoost4J-Spark now supports the GPU algorithm (#5171)
* Now XGBoost4J-Spark is able to leverage NVIDIA GPU hardware to speed up training.
* There is on-going work for accelerating the rest of the data pipeline with NVIDIA GPUs (#5950, #5972).
### XGBoost now supports CUDA 11 (#5808)
* It is now possible to build XGBoost with CUDA 11. Note that we do not yet distribute pre-built binaries built with CUDA 11; all current distributions use CUDA 10.0.
### Better guidance for persisting XGBoost models in an R environment (#5940, #5964)
* Users are strongly encouraged to use `xgb.save()` and `xgb.save.raw()` instead of `saveRDS()`. This is so that the persisted models can be accessed with future releases of XGBoost.
* The previous release (1.1.0) had problems loading models that were saved with `saveRDS()`. This release adds a compatibility layer to restore access to the old RDS files. Note that this is meant to be a temporary measure; users are advised to stop using `saveRDS()` and migrate to `xgb.save()` and `xgb.save.raw()`.
### New objectives and metrics
* The pseudo-Huber loss `reg:pseudohubererror` is added (#5647). The corresponding metric is `mphe`. Right now, the slope is hard-coded to 1.
* The Accelerated Failure Time objective for survival analysis (`survival:aft`) is now accelerated on GPUs (#5714, #5716). The survival metrics `aft-nloglik` and `interval-regression-accuracy` are also accelerated on GPUs.
### Improved integration with scikit-learn
* Added `n_features_in_` attribute to the scikit-learn interface to store the number of features used (#5780). This is useful for integrating with some scikit-learn features such as `StackingClassifier`. See [this link](https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html) for more details.
* `XGBoostError` now inherits `ValueError`, which conforms to scikit-learn's exception requirement (#5696).
### Improved integration with Dask
* The XGBoost Dask API now exposes an asynchronous interface (#5862). See [the document](https://xgboost.readthedocs.io/en/latest/tutorials/dask.html#working-with-asyncio) for details.
* Zero-copy ingestion of GPU arrays via `DaskDeviceQuantileDMatrix` (#5623, #5799, #5800, #5803, #5837, #5874, #5901): Previously, the Dask interface had to make 2 data copies: one for concatenating the Dask partition/block into a single block and another for internal representation. To save memory, we introduce `DaskDeviceQuantileDMatrix`. As long as Dask partitions are resident in the GPU memory, `DaskDeviceQuantileDMatrix` is able to ingest them directly without making copies. This matrix type wraps `DeviceQuantileDMatrix`.
* The prediction function now returns GPU Series type if the input is from Dask-cuDF (#5710). This is to preserve the input data type.
### Robust handling of external data types (#5689, #5893)
- As we support more and more external data types, the handling logic has proliferated all over the code base and became hard to keep track of. It also became unclear how missing values and threads are handled. We refactored the Python package code to collect all data handling logic to a central location, and now we have an explicit list of all supported data types.
### Improvements in GPU-side data matrix (`DeviceQuantileDMatrix`)
* The GPU-side data matrix now implements its own quantile sketching logic, so that data don't have to be transported back to the main memory (#5700, #5747, #5760, #5846, #5870, #5898). The GK sketching algorithm is also now better documented.
- Now we can load extremely sparse dataset like URL, although performance is still sub-optimal.
* The GPU-side data matrix now exposes an iterative interface (#5783), so that users are able to construct a matrix from a data iterator. See the [Python demo](https://github.com/dmlc/xgboost/blob/release_1.2.0/demo/guide-python/data_iterator.py).
### New language binding: Swift (#5728)
* Visit https://github.com/kongzii/SwiftXGBoost for more details.
### Robust model serialization with JSON (#5772, #5804, #5831, #5857, #5934)
* We continue efforts from the 1.0.0 release to adopt JSON as the format to save and load models robustly.
* JSON model IO is significantly faster and produces smaller model files.
* Round-trip reproducibility is guaranteed, via the introduction of an efficient float-to-string conversion algorithm known as [the Ryū algorithm](https://dl.acm.org/doi/10.1145/3192366.3192369). The conversion is locale-independent, producing consistent numeric representation regardless of the locale setting of the user's machine.
* We fixed an issue in loading large JSON files to memory.
* It is now possible to load a JSON file from a remote source such as S3.
### Performance improvements
* CPU hist tree method optimization
- Skip missing lookup in hist row partitioning if data is dense. (#5644)
- Specialize training procedures for CPU hist tree method on distributed environment. (#5557)
- Add single point histogram for CPU hist. Previously gradient histogram for CPU hist is hard coded to be 64 bit, now users can specify the parameter `single_precision_histogram` to use 32 bit histogram instead for faster training performance. (#5624, #5811)
* GPU hist tree method optimization
- Removed some unnecessary synchronizations and better memory allocation pattern. (#5707)
- Optimize GPU Hist for wide dataset. Previously for wide dataset the atomic operation is performed on global memory, now it can run on shared memory for faster histogram building. But there's a known small regression on GeForce cards with dense data. (#5795, #5926, #5948, #5631)
### API additions
* Support passing fmap to importance plot (#5719). Now importance plot can show actual names of features instead of default ones.
* Support 64bit seed. (#5643)
* A new C API `XGBoosterGetNumFeature` is added for getting number of features in booster (#5856).
* Feature names and feature types are now stored in C++ core and saved in binary DMatrix (#5858).
### Breaking: The `predict()` method of `DaskXGBClassifier` now produces class predictions (#5986). Use `predict_proba()` to obtain probability predictions.
* Previously, `DaskXGBClassifier.predict()` produced probability predictions. This is inconsistent with the behavior of other scikit-learn classifiers, where `predict()` returns class predictions. We make a breaking change in 1.2.0 release so that `DaskXGBClassifier.predict()` now correctly produces class predictions and thus behaves like other scikit-learn classifiers. Furthermore, we introduce the `predict_proba()` method for obtaining probability predictions, again to be in line with other scikit-learn classifiers.
### Breaking: Custom evaluation metric now receives raw prediction (#5954)
* Previously, the custom evaluation metric received a transformed prediction result when used with a classifier. Now the custom metric will receive a raw (untransformed) prediction and will need to transform the prediction itself. See [demo/guide-python/custom\_softmax.py](https://github.com/dmlc/xgboost/blob/release_1.2.0/demo/guide-python/custom_softmax.py) for an example.
* This change is to make the custom metric behave consistently with the custom objective, which already receives raw prediction (#5564).
### Breaking: XGBoost4J-Spark now requires Spark 3.0 and Scala 2.12 (#5836, #5890)
* Starting with version 3.0, Spark can manage GPU resources and allocate them among executors.
* Spark 3.0 dropped support for Scala 2.11 and now only supports Scala 2.12. Thus, XGBoost4J-Spark also only supports Scala 2.12.
### Breaking: XGBoost Python package now requires Python 3.6 and later (#5715)
* Python 3.6 has many useful features such as f-strings.
### Breaking: XGBoost now adopts the C++14 standard (#5664)
* Make sure to use a sufficiently modern C++ compiler that supports C++14, such as Visual Studio 2017, GCC 5.0+, and Clang 3.4+.
### Bug-fixes
* Fix a data race in the prediction function (#5853). As a byproduct, the prediction function now uses a thread-local data store and became thread-safe.
* Restore capability to run prediction when the test input has fewer features than the training data (#5955). This capability is necessary to support predicting with LIBSVM inputs. The previous release (1.1) had broken this capability, so we restore it in this version with better tests.
* Fix OpenMP build with CMake for R package, to support CMake 3.13 (#5895).
* Fix Windows 2016 build (#5902, #5918).
* Fix edge cases in scikit-learn interface with Pandas input by disabling feature validation. (#5953)
* [R] Enable weighted learning to rank (#5945)
* [R] Fix early stopping with custom objective (#5923)
* Fix NDK Build (#5886)
* Add missing explicit template specializations for greater portability (#5921)
* Handle empty rows in data iterators correctly (#5929). This bug affects file loader and JVM data frames.
* Fix `IsDense` (#5702)
* [jvm-packages] Fix wrong method name `setAllowZeroForMissingValue` (#5740)
* Fix shape inference for Dask predict (#5989)
### Usability Improvements, Documentation
* [Doc] Document that CUDA 10.0 is required (#5872)
* Refactored command line interface (CLI). The CLI is now able to handle user errors and output basic documentation. (#5574)
* Better error handling in Python: use `raise from` syntax to preserve full stacktrace (#5787).
* The JSON model dump now has a formal schema (#5660, #5818). The benefit is to prevent `dump_model()` function from breaking. See [this document](https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html#difference-between-saving-model-and-dumping-model) to understand the difference between saving and dumping models.
* Add a reference to the GPU external memory paper (#5684)
* Document more objective parameters in the R package (#5682)
* Document the existence of pre-built binary wheels for MacOS (#5711)
* Remove `max.depth` in the R gblinear example. (#5753)
* Added conda environment file for building docs (#5773)
* Mention dask blog post in the doc, which introduces using Dask with GPU and some internal workings. (#5789)
* Fix rendering of Markdown docs (#5821)
* Document new objectives and metrics available on GPUs (#5909)
* Better message when no GPU is found. (#5594)
* Remove the use of `silent` parameter from R demos. (#5675)
* Don't use masked array in array interface. (#5730)
* Update affiliation of @terrytangyuan: Ant Financial -> Ant Group (#5827)
* Move dask tutorial closer to other distributed tutorials (#5613)
* Update XGBoost + Dask overview documentation (#5961)
* Show `n_estimators` in the docstring of the scikit-learn interface (#6041)
* Fix a typo in a docstring of the scikit-learn interface (#5980)
### Maintenance: testing, continuous integration, build system
* [CI] Remove CUDA 9.0 from CI (#5674, #5745)
* Require CUDA 10.0+ in CMake build (#5718)
* [R] Remove dependency on gendef for Visual Studio builds (fixes #5608) (#5764). This enables building XGBoost with GPU support with R 4.x.
* [R-package] Reduce duplication in configure.ac (#5693)
* Bump com.esotericsoftware to 4.0.2 (#5690)
* Migrate some tests from AppVeyor to GitHub Actions to speed up the tests. (#5911, #5917, #5919, #5922, #5928)
* Reduce cost of the Jenkins CI server (#5884, #5904, #5892). We now enforce a daily budget via an automated monitor. We also dramatically reduced the workload for the Windows platform, since the cloud VM cost is vastly greater for Windows.
* [R] Set up automated R linter (#5944)
* [R] replace uses of T and F with TRUE and FALSE (#5778)
* Update Docker container 'CPU' (#5956)
* Simplify CMake build with modern CMake techniques (#5871)
* Use `hypothesis` package for testing (#5759, #5835, #5849).
* Define `_CRT_SECURE_NO_WARNINGS` to remove unneeded warnings in MSVC (#5434)
* Run all Python demos in CI, to ensure that they don't break (#5651)
* Enhance nvtx support (#5636). Now we can use unified timer between CPU and GPU. Also CMake is able to find nvtx automatically.
* Speed up python test. (#5752)
* Add helper for generating batches of data. (#5756)
* Add c-api-demo to .gitignore (#5855)
* Add option to enable all compiler warnings in GCC/Clang (#5897)
* Make Python model compatibility test runnable locally (#5941)
* Add cupy to Windows CI (#5797)
* [CI] Fix cuDF install; merge 'gpu' and 'cudf' test suite (#5814)
* Update rabit submodule (#5680, #5876)
* Force colored output for Ninja build. (#5959)
* [CI] Assign larger /dev/shm to NCCL (#5966)
* Add missing Pytest marks to AsyncIO unit test (#5968)
* [CI] Use latest cuDF and dask-cudf (#6048)
* Add CMake flag to log C API invocations, to aid debugging (#5925)
* Fix a unit test on CLI, to handle RC versions (#6050)
* [CI] Use mgpu machine to run gpu hist unit tests (#6050)
* [CI] Build GPU-enabled JAR artifact and deploy to xgboost-maven-repo (#6050)
### Maintenance: Refactor code for legibility and maintainability
* Remove dead code in DMatrix initialization. (#5635)
* Catch dmlc error by ref. (#5678)
* Refactor the `gpu_hist` split evaluation in preparation for batched nodes enumeration. (#5610)
* Remove column major specialization. (#5755)
* Remove unused imports in Python (#5776)
* Avoid including `c_api.h` in header files. (#5782)
* Remove unweighted GK quantile, which is unused. (#5816)
* Add Python binding for rabit ops. (#5743)
* Implement `Empty` method for host device vector. (#5781)
* Remove print (#5867)
* Enforce tree order in JSON (#5974)
### Acknowledgement
**Contributors**: Nan Zhu (@CodingCat), @LionOrCatThatIsTheQuestion, Dmitry Mottl (@Mottl), Rory Mitchell (@RAMitchell), @ShvetsKS, Alex Wozniakowski (@a-wozniakowski), Alexander Gugel (@alexanderGugel), @anttisaukko, @boxdot, Andy Adinets (@canonizer), Ram Rachum (@cool-RR), Elliot Hershberg (@elliothershberg), Jason E. Aten, Ph.D. (@glycerine), Philip Hyunsu Cho (@hcho3), @jameskrach, James Lamb (@jameslamb), James Bourbeau (@jrbourbeau), Peter Jung (@kongzii), Lorenz Walthert (@lorenzwalthert), Oleksandr Kuvshynov (@okuvshynov), Rong Ou (@rongou), Shaochen Shi (@shishaochen), Yuan Tang (@terrytangyuan), Jiaming Yuan (@trivialfis), Bobby Wang (@wbo4958), Zhang Zhang (@zhangzhang10)
**Reviewers**: Nan Zhu (@CodingCat), @LionOrCatThatIsTheQuestion, Hao Yang (@QuantHao), Rory Mitchell (@RAMitchell), @ShvetsKS, Egor Smirnov (@SmirnovEgorRu), Alex Wozniakowski (@a-wozniakowski), Amit Kumar (@aktech), Avinash Barnwal (@avinashbarnwal), @boxdot, Andy Adinets (@canonizer), Chandra Shekhar Reddy (@chandrureddy), Ram Rachum (@cool-RR), Cristiano Goncalves (@cristianogoncalves), Elliot Hershberg (@elliothershberg), Jason E. Aten, Ph.D. (@glycerine), Philip Hyunsu Cho (@hcho3), Tong He (@hetong007), James Lamb (@jameslamb), James Bourbeau (@jrbourbeau), Lee Drake (@leedrake5), DougM (@mengdong), Oleksandr Kuvshynov (@okuvshynov), RongOu (@rongou), Shaochen Shi (@shishaochen), Xu Xiao (@sperlingxx), Yuan Tang (@terrytangyuan), Theodore Vasiloudis (@thvasilo), Jiaming Yuan (@trivialfis), Bobby Wang (@wbo4958), Zhang Zhang (@zhangzhang10)
## v1.1.1 (2020.06.06)
This patch release applies the following patches to 1.1.0 release:
* CPU performance improvement in the PyPI wheels (#5720)
* Fix loading old model (#5724)
* Install pkg-config file (#5744)
## v1.1.0 (2020.05.17)
### Better performance on multi-core CPUs (#5244, #5334, #5522)
* Poor performance scaling of the `hist` algorithm for multi-core CPUs has been under investigation (#3810). #5244 concludes the ongoing effort to improve performance scaling on multi-CPUs, in particular Intel CPUs. Roadmap: #5104
* #5334 makes steps toward reducing memory consumption for the `hist` tree method on CPU.
* #5522 optimizes random number generation for data sampling.
### Deterministic GPU algorithm for regression and classification (#5361)
* GPU algorithm for regression and classification tasks is now deterministic.
* Roadmap: #5023. Currently only single-GPU training is deterministic. Distributed training with multiple GPUs is not yet deterministic.
### Improve external memory support on GPUs (#5093, #5365)
* Starting from 1.0.0 release, we added support for external memory on GPUs to enable training with larger datasets. Gradient-based sampling (#5093) speeds up the external memory algorithm by intelligently sampling a subset of the training data to copy into the GPU memory. [Learn more about out-of-core GPU gradient boosting.](https://arxiv.org/abs/2005.09148)
* GPU-side data sketching now works with data from external memory (#5365).
### Parameter validation: detection of unused or incorrect parameters (#5477, #5569, #5508)
* Misspelling a training parameter is a common user mistake. In previous versions of XGBoost, misspelled parameters were silently ignored. Starting with the 1.0.0 release, XGBoost will produce a warning message if there are any unused training parameters. The 1.1.0 release makes parameter validation available to the scikit-learn interface (#5477) and the R binding (#5569).
### Thread-safe, in-place prediction method (#5389, #5512)
* Previously, the prediction method was not thread-safe (#5339). This release adds a new API function `inplace_predict()` that is thread-safe. It is now possible to serve concurrent requests for prediction using a shared model object.
* It is now possible to compute prediction in-place for selected data formats (`numpy.ndarray` / `scipy.sparse.csr_matrix` / `cupy.ndarray` / `cudf.DataFrame` / `pd.DataFrame`) without creating a `DMatrix` object.
### Addition of Accelerated Failure Time objective for survival analysis (#4763, #5473, #5486, #5552, #5553)
* Survival analysis (regression) models the time it takes for an event of interest to occur. The target label is potentially censored, i.e. the label is a range rather than a single number. We added a new objective `survival:aft` to support survival analysis. Also added is the new API to specify the ranged labels. Check out [the tutorial](https://xgboost.readthedocs.io/en/release_1.1.0/tutorials/aft_survival_analysis.html) and the [demos](https://github.com/dmlc/xgboost/tree/release_1.1.0/demo/aft_survival).
* GPU support is work in progress (#5714).
### Improved installation experience on Mac OSX (#5597, #5602, #5606, #5701)
* It only takes two commands to install the XGBoost Python package: `brew install libomp` followed by `pip install xgboost`. The installed XGBoost will use all CPU cores. Even better, starting with this release, we distribute pre-compiled binary wheels targeting Mac OSX. Now the install command `pip install xgboost` finishes instantly, as it no longer compiles the C++ source of XGBoost. The last three Mac versions (High Sierra, Mojave, Catalina) are supported.
* R package: the 1.1.0 release fixes the error `Initializing libomp.dylib, but found libomp.dylib already initialized` (#5701)
### Ranking metrics are now accelerated on GPUs (#5380, #5387, #5398)
### GPU-side data matrix to ingest data directly from other GPU libraries (#5420, #5465)
* Previously, data on GPU memory had to be copied back to the main memory before it could be used by XGBoost. Starting with 1.1.0 release, XGBoost provides a dedicated interface (`DeviceQuantileDMatrix`) so that it can ingest data from GPU memory directly. The result is that XGBoost interoperates better with GPU-accelerated data science libraries, such as cuDF, cuPy, and PyTorch.
* Set device in device dmatrix. (#5596)
### Robust model serialization with JSON (#5123, #5217)
* We continue efforts from the 1.0.0 release to adopt JSON as the format to save and load models robustly. Refer to the release note for 1.0.0 to learn more.
* It is now possible to store internal configuration of the trained model (`Booster`) object in R as a JSON string (#5123, #5217).
### Improved integration with Dask
* Pass through `verbose` parameter for dask fit (#5413)
* Use `DMLC_TASK_ID`. (#5415)
* Order the prediction result. (#5416)
* Honor `nthreads` from dask worker. (#5414)
* Enable grid searching with scikit-learn. (#5417)
* Check non-equal when setting threads. (#5421)
* Accept other inputs for prediction. (#5428)
* Fix missing value for scikit-learn interface. (#5435)
### XGBoost4J-Spark: Check number of columns in the data iterator (#5202, #5303)
* Before, the native layer in XGBoost did not know the number of columns (features) ahead of time and had to guess the number of columns by counting the feature index when ingesting data. This method has a failure mode in the distributed setting: if the training data is highly sparse, some features may be completely missing in one or more worker partitions. Thus, one or more workers may deduce an incorrect data shape, leading to crashes or silently wrong models.
* Enforce correct data shape by passing the number of columns explicitly from the JVM layer into the native layer.
### Major refactoring of the `DMatrix` class
* Continued from 1.0.0 release.
* Remove update prediction cache from predictors. (#5312)
* Predict on Ellpack. (#5327)
* Partial rewrite EllpackPage (#5352)
* Use ellpack for prediction only when sparsepage doesn't exist. (#5504)
* RFC: #4354, Roadmap: #5143
### Breaking: XGBoost Python package now requires Pip 19.0 and higher (#5589)
* Your Linux machine may have an old version of Pip and may attempt to install a source package, leading to long installation time. This is because we are now using `manylinux2010` tag in the binary wheel release. Ensure you have Pip 19.0 or newer by running `python3 -m pip -V` to check the version. Upgrade Pip with command
```
python3 -m pip install --upgrade pip
```
Upgrading to latest pip allows us to depend on newer versions of system libraries. [TensorFlow](https://www.tensorflow.org/install/pip) also requires Pip 19.0+.
### Breaking: GPU algorithm now requires CUDA 10.0 and higher (#5649)
* CUDA 10.0 is necessary to make the GPU algorithm deterministic (#5361).
### Breaking: `silent` parameter is now removed (#5476)
* Please use `verbosity` instead.
### Breaking: Set `output_margin` to True for custom objectives (#5564)
* Now both R and Python interface custom objectives get un-transformed (raw) prediction outputs.
### Breaking: `Makefile` is now removed. We use CMake exclusively to build XGBoost (#5513)
* Exception: the R package uses Autotools, as the CRAN ecosystem has not yet widely adopted CMake.
### Breaking: `distcol` updater is now removed (#5507)
* The `distcol` updater has long been broken, and we currently lack the resources to write a working implementation from scratch.
### Deprecation notices
* **Python 3.5**. This release is the last release to support Python 3.5. The following release (1.2.0) will require Python 3.6.
* **Scala 2.11**. Currently XGBoost4J supports Scala 2.11. However, if a future release of XGBoost adopts Spark 3, it will not support Scala 2.11, as Spark 3 requires Scala 2.12+. We do not yet know which XGBoost release will adopt Spark 3.
### Known limitations
* (Python package) When early stopping is activated with `early_stopping_rounds` at training time, the prediction method (`xgb.predict()`) behaves in a surprising way. If XGBoost runs for M rounds and chooses iteration N (N < M) as the best iteration, then the prediction method will use M trees by default. To use the best iteration (N trees), users will need to manually take the best iteration field `bst.best_iteration` and pass it as the `ntree_limit` argument to `xgb.predict()`. See #5209 and #4052 for additional context.
* GPU ranking objective is currently not deterministic (#5561).
* When training parameter `reg_lambda` is set to zero, some leaf nodes may be assigned a NaN value. (See [discussion](https://discuss.xgboost.ai/t/still-getting-unexplained-nans-new-replication-code/1383/9).) For now, please set `reg_lambda` to a nonzero value.
### Community and Governance
* The XGBoost Project Management Committee (PMC) is pleased to announce a new committer: Egor Smirnov (@SmirnovEgorRu). He has led a major initiative to improve the performance of XGBoost on multi-core CPUs.
### Bug-fixes
* Improved compatibility with scikit-learn (#5255, #5505, #5538)
* Remove f-string, since it's not supported by Python 3.5 (#5330). Note that Python 3.5 support is deprecated and scheduled to be dropped in the upcoming release (1.2.0).
* Fix the pruner so that it doesn't prune the same branch twice (#5335)
* Enforce only major version in JSON model schema (#5336). Any major revision of the model schema would bump up the major version.
* Fix a small typo in sklearn.py that broke multiple eval metrics (#5341)
* Restore loading model from a memory buffer (#5360)
* Define lazy isinstance for Python compat (#5364)
* [R] fixed uses of `class()` (#5426)
* Force compressed buffer to be 4 bytes aligned, to keep cuda-memcheck happy (#5441)
* Remove warning for calling host function (`std::max`) on a GPU device (#5453)
* Fix uninitialized value bug in xgboost callback (#5463)
* Fix model dump in CLI (#5485)
* Fix out-of-bound array access in `WQSummary::SetPrune()` (#5493)
* Ensure that configured `dmlc/build_config.h` is picked up by Rabit and XGBoost, to fix build on Alpine (#5514)
* Fix a misspelled method, made in a git merge (#5509)
* Fix a bug in binary model serialization (#5532)
* Fix CLI model IO (#5535)
* Don't use `uint` for threads (#5542)
* Fix R interaction constraints to handle more than 100000 features (#5543)
* [jvm-packages] XGBoost Spark should deal with NaN when parsing evaluation output (#5546)
* GPU-side data sketching is now aware of query groups in learning-to-rank data (#5551)
* Fix DMatrix slicing for newly added fields (#5552)
* Fix configuration status with loading binary model (#5562)
* Fix build when OpenMP is disabled (#5566)
* R compatibility patches (#5577, #5600)
* gpu\_hist performance fixes (#5558)
* Don't set seed on CLI interface (#5563)
* [R] When serializing model, preserve model attributes related to early stopping (#5573)
* Avoid rabit calls in learner configuration (#5581)
* Hide C++ symbols in libxgboost.so when building Python wheel (#5590). This fixes apache/incubator-tvm#4953.
* Fix compilation on Mac OSX High Sierra (10.13) (#5597)
* Fix build on big endian CPUs (#5617)
* Resolve crash due to use of `vector<bool>::iterator` (#5642)
* Validate JSON model dump using JSON schema (#5660)
### Performance improvements
* Wide dataset quantile performance improvement (#5306)
* Reduce memory usage of GPU-side data sketching (#5407)
* Reduce span check overhead (#5464)
* Serialise booster after training to free up GPU memory (#5484)
* Use the maximum amount of GPU shared memory available to speed up the histogram kernel (#5491)
* Use non-synchronising scan in Thrust (#5560)
* Use `cudaDeviceGetAttribute()` instead of `cudaGetDeviceProperties()` for speed (#5570)
### API changes
* Support importing data from a Pandas SparseArray (#5431)
* `HostDeviceVector` (vector shared between CPU and GPU memory) now exposes `HostSpan` interface, to enable access on the CPU side with bound check (#5459)
* Accept other gradient types for `SplitEntry` (#5467)
### Usability Improvements, Documentation
* Add `JVM_CHECK_CALL` to prevent C++ exceptions from leaking into the JVM layer (#5199)
* Updated Windows build docs (#5283)
* Update affiliation of @hcho3 (#5292)
* Display Sponsor button, link to OpenCollective (#5325)
* Update docs for GPU external memory (#5332)
* Add link to GPU documentation (#5437)
* Small updates to GPU documentation (#5483)
* Edits on tutorial for XGBoost job on Kubernetes (#5487)
* Add reference to GPU external memory (#5490)
* Fix typos (#5346, #5371, #5384, #5399, #5482, #5515)
* Update Python doc (#5517)
* Add Neptune and Optuna to list of examples (#5528)
* Raise error if the number of data weights doesn't match the number of data sets (#5540)
* Add a note about GPU ranking (#5572)
* Clarify meaning of `training` parameter in the C API function `XGBoosterPredict()` (#5604)
* Better error handling for situations where existing trees cannot be modified (#5406, #5418). This feature is enabled when `process_type` is set to `update`.
### Maintenance: testing, continuous integration, build system
* Add C++ test coverage for data sketching (#5251)
* Ignore gdb\_history (#5257)
* Rewrite setup.py. (#5271, #5280)
* Use `scikit-learn` in extra dependencies (#5310)
* Add CMake option to build static library (#5397)
* [R] changed FindLibR to take advantage of CMake cache (#5427)
* [R] fixed inconsistency in R -e calls in FindLibR.cmake (#5438)
* Refactor tests with data generator (#5439)
* Resolve failing Travis CI (#5445)
* Update dmlc-core. (#5466)
* [CI] Use clang-tidy 10 (#5469)
* De-duplicate code for checking maximum number of nodes (#5497)
* [CI] Use Ubuntu 18.04 LTS in JVM CI, because 19.04 is EOL (#5537)
* [jvm-packages] [CI] Create a Maven repository to host SNAPSHOT JARs (#5533)
* [jvm-packages] [CI] Publish XGBoost4J JARs with Scala 2.11 and 2.12 (#5539)
* [CI] Use Vault repository to re-gain access to devtoolset-4 (#5589)
### Maintenance: Refactor code for legibility and maintainability
* Move prediction cache to Learner (#5220, #5302)
* Remove SimpleCSRSource (#5315)
* Refactor SparsePageSource, delete cache files after use (#5321)
* Remove unnecessary DMatrix methods (#5324)
* Split up `LearnerImpl` (#5350)
* Move segment sorter to common (#5378)
* Move thread local entry into Learner (#5396)
* Split up test helpers header (#5455)
* Requires setting leaf stat when expanding tree (#5501)
* Purge device\_helpers.cuh (#5534)
* Use thrust functions instead of custom functions (#5544)
### Acknowledgement
**Contributors**: Nan Zhu (@CodingCat), Rory Mitchell (@RAMitchell), @ShvetsKS, Egor Smirnov (@SmirnovEgorRu), Andrew Kane (@ankane), Avinash Barnwal (@avinashbarnwal), Bart Broere (@bartbroere), Andy Adinets (@canonizer), Chen Qin (@chenqin), Daiki Katsuragawa (@daikikatsuragawa), David Díaz Vico (@daviddiazvico), Darius Kharazi (@dkharazi), Darby Payne (@dpayne), Jason E. Aten, Ph.D. (@glycerine), Philip Hyunsu Cho (@hcho3), James Lamb (@jameslamb), Jan Borchmann (@jborchma), Kamil A. Kaczmarek (@kamil-kaczmarek), Melissa Kohl (@mjkohl32), Nicolas Scozzaro (@nscozzaro), Paul Kaefer (@paulkaefer), Rong Ou (@rongou), Samrat Pandiri (@samratp), Sriram Chandramouli (@sriramch), Yuan Tang (@terrytangyuan), Jiaming Yuan (@trivialfis), Liang-Chi Hsieh (@viirya), Bobby Wang (@wbo4958), Zhang Zhang (@zhangzhang10),
**Reviewers**: Nan Zhu (@CodingCat), @LeZhengThu, Rory Mitchell (@RAMitchell), @ShvetsKS, Egor Smirnov (@SmirnovEgorRu), Steve Bronder (@SteveBronder), Nikita Titov (@StrikerRUS), Andrew Kane (@ankane), Avinash Barnwal (@avinashbarnwal), @brydag, Andy Adinets (@canonizer), Chandra Shekhar Reddy (@chandrureddy), Chen Qin (@chenqin), Codecov (@codecov-io), David Díaz Vico (@daviddiazvico), Darby Payne (@dpayne), Jason E. Aten, Ph.D. (@glycerine), Philip Hyunsu Cho (@hcho3), James Lamb (@jameslamb), @johnny-cat, Mu Li (@mli), Mate Soos (@msoos), @rnyak, Rong Ou (@rongou), Sriram Chandramouli (@sriramch), Toby Dylan Hocking (@tdhock), Yuan Tang (@terrytangyuan), Oleksandr Pryimak (@trams), Jiaming Yuan (@trivialfis), Liang-Chi Hsieh (@viirya), Bobby Wang (@wbo4958),
## v1.0.2 (2020.03.03)
This patch release applies the following patches to 1.0.0 release:
* Fix a small typo in sklearn.py that broke multiple eval metrics (#5341)
* Restore loading model from buffer (#5360)
* Use type name for data type check (#5364)
## v1.0.1 (2020.02.21)
This release is identical to the 1.0.0 release, except that it fixes a small bug that rendered 1.0.0 incompatible with Python 3.5. See #5328.
## v1.0.0 (2020.02.19) ## v1.0.0 (2020.02.19)
This release marks a major milestone for the XGBoost project. This release marks a major milestone for the XGBoost project.

View File

@@ -6,11 +6,8 @@ file(GLOB_RECURSE R_SOURCES
${CMAKE_CURRENT_LIST_DIR}/src/*.c) ${CMAKE_CURRENT_LIST_DIR}/src/*.c)
# Use object library to expose symbols # Use object library to expose symbols
add_library(xgboost-r OBJECT ${R_SOURCES}) add_library(xgboost-r OBJECT ${R_SOURCES})
if (ENABLE_ALL_WARNINGS)
target_compile_options(xgboost-r PRIVATE -Wall -Wextra) set(R_DEFINITIONS
endif (ENABLE_ALL_WARNINGS)
target_compile_definitions(xgboost-r
PUBLIC
-DXGBOOST_STRICT_R_MODE=1 -DXGBOOST_STRICT_R_MODE=1
-DXGBOOST_CUSTOMIZE_GLOBAL_PRNG=1 -DXGBOOST_CUSTOMIZE_GLOBAL_PRNG=1
-DDMLC_LOG_BEFORE_THROW=0 -DDMLC_LOG_BEFORE_THROW=0
@@ -18,27 +15,24 @@ target_compile_definitions(xgboost-r
-DDMLC_LOG_CUSTOMIZE=1 -DDMLC_LOG_CUSTOMIZE=1
-DRABIT_CUSTOMIZE_MSG_ -DRABIT_CUSTOMIZE_MSG_
-DRABIT_STRICT_CXX98_) -DRABIT_STRICT_CXX98_)
target_compile_definitions(xgboost-r
PRIVATE ${R_DEFINITIONS})
target_include_directories(xgboost-r target_include_directories(xgboost-r
PRIVATE PRIVATE
${LIBR_INCLUDE_DIRS} ${LIBR_INCLUDE_DIRS}
${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/dmlc-core/include ${PROJECT_SOURCE_DIR}/dmlc-core/include
${PROJECT_SOURCE_DIR}/rabit/include) ${PROJECT_SOURCE_DIR}/rabit/include)
target_link_libraries(xgboost-r PUBLIC ${LIBR_CORE_LIBRARY})
if (USE_OPENMP)
find_package(OpenMP REQUIRED)
target_link_libraries(xgboost-r PUBLIC OpenMP::OpenMP_CXX OpenMP::OpenMP_C)
endif (USE_OPENMP)
set_target_properties( set_target_properties(
xgboost-r PROPERTIES xgboost-r PROPERTIES
CXX_STANDARD 14 CXX_STANDARD 11
CXX_STANDARD_REQUIRED ON CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON) POSITION_INDEPENDENT_CODE ON)
# Get compilation and link flags of xgboost-r and propagate to objxgboost set(XGBOOST_DEFINITIONS "${XGBOOST_DEFINITIONS};${R_DEFINITIONS}" PARENT_SCOPE)
target_link_libraries(objxgboost PUBLIC xgboost-r) set(XGBOOST_OBJ_SOURCES $<TARGET_OBJECTS:xgboost-r> PARENT_SCOPE)
# Add all objects of xgboost-r to objxgboost set(LINKED_LIBRARIES_PRIVATE ${LINKED_LIBRARIES_PRIVATE} ${LIBR_CORE_LIBRARY} PARENT_SCOPE)
target_sources(objxgboost INTERFACE $<TARGET_OBJECTS:xgboost-r>)
set(LIBR_HOME "${LIBR_HOME}" PARENT_SCOPE) if (USE_OPENMP)
set(LIBR_EXECUTABLE "${LIBR_EXECUTABLE}" PARENT_SCOPE) target_link_libraries(xgboost-r PRIVATE OpenMP::OpenMP_CXX)
endif ()

View File

@@ -1,8 +1,8 @@
Package: xgboost Package: xgboost
Type: Package Type: Package
Title: Extreme Gradient Boosting Title: Extreme Gradient Boosting
Version: 1.4.2.1 Version: 1.1.1.1
Date: 2020-08-28 Date: 2020-02-21
Authors@R: c( Authors@R: c(
person("Tianqi", "Chen", role = c("aut"), person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"), email = "tianqi.tchen@gmail.com"),
@@ -31,9 +31,9 @@ Authors@R: c(
) )
Description: Extreme Gradient Boosting, which is an efficient implementation Description: Extreme Gradient Boosting, which is an efficient implementation
of the gradient boosting framework from Chen & Guestrin (2016) <doi:10.1145/2939672.2939785>. of the gradient boosting framework from Chen & Guestrin (2016) <doi:10.1145/2939672.2939785>.
This package is its R interface. The package includes efficient linear This package is its R interface. The package includes efficient linear
model solver and tree learning algorithms. The package can automatically model solver and tree learning algorithms. The package can automatically
do parallel computation on a single machine which could be more than 10 do parallel computation on a single machine which could be more than 10
times faster than existing gradient boosting packages. It supports times faster than existing gradient boosting packages. It supports
various objective functions, including regression, classification and ranking. various objective functions, including regression, classification and ranking.
The package is made to be extensible, so that users are also allowed to define The package is made to be extensible, so that users are also allowed to define
@@ -53,9 +53,8 @@ Suggests:
testthat, testthat,
lintr, lintr,
igraph (>= 1.0.1), igraph (>= 1.0.1),
float, jsonlite,
crayon, float
titanic
Depends: Depends:
R (>= 3.3.0) R (>= 3.3.0)
Imports: Imports:
@@ -63,6 +62,6 @@ Imports:
methods, methods,
data.table (>= 1.9.6), data.table (>= 1.9.6),
magrittr (>= 1.5), magrittr (>= 1.5),
jsonlite (>= 1.0), stringi (>= 0.5.2)
RoxygenNote: 7.1.1 RoxygenNote: 7.1.0
SystemRequirements: GNU make, C++14 SystemRequirements: GNU make, C++11

View File

@@ -36,10 +36,8 @@ export(xgb.create.features)
export(xgb.cv) export(xgb.cv)
export(xgb.dump) export(xgb.dump)
export(xgb.gblinear.history) export(xgb.gblinear.history)
export(xgb.get.config)
export(xgb.ggplot.deepness) export(xgb.ggplot.deepness)
export(xgb.ggplot.importance) export(xgb.ggplot.importance)
export(xgb.ggplot.shap.summary)
export(xgb.importance) export(xgb.importance)
export(xgb.load) export(xgb.load)
export(xgb.load.raw) export(xgb.load.raw)
@@ -48,12 +46,10 @@ export(xgb.plot.deepness)
export(xgb.plot.importance) export(xgb.plot.importance)
export(xgb.plot.multi.trees) export(xgb.plot.multi.trees)
export(xgb.plot.shap) export(xgb.plot.shap)
export(xgb.plot.shap.summary)
export(xgb.plot.tree) export(xgb.plot.tree)
export(xgb.save) export(xgb.save)
export(xgb.save.raw) export(xgb.save.raw)
export(xgb.serialize) export(xgb.serialize)
export(xgb.set.config)
export(xgb.train) export(xgb.train)
export(xgb.unserialize) export(xgb.unserialize)
export(xgboost) export(xgboost)
@@ -80,11 +76,14 @@ importFrom(graphics,lines)
importFrom(graphics,par) importFrom(graphics,par)
importFrom(graphics,points) importFrom(graphics,points)
importFrom(graphics,title) importFrom(graphics,title)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,toJSON)
importFrom(magrittr,"%>%") importFrom(magrittr,"%>%")
importFrom(stats,median) importFrom(stats,median)
importFrom(stats,predict) importFrom(stats,predict)
importFrom(stringi,stri_detect_regex)
importFrom(stringi,stri_match_first_regex)
importFrom(stringi,stri_replace_all_regex)
importFrom(stringi,stri_replace_first_regex)
importFrom(stringi,stri_split_regex)
importFrom(utils,head) importFrom(utils,head)
importFrom(utils,object.size) importFrom(utils,object.size)
importFrom(utils,str) importFrom(utils,str)

View File

@@ -62,11 +62,11 @@ cb.print.evaluation <- function(period = 1, showsd = TRUE) {
callback <- function(env = parent.frame()) { callback <- function(env = parent.frame()) {
if (length(env$bst_evaluation) == 0 || if (length(env$bst_evaluation) == 0 ||
period == 0 || period == 0 ||
NVL(env$rank, 0) != 0) NVL(env$rank, 0) != 0 )
return() return()
i <- env$iteration i <- env$iteration
if ((i - 1) %% period == 0 || if ((i-1) %% period == 0 ||
i == env$begin_iteration || i == env$begin_iteration ||
i == env$end_iteration) { i == env$end_iteration) {
stdev <- if (showsd) env$bst_evaluation_err else NULL stdev <- if (showsd) env$bst_evaluation_err else NULL
@@ -115,7 +115,7 @@ cb.evaluation.log <- function() {
stop("bst_evaluation must have non-empty names") stop("bst_evaluation must have non-empty names")
mnames <<- gsub('-', '_', names(env$bst_evaluation)) mnames <<- gsub('-', '_', names(env$bst_evaluation))
if (!is.null(env$bst_evaluation_err)) if(!is.null(env$bst_evaluation_err))
mnames <<- c(paste0(mnames, '_mean'), paste0(mnames, '_std')) mnames <<- c(paste0(mnames, '_mean'), paste0(mnames, '_std'))
} }
@@ -123,12 +123,12 @@ cb.evaluation.log <- function() {
env$evaluation_log <- as.data.table(t(simplify2array(env$evaluation_log))) env$evaluation_log <- as.data.table(t(simplify2array(env$evaluation_log)))
setnames(env$evaluation_log, c('iter', mnames)) setnames(env$evaluation_log, c('iter', mnames))
if (!is.null(env$bst_evaluation_err)) { if(!is.null(env$bst_evaluation_err)) {
# rearrange col order from _mean,_mean,...,_std,_std,... # rearrange col order from _mean,_mean,...,_std,_std,...
# to be _mean,_std,_mean,_std,... # to be _mean,_std,_mean,_std,...
len <- length(mnames) len <- length(mnames)
means <- mnames[seq_len(len / 2)] means <- mnames[seq_len(len/2)]
stds <- mnames[(len / 2 + 1):len] stds <- mnames[(len/2 + 1):len]
cnames <- numeric(len) cnames <- numeric(len)
cnames[c(TRUE, FALSE)] <- means cnames[c(TRUE, FALSE)] <- means
cnames[c(FALSE, TRUE)] <- stds cnames[c(FALSE, TRUE)] <- stds
@@ -144,7 +144,7 @@ cb.evaluation.log <- function() {
return(finalizer(env)) return(finalizer(env))
ev <- env$bst_evaluation ev <- env$bst_evaluation
if (!is.null(env$bst_evaluation_err)) if(!is.null(env$bst_evaluation_err))
ev <- c(ev, env$bst_evaluation_err) ev <- c(ev, env$bst_evaluation_err)
env$evaluation_log <- c(env$evaluation_log, env$evaluation_log <- c(env$evaluation_log,
list(c(iter = env$iteration, ev))) list(c(iter = env$iteration, ev)))
@@ -351,19 +351,13 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
finalizer <- function(env) { finalizer <- function(env) {
if (!is.null(env$bst)) { if (!is.null(env$bst)) {
attr_best_score <- as.numeric(xgb.attr(env$bst$handle, 'best_score')) attr_best_score = as.numeric(xgb.attr(env$bst$handle, 'best_score'))
if (best_score != attr_best_score) { if (best_score != attr_best_score)
# If the difference is too big, throw an error stop("Inconsistent 'best_score' values between the closure state: ", best_score,
if (abs(best_score - attr_best_score) >= 1e-14) { " and the xgb.attr: ", attr_best_score)
stop("Inconsistent 'best_score' values between the closure state: ", best_score, env$bst$best_iteration = best_iteration
" and the xgb.attr: ", attr_best_score) env$bst$best_ntreelimit = best_ntreelimit
} env$bst$best_score = best_score
# If the difference is due to floating-point truncation, update best_score
best_score <- attr_best_score
}
env$bst$best_iteration <- best_iteration
env$bst$best_ntreelimit <- best_ntreelimit
env$bst$best_score <- best_score
} else { } else {
env$basket$best_iteration <- best_iteration env$basket$best_iteration <- best_iteration
env$basket$best_ntreelimit <- best_ntreelimit env$basket$best_ntreelimit <- best_ntreelimit
@@ -378,9 +372,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
return(finalizer(env)) return(finalizer(env))
i <- env$iteration i <- env$iteration
score <- env$bst_evaluation[metric_idx] score = env$bst_evaluation[metric_idx]
if ((maximize && score > best_score) || if (( maximize && score > best_score) ||
(!maximize && score < best_score)) { (!maximize && score < best_score)) {
best_msg <<- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err) best_msg <<- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err)
@@ -506,7 +500,7 @@ cb.cv.predict <- function(save_models = FALSE) {
for (fd in env$bst_folds) { for (fd in env$bst_folds) {
pr <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit, reshape = TRUE) pr <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit, reshape = TRUE)
if (is.matrix(pred)) { if (is.matrix(pred)) {
pred[fd$index, ] <- pr pred[fd$index,] <- pr
} else { } else {
pred[fd$index] <- pr pred[fd$index] <- pr
} }
@@ -619,7 +613,9 @@ cb.gblinear.history <- function(sparse=FALSE) {
init <- function(env) { init <- function(env) {
if (!is.null(env$bst)) { # xgb.train: if (!is.null(env$bst)) { # xgb.train:
coef_path <- list()
} else if (!is.null(env$bst_folds)) { # xgb.cv: } else if (!is.null(env$bst_folds)) { # xgb.cv:
coef_path <- rep(list(), length(env$bst_folds))
} else stop("Parent frame has neither 'bst' nor 'bst_folds'") } else stop("Parent frame has neither 'bst' nor 'bst_folds'")
} }
@@ -709,11 +705,11 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
if (!is_cv) { if (!is_cv) {
# extract num_class & num_feat from the internal model # extract num_class & num_feat from the internal model
dmp <- xgb.dump(model) dmp <- xgb.dump(model)
if (length(dmp) < 2 || dmp[2] != "bias:") if(length(dmp) < 2 || dmp[2] != "bias:")
stop("It does not appear to be a gblinear model") stop("It does not appear to be a gblinear model")
dmp <- dmp[-c(1, 2)] dmp <- dmp[-c(1,2)]
n <- which(dmp == 'weight:') n <- which(dmp == 'weight:')
if (length(n) != 1) if(length(n) != 1)
stop("It does not appear to be a gblinear model") stop("It does not appear to be a gblinear model")
num_class <- n - 1 num_class <- n - 1
num_feat <- (length(dmp) - 4) / num_class num_feat <- (length(dmp) - 4) / num_class
@@ -736,9 +732,9 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
if (!is.null(class_index) && num_class > 1) { if (!is.null(class_index) && num_class > 1) {
coef_path <- if (is.list(coef_path)) { coef_path <- if (is.list(coef_path)) {
lapply(coef_path, lapply(coef_path,
function(x) x[, seq(1 + class_index, by = num_class, length.out = num_feat)]) function(x) x[, seq(1 + class_index, by=num_class, length.out=num_feat)])
} else { } else {
coef_path <- coef_path[, seq(1 + class_index, by = num_class, length.out = num_feat)] coef_path <- coef_path[, seq(1 + class_index, by=num_class, length.out=num_feat)]
} }
} }
coef_path coef_path

View File

@@ -20,12 +20,6 @@ NVL <- function(x, val) {
stop("typeof(x) == ", typeof(x), " is not supported by NVL") stop("typeof(x) == ", typeof(x), " is not supported by NVL")
} }
# Names of the objectives that are treated as classification or ranking tasks.
# Exposed as a zero-argument function; call it to obtain the character vector.
.CLASSIFICATION_OBJECTIVES <- function() {
  binary_objectives <- c('binary:logistic', 'binary:logitraw', 'binary:hinge')
  multiclass_objectives <- c('multi:softmax', 'multi:softprob')
  ranking_objectives <- c('rank:pairwise', 'rank:ndcg', 'rank:map')
  # Concatenation order matches the historical ordering of the list.
  c(binary_objectives, multiclass_objectives, ranking_objectives)
}
# #
# Low-level functions for boosting -------------------------------------------- # Low-level functions for boosting --------------------------------------------
@@ -75,23 +69,23 @@ check.booster.params <- function(params, ...) {
if (!is.null(params[['monotone_constraints']]) && if (!is.null(params[['monotone_constraints']]) &&
typeof(params[['monotone_constraints']]) != "character") { typeof(params[['monotone_constraints']]) != "character") {
vec2str <- paste(params[['monotone_constraints']], collapse = ',') vec2str = paste(params[['monotone_constraints']], collapse = ',')
vec2str <- paste0('(', vec2str, ')') vec2str = paste0('(', vec2str, ')')
params[['monotone_constraints']] <- vec2str params[['monotone_constraints']] = vec2str
} }
# interaction constraints parser (convert from list of column indices to string) # interaction constraints parser (convert from list of column indices to string)
if (!is.null(params[['interaction_constraints']]) && if (!is.null(params[['interaction_constraints']]) &&
typeof(params[['interaction_constraints']]) != "character"){ typeof(params[['interaction_constraints']]) != "character"){
# check input class # check input class
if (!identical(class(params[['interaction_constraints']]), 'list')) stop('interaction_constraints should be class list') if (!identical(class(params[['interaction_constraints']]),'list')) stop('interaction_constraints should be class list')
if (!all(unique(sapply(params[['interaction_constraints']], class)) %in% c('numeric', 'integer'))) { if (!all(unique(sapply(params[['interaction_constraints']], class)) %in% c('numeric','integer'))) {
stop('interaction_constraints should be a list of numeric/integer vectors') stop('interaction_constraints should be a list of numeric/integer vectors')
} }
# recast parameter as string # recast parameter as string
interaction_constraints <- sapply(params[['interaction_constraints']], function(x) paste0('[', paste(x, collapse = ','), ']')) interaction_constraints <- sapply(params[['interaction_constraints']], function(x) paste0('[', paste(x, collapse=','), ']'))
params[['interaction_constraints']] <- paste0('[', paste(interaction_constraints, collapse = ','), ']') params[['interaction_constraints']] <- paste0('[', paste(interaction_constraints, collapse=','), ']')
} }
return(params) return(params)
} }
@@ -151,8 +145,7 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj = NULL) {
if (is.null(obj)) { if (is.null(obj)) {
.Call(XGBoosterUpdateOneIter_R, booster_handle, as.integer(iter), dtrain) .Call(XGBoosterUpdateOneIter_R, booster_handle, as.integer(iter), dtrain)
} else { } else {
pred <- predict(booster_handle, dtrain, outputmargin = TRUE, training = TRUE, pred <- predict(booster_handle, dtrain, outputmargin = TRUE, training = TRUE)
ntreelimit = 0)
gpair <- obj(pred, dtrain) gpair <- obj(pred, dtrain)
.Call(XGBoosterBoostOneIter_R, booster_handle, dtrain, gpair$grad, gpair$hess) .Call(XGBoosterBoostOneIter_R, booster_handle, dtrain, gpair$grad, gpair$hess)
} }
@@ -173,12 +166,13 @@ xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) {
evnames <- names(watchlist) evnames <- names(watchlist)
if (is.null(feval)) { if (is.null(feval)) {
msg <- .Call(XGBoosterEvalOneIter_R, booster_handle, as.integer(iter), watchlist, as.list(evnames)) msg <- .Call(XGBoosterEvalOneIter_R, booster_handle, as.integer(iter), watchlist, as.list(evnames))
mat <- matrix(strsplit(msg, '\\s+|:')[[1]][-1], nrow = 2) msg <- stri_split_regex(msg, '(\\s+|:|\\s+)')[[1]][-1]
res <- structure(as.numeric(mat[2, ]), names = mat[1, ]) res <- as.numeric(msg[c(FALSE,TRUE)]) # even indices are the values
names(res) <- msg[c(TRUE,FALSE)] # odds are the names
} else { } else {
res <- sapply(seq_along(watchlist), function(j) { res <- sapply(seq_along(watchlist), function(j) {
w <- watchlist[[j]] w <- watchlist[[j]]
preds <- predict(booster_handle, w, outputmargin = TRUE, ntreelimit = 0) # predict using all trees preds <- predict(booster_handle, w) # predict using all trees
eval_res <- feval(preds, w) eval_res <- feval(preds, w)
out <- eval_res$value out <- eval_res$value
names(out) <- paste0(evnames[j], "-", eval_res$metric) names(out) <- paste0(evnames[j], "-", eval_res$metric)
@@ -193,23 +187,13 @@ xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) {
# Helper functions for cross validation --------------------------------------- # Helper functions for cross validation ---------------------------------------
# #
# Turn `labels` into a factor when `objective_name` is one of the objectives
# returned by .CLASSIFICATION_OBJECTIVES() (classification or ranking tasks).
# For any other objective the labels are handed back unchanged.
convert.labels <- function(labels, objective_name) {
  needs_factor <- objective_name %in% .CLASSIFICATION_OBJECTIVES()
  # Guard clause: objectives outside the list keep their labels as-is.
  if (!needs_factor) {
    return(labels)
  }
  as.factor(labels)
}
# Generates random (stratified if needed) CV folds # Generates random (stratified if needed) CV folds
generate.cv.folds <- function(nfold, nrows, stratified, label, params) { generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
# cannot do it for rank # cannot do it for rank
objective <- params$objective if (exists('objective', where = params) &&
if (is.character(objective) && strtrim(objective, 5) == 'rank:') { is.character(params$objective) &&
strtrim(params$objective, 5) == 'rank:') {
stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n", stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n",
"\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n") "\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n")
} }
@@ -222,16 +206,19 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
# - For classification, need to convert y labels to factor before making the folds, # - For classification, need to convert y labels to factor before making the folds,
# and then do stratification by factor levels. # and then do stratification by factor levels.
# - For regression, leave y numeric and do stratification by quantiles. # - For regression, leave y numeric and do stratification by quantiles.
if (is.character(objective)) { if (exists('objective', where = params) &&
y <- convert.labels(y, params$objective) is.character(params$objective)) {
# If 'objective' provided in params, assume that y is a classification label
# unless objective is reg:squarederror
if (params$objective != 'reg:squarederror')
y <- factor(y)
} else { } else {
# If no 'objective' given in params, it means that user either wants to # If no 'objective' given in params, it means that user either wants to
# use the default 'reg:squarederror' objective or has provided a custom # use the default 'reg:squarederror' objective or has provided a custom
# obj function. Here, assume classification setting when y has 5 or less # obj function. Here, assume classification setting when y has 5 or less
# unique values: # unique values:
if (length(unique(y)) <= 5) { if (length(unique(y)) <= 5)
y <- factor(y) y <- factor(y)
}
} }
folds <- xgb.createFolds(y, nfold) folds <- xgb.createFolds(y, nfold)
} else { } else {
@@ -320,68 +307,6 @@ xgb.createFolds <- function(y, k = 10)
#' @name xgboost-deprecated #' @name xgboost-deprecated
NULL NULL
#' Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of
#' models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.
#'
#' It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or
#' \code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist
#' \code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if
#' the model is to be accessed in the future. If you train a model with the current version of
#' XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be
#' accessible in later releases of XGBoost. To ensure that your model can be accessed in future
#' releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
#'
#' @details
#' Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
#' the JSON format by specifying the JSON extension. To read the model back, use
#' \code{\link{xgb.load}}.
#'
#' Use \code{\link{xgb.save.raw}} to save the XGBoost model as a sequence (vector) of raw bytes
#' in a future-proof manner. Future releases of XGBoost will be able to read the raw bytes and
#' re-construct the corresponding model. To read the model back, use \code{\link{xgb.load.raw}}.
#' The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
#' as part of another R object.
#'
#' Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the
#' model but also internal configurations and parameters, and its format is not stable across
#' multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing.
#'
#' For more details and explanation about model persistence and archival, consult the page
#' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#'
#' # Save as a stand-alone file; load it with xgb.load()
#' xgb.save(bst, 'xgb.model')
#' bst2 <- xgb.load('xgb.model')
#'
#' # Save as a stand-alone file (JSON); load it with xgb.load()
#' xgb.save(bst, 'xgb.model.json')
#' bst2 <- xgb.load('xgb.model.json')
#' if (file.exists('xgb.model.json')) file.remove('xgb.model.json')
#'
#' # Save as a raw byte vector; load it with xgb.load.raw()
#' xgb_bytes <- xgb.save.raw(bst)
#' bst2 <- xgb.load.raw(xgb_bytes)
#'
#' # Persist XGBoost model as part of another R object
#' obj <- list(xgb_model_bytes = xgb.save.raw(bst), description = "My first XGBoost model")
#' # Persist the R object. Here, saveRDS() is okay, since it doesn't persist
#' # xgb.Booster directly. What's being persisted is the future-proof byte representation
#' # as given by xgb.save.raw().
#' saveRDS(obj, 'my_object.rds')
#' # Read back the R object
#' obj2 <- readRDS('my_object.rds')
#' # Re-construct xgb.Booster object from the bytes
#' bst2 <- xgb.load.raw(obj2$xgb_model_bytes)
#' if (file.exists('my_object.rds')) file.remove('my_object.rds')
#'
#' @name a-compatibility-note-for-saveRDS-save
NULL
# Lookup table for the deprecated parameters bookkeeping # Lookup table for the deprecated parameters bookkeeping
depr_par_lut <- matrix(c( depr_par_lut <- matrix(c(
'print.every.n', 'print_every_n', 'print.every.n', 'print_every_n',
@@ -390,8 +315,8 @@ depr_par_lut <- matrix(c(
'with.stats', 'with_stats', 'with.stats', 'with_stats',
'numberOfClusters', 'n_clusters', 'numberOfClusters', 'n_clusters',
'features.keep', 'features_keep', 'features.keep', 'features_keep',
'plot.height', 'plot_height', 'plot.height','plot_height',
'plot.width', 'plot_width', 'plot.width','plot_width',
'n_first_tree', 'trees', 'n_first_tree', 'trees',
'dummy', 'DUMMY' 'dummy', 'DUMMY'
), ncol = 2, byrow = TRUE) ), ncol = 2, byrow = TRUE)
@@ -404,20 +329,20 @@ colnames(depr_par_lut) <- c('old', 'new')
check.deprecation <- function(..., env = parent.frame()) { check.deprecation <- function(..., env = parent.frame()) {
pars <- list(...) pars <- list(...)
# exact and partial matches # exact and partial matches
all_match <- pmatch(names(pars), depr_par_lut[, 1]) all_match <- pmatch(names(pars), depr_par_lut[,1])
# indices of matched pars' names # indices of matched pars' names
idx_pars <- which(!is.na(all_match)) idx_pars <- which(!is.na(all_match))
if (length(idx_pars) == 0) return() if (length(idx_pars) == 0) return()
# indices of matched LUT rows # indices of matched LUT rows
idx_lut <- all_match[idx_pars] idx_lut <- all_match[idx_pars]
# which of idx_lut were the exact matches? # which of idx_lut were the exact matches?
ex_match <- depr_par_lut[idx_lut, 1] %in% names(pars) ex_match <- depr_par_lut[idx_lut,1] %in% names(pars)
for (i in seq_along(idx_pars)) { for (i in seq_along(idx_pars)) {
pars_par <- names(pars)[idx_pars[i]] pars_par <- names(pars)[idx_pars[i]]
old_par <- depr_par_lut[idx_lut[i], 1] old_par <- depr_par_lut[idx_lut[i], 1]
new_par <- depr_par_lut[idx_lut[i], 2] new_par <- depr_par_lut[idx_lut[i], 2]
if (!ex_match[i]) { if (!ex_match[i]) {
warning("'", pars_par, "' was partially matched to '", old_par, "'") warning("'", pars_par, "' was partially matched to '", old_par,"'")
} }
.Deprecated(new_par, old = old_par, package = 'xgboost') .Deprecated(new_par, old = old_par, package = 'xgboost')
if (new_par != 'NULL') { if (new_par != 'NULL') {

View File

@@ -1,7 +1,6 @@
# Construct an internal xgboost Booster and return a handle to it. # Construct an internal xgboost Booster and return a handle to it.
# internal utility function # internal utility function
xgb.Booster.handle <- function(params = list(), cachelist = list(), xgb.Booster.handle <- function(params = list(), cachelist = list(), modelfile = NULL) {
modelfile = NULL) {
if (typeof(cachelist) != "list" || if (typeof(cachelist) != "list" ||
!all(vapply(cachelist, inherits, logical(1), what = 'xgb.DMatrix'))) { !all(vapply(cachelist, inherits, logical(1), what = 'xgb.DMatrix'))) {
stop("cachelist must be a list of xgb.DMatrix objects") stop("cachelist must be a list of xgb.DMatrix objects")
@@ -11,7 +10,6 @@ xgb.Booster.handle <- function(params = list(), cachelist = list(),
if (typeof(modelfile) == "character") { if (typeof(modelfile) == "character") {
## A filename ## A filename
handle <- .Call(XGBoosterCreate_R, cachelist) handle <- .Call(XGBoosterCreate_R, cachelist)
modelfile <- path.expand(modelfile)
.Call(XGBoosterLoadModel_R, handle, modelfile[1]) .Call(XGBoosterLoadModel_R, handle, modelfile[1])
class(handle) <- "xgb.Booster.handle" class(handle) <- "xgb.Booster.handle"
if (length(params) > 0) { if (length(params) > 0) {
@@ -64,8 +62,8 @@ is.null.handle <- function(handle) {
return(FALSE) return(FALSE)
} }
# Return a verified to be valid handle out of either xgb.Booster.handle or # Return a verified to be valid handle out of either xgb.Booster.handle or xgb.Booster
# xgb.Booster internal utility function # internal utility function
xgb.get.handle <- function(object) { xgb.get.handle <- function(object) {
if (inherits(object, "xgb.Booster")) { if (inherits(object, "xgb.Booster")) {
handle <- object$handle handle <- object$handle
@@ -112,8 +110,6 @@ xgb.get.handle <- function(object) {
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#' saveRDS(bst, "xgb.model.rds") #' saveRDS(bst, "xgb.model.rds")
#' #'
#' # Warning: The resulting RDS file is only compatible with the current XGBoost version.
#' # Refer to the section titled "a-compatibility-note-for-saveRDS-save".
#' bst1 <- readRDS("xgb.model.rds") #' bst1 <- readRDS("xgb.model.rds")
#' if (file.exists("xgb.model.rds")) file.remove("xgb.model.rds") #' if (file.exists("xgb.model.rds")) file.remove("xgb.model.rds")
#' # the handle is invalid: #' # the handle is invalid:
@@ -373,8 +369,8 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
matrix(ret, nrow = n_row, byrow = TRUE, dimnames = list(NULL, cnames)) matrix(ret, nrow = n_row, byrow = TRUE, dimnames = list(NULL, cnames))
} else { } else {
arr <- array(ret, c(n_col1, n_group, n_row), arr <- array(ret, c(n_col1, n_group, n_row),
dimnames = list(cnames, NULL, NULL)) %>% aperm(c(2, 3, 1)) # [group, row, col] dimnames = list(cnames, NULL, NULL)) %>% aperm(c(2,3,1)) # [group, row, col]
lapply(seq_len(n_group), function(g) arr[g, , ]) lapply(seq_len(n_group), function(g) arr[g,,])
} }
} else if (predinteraction) { } else if (predinteraction) {
n_col1 <- ncol(newdata) + 1 n_col1 <- ncol(newdata) + 1
@@ -383,11 +379,11 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
ret <- if (n_ret == n_row) { ret <- if (n_ret == n_row) {
matrix(ret, ncol = 1, dimnames = list(NULL, cnames)) matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
} else if (n_group == 1) { } else if (n_group == 1) {
array(ret, c(n_col1, n_col1, n_row), dimnames = list(cnames, cnames, NULL)) %>% aperm(c(3, 1, 2)) array(ret, c(n_col1, n_col1, n_row), dimnames = list(cnames, cnames, NULL)) %>% aperm(c(3,1,2))
} else { } else {
arr <- array(ret, c(n_col1, n_col1, n_group, n_row), arr <- array(ret, c(n_col1, n_col1, n_group, n_row),
dimnames = list(cnames, cnames, NULL, NULL)) %>% aperm(c(3, 4, 1, 2)) # [group, row, col1, col2] dimnames = list(cnames, cnames, NULL, NULL)) %>% aperm(c(3,4,1,2)) # [group, row, col1, col2]
lapply(seq_len(n_group), function(g) arr[g, , , ]) lapply(seq_len(n_group), function(g) arr[g,,,])
} }
} else if (reshape && npred_per_case > 1) { } else if (reshape && npred_per_case > 1) {
ret <- matrix(ret, nrow = n_row, byrow = TRUE) ret <- matrix(ret, nrow = n_row, byrow = TRUE)
@@ -660,7 +656,7 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) {
if (!is.null(x$params)) { if (!is.null(x$params)) {
cat('params (as set within xgb.train):\n') cat('params (as set within xgb.train):\n')
cat(' ', cat( ' ',
paste(names(x$params), paste(names(x$params),
paste0('"', unlist(x$params), '"'), paste0('"', unlist(x$params), '"'),
sep = ' = ', collapse = ', '), '\n', sep = '') sep = ' = ', collapse = ', '), '\n', sep = '')
@@ -673,9 +669,9 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) {
if (length(attrs) > 0) { if (length(attrs) > 0) {
cat('xgb.attributes:\n') cat('xgb.attributes:\n')
if (verbose) { if (verbose) {
cat(paste(paste0(' ', names(attrs)), cat( paste(paste0(' ',names(attrs)),
paste0('"', unlist(attrs), '"'), paste0('"', unlist(attrs), '"'),
sep = ' = ', collapse = '\n'), '\n', sep = '') sep = ' = ', collapse = '\n'), '\n', sep = '')
} else { } else {
cat(' ', paste(names(attrs), collapse = ', '), '\n', sep = '') cat(' ', paste(names(attrs), collapse = ', '), '\n', sep = '')
} }
@@ -697,7 +693,7 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) {
#cat('ntree: ', xgb.ntree(x), '\n', sep='') #cat('ntree: ', xgb.ntree(x), '\n', sep='')
for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks', for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks',
'evaluation_log', 'niter', 'feature_names'))) { 'evaluation_log','niter','feature_names'))) {
if (is.atomic(x[[n]])) { if (is.atomic(x[[n]])) {
cat(n, ':', x[[n]], '\n', sep = ' ') cat(n, ':', x[[n]], '\n', sep = ' ')
} else { } else {

View File

@@ -15,7 +15,8 @@
#' #'
#' @examples #' @examples
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data') #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data') #' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data') #' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
@@ -26,7 +27,6 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, ...)
if (length(data) > 1) if (length(data) > 1)
stop("'data' has class 'character' and length ", length(data), stop("'data' has class 'character' and length ", length(data),
".\n 'data' accepts either a numeric matrix or a single filename.") ".\n 'data' accepts either a numeric matrix or a single filename.")
data <- path.expand(data)
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent)) handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
} else if (is.matrix(data)) { } else if (is.matrix(data)) {
handle <- .Call(XGDMatrixCreateFromMat_R, data, missing) handle <- .Call(XGDMatrixCreateFromMat_R, data, missing)
@@ -65,7 +65,6 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
warning("xgboost: label will be ignored.") warning("xgboost: label will be ignored.")
} }
if (is.character(data)) { if (is.character(data)) {
data <- path.expand(data)
dtrain <- xgb.DMatrix(data[1]) dtrain <- xgb.DMatrix(data[1])
} else if (inherits(data, "xgb.DMatrix")) { } else if (inherits(data, "xgb.DMatrix")) {
dtrain <- data dtrain <- data
@@ -172,7 +171,8 @@ dimnames.xgb.DMatrix <- function(x) {
#' #'
#' @examples #' @examples
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' #'
#' labels <- getinfo(dtrain, 'label') #' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels) #' setinfo(dtrain, 'label', 1-labels)
@@ -224,7 +224,8 @@ getinfo.xgb.DMatrix <- function(object, name, ...) {
#' #'
#' @examples #' @examples
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' #'
#' labels <- getinfo(dtrain, 'label') #' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels) #' setinfo(dtrain, 'label', 1-labels)
@@ -256,6 +257,8 @@ setinfo.xgb.DMatrix <- function(object, name, info, ...) {
return(TRUE) return(TRUE)
} }
if (name == "weight") { if (name == "weight") {
if (length(info) != nrow(object))
stop("The length of weights must equal to the number of rows in the input data")
.Call(XGDMatrixSetInfo_R, object, name, as.numeric(info)) .Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
return(TRUE) return(TRUE)
} }
@@ -289,7 +292,8 @@ setinfo.xgb.DMatrix <- function(object, name, info, ...) {
#' #'
#' @examples #' @examples
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' #'
#' dsub <- slice(dtrain, 1:42) #' dsub <- slice(dtrain, 1:42)
#' labels1 <- getinfo(dsub, 'label') #' labels1 <- getinfo(dsub, 'label')
@@ -318,7 +322,7 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {
for (i in seq_along(ind)) { for (i in seq_along(ind)) {
obj_attr <- attr(object, nms[i]) obj_attr <- attr(object, nms[i])
if (NCOL(obj_attr) > 1) { if (NCOL(obj_attr) > 1) {
attr(ret, nms[i]) <- obj_attr[idxset, ] attr(ret, nms[i]) <- obj_attr[idxset,]
} else { } else {
attr(ret, nms[i]) <- obj_attr[idxset] attr(ret, nms[i]) <- obj_attr[idxset]
} }
@@ -345,7 +349,8 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {
#' #'
#' @examples #' @examples
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' #'
#' dtrain #' dtrain
#' print(dtrain, verbose=TRUE) #' print(dtrain, verbose=TRUE)
@@ -354,10 +359,10 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {
#' @export #' @export
print.xgb.DMatrix <- function(x, verbose = FALSE, ...) { print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ') cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ')
infos <- character(0) infos <- c()
if (length(getinfo(x, 'label')) > 0) infos <- 'label' if(length(getinfo(x, 'label')) > 0) infos <- 'label'
if (length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight') if(length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight')
if (length(getinfo(x, 'base_margin')) > 0) infos <- c(infos, 'base_margin') if(length(getinfo(x, 'base_margin')) > 0) infos <- c(infos, 'base_margin')
if (length(infos) == 0) infos <- 'NA' if (length(infos) == 0) infos <- 'NA'
cat(infos) cat(infos)
cnames <- colnames(x) cnames <- colnames(x)

View File

@@ -1,13 +1,14 @@
#' Save xgb.DMatrix object to binary file #' Save xgb.DMatrix object to binary file
#' #'
#' Save xgb.DMatrix object to binary file #' Save xgb.DMatrix object to binary file
#' #'
#' @param dmatrix the \code{xgb.DMatrix} object #' @param dmatrix the \code{xgb.DMatrix} object
#' @param fname the name of the file to write. #' @param fname the name of the file to write.
#' #'
#' @examples #' @examples
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data') #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data') #' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data') #' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
@@ -17,8 +18,7 @@ xgb.DMatrix.save <- function(dmatrix, fname) {
stop("fname must be character") stop("fname must be character")
if (!inherits(dmatrix, "xgb.DMatrix")) if (!inherits(dmatrix, "xgb.DMatrix"))
stop("dmatrix must be xgb.DMatrix") stop("dmatrix must be xgb.DMatrix")
fname <- path.expand(fname)
.Call(XGDMatrixSaveBinary_R, dmatrix, fname[1], 0L) .Call(XGDMatrixSaveBinary_R, dmatrix, fname[1], 0L)
return(TRUE) return(TRUE)
} }

View File

@@ -1,38 +0,0 @@
#' Global configuration consists of a collection of parameters that can be applied in the global
#' scope. See \url{https://xgboost.readthedocs.io/en/stable/parameter.html} for the full list of
#' parameters supported in the global configuration. Use \code{xgb.set.config} to update the
#' values of one or more global-scope parameters. Use \code{xgb.get.config} to fetch the current
#' values of all global-scope parameters (listed in
#' \url{https://xgboost.readthedocs.io/en/stable/parameter.html}).
#'
#' @rdname xgbConfig
#' @title Set and get global configuration
#' @name xgb.set.config, xgb.get.config
#' @export xgb.set.config xgb.get.config
#' @param ... List of parameters to be set, as keyword arguments
#' @return
#' \code{xgb.set.config} returns \code{TRUE} to signal success. \code{xgb.get.config} returns
#' a list containing all global-scope parameters and their values.
#'
#' @examples
#' # Set verbosity level to silent (0)
#' xgb.set.config(verbosity = 0)
#' # Now global verbosity level is 0
#' config <- xgb.get.config()
#' print(config$verbosity)
#' # Set verbosity level to warning (1)
#' xgb.set.config(verbosity = 1)
#' # Now global verbosity level is 1
#' config <- xgb.get.config()
#' print(config$verbosity)
xgb.set.config <- function(...) {
  # Collect the keyword arguments and encode them as a JSON string.
  # `auto_unbox = TRUE` is passed so that length-one values are written as
  # JSON scalars instead of one-element arrays (per the jsonlite docs).
  config_json <- jsonlite::toJSON(list(...), auto_unbox = TRUE)
  # Hand the encoded configuration to the native routine.
  .Call(XGBSetGlobalConfig_R, config_json)
  # Always report success; failures surface as errors from the calls above.
  TRUE
}
#' @rdname xgbConfig
xgb.get.config <- function() {
  # The native routine returns the configuration as a JSON string;
  # decode it into an R object before handing it to the caller.
  jsonlite::fromJSON(.Call(XGBGetGlobalConfig_R))
}

View File

@@ -1,87 +1,87 @@
#' Create new features from a previously learned model #' Create new features from a previously learned model
#' #'
#' May improve the learning by adding new features to the training data based on the decision trees from a previously learned model. #' May improve the learning by adding new features to the training data based on the decision trees from a previously learned model.
#' #'
#' @param model decision tree boosting model learned on the original data #' @param model decision tree boosting model learned on the original data
#' @param data original data (usually provided as a \code{dgCMatrix} matrix) #' @param data original data (usually provided as a \code{dgCMatrix} matrix)
#' @param ... currently not used #' @param ... currently not used
#' #'
#' @return \code{dgCMatrix} matrix including both the original data and the new features. #' @return \code{dgCMatrix} matrix including both the original data and the new features.
#' #'
#' @details #' @details
#' This is the function inspired from the paragraph 3.1 of the paper: #' This is the function inspired from the paragraph 3.1 of the paper:
#' #'
#' \strong{Practical Lessons from Predicting Clicks on Ads at Facebook} #' \strong{Practical Lessons from Predicting Clicks on Ads at Facebook}
#' #'
#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers, #' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
#' Joaquin Quinonero Candela)} #' Joaquin Quinonero Candela)}
#' #'
#' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014 #' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
#' #'
#' \url{https://research.fb.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}. #' \url{https://research.fb.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
#' #'
#' Extract explaining the method: #' Extract explaining the method:
#' #'
#' "We found that boosted decision trees are a powerful and very #' "We found that boosted decision trees are a powerful and very
#' convenient way to implement non-linear and tuple transformations #' convenient way to implement non-linear and tuple transformations
#' of the kind we just described. We treat each individual #' of the kind we just described. We treat each individual
#' tree as a categorical feature that takes as value the #' tree as a categorical feature that takes as value the
#' index of the leaf an instance ends up falling in. We use #' index of the leaf an instance ends up falling in. We use
#' 1-of-K coding of this type of features. #' 1-of-K coding of this type of features.
#' #'
#' For example, consider the boosted tree model in Figure 1 with 2 subtrees, #' For example, consider the boosted tree model in Figure 1 with 2 subtrees,
#' where the first subtree has 3 leafs and the second 2 leafs. If an #' where the first subtree has 3 leafs and the second 2 leafs. If an
#' instance ends up in leaf 2 in the first subtree and leaf 1 in #' instance ends up in leaf 2 in the first subtree and leaf 1 in
#' second subtree, the overall input to the linear classifier will #' second subtree, the overall input to the linear classifier will
#' be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries #' be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
#' correspond to the leaves of the first subtree and last 2 to #' correspond to the leaves of the first subtree and last 2 to
#' those of the second subtree. #' those of the second subtree.
#' #'
#' [...] #' [...]
#' #'
#' We can understand boosted decision tree #' We can understand boosted decision tree
#' based transformation as a supervised feature encoding that #' based transformation as a supervised feature encoding that
#' converts a real-valued vector into a compact binary-valued #' converts a real-valued vector into a compact binary-valued
#' vector. A traversal from root node to a leaf node represents #' vector. A traversal from root node to a leaf node represents
#' a rule on certain features." #' a rule on certain features."
#' #'
#' @examples #' @examples
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost') #' data(agaricus.test, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
#' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label)) #' dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
#' #'
#' param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic') #' param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
#' nrounds = 4 #' nrounds = 4
#' #'
#' bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2) #' bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)
#' #'
#' # Model accuracy without new features #' # Model accuracy without new features
#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / #' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) /
#' length(agaricus.test$label) #' length(agaricus.test$label)
#' #'
#' # Convert previous features to one hot encoding #' # Convert previous features to one hot encoding
#' new.features.train <- xgb.create.features(model = bst, agaricus.train$data) #' new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
#' new.features.test <- xgb.create.features(model = bst, agaricus.test$data) #' new.features.test <- xgb.create.features(model = bst, agaricus.test$data)
#' #'
#' # learning with new features #' # learning with new features
#' new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label) #' new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
#' new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label) #' new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
#' watchlist <- list(train = new.dtrain) #' watchlist <- list(train = new.dtrain)
#' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2) #' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2)
#' #'
#' # Model accuracy with new features #' # Model accuracy with new features
#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / #' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) /
#' length(agaricus.test$label) #' length(agaricus.test$label)
#' #'
#' # Here the accuracy was already good and is now perfect. #' # Here the accuracy was already good and is now perfect.
#' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", #' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now",
#' accuracy.after, "!\n")) #' accuracy.after, "!\n"))
#' #'
#' @export #' @export
xgb.create.features <- function(model, data, ...){ xgb.create.features <- function(model, data, ...){
check.deprecation(...) check.deprecation(...)
pred_with_leaf <- predict(model, data, predleaf = TRUE) pred_with_leaf <- predict(model, data, predleaf = TRUE)
cols <- lapply(as.data.frame(pred_with_leaf), factor) cols <- lapply(as.data.frame(pred_with_leaf), factor)
cbind(data, sparse.model.matrix(~ . -1, cols)) # nolint cbind(data, sparse.model.matrix( ~ . -1, cols))
} }

View File

@@ -2,15 +2,12 @@
#' #'
#' The cross validation function of xgboost #' The cross validation function of xgboost
#' #'
#' @param params the list of parameters. The complete list of parameters is #' @param params the list of parameters. Commonly used ones are:
#' available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
#' is a shorter summary:
#' \itemize{ #' \itemize{
#' \item \code{objective} objective function, common ones are #' \item \code{objective} objective function, common ones are
#' \itemize{ #' \itemize{
#' \item \code{reg:squarederror} Regression with squared loss. #' \item \code{reg:squarederror} Regression with squared loss
#' \item \code{binary:logistic} logistic regression for classification. #' \item \code{binary:logistic} logistic regression for classification
#' \item See \code{\link[=xgb.train]{xgb.train}()} for complete list of objectives.
#' } #' }
#' \item \code{eta} step size of each boosting step #' \item \code{eta} step size of each boosting step
#' \item \code{max_depth} maximum depth of the tree #' \item \code{max_depth} maximum depth of the tree
@@ -36,8 +33,6 @@
#' \item \code{error} binary classification error rate #' \item \code{error} binary classification error rate
#' \item \code{rmse} Rooted mean square error #' \item \code{rmse} Rooted mean square error
#' \item \code{logloss} negative log-likelihood function #' \item \code{logloss} negative log-likelihood function
#' \item \code{mae} Mean absolute error
#' \item \code{mape} Mean absolute percentage error
#' \item \code{auc} Area under curve #' \item \code{auc} Area under curve
#' \item \code{aucpr} Area under PR curve #' \item \code{aucpr} Area under PR curve
#' \item \code{merror} Exact matching error, used to evaluate multi-class classification #' \item \code{merror} Exact matching error, used to evaluate multi-class classification
@@ -81,7 +76,7 @@
#' #'
#' All observations are used for both training and validation. #' All observations are used for both training and validation.
#' #'
#' Adapted from \url{https://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29} #' Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
#' #'
#' @return #' @return
#' An object of class \code{xgb.cv.synchronous} with the following elements: #' An object of class \code{xgb.cv.synchronous} with the following elements:
@@ -112,7 +107,7 @@
#' #'
#' @examples #' @examples
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
#' cv <- xgb.cv(data = dtrain, nrounds = 3, nthread = 2, nfold = 5, metrics = list("rmse","auc"), #' cv <- xgb.cv(data = dtrain, nrounds = 3, nthread = 2, nfold = 5, metrics = list("rmse","auc"),
#' max_depth = 3, eta = 1, objective = "binary:logistic") #' max_depth = 3, eta = 1, objective = "binary:logistic")
#' print(cv) #' print(cv)
@@ -139,20 +134,20 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
# stop("Either 'eval_metric' or 'feval' must be provided for CV") # stop("Either 'eval_metric' or 'feval' must be provided for CV")
# Check the labels # Check the labels
if ((inherits(data, 'xgb.DMatrix') && is.null(getinfo(data, 'label'))) || if ( (inherits(data, 'xgb.DMatrix') && is.null(getinfo(data, 'label'))) ||
(!inherits(data, 'xgb.DMatrix') && is.null(label))) { (!inherits(data, 'xgb.DMatrix') && is.null(label))) {
stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix") stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
} else if (inherits(data, 'xgb.DMatrix')) { } else if (inherits(data, 'xgb.DMatrix')) {
if (!is.null(label)) if (!is.null(label))
warning("xgb.cv: label will be ignored, since data is of type xgb.DMatrix") warning("xgb.cv: label will be ignored, since data is of type xgb.DMatrix")
cv_label <- getinfo(data, 'label') cv_label = getinfo(data, 'label')
} else { } else {
cv_label <- label cv_label = label
} }
# CV folds # CV folds
if (!is.null(folds)) { if(!is.null(folds)) {
if (!is.list(folds) || length(folds) < 2) if(!is.list(folds) || length(folds) < 2)
stop("'folds' must be a list with 2 or more elements that are vectors of indices for each CV-fold") stop("'folds' must be a list with 2 or more elements that are vectors of indices for each CV-fold")
nfold <- length(folds) nfold <- length(folds)
} else { } else {
@@ -167,7 +162,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
# verbosity & evaluation printing callback: # verbosity & evaluation printing callback:
params <- c(params, list(silent = 1)) params <- c(params, list(silent = 1))
print_every_n <- max(as.integer(print_every_n), 1L) print_every_n <- max( as.integer(print_every_n), 1L)
if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) { if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) {
callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n, showsd = showsd)) callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n, showsd = showsd))
} }
@@ -198,20 +193,20 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
bst_folds <- lapply(seq_along(folds), function(k) { bst_folds <- lapply(seq_along(folds), function(k) {
dtest <- slice(dall, folds[[k]]) dtest <- slice(dall, folds[[k]])
# code originally contributed by @RolandASc on stackoverflow # code originally contributed by @RolandASc on stackoverflow
if (is.null(train_folds)) if(is.null(train_folds))
dtrain <- slice(dall, unlist(folds[-k])) dtrain <- slice(dall, unlist(folds[-k]))
else else
dtrain <- slice(dall, train_folds[[k]]) dtrain <- slice(dall, train_folds[[k]])
handle <- xgb.Booster.handle(params, list(dtrain, dtest)) handle <- xgb.Booster.handle(params, list(dtrain, dtest))
list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test = dtest), index = folds[[k]]) list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test=dtest), index = folds[[k]])
}) })
rm(dall) rm(dall)
# a "basket" to collect some results from callbacks # a "basket" to collect some results from callbacks
basket <- list() basket <- list()
# extract parameters that can affect the relationship b/w #trees and #iterations # extract parameters that can affect the relationship b/w #trees and #iterations
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) # nolint num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)
# those are fixed for CV (no training continuation) # those are fixed for CV (no training continuation)
begin_iteration <- 1 begin_iteration <- 1
@@ -228,7 +223,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
}) })
msg <- simplify2array(msg) msg <- simplify2array(msg)
bst_evaluation <- rowMeans(msg) bst_evaluation <- rowMeans(msg)
bst_evaluation_err <- sqrt(rowMeans(msg^2) - bst_evaluation^2) # nolint bst_evaluation_err <- sqrt(rowMeans(msg^2) - bst_evaluation^2)
for (f in cb$post_iter) f() for (f in cb$post_iter) f()
@@ -287,10 +282,10 @@ print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) {
} }
if (!is.null(x$params)) { if (!is.null(x$params)) {
cat('params (as set within xgb.cv):\n') cat('params (as set within xgb.cv):\n')
cat(' ', cat( ' ',
paste(names(x$params), paste(names(x$params),
paste0('"', unlist(x$params), '"'), paste0('"', unlist(x$params), '"'),
sep = ' = ', collapse = ', '), '\n', sep = '') sep = ' = ', collapse = ', '), '\n', sep = '')
} }
if (!is.null(x$callbacks) && length(x$callbacks) > 0) { if (!is.null(x$callbacks) && length(x$callbacks) > 0) {
cat('callbacks:\n') cat('callbacks:\n')

View File

@@ -1,15 +1,15 @@
#' Dump an xgboost model in text format. #' Dump an xgboost model in text format.
#' #'
#' Dump an xgboost model in text format. #' Dump an xgboost model in text format.
#' #'
#' @param model the model object. #' @param model the model object.
#' @param fname the name of the text file where to save the model text dump. #' @param fname the name of the text file where to save the model text dump.
#' If not provided or set to \code{NULL}, the model is returned as a \code{character} vector. #' If not provided or set to \code{NULL}, the model is returned as a \code{character} vector.
#' @param fmap feature map file representing feature types. #' @param fmap feature map file representing feature types.
#' Detailed description could be found at #' Detailed description could be found at
#' \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}. #' \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
#' See demo/ for walkthrough example in R, and #' See demo/ for walkthrough example in R, and
#' \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} #' \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
#' for example Format. #' for example Format.
#' @param with_stats whether to dump some additional statistics about the splits. #' @param with_stats whether to dump some additional statistics about the splits.
#' When this option is on, the model dump contains two additional values: #' When this option is on, the model dump contains two additional values:
@@ -27,18 +27,18 @@
#' data(agaricus.test, package='xgboost') #' data(agaricus.test, package='xgboost')
#' train <- agaricus.train #' train <- agaricus.train
#' test <- agaricus.test #' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, #' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#' # save the model in file 'xgb.model.dump' #' # save the model in file 'xgb.model.dump'
#' dump_path = file.path(tempdir(), 'model.dump') #' dump_path = file.path(tempdir(), 'model.dump')
#' xgb.dump(bst, dump_path, with_stats = TRUE) #' xgb.dump(bst, dump_path, with_stats = TRUE)
#' #'
#' # print the model without saving it to a file #' # print the model without saving it to a file
#' print(xgb.dump(bst, with_stats = TRUE)) #' print(xgb.dump(bst, with_stats = TRUE))
#' #'
#' # print in JSON format: #' # print in JSON format:
#' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json')) #' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
#' #'
#' @export #' @export
xgb.dump <- function(model, fname = NULL, fmap = "", with_stats=FALSE, xgb.dump <- function(model, fname = NULL, fmap = "", with_stats=FALSE,
dump_format = c("text", "json"), ...) { dump_format = c("text", "json"), ...) {
@@ -50,23 +50,22 @@ xgb.dump <- function(model, fname = NULL, fmap = "", with_stats=FALSE,
stop("fname: argument must be a character string (when provided)") stop("fname: argument must be a character string (when provided)")
if (!(is.null(fmap) || is.character(fmap))) if (!(is.null(fmap) || is.character(fmap)))
stop("fmap: argument must be a character string (when provided)") stop("fmap: argument must be a character string (when provided)")
model <- xgb.Booster.complete(model) model <- xgb.Booster.complete(model)
model_dump <- .Call(XGBoosterDumpModel_R, model$handle, NVL(fmap, "")[1], as.integer(with_stats), model_dump <- .Call(XGBoosterDumpModel_R, model$handle, NVL(fmap, "")[1], as.integer(with_stats),
as.character(dump_format)) as.character(dump_format))
if (is.null(fname)) if (is.null(fname))
model_dump <- gsub('\t', '', model_dump, fixed = TRUE) model_dump <- stri_replace_all_regex(model_dump, '\t', '')
if (dump_format == "text") if (dump_format == "text")
model_dump <- unlist(strsplit(model_dump, '\n', fixed = TRUE)) model_dump <- unlist(stri_split_regex(model_dump, '\n'))
model_dump <- grep('^\\s*$', model_dump, invert = TRUE, value = TRUE) model_dump <- grep('^\\s*$', model_dump, invert = TRUE, value = TRUE)
if (is.null(fname)) { if (is.null(fname)) {
return(model_dump) return(model_dump)
} else { } else {
fname <- path.expand(fname)
writeLines(model_dump, fname[1]) writeLines(model_dump, fname[1])
return(TRUE) return(TRUE)
} }

View File

@@ -3,9 +3,9 @@
#' @rdname xgb.plot.importance #' @rdname xgb.plot.importance
#' @export #' @export
xgb.ggplot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL, xgb.ggplot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL,
rel_to_first = FALSE, n_clusters = c(1:10), ...) { rel_to_first = FALSE, n_clusters = c(1:10), ...) {
importance_matrix <- xgb.plot.importance(importance_matrix, top_n = top_n, measure = measure, importance_matrix <- xgb.plot.importance(importance_matrix, top_n = top_n, measure = measure,
rel_to_first = rel_to_first, plot = FALSE, ...) rel_to_first = rel_to_first, plot = FALSE, ...)
if (!requireNamespace("ggplot2", quietly = TRUE)) { if (!requireNamespace("ggplot2", quietly = TRUE)) {
@@ -14,21 +14,21 @@ xgb.ggplot.importance <- function(importance_matrix = NULL, top_n = NULL, measur
if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) { if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
stop("Ckmeans.1d.dp package is required", call. = FALSE) stop("Ckmeans.1d.dp package is required", call. = FALSE)
} }
clusters <- suppressWarnings( clusters <- suppressWarnings(
Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix$Importance, n_clusters) Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix$Importance, n_clusters)
) )
importance_matrix[, Cluster := as.character(clusters$cluster)] importance_matrix[, Cluster := as.character(clusters$cluster)]
plot <- plot <-
ggplot2::ggplot(importance_matrix, ggplot2::ggplot(importance_matrix,
ggplot2::aes(x = factor(Feature, levels = rev(Feature)), y = Importance, width = 0.5), ggplot2::aes(x = factor(Feature, levels = rev(Feature)), y = Importance, width = 0.5),
environment = environment()) + environment = environment()) +
ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position = "identity") + ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position = "identity") +
ggplot2::coord_flip() + ggplot2::coord_flip() +
ggplot2::xlab("Features") + ggplot2::xlab("Features") +
ggplot2::ggtitle("Feature importance") + ggplot2::ggtitle("Feature importance") +
ggplot2::theme(plot.title = ggplot2::element_text(lineheight = .9, face = "bold"), ggplot2::theme(plot.title = ggplot2::element_text(lineheight = .9, face = "bold"),
panel.grid.major.y = ggplot2::element_blank()) panel.grid.major.y = ggplot2::element_blank())
return(plot) return(plot)
} }
@@ -42,7 +42,7 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med
stop("ggplot2 package is required for plotting the graph deepness.", call. = FALSE) stop("ggplot2 package is required for plotting the graph deepness.", call. = FALSE)
which <- match.arg(which) which <- match.arg(which)
dt_depths <- xgb.plot.deepness(model = model, plot = FALSE) dt_depths <- xgb.plot.deepness(model = model, plot = FALSE)
dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth] dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth]
setkey(dt_summaries, 'Depth') setkey(dt_summaries, 'Depth')
@@ -60,30 +60,30 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med
axis.ticks = ggplot2::element_blank(), axis.ticks = ggplot2::element_blank(),
axis.text.x = ggplot2::element_blank() axis.text.x = ggplot2::element_blank()
) )
p2 <- p2 <-
ggplot2::ggplot(dt_summaries) + ggplot2::ggplot(dt_summaries) +
ggplot2::geom_bar(ggplot2::aes(x = Depth, y = Cover), stat = "Identity") + ggplot2::geom_bar(ggplot2::aes(x = Depth, y = Cover), stat = "Identity") +
ggplot2::xlab("Leaf depth") + ggplot2::xlab("Leaf depth") +
ggplot2::ylab("Weighted cover") ggplot2::ylab("Weighted cover")
multiplot(p1, p2, cols = 1) multiplot(p1, p2, cols = 1)
return(invisible(list(p1, p2))) return(invisible(list(p1, p2)))
} else if (which == "max.depth") { } else if (which == "max.depth") {
p <- p <-
ggplot2::ggplot(dt_depths[, max(Depth), Tree]) + ggplot2::ggplot(dt_depths[, max(Depth), Tree]) +
ggplot2::geom_jitter(ggplot2::aes(x = Tree, y = V1), ggplot2::geom_jitter(ggplot2::aes(x = Tree, y = V1),
height = 0.15, alpha = 0.4, size = 3, stroke = 0) + height = 0.15, alpha=0.4, size=3, stroke=0) +
ggplot2::xlab("tree #") + ggplot2::xlab("tree #") +
ggplot2::ylab("Max tree leaf depth") ggplot2::ylab("Max tree leaf depth")
return(p) return(p)
} else if (which == "med.depth") { } else if (which == "med.depth") {
p <- p <-
ggplot2::ggplot(dt_depths[, median(as.numeric(Depth)), Tree]) + ggplot2::ggplot(dt_depths[, median(as.numeric(Depth)), Tree]) +
ggplot2::geom_jitter(ggplot2::aes(x = Tree, y = V1), ggplot2::geom_jitter(ggplot2::aes(x = Tree, y = V1),
height = 0.15, alpha = 0.4, size = 3, stroke = 0) + height = 0.15, alpha=0.4, size=3, stroke=0) +
ggplot2::xlab("tree #") + ggplot2::xlab("tree #") +
ggplot2::ylab("Median tree leaf depth") ggplot2::ylab("Median tree leaf depth")
return(p) return(p)
@@ -92,102 +92,24 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med
p <- p <-
ggplot2::ggplot(dt_depths[, median(abs(Weight)), Tree]) + ggplot2::ggplot(dt_depths[, median(abs(Weight)), Tree]) +
ggplot2::geom_point(ggplot2::aes(x = Tree, y = V1), ggplot2::geom_point(ggplot2::aes(x = Tree, y = V1),
alpha = 0.4, size = 3, stroke = 0) + alpha=0.4, size=3, stroke=0) +
ggplot2::xlab("tree #") + ggplot2::xlab("tree #") +
ggplot2::ylab("Median absolute leaf weight") ggplot2::ylab("Median absolute leaf weight")
return(p) return(p)
} }
} }
#' @rdname xgb.plot.shap.summary
#' @export
xgb.ggplot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL,
                                    trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) {
  # ggplot2 is only reached through namespaced calls, so check for it up front
  # and fail with a clear message before any SHAP data is prepared (the other
  # ggplot helpers in this file perform the same check).
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("ggplot2 package is required", call. = FALSE)
  }

  # Gather the feature values and SHAP contributions to plot.
  data_list <- xgb.shap.data(
    data = data,
    shap_contrib = shap_contrib,
    features = features,
    top_n = top_n,
    model = model,
    trees = trees,
    target_class = target_class,
    approxcontrib = approxcontrib,
    subsample = subsample,
    max_observations = 10000  # 10,000 samples per feature.
  )

  # Long-format table with one row per (observation, feature); feature values
  # are normalised so that all features can share a single colour scale.
  p_data <- prepare.ggplot.shap.data(data_list, normalize = TRUE)
  # Reverse factor levels so that the first level is at the top of the plot
  p_data[, "feature" := factor(feature, rev(levels(feature)))]

  # One jittered point per observation and feature, coloured by the normalised
  # feature value (colour scale limited to [-3, 3]), with a reference line at a
  # SHAP value of zero; the axes are flipped so features are listed vertically.
  p <- ggplot2::ggplot(p_data, ggplot2::aes(x = feature, y = p_data$shap_value, colour = p_data$feature_value)) +
    ggplot2::geom_jitter(alpha = 0.5, width = 0.1) +
    ggplot2::scale_colour_viridis_c(limits = c(-3, 3), option = "plasma", direction = -1) +
    ggplot2::geom_abline(slope = 0, intercept = 0, colour = "darkgrey") +
    ggplot2::coord_flip()

  p
}
#' Combine and melt feature values and SHAP contributions for sample
#' observations.
#'
#' Conforms to data format required for ggplot functions.
#'
#' Internal utility function.
#'
#' @param data_list List containing 'data' and 'shap_contrib' returned by
#' \code{xgb.shap.data()}.
#' @param normalize Whether to standardize feature values to have mean 0 and
#' standard deviation 1 (useful for comparing multiple features on the same
#' plot). Default \code{FALSE}.
#'
#' @return A data.table containing the observation ID, the feature name, the
#' feature value (normalized if specified), and the SHAP contribution value.
prepare.ggplot.shap.data <- function(data_list, normalize = FALSE) {
  # Feature values: coerce to a data.table, optionally rescale every column,
  # then tag each row with an observation id and melt to long format.
  feature_dt <- data.table::as.data.table(as.matrix(data_list[["data"]]))
  if (normalize) {
    # The logical argument `normalize` shadows the helper function of the same
    # name. lapply() resolves its FUN argument through match.fun(), which falls
    # back to looking the symbol up in function mode, so the helper is applied.
    feature_dt[, (names(feature_dt)) := lapply(.SD, normalize)]
  }
  feature_dt[, "id" := seq_len(nrow(feature_dt))]
  feature_long <- data.table::melt.data.table(
    feature_dt,
    id.vars = "id", variable.name = "feature", value.name = "feature_value"
  )

  # SHAP contributions: same id tagging and melt, without any rescaling.
  contrib_dt <- data.table::as.data.table(as.matrix(data_list[["shap_contrib"]]))
  contrib_dt[, "id" := seq_len(nrow(contrib_dt))]
  contrib_long <- data.table::melt.data.table(
    contrib_dt,
    id.vars = "id", variable.name = "feature", value.name = "shap_value"
  )

  # Pair every feature value with its SHAP contribution.
  data.table::merge.data.table(feature_long, contrib_long, by = c("id", "feature"))
}
#' Scale feature value to have mean 0, standard deviation 1
#'
#' This is used to compare multiple features on the same plot.
#' Internal utility function
#'
#' @param x Numeric vector
#'
#' @return Numeric vector with mean 0 and sd 1. If the standard deviation of
#' \code{x} is zero or cannot be computed (constant input, or fewer than two
#' non-missing values), the values are only centred, so that they come out as
#' 0 instead of \code{NaN}/\code{NA}. Missing values in \code{x} stay missing.
normalize <- function(x) {
  loc <- mean(x, na.rm = TRUE)
  scale <- stats::sd(x, na.rm = TRUE)
  # sd() is 0 for a constant vector and NA when fewer than two non-missing
  # values are available; dividing by either would turn every entry into
  # NaN/NA, so fall back to centring only.
  if (is.na(scale) || scale == 0) {
    return(x - loc)
  }
  (x - loc) / scale
}
# Plot multiple ggplot graph aligned by rows and columns. # Plot multiple ggplot graph aligned by rows and columns.
# ... the plots # ... the plots
# cols number of columns # cols number of columns
# internal utility function # internal utility function
multiplot <- function(..., cols = 1) { multiplot <- function(..., cols = 1) {
plots <- list(...) plots <- list(...)
num_plots <- length(plots) num_plots = length(plots)
layout <- matrix(seq(1, cols * ceiling(num_plots / cols)), layout <- matrix(seq(1, cols * ceiling(num_plots / cols)),
ncol = cols, nrow = ceiling(num_plots / cols)) ncol = cols, nrow = ceiling(num_plots / cols))
if (num_plots == 1) { if (num_plots == 1) {
print(plots[[1]]) print(plots[[1]])
} else { } else {
@@ -196,7 +118,7 @@ multiplot <- function(..., cols = 1) {
for (i in 1:num_plots) { for (i in 1:num_plots) {
# Get the i,j matrix positions of the regions that contain this subplot # Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.table(which(layout == i, arr.ind = TRUE)) matchidx <- as.data.table(which(layout == i, arr.ind = TRUE))
print( print(
plots[[i]], vp = grid::viewport( plots[[i]], vp = grid::viewport(
layout.pos.row = matchidx$row, layout.pos.row = matchidx$row,
@@ -209,5 +131,5 @@ multiplot <- function(..., cols = 1) {
globalVariables(c( globalVariables(c(
"Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme",
"element_blank", "element_text", "V1", "Weight", "feature" "element_blank", "element_text", "V1", "Weight"
)) ))

View File

@@ -1,66 +1,66 @@
#' Importance of features in a model. #' Importance of features in a model.
#' #'
#' Creates a \code{data.table} of feature importances in a model. #' Creates a \code{data.table} of feature importances in a model.
#' #'
#' @param feature_names character vector of feature names. If the model already #' @param feature_names character vector of feature names. If the model already
#' contains feature names, those would be used when \code{feature_names=NULL} (default value). #' contains feature names, those would be used when \code{feature_names=NULL} (default value).
#' Non-null \code{feature_names} could be provided to override those in the model. #' Non-null \code{feature_names} could be provided to override those in the model.
#' @param model object of class \code{xgb.Booster}. #' @param model object of class \code{xgb.Booster}.
#' @param trees (only for the gbtree booster) an integer vector of tree indices that should be included #' @param trees (only for the gbtree booster) an integer vector of tree indices that should be included
#' into the importance calculation. If set to \code{NULL}, all trees of the model are parsed. #' into the importance calculation. If set to \code{NULL}, all trees of the model are parsed.
#' It could be useful, e.g., in multiclass classification to get feature importances #' It could be useful, e.g., in multiclass classification to get feature importances
#' for each class separately. IMPORTANT: the tree index in xgboost models #' for each class separately. IMPORTANT: the tree index in xgboost models
#' is zero-based (e.g., use \code{trees = 0:4} for first 5 trees). #' is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).
#' @param data deprecated. #' @param data deprecated.
#' @param label deprecated. #' @param label deprecated.
#' @param target deprecated. #' @param target deprecated.
#' #'
#' @details #' @details
#' #'
#' This function works for both linear and tree models. #' This function works for both linear and tree models.
#' #'
#' For linear models, the importance is the absolute magnitude of linear coefficients. #' For linear models, the importance is the absolute magnitude of linear coefficients.
#' For that reason, in order to obtain a meaningful ranking by importance for a linear model, #' For that reason, in order to obtain a meaningful ranking by importance for a linear model,
#' the features need to be on the same scale (which you also would want to do when using either #' the features need to be on the same scale (which you also would want to do when using either
#' L1 or L2 regularization). #' L1 or L2 regularization).
#' #'
#' @return #' @return
#' #'
#' For a tree model, a \code{data.table} with the following columns: #' For a tree model, a \code{data.table} with the following columns:
#' \itemize{ #' \itemize{
#' \item \code{Feature} names of the features used in the model; #' \item \code{Feature} names of the features used in the model;
#' \item \code{Gain} represents fractional contribution of each feature to the model based on #' \item \code{Gain} represents fractional contribution of each feature to the model based on
#' the total gain of this feature's splits. Higher percentage means a more important #' the total gain of this feature's splits. Higher percentage means a more important
#' predictive feature. #' predictive feature.
#' \item \code{Cover} metric of the number of observations related to this feature; #' \item \code{Cover} metric of the number of observations related to this feature;
#' \item \code{Frequency} percentage representing the relative number of times #' \item \code{Frequency} percentage representing the relative number of times
#' a feature has been used in trees. #' a feature has been used in trees.
#' } #' }
#' #'
#' A linear model's importance \code{data.table} has the following columns: #' A linear model's importance \code{data.table} has the following columns:
#' \itemize{ #' \itemize{
#' \item \code{Feature} names of the features used in the model; #' \item \code{Feature} names of the features used in the model;
#' \item \code{Weight} the linear coefficient of this feature; #' \item \code{Weight} the linear coefficient of this feature;
#' \item \code{Class} (only for multiclass models) class label. #' \item \code{Class} (only for multiclass models) class label.
#' } #' }
#' #'
#' If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names}, #' If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names},
#' index of the features will be used instead. Because the index is extracted from the model dump #' index of the features will be used instead. Because the index is extracted from the model dump
#' (based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R). #' (based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R).
#' #'
#' @examples #' @examples
#' #'
#' # binomial classification using gbtree: #' # binomial classification using gbtree:
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#' xgb.importance(model = bst) #' xgb.importance(model = bst)
#' #'
#' # binomial classification using gblinear: #' # binomial classification using gblinear:
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear", #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear",
#' eta = 0.3, nthread = 1, nrounds = 20, objective = "binary:logistic") #' eta = 0.3, nthread = 1, nrounds = 20, objective = "binary:logistic")
#' xgb.importance(model = bst) #' xgb.importance(model = bst)
#' #'
#' # multiclass classification using gbtree: #' # multiclass classification using gbtree:
#' nclass <- 3 #' nclass <- 3
#' nrounds <- 10 #' nrounds <- 10
@@ -73,7 +73,7 @@
#' xgb.importance(model = mbst, trees = seq(from=0, by=nclass, length.out=nrounds)) #' xgb.importance(model = mbst, trees = seq(from=0, by=nclass, length.out=nrounds))
#' xgb.importance(model = mbst, trees = seq(from=1, by=nclass, length.out=nrounds)) #' xgb.importance(model = mbst, trees = seq(from=1, by=nclass, length.out=nrounds))
#' xgb.importance(model = mbst, trees = seq(from=2, by=nclass, length.out=nrounds)) #' xgb.importance(model = mbst, trees = seq(from=2, by=nclass, length.out=nrounds))
#' #'
#' # multiclass classification using gblinear: #' # multiclass classification using gblinear:
#' mbst <- xgboost(data = scale(as.matrix(iris[, -5])), label = as.numeric(iris$Species) - 1, #' mbst <- xgboost(data = scale(as.matrix(iris[, -5])), label = as.numeric(iris$Species) - 1,
#' booster = "gblinear", eta = 0.2, nthread = 1, nrounds = 15, #' booster = "gblinear", eta = 0.2, nthread = 1, nrounds = 15,
@@ -83,33 +83,33 @@
#' @export #' @export
xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL, xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
data = NULL, label = NULL, target = NULL){ data = NULL, label = NULL, target = NULL){
if (!(is.null(data) && is.null(label) && is.null(target))) if (!(is.null(data) && is.null(label) && is.null(target)))
warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated") warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated")
if (!inherits(model, "xgb.Booster")) if (!inherits(model, "xgb.Booster"))
stop("model: must be an object of class xgb.Booster") stop("model: must be an object of class xgb.Booster")
if (is.null(feature_names) && !is.null(model$feature_names)) if (is.null(feature_names) && !is.null(model$feature_names))
feature_names <- model$feature_names feature_names <- model$feature_names
if (!(is.null(feature_names) || is.character(feature_names))) if (!(is.null(feature_names) || is.character(feature_names)))
stop("feature_names: Has to be a character vector") stop("feature_names: Has to be a character vector")
model_text_dump <- xgb.dump(model = model, with_stats = TRUE) model_text_dump <- xgb.dump(model = model, with_stats = TRUE)
# linear model # linear model
if (model_text_dump[2] == "bias:"){ if(model_text_dump[2] == "bias:"){
weights <- which(model_text_dump == "weight:") %>% weights <- which(model_text_dump == "weight:") %>%
{model_text_dump[(. + 1):length(model_text_dump)]} %>% {model_text_dump[(. + 1):length(model_text_dump)]} %>%
as.numeric as.numeric
num_class <- NVL(model$params$num_class, 1) num_class <- NVL(model$params$num_class, 1)
if (is.null(feature_names)) if(is.null(feature_names))
feature_names <- seq(to = length(weights) / num_class) - 1 feature_names <- seq(to = length(weights) / num_class) - 1
if (length(feature_names) * num_class != length(weights)) if (length(feature_names) * num_class != length(weights))
stop("feature_names length does not match the number of features used in the model") stop("feature_names length does not match the number of features used in the model")
result <- if (num_class == 1) { result <- if (num_class == 1) {
data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))] data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))]
} else { } else {
@@ -117,17 +117,18 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
Weight = weights, Weight = weights,
Class = seq_len(num_class) - 1)[order(Class, -abs(Weight))] Class = seq_len(num_class) - 1)[order(Class, -abs(Weight))]
} }
} else { # tree model } else {
result <- xgb.model.dt.tree(feature_names = feature_names, # tree model
text = model_text_dump, result <- xgb.model.dt.tree(feature_names = feature_names,
trees = trees)[ text = model_text_dump,
Feature != "Leaf", .(Gain = sum(Quality), trees = trees)[
Cover = sum(Cover), Feature != "Leaf", .(Gain = sum(Quality),
Frequency = .N), by = Feature][ Cover = sum(Cover),
, `:=`(Gain = Gain / sum(Gain), Frequency = .N), by = Feature][
Cover = Cover / sum(Cover), ,`:=`(Gain = Gain / sum(Gain),
Frequency = Frequency / sum(Frequency))][ Cover = Cover / sum(Cover),
order(Gain, decreasing = TRUE)] Frequency = Frequency / sum(Frequency))][
order(Gain, decreasing = TRUE)]
} }
result result
} }

View File

@@ -1,12 +1,12 @@
#' Parse a boosted tree model text dump #' Parse a boosted tree model text dump
#' #'
#' Parse a boosted tree model text dump into a \code{data.table} structure. #' Parse a boosted tree model text dump into a \code{data.table} structure.
#' #'
#' @param feature_names character vector of feature names. If the model already #' @param feature_names character vector of feature names. If the model already
#' contains feature names, those would be used when \code{feature_names=NULL} (default value). #' contains feature names, those would be used when \code{feature_names=NULL} (default value).
#' Non-null \code{feature_names} could be provided to override those in the model. #' Non-null \code{feature_names} could be provided to override those in the model.
#' @param model object of class \code{xgb.Booster} #' @param model object of class \code{xgb.Booster}
#' @param text \code{character} vector previously generated by the \code{xgb.dump} #' @param text \code{character} vector previously generated by the \code{xgb.dump}
#' function (where parameter \code{with_stats = TRUE} should have been set). #' function (where parameter \code{with_stats = TRUE} should have been set).
#' \code{text} takes precedence over \code{model}. #' \code{text} takes precedence over \code{model}.
#' @param trees an integer vector of tree indices that should be parsed. #' @param trees an integer vector of tree indices that should be parsed.
@@ -18,11 +18,11 @@
#' represented as integers (when TRUE) or as "Tree-Node" character strings (when FALSE). #' represented as integers (when TRUE) or as "Tree-Node" character strings (when FALSE).
#' @param ... currently not used. #' @param ... currently not used.
#' #'
#' @return #' @return
#' A \code{data.table} with detailed information about model trees' nodes. #' A \code{data.table} with detailed information about model trees' nodes.
#' #'
#' The columns of the \code{data.table} are: #' The columns of the \code{data.table} are:
#' #'
#' \itemize{ #' \itemize{
#' \item \code{Tree}: integer ID of a tree in a model (zero-based index) #' \item \code{Tree}: integer ID of a tree in a model (zero-based index)
#' \item \code{Node}: integer ID of a node in a tree (zero-based index) #' \item \code{Node}: integer ID of a node in a tree (zero-based index)
@@ -36,111 +36,109 @@
#' \item \code{Quality}: either the split gain (change in loss) or the leaf value #' \item \code{Quality}: either the split gain (change in loss) or the leaf value
#' \item \code{Cover}: metric related to the number of observations either seen by a split #' \item \code{Cover}: metric related to the number of observations either seen by a split
#' or collected by a leaf during training. #' or collected by a leaf during training.
#' } #' }
#' #'
#' When \code{use_int_id=FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers #' When \code{use_int_id=FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers
#' in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from #' in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from
#' the corresponding trees in the "Node" column. #' the corresponding trees in the "Node" column.
#' #'
#' @examples #' @examples
#' # Basic use: #' # Basic use:
#' #'
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' #'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") #' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
#' #'
#' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst)) #' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
#' #'
#' # This bst model already has feature_names stored with it, so those would be used when #' # This bst model already has feature_names stored with it, so those would be used when
#' # feature_names is not set: #' # feature_names is not set:
#' (dt <- xgb.model.dt.tree(model = bst)) #' (dt <- xgb.model.dt.tree(model = bst))
#' #'
#' # How to match feature names of splits that are following a current 'Yes' branch: #' # How to match feature names of splits that are following a current 'Yes' branch:
#' #'
#' merge(dt, dt[, .(ID, Y.Feature=Feature)], by.x='Yes', by.y='ID', all.x=TRUE)[order(Tree,Node)] #' merge(dt, dt[, .(ID, Y.Feature=Feature)], by.x='Yes', by.y='ID', all.x=TRUE)[order(Tree,Node)]
#' #'
#' @export #' @export
xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
trees = NULL, use_int_id = FALSE, ...){ trees = NULL, use_int_id = FALSE, ...){
check.deprecation(...) check.deprecation(...)
if (!inherits(model, "xgb.Booster") && !is.character(text)) { if (!inherits(model, "xgb.Booster") && !is.character(text)) {
stop("Either 'model' must be an object of class xgb.Booster\n", stop("Either 'model' must be an object of class xgb.Booster\n",
" or 'text' must be a character vector with the result of xgb.dump\n", " or 'text' must be a character vector with the result of xgb.dump\n",
" (or NULL if 'model' was provided).") " (or NULL if 'model' was provided).")
} }
if (is.null(feature_names) && !is.null(model) && !is.null(model$feature_names)) if (is.null(feature_names) && !is.null(model) && !is.null(model$feature_names))
feature_names <- model$feature_names feature_names <- model$feature_names
if (!(is.null(feature_names) || is.character(feature_names))) { if (!(is.null(feature_names) || is.character(feature_names))) {
stop("feature_names: must be a character vector") stop("feature_names: must be a character vector")
} }
if (!(is.null(trees) || is.numeric(trees))) { if (!(is.null(trees) || is.numeric(trees))) {
stop("trees: must be a vector of integers.") stop("trees: must be a vector of integers.")
} }
if (is.null(text)){ if (is.null(text)){
text <- xgb.dump(model = model, with_stats = TRUE) text <- xgb.dump(model = model, with_stats = TRUE)
} }
if (length(text) < 2 || if (length(text) < 2 ||
sum(grepl('yes=(\\d+),no=(\\d+)', text)) < 1) { sum(stri_detect_regex(text, 'yes=(\\d+),no=(\\d+)')) < 1) {
stop("Non-tree model detected! This function can only be used with tree models.") stop("Non-tree model detected! This function can only be used with tree models.")
} }
position <- which(grepl("booster", text, fixed = TRUE)) position <- which(!is.na(stri_match_first_regex(text, "booster")))
add.tree.id <- function(node, tree) if (use_int_id) node else paste(tree, node, sep = "-") add.tree.id <- function(node, tree) if (use_int_id) node else paste(tree, node, sep = "-")
anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
td <- data.table(t = text) td <- data.table(t = text)
td[position, Tree := 1L] td[position, Tree := 1L]
td[, Tree := cumsum(ifelse(is.na(Tree), 0L, Tree)) - 1L] td[, Tree := cumsum(ifelse(is.na(Tree), 0L, Tree)) - 1L]
if (is.null(trees)) { if (is.null(trees)) {
trees <- 0:max(td$Tree) trees <- 0:max(td$Tree)
} else { } else {
trees <- trees[trees >= 0 & trees <= max(td$Tree)] trees <- trees[trees >= 0 & trees <= max(td$Tree)]
} }
td <- td[Tree %in% trees & !grepl('^booster', t)] td <- td[Tree %in% trees & !grepl('^booster', t)]
td[, Node := as.integer(sub("^([0-9]+):.*", "\\1", t))] td[, Node := stri_match_first_regex(t, "(\\d+):")[,2] %>% as.integer ]
if (!use_int_id) td[, ID := add.tree.id(Node, Tree)] if (!use_int_id) td[, ID := add.tree.id(Node, Tree)]
td[, isLeaf := grepl("leaf", t, fixed = TRUE)] td[, isLeaf := !is.na(stri_match_first_regex(t, "leaf"))]
# parse branch lines # parse branch lines
branch_rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),", branch_rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")") "gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
td[isLeaf == FALSE, td[isLeaf == FALSE,
(branch_cols) := { (branch_cols) := {
matches <- regmatches(t, regexec(branch_rx, t)) # skip some indices with spurious capture groups from anynumber_regex
# skip some indices with spurious capture groups from anynumber_regex xtr <- stri_match_first_regex(t, branch_rx)[, c(2,3,5,6,7,8,10), drop = FALSE]
xtr <- do.call(rbind, matches)[, c(2, 3, 5, 6, 7, 8, 10), drop = FALSE] xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree)
xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree) lapply(seq_len(ncol(xtr)), function(i) xtr[,i])
as.data.table(xtr)
}] }]
# assign feature_names when available # assign feature_names when available
if (!is.null(feature_names)) { if (!is.null(feature_names)) {
if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE)) if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE))
stop("feature_names has less elements than there are features used in the model") stop("feature_names has less elements than there are features used in the model")
td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1]] td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1] ]
} }
# parse leaf lines # parse leaf lines
leaf_rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")") leaf_rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
leaf_cols <- c("Feature", "Quality", "Cover") leaf_cols <- c("Feature", "Quality", "Cover")
td[isLeaf == TRUE, td[isLeaf == TRUE,
(leaf_cols) := { (leaf_cols) := {
matches <- regmatches(t, regexec(leaf_rx, t)) xtr <- stri_match_first_regex(t, leaf_rx)[, c(2,4)]
xtr <- do.call(rbind, matches)[, c(2, 4)] c("Leaf", lapply(seq_len(ncol(xtr)), function(i) xtr[,i]))
c("Leaf", as.data.table(xtr))
}] }]
# convert some columns to numeric # convert some columns to numeric
numeric_cols <- c("Split", "Quality", "Cover") numeric_cols <- c("Split", "Quality", "Cover")
td[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols = numeric_cols] td[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols = numeric_cols]
@@ -148,14 +146,14 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
int_cols <- c("Yes", "No", "Missing") int_cols <- c("Yes", "No", "Missing")
td[, (int_cols) := lapply(.SD, as.integer), .SDcols = int_cols] td[, (int_cols) := lapply(.SD, as.integer), .SDcols = int_cols]
} }
td[, t := NULL] td[, t := NULL]
td[, isLeaf := NULL] td[, isLeaf := NULL]
td[order(Tree, Node)] td[order(Tree, Node)]
} }
# Avoid error messages during CRAN check. # Avoid error messages during CRAN check.
# The reason is that these variables are never declared # The reason is that these variables are never declared
# They are mainly column names inferred by Data.table... # They are mainly column names inferred by Data.table...
globalVariables(c("Tree", "Node", "ID", "Feature", "t", "isLeaf", ".SD", ".SDcols")) globalVariables(c("Tree", "Node", "ID", "Feature", "t", "isLeaf",".SD", ".SDcols"))

View File

@@ -2,48 +2,48 @@
#' #'
#' Visualizes distributions related to depth of tree leafs. #' Visualizes distributions related to depth of tree leafs.
#' \code{xgb.plot.deepness} uses base R graphics, while \code{xgb.ggplot.deepness} uses the ggplot backend. #' \code{xgb.plot.deepness} uses base R graphics, while \code{xgb.ggplot.deepness} uses the ggplot backend.
#' #'
#' @param model either an \code{xgb.Booster} model generated by the \code{xgb.train} function #' @param model either an \code{xgb.Booster} model generated by the \code{xgb.train} function
#' or a data.table result of the \code{xgb.model.dt.tree} function. #' or a data.table result of the \code{xgb.model.dt.tree} function.
#' @param plot (base R barplot) whether a barplot should be produced. #' @param plot (base R barplot) whether a barplot should be produced.
#' If FALSE, only a data.table is returned. #' If FALSE, only a data.table is returned.
#' @param which which distribution to plot (see details). #' @param which which distribution to plot (see details).
#' @param ... other parameters passed to \code{barplot} or \code{plot}. #' @param ... other parameters passed to \code{barplot} or \code{plot}.
#' #'
#' @details #' @details
#' #'
#' When \code{which="2x1"}, two distributions with respect to the leaf depth #' When \code{which="2x1"}, two distributions with respect to the leaf depth
#' are plotted on top of each other: #' are plotted on top of each other:
#' \itemize{ #' \itemize{
#' \item the distribution of the number of leafs in a tree model at a certain depth; #' \item the distribution of the number of leafs in a tree model at a certain depth;
#' \item the distribution of average weighted number of observations ("cover") #' \item the distribution of average weighted number of observations ("cover")
#' ending up in leafs at certain depth. #' ending up in leafs at certain depth.
#' } #' }
#' Those could be helpful in determining sensible ranges of the \code{max_depth} #' Those could be helpful in determining sensible ranges of the \code{max_depth}
#' and \code{min_child_weight} parameters. #' and \code{min_child_weight} parameters.
#' #'
#' When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth #' When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth
#' per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how #' per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how
#' a tree's median absolute leaf weight changes through the iterations. #' a tree's median absolute leaf weight changes through the iterations.
#' #'
#' This function was inspired by the blog post #' This function was inspired by the blog post
#' \url{https://github.com/aysent/random-forest-leaf-visualization}. #' \url{https://github.com/aysent/random-forest-leaf-visualization}.
#' #'
#' @return #' @return
#' #'
#' Other than producing plots (when \code{plot=TRUE}), the \code{xgb.plot.deepness} function #' Other than producing plots (when \code{plot=TRUE}), the \code{xgb.plot.deepness} function
#' silently returns a processed data.table where each row corresponds to a terminal leaf in a tree model, #' silently returns a processed data.table where each row corresponds to a terminal leaf in a tree model,
#' and contains information about leaf's depth, cover, and weight (which is used in calculating predictions). #' and contains information about leaf's depth, cover, and weight (which is used in calculating predictions).
#' #'
#' The \code{xgb.ggplot.deepness} silently returns either a list of two ggplot graphs when \code{which="2x1"} #' The \code{xgb.ggplot.deepness} silently returns either a list of two ggplot graphs when \code{which="2x1"}
#' or a single ggplot graph for the other \code{which} options. #' or a single ggplot graph for the other \code{which} options.
#' #'
#' @seealso #' @seealso
#' #'
#' \code{\link{xgb.train}}, \code{\link{xgb.model.dt.tree}}. #' \code{\link{xgb.train}}, \code{\link{xgb.model.dt.tree}}.
#' #'
#' @examples #' @examples
#' #'
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' #'
#' # Change max_depth to a higher number to get a more significant result #' # Change max_depth to a higher number to get a more significant result
@@ -53,16 +53,16 @@
#' #'
#' xgb.plot.deepness(bst) #' xgb.plot.deepness(bst)
#' xgb.ggplot.deepness(bst) #' xgb.ggplot.deepness(bst)
#' #'
#' xgb.plot.deepness(bst, which='max.depth', pch=16, col=rgb(0,0,1,0.3), cex=2) #' xgb.plot.deepness(bst, which='max.depth', pch=16, col=rgb(0,0,1,0.3), cex=2)
#' #'
#' xgb.plot.deepness(bst, which='med.weight', pch=16, col=rgb(0,0,1,0.3), cex=2) #' xgb.plot.deepness(bst, which='med.weight', pch=16, col=rgb(0,0,1,0.3), cex=2)
#' #'
#' @rdname xgb.plot.deepness #' @rdname xgb.plot.deepness
#' @export #' @export
xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.depth", "med.weight"), xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.depth", "med.weight"),
plot = TRUE, ...) { plot = TRUE, ...) {
if (!(inherits(model, "xgb.Booster") || is.data.table(model))) if (!(inherits(model, "xgb.Booster") || is.data.table(model)))
stop("model: Has to be either an xgb.Booster model generaged by the xgb.train function\n", stop("model: Has to be either an xgb.Booster model generaged by the xgb.train function\n",
"or a data.table result of the xgb.importance function") "or a data.table result of the xgb.importance function")
@@ -71,32 +71,32 @@ xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.d
stop("igraph package is required for plotting the graph deepness.", call. = FALSE) stop("igraph package is required for plotting the graph deepness.", call. = FALSE)
which <- match.arg(which) which <- match.arg(which)
dt_tree <- model dt_tree <- model
if (inherits(model, "xgb.Booster")) if (inherits(model, "xgb.Booster"))
dt_tree <- xgb.model.dt.tree(model = model) dt_tree <- xgb.model.dt.tree(model = model)
if (!all(c("Feature", "Tree", "ID", "Yes", "No", "Cover") %in% colnames(dt_tree))) if (!all(c("Feature", "Tree", "ID", "Yes", "No", "Cover") %in% colnames(dt_tree)))
stop("Model tree columns are not as expected!\n", stop("Model tree columns are not as expected!\n",
" Note that this function works only for tree models.") " Note that this function works only for tree models.")
dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight = Quality)], by = "ID") dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight = Quality)], by = "ID")
setkeyv(dt_depths, c("Tree", "ID")) setkeyv(dt_depths, c("Tree", "ID"))
# count by depth levels, and also calculate average cover at a depth # count by depth levels, and also calculate average cover at a depth
dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth] dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth]
setkey(dt_summaries, "Depth") setkey(dt_summaries, "Depth")
if (plot) { if (plot) {
if (which == "2x1") { if (which == "2x1") {
op <- par(no.readonly = TRUE) op <- par(no.readonly = TRUE)
par(mfrow = c(2, 1), par(mfrow = c(2,1),
oma = c(3, 1, 3, 1) + 0.1, oma = c(3,1,3,1) + 0.1,
mar = c(1, 4, 1, 0) + 0.1) mar = c(1,4,1,0) + 0.1)
dt_summaries[, barplot(N, border = NA, ylab = 'Number of leafs', ...)] dt_summaries[, barplot(N, border = NA, ylab = 'Number of leafs', ...)]
dt_summaries[, barplot(Cover, border = NA, ylab = "Weighted cover", names.arg = Depth, ...)] dt_summaries[, barplot(Cover, border = NA, ylab = "Weighted cover", names.arg = Depth, ...)]
title("Model complexity", xlab = "Leaf depth", outer = TRUE, line = 1) title("Model complexity", xlab = "Leaf depth", outer = TRUE, line = 1)
par(op) par(op)
} else if (which == "max.depth") { } else if (which == "max.depth") {
@@ -123,14 +123,14 @@ get.leaf.depth <- function(dt_tree) {
dt_tree[Feature != "Leaf", .(ID, To = No, Tree)] dt_tree[Feature != "Leaf", .(ID, To = No, Tree)]
)) ))
# whether "To" is a leaf: # whether "To" is a leaf:
dt_edges <- dt_edges <-
merge(dt_edges, merge(dt_edges,
dt_tree[Feature == "Leaf", .(ID, Leaf = TRUE)], dt_tree[Feature == "Leaf", .(ID, Leaf = TRUE)],
all.x = TRUE, by.x = "To", by.y = "ID") all.x = TRUE, by.x = "To", by.y = "ID")
dt_edges[is.na(Leaf), Leaf := FALSE] dt_edges[is.na(Leaf), Leaf := FALSE]
dt_edges[, { dt_edges[, {
graph <- igraph::graph_from_data_frame(.SD[, .(ID, To)]) graph <- igraph::graph_from_data_frame(.SD[,.(ID, To)])
# min(ID) in a tree is a root node # min(ID) in a tree is a root node
paths_tmp <- igraph::shortest_paths(graph, from = min(ID), to = To[Leaf == TRUE]) paths_tmp <- igraph::shortest_paths(graph, from = min(ID), to = To[Leaf == TRUE])
# list of paths to each leaf in a tree # list of paths to each leaf in a tree

View File

@@ -92,27 +92,28 @@ xgb.plot.importance <- function(importance_matrix = NULL, top_n = NULL, measure
importance_matrix <- head(importance_matrix, top_n) importance_matrix <- head(importance_matrix, top_n)
} }
if (rel_to_first) { if (rel_to_first) {
importance_matrix[, Importance := Importance / max(abs(Importance))] importance_matrix[, Importance := Importance/max(abs(Importance))]
} }
if (is.null(cex)) { if (is.null(cex)) {
cex <- 2.5 / log2(1 + nrow(importance_matrix)) cex <- 2.5/log2(1 + nrow(importance_matrix))
} }
if (plot) { if (plot) {
original_mar <- par()$mar op <- par(no.readonly = TRUE)
mar <- op$mar
# reset margins so this function doesn't have side effects
on.exit({par(mar = original_mar)})
mar <- original_mar
if (!is.null(left_margin)) if (!is.null(left_margin))
mar[2] <- left_margin mar[2] <- left_margin
par(mar = mar) par(mar = mar)
# reverse the order of rows to have the highest ranked at the top # reverse the order of rows to have the highest ranked at the top
importance_matrix[rev(seq_len(nrow(importance_matrix))), importance_matrix[nrow(importance_matrix):1,
barplot(Importance, horiz = TRUE, border = NA, cex.names = cex, barplot(Importance, horiz = TRUE, border = NA, cex.names = cex,
names.arg = Feature, las = 1, ...)] names.arg = Feature, las = 1, ...)]
grid(NULL, NA)
# redraw over the grid
importance_matrix[nrow(importance_matrix):1,
barplot(Importance, horiz = TRUE, border = NA, add = TRUE)]
par(op)
} }
invisible(importance_matrix) invisible(importance_matrix)

View File

@@ -9,7 +9,7 @@
#' @param plot_height height in pixels of the graph to produce #' @param plot_height height in pixels of the graph to produce
#' @param render a logical flag for whether the graph should be rendered (see Value). #' @param render a logical flag for whether the graph should be rendered (see Value).
#' @param ... currently not used #' @param ... currently not used
#' #'
#' @details #' @details
#' #'
#' This function tries to capture the complexity of a gradient boosted tree model #' This function tries to capture the complexity of a gradient boosted tree model
@@ -67,55 +67,58 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
# first number of the path represents the tree, then the following numbers are related to the path to follow # first number of the path represents the tree, then the following numbers are related to the path to follow
# root init # root init
root.nodes <- tree.matrix[Node == 0, ID] root.nodes <- tree.matrix[stri_detect_regex(ID, "\\d+-0"), ID]
tree.matrix[ID %in% root.nodes, abs.node.position := root.nodes] tree.matrix[ID %in% root.nodes, abs.node.position := root.nodes]
precedent.nodes <- root.nodes precedent.nodes <- root.nodes
while (tree.matrix[, sum(is.na(abs.node.position))] > 0) { while(tree.matrix[,sum(is.na(abs.node.position))] > 0) {
yes.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(Yes)] yes.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(Yes)]
no.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(No)] no.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(No)]
yes.nodes.abs.pos <- yes.row.nodes[, abs.node.position] %>% paste0("_0") yes.nodes.abs.pos <- yes.row.nodes[, abs.node.position] %>% paste0("_0")
no.nodes.abs.pos <- no.row.nodes[, abs.node.position] %>% paste0("_1") no.nodes.abs.pos <- no.row.nodes[, abs.node.position] %>% paste0("_1")
tree.matrix[ID %in% yes.row.nodes[, Yes], abs.node.position := yes.nodes.abs.pos] tree.matrix[ID %in% yes.row.nodes[, Yes], abs.node.position := yes.nodes.abs.pos]
tree.matrix[ID %in% no.row.nodes[, No], abs.node.position := no.nodes.abs.pos] tree.matrix[ID %in% no.row.nodes[, No], abs.node.position := no.nodes.abs.pos]
precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos) precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos)
} }
tree.matrix[!is.na(Yes), Yes := paste0(abs.node.position, "_0")] tree.matrix[!is.na(Yes), Yes := paste0(abs.node.position, "_0")]
tree.matrix[!is.na(No), No := paste0(abs.node.position, "_1")] tree.matrix[!is.na(No), No := paste0(abs.node.position, "_1")]
for (nm in c("abs.node.position", "Yes", "No")) remove.tree <- . %>% stri_replace_first_regex(pattern = "^\\d+-", replacement = "")
data.table::set(tree.matrix, j = nm, value = sub("^\\d+-", "", tree.matrix[[nm]]))
tree.matrix[,`:=`(abs.node.position = remove.tree(abs.node.position),
Yes = remove.tree(Yes),
No = remove.tree(No))]
nodes.dt <- tree.matrix[ nodes.dt <- tree.matrix[
, .(Quality = sum(Quality)) , .(Quality = sum(Quality))
, by = .(abs.node.position, Feature) , by = .(abs.node.position, Feature)
][, .(Text = paste0(Feature[1:min(length(Feature), features_keep)], ][, .(Text = paste0(Feature[1:min(length(Feature), features_keep)],
" (", " (",
format(Quality[1:min(length(Quality), features_keep)], digits = 5), format(Quality[1:min(length(Quality), features_keep)], digits=5),
")") %>% ")") %>%
paste0(collapse = "\n")) paste0(collapse = "\n"))
, by = abs.node.position] , by = abs.node.position]
edges.dt <- tree.matrix[Feature != "Leaf", .(abs.node.position, Yes)] %>% edges.dt <- tree.matrix[Feature != "Leaf", .(abs.node.position, Yes)] %>%
list(tree.matrix[Feature != "Leaf", .(abs.node.position, No)]) %>% list(tree.matrix[Feature != "Leaf",.(abs.node.position, No)]) %>%
rbindlist() %>% rbindlist() %>%
setnames(c("From", "To")) %>% setnames(c("From", "To")) %>%
.[, .N, .(From, To)] %>% .[, .N, .(From, To)] %>%
.[, N := NULL] .[, N:=NULL]
nodes <- DiagrammeR::create_node_df( nodes <- DiagrammeR::create_node_df(
n = nrow(nodes.dt), n = nrow(nodes.dt),
label = nodes.dt[, Text] label = nodes.dt[,Text]
) )
edges <- DiagrammeR::create_edge_df( edges <- DiagrammeR::create_edge_df(
from = match(edges.dt[, From], nodes.dt[, abs.node.position]), from = match(edges.dt[,From], nodes.dt[,abs.node.position]),
to = match(edges.dt[, To], nodes.dt[, abs.node.position]), to = match(edges.dt[,To], nodes.dt[,abs.node.position]),
rel = "leading_to") rel = "leading_to")
graph <- DiagrammeR::create_graph( graph <- DiagrammeR::create_graph(
nodes_df = nodes, nodes_df = nodes,
edges_df = edges, edges_df = edges,

View File

@@ -81,7 +81,6 @@
#' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none") #' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
#' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE) #' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
#' xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3) #' xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
#' xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12) # Summary plot
#' #'
#' # multiclass example - plots for each class separately: #' # multiclass example - plots for each class separately:
#' nclass <- 3 #' nclass <- 3
@@ -100,7 +99,6 @@
#' n_col = 2, col = col, pch = 16, pch_NA = 17) #' n_col = 2, col = col, pch = 16, pch_NA = 17)
#' xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4, #' xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4,
#' n_col = 2, col = col, pch = 16, pch_NA = 17) #' n_col = 2, col = col, pch = 16, pch_NA = 17)
#' xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4) # Summary plot
#' #'
#' @rdname xgb.plot.shap #' @rdname xgb.plot.shap
#' @export #' @export
@@ -111,33 +109,69 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6), pch_NA = '.', pos_NA = 1.07, plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6), pch_NA = '.', pos_NA = 1.07,
plot_loess = TRUE, col_loess = 2, span_loess = 0.5, plot_loess = TRUE, col_loess = 2, span_loess = 0.5,
which = c("1d", "2d"), plot = TRUE, ...) { which = c("1d", "2d"), plot = TRUE, ...) {
data_list <- xgb.shap.data(
data = data, if (!is.matrix(data) && !inherits(data, "dgCMatrix"))
shap_contrib = shap_contrib, stop("data: must be either matrix or dgCMatrix")
features = features,
top_n = top_n, if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster")))
model = model, stop("when shap_contrib is not provided, one must provide an xgb.Booster model")
trees = trees,
target_class = target_class, if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster")))
approxcontrib = approxcontrib, stop("when features are not provided, one must provide an xgb.Booster model to rank the features")
subsample = subsample,
max_observations = 100000 if (!is.null(shap_contrib) &&
) (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1))
data <- data_list[["data"]] stop("shap_contrib is not compatible with the provided data")
shap_contrib <- data_list[["shap_contrib"]]
features <- colnames(data) nsample <- if (is.null(subsample)) min(100000, nrow(data)) else as.integer(subsample * nrow(data))
idx <- sample(1:nrow(data), nsample)
data <- data[idx,]
if (is.null(shap_contrib)) {
shap_contrib <- predict(model, data, predcontrib = TRUE, approxcontrib = approxcontrib)
} else {
shap_contrib <- shap_contrib[idx,]
}
which <- match.arg(which) which <- match.arg(which)
if (which == "2d") if (which == "2d")
stop("2D plots are not implemented yet") stop("2D plots are not implemented yet")
if (is.null(features)) {
imp <- xgb.importance(model = model, trees = trees)
top_n <- as.integer(top_n[1])
if (top_n < 1 && top_n > 100)
stop("top_n: must be an integer within [1, 100]")
features <- imp$Feature[1:min(top_n, NROW(imp))]
}
if (is.character(features)) {
if (is.null(colnames(data)))
stop("Either provide `data` with column names or provide `features` as column indices")
features <- match(features, colnames(data))
}
if (n_col > length(features)) n_col <- length(features) if (n_col > length(features)) n_col <- length(features)
if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]]
else Reduce("+", lapply(shap_contrib, abs))
}
shap_contrib <- shap_contrib[, features, drop = FALSE]
data <- data[, features, drop = FALSE]
cols <- colnames(data)
if (is.null(cols)) cols <- colnames(shap_contrib)
if (is.null(cols)) cols <- paste0('X', 1:ncol(data))
colnames(data) <- cols
colnames(shap_contrib) <- cols
if (plot && which == "1d") { if (plot && which == "1d") {
op <- par(mfrow = c(ceiling(length(features) / n_col), n_col), op <- par(mfrow = c(ceiling(length(features) / n_col), n_col),
oma = c(0, 0, 0, 0) + 0.2, oma = c(0,0,0,0) + 0.2,
mar = c(3.5, 3.5, 0, 0) + 0.1, mar = c(3.5,3.5,0,0) + 0.1,
mgp = c(1.7, 0.6, 0)) mgp = c(1.7, 0.6, 0))
for (f in features) { for (f in cols) {
ord <- order(data[, f]) ord <- order(data[, f])
x <- data[, f][ord] x <- data[, f][ord]
y <- shap_contrib[, f][ord] y <- shap_contrib[, f][ord]
@@ -158,7 +192,7 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
grid() grid()
if (plot_loess) { if (plot_loess) {
# compress x to 3 digits, and mean-aggredate y # compress x to 3 digits, and mean-aggredate y
zz <- data.table(x = signif(x, 3), y)[, .(.N, y = mean(y)), x] zz <- data.table(x = signif(x, 3), y)[, .(.N, y=mean(y)), x]
if (nrow(zz) <= 5) { if (nrow(zz) <= 5) {
lines(zz$x, zz$y, col = col_loess) lines(zz$x, zz$y, col = col_loess)
} else { } else {
@@ -182,108 +216,3 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
} }
invisible(list(data = data, shap_contrib = shap_contrib)) invisible(list(data = data, shap_contrib = shap_contrib))
} }
#' SHAP contribution dependency summary plot
#'
#' Show the SHAP contributions of several features side by side.
#'
#' For every feature a strip of points is drawn along the SHAP value axis,
#' one point per sample of \code{data}. The colour of a point reflects the
#' value of the feature for that observation. Reading the plot therefore
#' tells which features push the model prediction down or up, and whether
#' that effect differs between small and large feature values. The aim is to
#' mimic the \code{summary_plot} function of https://github.com/slundberg/shap.
#'
#' @inheritParams xgb.plot.shap
#'
#' @return A \code{ggplot2} object.
#' @export
#'
#' @examples
#' # See \code{\link{xgb.plot.shap}}.
#' @seealso \code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}},
#'  \url{https://github.com/slundberg/shap}
xgb.plot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL,
                                  trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) {
  # No base-graphics variant exists: every argument is handed over, in order,
  # to the ggplot implementation and its result is returned unchanged.
  xgb.ggplot.shap.summary(
    data,
    shap_contrib,
    features,
    top_n,
    model,
    trees,
    target_class,
    approxcontrib,
    subsample
  )
}
#' Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc.
#' Internal utility function.
#'
#' Validates the inputs, selects the rows to plot (a random subsample when
#' \code{subsample} is given, otherwise the first \code{max_observations}
#' rows), obtains the SHAP contributions (either the supplied
#' \code{shap_contrib} or a \code{predict(..., predcontrib = TRUE)} call on
#' \code{model}) and finally restricts both matrices to the requested
#' \code{features} (or to the \code{top_n} features ranked by
#' \code{xgb.importance} when \code{features} is \code{NULL}).
#'
#' \code{model} may be omitted when both \code{shap_contrib} and
#' \code{features} are supplied.
#'
#' @inheritParams xgb.plot.shap
#' @param max_observations maximum number of leading rows of \code{data} that
#'        are kept when \code{subsample} is \code{NULL}.
#' @keywords internal
#'
#' @return A list containing: 'data', a matrix containing sample observations
#'         and their feature values; 'shap_contrib', a matrix containing the SHAP contribution
#'         values for these observations.
xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, model = NULL,
                          trees = NULL, target_class = NULL, approxcontrib = FALSE,
                          subsample = NULL, max_observations = 100000) {
  # inherits() is FALSE for NULL, so this covers "missing" and "wrong class".
  has_booster <- inherits(model, "xgb.Booster")

  if (!is.matrix(data) && !inherits(data, "dgCMatrix"))
    stop("data: must be either matrix or dgCMatrix")
  if (is.null(shap_contrib) && !has_booster)
    stop("when shap_contrib is not provided, one must provide an xgb.Booster model")
  if (is.null(features) && !has_booster)
    stop("when features are not provided, one must provide an xgb.Booster model to rank the features")
  if (!is.null(shap_contrib) &&
      (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1))
    stop("shap_contrib is not compatible with the provided data")
  if (is.character(features) && is.null(colnames(data)))
    stop("either provide `data` with column names or provide `features` as column indices")
  # Only meaningful when a booster was supplied: with model = NULL the
  # comparison below would be zero-length and make `if` fail. isTRUE() also
  # skips the check when the booster does not carry an `nfeatures` field.
  if (has_booster && is.null(model$feature_names) && isTRUE(model$nfeatures != ncol(data)))
    stop("if model has no feature_names, columns in `data` must match features in model")

  if (!is.null(subsample)) {
    idx <- sample(x = seq_len(nrow(data)), size = as.integer(subsample * nrow(data)), replace = FALSE)
  } else {
    idx <- seq_len(min(nrow(data), max_observations))
  }
  # drop = FALSE keeps a matrix even when a single row is selected.
  data <- data[idx, , drop = FALSE]
  if (is.null(colnames(data))) {
    colnames(data) <- paste0("X", seq_len(ncol(data)))
  }

  if (!is.null(shap_contrib)) {
    # The validation above guarantees a single matrix with ncol(data) + 1
    # columns here, so no multiclass (list) handling is needed on this path.
    shap_contrib <- shap_contrib[idx, , drop = FALSE]
    if (is.null(colnames(shap_contrib))) {
      # One name per feature plus one for the extra trailing column
      # (presumably the bias term -- the label is only cosmetic).
      colnames(shap_contrib) <- c(colnames(data), "BIAS")
    }
  } else {
    shap_contrib <- predict(model, newdata = data, predcontrib = TRUE, approxcontrib = approxcontrib)
    if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
      shap_contrib <- if (!is.null(target_class)) {
        shap_contrib[[target_class + 1]]
      } else {
        Reduce("+", lapply(shap_contrib, abs))
      }
    }
  }

  if (is.null(features)) {
    top_n <- top_n[1]
    if (top_n < 1 || top_n > 100) stop("top_n: must be an integer within [1, 100]")
    if (!is.null(model$feature_names)) {
      imp <- xgb.importance(model = model, trees = trees)
    } else {
      imp <- xgb.importance(model = model, trees = trees, feature_names = colnames(data))
    }
    # seq_len() stays empty for an empty importance table (1:0 would not).
    features <- imp$Feature[seq_len(min(top_n, NROW(imp)))]
  }
  if (is.character(features)) {
    positions <- match(features, colnames(data))
    if (anyNA(positions)) {
      stop("features not found in the columns of `data`: ",
           paste(features[is.na(positions)], collapse = ", "))
    }
    features <- positions
  }

  list(
    data = data[, features, drop = FALSE],
    shap_contrib = shap_contrib[, features, drop = FALSE]
  )
}

View File

@@ -1,7 +1,7 @@
#' Plot a boosted tree model #' Plot a boosted tree model
#' #'
#' Read a tree model text dump and plot the model. #' Read a tree model text dump and plot the model.
#' #'
#' @param feature_names names of each feature as a \code{character} vector. #' @param feature_names names of each feature as a \code{character} vector.
#' @param model produced by the \code{xgb.train} function. #' @param model produced by the \code{xgb.train} function.
#' @param trees an integer vector of tree indices that should be visualized. #' @param trees an integer vector of tree indices that should be visualized.
@@ -14,10 +14,10 @@
#' @param show_node_id a logical flag for whether to show node id's in the graph. #' @param show_node_id a logical flag for whether to show node id's in the graph.
#' @param ... currently not used. #' @param ... currently not used.
#' #'
#' @details #' @details
#' #'
#' The content of each node is organised that way: #' The content of each node is organised that way:
#' #'
#' \itemize{ #' \itemize{
#' \item Feature name. #' \item Feature name.
#' \item \code{Cover}: The sum of second order gradient of training data classified to the leaf. #' \item \code{Cover}: The sum of second order gradient of training data classified to the leaf.
@@ -27,21 +27,21 @@
#' \item \code{Gain} (for split nodes): the information gain metric of a split #' \item \code{Gain} (for split nodes): the information gain metric of a split
#' (corresponds to the importance of the node in the model). #' (corresponds to the importance of the node in the model).
#' \item \code{Value} (for leafs): the margin value that the leaf may contribute to prediction. #' \item \code{Value} (for leafs): the margin value that the leaf may contribute to prediction.
#' } #' }
#' The tree root nodes also indicate the Tree index (0-based). #' The tree root nodes also indicate the Tree index (0-based).
#' #'
#' The "Yes" branches are marked by the "< split_value" label. #' The "Yes" branches are marked by the "< split_value" label.
#' The branches that also used for missing values are marked as bold #' The branches that also used for missing values are marked as bold
#' (as in "carrying extra capacity"). #' (as in "carrying extra capacity").
#' #'
#' This function uses \href{http://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR. #' This function uses \href{http://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
#' #'
#' @return #' @return
#' #'
#' When \code{render = TRUE}: #' When \code{render = TRUE}:
#' returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}. #' returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
#' Similar to ggplot objects, it needs to be printed to see it when not running from command line. #' Similar to ggplot objects, it needs to be printed to see it when not running from command line.
#' #'
#' When \code{render = FALSE}: #' When \code{render = FALSE}:
#' silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}. #' silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
#' This could be useful if one wants to modify some of the graph attributes #' This could be useful if one wants to modify some of the graph attributes
@@ -49,23 +49,23 @@
#' #'
#' @examples #' @examples
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' #'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3, #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") #' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
#' # plot all the trees #' # plot all the trees
#' xgb.plot.tree(model = bst) #' xgb.plot.tree(model = bst)
#' # plot only the first tree and display the node ID: #' # plot only the first tree and display the node ID:
#' xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE) #' xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE)
#' #'
#' \dontrun{ #' \dontrun{
#' # Below is an example of how to save this plot to a file. #' # Below is an example of how to save this plot to a file.
#' # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed. #' # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
#' library(DiagrammeR) #' library(DiagrammeR)
#' gr <- xgb.plot.tree(model=bst, trees=0:1, render=FALSE) #' gr <- xgb.plot.tree(model=bst, trees=0:1, render=FALSE)
#' export_graph(gr, 'tree.pdf', width=1500, height=1900) #' export_graph(gr, 'tree.pdf', width=1500, height=1900)
#' export_graph(gr, 'tree.png', width=1500, height=1900) #' export_graph(gr, 'tree.png', width=1500, height=1900)
#' } #' }
#' #'
#' @export #' @export
xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot_width = NULL, plot_height = NULL, xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot_width = NULL, plot_height = NULL,
render = TRUE, show_node_id = FALSE, ...){ render = TRUE, show_node_id = FALSE, ...){
@@ -77,18 +77,18 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
if (!requireNamespace("DiagrammeR", quietly = TRUE)) { if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE) stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE)
} }
dt <- xgb.model.dt.tree(feature_names = feature_names, model = model, trees = trees) dt <- xgb.model.dt.tree(feature_names = feature_names, model = model, trees = trees)
dt[, label := paste0(Feature, "\nCover: ", Cover, ifelse(Feature == "Leaf", "\nValue: ", "\nGain: "), Quality)] dt[, label:= paste0(Feature, "\nCover: ", Cover, ifelse(Feature == "Leaf", "\nValue: ", "\nGain: "), Quality)]
if (show_node_id) if (show_node_id)
dt[, label := paste0(ID, ": ", label)] dt[, label := paste0(ID, ": ", label)]
dt[Node == 0, label := paste0("Tree ", Tree, "\n", label)] dt[Node == 0, label := paste0("Tree ", Tree, "\n", label)]
dt[, shape := "rectangle"][Feature == "Leaf", shape := "oval"] dt[, shape:= "rectangle"][Feature == "Leaf", shape:= "oval"]
dt[, filledcolor := "Beige"][Feature == "Leaf", filledcolor := "Khaki"] dt[, filledcolor:= "Beige"][Feature == "Leaf", filledcolor:= "Khaki"]
# in order to draw the first tree on top: # in order to draw the first tree on top:
dt <- dt[order(-Tree)] dt <- dt[order(-Tree)]
nodes <- DiagrammeR::create_node_df( nodes <- DiagrammeR::create_node_df(
n = nrow(dt), n = nrow(dt),
ID = dt$ID, ID = dt$ID,
@@ -97,7 +97,7 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
shape = dt$shape, shape = dt$shape,
data = dt$Feature, data = dt$Feature,
fontcolor = "black") fontcolor = "black")
edges <- DiagrammeR::create_edge_df( edges <- DiagrammeR::create_edge_df(
from = match(dt[Feature != "Leaf", c(ID)] %>% rep(2), dt$ID), from = match(dt[Feature != "Leaf", c(ID)] %>% rep(2), dt$ID),
to = match(dt[Feature != "Leaf", c(Yes, No)], dt$ID), to = match(dt[Feature != "Leaf", c(Yes, No)], dt$ID),
@@ -126,9 +126,9 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
attr_type = "edge", attr_type = "edge",
attr = c("color", "arrowsize", "arrowhead", "fontname"), attr = c("color", "arrowsize", "arrowhead", "fontname"),
value = c("DimGray", "1.5", "vee", "Helvetica")) value = c("DimGray", "1.5", "vee", "Helvetica"))
if (!render) return(invisible(graph)) if (!render) return(invisible(graph))
DiagrammeR::render_graph(graph, width = plot_width, height = plot_height) DiagrammeR::render_graph(graph, width = plot_width, height = plot_height)
} }

View File

@@ -1,33 +1,29 @@
#' Save xgboost model to binary file #' Save xgboost model to binary file
#' #'
#' Save xgboost model to a file in binary format. #' Save xgboost model to a file in binary format.
#' #'
#' @param model model object of \code{xgb.Booster} class. #' @param model model object of \code{xgb.Booster} class.
#' @param fname name of the file to write. #' @param fname name of the file to write.
#' #'
#' @details #' @details
#' This methods allows to save a model in an xgboost-internal binary format which is universal #' This methods allows to save a model in an xgboost-internal binary format which is universal
#' among the various xgboost interfaces. In R, the saved model file could be read-in later #' among the various xgboost interfaces. In R, the saved model file could be read-in later
#' using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter #' using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
#' of \code{\link{xgb.train}}. #' of \code{\link{xgb.train}}.
#' #'
#' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}} #' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}}
#' or \code{\link[base]{save}}). However, it would then only be compatible with R, and #' or \code{\link[base]{save}}). However, it would then only be compatible with R, and
#' corresponding R-methods would need to be used to load it. Moreover, persisting the model with #' corresponding R-methods would need to be used to load it.
#' \code{\link[base]{readRDS}} or \code{\link[base]{save}}) will cause compatibility problems in #'
#' future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn #' @seealso
#' how to persist models in a future-proof way, i.e. to make the model accessible in future #' \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
#' releases of XGBoost. #'
#'
#' @seealso
#' \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
#'
#' @examples #' @examples
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost') #' data(agaricus.test, package='xgboost')
#' train <- agaricus.train #' train <- agaricus.train
#' test <- agaricus.test #' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, #' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") #' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
#' xgb.save(bst, 'xgb.model') #' xgb.save(bst, 'xgb.model')
#' bst <- xgb.load('xgb.model') #' bst <- xgb.load('xgb.model')
@@ -42,7 +38,6 @@ xgb.save <- function(model, fname) {
if (inherits(model, "xgb.DMatrix")) " Use xgb.DMatrix.save to save an xgb.DMatrix object." else "") if (inherits(model, "xgb.DMatrix")) " Use xgb.DMatrix.save to save an xgb.DMatrix object." else "")
} }
model <- xgb.Booster.complete(model, saveraw = FALSE) model <- xgb.Booster.complete(model, saveraw = FALSE)
fname <- path.expand(fname)
.Call(XGBoosterSaveModel_R, model$handle, fname[1]) .Call(XGBoosterSaveModel_R, model$handle, fname[1])
return(TRUE) return(TRUE)
} }

View File

@@ -3,9 +3,9 @@
#' \code{xgb.train} is an advanced interface for training an xgboost model. #' \code{xgb.train} is an advanced interface for training an xgboost model.
#' The \code{xgboost} function is a simpler wrapper for \code{xgb.train}. #' The \code{xgboost} function is a simpler wrapper for \code{xgb.train}.
#' #'
#' @param params the list of parameters. The complete list of parameters is #' @param params the list of parameters.
#' available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below #' The complete list of parameters is available at \url{http://xgboost.readthedocs.io/en/latest/parameter.html}.
#' is a shorter summary: #' Below is a shorter summary:
#' #'
#' 1. General Parameters #' 1. General Parameters
#' #'
@@ -15,7 +15,7 @@
#' #'
#' 2. Booster Parameters #' 2. Booster Parameters
#' #'
#' 2.1. Parameters for Tree Booster #' 2.1. Parameter for Tree Booster
#' #'
#' \itemize{ #' \itemize{
#' \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3 #' \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
@@ -24,14 +24,12 @@
#' \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 #' \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nrounds}. Default: 1 #' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nrounds}. Default: 1
#' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1 #' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
#' \item \code{lambda} L2 regularization term on weights. Default: 1
#' \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
#' \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1 #' \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
#' \item \code{monotone_constraints} A numerical vector consists of \code{1}, \code{0} and \code{-1} with its length equals to the number of features in the training data. \code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint. #' \item \code{monotone_constraints} A numerical vector consists of \code{1}, \code{0} and \code{-1} with its length equals to the number of features in the training data. \code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint.
#' \item \code{interaction_constraints} A list of vectors specifying feature indices of permitted interactions. Each item of the list represents one permitted interaction where specified features are allowed to interact with each other. Feature index values should start from \code{0} (\code{0} references the first column). Leave argument unspecified for no interaction constraints. #' \item \code{interaction_constraints} A list of vectors specifying feature indices of permitted interactions. Each item of the list represents one permitted interaction where specified features are allowed to interact with each other. Feature index values should start from \code{0} (\code{0} references the first column). Leave argument unspecified for no interaction constraints.
#' } #' }
#' #'
#' 2.2. Parameters for Linear Booster #' 2.2. Parameter for Linear Booster
#' #'
#' \itemize{ #' \itemize{
#' \item \code{lambda} L2 regularization term on weights. Default: 0 #' \item \code{lambda} L2 regularization term on weights. Default: 0
@@ -45,23 +43,13 @@
#' \item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below: #' \item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below:
#' \itemize{ #' \itemize{
#' \item \code{reg:squarederror} Regression with squared loss (Default). #' \item \code{reg:squarederror} Regression with squared loss (Default).
#' \item \code{reg:squaredlogerror}: regression with squared log loss \eqn{1/2 * (log(pred + 1) - log(label + 1))^2}. All inputs are required to be greater than -1. Also, see metric rmsle for possible issue with this objective.
#' \item \code{reg:logistic} logistic regression. #' \item \code{reg:logistic} logistic regression.
#' \item \code{reg:pseudohubererror}: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
#' \item \code{binary:logistic} logistic regression for binary classification. Output probability. #' \item \code{binary:logistic} logistic regression for binary classification. Output probability.
#' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. #' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
#' \item \code{binary:hinge}: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities. #' \item \code{num_class} set the number of classes. To use only with multiclass objectives.
#' \item \code{count:poisson}: poisson regression for count data, output mean of poisson distribution. \code{max_delta_step} is set to 0.7 by default in poisson regression (used to safeguard optimization).
#' \item \code{survival:cox}: Cox regression for right censored survival time data (negative values are considered right censored). Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional hazard function \code{h(t) = h0(t) * HR)}.
#' \item \code{survival:aft}: Accelerated failure time model for censored survival time data. See \href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time} for details.
#' \item \code{aft_loss_distribution}: Probabilty Density Function used by \code{survival:aft} and \code{aft-nloglik} metric.
#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}. #' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
#' \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class. #' \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
#' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. #' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
#' \item \code{rank:ndcg}: Use LambdaMART to perform list-wise ranking where \href{https://en.wikipedia.org/wiki/Discounted_cumulative_gain}{Normalized Discounted Cumulative Gain (NDCG)} is maximized.
#' \item \code{rank:map}: Use LambdaMART to perform list-wise ranking where \href{https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision}{Mean Average Precision (MAP)} is maximized.
#' \item \code{reg:gamma}: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Gamma_distribution#Applications}{gamma-distributed}.
#' \item \code{reg:tweedie}: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Tweedie_distribution#Applications}{Tweedie-distributed}.
#' } #' }
#' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 #' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
#' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. #' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
@@ -132,18 +120,16 @@
#' Note that when using a customized metric, only this single metric can be used. #' Note that when using a customized metric, only this single metric can be used.
#' The following is the list of built-in metrics for which Xgboost provides optimized implementation: #' The following is the list of built-in metrics for which Xgboost provides optimized implementation:
#' \itemize{ #' \itemize{
#' \item \code{rmse} root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error} #' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
#' \item \code{logloss} negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood} #' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
#' \item \code{mlogloss} multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html} #' \item \code{mlogloss} multiclass logloss. \url{http://wiki.fast.ai/index.php/Log_Loss}
#' \item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}. #' \item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
#' By default, it uses the 0.5 threshold for predicted values to define negative and positive instances. #' By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
#' Different threshold (e.g., 0.) could be specified as "error@0." #' Different threshold (e.g., 0.) could be specified as "error@0."
#' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}. #' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
#' \item \code{mae} Mean absolute error #' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
#' \item \code{mape} Mean absolute percentage error
#' \item \code{auc} Area under the curve. \url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
#' \item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation. #' \item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
#' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG} #' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
#' } #' }
#' #'
#' The following callbacks are automatically created when certain parameters are set: #' The following callbacks are automatically created when certain parameters are set:
@@ -195,8 +181,8 @@
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost') #' data(agaricus.test, package='xgboost')
#' #'
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label)) #' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
#' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label)) #' dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
#' watchlist <- list(train = dtrain, eval = dtest) #' watchlist <- list(train = dtrain, eval = dtest)
#' #'
#' ## A simple xgb.train example: #' ## A simple xgb.train example:
@@ -282,7 +268,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
# evaluation printing callback # evaluation printing callback
params <- c(params) params <- c(params)
print_every_n <- max(as.integer(print_every_n), 1L) print_every_n <- max( as.integer(print_every_n), 1L)
if (!has.callbacks(callbacks, 'cb.print.evaluation') && if (!has.callbacks(callbacks, 'cb.print.evaluation') &&
verbose) { verbose) {
callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n)) callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n))
@@ -332,9 +318,12 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
niter_init <- xgb.ntree(bst) %/% (num_parallel_tree * num_class) niter_init <- xgb.ntree(bst) %/% (num_parallel_tree * num_class)
} }
} }
if (is_update && nrounds > niter_init) if(is_update && nrounds > niter_init)
stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)") stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)")
# TODO: distributed code
rank <- 0
niter_skip <- ifelse(is_update, 0, niter_init) niter_skip <- ifelse(is_update, 0, niter_init)
begin_iteration <- niter_skip + 1 begin_iteration <- niter_skip + 1
end_iteration <- niter_skip + nrounds end_iteration <- niter_skip + nrounds
@@ -346,6 +335,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
xgb.iter.update(bst$handle, dtrain, iteration - 1, obj) xgb.iter.update(bst$handle, dtrain, iteration - 1, obj)
bst_evaluation <- numeric(0)
if (length(watchlist) > 0) if (length(watchlist) > 0)
bst_evaluation <- xgb.iter.eval(bst$handle, watchlist, iteration - 1, feval) bst_evaluation <- xgb.iter.eval(bst$handle, watchlist, iteration - 1, feval)
@@ -360,7 +350,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
bst <- xgb.Booster.complete(bst, saveraw = TRUE) bst <- xgb.Booster.complete(bst, saveraw = TRUE)
# store the total number of boosting iterations # store the total number of boosting iterations
bst$niter <- end_iteration bst$niter = end_iteration
# store the evaluation results # store the evaluation results
if (length(evaluation_log) > 0 && if (length(evaluation_log) > 0 &&

View File

@@ -6,26 +6,7 @@
xgb.unserialize <- function(buffer) { xgb.unserialize <- function(buffer) {
cachelist <- list() cachelist <- list()
handle <- .Call(XGBoosterCreate_R, cachelist) handle <- .Call(XGBoosterCreate_R, cachelist)
tryCatch( .Call(XGBoosterUnserializeFromBuffer_R, handle, buffer)
.Call(XGBoosterUnserializeFromBuffer_R, handle, buffer),
error = function(e) {
error_msg <- conditionMessage(e)
m <- regexec("(src[\\\\/]learner.cc:[0-9]+): Check failed: (header == serialisation_header_)",
error_msg, perl = TRUE)
groups <- regmatches(error_msg, m)[[1]]
if (length(groups) == 3) {
warning(paste("The model had been generated by XGBoost version 1.0.0 or earlier and was ",
"loaded from a RDS file. We strongly ADVISE AGAINST using saveRDS() ",
"function, to ensure that your model can be read in current and upcoming ",
"XGBoost releases. Please use xgb.save() instead to preserve models for the ",
"long term. For more details and explanation, see ",
"https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html",
sep = ""))
.Call(XGBoosterLoadModelFromRaw_R, handle, buffer)
} else {
stop(e)
}
})
class(handle) <- "xgb.Booster.handle" class(handle) <- "xgb.Booster.handle"
return (handle) return (handle)
} }

View File

@@ -91,8 +91,11 @@ NULL
#' @importFrom data.table setkeyv #' @importFrom data.table setkeyv
#' @importFrom data.table setnames #' @importFrom data.table setnames
#' @importFrom magrittr %>% #' @importFrom magrittr %>%
#' @importFrom jsonlite fromJSON #' @importFrom stringi stri_detect_regex
#' @importFrom jsonlite toJSON #' @importFrom stringi stri_match_first_regex
#' @importFrom stringi stri_replace_first_regex
#' @importFrom stringi stri_replace_all_regex
#' @importFrom stringi stri_split_regex
#' @importFrom utils object.size str tail #' @importFrom utils object.size str tail
#' @importFrom stats predict #' @importFrom stats predict
#' @importFrom stats median #' @importFrom stats median

18
R-package/configure vendored
View File

@@ -613,7 +613,6 @@ infodir
docdir docdir
oldincludedir oldincludedir
includedir includedir
runstatedir
localstatedir localstatedir
sharedstatedir sharedstatedir
sysconfdir sysconfdir
@@ -683,7 +682,6 @@ datadir='${datarootdir}'
sysconfdir='${prefix}/etc' sysconfdir='${prefix}/etc'
sharedstatedir='${prefix}/com' sharedstatedir='${prefix}/com'
localstatedir='${prefix}/var' localstatedir='${prefix}/var'
runstatedir='${localstatedir}/run'
includedir='${prefix}/include' includedir='${prefix}/include'
oldincludedir='/usr/include' oldincludedir='/usr/include'
docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
@@ -936,15 +934,6 @@ do
| -silent | --silent | --silen | --sile | --sil) | -silent | --silent | --silen | --sile | --sil)
silent=yes ;; silent=yes ;;
-runstatedir | --runstatedir | --runstatedi | --runstated \
| --runstate | --runstat | --runsta | --runst | --runs \
| --run | --ru | --r)
ac_prev=runstatedir ;;
-runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
| --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
| --run=* | --ru=* | --r=*)
runstatedir=$ac_optarg ;;
-sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
ac_prev=sbindir ;; ac_prev=sbindir ;;
-sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
@@ -1082,7 +1071,7 @@ fi
for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \
datadir sysconfdir sharedstatedir localstatedir includedir \ datadir sysconfdir sharedstatedir localstatedir includedir \
oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
libdir localedir mandir runstatedir libdir localedir mandir
do do
eval ac_val=\$$ac_var eval ac_val=\$$ac_var
# Remove trailing slashes. # Remove trailing slashes.
@@ -1235,7 +1224,6 @@ Fine tuning of the installation directories:
--sysconfdir=DIR read-only single-machine data [PREFIX/etc] --sysconfdir=DIR read-only single-machine data [PREFIX/etc]
--sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com]
--localstatedir=DIR modifiable single-machine data [PREFIX/var] --localstatedir=DIR modifiable single-machine data [PREFIX/var]
--runstatedir=DIR modifiable per-process data [LOCALSTATEDIR/run]
--libdir=DIR object code libraries [EPREFIX/lib] --libdir=DIR object code libraries [EPREFIX/lib]
--includedir=DIR C header files [PREFIX/include] --includedir=DIR C header files [PREFIX/include]
--oldincludedir=DIR C header files for non-gcc [/usr/include] --oldincludedir=DIR C header files for non-gcc [/usr/include]
@@ -2725,14 +2713,14 @@ main ()
return 0; return 0;
} }
_ACEOF _ACEOF
${CC} -o conftest conftest.c ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes ${CC} -o conftest conftest.c /usr/local/lib/libomp.dylib -Xclang -fopenmp 2>/dev/null && ./conftest && ac_pkg_openmp=yes
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${ac_pkg_openmp}" >&5 { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${ac_pkg_openmp}" >&5
$as_echo "${ac_pkg_openmp}" >&6; } $as_echo "${ac_pkg_openmp}" >&6; }
if test "${ac_pkg_openmp}" = no; then if test "${ac_pkg_openmp}" = no; then
OPENMP_CXXFLAGS='' OPENMP_CXXFLAGS=''
OPENMP_LIB='' OPENMP_LIB=''
echo '*****************************************************************************************' echo '*****************************************************************************************'
echo ' OpenMP is unavailable on this Mac OSX system. Training speed may be suboptimal.' echo 'WARNING: OpenMP is unavailable on this Mac OSX system. Training speed may be suboptimal.'
echo ' To use all CPU cores for training jobs, you should install OpenMP by running\n' echo ' To use all CPU cores for training jobs, you should install OpenMP by running\n'
echo ' brew install libomp' echo ' brew install libomp'
echo '*****************************************************************************************' echo '*****************************************************************************************'

View File

@@ -1,6 +1,6 @@
### configure.ac -*- Autoconf -*- ### configure.ac -*- Autoconf -*-
AC_PREREQ(2.69) AC_PREREQ(2.62)
AC_INIT([xgboost],[0.6-3],[],[xgboost],[]) AC_INIT([xgboost],[0.6-3],[],[xgboost],[])
@@ -33,13 +33,13 @@ then
ac_pkg_openmp=no ac_pkg_openmp=no
AC_MSG_CHECKING([whether OpenMP will work in a package]) AC_MSG_CHECKING([whether OpenMP will work in a package])
AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <omp.h>]], [[ return (omp_get_max_threads() <= 1); ]])]) AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <omp.h>]], [[ return (omp_get_max_threads() <= 1); ]])])
${CC} -o conftest conftest.c ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes ${CC} -o conftest conftest.c /usr/local/lib/libomp.dylib -Xclang -fopenmp 2>/dev/null && ./conftest && ac_pkg_openmp=yes
AC_MSG_RESULT([${ac_pkg_openmp}]) AC_MSG_RESULT([${ac_pkg_openmp}])
if test "${ac_pkg_openmp}" = no; then if test "${ac_pkg_openmp}" = no; then
OPENMP_CXXFLAGS='' OPENMP_CXXFLAGS=''
OPENMP_LIB='' OPENMP_LIB=''
echo '*****************************************************************************************' echo '*****************************************************************************************'
echo ' OpenMP is unavailable on this Mac OSX system. Training speed may be suboptimal.' echo 'WARNING: OpenMP is unavailable on this Mac OSX system. Training speed may be suboptimal.'
echo ' To use all CPU cores for training jobs, you should install OpenMP by running\n' echo ' To use all CPU cores for training jobs, you should install OpenMP by running\n'
echo ' brew install libomp' echo ' brew install libomp'
echo '*****************************************************************************************' echo '*****************************************************************************************'
@@ -52,3 +52,4 @@ AC_SUBST(ENDIAN_FLAG)
AC_SUBST(BACKTRACE_LIB) AC_SUBST(BACKTRACE_LIB)
AC_CONFIG_FILES([src/Makevars]) AC_CONFIG_FILES([src/Makevars])
AC_OUTPUT AC_OUTPUT

View File

@@ -17,4 +17,4 @@ Benchmarks
Notes Notes
==== ====
* Contribution of examples, benchmarks is more than welcomed! * Contribution of examples, benchmarks is more than welcomed!
* If you like to share how you use xgboost to solve your problem, send a pull request :) * If you like to share how you use xgboost to solve your problem, send a pull request:)

View File

@@ -3,8 +3,8 @@ require(methods)
# we load in the agaricus dataset # we load in the agaricus dataset
# In this example, we are aiming to predict whether a mushroom is edible # In this example, we are aiming to predict whether a mushroom is edible
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
# the loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1} # the loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1}
@@ -26,7 +26,7 @@ bst <- xgboost(data = as.matrix(train$data), label = train$label, max_depth = 2,
# you can also put in xgb.DMatrix object, which stores label, data and other meta datas needed for advanced features # you can also put in xgb.DMatrix object, which stores label, data and other meta datas needed for advanced features
print("Training xgboost with xgb.DMatrix") print("Training xgboost with xgb.DMatrix")
dtrain <- xgb.DMatrix(data = train$data, label = train$label) dtrain <- xgb.DMatrix(data = train$data, label = train$label)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2, bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2,
objective = "binary:logistic") objective = "binary:logistic")
# Verbose = 0,1,2 # Verbose = 0,1,2
@@ -46,7 +46,7 @@ bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
#--------------------basic prediction using xgboost-------------- #--------------------basic prediction using xgboost--------------
# you can do prediction using the following line # you can do prediction using the following line
# you can put in Matrix, sparseMatrix, or xgb.DMatrix # you can put in Matrix, sparseMatrix, or xgb.DMatrix
pred <- predict(bst, test$data) pred <- predict(bst, test$data)
err <- mean(as.numeric(pred > 0.5) != test$label) err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err)) print(paste("test-error=", err))
@@ -58,31 +58,31 @@ xgb.save(bst, "xgboost.model")
bst2 <- xgb.load("xgboost.model") bst2 <- xgb.load("xgboost.model")
pred2 <- predict(bst2, test$data) pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred # pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred)))) print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
# save model to R's raw vector # save model to R's raw vector
raw <- xgb.save.raw(bst) raw = xgb.save.raw(bst)
# load binary model to R # load binary model to R
bst3 <- xgb.load(raw) bst3 <- xgb.load(raw)
pred3 <- predict(bst3, test$data) pred3 <- predict(bst3, test$data)
# pred3 should be identical to pred # pred3 should be identical to pred
print(paste("sum(abs(pred3-pred))=", sum(abs(pred3 - pred)))) print(paste("sum(abs(pred3-pred))=", sum(abs(pred3-pred))))
#----------------Advanced features -------------- #----------------Advanced features --------------
# to use advanced features, we need to put data in xgb.DMatrix # to use advanced features, we need to put data in xgb.DMatrix
dtrain <- xgb.DMatrix(data = train$data, label = train$label) dtrain <- xgb.DMatrix(data = train$data, label=train$label)
dtest <- xgb.DMatrix(data = test$data, label = test$label) dtest <- xgb.DMatrix(data = test$data, label=test$label)
#---------------Using watchlist---------------- #---------------Using watchlist----------------
# watchlist is a list of xgb.DMatrix, each of them is tagged with name # watchlist is a list of xgb.DMatrix, each of them is tagged with name
watchlist <- list(train = dtrain, test = dtest) watchlist <- list(train=dtrain, test=dtest)
# to train with watchlist, use xgb.train, which contains more advanced features # to train with watchlist, use xgb.train, which contains more advanced features
# watchlist allows us to monitor the evaluation result on all data in the list # watchlist allows us to monitor the evaluation result on all data in the list
print("Train xgboost using xgb.train with watchlist") print("Train xgboost using xgb.train with watchlist")
bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist, bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nrounds=2, watchlist=watchlist,
nthread = 2, objective = "binary:logistic") nthread = 2, objective = "binary:logistic")
# we can change evaluation metrics, or use multiple evaluation metrics # we can change evaluation metrics, or use multiple evaluation metrics
print("train xgboost using xgb.train with watchlist, watch logloss and error") print("train xgboost using xgb.train with watchlist, watch logloss and error")
bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist, bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nrounds=2, watchlist=watchlist,
eval_metric = "error", eval_metric = "logloss", eval_metric = "error", eval_metric = "logloss",
nthread = 2, objective = "binary:logistic") nthread = 2, objective = "binary:logistic")
@@ -90,17 +90,17 @@ bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist =
xgb.DMatrix.save(dtrain, "dtrain.buffer") xgb.DMatrix.save(dtrain, "dtrain.buffer")
# to load it in, simply call xgb.DMatrix # to load it in, simply call xgb.DMatrix
dtrain2 <- xgb.DMatrix("dtrain.buffer") dtrain2 <- xgb.DMatrix("dtrain.buffer")
bst <- xgb.train(data = dtrain2, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist, bst <- xgb.train(data=dtrain2, max_depth=2, eta=1, nrounds=2, watchlist=watchlist,
nthread = 2, objective = "binary:logistic") nthread = 2, objective = "binary:logistic")
# information can be extracted from xgb.DMatrix using getinfo # information can be extracted from xgb.DMatrix using getinfo
label <- getinfo(dtest, "label") label = getinfo(dtest, "label")
pred <- predict(bst, dtest) pred <- predict(bst, dtest)
err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label) err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
print(paste("test-error=", err)) print(paste("test-error=", err))
# You can dump the tree you learned using xgb.dump into a text file # You can dump the tree you learned using xgb.dump into a text file
dump_path <- file.path(tempdir(), 'dump.raw.txt') dump_path = file.path(tempdir(), 'dump.raw.txt')
xgb.dump(bst, dump_path, with_stats = TRUE) xgb.dump(bst, dump_path, with_stats = T)
# Finally, you can check which features are the most important. # Finally, you can check which features are the most important.
print("Most important features (look at column Gain):") print("Most important features (look at column Gain):")

View File

@@ -1,7 +1,7 @@
require(xgboost) require(xgboost)
# load in the agaricus dataset # load in the agaricus dataset
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
@@ -11,12 +11,12 @@ watchlist <- list(eval = dtest, train = dtrain)
# #
print('start running example to start from a initial prediction') print('start running example to start from a initial prediction')
# train xgboost for 1 round # train xgboost for 1 round
param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic') param <- list(max_depth=2, eta=1, nthread = 2, silent=1, objective='binary:logistic')
bst <- xgb.train(param, dtrain, 1, watchlist) bst <- xgb.train(param, dtrain, 1, watchlist)
# Note: we need the margin value instead of transformed prediction in set_base_margin # Note: we need the margin value instead of transformed prediction in set_base_margin
# do predict with output_margin=TRUE, will always give you margin values before logistic transformation # do predict with output_margin=TRUE, will always give you margin values before logistic transformation
ptrain <- predict(bst, dtrain, outputmargin = TRUE) ptrain <- predict(bst, dtrain, outputmargin=TRUE)
ptest <- predict(bst, dtest, outputmargin = TRUE) ptest <- predict(bst, dtest, outputmargin=TRUE)
# set the base_margin property of dtrain and dtest # set the base_margin property of dtrain and dtest
# base margin is the base prediction we will boost from # base margin is the base prediction we will boost from
setinfo(dtrain, "base_margin", ptrain) setinfo(dtrain, "base_margin", ptrain)

View File

@@ -1,5 +1,5 @@
# install development version of caret library that contains xgboost models # install development version of caret library that contains xgboost models
devtools::install_github("topepo/caret/pkg/caret") devtools::install_github("topepo/caret/pkg/caret")
require(caret) require(caret)
require(xgboost) require(xgboost)
require(data.table) require(data.table)
@@ -9,17 +9,17 @@ require(e1071)
# Load Arthritis dataset in memory. # Load Arthritis dataset in memory.
data(Arthritis) data(Arthritis)
# Create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance are really good). # Create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance are really good).
df <- data.table(Arthritis, keep.rownames = FALSE) df <- data.table(Arthritis, keep.rownames = F)
# Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features. # Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features.
# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treat them as independant values. # For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treat them as independant values.
df[, AgeDiscret := as.factor(round(Age / 10, 0))] df[,AgeDiscret:= as.factor(round(Age/10,0))]
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!). # Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))] df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small). # We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[, ID := NULL] df[,ID:=NULL]
#-------------Basic Training using XGBoost in caret Library----------------- #-------------Basic Training using XGBoost in caret Library-----------------
# Set up control parameters for caret::train # Set up control parameters for caret::train

View File

@@ -6,10 +6,10 @@ if (!require(vcd)) {
require(vcd) require(vcd)
} }
# According to its documentation, Xgboost works only on numbers. # According to its documentation, Xgboost works only on numbers.
# Sometimes the dataset we have to work on have categorical data. # Sometimes the dataset we have to work on have categorical data.
# A categorical variable is one which have a fixed number of values. By example, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as value, it is a categorical variable. # A categorical variable is one which have a fixed number of values. By example, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as value, it is a categorical variable.
# #
# In R, categorical variable is called Factor. # In R, categorical variable is called Factor.
# Type ?factor in console for more information. # Type ?factor in console for more information.
# #
# In this demo we will see how to transform a dense dataframe with categorical variables to a sparse matrix before analyzing it in Xgboost. # In this demo we will see how to transform a dense dataframe with categorical variables to a sparse matrix before analyzing it in Xgboost.
@@ -19,7 +19,7 @@ if (!require(vcd)) {
data(Arthritis) data(Arthritis)
# create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance are really good). # create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance are really good).
df <- data.table(Arthritis, keep.rownames = FALSE) df <- data.table(Arthritis, keep.rownames = F)
# Let's have a look to the data.table # Let's have a look to the data.table
cat("Print the dataset\n") cat("Print the dataset\n")
@@ -32,17 +32,17 @@ str(df)
# Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features. # Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features.
# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treat them as independant values. # For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treat them as independant values.
df[, AgeDiscret := as.factor(round(Age / 10, 0))] df[,AgeDiscret:= as.factor(round(Age/10,0))]
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!). # Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))] df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small). # We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[, ID := NULL] df[,ID:=NULL]
# List the different values for the column Treatment: Placebo, Treated. # List the different values for the column Treatment: Placebo, Treated.
cat("Values of the categorical feature Treatment\n") cat("Values of the categorical feature Treatment\n")
print(levels(df[, Treatment])) print(levels(df[,Treatment]))
# Next step, we will transform the categorical data to dummy variables. # Next step, we will transform the categorical data to dummy variables.
# This method is also called one hot encoding. # This method is also called one hot encoding.
@@ -52,16 +52,16 @@ print(levels(df[, Treatment]))
# #
# Formulae Improved~.-1 used below means transform all categorical features but column Improved to binary values. # Formulae Improved~.-1 used below means transform all categorical features but column Improved to binary values.
# Column Improved is excluded because it will be our output column, the one we want to predict. # Column Improved is excluded because it will be our output column, the one we want to predict.
sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df) sparse_matrix = sparse.model.matrix(Improved~.-1, data = df)
cat("Encoding of the sparse Matrix\n") cat("Encoding of the sparse Matrix\n")
print(sparse_matrix) print(sparse_matrix)
# Create the output vector (not sparse) # Create the output vector (not sparse)
# 1. Set, for all rows, field in Y column to 0; # 1. Set, for all rows, field in Y column to 0;
# 2. set Y to 1 when Improved == Marked; # 2. set Y to 1 when Improved == Marked;
# 3. Return Y column # 3. Return Y column
output_vector <- df[, Y := 0][Improved == "Marked", Y := 1][, Y] output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
# Following is the same process as other demo # Following is the same process as other demo
cat("Learning...\n") cat("Learning...\n")

View File

@@ -1,25 +1,25 @@
require(xgboost) require(xgboost)
# load in the agaricus dataset # load in the agaricus dataset
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
nrounds <- 2 nrounds <- 2
param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic') param <- list(max_depth=2, eta=1, silent=1, nthread=2, objective='binary:logistic')
cat('running cross validation\n') cat('running cross validation\n')
# do cross validation, this will print result out as # do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value # [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric # std_value is standard deviation of the metric
xgb.cv(param, dtrain, nrounds, nfold = 5, metrics = {'error'}) xgb.cv(param, dtrain, nrounds, nfold=5, metrics={'error'})
cat('running cross validation, disable standard deviation display\n') cat('running cross validation, disable standard deviation display\n')
# do cross validation, this will print result out as # do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value # [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric # std_value is standard deviation of the metric
xgb.cv(param, dtrain, nrounds, nfold = 5, xgb.cv(param, dtrain, nrounds, nfold=5,
metrics = 'error', showsd = FALSE) metrics='error', showsd = FALSE)
### ###
# you can also do cross validation with cutomized loss function # you can also do cross validation with cutomized loss function
@@ -29,18 +29,18 @@ print ('running cross validation, with cutomsized loss function')
logregobj <- function(preds, dtrain) { logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label") labels <- getinfo(dtrain, "label")
preds <- 1 / (1 + exp(-preds)) preds <- 1/(1 + exp(-preds))
grad <- preds - labels grad <- preds - labels
hess <- preds * (1 - preds) hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess)) return(list(grad = grad, hess = hess))
} }
evalerror <- function(preds, dtrain) { evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label") labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0))) / length(labels) err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err)) return(list(metric = "error", value = err))
} }
param <- list(max_depth = 2, eta = 1, param <- list(max_depth=2, eta=1, silent=1,
objective = logregobj, eval_metric = evalerror) objective = logregobj, eval_metric = evalerror)
# train with customized objective # train with customized objective
xgb.cv(params = param, data = dtrain, nrounds = nrounds, nfold = 5) xgb.cv(params = param, data = dtrain, nrounds = nrounds, nfold = 5)

View File

@@ -1,7 +1,7 @@
require(xgboost) require(xgboost)
# load in the agaricus dataset # load in the agaricus dataset
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
@@ -15,7 +15,7 @@ num_round <- 2
# this is loglikelihood loss # this is loglikelihood loss
logregobj <- function(preds, dtrain) { logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label") labels <- getinfo(dtrain, "label")
preds <- 1 / (1 + exp(-preds)) preds <- 1/(1 + exp(-preds))
grad <- preds - labels grad <- preds - labels
hess <- preds * (1 - preds) hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess)) return(list(grad = grad, hess = hess))
@@ -29,36 +29,36 @@ logregobj <- function(preds, dtrain) {
# Take this in mind when you use the customization, and maybe you need write customized evaluation function # Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) { evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label") labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0))) / length(labels) err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err)) return(list(metric = "error", value = err))
} }
param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0, param <- list(max_depth=2, eta=1, nthread = 2, verbosity=0,
objective = logregobj, eval_metric = evalerror) objective=logregobj, eval_metric=evalerror)
print ('start training with user customized objective') print ('start training with user customized objective')
# training with customized objective, we can also do step by step training # training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train # simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, num_round, watchlist) bst <- xgb.train(param, dtrain, num_round, watchlist)
# #
# there can be cases where you want additional information # there can be cases where you want additional information
# being considered besides the property of DMatrix you can get by getinfo # being considered besides the property of DMatrix you can get by getinfo
# you can set additional information as attributes if DMatrix # you can set additional information as attributes if DMatrix
# set label attribute of dtrain to be label, we use label as an example, it can be anything # set label attribute of dtrain to be label, we use label as an example, it can be anything
attr(dtrain, 'label') <- getinfo(dtrain, 'label') attr(dtrain, 'label') <- getinfo(dtrain, 'label')
# this is new customized objective, where you can access things you set # this is new customized objective, where you can access things you set
# same thing applies to customized evaluation function # same thing applies to customized evaluation function
logregobjattr <- function(preds, dtrain) { logregobjattr <- function(preds, dtrain) {
# now you can access the attribute in customized function # now you can access the attribute in customized function
labels <- attr(dtrain, 'label') labels <- attr(dtrain, 'label')
preds <- 1 / (1 + exp(-preds)) preds <- 1/(1 + exp(-preds))
grad <- preds - labels grad <- preds - labels
hess <- preds * (1 - preds) hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess)) return(list(grad = grad, hess = hess))
} }
param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0, param <- list(max_depth=2, eta=1, nthread = 2, verbosity=0,
objective = logregobjattr, eval_metric = evalerror) objective=logregobjattr, eval_metric=evalerror)
print ('start training with user customized objective, with additional attributes in DMatrix') print ('start training with user customized objective, with additional attributes in DMatrix')
# training with customized objective, we can also do step by step training # training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train # simply look at xgboost.py's implementation of train

View File

@@ -1,20 +1,20 @@
require(xgboost) require(xgboost)
# load in the agaricus dataset # load in the agaricus dataset
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
# note: for customized objective function, we leave objective as default # note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction # note: what we are getting is margin value in prediction
# you must know what you are doing # you must know what you are doing
param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0) param <- list(max_depth=2, eta=1, nthread=2, verbosity=0)
watchlist <- list(eval = dtest) watchlist <- list(eval = dtest)
num_round <- 20 num_round <- 20
# user define objective function, given prediction, return gradient and second order gradient # user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss # this is loglikelihood loss
logregobj <- function(preds, dtrain) { logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label") labels <- getinfo(dtrain, "label")
preds <- 1 / (1 + exp(-preds)) preds <- 1/(1 + exp(-preds))
grad <- preds - labels grad <- preds - labels
hess <- preds * (1 - preds) hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess)) return(list(grad = grad, hess = hess))
@@ -27,7 +27,7 @@ logregobj <- function(preds, dtrain) {
# Take this in mind when you use the customization, and maybe you need write customized evaluation function # Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) { evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label") labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0))) / length(labels) err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err)) return(list(metric = "error", value = err))
} }
print ('start training with early Stopping setting') print ('start training with early Stopping setting')

View File

@@ -1,7 +1,7 @@
require(xgboost) require(xgboost)
# load in the agaricus dataset # load in the agaricus dataset
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
## ##
@@ -11,14 +11,14 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
## ##
# change booster to gblinear, so that we are fitting a linear model # change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer # alpha is the L1 regularizer
# lambda is the L2 regularizer # lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term # you can also set lambda_bias which is L2 regularizer on the bias term
param <- list(objective = "binary:logistic", booster = "gblinear", param <- list(objective = "binary:logistic", booster = "gblinear",
nthread = 2, alpha = 0.0001, lambda = 1) nthread = 2, alpha = 0.0001, lambda = 1)
# normally, you do not need to set eta (step_size) # normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun), # XGBoost uses a parallel coordinate descent algorithm (shotgun),
# there could be affection on convergence with parallelization on certain cases # there could be affection on convergence with parallelization on certain cases
# setting eta to be smaller value, e.g 0.5 can make the optimization more stable # setting eta to be smaller value, e.g 0.5 can make the optimization more stable
@@ -30,4 +30,5 @@ num_round <- 2
bst <- xgb.train(param, dtrain, num_round, watchlist) bst <- xgb.train(param, dtrain, num_round, watchlist)
ypred <- predict(bst, dtest) ypred <- predict(bst, dtest)
labels <- getinfo(dtest, 'label') labels <- getinfo(dtest, 'label')
cat('error of preds=', mean(as.numeric(ypred > 0.5) != labels), '\n') cat('error of preds=', mean(as.numeric(ypred>0.5)!=labels),'\n')

View File

@@ -1,9 +1,9 @@
# An example of using GPU-accelerated tree building algorithms # An example of using GPU-accelerated tree building algorithms
# #
# NOTE: it can only run if you have a CUDA-enable GPU and the package was # NOTE: it can only run if you have a CUDA-enable GPU and the package was
# specially compiled with GPU support. # specially compiled with GPU support.
# #
# For the current functionality, see # For the current functionality, see
# https://xgboost.readthedocs.io/en/latest/gpu/index.html # https://xgboost.readthedocs.io/en/latest/gpu/index.html
# #
@@ -21,8 +21,8 @@ m <- X[, sel] %*% betas - 1 + rnorm(N)
y <- rbinom(N, 1, plogis(m)) y <- rbinom(N, 1, plogis(m))
tr <- sample.int(N, N * 0.75) tr <- sample.int(N, N * 0.75)
dtrain <- xgb.DMatrix(X[tr, ], label = y[tr]) dtrain <- xgb.DMatrix(X[tr,], label = y[tr])
dtest <- xgb.DMatrix(X[-tr, ], label = y[-tr]) dtest <- xgb.DMatrix(X[-tr,], label = y[-tr])
wl <- list(train = dtrain, test = dtest) wl <- list(train = dtrain, test = dtest)
# An example of running 'gpu_hist' algorithm # An example of running 'gpu_hist' algorithm

View File

@@ -4,39 +4,34 @@ library(data.table)
set.seed(1024) set.seed(1024)
# Function to obtain a list of interactions fitted in trees, requires input of maximum depth # Function to obtain a list of interactions fitted in trees, requires input of maximum depth
treeInteractions <- function(input_tree, input_max_depth) { treeInteractions <- function(input_tree, input_max_depth){
ID_merge <- i.id <- i.feature <- NULL # Suppress warning "no visible binding for global variable" trees <- copy(input_tree) # copy tree input to prevent overwriting
trees <- data.table::copy(input_tree) # copy tree input to prevent overwriting
if (input_max_depth < 2) return(list()) # no interactions if max depth < 2 if (input_max_depth < 2) return(list()) # no interactions if max depth < 2
if (nrow(input_tree) == 1) return(list()) if (nrow(input_tree) == 1) return(list())
# Attach parent nodes # Attach parent nodes
for (i in 2:input_max_depth) { for (i in 2:input_max_depth){
if (i == 2) trees[, ID_merge := ID] else trees[, ID_merge := get(paste0('parent_', i - 2))] if (i == 2) trees[, ID_merge:=ID] else trees[, ID_merge:=get(paste0('parent_',i-2))]
parents_left <- trees[!is.na(Split), list(i.id = ID, i.feature = Feature, ID_merge = Yes)] parents_left <- trees[!is.na(Split), list(i.id=ID, i.feature=Feature, ID_merge=Yes)]
parents_right <- trees[!is.na(Split), list(i.id = ID, i.feature = Feature, ID_merge = No)] parents_right <- trees[!is.na(Split), list(i.id=ID, i.feature=Feature, ID_merge=No)]
data.table::setorderv(trees, 'ID_merge') setorderv(trees, 'ID_merge')
data.table::setorderv(parents_left, 'ID_merge') setorderv(parents_left, 'ID_merge')
data.table::setorderv(parents_right, 'ID_merge') setorderv(parents_right, 'ID_merge')
trees <- merge(trees, parents_left, by = 'ID_merge', all.x = TRUE) trees <- merge(trees, parents_left, by='ID_merge', all.x=T)
trees[!is.na(i.id), c(paste0('parent_', i - 1), paste0('parent_feat_', i - 1)) trees[!is.na(i.id), c(paste0('parent_', i-1), paste0('parent_feat_', i-1)):=list(i.id, i.feature)]
:= list(i.id, i.feature)] trees[, c('i.id','i.feature'):=NULL]
trees[, c('i.id', 'i.feature') := NULL]
trees <- merge(trees, parents_right, by = 'ID_merge', all.x = TRUE) trees <- merge(trees, parents_right, by='ID_merge', all.x=T)
trees[!is.na(i.id), c(paste0('parent_', i - 1), paste0('parent_feat_', i - 1)) trees[!is.na(i.id), c(paste0('parent_', i-1), paste0('parent_feat_', i-1)):=list(i.id, i.feature)]
:= list(i.id, i.feature)] trees[, c('i.id','i.feature'):=NULL]
trees[, c('i.id', 'i.feature') := NULL]
} }
# Extract nodes with interactions # Extract nodes with interactions
interaction_trees <- trees[!is.na(Split) & !is.na(parent_1), interaction_trees <- trees[!is.na(Split) & !is.na(parent_1),
c('Feature', paste0('parent_feat_', 1:(input_max_depth - 1))), c('Feature',paste0('parent_feat_',1:(input_max_depth-1))), with=F]
with = FALSE] interaction_trees_split <- split(interaction_trees, 1:nrow(interaction_trees))
interaction_trees_split <- split(interaction_trees, seq_len(nrow(interaction_trees)))
interaction_list <- lapply(interaction_trees_split, as.character) interaction_list <- lapply(interaction_trees_split, as.character)
# Remove NAs (no parent interaction) # Remove NAs (no parent interaction)
@@ -52,62 +47,59 @@ treeInteractions <- function(input_tree, input_max_depth) {
# Generate sample data # Generate sample data
x <- list() x <- list()
for (i in 1:10) { for (i in 1:10){
x[[i]] <- i * rnorm(1000, 10) x[[i]] = i*rnorm(1000, 10)
} }
x <- as.data.table(x) x <- as.data.table(x)
y <- -1 * x[, rowSums(.SD)] + x[['V1']] * x[['V2']] + x[['V3']] * x[['V4']] * x[['V5']] y = -1*x[, rowSums(.SD)] + x[['V1']]*x[['V2']] + x[['V3']]*x[['V4']]*x[['V5']] + rnorm(1000, 0.001) + 3*sin(x[['V7']])
+ rnorm(1000, 0.001) + 3 * sin(x[['V7']])
train <- as.matrix(x) train = as.matrix(x)
# Interaction constraint list (column names form) # Interaction constraint list (column names form)
interaction_list <- list(c('V1', 'V2'), c('V3', 'V4', 'V5')) interaction_list <- list(c('V1','V2'),c('V3','V4','V5'))
# Convert interaction constraint list into feature index form # Convert interaction constraint list into feature index form
cols2ids <- function(object, col_names) { cols2ids <- function(object, col_names) {
LUT <- seq_along(col_names) - 1 LUT <- seq_along(col_names) - 1
names(LUT) <- col_names names(LUT) <- col_names
rapply(object, function(x) LUT[x], classes = "character", how = "replace") rapply(object, function(x) LUT[x], classes="character", how="replace")
} }
interaction_list_fid <- cols2ids(interaction_list, colnames(train)) interaction_list_fid = cols2ids(interaction_list, colnames(train))
# Fit model with interaction constraints # Fit model with interaction constraints
bst <- xgboost(data = train, label = y, max_depth = 4, bst = xgboost(data = train, label = y, max_depth = 4,
eta = 0.1, nthread = 2, nrounds = 1000, eta = 0.1, nthread = 2, nrounds = 1000,
interaction_constraints = interaction_list_fid) interaction_constraints = interaction_list_fid)
bst_tree <- xgb.model.dt.tree(colnames(train), bst) bst_tree <- xgb.model.dt.tree(colnames(train), bst)
bst_interactions <- treeInteractions(bst_tree, 4) bst_interactions <- treeInteractions(bst_tree, 4) # interactions constrained to combinations of V1*V2 and V3*V4*V5
# interactions constrained to combinations of V1*V2 and V3*V4*V5
# Fit model without interaction constraints # Fit model without interaction constraints
bst2 <- xgboost(data = train, label = y, max_depth = 4, bst2 = xgboost(data = train, label = y, max_depth = 4,
eta = 0.1, nthread = 2, nrounds = 1000) eta = 0.1, nthread = 2, nrounds = 1000)
bst2_tree <- xgb.model.dt.tree(colnames(train), bst2) bst2_tree <- xgb.model.dt.tree(colnames(train), bst2)
bst2_interactions <- treeInteractions(bst2_tree, 4) # much more interactions bst2_interactions <- treeInteractions(bst2_tree, 4) # much more interactions
# Fit model with both interaction and monotonicity constraints # Fit model with both interaction and monotonicity constraints
bst3 <- xgboost(data = train, label = y, max_depth = 4, bst3 = xgboost(data = train, label = y, max_depth = 4,
eta = 0.1, nthread = 2, nrounds = 1000, eta = 0.1, nthread = 2, nrounds = 1000,
interaction_constraints = interaction_list_fid, interaction_constraints = interaction_list_fid,
monotone_constraints = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0)) monotone_constraints = c(-1,0,0,0,0,0,0,0,0,0))
bst3_tree <- xgb.model.dt.tree(colnames(train), bst3) bst3_tree <- xgb.model.dt.tree(colnames(train), bst3)
bst3_interactions <- treeInteractions(bst3_tree, 4) bst3_interactions <- treeInteractions(bst3_tree, 4) # interactions still constrained to combinations of V1*V2 and V3*V4*V5
# interactions still constrained to combinations of V1*V2 and V3*V4*V5
# Show monotonic constraints still apply by checking scores after incrementing V1 # Show monotonic constraints still apply by checking scores after incrementing V1
x1 <- sort(unique(x[['V1']])) x1 <- sort(unique(x[['V1']]))
for (i in seq_along(x1)){ for (i in 1:length(x1)){
testdata <- copy(x[, - ('V1')]) testdata <- copy(x[, -c('V1')])
testdata[['V1']] <- x1[i] testdata[['V1']] <- x1[i]
testdata <- testdata[, paste0('V', 1:10), with = FALSE] testdata <- testdata[, paste0('V',1:10), with=F]
pred <- predict(bst3, as.matrix(testdata)) pred <- predict(bst3, as.matrix(testdata))
# Should not print out anything due to monotonic constraints # Should not print out anything due to monotonic constraints
if (i > 1) if (any(pred > prev_pred)) print(i) if (i > 1) if (any(pred > prev_pred)) print(i)
prev_pred <- pred prev_pred <- pred
} }

View File

@@ -1,6 +1,7 @@
data(mtcars) data(mtcars)
head(mtcars) head(mtcars)
bst <- xgboost(data = as.matrix(mtcars[, -11]), label = mtcars[, 11], bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11],
objective = 'count:poisson', nrounds = 5) objective='count:poisson',nrounds=5)
pred <- predict(bst, as.matrix(mtcars[, -11])) pred = predict(bst,as.matrix(mtcars[,-11]))
sqrt(mean((pred - mtcars[, 11]) ^ 2)) sqrt(mean((pred-mtcars[,11])^2))

View File

@@ -1,23 +1,23 @@
require(xgboost) require(xgboost)
# load in the agaricus dataset # load in the agaricus dataset
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic') param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
watchlist <- list(eval = dtest, train = dtrain) watchlist <- list(eval = dtest, train = dtrain)
nrounds <- 2 nrounds = 2
# training the model for two rounds # training the model for two rounds
bst <- xgb.train(param, dtrain, nrounds, nthread = 2, watchlist) bst = xgb.train(param, dtrain, nrounds, nthread = 2, watchlist)
cat('start testing prediction from first n trees\n') cat('start testing prediction from first n trees\n')
labels <- getinfo(dtest, 'label') labels <- getinfo(dtest,'label')
### predict using first 1 tree ### predict using first 1 tree
ypred1 <- predict(bst, dtest, ntreelimit = 1) ypred1 = predict(bst, dtest, ntreelimit=1)
# by default, we predict using all the trees # by default, we predict using all the trees
ypred2 <- predict(bst, dtest) ypred2 = predict(bst, dtest)
cat('error of ypred1=', mean(as.numeric(ypred1 > 0.5) != labels), '\n') cat('error of ypred1=', mean(as.numeric(ypred1>0.5)!=labels),'\n')
cat('error of ypred2=', mean(as.numeric(ypred2 > 0.5) != labels), '\n') cat('error of ypred2=', mean(as.numeric(ypred2>0.5)!=labels),'\n')

View File

@@ -5,34 +5,34 @@ require(Matrix)
set.seed(1982) set.seed(1982)
# load in the agaricus dataset # load in the agaricus dataset
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label) dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic') param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
nrounds <- 4 nrounds = 4
# training the model for two rounds # training the model for two rounds
bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2) bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)
# Model accuracy without new features # Model accuracy without new features
accuracy.before <- (sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
/ length(agaricus.test$label))
# by default, we predict using all the trees # by default, we predict using all the trees
pred_with_leaf <- predict(bst, dtest, predleaf = TRUE)
pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
head(pred_with_leaf) head(pred_with_leaf)
create.new.tree.features <- function(model, original.features){ create.new.tree.features <- function(model, original.features){
pred_with_leaf <- predict(model, original.features, predleaf = TRUE) pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
cols <- list() cols <- list()
for (i in 1:model$niter) { for(i in 1:model$niter){
# max is not the real max but it s not important for the purpose of adding features # max is not the real max but it s not important for the purpose of adding features
leaf.id <- sort(unique(pred_with_leaf[, i])) leaf.id <- sort(unique(pred_with_leaf[,i]))
cols[[i]] <- factor(x = pred_with_leaf[, i], level = leaf.id) cols[[i]] <- factor(x = pred_with_leaf[,i], level = leaf.id)
} }
cbind(original.features, sparse.model.matrix(~ . - 1, as.data.frame(cols))) cbind(original.features, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
} }
# Convert previous features to one hot encoding # Convert previous features to one hot encoding
@@ -47,9 +47,7 @@ watchlist <- list(train = new.dtrain)
bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2) bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2)
# Model accuracy with new features # Model accuracy with new features
accuracy.after <- (sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
/ length(agaricus.test$label))
# Here the accuracy was already good and is now perfect. # Here the accuracy was already good and is now perfect.
cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
accuracy.after, "!\n"))

View File

@@ -1,14 +1,14 @@
# running all scripts in demo folder # running all scripts in demo folder
demo(basic_walkthrough, package = 'xgboost') demo(basic_walkthrough)
demo(custom_objective, package = 'xgboost') demo(custom_objective)
demo(boost_from_prediction, package = 'xgboost') demo(boost_from_prediction)
demo(predict_first_ntree, package = 'xgboost') demo(predict_first_ntree)
demo(generalized_linear_model, package = 'xgboost') demo(generalized_linear_model)
demo(cross_validation, package = 'xgboost') demo(cross_validation)
demo(create_sparse_matrix, package = 'xgboost') demo(create_sparse_matrix)
demo(predict_leaf_indices, package = 'xgboost') demo(predict_leaf_indices)
demo(early_stopping, package = 'xgboost') demo(early_stopping)
demo(poisson_regression, package = 'xgboost') demo(poisson_regression)
demo(caret_wrapper, package = 'xgboost') demo(caret_wrapper)
demo(tweedie_regression, package = 'xgboost') demo(tweedie_regression)
#demo(gpu_accelerated, package = 'xgboost') # can only run when built with GPU support #demo(gpu_accelerated) # can only run when built with GPU support

20
R-package/demo/tweedie_regression.R Normal file → Executable file
View File

@@ -8,12 +8,12 @@ data(AutoClaim)
dt <- data.table(AutoClaim) dt <- data.table(AutoClaim)
# exclude these columns from the model matrix # exclude these columns from the model matrix
exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY') exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY')
# retains the missing values # retains the missing values
# NOTE: this dataset comes ready out of the box # NOTE: this dataset comes ready out of the box
options(na.action = 'na.pass') options(na.action = 'na.pass')
x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = FALSE]) x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = F])
options(na.action = 'na.omit') options(na.action = 'na.omit')
# response # response
@@ -21,29 +21,29 @@ y <- dt[, CLM_AMT5]
d_train <- xgb.DMatrix(data = x, label = y, missing = NA) d_train <- xgb.DMatrix(data = x, label = y, missing = NA)
# the tweedie_variance_power parameter determines the shape of # the tweedie_variance_power parameter determines the shape of
# distribution # distribution
# - closer to 1 is more poisson like and the mass # - closer to 1 is more poisson like and the mass
# is more concentrated near zero # is more concentrated near zero
# - closer to 2 is more gamma like and the mass spreads to the # - closer to 2 is more gamma like and the mass spreads to the
# right with less concentration near zero # right with less concentration near zero
params <- list( params <- list(
objective = 'reg:tweedie', objective = 'reg:tweedie',
eval_metric = 'rmse', eval_metric = 'rmse',
tweedie_variance_power = 1.4, tweedie_variance_power = 1.4,
max_depth = 6, max_depth = 6,
eta = 1) eta = 1)
bst <- xgb.train( bst <- xgb.train(
data = d_train, data = d_train,
params = params, params = params,
maximize = FALSE, maximize = FALSE,
watchlist = list(train = d_train), watchlist = list(train = d_train),
nrounds = 20) nrounds = 20)
var_imp <- xgb.importance(attr(x, 'Dimnames')[[2]], model = bst) var_imp <- xgb.importance(attr(x, 'Dimnames')[[2]], model = bst)
preds <- predict(bst, d_train) preds <- predict(bst, d_train)
rmse <- sqrt(sum(mean((y - preds) ^ 2))) rmse <- sqrt(sum(mean((y - preds)^2)))

View File

@@ -1,96 +0,0 @@
# [description]
# Create a definition file (.def) from a .dll file, using objdump. This
# is used by FindLibR.cmake when building the R package with MSVC.
#
# [usage]
#
# Rscript make-r-def.R something.dll something.def
#
# [references]
# * https://www.cs.colorado.edu/~main/cs1300/doc/mingwfaq.html
# Parse the two required positional arguments: the input .dll path and
# the output .def path. Fail early with a usage message instead of an
# opaque "subscript out of bounds" error when one of them is missing.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2L) {
    stop("Usage: Rscript make-r-def.R something.dll something.def")
}

IN_DLL_FILE <- args[[1L]]
OUT_DEF_FILE <- args[[2L]]
# file name of the DLL without its directory part
DLL_BASE_NAME <- basename(IN_DLL_FILE)

message(sprintf("Creating '%s' from '%s'", OUT_DEF_FILE, IN_DLL_FILE))
# Run a shell command, redirect its stdout to a file, and raise an R error
# if the command exits with a non-zero status.
#
# Neither system2() nor processx::process raises an R exception when the
# process they start fails, so the exit status is checked explicitly here.
#
# system2() introduces a lot of overhead, at least on Windows,
# so processx is used instead when it is available.
#
# command:  name of the executable to run
# args:     character vector of arguments passed to the command
# out_file: path of the file that receives the command's stdout
#
# Returns NULL invisibly.
.pipe_shell_command_to_stdout <- function(command, args, out_file) {
    # requireNamespace() only checks availability and does not attach the
    # package; every processx call below is namespace-qualified
    has_processx <- requireNamespace("processx", quietly = TRUE)
    if (has_processx) {
        p <- processx::process$new(
            command = command
            , args = args
            , stdout = out_file
            , windows_verbatim_args = FALSE
        )
        p$wait()
        exit_code <- p$get_exit_status()
    } else {
        message(paste0(
            "Using system2() to run shell commands. Installing "
            , "'processx' with install.packages('processx') might "
            , "make this faster."
        ))
        exit_code <- system2(
            command = command
            , args = shQuote(args)
            , stdout = out_file
        )
    }
    # same failure handling for both backends
    if (!identical(as.integer(exit_code), 0L)) {
        stop(paste0("Command failed with exit code: ", exit_code))
    }
    invisible(NULL)
}
# use objdump to dump all the symbols
OBJDUMP_FILE <- "objdump-out.txt"
.pipe_shell_command_to_stdout(
    command = "objdump"
    , args = c("-p", IN_DLL_FILE)
    , out_file = OBJDUMP_FILE
)

objdump_results <- readLines(OBJDUMP_FILE)
# the dump is only needed in memory; delete the intermediate file
result <- file.remove(OBJDUMP_FILE)

# Only one table in the objdump results matters for our purposes,
# see https://www.cs.colorado.edu/~main/cs1300/doc/mingwfaq.html
start_index <- which(
    grepl(
        pattern = "[Ordinal/Name Pointer] Table"
        , x = objdump_results
        , fixed = TRUE
    )
)
if (length(start_index) == 0L) {
    stop(sprintf(
        "Could not find '[Ordinal/Name Pointer] Table' in the objdump output for '%s'"
        , IN_DLL_FILE
    ))
}
# keep a scalar index so the range below is well-defined
start_index <- start_index[1L]

# the table ends at the first empty line after its header
empty_lines <- which(objdump_results == "")
end_of_table <- empty_lines[empty_lines > start_index][1L]
if (is.na(end_of_table)) {
    # no blank line follows the header: the table runs to the end of the output
    end_of_table <- length(objdump_results)
}
if (end_of_table <= start_index) {
    stop(sprintf(
        "The '[Ordinal/Name Pointer] Table' in the objdump output for '%s' is empty"
        , IN_DLL_FILE
    ))
}

# Read the contents of the table
exported_symbols <- objdump_results[(start_index + 1L):end_of_table]
# strip tabs, then everything up to and including "] ", then remaining spaces
exported_symbols <- gsub("\t", "", exported_symbols)
exported_symbols <- gsub(".*\\] ", "", exported_symbols)
exported_symbols <- gsub(" ", "", exported_symbols)

# Write R.def file
writeLines(
    text = c(
        paste0("LIBRARY \"", DLL_BASE_NAME, "\"")
        , "EXPORTS"
        , exported_symbols
    )
    , con = OUT_DEF_FILE
    , sep = "\n"
)
message(sprintf("Successfully created '%s'", OUT_DEF_FILE))

View File

@@ -1,64 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{a-compatibility-note-for-saveRDS-save}
\alias{a-compatibility-note-for-saveRDS-save}
\title{Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of
models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.}
\description{
It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or
\code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist
\code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if
the model is to be accessed in the future. If you train a model with the current version of
XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be
accessible in later releases of XGBoost. To ensure that your model can be accessed in future
releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
}
\details{
Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
the JSON format by specifying the JSON extension. To read the model back, use
\code{\link{xgb.load}}.
Use \code{\link{xgb.save.raw}} to save the XGBoost model as a sequence (vector) of raw bytes
in a future-proof manner. Future releases of XGBoost will be able to read the raw bytes and
re-construct the corresponding model. To read the model back, use \code{\link{xgb.load.raw}}.
The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
as part of another R object.
Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the
model but also internal configurations and parameters, and its format is not stable across
multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing.
For more details and explanation about model persistence and archival, consult the page
\url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.
}
\examples{
data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
# Save as a stand-alone file; load it with xgb.load()
xgb.save(bst, 'xgb.model')
bst2 <- xgb.load('xgb.model')
# Save as a stand-alone file (JSON); load it with xgb.load()
xgb.save(bst, 'xgb.model.json')
bst2 <- xgb.load('xgb.model.json')
if (file.exists('xgb.model.json')) file.remove('xgb.model.json')
# Save as a raw byte vector; load it with xgb.load.raw()
xgb_bytes <- xgb.save.raw(bst)
bst2 <- xgb.load.raw(xgb_bytes)
# Persist XGBoost model as part of another R object
obj <- list(xgb_model_bytes = xgb.save.raw(bst), description = "My first XGBoost model")
# Persist the R object. Here, saveRDS() is okay, since it doesn't persist
# xgb.Booster directly. What's being persisted is the future-proof byte representation
# as given by xgb.save.raw().
saveRDS(obj, 'my_object.rds')
# Read back the R object
obj2 <- readRDS('my_object.rds')
# Re-construct xgb.Booster object from the bytes
bst2 <- xgb.load.raw(obj2$xgb_model_bytes)
if (file.exists('my_object.rds')) file.remove('my_object.rds')
}

View File

@@ -1,18 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.ggplot.R
\name{normalize}
\alias{normalize}
\title{Scale feature value to have mean 0, standard deviation 1}
\usage{
normalize(x)
}
\arguments{
\item{x}{Numeric vector}
}
\value{
Numeric vector with mean 0 and sd 1.
}
\description{
This is used to compare multiple features on the same plot.
Internal utility function
}

View File

@@ -1,27 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.ggplot.R
\name{prepare.ggplot.shap.data}
\alias{prepare.ggplot.shap.data}
\title{Combine and melt feature values and SHAP contributions for sample
observations.}
\usage{
prepare.ggplot.shap.data(data_list, normalize = FALSE)
}
\arguments{
\item{data_list}{List containing 'data' and 'shap_contrib' returned by
\code{xgb.shap.data()}.}
\item{normalize}{Whether to standardize feature values to have mean 0 and
standard deviation 1 (useful for comparing multiple features on the same
plot). Default \code{FALSE}.}
}
\value{
A data.table containing the observation ID, the feature name, the
feature value (normalized if specified), and the SHAP contribution value.
}
\description{
Conforms to data format required for ggplot functions.
}
\details{
Internal utility function.
}

View File

@@ -38,8 +38,6 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_dep
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
saveRDS(bst, "xgb.model.rds") saveRDS(bst, "xgb.model.rds")
# Warning: The resulting RDS file is only compatible with the current XGBoost version.
# Refer to the section titled "a-compatibility-note-for-saveRDS-save".
bst1 <- readRDS("xgb.model.rds") bst1 <- readRDS("xgb.model.rds")
if (file.exists("xgb.model.rds")) file.remove("xgb.model.rds") if (file.exists("xgb.model.rds")) file.remove("xgb.model.rds")
# the handle is invalid: # the handle is invalid:

View File

@@ -24,9 +24,9 @@ This is the function inspired from the paragraph 3.1 of the paper:
\strong{Practical Lessons from Predicting Clicks on Ads at Facebook} \strong{Practical Lessons from Predicting Clicks on Ads at Facebook}
\emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yanxin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers, \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yanxin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
Joaquin Quinonero Candela)} Joaquin Quinonero Candela)}
International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014 International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
\url{https://research.fb.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}. \url{https://research.fb.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
@@ -37,10 +37,10 @@ Extract explaining the method:
convenient way to implement non-linear and tuple transformations convenient way to implement non-linear and tuple transformations
of the kind we just described. We treat each individual of the kind we just described. We treat each individual
tree as a categorical feature that takes as value the tree as a categorical feature that takes as value the
index of the leaf an instance ends up falling in. We use index of the leaf an instance ends up falling in. We use
1-of-K coding of this type of features. 1-of-K coding of this type of features.
For example, consider the boosted tree model in Figure 1 with 2 subtrees, For example, consider the boosted tree model in Figure 1 with 2 subtrees,
where the first subtree has 3 leafs and the second 2 leafs. If an where the first subtree has 3 leafs and the second 2 leafs. If an
instance ends up in leaf 2 in the first subtree and leaf 1 in instance ends up in leaf 2 in the first subtree and leaf 1 in
second subtree, the overall input to the linear classifier will second subtree, the overall input to the linear classifier will

View File

@@ -28,15 +28,12 @@ xgb.cv(
) )
} }
\arguments{ \arguments{
\item{params}{the list of parameters. The complete list of parameters is \item{params}{the list of parameters. Commonly used ones are:
available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
is a shorter summary:
\itemize{ \itemize{
\item \code{objective} objective function, common ones are \item \code{objective} objective function, common ones are
\itemize{ \itemize{
\item \code{reg:squarederror} Regression with squared loss. \item \code{reg:squarederror} Regression with squared loss
\item \code{binary:logistic} logistic regression for classification. \item \code{binary:logistic} logistic regression for classification
\item See \code{\link[=xgb.train]{xgb.train}()} for complete list of objectives.
} }
\item \code{eta} step size of each boosting step \item \code{eta} step size of each boosting step
\item \code{max_depth} maximum depth of the tree \item \code{max_depth} maximum depth of the tree
@@ -70,8 +67,6 @@ from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callb
\item \code{error} binary classification error rate \item \code{error} binary classification error rate
\item \code{rmse} Root mean square error \item \code{rmse} Root mean square error
\item \code{logloss} negative log-likelihood function \item \code{logloss} negative log-likelihood function
\item \code{mae} Mean absolute error
\item \code{mape} Mean absolute percentage error
\item \code{auc} Area under curve \item \code{auc} Area under curve
\item \code{aucpr} Area under PR curve \item \code{aucpr} Area under PR curve
\item \code{merror} Exact matching error, used to evaluate multi-class classification \item \code{merror} Exact matching error, used to evaluate multi-class classification
@@ -156,7 +151,7 @@ The cross-validation process is then repeated \code{nrounds} times, with each of
All observations are used for both training and validation. All observations are used for both training and validation.
Adapted from \url{https://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29} Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
} }
\examples{ \examples{
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')

View File

@@ -16,14 +16,14 @@ xgb.dump(
\arguments{ \arguments{
\item{model}{the model object.} \item{model}{the model object.}
\item{fname}{the name of the text file where to save the model text dump. \item{fname}{the name of the text file where to save the model text dump.
If not provided or set to \code{NULL}, the model is returned as a \code{character} vector.} If not provided or set to \code{NULL}, the model is returned as a \code{character} vector.}
\item{fmap}{feature map file representing feature types. \item{fmap}{feature map file representing feature types.
Detailed description could be found at Detailed description could be found at
\url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}. \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
See demo/ for walkthrough example in R, and See demo/ for walkthrough example in R, and
\url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
for example Format.} for example Format.}
\item{with_stats}{whether to dump some additional statistics about the splits. \item{with_stats}{whether to dump some additional statistics about the splits.
@@ -47,7 +47,7 @@ data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost') data(agaricus.test, package='xgboost')
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
# save the model in file 'xgb.model.dump' # save the model in file 'xgb.model.dump'
dump_path = file.path(tempdir(), 'model.dump') dump_path = file.path(tempdir(), 'model.dump')

View File

@@ -22,7 +22,7 @@ Non-null \code{feature_names} could be provided to override those in the model.}
\item{trees}{(only for the gbtree booster) an integer vector of tree indices that should be included \item{trees}{(only for the gbtree booster) an integer vector of tree indices that should be included
into the importance calculation. If set to \code{NULL}, all trees of the model are parsed. into the importance calculation. If set to \code{NULL}, all trees of the model are parsed.
It could be useful, e.g., in multiclass classification to get feature importances It could be useful, e.g., in multiclass classification to get feature importances
for each class separately. IMPORTANT: the tree index in xgboost models for each class separately. IMPORTANT: the tree index in xgboost models
is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).} is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).}
@@ -37,7 +37,7 @@ For a tree model, a \code{data.table} with the following columns:
\itemize{ \itemize{
\item \code{Features} names of the features used in the model; \item \code{Features} names of the features used in the model;
\item \code{Gain} represents fractional contribution of each feature to the model based on \item \code{Gain} represents fractional contribution of each feature to the model based on
the total gain of this feature's splits. Higher percentage means a more important the total gain of this feature's splits. Higher percentage means a more important
predictive feature. predictive feature.
\item \code{Cover} metric of the number of observations related to this feature; \item \code{Cover} metric of the number of observations related to this feature;
\item \code{Frequency} percentage representing the relative number of times \item \code{Frequency} percentage representing the relative number of times
@@ -51,7 +51,7 @@ A linear model's importance \code{data.table} has the following columns:
\item \code{Class} (only for multiclass models) class label. \item \code{Class} (only for multiclass models) class label.
} }
If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names}, If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names},
index of the features will be used instead. Because the index is extracted from the model dump index of the features will be used instead. Because the index is extracted from the model dump
(based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R). (based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R).
} }
@@ -61,21 +61,21 @@ Creates a \code{data.table} of feature importances in a model.
\details{ \details{
This function works for both linear and tree models. This function works for both linear and tree models.
For linear models, the importance is the absolute magnitude of linear coefficients. For linear models, the importance is the absolute magnitude of linear coefficients.
For that reason, in order to obtain a meaningful ranking by importance for a linear model, For that reason, in order to obtain a meaningful ranking by importance for a linear model,
the features need to be on the same scale (which you also would want to do when using either the features need to be on the same scale (which you also would want to do when using either
L1 or L2 regularization). L1 or L2 regularization).
} }
\examples{ \examples{
# binomial classification using gbtree: # binomial classification using gbtree:
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
xgb.importance(model = bst) xgb.importance(model = bst)
# binomial classification using gblinear: # binomial classification using gblinear:
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear", bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear",
eta = 0.3, nthread = 1, nrounds = 20, objective = "binary:logistic") eta = 0.3, nthread = 1, nrounds = 20, objective = "binary:logistic")
xgb.importance(model = bst) xgb.importance(model = bst)

View File

@@ -17,8 +17,8 @@ Load xgboost model from the binary model file.
} }
\details{ \details{
The input file is expected to contain a model saved in an xgboost-internal binary format The input file is expected to contain a model saved in an xgboost-internal binary format
using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some
appropriate methods from other xgboost interfaces. E.g., a model trained in Python and appropriate methods from other xgboost interfaces. E.g., a model trained in Python and
saved from there in xgboost format, could be loaded from R. saved from there in xgboost format, could be loaded from R.
Note: a model saved as an R-object, has to be loaded using corresponding R-methods, Note: a model saved as an R-object, has to be loaded using corresponding R-methods,
@@ -29,7 +29,7 @@ data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost') data(agaricus.test, package='xgboost')
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
xgb.save(bst, 'xgb.model') xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model') bst <- xgb.load('xgb.model')

View File

@@ -20,7 +20,7 @@ Non-null \code{feature_names} could be provided to override those in the model.}
\item{model}{object of class \code{xgb.Booster}} \item{model}{object of class \code{xgb.Booster}}
\item{text}{\code{character} vector previously generated by the \code{xgb.dump} \item{text}{\code{character} vector previously generated by the \code{xgb.dump}
function (where parameter \code{with_stats = TRUE} should have been set). function (where parameter \code{with_stats = TRUE} should have been set).
\code{text} takes precedence over \code{model}.} \code{text} takes precedence over \code{model}.}
@@ -53,10 +53,10 @@ The columns of the \code{data.table} are:
\item \code{Quality}: either the split gain (change in loss) or the leaf value \item \code{Quality}: either the split gain (change in loss) or the leaf value
\item \code{Cover}: metric related to the number of observations either seen by a split \item \code{Cover}: metric related to the number of observations either seen by a split
or collected by a leaf during training. or collected by a leaf during training.
} }
When \code{use_int_id=FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers When \code{use_int_id=FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers
in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from
the corresponding trees in the "Node" column. the corresponding trees in the "Node" column.
} }
\description{ \description{
@@ -67,17 +67,17 @@ Parse a boosted tree model text dump into a \code{data.table} structure.
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
(dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst)) (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
# This bst model already has feature_names stored with it, so those would be used when # This bst model already has feature_names stored with it, so those would be used when
# feature_names is not set: # feature_names is not set:
(dt <- xgb.model.dt.tree(model = bst)) (dt <- xgb.model.dt.tree(model = bst))
# How to match feature names of splits that are following a current 'Yes' branch: # How to match feature names of splits that are following a current 'Yes' branch:
merge(dt, dt[, .(ID, Y.Feature=Feature)], by.x='Yes', by.y='ID', all.x=TRUE)[order(Tree,Node)] merge(dt, dt[, .(ID, Y.Feature=Feature)], by.x='Yes', by.y='ID', all.x=TRUE)[order(Tree,Node)]
} }

View File

@@ -23,7 +23,7 @@ or a data.table result of the \code{xgb.model.dt.tree} function.}
\item{which}{which distribution to plot (see details).} \item{which}{which distribution to plot (see details).}
\item{plot}{(base R barplot) whether a barplot should be produced. \item{plot}{(base R barplot) whether a barplot should be produced.
If FALSE, only a data.table is returned.} If FALSE, only a data.table is returned.}
\item{...}{other parameters passed to \code{barplot} or \code{plot}.} \item{...}{other parameters passed to \code{barplot} or \code{plot}.}
@@ -45,10 +45,10 @@ When \code{which="2x1"}, two distributions with respect to the leaf depth
are plotted on top of each other: are plotted on top of each other:
\itemize{ \itemize{
\item the distribution of the number of leafs in a tree model at a certain depth; \item the distribution of the number of leafs in a tree model at a certain depth;
\item the distribution of average weighted number of observations ("cover") \item the distribution of average weighted number of observations ("cover")
ending up in leafs at certain depth. ending up in leafs at certain depth.
} }
Those could be helpful in determining sensible ranges of the \code{max_depth} Those could be helpful in determining sensible ranges of the \code{max_depth}
and \code{min_child_weight} parameters. and \code{min_child_weight} parameters.
When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth

View File

@@ -131,7 +131,6 @@ bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none") xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
contr <- predict(bst, agaricus.test$data, predcontrib = TRUE) contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3) xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12) # Summary plot
# multiclass example - plots for each class separately: # multiclass example - plots for each class separately:
nclass <- 3 nclass <- 3
@@ -150,7 +149,6 @@ xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4,
n_col = 2, col = col, pch = 16, pch_NA = 17) n_col = 2, col = col, pch = 16, pch_NA = 17)
xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4, xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4,
n_col = 2, col = col, pch = 16, pch_NA = 17) n_col = 2, col = col, pch = 16, pch_NA = 17)
xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4) # Summary plot
} }
\references{ \references{

View File

@@ -1,78 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.ggplot.R, R/xgb.plot.shap.R
\name{xgb.ggplot.shap.summary}
\alias{xgb.ggplot.shap.summary}
\alias{xgb.plot.shap.summary}
\title{SHAP contribution dependency summary plot}
\usage{
xgb.ggplot.shap.summary(
data,
shap_contrib = NULL,
features = NULL,
top_n = 10,
model = NULL,
trees = NULL,
target_class = NULL,
approxcontrib = FALSE,
subsample = NULL
)
xgb.plot.shap.summary(
data,
shap_contrib = NULL,
features = NULL,
top_n = 10,
model = NULL,
trees = NULL,
target_class = NULL,
approxcontrib = FALSE,
subsample = NULL
)
}
\arguments{
\item{data}{data as a \code{matrix} or \code{dgCMatrix}.}
\item{shap_contrib}{a matrix of SHAP contributions that was computed earlier for the above
\code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.}
\item{features}{a vector of either column indices or of feature names to plot. When it is NULL,
feature importance is calculated, and \code{top_n} high ranked features are taken.}
\item{top_n}{when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.}
\item{model}{an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
or \code{features} is missing.}
\item{trees}{passed to \code{\link{xgb.importance}} when \code{features = NULL}.}
\item{target_class}{is only relevant for multiclass models. When it is set to a 0-based class index,
only SHAP contributions for that specific class are used.
If it is not set, SHAP importances are averaged over all classes.}
\item{approxcontrib}{passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.}
\item{subsample}{a random fraction of data points to use for plotting. When it is NULL,
it is set so that up to 100K data points are used.}
}
\value{
A \code{ggplot2} object.
}
\description{
Compare SHAP contributions of different features.
}
\details{
A point plot (each point representing one sample from \code{data}) is
produced for each feature, with the points plotted on the SHAP value axis.
Each point (observation) is coloured based on its feature value. The plot
hence allows us to see which features have a negative / positive contribution
on the model prediction, and whether the contribution is different for larger
or smaller values of the feature. We effectively try to replicate the
\code{summary_plot} function from https://github.com/slundberg/shap.
}
\examples{
# See \code{\link{xgb.plot.shap}}.
}
\seealso{
\code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}},
\url{https://github.com/slundberg/shap}
}

View File

@@ -60,7 +60,7 @@ The content of each node is organised that way:
\item \code{Gain} (for split nodes): the information gain metric of a split \item \code{Gain} (for split nodes): the information gain metric of a split
(corresponds to the importance of the node in the model). (corresponds to the importance of the node in the model).
\item \code{Value} (for leafs): the margin value that the leaf may contribute to prediction. \item \code{Value} (for leafs): the margin value that the leaf may contribute to prediction.
} }
The tree root nodes also indicate the Tree index (0-based). The tree root nodes also indicate the Tree index (0-based).
The "Yes" branches are marked by the "< split_value" label. The "Yes" branches are marked by the "< split_value" label.
@@ -80,7 +80,7 @@ xgb.plot.tree(model = bst)
xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE) xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE)
\dontrun{ \dontrun{
# Below is an example of how to save this plot to a file. # Below is an example of how to save this plot to a file.
# Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed. # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
library(DiagrammeR) library(DiagrammeR)
gr <- xgb.plot.tree(model=bst, trees=0:1, render=FALSE) gr <- xgb.plot.tree(model=bst, trees=0:1, render=FALSE)

View File

@@ -15,25 +15,21 @@ xgb.save(model, fname)
Save xgboost model to a file in binary format. Save xgboost model to a file in binary format.
} }
\details{ \details{
This methods allows to save a model in an xgboost-internal binary format which is universal This methods allows to save a model in an xgboost-internal binary format which is universal
among the various xgboost interfaces. In R, the saved model file could be read-in later among the various xgboost interfaces. In R, the saved model file could be read-in later
using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
of \code{\link{xgb.train}}. of \code{\link{xgb.train}}.
Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}} Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}}
or \code{\link[base]{save}}). However, it would then only be compatible with R, and or \code{\link[base]{save}}). However, it would then only be compatible with R, and
corresponding R-methods would need to be used to load it. Moreover, persisting the model with corresponding R-methods would need to be used to load it.
\code{\link[base]{saveRDS}} or \code{\link[base]{save}} will cause compatibility problems in
future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn
how to persist models in a future-proof way, i.e. to make the model accessible in future
releases of XGBoost.
} }
\examples{ \examples{
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost') data(agaricus.test, package='xgboost')
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
xgb.save(bst, 'xgb.model') xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model') bst <- xgb.load('xgb.model')

View File

@@ -1,55 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.shap.R
\name{xgb.shap.data}
\alias{xgb.shap.data}
\title{Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc.
Internal utility function.}
\usage{
xgb.shap.data(
data,
shap_contrib = NULL,
features = NULL,
top_n = 1,
model = NULL,
trees = NULL,
target_class = NULL,
approxcontrib = FALSE,
subsample = NULL,
max_observations = 1e+05
)
}
\arguments{
\item{data}{data as a \code{matrix} or \code{dgCMatrix}.}
\item{shap_contrib}{a matrix of SHAP contributions that was computed earlier for the above
\code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.}
\item{features}{a vector of either column indices or of feature names to plot. When it is NULL,
feature importance is calculated, and \code{top_n} high ranked features are taken.}
\item{top_n}{when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.}
\item{model}{an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
or \code{features} is missing.}
\item{trees}{passed to \code{\link{xgb.importance}} when \code{features = NULL}.}
\item{target_class}{is only relevant for multiclass models. When it is set to a 0-based class index,
only SHAP contributions for that specific class are used.
If it is not set, SHAP importances are averaged over all classes.}
\item{approxcontrib}{passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.}
\item{subsample}{a random fraction of data points to use for plotting. When it is NULL,
it is set so that up to 100K data points are used.}
}
\value{
A list containing: 'data', a matrix containing sample observations
and their feature values; 'shap_contrib', a matrix containing the SHAP contribution
values for these observations.
}
\description{
Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc.
Internal utility function.
}
\keyword{internal}

View File

@@ -42,9 +42,9 @@ xgboost(
) )
} }
\arguments{ \arguments{
\item{params}{the list of parameters. The complete list of parameters is \item{params}{the list of parameters.
available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below The complete list of parameters is available at \url{http://xgboost.readthedocs.io/en/latest/parameter.html}.
is a shorter summary: Below is a shorter summary:
1. General Parameters 1. General Parameters
@@ -82,23 +82,13 @@ xgboost(
\item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below: \item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below:
\itemize{ \itemize{
\item \code{reg:squarederror} Regression with squared loss (Default). \item \code{reg:squarederror} Regression with squared loss (Default).
\item \code{reg:squaredlogerror}: regression with squared log loss \eqn{1/2 * (log(pred + 1) - log(label + 1))^2}. All inputs are required to be greater than -1. Also, see metric rmsle for possible issue with this objective.
\item \code{reg:logistic} logistic regression. \item \code{reg:logistic} logistic regression.
\item \code{reg:pseudohubererror}: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
\item \code{binary:logistic} logistic regression for binary classification. Output probability. \item \code{binary:logistic} logistic regression for binary classification. Output probability.
\item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
\item \code{binary:hinge}: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities. \item \code{num_class} set the number of classes. To use only with multiclass objectives.
\item \code{count:poisson}: poisson regression for count data, output mean of poisson distribution. \code{max_delta_step} is set to 0.7 by default in poisson regression (used to safeguard optimization).
\item \code{survival:cox}: Cox regression for right censored survival time data (negative values are considered right censored). Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional hazard function \code{h(t) = h0(t) * HR}).
\item \code{survival:aft}: Accelerated failure time model for censored survival time data. See \href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time} for details.
\item \code{aft_loss_distribution}: Probability Density Function used by \code{survival:aft} and \code{aft-nloglik} metric.
\item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}. \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
\item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class. \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
\item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
\item \code{rank:ndcg}: Use LambdaMART to perform list-wise ranking where \href{https://en.wikipedia.org/wiki/Discounted_cumulative_gain}{Normalized Discounted Cumulative Gain (NDCG)} is maximized.
\item \code{rank:map}: Use LambdaMART to perform list-wise ranking where \href{https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision}{Mean Average Precision (MAP)} is maximized.
\item \code{reg:gamma}: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Gamma_distribution#Applications}{gamma-distributed}.
\item \code{reg:tweedie}: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Tweedie_distribution#Applications}{Tweedie-distributed}.
} }
\item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
\item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
@@ -215,18 +205,16 @@ User may set one or several \code{eval_metric} parameters.
Note that when using a customized metric, only this single metric can be used. Note that when using a customized metric, only this single metric can be used.
The following is the list of built-in metrics for which Xgboost provides optimized implementation: The following is the list of built-in metrics for which Xgboost provides optimized implementation:
\itemize{ \itemize{
\item \code{rmse} root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error} \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
\item \code{logloss} negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood} \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
\item \code{mlogloss} multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html} \item \code{mlogloss} multiclass logloss. \url{http://wiki.fast.ai/index.php/Log_Loss}
\item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}. \item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
By default, it uses the 0.5 threshold for predicted values to define negative and positive instances. By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
Different threshold (e.g., 0.) could be specified as "error@0." Different threshold (e.g., 0.) could be specified as "error@0."
\item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}. \item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
\item \code{mae} Mean absolute error \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
\item \code{mape} Mean absolute percentage error
\item \code{auc} Area under the curve. \url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
\item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation. \item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
\item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG} \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
} }
The following callbacks are automatically created when certain parameters are set: The following callbacks are automatically created when certain parameters are set:

View File

@@ -1,39 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.config.R
\name{xgb.set.config, xgb.get.config}
\alias{xgb.set.config, xgb.get.config}
\alias{xgb.set.config}
\alias{xgb.get.config}
\title{Set and get global configuration}
\usage{
xgb.set.config(...)
xgb.get.config()
}
\arguments{
\item{...}{List of parameters to be set, as keyword arguments}
}
\value{
\code{xgb.set.config} returns \code{TRUE} to signal success. \code{xgb.get.config} returns
a list containing all global-scope parameters and their values.
}
\description{
Global configuration consists of a collection of parameters that can be applied in the global
scope. See \url{https://xgboost.readthedocs.io/en/stable/parameter.html} for the full list of
parameters supported in the global configuration. Use \code{xgb.set.config} to update the
values of one or more global-scope parameters. Use \code{xgb.get.config} to fetch the current
values of all global-scope parameters (listed in
\url{https://xgboost.readthedocs.io/en/stable/parameter.html}).
}
\examples{
# Set verbosity level to silent (0)
xgb.set.config(verbosity = 0)
# Now global verbosity level is 0
config <- xgb.get.config()
print(config$verbosity)
# Set verbosity level to warning (1)
xgb.set.config(verbosity = 1)
# Now global verbosity level is 1
config <- xgb.get.config()
print(config$verbosity)
}

View File

@@ -3,12 +3,12 @@ PKGROOT=../../
ENABLE_STD_THREAD=1 ENABLE_STD_THREAD=1
# _*_ mode: Makefile; _*_ # _*_ mode: Makefile; _*_
CXX_STD = CXX14 CXX_STD = CXX11
XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\ XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
-DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\ -DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
-DDMLC_LOG_CUSTOMIZE=1 -DXGBOOST_CUSTOMIZE_LOGGER=1\ -DDMLC_LOG_CUSTOMIZE=1 -DXGBOOST_CUSTOMIZE_LOGGER=1\
-DRABIT_CUSTOMIZE_MSG_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_
# disable the use of thread_local for 32 bit windows: # disable the use of thread_local for 32 bit windows:
ifeq ($(R_OSTYPE)$(WIN),windows) ifeq ($(R_OSTYPE)$(WIN),windows)
@@ -19,7 +19,6 @@ $(foreach v, $(XGB_RFLAGS), $(warning $(v)))
PKG_CPPFLAGS= -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS) PKG_CPPFLAGS= -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS)
PKG_CXXFLAGS= @OPENMP_CXXFLAGS@ @ENDIAN_FLAG@ -pthread PKG_CXXFLAGS= @OPENMP_CXXFLAGS@ @ENDIAN_FLAG@ -pthread
PKG_LIBS = @OPENMP_CXXFLAGS@ @OPENMP_LIB@ @ENDIAN_FLAG@ @BACKTRACE_LIB@ -pthread PKG_LIBS = @OPENMP_CXXFLAGS@ @OPENMP_LIB@ @ENDIAN_FLAG@ @BACKTRACE_LIB@ -pthread
OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o ./init.o \ OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o ./init.o\
$(PKGROOT)/amalgamation/xgboost-all0.o $(PKGROOT)/amalgamation/dmlc-minimum0.o \ $(PKGROOT)/amalgamation/xgboost-all0.o $(PKGROOT)/amalgamation/dmlc-minimum0.o\
$(PKGROOT)/rabit/src/engine.o $(PKGROOT)/rabit/src/c_api.o \ $(PKGROOT)/rabit/src/engine_empty.o $(PKGROOT)/rabit/src/c_api.o
$(PKGROOT)/rabit/src/allreduce_base.o

View File

@@ -15,12 +15,12 @@ xgblib:
cp -r ../../include . cp -r ../../include .
cp -r ../../amalgamation . cp -r ../../amalgamation .
CXX_STD = CXX14 CXX_STD = CXX11
XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\ XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
-DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\ -DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
-DDMLC_LOG_CUSTOMIZE=1 -DXGBOOST_CUSTOMIZE_LOGGER=1\ -DDMLC_LOG_CUSTOMIZE=1 -DXGBOOST_CUSTOMIZE_LOGGER=1\
-DRABIT_CUSTOMIZE_MSG_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_
# disable the use of thread_local for 32 bit windows: # disable the use of thread_local for 32 bit windows:
ifeq ($(R_OSTYPE)$(WIN),windows) ifeq ($(R_OSTYPE)$(WIN),windows)
@@ -31,9 +31,8 @@ $(foreach v, $(XGB_RFLAGS), $(warning $(v)))
PKG_CPPFLAGS= -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS) PKG_CPPFLAGS= -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS) PKG_CXXFLAGS= $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS) PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o ./init.o \ OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o ./init.o\
$(PKGROOT)/amalgamation/xgboost-all0.o $(PKGROOT)/amalgamation/dmlc-minimum0.o \ $(PKGROOT)/amalgamation/xgboost-all0.o $(PKGROOT)/amalgamation/dmlc-minimum0.o\
$(PKGROOT)/rabit/src/engine.o $(PKGROOT)/rabit/src/c_api.o \ $(PKGROOT)/rabit/src/engine_empty.o $(PKGROOT)/rabit/src/c_api.o
$(PKGROOT)/rabit/src/allreduce_base.o
$(OBJECTS) : xgblib $(OBJECTS) : xgblib

View File

@@ -43,8 +43,6 @@ extern SEXP XGDMatrixNumRow_R(SEXP);
extern SEXP XGDMatrixSaveBinary_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixSaveBinary_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixSetInfo_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixSetInfo_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP); extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP);
extern SEXP XGBSetGlobalConfig_R(SEXP);
extern SEXP XGBGetGlobalConfig_R();
static const R_CallMethodDef CallEntries[] = { static const R_CallMethodDef CallEntries[] = {
{"XGBoosterBoostOneIter_R", (DL_FUNC) &XGBoosterBoostOneIter_R, 4}, {"XGBoosterBoostOneIter_R", (DL_FUNC) &XGBoosterBoostOneIter_R, 4},
@@ -75,8 +73,6 @@ static const R_CallMethodDef CallEntries[] = {
{"XGDMatrixSaveBinary_R", (DL_FUNC) &XGDMatrixSaveBinary_R, 3}, {"XGDMatrixSaveBinary_R", (DL_FUNC) &XGDMatrixSaveBinary_R, 3},
{"XGDMatrixSetInfo_R", (DL_FUNC) &XGDMatrixSetInfo_R, 3}, {"XGDMatrixSetInfo_R", (DL_FUNC) &XGDMatrixSetInfo_R, 3},
{"XGDMatrixSliceDMatrix_R", (DL_FUNC) &XGDMatrixSliceDMatrix_R, 2}, {"XGDMatrixSliceDMatrix_R", (DL_FUNC) &XGDMatrixSliceDMatrix_R, 2},
{"XGBSetGlobalConfig_R", (DL_FUNC) &XGBSetGlobalConfig_R, 1},
{"XGBGetGlobalConfig_R", (DL_FUNC) &XGBGetGlobalConfig_R, 0},
{NULL, NULL, 0} {NULL, NULL, 0}
}; };

View File

@@ -1,7 +1,6 @@
// Copyright (c) 2014 by Contributors // Copyright (c) 2014 by Contributors
#include <dmlc/logging.h> #include <dmlc/logging.h>
#include <dmlc/omp.h> #include <dmlc/omp.h>
#include <dmlc/common.h>
#include <xgboost/c_api.h> #include <xgboost/c_api.h>
#include <vector> #include <vector>
#include <string> #include <string>
@@ -50,21 +49,6 @@ void _DMatrixFinalizer(SEXP ext) {
R_API_END(); R_API_END();
} }
SEXP XGBSetGlobalConfig_R(SEXP json_str) {
R_API_BEGIN();
CHECK_CALL(XGBSetGlobalConfig(CHAR(asChar(json_str))));
R_API_END();
return R_NilValue;
}
SEXP XGBGetGlobalConfig_R() {
const char* json_str;
R_API_BEGIN();
CHECK_CALL(XGBGetGlobalConfig(&json_str));
R_API_END();
return mkString(json_str);
}
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) { SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
SEXP ret; SEXP ret;
R_API_BEGIN(); R_API_BEGIN();
@@ -93,16 +77,12 @@ SEXP XGDMatrixCreateFromMat_R(SEXP mat,
din = REAL(mat); din = REAL(mat);
} }
std::vector<float> data(nrow * ncol); std::vector<float> data(nrow * ncol);
dmlc::OMPException exc;
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (omp_ulong i = 0; i < nrow; ++i) { for (omp_ulong i = 0; i < nrow; ++i) {
exc.Run([&]() { for (size_t j = 0; j < ncol; ++j) {
for (size_t j = 0; j < ncol; ++j) { data[i * ncol +j] = is_int ? static_cast<float>(iin[i + nrow * j]) : din[i + nrow * j];
data[i * ncol +j] = is_int ? static_cast<float>(iin[i + nrow * j]) : din[i + nrow * j]; }
}
});
} }
exc.Rethrow();
DMatrixHandle handle; DMatrixHandle handle;
CHECK_CALL(XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing), &handle)); CHECK_CALL(XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing), &handle));
ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
@@ -131,15 +111,11 @@ SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
for (size_t i = 0; i < nindptr; ++i) { for (size_t i = 0; i < nindptr; ++i) {
col_ptr_[i] = static_cast<size_t>(p_indptr[i]); col_ptr_[i] = static_cast<size_t>(p_indptr[i]);
} }
dmlc::OMPException exc;
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int64_t i = 0; i < static_cast<int64_t>(ndata); ++i) { for (int64_t i = 0; i < static_cast<int64_t>(ndata); ++i) {
exc.Run([&]() { indices_[i] = static_cast<unsigned>(p_indices[i]);
indices_[i] = static_cast<unsigned>(p_indices[i]); data_[i] = static_cast<float>(p_data[i]);
data_[i] = static_cast<float>(p_data[i]);
});
} }
exc.Rethrow();
DMatrixHandle handle; DMatrixHandle handle;
CHECK_CALL(XGDMatrixCreateFromCSCEx(BeginPtr(col_ptr_), BeginPtr(indices_), CHECK_CALL(XGDMatrixCreateFromCSCEx(BeginPtr(col_ptr_), BeginPtr(indices_),
BeginPtr(data_), nindptr, ndata, BeginPtr(data_), nindptr, ndata,
@@ -184,16 +160,12 @@ SEXP XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
R_API_BEGIN(); R_API_BEGIN();
int len = length(array); int len = length(array);
const char *name = CHAR(asChar(field)); const char *name = CHAR(asChar(field));
dmlc::OMPException exc;
if (!strcmp("group", name)) { if (!strcmp("group", name)) {
std::vector<unsigned> vec(len); std::vector<unsigned> vec(len);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int i = 0; i < len; ++i) { for (int i = 0; i < len; ++i) {
exc.Run([&]() { vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
});
} }
exc.Rethrow();
CHECK_CALL(XGDMatrixSetUIntInfo(R_ExternalPtrAddr(handle), CHECK_CALL(XGDMatrixSetUIntInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)), CHAR(asChar(field)),
BeginPtr(vec), len)); BeginPtr(vec), len));
@@ -201,11 +173,8 @@ SEXP XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
std::vector<float> vec(len); std::vector<float> vec(len);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int i = 0; i < len; ++i) { for (int i = 0; i < len; ++i) {
exc.Run([&]() { vec[i] = REAL(array)[i];
vec[i] = REAL(array)[i];
});
} }
exc.Rethrow();
CHECK_CALL(XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle), CHECK_CALL(XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)), CHAR(asChar(field)),
BeginPtr(vec), len)); BeginPtr(vec), len));
@@ -296,15 +265,11 @@ SEXP XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
<< "gradient and hess must have same length"; << "gradient and hess must have same length";
int len = length(grad); int len = length(grad);
std::vector<float> tgrad(len), thess(len); std::vector<float> tgrad(len), thess(len);
dmlc::OMPException exc;
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int j = 0; j < len; ++j) { for (int j = 0; j < len; ++j) {
exc.Run([&]() { tgrad[j] = REAL(grad)[j];
tgrad[j] = REAL(grad)[j]; thess[j] = REAL(hess)[j];
thess[j] = REAL(hess)[j];
});
} }
exc.Rethrow();
CHECK_CALL(XGBoosterBoostOneIter(R_ExternalPtrAddr(handle), CHECK_CALL(XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dtrain), R_ExternalPtrAddr(dtrain),
BeginPtr(tgrad), BeginPtr(thess), BeginPtr(tgrad), BeginPtr(thess),
@@ -410,7 +375,7 @@ SEXP XGBoosterSaveJsonConfig_R(SEXP handle) {
SEXP XGBoosterLoadJsonConfig_R(SEXP handle, SEXP value) { SEXP XGBoosterLoadJsonConfig_R(SEXP handle, SEXP value) {
R_API_BEGIN(); R_API_BEGIN();
CHECK_CALL(XGBoosterLoadJsonConfig(R_ExternalPtrAddr(handle), CHAR(asChar(value)))); XGBoosterLoadJsonConfig(R_ExternalPtrAddr(handle), CHAR(asChar(value)));
R_API_END(); R_API_END();
return R_NilValue; return R_NilValue;
} }
@@ -432,9 +397,9 @@ SEXP XGBoosterSerializeToBuffer_R(SEXP handle) {
SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw) { SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw) {
R_API_BEGIN(); R_API_BEGIN();
CHECK_CALL(XGBoosterUnserializeFromBuffer(R_ExternalPtrAddr(handle), XGBoosterUnserializeFromBuffer(R_ExternalPtrAddr(handle),
RAW(raw), RAW(raw),
length(raw))); length(raw));
R_API_END(); R_API_END();
return R_NilValue; return R_NilValue;
} }

View File

@@ -21,19 +21,6 @@
*/ */
XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle); XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle);
/*!
* \brief Set global configuration
* \param json_str a JSON string representing the list of key-value pairs
* \return R_NilValue
*/
XGB_DLL SEXP XGBSetGlobalConfig_R(SEXP json_str);
/*!
* \brief Get global configuration
* \return JSON string
*/
XGB_DLL SEXP XGBGetGlobalConfig_R();
/*! /*!
* \brief load a data matrix * \brief load a data matrix
* \param fname name of the content * \param fname name of the content

View File

@@ -13,10 +13,27 @@ void CustomLogMessage::Log(const std::string& msg) {
} }
} // namespace dmlc } // namespace dmlc
// implements rabit error handling.
extern "C" {
void XGBoostAssert_R(int exp, const char *fmt, ...);
void XGBoostCheck_R(int exp, const char *fmt, ...);
}
namespace rabit {
namespace utils {
extern "C" {
void (*Printf)(const char *fmt, ...) = Rprintf;
void (*Assert)(int exp, const char *fmt, ...) = XGBoostAssert_R;
void (*Check)(int exp, const char *fmt, ...) = XGBoostCheck_R;
void (*Error)(const char *fmt, ...) = error;
}
}
}
namespace xgboost { namespace xgboost {
ConsoleLogger::~ConsoleLogger() { ConsoleLogger::~ConsoleLogger() {
if (cur_verbosity_ == LogVerbosity::kIgnore || if (cur_verbosity_ == LogVerbosity::kIgnore ||
cur_verbosity_ <= GlobalVerbosity()) { cur_verbosity_ <= global_verbosity_) {
dmlc::CustomLogMessage::Log(log_stream_.str()); dmlc::CustomLogMessage::Log(log_stream_.str());
} }
} }

View File

@@ -1,105 +0,0 @@
# Script to generate reference models. The reference models are used to test backward compatibility
# of saved model files from XGBoost version 0.90 and 1.0.x.
library(xgboost)
library(Matrix)
set.seed(0)
metadata <- list(
kRounds = 2,
kRows = 1000,
kCols = 4,
kForests = 2,
kMaxDepth = 2,
kClasses = 3
)
X <- Matrix(data = rnorm(metadata$kRows * metadata$kCols), nrow = metadata$kRows,
ncol = metadata$kCols, sparse = TRUE)
w <- runif(metadata$kRows)
version <- packageVersion('xgboost')
target_dir <- 'models'
save_booster <- function (booster, model_name) {
booster_bin <- function (model_name) {
return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.bin', sep = '')))
}
booster_json <- function (model_name) {
return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.json', sep = '')))
}
booster_rds <- function (model_name) {
return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.rds', sep = '')))
}
xgb.save(booster, booster_bin(model_name))
saveRDS(booster, booster_rds(model_name))
if (version >= '1.0.0') {
xgb.save(booster, booster_json(model_name))
}
}
generate_regression_model <- function () {
print('Regression')
y <- rnorm(metadata$kRows)
data <- xgb.DMatrix(X, label = y)
params <- list(tree_method = 'hist', num_parallel_tree = metadata$kForests,
max_depth = metadata$kMaxDepth)
booster <- xgb.train(params, data, nrounds = metadata$kRounds)
save_booster(booster, 'reg')
}
generate_logistic_model <- function () {
print('Binary classification with logistic loss')
y <- sample(0:1, size = metadata$kRows, replace = TRUE)
stopifnot(max(y) == 1, min(y) == 0)
objective <- c('binary:logistic', 'binary:logitraw')
name <- c('logit', 'logitraw')
for (i in seq_len(length(objective))) {
data <- xgb.DMatrix(X, label = y, weight = w)
params <- list(tree_method = 'hist', num_parallel_tree = metadata$kForests,
max_depth = metadata$kMaxDepth, objective = objective[i])
booster <- xgb.train(params, data, nrounds = metadata$kRounds)
save_booster(booster, name[i])
}
}
generate_classification_model <- function () {
print('Multi-class classification')
y <- sample(0:(metadata$kClasses - 1), size = metadata$kRows, replace = TRUE)
stopifnot(max(y) == metadata$kClasses - 1, min(y) == 0)
data <- xgb.DMatrix(X, label = y, weight = w)
params <- list(num_class = metadata$kClasses, tree_method = 'hist',
num_parallel_tree = metadata$kForests, max_depth = metadata$kMaxDepth,
objective = 'multi:softmax')
booster <- xgb.train(params, data, nrounds = metadata$kRounds)
save_booster(booster, 'cls')
}
generate_ranking_model <- function () {
print('Learning to rank')
y <- sample(0:4, size = metadata$kRows, replace = TRUE)
stopifnot(max(y) == 4, min(y) == 0)
kGroups <- 20
w <- runif(kGroups)
g <- rep(50, times = kGroups)
data <- xgb.DMatrix(X, label = y, group = g)
# setinfo(data, 'weight', w)
# ^^^ does not work in version <= 1.1.0; see https://github.com/dmlc/xgboost/issues/5942
# So call low-level function XGDMatrixSetInfo_R directly. Since this function is not an exported
# symbol, use the triple-colon operator.
.Call(xgboost:::XGDMatrixSetInfo_R, data, 'weight', as.numeric(w))
params <- list(objective = 'rank:ndcg', num_parallel_tree = metadata$kForests,
tree_method = 'hist', max_depth = metadata$kMaxDepth)
booster <- xgb.train(params, data, nrounds = metadata$kRounds)
save_booster(booster, 'ltr')
}
dir.create(target_dir)
invisible(generate_regression_model())
invisible(generate_logistic_model())
invisible(generate_classification_model())
invisible(generate_ranking_model())

View File

@@ -1,71 +0,0 @@
library(lintr)
library(crayon)
# Linters applied to every R file: this list is passed as the `linters`
# argument of lintr::lint() further down. The names are labels chosen by this
# script; each value is taken from the lintr namespace.
my_linters <- list(
absolute_path_linter = lintr::absolute_path_linter,
assignment_linter = lintr::assignment_linter,
closed_curly_linter = lintr::closed_curly_linter,
commas_linter = lintr::commas_linter,
equals_na = lintr::equals_na_linter,
infix_spaces_linter = lintr::infix_spaces_linter,
line_length_linter = lintr::line_length_linter,
no_tab_linter = lintr::no_tab_linter,
object_usage_linter = lintr::object_usage_linter,
object_length_linter = lintr::object_length_linter,
open_curly_linter = lintr::open_curly_linter,
semicolon = lintr::semicolon_terminator_linter,
seq = lintr::seq_linter,
spaces_inside_linter = lintr::spaces_inside_linter,
spaces_left_parentheses_linter = lintr::spaces_left_parentheses_linter,
trailing_blank_lines_linter = lintr::trailing_blank_lines_linter,
trailing_whitespace_linter = lintr::trailing_whitespace_linter,
true_false = lintr::T_and_F_symbol_linter,
unneeded_concatenation = lintr::unneeded_concatenation_linter
)
# Lint every file under the current directory (recursively) whose name ends in
# .R or .r. Each element of `results` pairs the file path (`r_file`) with the
# lints reported for that file (`output`).
results <- lapply(
  list.files(path = '.', pattern = '\\.[Rr]$', recursive = TRUE),
  function (path) {
    cat(sprintf("Processing %s ...\n", path))
    lints <- lintr::lint(filename = path, linters = my_linters)
    list(r_file = path, output = lints)
  }
)
# Total number of lints across all linted files.
# sum(vapply(...)) yields 0 when `results` is empty (no R files were found),
# whereas Reduce(sum, list()) returns NULL and makes the `num_issue > 0` check
# below fail with "argument is of length zero".
num_issue <- sum(vapply(results, function (e) length(e$output), integer(1)))
# Render all lints of one file as a single printable string.
#
# lint_entry: a list with `r_file` (path of the linted file) and `output`
#   (the lints reported for it), as built for each element of `results`.
# Returns one string that contains, for every lint: a bold
# "file:line:column: " prefix, the lint type in its color, the bold message,
# the lint's `line` field, and the output of lintr's internal
# highlight_string() -- each followed by a newline where shown below.
lint2str <- function(lint_entry) {
  # Choose the crayon style for a lint type; unknown types fall back to bold.
  style_for <- function(type) {
    switch(type,
      "warning" = crayon::magenta,
      "error" = crayon::red,
      "style" = crayon::blue,
      crayon::bold
    )
  }

  # Format a single lint into one string.
  render_one <- function (lint_line) {
    location <- crayon::bold(lint_entry$r_file, ":",
                             as.character(lint_line$line_number), ":",
                             as.character(lint_line$column_number), ": ", sep = "")
    severity <- style_for(lint_line$type)(lint_line$type, ": ", sep = "")
    headline <- crayon::bold(lint_line$message)
    marker <- lintr:::highlight_string(lint_line$message, lint_line$column_number, lint_line$ranges)
    paste0(location, severity, headline, "\n",
           lint_line$line, "\n",
           marker, "\n",
           collapse = "")
  }

  paste0(lapply(lint_entry$output, render_one), collapse = "")
}
# Print a per-file report when any lint was found, then exit with status 1.
if (num_issue > 0) {
  cat(sprintf('R linters found %d issues:\n', num_issue))
  # Only files with at least one lint get a section in the report.
  flagged <- Filter(function (entry) length(entry$output) > 0, results)
  for (entry in flagged) {
    section_header <- paste0('**** ', crayon::bold(entry$r_file), '\n')
    cat(section_header)
    section_text <- paste0(lint2str(entry), collapse = '')
    cat(section_text)
  }
  # A non-zero exit status signals the error to the parent shell.
  quit(save = 'no', status = 1)
}

View File

@@ -1,4 +1,4 @@
library(testthat) library(testthat)
library(xgboost) library(xgboost)
test_check("xgboost", reporter = ProgressReporter) test_check("xgboost")

View File

@@ -2,23 +2,22 @@ require(xgboost)
context("basic functions") context("basic functions")
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
set.seed(1994) set.seed(1994)
# disable some tests for Win32 # disable some tests for Win32
windows_flag <- .Platform$OS.type == "windows" && windows_flag = .Platform$OS.type == "windows" &&
.Machine$sizeof.pointer != 8 .Machine$sizeof.pointer != 8
solaris_flag <- (Sys.info()['sysname'] == "SunOS") solaris_flag = (Sys.info()['sysname'] == "SunOS")
test_that("train and predict binary classification", { test_that("train and predict binary classification", {
nrounds <- 2 nrounds = 2
expect_output( expect_output(
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = nrounds, objective = "binary:logistic", eta = 1, nthread = 2, nrounds = nrounds, objective = "binary:logistic")
eval_metric = "error")
, "train-error") , "train-error")
expect_equal(class(bst), "xgb.Booster") expect_equal(class(bst), "xgb.Booster")
expect_equal(bst$niter, nrounds) expect_equal(bst$niter, nrounds)
@@ -31,24 +30,24 @@ test_that("train and predict binary classification", {
pred1 <- predict(bst, train$data, ntreelimit = 1) pred1 <- predict(bst, train$data, ntreelimit = 1)
expect_length(pred1, 6513) expect_length(pred1, 6513)
err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label) err_pred1 <- sum((pred1 > 0.5) != train$label)/length(train$label)
err_log <- bst$evaluation_log[1, train_error] err_log <- bst$evaluation_log[1, train_error]
expect_lt(abs(err_pred1 - err_log), 10e-6) expect_lt(abs(err_pred1 - err_log), 10e-6)
}) })
test_that("parameter validation works", { test_that("parameter validation works", {
p <- list(foo = "bar") p <- list(foo = "bar")
nrounds <- 1 nrounds = 1
set.seed(1994) set.seed(1994)
d <- cbind( d <- cbind(
x1 = rnorm(10), x1 = rnorm(10),
x2 = rnorm(10), x2 = rnorm(10),
x3 = rnorm(10)) x3 = rnorm(10))
y <- d[, "x1"] + d[, "x2"]^2 + y <- d[,"x1"] + d[,"x2"]^2 +
ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) + ifelse(d[,"x3"] > .5, d[,"x3"]^2, 2^d[,"x3"]) +
rnorm(10) rnorm(10)
dtrain <- xgb.DMatrix(data = d, info = list(label = y)) dtrain <- xgb.DMatrix(data=d, info = list(label=y))
correct <- function() { correct <- function() {
params <- list(max_depth = 2, booster = "dart", params <- list(max_depth = 2, booster = "dart",
@@ -66,20 +65,20 @@ test_that("parameter validation works", {
xgb.train(params = params, data = dtrain, nrounds = nrounds)) xgb.train(params = params, data = dtrain, nrounds = nrounds))
print(output) print(output)
} }
expect_output(incorrect(), '\\\\"bar\\\\", \\\\"foo\\\\"') expect_output(incorrect(), "bar, foo")
}) })
test_that("dart prediction works", { test_that("dart prediction works", {
nrounds <- 32 nrounds = 32
set.seed(1994) set.seed(1994)
d <- cbind( d <- cbind(
x1 = rnorm(100), x1 = rnorm(100),
x2 = rnorm(100), x2 = rnorm(100),
x3 = rnorm(100)) x3 = rnorm(100))
y <- d[, "x1"] + d[, "x2"]^2 + y <- d[,"x1"] + d[,"x2"]^2 +
ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) + ifelse(d[,"x3"] > .5, d[,"x3"]^2, 2^d[,"x3"]) +
rnorm(100) rnorm(100)
set.seed(1994) set.seed(1994)
@@ -88,23 +87,23 @@ test_that("dart prediction works", {
eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror") eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror")
pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0) pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds) pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE))) expect_true(all(matrix(pred_by_xgboost_0, byrow=TRUE) == matrix(pred_by_xgboost_1, byrow=TRUE)))
pred_by_xgboost_2 <- predict(booster_by_xgboost, newdata = d, training = TRUE) pred_by_xgboost_2 <- predict(booster_by_xgboost, newdata = d, training = TRUE)
expect_false(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE))) expect_false(all(matrix(pred_by_xgboost_0, byrow=TRUE) == matrix(pred_by_xgboost_2, byrow=TRUE)))
set.seed(1994) set.seed(1994)
dtrain <- xgb.DMatrix(data = d, info = list(label = y)) dtrain <- xgb.DMatrix(data=d, info = list(label=y))
booster_by_train <- xgb.train(params = list( booster_by_train <- xgb.train( params = list(
booster = "dart", booster = "dart",
max_depth = 2, max_depth = 2,
eta = 1, eta = 1,
rate_drop = 0.5, rate_drop = 0.5,
one_drop = TRUE, one_drop = TRUE,
nthread = 1, nthread = 1,
tree_method = "exact", tree_method= "exact",
objective = "reg:squarederror" objective = "reg:squarederror"
), ),
data = dtrain, data = dtrain,
nrounds = nrounds nrounds = nrounds
) )
@@ -112,9 +111,9 @@ test_that("dart prediction works", {
pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds) pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE) pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
expect_true(all(matrix(pred_by_train_0, byrow = TRUE) == matrix(pred_by_xgboost_0, byrow = TRUE))) expect_true(all(matrix(pred_by_train_0, byrow=TRUE) == matrix(pred_by_xgboost_0, byrow=TRUE)))
expect_true(all(matrix(pred_by_train_1, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE))) expect_true(all(matrix(pred_by_train_1, byrow=TRUE) == matrix(pred_by_xgboost_1, byrow=TRUE)))
expect_true(all(matrix(pred_by_train_2, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE))) expect_true(all(matrix(pred_by_train_2, byrow=TRUE) == matrix(pred_by_xgboost_2, byrow=TRUE)))
}) })
test_that("train and predict softprob", { test_that("train and predict softprob", {
@@ -123,7 +122,7 @@ test_that("train and predict softprob", {
expect_output( expect_output(
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5, max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
objective = "multi:softprob", num_class = 3, eval_metric = "merror") objective = "multi:softprob", num_class=3)
, "train-merror") , "train-merror")
expect_false(is.null(bst$evaluation_log)) expect_false(is.null(bst$evaluation_log))
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025) expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
@@ -131,17 +130,17 @@ test_that("train and predict softprob", {
pred <- predict(bst, as.matrix(iris[, -5])) pred <- predict(bst, as.matrix(iris[, -5]))
expect_length(pred, nrow(iris) * 3) expect_length(pred, nrow(iris) * 3)
# row sums add up to total probability of 1: # row sums add up to total probability of 1:
expect_equal(rowSums(matrix(pred, ncol = 3, byrow = TRUE)), rep(1, nrow(iris)), tolerance = 1e-7) expect_equal(rowSums(matrix(pred, ncol=3, byrow=TRUE)), rep(1, nrow(iris)), tolerance = 1e-7)
# manually calculate error at the last iteration: # manually calculate error at the last iteration:
mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE) mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE)
expect_equal(as.numeric(t(mpred)), pred) expect_equal(as.numeric(t(mpred)), pred)
pred_labels <- max.col(mpred) - 1 pred_labels <- max.col(mpred) - 1
err <- sum(pred_labels != lb) / length(lb) err <- sum(pred_labels != lb)/length(lb)
expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6) expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6)
# manually calculate error at the 1st iteration: # manually calculate error at the 1st iteration:
mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 1) mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 1)
pred_labels <- max.col(mpred) - 1 pred_labels <- max.col(mpred) - 1
err <- sum(pred_labels != lb) / length(lb) err <- sum(pred_labels != lb)/length(lb)
expect_equal(bst$evaluation_log[1, train_merror], err, tolerance = 5e-6) expect_equal(bst$evaluation_log[1, train_merror], err, tolerance = 5e-6)
}) })
@@ -151,7 +150,7 @@ test_that("train and predict softmax", {
expect_output( expect_output(
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5, max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
objective = "multi:softmax", num_class = 3, eval_metric = "merror") objective = "multi:softmax", num_class=3)
, "train-merror") , "train-merror")
expect_false(is.null(bst$evaluation_log)) expect_false(is.null(bst$evaluation_log))
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025) expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
@@ -159,7 +158,7 @@ test_that("train and predict softmax", {
pred <- predict(bst, as.matrix(iris[, -5])) pred <- predict(bst, as.matrix(iris[, -5]))
expect_length(pred, nrow(iris)) expect_length(pred, nrow(iris))
err <- sum(pred != lb) / length(lb) err <- sum(pred != lb)/length(lb)
expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6) expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6)
}) })
@@ -168,18 +167,18 @@ test_that("train and predict RF", {
lb <- train$label lb <- train$label
# single iteration # single iteration
bst <- xgboost(data = train$data, label = lb, max_depth = 5, bst <- xgboost(data = train$data, label = lb, max_depth = 5,
nthread = 2, nrounds = 1, objective = "binary:logistic", eval_metric = "error", nthread = 2, nrounds = 1, objective = "binary:logistic",
num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1) num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1)
expect_equal(bst$niter, 1) expect_equal(bst$niter, 1)
expect_equal(xgb.ntree(bst), 20) expect_equal(xgb.ntree(bst), 20)
pred <- predict(bst, train$data) pred <- predict(bst, train$data)
pred_err <- sum((pred > 0.5) != lb) / length(lb) pred_err <- sum((pred > 0.5) != lb)/length(lb)
expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6) expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6)
#expect_lt(pred_err, 0.03) #expect_lt(pred_err, 0.03)
pred <- predict(bst, train$data, ntreelimit = 20) pred <- predict(bst, train$data, ntreelimit = 20)
pred_err_20 <- sum((pred > 0.5) != lb) / length(lb) pred_err_20 <- sum((pred > 0.5) != lb)/length(lb)
expect_equal(pred_err_20, pred_err) expect_equal(pred_err_20, pred_err)
#pred <- predict(bst, train$data, ntreelimit = 1) #pred <- predict(bst, train$data, ntreelimit = 1)
@@ -194,20 +193,19 @@ test_that("train and predict RF with softprob", {
set.seed(11) set.seed(11)
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
max_depth = 3, eta = 0.9, nthread = 2, nrounds = nrounds, max_depth = 3, eta = 0.9, nthread = 2, nrounds = nrounds,
objective = "multi:softprob", eval_metric = "merror", objective = "multi:softprob", num_class=3, verbose = 0,
num_class = 3, verbose = 0,
num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5) num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5)
expect_equal(bst$niter, 15) expect_equal(bst$niter, 15)
expect_equal(xgb.ntree(bst), 15 * 3 * 4) expect_equal(xgb.ntree(bst), 15*3*4)
# predict for all iterations: # predict for all iterations:
pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE) pred <- predict(bst, as.matrix(iris[, -5]), reshape=TRUE)
expect_equal(dim(pred), c(nrow(iris), 3)) expect_equal(dim(pred), c(nrow(iris), 3))
pred_labels <- max.col(pred) - 1 pred_labels <- max.col(pred) - 1
err <- sum(pred_labels != lb) / length(lb) err <- sum(pred_labels != lb)/length(lb)
expect_equal(bst$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6) expect_equal(bst$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6)
# predict for 7 iterations and adjust for 4 parallel trees per iteration # predict for 7 iterations and adjust for 4 parallel trees per iteration
pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 7 * 4) pred <- predict(bst, as.matrix(iris[, -5]), reshape=TRUE, ntreelimit = 7 * 4)
err <- sum((max.col(pred) - 1) != lb) / length(lb) err <- sum((max.col(pred) - 1) != lb)/length(lb)
expect_equal(bst$evaluation_log[7, train_merror], err, tolerance = 5e-6) expect_equal(bst$evaluation_log[7, train_merror], err, tolerance = 5e-6)
}) })
@@ -225,7 +223,7 @@ test_that("use of multiple eval metrics works", {
test_that("training continuation works", { test_that("training continuation works", {
dtrain <- xgb.DMatrix(train$data, label = train$label) dtrain <- xgb.DMatrix(train$data, label = train$label)
watchlist <- list(train = dtrain) watchlist = list(train=dtrain)
param <- list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2) param <- list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2)
# for the reference, use 4 iterations at once: # for the reference, use 4 iterations at once:
@@ -247,18 +245,17 @@ test_that("training continuation works", {
expect_equal(bst$raw, bst2$raw) expect_equal(bst$raw, bst2$raw)
expect_equal(dim(bst2$evaluation_log), c(2, 2)) expect_equal(dim(bst2$evaluation_log), c(2, 2))
# test continuing from a model in file # test continuing from a model in file
xgb.save(bst1, "xgboost.json") xgb.save(bst1, "xgboost.model")
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.json") bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.model")
if (!windows_flag && !solaris_flag) if (!windows_flag && !solaris_flag)
expect_equal(bst$raw, bst2$raw) expect_equal(bst$raw, bst2$raw)
expect_equal(dim(bst2$evaluation_log), c(2, 2)) expect_equal(dim(bst2$evaluation_log), c(2, 2))
file.remove("xgboost.json")
}) })
test_that("model serialization works", { test_that("model serialization works", {
out_path <- "model_serialization" out_path <- "model_serialization"
dtrain <- xgb.DMatrix(train$data, label = train$label) dtrain <- xgb.DMatrix(train$data, label = train$label)
watchlist <- list(train = dtrain) watchlist = list(train=dtrain)
param <- list(objective = "binary:logistic") param <- list(objective = "binary:logistic")
booster <- xgb.train(param, dtrain, nrounds = 4, watchlist) booster <- xgb.train(param, dtrain, nrounds = 4, watchlist)
raw <- xgb.serialize(booster) raw <- xgb.serialize(booster)
@@ -276,7 +273,7 @@ test_that("xgb.cv works", {
expect_output( expect_output(
cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5, cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5,
eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic", eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
eval_metric = "error", verbose = TRUE) verbose=TRUE)
, "train-error:") , "train-error:")
expect_is(cv, 'xgb.cv.synchronous') expect_is(cv, 'xgb.cv.synchronous')
expect_false(is.null(cv$evaluation_log)) expect_false(is.null(cv$evaluation_log))
@@ -295,13 +292,13 @@ test_that("xgb.cv works with stratified folds", {
set.seed(314159) set.seed(314159)
cv <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5, cv <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5,
eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic", eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
verbose = TRUE, stratified = FALSE) verbose=TRUE, stratified = FALSE)
set.seed(314159) set.seed(314159)
cv2 <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5, cv2 <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5,
eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic", eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
verbose = TRUE, stratified = TRUE) verbose=TRUE, stratified = TRUE)
# Stratified folds should result in different evaluation logs # Stratified folds should result in different evaluation logs
expect_true(all(cv$evaluation_log[, test_logloss_mean] != cv2$evaluation_log[, test_logloss_mean])) expect_true(all(cv$evaluation_log[, test_error_mean] != cv2$evaluation_log[, test_error_mean]))
}) })
test_that("train and predict with non-strict classes", { test_that("train and predict with non-strict classes", {
@@ -322,7 +319,7 @@ test_that("train and predict with non-strict classes", {
expect_equal(pr0, pr) expect_equal(pr0, pr)
# dense matrix-like input of non-matrix class with some inheritance # dense matrix-like input of non-matrix class with some inheritance
class(train_dense) <- c('pphmatrix', 'shmatrix') class(train_dense) <- c('pphmatrix','shmatrix')
expect_true(is.matrix(train_dense)) expect_true(is.matrix(train_dense))
expect_error( expect_error(
bst <- xgboost(data = train_dense, label = train$label, max_depth = 2, bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
@@ -340,15 +337,15 @@ test_that("train and predict with non-strict classes", {
test_that("max_delta_step works", { test_that("max_delta_step works", {
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
watchlist <- list(train = dtrain) watchlist <- list(train = dtrain)
param <- list(objective = "binary:logistic", eval_metric = "logloss", max_depth = 2, nthread = 2, eta = 0.5) param <- list(objective = "binary:logistic", eval_metric="logloss", max_depth = 2, nthread = 2, eta = 0.5)
nrounds <- 5 nrounds = 5
# model with no restriction on max_delta_step # model with no restriction on max_delta_step
bst1 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1) bst1 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1)
# model with restricted max_delta_step # model with restricted max_delta_step
bst2 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1, max_delta_step = 1) bst2 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1, max_delta_step = 1)
# the no-restriction model is expected to have consistently lower loss during the initial iterations # the no-restriction model is expected to have consistently lower loss during the initial iterations
expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss)) expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss))
expect_lt(mean(bst1$evaluation_log$train_logloss) / mean(bst2$evaluation_log$train_logloss), 0.8) expect_lt(mean(bst1$evaluation_log$train_logloss)/mean(bst2$evaluation_log$train_logloss), 0.8)
}) })
test_that("colsample_bytree works", { test_that("colsample_bytree works", {

View File

@@ -2,12 +2,11 @@
require(xgboost) require(xgboost)
require(data.table) require(data.table)
require(titanic)
context("callbacks") context("callbacks")
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
@@ -22,25 +21,24 @@ ltrain <- add.noise(train$label, 0.2)
ltest <- add.noise(test$label, 0.2) ltest <- add.noise(test$label, 0.2)
dtrain <- xgb.DMatrix(train$data, label = ltrain) dtrain <- xgb.DMatrix(train$data, label = ltrain)
dtest <- xgb.DMatrix(test$data, label = ltest) dtest <- xgb.DMatrix(test$data, label = ltest)
watchlist <- list(train = dtrain, test = dtest) watchlist = list(train=dtrain, test=dtest)
err <- function(label, pr) sum((pr > 0.5) != label) / length(label) err <- function(label, pr) sum((pr > 0.5) != label)/length(label)
param <- list(objective = "binary:logistic", eval_metric = "error", param <- list(objective = "binary:logistic", max_depth = 2, nthread = 2)
max_depth = 2, nthread = 2)
test_that("cb.print.evaluation works as expected", { test_that("cb.print.evaluation works as expected", {
bst_evaluation <- c('train-auc' = 0.9, 'test-auc' = 0.8) bst_evaluation <- c('train-auc'=0.9, 'test-auc'=0.8)
bst_evaluation_err <- NULL bst_evaluation_err <- NULL
begin_iteration <- 1 begin_iteration <- 1
end_iteration <- 7 end_iteration <- 7
f0 <- cb.print.evaluation(period = 0) f0 <- cb.print.evaluation(period=0)
f1 <- cb.print.evaluation(period = 1) f1 <- cb.print.evaluation(period=1)
f5 <- cb.print.evaluation(period = 5) f5 <- cb.print.evaluation(period=5)
expect_false(is.null(attr(f1, 'call'))) expect_false(is.null(attr(f1, 'call')))
expect_equal(attr(f1, 'name'), 'cb.print.evaluation') expect_equal(attr(f1, 'name'), 'cb.print.evaluation')
@@ -59,13 +57,13 @@ test_that("cb.print.evaluation works as expected", {
expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000") expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
expect_output(f5(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000") expect_output(f5(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
bst_evaluation_err <- c('train-auc' = 0.1, 'test-auc' = 0.2) bst_evaluation_err <- c('train-auc'=0.1, 'test-auc'=0.2)
expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\\+0.100000\ttest-auc:0.800000\\+0.200000") expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\\+0.100000\ttest-auc:0.800000\\+0.200000")
}) })
test_that("cb.evaluation.log works as expected", { test_that("cb.evaluation.log works as expected", {
bst_evaluation <- c('train-auc' = 0.9, 'test-auc' = 0.8) bst_evaluation <- c('train-auc'=0.9, 'test-auc'=0.8)
bst_evaluation_err <- NULL bst_evaluation_err <- NULL
evaluation_log <- list() evaluation_log <- list()
@@ -77,38 +75,37 @@ test_that("cb.evaluation.log works as expected", {
iteration <- 1 iteration <- 1
expect_silent(f()) expect_silent(f())
expect_equal(evaluation_log, expect_equal(evaluation_log,
list(c(iter = 1, bst_evaluation))) list(c(iter=1, bst_evaluation)))
iteration <- 2 iteration <- 2
expect_silent(f()) expect_silent(f())
expect_equal(evaluation_log, expect_equal(evaluation_log,
list(c(iter = 1, bst_evaluation), c(iter = 2, bst_evaluation))) list(c(iter=1, bst_evaluation), c(iter=2, bst_evaluation)))
expect_silent(f(finalize = TRUE)) expect_silent(f(finalize = TRUE))
expect_equal(evaluation_log, expect_equal(evaluation_log,
data.table(iter = 1:2, train_auc = c(0.9, 0.9), test_auc = c(0.8, 0.8))) data.table(iter=1:2, train_auc=c(0.9,0.9), test_auc=c(0.8,0.8)))
bst_evaluation_err <- c('train-auc' = 0.1, 'test-auc' = 0.2) bst_evaluation_err <- c('train-auc'=0.1, 'test-auc'=0.2)
evaluation_log <- list() evaluation_log <- list()
f <- cb.evaluation.log() f <- cb.evaluation.log()
iteration <- 1 iteration <- 1
expect_silent(f()) expect_silent(f())
expect_equal(evaluation_log, expect_equal(evaluation_log,
list(c(iter = 1, c(bst_evaluation, bst_evaluation_err)))) list(c(iter=1, c(bst_evaluation, bst_evaluation_err))))
iteration <- 2 iteration <- 2
expect_silent(f()) expect_silent(f())
expect_equal(evaluation_log, expect_equal(evaluation_log,
list(c(iter = 1, c(bst_evaluation, bst_evaluation_err)), list(c(iter=1, c(bst_evaluation, bst_evaluation_err)),
c(iter = 2, c(bst_evaluation, bst_evaluation_err)))) c(iter=2, c(bst_evaluation, bst_evaluation_err))))
expect_silent(f(finalize = TRUE)) expect_silent(f(finalize = TRUE))
expect_equal(evaluation_log, expect_equal(evaluation_log,
data.table(iter = 1:2, data.table(iter=1:2,
train_auc_mean = c(0.9, 0.9), train_auc_std = c(0.1, 0.1), train_auc_mean=c(0.9,0.9), train_auc_std=c(0.1,0.1),
test_auc_mean = c(0.8, 0.8), test_auc_std = c(0.2, 0.2))) test_auc_mean=c(0.8,0.8), test_auc_std=c(0.2,0.2)))
}) })
param <- list(objective = "binary:logistic", eval_metric = "error", param <- list(objective = "binary:logistic", max_depth = 4, nthread = 2)
max_depth = 4, nthread = 2)
test_that("can store evaluation_log without printing", { test_that("can store evaluation_log without printing", {
expect_silent( expect_silent(
@@ -176,16 +173,16 @@ test_that("cb.reset.parameters works as expected", {
}) })
test_that("cb.save.model works as expected", { test_that("cb.save.model works as expected", {
files <- c('xgboost_01.json', 'xgboost_02.json', 'xgboost.json') files <- c('xgboost_01.model', 'xgboost_02.model', 'xgboost.model')
for (f in files) if (file.exists(f)) file.remove(f) for (f in files) if (file.exists(f)) file.remove(f)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0, bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,
save_period = 1, save_name = "xgboost_%02d.json") save_period = 1, save_name = "xgboost_%02d.model")
expect_true(file.exists('xgboost_01.json')) expect_true(file.exists('xgboost_01.model'))
expect_true(file.exists('xgboost_02.json')) expect_true(file.exists('xgboost_02.model'))
b1 <- xgb.load('xgboost_01.json') b1 <- xgb.load('xgboost_01.model')
expect_equal(xgb.ntree(b1), 1) expect_equal(xgb.ntree(b1), 1)
b2 <- xgb.load('xgboost_02.json') b2 <- xgb.load('xgboost_02.model')
expect_equal(xgb.ntree(b2), 2) expect_equal(xgb.ntree(b2), 2)
xgb.config(b2) <- xgb.config(bst) xgb.config(b2) <- xgb.config(bst)
@@ -194,9 +191,9 @@ test_that("cb.save.model works as expected", {
# save_period = 0 saves the last iteration's model # save_period = 0 saves the last iteration's model
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0, bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,
save_period = 0, save_name = 'xgboost.json') save_period = 0)
expect_true(file.exists('xgboost.json')) expect_true(file.exists('xgboost.model'))
b2 <- xgb.load('xgboost.json') b2 <- xgb.load('xgboost.model')
xgb.config(b2) <- xgb.config(bst) xgb.config(b2) <- xgb.config(bst)
expect_equal(bst$raw, b2$raw) expect_equal(bst$raw, b2$raw)
@@ -239,8 +236,8 @@ test_that("early stopping xgb.train works", {
test_that("early stopping using a specific metric works", { test_that("early stopping using a specific metric works", {
set.seed(11) set.seed(11)
expect_output( expect_output(
bst <- xgb.train(param[-2], dtrain, nrounds = 20, watchlist, eta = 0.6, bst <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.6,
eval_metric = "logloss", eval_metric = "auc", eval_metric="logloss", eval_metric="auc",
callbacks = list(cb.early.stop(stopping_rounds = 3, maximize = FALSE, callbacks = list(cb.early.stop(stopping_rounds = 3, maximize = FALSE,
metric_name = 'test_logloss'))) metric_name = 'test_logloss')))
, "Stopping. Best iteration") , "Stopping. Best iteration")
@@ -255,26 +252,6 @@ test_that("early stopping using a specific metric works", {
expect_equal(logloss_log, logloss_pred, tolerance = 1e-5) expect_equal(logloss_log, logloss_pred, tolerance = 1e-5)
}) })
test_that("early stopping works with titanic", {
# This test was inspired by https://github.com/dmlc/xgboost/issues/5935
# It catches possible issues on noLD R
titanic <- titanic::titanic_train
titanic$Pclass <- as.factor(titanic$Pclass)
dtx <- model.matrix(~ 0 + ., data = titanic[, c("Pclass", "Sex")])
dty <- titanic$Survived
xgboost::xgboost(
data = dtx,
label = dty,
objective = "binary:logistic",
eval_metric = "auc",
nrounds = 100,
early_stopping_rounds = 3
)
expect_true(TRUE) # should not crash
})
test_that("early stopping xgb.cv works", { test_that("early stopping xgb.cv works", {
set.seed(11) set.seed(11)
expect_output( expect_output(
@@ -290,12 +267,12 @@ test_that("early stopping xgb.cv works", {
test_that("prediction in xgb.cv works", { test_that("prediction in xgb.cv works", {
set.seed(11) set.seed(11)
nrounds <- 4 nrounds = 4
cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0) cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0)
expect_false(is.null(cv$evaluation_log)) expect_false(is.null(cv$evaluation_log))
expect_false(is.null(cv$pred)) expect_false(is.null(cv$pred))
expect_length(cv$pred, nrow(train$data)) expect_length(cv$pred, nrow(train$data))
err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f])))) err_pred <- mean( sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f]))) )
err_log <- cv$evaluation_log[nrounds, test_error_mean] err_log <- cv$evaluation_log[nrounds, test_error_mean]
expect_equal(err_pred, err_log, tolerance = 1e-6) expect_equal(err_pred, err_log, tolerance = 1e-6)
@@ -331,7 +308,7 @@ test_that("prediction in early-stopping xgb.cv works", {
expect_false(is.null(cv$pred)) expect_false(is.null(cv$pred))
expect_length(cv$pred, nrow(train$data)) expect_length(cv$pred, nrow(train$data))
err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f])))) err_pred <- mean( sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f]))) )
err_log <- cv$evaluation_log[cv$best_iteration, test_error_mean] err_log <- cv$evaluation_log[cv$best_iteration, test_error_mean]
expect_equal(err_pred, err_log, tolerance = 1e-6) expect_equal(err_pred, err_log, tolerance = 1e-6)
err_log_last <- cv$evaluation_log[cv$niter, test_error_mean] err_log_last <- cv$evaluation_log[cv$niter, test_error_mean]

View File

@@ -1,21 +0,0 @@
context('Test global configuration')
test_that('Global configuration works with verbosity', {
old_verbosity <- xgb.get.config()$verbosity
for (v in c(0, 1, 2, 3)) {
xgb.set.config(verbosity = v)
expect_equal(xgb.get.config()$verbosity, v)
}
xgb.set.config(verbosity = old_verbosity)
expect_equal(xgb.get.config()$verbosity, old_verbosity)
})
test_that('Global configuration works with use_rmm flag', {
old_use_rmm_flag <- xgb.get.config()$use_rmm
for (v in c(TRUE, FALSE)) {
xgb.set.config(use_rmm = v)
expect_equal(xgb.get.config()$use_rmm, v)
}
xgb.set.config(use_rmm = old_use_rmm_flag)
expect_equal(xgb.get.config()$use_rmm, old_use_rmm_flag)
})

View File

@@ -4,8 +4,8 @@ require(xgboost)
set.seed(1994) set.seed(1994)
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
watchlist <- list(eval = dtest, train = dtrain) watchlist <- list(eval = dtest, train = dtrain)
@@ -20,12 +20,12 @@ logregobj <- function(preds, dtrain) {
evalerror <- function(preds, dtrain) { evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label") labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0.5))) / length(labels) err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
return(list(metric = "error", value = err)) return(list(metric = "error", value = err))
} }
param <- list(max_depth = 2, eta = 1, nthread = 2, param <- list(max_depth=2, eta=1, nthread = 2,
objective = logregobj, eval_metric = evalerror) objective=logregobj, eval_metric=evalerror)
num_round <- 2 num_round <- 2
test_that("custom objective works", { test_that("custom objective works", {
@@ -37,19 +37,12 @@ test_that("custom objective works", {
}) })
test_that("custom objective in CV works", { test_that("custom objective in CV works", {
cv <- xgb.cv(param, dtrain, num_round, nfold = 10, verbose = FALSE) cv <- xgb.cv(param, dtrain, num_round, nfold=10, verbose=FALSE)
expect_false(is.null(cv$evaluation_log)) expect_false(is.null(cv$evaluation_log))
expect_equal(dim(cv$evaluation_log), c(2, 5)) expect_equal(dim(cv$evaluation_log), c(2, 5))
expect_lt(cv$evaluation_log[num_round, test_error_mean], 0.03) expect_lt(cv$evaluation_log[num_round, test_error_mean], 0.03)
}) })
test_that("custom objective with early stop works", {
bst <- xgb.train(param, dtrain, 10, watchlist)
expect_equal(class(bst), "xgb.Booster")
train_log <- bst$evaluation_log$train_error
expect_true(all(diff(train_log) <= 0))
})
test_that("custom objective using DMatrix attr works", { test_that("custom objective using DMatrix attr works", {
attr(dtrain, 'label') <- getinfo(dtrain, 'label') attr(dtrain, 'label') <- getinfo(dtrain, 'label')
@@ -61,14 +54,14 @@ test_that("custom objective using DMatrix attr works", {
hess <- preds * (1 - preds) hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess)) return(list(grad = grad, hess = hess))
} }
param$objective <- logregobjattr param$objective = logregobjattr
bst <- xgb.train(param, dtrain, num_round, watchlist) bst <- xgb.train(param, dtrain, num_round, watchlist)
expect_equal(class(bst), "xgb.Booster") expect_equal(class(bst), "xgb.Booster")
}) })
test_that("custom objective with multi-class works", { test_that("custom objective with multi-class works", {
data <- as.matrix(iris[, -5]) data = as.matrix(iris[, -5])
label <- as.numeric(iris$Species) - 1 label = as.numeric(iris$Species) - 1
dtrain <- xgb.DMatrix(data = data, label = label) dtrain <- xgb.DMatrix(data = data, label = label)
nclasses <- 3 nclasses <- 3
@@ -79,10 +72,6 @@ test_that("custom objective with multi-class works", {
hess <- rnorm(dim(as.matrix(preds))[1]) hess <- rnorm(dim(as.matrix(preds))[1])
return (list(grad = grad, hess = hess)) return (list(grad = grad, hess = hess))
} }
fake_merror <- function(preds, dtrain) { param$objective = fake_softprob
expect_equal(dim(data)[1] * nclasses, dim(as.matrix(preds))[1]) bst <- xgb.train(param, dtrain, 1, num_class=nclasses)
}
param$objective <- fake_softprob
param$eval_metric <- fake_merror
bst <- xgb.train(param, dtrain, 1, num_class = nclasses)
}) })

View File

@@ -3,29 +3,29 @@ require(Matrix)
context("testing xgb.DMatrix functionality") context("testing xgb.DMatrix functionality")
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
test_data <- agaricus.test$data[1:100, ] test_data <- agaricus.test$data[1:100,]
test_label <- agaricus.test$label[1:100] test_label <- agaricus.test$label[1:100]
test_that("xgb.DMatrix: basic construction", { test_that("xgb.DMatrix: basic construction", {
# from sparse matrix # from sparse matrix
dtest1 <- xgb.DMatrix(test_data, label = test_label) dtest1 <- xgb.DMatrix(test_data, label=test_label)
# from dense matrix # from dense matrix
dtest2 <- xgb.DMatrix(as.matrix(test_data), label = test_label) dtest2 <- xgb.DMatrix(as.matrix(test_data), label=test_label)
expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label')) expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label'))
expect_equal(dim(dtest1), dim(dtest2)) expect_equal(dim(dtest1), dim(dtest2))
#from dense integer matrix #from dense integer matrix
int_data <- as.matrix(test_data) int_data <- as.matrix(test_data)
storage.mode(int_data) <- "integer" storage.mode(int_data) <- "integer"
dtest3 <- xgb.DMatrix(int_data, label = test_label) dtest3 <- xgb.DMatrix(int_data, label=test_label)
expect_equal(dim(dtest1), dim(dtest3)) expect_equal(dim(dtest1), dim(dtest3))
}) })
test_that("xgb.DMatrix: saving, loading", { test_that("xgb.DMatrix: saving, loading", {
# save to a local file # save to a local file
dtest1 <- xgb.DMatrix(test_data, label = test_label) dtest1 <- xgb.DMatrix(test_data, label=test_label)
tmp_file <- tempfile('xgb.DMatrix_') tmp_file <- tempfile('xgb.DMatrix_')
expect_true(xgb.DMatrix.save(dtest1, tmp_file)) expect_true(xgb.DMatrix.save(dtest1, tmp_file))
# read from a local file # read from a local file
@@ -35,12 +35,12 @@ test_that("xgb.DMatrix: saving, loading", {
expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label')) expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label'))
# from a libsvm text file # from a libsvm text file
tmp <- c("0 1:1 2:1", "1 3:1", "0 1:1") tmp <- c("0 1:1 2:1","1 3:1","0 1:1")
tmp_file <- 'tmp.libsvm' tmp_file <- 'tmp.libsvm'
writeLines(tmp, tmp_file) writeLines(tmp, tmp_file)
dtest4 <- xgb.DMatrix(tmp_file, silent = TRUE) dtest4 <- xgb.DMatrix(tmp_file, silent = TRUE)
expect_equal(dim(dtest4), c(3, 4)) expect_equal(dim(dtest4), c(3, 4))
expect_equal(getinfo(dtest4, 'label'), c(0, 1, 0)) expect_equal(getinfo(dtest4, 'label'), c(0,1,0))
unlink(tmp_file) unlink(tmp_file)
}) })
@@ -61,46 +61,46 @@ test_that("xgb.DMatrix: getinfo & setinfo", {
expect_true(setinfo(dtest, 'weight', test_label)) expect_true(setinfo(dtest, 'weight', test_label))
expect_true(setinfo(dtest, 'base_margin', test_label)) expect_true(setinfo(dtest, 'base_margin', test_label))
expect_true(setinfo(dtest, 'group', c(50, 50))) expect_true(setinfo(dtest, 'group', c(50,50)))
expect_error(setinfo(dtest, 'group', test_label)) expect_error(setinfo(dtest, 'group', test_label))
# providing character values will give an error # providing character values will give a warning
expect_error(setinfo(dtest, 'weight', rep('a', nrow(test_data)))) expect_warning(setinfo(dtest, 'weight', rep('a', nrow(test_data))))
# any other label should error # any other label should error
expect_error(setinfo(dtest, 'asdf', test_label)) expect_error(setinfo(dtest, 'asdf', test_label))
}) })
test_that("xgb.DMatrix: slice, dim", { test_that("xgb.DMatrix: slice, dim", {
dtest <- xgb.DMatrix(test_data, label = test_label) dtest <- xgb.DMatrix(test_data, label=test_label)
expect_equal(dim(dtest), dim(test_data)) expect_equal(dim(dtest), dim(test_data))
dsub1 <- slice(dtest, 1:42) dsub1 <- slice(dtest, 1:42)
expect_equal(nrow(dsub1), 42) expect_equal(nrow(dsub1), 42)
expect_equal(ncol(dsub1), ncol(test_data)) expect_equal(ncol(dsub1), ncol(test_data))
dsub2 <- dtest[1:42, ] dsub2 <- dtest[1:42,]
expect_equal(dim(dtest), dim(test_data)) expect_equal(dim(dtest), dim(test_data))
expect_equal(getinfo(dsub1, 'label'), getinfo(dsub2, 'label')) expect_equal(getinfo(dsub1, 'label'), getinfo(dsub2, 'label'))
}) })
test_that("xgb.DMatrix: slice, trailing empty rows", { test_that("xgb.DMatrix: slice, trailing empty rows", {
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
train_data <- agaricus.train$data train_data <- agaricus.train$data
train_label <- agaricus.train$label train_label <- agaricus.train$label
dtrain <- xgb.DMatrix(data = train_data, label = train_label) dtrain <- xgb.DMatrix(data=train_data, label=train_label)
slice(dtrain, 6513L) slice(dtrain, 6513L)
train_data[6513, ] <- 0 train_data[6513, ] <- 0
dtrain <- xgb.DMatrix(data = train_data, label = train_label) dtrain <- xgb.DMatrix(data=train_data, label=train_label)
slice(dtrain, 6513L) slice(dtrain, 6513L)
expect_equal(nrow(dtrain), 6513) expect_equal(nrow(dtrain), 6513)
}) })
test_that("xgb.DMatrix: colnames", { test_that("xgb.DMatrix: colnames", {
dtest <- xgb.DMatrix(test_data, label = test_label) dtest <- xgb.DMatrix(test_data, label=test_label)
expect_equal(colnames(dtest), colnames(test_data)) expect_equal(colnames(dtest), colnames(test_data))
expect_error(colnames(dtest) <- 'asdf') expect_error( colnames(dtest) <- 'asdf')
new_names <- make.names(seq_len(ncol(test_data))) new_names <- make.names(1:ncol(test_data))
expect_silent(colnames(dtest) <- new_names) expect_silent( colnames(dtest) <- new_names)
expect_equal(colnames(dtest), new_names) expect_equal(colnames(dtest), new_names)
expect_silent(colnames(dtest) <- NULL) expect_silent(colnames(dtest) <- NULL)
expect_null(colnames(dtest)) expect_null(colnames(dtest))
@@ -109,7 +109,7 @@ test_that("xgb.DMatrix: colnames", {
test_that("xgb.DMatrix: nrow is correct for a very sparse matrix", { test_that("xgb.DMatrix: nrow is correct for a very sparse matrix", {
set.seed(123) set.seed(123)
nr <- 1000 nr <- 1000
x <- rsparsematrix(nr, 100, density = 0.0005) x <- rsparsematrix(nr, 100, density=0.0005)
# we want it very sparse, so that last rows are empty # we want it very sparse, so that last rows are empty
expect_lt(max(x@i), nr) expect_lt(max(x@i), nr)
dtest <- xgb.DMatrix(x) dtest <- xgb.DMatrix(x)

View File

@@ -3,14 +3,13 @@ require(xgboost)
context("Garbage Collection Safety Check") context("Garbage Collection Safety Check")
test_that("train and prediction when gctorture is on", { test_that("train and prediction when gctorture is on", {
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
gctorture(TRUE) gctorture(TRUE)
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
pred <- predict(bst, test$data) pred <- predict(bst, test$data)
gctorture(FALSE) gctorture(FALSE)
expect_length(pred, length(test$label))
}) })

View File

@@ -3,12 +3,12 @@ context('Test generalized linear models')
require(xgboost) require(xgboost)
test_that("gblinear works", { test_that("gblinear works", {
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
param <- list(objective = "binary:logistic", eval_metric = "error", booster = "gblinear", param <- list(objective = "binary:logistic", booster = "gblinear",
nthread = 2, eta = 0.8, alpha = 0.0001, lambda = 0.0001) nthread = 2, eta = 0.8, alpha = 0.0001, lambda = 0.0001)
watchlist <- list(eval = dtest, train = dtrain) watchlist <- list(eval = dtest, train = dtrain)
@@ -16,7 +16,7 @@ test_that("gblinear works", {
ERR_UL <- 0.005 # upper limit for the test set error ERR_UL <- 0.005 # upper limit for the test set error
VERB <- 0 # chatterbox switch VERB <- 0 # chatterbox switch
param$updater <- 'shotgun' param$updater = 'shotgun'
bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle') bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle')
ypred <- predict(bst, dtest) ypred <- predict(bst, dtest)
expect_equal(length(getinfo(dtest, 'label')), 1611) expect_equal(length(getinfo(dtest, 'label')), 1611)
@@ -29,7 +29,7 @@ test_that("gblinear works", {
expect_equal(dim(h), c(n, ncol(dtrain) + 1)) expect_equal(dim(h), c(n, ncol(dtrain) + 1))
expect_is(h, "matrix") expect_is(h, "matrix")
param$updater <- 'coord_descent' param$updater = 'coord_descent'
bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic') bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic')
expect_lt(bst$evaluation_log$eval_error[n], ERR_UL) expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)

View File

@@ -5,18 +5,18 @@ require(data.table)
require(Matrix) require(Matrix)
require(vcd, quietly = TRUE) require(vcd, quietly = TRUE)
float_tolerance <- 5e-6 float_tolerance = 5e-6
# disable some tests for 32-bit environment # disable some tests for 32-bit environment
flag_32bit <- .Machine$sizeof.pointer != 8 flag_32bit = .Machine$sizeof.pointer != 8
set.seed(1982) set.seed(1982)
data(Arthritis) data(Arthritis)
df <- data.table(Arthritis, keep.rownames = FALSE) df <- data.table(Arthritis, keep.rownames = F)
df[, AgeDiscret := as.factor(round(Age / 10, 0))] df[,AgeDiscret := as.factor(round(Age / 10,0))]
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))] df[,AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
df[, ID := NULL] df[,ID := NULL]
sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) # nolint sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
label <- df[, ifelse(Improved == "Marked", 1, 0)] label <- df[, ifelse(Improved == "Marked", 1, 0)]
# binary # binary
@@ -46,8 +46,8 @@ mbst.GLM <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
test_that("xgb.dump works", { test_that("xgb.dump works", {
if (!flag_32bit) if (!flag_32bit)
expect_length(xgb.dump(bst.Tree), 200) expect_length(xgb.dump(bst.Tree), 200)
dump_file <- file.path(tempdir(), 'xgb.model.dump') dump_file = file.path(tempdir(), 'xgb.model.dump')
expect_true(xgb.dump(bst.Tree, dump_file, with_stats = TRUE)) expect_true(xgb.dump(bst.Tree, dump_file, with_stats = T))
expect_true(file.exists(dump_file)) expect_true(file.exists(dump_file))
expect_gt(file.size(dump_file), 8000) expect_gt(file.size(dump_file), 8000)
@@ -63,7 +63,7 @@ test_that("xgb.dump works for gblinear", {
# also make sure that it works properly for a sparse model where some coefficients # also make sure that it works properly for a sparse model where some coefficients
# are 0 from setting large L1 regularization: # are 0 from setting large L1 regularization:
bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1, bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1,
alpha = 2, objective = "binary:logistic", booster = "gblinear") alpha=2, objective = "binary:logistic", booster = "gblinear")
d.sp <- xgb.dump(bst.GLM.sp) d.sp <- xgb.dump(bst.GLM.sp)
expect_length(d.sp, 14) expect_length(d.sp, 14)
expect_gt(sum(d.sp == "0"), 0) expect_gt(sum(d.sp == "0"), 0)
@@ -110,9 +110,9 @@ test_that("predict feature contributions works", {
pred <- predict(bst.GLM, sparse_matrix, outputmargin = TRUE) pred <- predict(bst.GLM, sparse_matrix, outputmargin = TRUE)
expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5) expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
# manual calculation of linear terms # manual calculation of linear terms
coefs <- xgb.dump(bst.GLM)[-c(1, 2, 4)] %>% as.numeric coefs <- xgb.dump(bst.GLM)[-c(1,2,4)] %>% as.numeric
coefs <- c(coefs[-1], coefs[1]) # intercept must be the last coefs <- c(coefs[-1], coefs[1]) # intercept must be the last
pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN = "*") pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN="*")
expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual), expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual),
tolerance = float_tolerance) tolerance = float_tolerance)
@@ -130,13 +130,13 @@ test_that("predict feature contributions works", {
pred <- predict(mbst.GLM, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE) pred <- predict(mbst.GLM, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
pred_contr <- predict(mbst.GLM, as.matrix(iris[, -5]), predcontrib = TRUE) pred_contr <- predict(mbst.GLM, as.matrix(iris[, -5]), predcontrib = TRUE)
expect_length(pred_contr, 3) expect_length(pred_contr, 3)
coefs_all <- xgb.dump(mbst.GLM)[-c(1, 2, 6)] %>% as.numeric %>% matrix(ncol = 3, byrow = TRUE) coefs_all <- xgb.dump(mbst.GLM)[-c(1,2,6)] %>% as.numeric %>% matrix(ncol = 3, byrow = TRUE)
for (g in seq_along(pred_contr)) { for (g in seq_along(pred_contr)) {
expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS")) expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), float_tolerance) expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), float_tolerance)
# manual calculation of linear terms # manual calculation of linear terms
coefs <- c(coefs_all[-1, g], coefs_all[1, g]) # intercept needs to be the last coefs <- c(coefs_all[-1, g], coefs_all[1, g]) # intercept needs to be the last
pred_contr_manual <- sweep(as.matrix(cbind(iris[, -5], 1)), 2, coefs, FUN = "*") pred_contr_manual <- sweep(as.matrix(cbind(iris[,-5], 1)), 2, coefs, FUN="*")
expect_equal(as.numeric(pred_contr[[g]]), as.numeric(pred_contr_manual), expect_equal(as.numeric(pred_contr[[g]]), as.numeric(pred_contr_manual),
tolerance = float_tolerance) tolerance = float_tolerance)
} }
@@ -147,8 +147,8 @@ test_that("SHAPs sum to predictions, with or without DART", {
x1 = rnorm(100), x1 = rnorm(100),
x2 = rnorm(100), x2 = rnorm(100),
x3 = rnorm(100)) x3 = rnorm(100))
y <- d[, "x1"] + d[, "x2"]^2 + y <- d[,"x1"] + d[,"x2"]^2 +
ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) + ifelse(d[,"x3"] > .5, d[,"x3"]^2, 2^d[,"x3"]) +
rnorm(100) rnorm(100)
nrounds <- 30 nrounds <- 30
@@ -160,7 +160,7 @@ test_that("SHAPs sum to predictions, with or without DART", {
objective = "reg:squarederror", objective = "reg:squarederror",
eval_metric = "rmse"), eval_metric = "rmse"),
if (booster == "dart") if (booster == "dart")
list(rate_drop = .01, one_drop = TRUE)), list(rate_drop = .01, one_drop = T)),
data = d, data = d,
label = y, label = y,
nrounds = nrounds) nrounds = nrounds)
@@ -168,21 +168,21 @@ test_that("SHAPs sum to predictions, with or without DART", {
pr <- function(...) pr <- function(...)
predict(fit, newdata = d, ...) predict(fit, newdata = d, ...)
pred <- pr() pred <- pr()
shap <- pr(predcontrib = TRUE) shap <- pr(predcontrib = T)
shapi <- pr(predinteraction = TRUE) shapi <- pr(predinteraction = T)
tol <- 1e-5 tol = 1e-5
expect_equal(rowSums(shap), pred, tol = tol) expect_equal(rowSums(shap), pred, tol = tol)
expect_equal(apply(shapi, 1, sum), pred, tol = tol) expect_equal(apply(shapi, 1, sum), pred, tol = tol)
for (i in seq_len(nrow(d))) for (i in 1 : nrow(d))
for (f in list(rowSums, colSums)) for (f in list(rowSums, colSums))
expect_equal(f(shapi[i, , ]), shap[i, ], tol = tol) expect_equal(f(shapi[i,,]), shap[i,], tol = tol)
} }
}) })
test_that("xgb-attribute functionality", { test_that("xgb-attribute functionality", {
val <- "my attribute value" val <- "my attribute value"
list.val <- list(my_attr = val, a = 123, b = 'ok') list.val <- list(my_attr=val, a=123, b='ok')
list.ch <- list.val[order(names(list.val))] list.ch <- list.val[order(names(list.val))]
list.ch <- lapply(list.ch, as.character) list.ch <- lapply(list.ch, as.character)
# note: iter is 0-index in xgb attributes # note: iter is 0-index in xgb attributes
@@ -208,9 +208,9 @@ test_that("xgb-attribute functionality", {
xgb.attr(bst, "my_attr") <- NULL xgb.attr(bst, "my_attr") <- NULL
expect_null(xgb.attr(bst, "my_attr")) expect_null(xgb.attr(bst, "my_attr"))
expect_equal(xgb.attributes(bst), list.ch[c("a", "b", "niter")]) expect_equal(xgb.attributes(bst), list.ch[c("a", "b", "niter")])
xgb.attributes(bst) <- list(a = NULL, b = NULL) xgb.attributes(bst) <- list(a=NULL, b=NULL)
expect_equal(xgb.attributes(bst), list.default) expect_equal(xgb.attributes(bst), list.default)
xgb.attributes(bst) <- list(niter = NULL) xgb.attributes(bst) <- list(niter=NULL)
expect_null(xgb.attributes(bst)) expect_null(xgb.attributes(bst))
}) })
@@ -268,7 +268,7 @@ test_that("xgb.model.dt.tree works with and without feature names", {
bst.Tree.x$feature_names <- NULL bst.Tree.x$feature_names <- NULL
dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x) dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x)
expect_output(str(dt.tree.x), 'Feature.*\\"3\\"') expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
expect_equal(dt.tree[, -4, with = FALSE], dt.tree.x[, -4, with = FALSE]) expect_equal(dt.tree[, -4, with=FALSE], dt.tree.x[, -4, with=FALSE])
# using integer node ID instead of character # using integer node ID instead of character
dt.tree.int <- xgb.model.dt.tree(model = bst.Tree, use_int_id = TRUE) dt.tree.int <- xgb.model.dt.tree(model = bst.Tree, use_int_id = TRUE)
@@ -295,7 +295,7 @@ test_that("xgb.importance works with and without feature names", {
bst.Tree.x <- bst.Tree bst.Tree.x <- bst.Tree
bst.Tree.x$feature_names <- NULL bst.Tree.x$feature_names <- NULL
importance.Tree.x <- xgb.importance(model = bst.Tree) importance.Tree.x <- xgb.importance(model = bst.Tree)
expect_equal(importance.Tree[, -1, with = FALSE], importance.Tree.x[, -1, with = FALSE], expect_equal(importance.Tree[, -1, with=FALSE], importance.Tree.x[, -1, with=FALSE],
tolerance = float_tolerance) tolerance = float_tolerance)
imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree) imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree)
@@ -305,7 +305,7 @@ test_that("xgb.importance works with and without feature names", {
# for multiclass # for multiclass
imp.Tree <- xgb.importance(model = mbst.Tree) imp.Tree <- xgb.importance(model = mbst.Tree)
expect_equal(dim(imp.Tree), c(4, 4)) expect_equal(dim(imp.Tree), c(4, 4))
xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds)) xgb.importance(model = mbst.Tree, trees = seq(from=0, by=nclass, length.out=nrounds))
}) })
test_that("xgb.importance works with GLM model", { test_that("xgb.importance works with GLM model", {
@@ -320,7 +320,7 @@ test_that("xgb.importance works with GLM model", {
# for multiclass # for multiclass
imp.GLM <- xgb.importance(model = mbst.GLM) imp.GLM <- xgb.importance(model = mbst.GLM)
expect_equal(dim(imp.GLM), c(12, 3)) expect_equal(dim(imp.GLM), c(12, 3))
expect_equal(imp.GLM$Class, rep(0:2, each = 4)) expect_equal(imp.GLM$Class, rep(0:2, each=4))
}) })
test_that("xgb.model.dt.tree and xgb.importance work with a single split model", { test_that("xgb.model.dt.tree and xgb.importance work with a single split model", {
@@ -335,8 +335,8 @@ test_that("xgb.model.dt.tree and xgb.importance work with a single split model",
}) })
test_that("xgb.plot.tree works with and without feature names", { test_that("xgb.plot.tree works with and without feature names", {
expect_silent(xgb.plot.tree(feature_names = feature.names, model = bst.Tree)) xgb.plot.tree(feature_names = feature.names, model = bst.Tree)
expect_silent(xgb.plot.tree(model = bst.Tree)) xgb.plot.tree(model = bst.Tree)
}) })
test_that("xgb.plot.multi.trees works with and without feature names", { test_that("xgb.plot.multi.trees works with and without feature names", {
@@ -351,47 +351,11 @@ test_that("xgb.plot.deepness works", {
xgb.ggplot.deepness(model = bst.Tree) xgb.ggplot.deepness(model = bst.Tree)
}) })
test_that("xgb.shap.data works when top_n is provided", {
data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2)
expect_equal(names(data_list), c("data", "shap_contrib"))
expect_equal(NCOL(data_list$data), 2)
expect_equal(NCOL(data_list$shap_contrib), 2)
expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib))
expect_gt(length(colnames(data_list$data)), 0)
expect_gt(length(colnames(data_list$shap_contrib)), 0)
# for multiclass without target class provided
data_list <- xgb.shap.data(data = as.matrix(iris[, -5]), model = mbst.Tree, top_n = 2)
expect_equal(dim(data_list$shap_contrib), c(nrow(iris), 2))
# for multiclass with target class provided
data_list <- xgb.shap.data(data = as.matrix(iris[, -5]), model = mbst.Tree, top_n = 2, target_class = 0)
expect_equal(dim(data_list$shap_contrib), c(nrow(iris), 2))
})
test_that("xgb.shap.data works with subsampling", {
data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2, subsample = 0.8)
expect_equal(NROW(data_list$data), as.integer(0.8 * nrow(sparse_matrix)))
expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib))
})
test_that("prepare.ggplot.shap.data works", {
data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2)
plot_data <- prepare.ggplot.shap.data(data_list, normalize = TRUE)
expect_s3_class(plot_data, "data.frame")
expect_equal(names(plot_data), c("id", "feature", "feature_value", "shap_value"))
expect_s3_class(plot_data$feature, "factor")
# Each observation should have 1 row for each feature
expect_equal(nrow(plot_data), nrow(sparse_matrix) * 2)
})
test_that("xgb.plot.shap works", { test_that("xgb.plot.shap works", {
sh <- xgb.plot.shap(data = sparse_matrix, model = bst.Tree, top_n = 2, col = 4) sh <- xgb.plot.shap(data = sparse_matrix, model = bst.Tree, top_n = 2, col = 4)
expect_equal(names(sh), c("data", "shap_contrib")) expect_equal(names(sh), c("data", "shap_contrib"))
}) expect_equal(NCOL(sh$data), 2)
expect_equal(NCOL(sh$shap_contrib), 2)
test_that("xgb.plot.shap.summary works", {
expect_silent(xgb.plot.shap.summary(data = sparse_matrix, model = bst.Tree, top_n = 2))
expect_silent(xgb.ggplot.shap.summary(data = sparse_matrix, model = bst.Tree, top_n = 2))
}) })
test_that("check.deprecation works", { test_that("check.deprecation works", {
@@ -410,26 +374,3 @@ test_that("check.deprecation works", {
, "\'dumm\' was partially matched to \'dummy\'") , "\'dumm\' was partially matched to \'dummy\'")
expect_equal(res, list(a = 1, DUMMY = 22)) expect_equal(res, list(a = 1, DUMMY = 22))
}) })
test_that('convert.labels works', {
y <- c(0, 1, 0, 0, 1)
for (objective in c('binary:logistic', 'binary:logitraw', 'binary:hinge')) {
res <- xgboost:::convert.labels(y, objective_name = objective)
expect_s3_class(res, 'factor')
expect_equal(res, factor(res))
}
y <- c(0, 1, 3, 2, 1, 4)
for (objective in c('multi:softmax', 'multi:softprob', 'rank:pairwise', 'rank:ndcg',
'rank:map')) {
res <- xgboost:::convert.labels(y, objective_name = objective)
expect_s3_class(res, 'factor')
expect_equal(res, factor(res))
}
y <- c(1.2, 3.0, -1.0, 10.0)
for (objective in c('reg:squarederror', 'reg:squaredlogerror', 'reg:logistic',
'reg:pseudohubererror', 'count:poisson', 'survival:cox', 'survival:aft',
'reg:gamma', 'reg:tweedie')) {
res <- xgboost:::convert.labels(y, objective_name = objective)
expect_equal(class(res), 'numeric')
}
})

View File

@@ -5,20 +5,20 @@ context("interaction constraints")
set.seed(1024) set.seed(1024)
x1 <- rnorm(1000, 1) x1 <- rnorm(1000, 1)
x2 <- rnorm(1000, 1) x2 <- rnorm(1000, 1)
x3 <- sample(c(1, 2, 3), size = 1000, replace = TRUE) x3 <- sample(c(1,2,3), size=1000, replace=TRUE)
y <- x1 + x2 + x3 + x1 * x2 * x3 + rnorm(1000, 0.001) + 3 * sin(x1) y <- x1 + x2 + x3 + x1*x2*x3 + rnorm(1000, 0.001) + 3*sin(x1)
train <- matrix(c(x1, x2, x3), ncol = 3) train <- matrix(c(x1,x2,x3), ncol = 3)
test_that("interaction constraints for regression", { test_that("interaction constraints for regression", {
# Fit a model that only allows interaction between x1 and x2 # Fit a model that only allows interaction between x1 and x2
bst <- xgboost(data = train, label = y, max_depth = 3, bst <- xgboost(data = train, label = y, max_depth = 3,
eta = 0.1, nthread = 2, nrounds = 100, verbose = 0, eta = 0.1, nthread = 2, nrounds = 100, verbose = 0,
interaction_constraints = list(c(0, 1))) interaction_constraints = list(c(0,1)))
# Set all observations to have the same x3 values then increment # Set all observations to have the same x3 values then increment
# by the same amount # by the same amount
preds <- lapply(c(1, 2, 3), function(x){ preds <- lapply(c(1,2,3), function(x){
tmat <- matrix(c(x1, x2, rep(x, 1000)), ncol = 3) tmat <- matrix(c(x1,x2,rep(x,1000)), ncol=3)
return(predict(bst, tmat)) return(predict(bst, tmat))
}) })
@@ -40,16 +40,16 @@ test_that("interaction constraints scientific representation", {
rows <- 10 rows <- 10
## When number exceeds 1e5, R paste function uses scientific representation. ## When number exceeds 1e5, R paste function uses scientific representation.
## See: https://github.com/dmlc/xgboost/issues/5179 ## See: https://github.com/dmlc/xgboost/issues/5179
cols <- 1e5 + 10 cols <- 1e5+10
d <- matrix(rexp(rows, rate = .1), nrow = rows, ncol = cols) d <- matrix(rexp(rows, rate=.1), nrow=rows, ncol=cols)
y <- rnorm(rows) y <- rnorm(rows)
dtrain <- xgb.DMatrix(data = d, info = list(label = y)) dtrain <- xgb.DMatrix(data=d, info = list(label=y))
inc <- list(c(seq.int(from = 0, to = cols, by = 1))) inc <- list(c(seq.int(from = 0, to = cols, by = 1)))
with_inc <- xgb.train(data = dtrain, tree_method = 'hist', with_inc <- xgb.train(data=dtrain, tree_method='hist',
interaction_constraints = inc, nrounds = 10) interaction_constraints=inc, nrounds=10)
without_inc <- xgb.train(data = dtrain, tree_method = 'hist', nrounds = 10) without_inc <- xgb.train(data=dtrain, tree_method='hist', nrounds=10)
expect_equal(xgb.save.raw(with_inc), xgb.save.raw(without_inc)) expect_equal(xgb.save.raw(with_inc), xgb.save.raw(without_inc))
}) })

View File

@@ -9,9 +9,9 @@ test_that("predict feature interactions works", {
# simulate some binary data and a linear outcome with an interaction term # simulate some binary data and a linear outcome with an interaction term
N <- 1000 N <- 1000
P <- 5 P <- 5
X <- matrix(rbinom(N * P, 1, 0.5), ncol = P, dimnames = list(NULL, letters[1:P])) X <- matrix(rbinom(N * P, 1, 0.5), ncol=P, dimnames = list(NULL, letters[1:P]))
# center the data (as contributions are computed WRT feature means) # center the data (as contributions are computed WRT feature means)
X <- scale(X, scale = FALSE) X <- scale(X, scale=FALSE)
# outcome without any interactions, without any noise: # outcome without any interactions, without any noise:
f <- function(x) 2 * x[, 1] - 3 * x[, 2] f <- function(x) 2 * x[, 1] - 3 * x[, 2]
@@ -23,14 +23,14 @@ test_that("predict feature interactions works", {
y <- f_int(X) y <- f_int(X)
dm <- xgb.DMatrix(X, label = y) dm <- xgb.DMatrix(X, label = y)
param <- list(eta = 0.1, max_depth = 4, base_score = mean(y), lambda = 0, nthread = 2) param <- list(eta=0.1, max_depth=4, base_score=mean(y), lambda=0, nthread=2)
b <- xgb.train(param, dm, 100) b <- xgb.train(param, dm, 100)
pred <- predict(b, dm, outputmargin = TRUE) pred = predict(b, dm, outputmargin=TRUE)
# SHAP contributions: # SHAP contributions:
cont <- predict(b, dm, predcontrib = TRUE) cont <- predict(b, dm, predcontrib=TRUE)
expect_equal(dim(cont), c(N, P + 1)) expect_equal(dim(cont), c(N, P+1))
# make sure for each row they add up to marginal predictions # make sure for each row they add up to marginal predictions
max(abs(rowSums(cont) - pred)) %>% expect_lt(0.001) max(abs(rowSums(cont) - pred)) %>% expect_lt(0.001)
# Hand-construct the 'ground truth' feature contributions: # Hand-construct the 'ground truth' feature contributions:
@@ -39,43 +39,43 @@ test_that("predict feature interactions works", {
-3. * X[, 2] + 1. * X[, 2] * X[, 3], # attribute a HALF of the interaction term to feature #2 -3. * X[, 2] + 1. * X[, 2] * X[, 3], # attribute a HALF of the interaction term to feature #2
1. * X[, 2] * X[, 3] # and another HALF of the interaction term to feature #3 1. * X[, 2] * X[, 3] # and another HALF of the interaction term to feature #3
) )
gt_cont <- cbind(gt_cont, matrix(0, nrow = N, ncol = P + 1 - 3)) gt_cont <- cbind(gt_cont, matrix(0, nrow=N, ncol=P + 1 - 3))
# These should be relatively close: # These should be relatively close:
expect_lt(max(abs(cont - gt_cont)), 0.05) expect_lt(max(abs(cont - gt_cont)), 0.05)
# SHAP interaction contributions: # SHAP interaction contributions:
intr <- predict(b, dm, predinteraction = TRUE) intr <- predict(b, dm, predinteraction=TRUE)
expect_equal(dim(intr), c(N, P + 1, P + 1)) expect_equal(dim(intr), c(N, P+1, P+1))
# check assigned colnames # check assigned colnames
cn <- c(letters[1:P], "BIAS") cn <- c(letters[1:P], "BIAS")
expect_equal(dimnames(intr), list(NULL, cn, cn)) expect_equal(dimnames(intr), list(NULL, cn, cn))
# check the symmetry # check the symmetry
max(abs(aperm(intr, c(1, 3, 2)) - intr)) %>% expect_lt(0.00001) max(abs(aperm(intr, c(1,3,2)) - intr)) %>% expect_lt(0.00001)
# sums WRT columns must be close to feature contributions # sums WRT columns must be close to feature contributions
max(abs(apply(intr, c(1, 2), sum) - cont)) %>% expect_lt(0.00001) max(abs(apply(intr, c(1,2), sum) - cont)) %>% expect_lt(0.00001)
# diagonal terms for features 3,4,5 must be close to zero # diagonal terms for features 3,4,5 must be close to zero
Reduce(max, sapply(3:P, function(i) max(abs(intr[, i, i])))) %>% expect_lt(0.05) Reduce(max, sapply(3:P, function(i) max(abs(intr[, i, i])))) %>% expect_lt(0.05)
# BIAS must have no interactions # BIAS must have no interactions
max(abs(intr[, 1:P, P + 1])) %>% expect_lt(0.00001) max(abs(intr[, 1:P, P+1])) %>% expect_lt(0.00001)
# interactions other than 2 x 3 must be close to zero # interactions other than 2 x 3 must be close to zero
intr23 <- intr intr23 <- intr
intr23[, 2, 3] <- 0 intr23[,2,3] <- 0
Reduce(max, sapply(1:P, function(i) max(abs(intr23[, i, (i + 1):(P + 1)])))) %>% expect_lt(0.05) Reduce(max, sapply(1:P, function(i) max(abs(intr23[, i, (i+1):(P+1)])))) %>% expect_lt(0.05)
# Construct the 'ground truth' contributions of interactions directly from the linear terms: # Construct the 'ground truth' contributions of interactions directly from the linear terms:
gt_intr <- array(0, c(N, P + 1, P + 1)) gt_intr <- array(0, c(N, P+1, P+1))
gt_intr[, 2, 3] <- 1. * X[, 2] * X[, 3] # attribute a HALF of the interaction term to each symmetric element gt_intr[,2,3] <- 1. * X[, 2] * X[, 3] # attribute a HALF of the interaction term to each symmetric element
gt_intr[, 3, 2] <- gt_intr[, 2, 3] gt_intr[,3,2] <- gt_intr[, 2, 3]
# merge-in the diagonal based on 'ground truth' feature contributions # merge-in the diagonal based on 'ground truth' feature contributions
intr_diag <- gt_cont - apply(gt_intr, c(1, 2), sum) intr_diag = gt_cont - apply(gt_intr, c(1,2), sum)
for (j in seq_len(P)) { for(j in seq_len(P)) {
gt_intr[, j, j] <- intr_diag[, j] gt_intr[,j,j] = intr_diag[,j]
} }
# These should be relatively close: # These should be relatively close:
expect_lt(max(abs(intr - gt_intr)), 0.1) expect_lt(max(abs(intr - gt_intr)), 0.1)
@@ -107,7 +107,7 @@ test_that("SHAP contribution values are not NAN", {
shaps <- as.data.frame(predict(fit, shaps <- as.data.frame(predict(fit,
newdata = as.matrix(subset(d, fold == 1)[, ivs]), newdata = as.matrix(subset(d, fold == 1)[, ivs]),
predcontrib = TRUE)) predcontrib = T))
result <- cbind(shaps, sum = rowSums(shaps), pred = predict(fit, result <- cbind(shaps, sum = rowSums(shaps), pred = predict(fit,
newdata = as.matrix(subset(d, fold == 1)[, ivs]))) newdata = as.matrix(subset(d, fold == 1)[, ivs])))
@@ -116,26 +116,26 @@ test_that("SHAP contribution values are not NAN", {
test_that("multiclass feature interactions work", { test_that("multiclass feature interactions work", {
dm <- xgb.DMatrix(as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1) dm <- xgb.DMatrix(as.matrix(iris[,-5]), label=as.numeric(iris$Species)-1)
param <- list(eta = 0.1, max_depth = 4, objective = 'multi:softprob', num_class = 3) param <- list(eta=0.1, max_depth=4, objective='multi:softprob', num_class=3)
b <- xgb.train(param, dm, 40) b <- xgb.train(param, dm, 40)
pred <- predict(b, dm, outputmargin = TRUE) %>% array(c(3, 150)) %>% t pred = predict(b, dm, outputmargin=TRUE) %>% array(c(3, 150)) %>% t
# SHAP contributions: # SHAP contributions:
cont <- predict(b, dm, predcontrib = TRUE) cont <- predict(b, dm, predcontrib=TRUE)
expect_length(cont, 3) expect_length(cont, 3)
# rewrap them as a 3d array # rewrap them as a 3d array
cont <- unlist(cont) %>% array(c(150, 5, 3)) cont <- unlist(cont) %>% array(c(150, 5, 3))
# make sure for each row they add up to marginal predictions # make sure for each row they add up to marginal predictions
max(abs(apply(cont, c(1, 3), sum) - pred)) %>% expect_lt(0.001) max(abs(apply(cont, c(1,3), sum) - pred)) %>% expect_lt(0.001)
# SHAP interaction contributions: # SHAP interaction contributions:
intr <- predict(b, dm, predinteraction = TRUE) intr <- predict(b, dm, predinteraction=TRUE)
expect_length(intr, 3) expect_length(intr, 3)
# rewrap them as a 4d array # rewrap them as a 4d array
intr <- unlist(intr) %>% array(c(150, 5, 5, 3)) %>% aperm(c(4, 1, 2, 3)) # [grp, row, col, col] intr <- unlist(intr) %>% array(c(150, 5, 5, 3)) %>% aperm(c(4, 1, 2, 3)) # [grp, row, col, col]
# check the symmetry # check the symmetry
max(abs(aperm(intr, c(1, 2, 4, 3)) - intr)) %>% expect_lt(0.00001) max(abs(aperm(intr, c(1,2,4,3)) - intr)) %>% expect_lt(0.00001)
# sums WRT columns must be close to feature contributions # sums WRT columns must be close to feature contributions
max(abs(apply(intr, c(1, 2, 3), sum) - aperm(cont, c(3, 1, 2)))) %>% expect_lt(0.00001) max(abs(apply(intr, c(1,2,3), sum) - aperm(cont, c(3,1,2)))) %>% expect_lt(0.00001)
}) })

View File

@@ -0,0 +1,27 @@
context("Code is of high quality and lint free")
# Lint gate for the package sources. The test is skipped on CRAN, on Travis,
# and when lintr is not installed. It only assembles the linter configuration;
# the actual lint run on the last line is commented out.
test_that("Code Lint", {
  skip_on_cran()
  skip_on_travis()
  skip_if_not_installed("lintr")

  # Linters to apply; entries that are commented out are not enabled.
  my_linters <- list(
    absolute_paths_linter = lintr::absolute_paths_linter,
    assignment_linter = lintr::assignment_linter,
    closed_curly_linter = lintr::closed_curly_linter,
    commas_linter = lintr::commas_linter,
    # commented_code_linter = lintr::commented_code_linter,
    infix_spaces_linter = lintr::infix_spaces_linter,
    line_length_linter = lintr::line_length_linter,
    no_tab_linter = lintr::no_tab_linter,
    object_usage_linter = lintr::object_usage_linter,
    # snake_case_linter = lintr::snake_case_linter,
    # multiple_dots_linter = lintr::multiple_dots_linter,
    object_length_linter = lintr::object_length_linter,
    open_curly_linter = lintr::open_curly_linter,
    # single_quotes_linter = lintr::single_quotes_linter,
    spaces_inside_linter = lintr::spaces_inside_linter,
    spaces_left_parentheses_linter = lintr::spaces_left_parentheses_linter,
    trailing_blank_lines_linter = lintr::trailing_blank_lines_linter,
    trailing_whitespace_linter = lintr::trailing_whitespace_linter
  )
  # lintr::expect_lint_free(linters = my_linters) # uncomment this if you want to check code quality
})

View File

@@ -1,109 +0,0 @@
require(xgboost)
require(jsonlite)
context("Models from previous versions of XGBoost can be loaded")
# Reference values that the checks below compare loaded models against.
# run_booster_check() expects kForests * kRounds dumped trees per model, and
# that product times kClasses for the 'cls' model, whose num_class must also
# equal kClasses.
# NOTE(review): presumably these mirror the settings the stored models were
# generated with -- confirm against the script that produced the archive.
metadata <- list(
kRounds = 2,
kRows = 1000,
kCols = 4,
kForests = 2,
kMaxDepth = 2,
kClasses = 3
)
# Assert the configuration entries that are checked for every model:
# num_feature must be the string '4' and the booster must be 'gbtree'.
#
# config: parsed booster configuration (a nested list with a `learner` entry).
run_model_param_check <- function(config) {
  learner <- config$learner
  testthat::expect_equal(learner$learner_model_param$num_feature, '4')
  testthat::expect_equal(learner$learner_train_param$booster, 'gbtree')
}
# Count the trees of a booster by counting the 'booster[<index>]' header lines
# in its text dump.
#
# booster: object accepted by xgb.dump().
# Returns the number of header lines found as an integer (0 when none match).
get_num_tree <- function(booster) {
  dump <- xgb.dump(booster)
  m <- regexec('booster\\[[0-9]+\\]', dump, perl = TRUE)
  m <- regmatches(dump, m)
  # Each element of `m` has length 1 for a header line and 0 otherwise.
  # sum(lengths()) also yields 0 for an empty dump, where the previous
  # Reduce('+', ...) over an empty list returned NULL.
  sum(lengths(m))
}
# Validate a loaded booster against the expectations for one model kind.
#
# booster: the loaded model.
# name:    model kind; 'cls', 'logitraw', 'logit' and 'ltr' have dedicated
#          checks, anything else is required to be 'reg'.
run_booster_check <- function(booster, name) {
  # An xgb.Booster whose handle is null has to go through
  # xgb.Booster.complete() before xgb.config() can be used on it.
  needs_completion <- inherits(booster, "xgb.Booster") &&
    xgboost:::is.null.handle(booster$handle)
  if (needs_completion) {
    booster <- xgb.Booster.complete(booster)
  }

  config <- jsonlite::fromJSON(xgb.config(booster))
  run_model_param_check(config)

  # Pieces of the configuration and reference data used by the branches below.
  model_param <- config$learner$learner_model_param
  objective <- config$learner$learner_train_param$objective
  base_tree_count <- metadata$kForests * metadata$kRounds

  if (name == 'cls') {
    # The expected tree count is additionally multiplied by kClasses here.
    testthat::expect_equal(get_num_tree(booster),
                           base_tree_count * metadata$kClasses)
    testthat::expect_equal(as.numeric(model_param$base_score), 0.5)
    testthat::expect_equal(objective, 'multi:softmax')
    testthat::expect_equal(as.numeric(model_param$num_class),
                           metadata$kClasses)
  } else if (name == 'logitraw') {
    testthat::expect_equal(get_num_tree(booster), base_tree_count)
    testthat::expect_equal(as.numeric(model_param$num_class), 0)
    testthat::expect_equal(objective, 'binary:logitraw')
  } else if (name == 'logit') {
    testthat::expect_equal(get_num_tree(booster), base_tree_count)
    testthat::expect_equal(as.numeric(model_param$num_class), 0)
    testthat::expect_equal(objective, 'binary:logistic')
  } else if (name == 'ltr') {
    testthat::expect_equal(get_num_tree(booster), base_tree_count)
    testthat::expect_equal(objective, 'rank:ndcg')
  } else {
    # Any remaining kind must be the regression model.
    testthat::expect_equal(name, 'reg')
    testthat::expect_equal(get_num_tree(booster), base_tree_count)
    testthat::expect_equal(as.numeric(model_param$base_score), 0.5)
    testthat::expect_equal(objective, 'reg:squarederror')
  }
}
# Download an archive of stored models and, for every file in the 'models'
# directory, check that it loads, predicts, passes run_booster_check(), and
# produces the warnings expected for the XGBoost version that generated it.
test_that("Models from previous versions of XGBoost can be loaded", {
bucket <- 'xgboost-ci-jenkins-artifacts'
region <- 'us-west-2'
file_name <- 'xgboost_r_model_compatibility_test.zip'
# The archive is saved into the current working directory.
zipfile <- file.path(getwd(), file_name)
# NOTE(review): assumes the archive unpacks into a 'models' directory under
# the working directory -- confirm against the archive layout.
model_dir <- file.path(getwd(), 'models')
download.file(paste('https://', bucket, '.s3-', region, '.amazonaws.com/', file_name, sep = ''),
destfile = zipfile, mode = 'wb', quiet = TRUE)
unzip(zipfile, overwrite = TRUE)
# Prediction input: a single all-zero row with 4 columns.
pred_data <- xgb.DMatrix(matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4))
lapply(list.files(model_dir), function (x) {
model_file <- file.path(model_dir, x)
# File names are parsed as 'xgboost-<version>.<name>.<extension>':
# capture group 1 is the version string, capture group 2 the model kind.
m <- regexec("xgboost-([0-9\\.]+)\\.([a-z]+)\\.[a-z]+", model_file, perl = TRUE)
m <- regmatches(model_file, m)[[1]]
model_xgb_ver <- m[2]
name <- m[3]
is_rds <- endsWith(model_file, '.rds')
# Capture everything printed while loading and predicting so that it can be
# searched for warning text below.
# NOTE(review): presumably the C++ warnings are printed to stdout, which is
# what capture.output() collects -- confirm.
cpp_warning <- capture.output({
# Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
booster <- readRDS(model_file)
expect_warning(predict(booster, newdata = pred_data))
expect_warning(run_booster_check(booster, name))
} else {
# No R warning is asserted here: RDS files go through readRDS(), all other
# files through xgb.load().
if (is_rds) {
booster <- readRDS(model_file)
} else {
booster <- xgb.load(model_file)
}
predict(booster, newdata = pred_data)
run_booster_check(booster, name)
}
})
if (compareVersion(model_xgb_ver, '1.0.0.0') < 0) {
# Expect a C++ warning when a model was generated in version < 1.0.x
m <- grepl(paste0('.*Loading model from XGBoost < 1\\.0\\.0, consider saving it again for ',
'improved compatibility.*'), cpp_warning, perl = TRUE)
# At least one line must have been captured and every captured line must match.
expect_true(length(m) > 0 && all(m))
} else if (is_rds && model_xgb_ver == '1.1.1.1') {
# Expect a C++ warning when a model is loaded from RDS and it was generated by version 1.1.x
m <- grepl(paste0('.*Attempted to load internal configuration for a model file that was ',
'generated by a previous version of XGBoost.*'), cpp_warning, perl = TRUE)
# At least one line must have been captured and every captured line must match.
expect_true(length(m) > 0 && all(m))
}
})
})

View File

@@ -3,21 +3,22 @@ require(xgboost)
context("monotone constraints") context("monotone constraints")
set.seed(1024) set.seed(1024)
x <- rnorm(1000, 10) x = rnorm(1000, 10)
y <- -1 * x + rnorm(1000, 0.001) + 3 * sin(x) y = -1*x + rnorm(1000, 0.001) + 3*sin(x)
train <- matrix(x, ncol = 1) train = matrix(x, ncol = 1)
test_that("monotone constraints for regression", { test_that("monotone constraints for regression", {
bst <- xgboost(data = train, label = y, max_depth = 2, bst = xgboost(data = train, label = y, max_depth = 2,
eta = 0.1, nthread = 2, nrounds = 100, verbose = 0, eta = 0.1, nthread = 2, nrounds = 100, verbose = 0,
monotone_constraints = -1) monotone_constraints = -1)
pred <- predict(bst, train) pred = predict(bst, train)
ind <- order(train[, 1]) ind = order(train[,1])
pred.ord <- pred[ind] pred.ord = pred[ind]
expect_true({ expect_true({
!any(diff(pred.ord) > 0) !any(diff(pred.ord) > 0)
}, "Monotone Contraint Satisfied") }, "Monotone Contraint Satisfied")
}) })

View File

@@ -2,8 +2,8 @@ context('Test model params and call are exposed to R')
require(xgboost) require(xgboost)
data(agaricus.train, package = 'xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

View File

@@ -5,10 +5,10 @@ set.seed(1994)
test_that("poisson regression works", { test_that("poisson regression works", {
data(mtcars) data(mtcars)
bst <- xgboost(data = as.matrix(mtcars[, -11]), label = mtcars[, 11], bst <- xgboost(data = as.matrix(mtcars[,-11]), label = mtcars[,11],
objective = 'count:poisson', nrounds = 10, verbose = 0) objective = 'count:poisson', nrounds=10, verbose=0)
expect_equal(class(bst), "xgb.Booster") expect_equal(class(bst), "xgb.Booster")
pred <- predict(bst, as.matrix(mtcars[, -11])) pred <- predict(bst, as.matrix(mtcars[, -11]))
expect_equal(length(pred), 32) expect_equal(length(pred), 32)
expect_lt(sqrt(mean((pred - mtcars[, 11])^2)), 1.2) expect_lt(sqrt(mean( (pred - mtcars[,11])^2 )), 1.2)
}) })

View File

@@ -1,51 +0,0 @@
require(xgboost)
require(Matrix)
context('Learning to rank')
# Train a pairwise ranker on four query groups of five rows each (no weights)
# and check that both training metrics never decrease between rounds.
test_that('Test ranking with unweighted data', {
  features <- sparseMatrix(
    i = c(2, 3, 7, 9, 12, 15, 17, 18),
    j = c(1, 1, 2, 2, 3, 3, 4, 4),
    x = rep(1.0, 8),
    dims = c(20, 4)
  )
  labels <- c(0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0)
  group_sizes <- rep(5, 4)
  dtrain <- xgb.DMatrix(features, label = labels, group = group_sizes)

  # eval_metric appears twice so that both metrics are requested.
  params <- list(
    eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1,
    eval_metric = 'auc', eval_metric = 'aucpr'
  )
  bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))

  # Each metric must be non-decreasing over the boosting rounds.
  eval_log <- bst$evaluation_log
  auc_steps <- diff(eval_log$train_auc)
  expect_true(all(auc_steps >= 0))
  aucpr_steps <- diff(eval_log$train_aucpr)
  expect_true(all(aucpr_steps >= 0))
})
# Train a pairwise ranker on four query groups weighted 1, 2, 3, 4 and check
# (a) that both training metrics never decrease between rounds and (b) that,
# for every tree limit, the per-group "correctly sorted" flags are
# non-decreasing from the first group to the last.
test_that('Test ranking with weighted data', {
  features <- sparseMatrix(
    i = c(2, 3, 7, 9, 12, 15, 17, 18),
    j = c(1, 1, 2, 2, 3, 3, 4, 4),
    x = rep(1.0, 8),
    dims = c(20, 4)
  )
  y <- c(0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0)
  group_sizes <- rep(5, 4)
  group_weights <- c(1.0, 2.0, 3.0, 4.0)
  dtrain <- xgb.DMatrix(features, label = y, group = group_sizes, weight = group_weights)

  # eval_metric appears twice so that both metrics are requested.
  params <- list(
    eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1,
    eval_metric = 'auc', eval_metric = 'aucpr'
  )
  bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))

  # Each metric must be non-decreasing over the boosting rounds.
  eval_log <- bst$evaluation_log
  expect_true(all(diff(eval_log$train_auc) >= 0))
  expect_true(all(diff(eval_log$train_aucpr) >= 0))

  # First row index of each query group (rows 1-5, 6-10, 11-15, 16-20).
  group_starts <- seq(1, 20, by = 5)
  for (i in 1:10) {
    pred <- predict(bst, newdata = dtrain, ntreelimit = i)
    # is_sorted[g]: is the g-th group correctly sorted by the ranking predictor?
    is_sorted <- vapply(group_starts, function(k) {
      rows <- k:(k + 4)
      # Rows of this group ordered by decreasing prediction.
      ranked_rows <- rows[order(-pred[rows])]
      z <- y[ranked_rows]
      all(diff(z) <= 0)  # Check if z is monotone decreasing
    }, logical(1))
    # Since we give weights 1, 2, 3, 4 to the four query groups,
    # the ranking predictor will first try to correctly sort the last query group
    # before correctly sorting other groups.
    expect_true(all(diff(as.numeric(is_sorted)) >= 0))
  }
})

View File

@@ -9,23 +9,23 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
# Disable flaky tests for 32-bit Windows. # Disable flaky tests for 32-bit Windows.
# See https://github.com/dmlc/xgboost/issues/3720 # See https://github.com/dmlc/xgboost/issues/3720
win32_flag <- .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8 win32_flag = .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8
test_that("updating the model works", { test_that("updating the model works", {
watchlist <- list(train = dtrain, test = dtest) watchlist = list(train = dtrain, test = dtest)
# no-subsampling # no-subsampling
p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2) p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
set.seed(11) set.seed(11)
bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0) bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0)
tr1 <- xgb.model.dt.tree(model = bst1) tr1 <- xgb.model.dt.tree(model = bst1)
# with subsampling # with subsampling
p2 <- modifyList(p1, list(subsample = 0.1)) p2 <- modifyList(p1, list(subsample = 0.1))
set.seed(11) set.seed(11)
bst2 <- xgb.train(p2, dtrain, nrounds = 10, watchlist, verbose = 0) bst2 <- xgb.train(p2, dtrain, nrounds = 10, watchlist, verbose = 0)
tr2 <- xgb.model.dt.tree(model = bst2) tr2 <- xgb.model.dt.tree(model = bst2)
# the same no-subsampling boosting with an extra 'refresh' updater: # the same no-subsampling boosting with an extra 'refresh' updater:
p1r <- modifyList(p1, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE)) p1r <- modifyList(p1, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE))
set.seed(11) set.seed(11)
@@ -57,7 +57,7 @@ test_that("updating the model works", {
# all should be the same when no subsampling # all should be the same when no subsampling
expect_equal(bst1$evaluation_log, bst1u$evaluation_log) expect_equal(bst1$evaluation_log, bst1u$evaluation_log)
expect_equal(tr1, tr1u, tolerance = 0.00001, check.attributes = FALSE) expect_equal(tr1, tr1u, tolerance = 0.00001, check.attributes = FALSE)
# process type 'update' for model with subsampling, refreshing only the tree stats from training data: # process type 'update' for model with subsampling, refreshing only the tree stats from training data:
p2u <- modifyList(p2, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE)) p2u <- modifyList(p2, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst2) bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst2)
@@ -72,7 +72,7 @@ test_that("updating the model works", {
if (!win32_flag) { if (!win32_flag) {
expect_equal(tr2r, tr2u, tolerance = 0.00001, check.attributes = FALSE) expect_equal(tr2r, tr2u, tolerance = 0.00001, check.attributes = FALSE)
} }
# process type 'update' for no-subsampling model, refreshing only the tree stats from TEST data: # process type 'update' for no-subsampling model, refreshing only the tree stats from TEST data:
p1ut <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE)) p1ut <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1) bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1)
@@ -93,12 +93,12 @@ test_that("updating works for multiclass & multitree", {
set.seed(121) set.seed(121)
bst0 <- xgb.train(p0, dtr, 5, watchlist, verbose = 0) bst0 <- xgb.train(p0, dtr, 5, watchlist, verbose = 0)
tr0 <- xgb.model.dt.tree(model = bst0) tr0 <- xgb.model.dt.tree(model = bst0)
# run update process for an original model with subsampling # run update process for an original model with subsampling
p0u <- modifyList(p0, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE)) p0u <- modifyList(p0, list(process_type='update', updater='refresh', refresh_leaf=FALSE))
bst0u <- xgb.train(p0u, dtr, nrounds = bst0$niter, watchlist, xgb_model = bst0, verbose = 0) bst0u <- xgb.train(p0u, dtr, nrounds = bst0$niter, watchlist, xgb_model = bst0, verbose = 0)
tr0u <- xgb.model.dt.tree(model = bst0u) tr0u <- xgb.model.dt.tree(model = bst0u)
# should be the same evaluation but different gains and larger cover # should be the same evaluation but different gains and larger cover
expect_equal(bst0$evaluation_log, bst0u$evaluation_log) expect_equal(bst0$evaluation_log, bst0u$evaluation_log)
expect_equal(tr0[Feature == 'Leaf']$Quality, tr0u[Feature == 'Leaf']$Quality) expect_equal(tr0[Feature == 'Leaf']$Quality, tr0u[Feature == 'Leaf']$Quality)

Some files were not shown because too many files have changed in this diff Show More