Compare commits
166 Commits
v1.1.1
...
release_1.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
00774eeac3 | ||
|
|
bcb15a980f | ||
|
|
0cd0dad0b5 | ||
|
|
884098ec22 | ||
|
|
738786680b | ||
|
|
04232c01b2 | ||
|
|
0353a78ab7 | ||
|
|
0089a0e6bf | ||
|
|
03a68a1714 | ||
|
|
a0da8a7e0a | ||
|
|
eee4eff49b | ||
|
|
936a854baa | ||
|
|
7856da5827 | ||
|
|
50a0def6c3 | ||
|
|
9116a0ec10 | ||
|
|
71197d1dfa | ||
|
|
5a2dcd1c33 | ||
|
|
bf2990e773 | ||
|
|
5f3c811e84 | ||
|
|
3fcfaad577 | ||
|
|
3b88bc948f | ||
|
|
70903c872f | ||
|
|
d268a2a463 | ||
|
|
fa3715f584 | ||
|
|
e4a273e1da | ||
|
|
071e10c1d1 | ||
|
|
f5fdcbe194 | ||
|
|
18349a7ccf | ||
|
|
75b8c22b0b | ||
|
|
5879acde9a | ||
|
|
8943eb4314 | ||
|
|
6347fa1c2e | ||
|
|
ace7fd328b | ||
|
|
40361043ae | ||
|
|
12110c900e | ||
|
|
487ab0ce73 | ||
|
|
a4de2f68e4 | ||
|
|
fbfbd525d8 | ||
|
|
4af857f95d | ||
|
|
bc1d3ee230 | ||
|
|
30363d9c35 | ||
|
|
66cc1e02aa | ||
|
|
627cf41a60 | ||
|
|
9b688aca3b | ||
|
|
8d7702766a | ||
|
|
03fb98fbde | ||
|
|
8b1afce316 | ||
|
|
b3d2e7644a | ||
|
|
ac9136ee49 | ||
|
|
6c0c87216f | ||
|
|
71b0528a2f | ||
|
|
7c2686146e | ||
|
|
e471056ec4 | ||
|
|
730866a7bc | ||
|
|
029a8b533f | ||
|
|
7aee0e51ed | ||
|
|
3c40f4a7f5 | ||
|
|
3cae287dea | ||
|
|
970b4b3fa2 | ||
|
|
e0c179c7cc | ||
|
|
dd445af56e | ||
|
|
9f85e92602 | ||
|
|
23e2c6ec91 | ||
|
|
1813804e36 | ||
|
|
0d411b0397 | ||
|
|
22a31b1faa | ||
|
|
06320729d4 | ||
|
|
d0a29c3135 | ||
|
|
a3ec964346 | ||
|
|
048d969be4 | ||
|
|
ac3f0e78dc | ||
|
|
93c44a9a64 | ||
|
|
4b0852ee41 | ||
|
|
0f17e35bce | ||
|
|
efe3e48ae2 | ||
|
|
1a0801238e | ||
|
|
4d277d750d | ||
|
|
eb067c1c34 | ||
|
|
90a9c68874 | ||
|
|
47c89775d6 | ||
|
|
95f11ed27e | ||
|
|
8234091368 | ||
|
|
dcff96ed27 | ||
|
|
8104f10328 | ||
|
|
26143ad0b1 | ||
|
|
c4d721200a | ||
|
|
a6d9a06b7b | ||
|
|
a67bc64819 | ||
|
|
abdf894fcf | ||
|
|
38ee514787 | ||
|
|
7c3a168ffd | ||
|
|
e8ecafb8dc | ||
|
|
b47b5ac771 | ||
|
|
02884b08aa | ||
|
|
ae18a094b0 | ||
|
|
d39da42e69 | ||
|
|
529b5c2cfd | ||
|
|
1bcbe1fc14 | ||
|
|
1fa84b61c1 | ||
|
|
306e38ff31 | ||
|
|
3028fa6b42 | ||
|
|
c35be9dc40 | ||
|
|
cb7f7e542c | ||
|
|
c96e1ef283 | ||
|
|
1d22a9be1c | ||
|
|
d087a12b04 | ||
|
|
b5ab009c19 | ||
|
|
cacff9232a | ||
|
|
bd9d57f579 | ||
|
|
359023c0fa | ||
|
|
cfc23c6a6b | ||
|
|
d3a0efbf16 | ||
|
|
cd3d14ad0e | ||
|
|
e49607af19 | ||
|
|
e533908922 | ||
|
|
0be0e6fd88 | ||
|
|
b77e3e3fcc | ||
|
|
325156c7a9 | ||
|
|
d19cec70f1 | ||
|
|
267c1ed784 | ||
|
|
073b625bde | ||
|
|
9e1b29944e | ||
|
|
057c762ecd | ||
|
|
251dc8a663 | ||
|
|
f779980f7e | ||
|
|
ca0d605b34 | ||
|
|
35e2205256 | ||
|
|
91c646392d | ||
|
|
fdbb6ae856 | ||
|
|
75a0025a3d | ||
|
|
78b4e95f25 | ||
|
|
e3aa7f1441 | ||
|
|
f145241593 | ||
|
|
8438c7d0e4 | ||
|
|
e35ad8a074 | ||
|
|
1ba24a7597 | ||
|
|
f656ef2fed | ||
|
|
5af8161a1a | ||
|
|
646def51e0 | ||
|
|
60511a3222 | ||
|
|
dd01e4ba8d | ||
|
|
e21a608552 | ||
|
|
7903286961 | ||
|
|
dd9aeb60ae | ||
|
|
83981a9ce3 | ||
|
|
535479e69f | ||
|
|
2c1a439869 | ||
|
|
4e64e2ef8e | ||
|
|
9ad40901a8 | ||
|
|
fcf57823b6 | ||
|
|
9910265064 | ||
|
|
21ed1f0c6d | ||
|
|
eaf2a00b5c | ||
|
|
67d267f9da | ||
|
|
33e052b1e5 | ||
|
|
8de7f1928e | ||
|
|
b9649e7b8e | ||
|
|
dfcdfabf1f | ||
|
|
c90457f489 | ||
|
|
7d93932423 | ||
|
|
8dfe7b3686 | ||
|
|
4fd95272c8 | ||
|
|
474cfddf91 | ||
|
|
a23de1c108 | ||
|
|
f68155de6c | ||
|
|
e726dd9902 |
138
.github/workflows/main.yml
vendored
Normal file
138
.github/workflows/main.yml
vendored
Normal file
@@ -0,0 +1,138 @@
|
||||
# This is a basic workflow to help you get started with Actions
|
||||
|
||||
name: XGBoost-CI
|
||||
|
||||
# Controls when the action will run. Triggers the workflow on push or pull request
|
||||
# events but only for the master branch
|
||||
on: [push, pull_request]
|
||||
|
||||
env:
|
||||
R_PACKAGES: c('XML', 'igraph', 'data.table', 'magrittr', 'stringi', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm', 'devtools')
|
||||
|
||||
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
|
||||
jobs:
|
||||
test-with-jvm:
|
||||
name: Test JVM on OS ${{ matrix.os }}
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [windows-latest, windows-2016, ubuntu-latest]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- uses: actions/setup-java@v1
|
||||
with:
|
||||
java-version: 1.8
|
||||
|
||||
- name: Cache Maven packages
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ~/.m2
|
||||
key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
|
||||
restore-keys: ${{ runner.os }}-m2
|
||||
|
||||
- name: Test JVM packages
|
||||
run: |
|
||||
cd jvm-packages
|
||||
mvn test -pl :xgboost4j_2.12
|
||||
|
||||
|
||||
lintr:
|
||||
runs-on: ${{ matrix.config.os }}
|
||||
|
||||
name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
config:
|
||||
- {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
|
||||
env:
|
||||
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
|
||||
RSPM: ${{ matrix.config.rspm }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- uses: r-lib/actions/setup-r@master
|
||||
with:
|
||||
r-version: ${{ matrix.config.r }}
|
||||
|
||||
- name: Cache R packages
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ env.R_LIBS_USER }}
|
||||
key: ${{ runner.os }}-r-${{ matrix.config.r }}-1-${{ hashFiles('R-package/DESCRIPTION') }}
|
||||
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-2-
|
||||
|
||||
- name: Install dependencies
|
||||
shell: Rscript {0}
|
||||
run: |
|
||||
install.packages(${{ env.R_PACKAGES }},
|
||||
repos = 'http://cloud.r-project.org',
|
||||
dependencies = c('Depends', 'Imports', 'LinkingTo'))
|
||||
|
||||
- name: Run lintr
|
||||
run: |
|
||||
cd R-package
|
||||
R.exe CMD INSTALL .
|
||||
Rscript.exe tests/helper_scripts/run_lint.R
|
||||
|
||||
|
||||
test-with-R:
|
||||
runs-on: ${{ matrix.config.os }}
|
||||
|
||||
name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- {os: windows-latest, r: 'release', compiler: 'msvc', build: 'autotools'}
|
||||
- {os: windows-2016, r: 'release', compiler: 'msvc', build: 'autotools'}
|
||||
- {os: windows-latest, r: 'release', compiler: 'msvc', build: 'cmake'}
|
||||
- {os: windows-2016, r: 'release', compiler: 'msvc', build: 'cmake'}
|
||||
- {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
|
||||
- {os: windows-2016, r: 'release', compiler: 'mingw', build: 'autotools'}
|
||||
- {os: windows-latest, r: 'release', compiler: 'mingw', build: 'cmake'}
|
||||
- {os: windows-2016, r: 'release', compiler: 'mingw', build: 'cmake'}
|
||||
env:
|
||||
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
|
||||
RSPM: ${{ matrix.config.rspm }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- uses: r-lib/actions/setup-r@master
|
||||
with:
|
||||
r-version: ${{ matrix.config.r }}
|
||||
|
||||
- name: Cache R packages
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ env.R_LIBS_USER }}
|
||||
key: ${{ runner.os }}-r-${{ matrix.config.r }}-1-${{ hashFiles('R-package/DESCRIPTION') }}
|
||||
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-2-
|
||||
|
||||
- name: Install dependencies
|
||||
shell: Rscript {0}
|
||||
run: |
|
||||
install.packages(${{ env.R_PACKAGES }},
|
||||
repos = 'http://cloud.r-project.org',
|
||||
dependencies = c('Depends', 'Imports', 'LinkingTo'))
|
||||
|
||||
- uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: '3.6' # Version range or exact version of a Python version to use, using SemVer's version range syntax
|
||||
architecture: 'x64' # optional x64 or x86. Defaults to x64 if not specified
|
||||
|
||||
- name: Test R
|
||||
run: |
|
||||
python tests/ci_build/test_r_package.py --compiler="${{ matrix.config.compiler }}" --build-tool="${{ matrix.config.build }}"
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -51,6 +51,7 @@ Debug
|
||||
#.Rbuildignore
|
||||
R-package.Rproj
|
||||
*.cache*
|
||||
.mypy_cache/
|
||||
# java
|
||||
java/xgboost4j/target
|
||||
java/xgboost4j/tmp
|
||||
@@ -92,6 +93,7 @@ metastore_db
|
||||
# files from R-package source install
|
||||
**/config.status
|
||||
R-package/src/Makevars
|
||||
*.lib
|
||||
|
||||
# Visual Studio Code
|
||||
/.vscode/
|
||||
|
||||
@@ -43,6 +43,7 @@ addons:
|
||||
- graphviz
|
||||
- openssl
|
||||
- libgit2
|
||||
- lz4
|
||||
- wget
|
||||
- r
|
||||
update: true
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
cmake_minimum_required(VERSION 3.13)
|
||||
project(xgboost LANGUAGES CXX C VERSION 1.1.1)
|
||||
project(xgboost LANGUAGES CXX C VERSION 1.2.1)
|
||||
include(cmake/Utils.cmake)
|
||||
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
|
||||
cmake_policy(SET CMP0022 NEW)
|
||||
cmake_policy(SET CMP0079 NEW)
|
||||
set(CMAKE_POLICY_DEFAULT_CMP0063 NEW)
|
||||
cmake_policy(SET CMP0063 NEW)
|
||||
|
||||
if ((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUAL 3.13))
|
||||
@@ -32,6 +33,9 @@ option(R_LIB "Build shared library for R package" OFF)
|
||||
## Dev
|
||||
option(USE_DEBUG_OUTPUT "Dump internal training results like gradients and predictions to stdout.
|
||||
Should only be used for debugging." OFF)
|
||||
option(FORCE_COLORED_OUTPUT "Force colored output from compilers, useful when ninja is used instead of make." OFF)
|
||||
option(ENABLE_ALL_WARNINGS "Enable all compiler warnings. Only effective for GCC/Clang" OFF)
|
||||
option(LOG_CAPI_INVOCATION "Log all C API invocations for debugging" OFF)
|
||||
option(GOOGLE_TEST "Build google tests" OFF)
|
||||
option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule" OFF)
|
||||
option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
|
||||
@@ -79,6 +83,11 @@ endif (R_LIB AND GOOGLE_TEST)
|
||||
if (USE_AVX)
|
||||
message(SEND_ERROR "The option 'USE_AVX' is deprecated as experimental AVX features have been removed from XGBoost.")
|
||||
endif (USE_AVX)
|
||||
if (ENABLE_ALL_WARNINGS)
|
||||
if ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
|
||||
message(SEND_ERROR "ENABLE_ALL_WARNINGS is only available for Clang and GCC.")
|
||||
endif ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
|
||||
endif (ENABLE_ALL_WARNINGS)
|
||||
|
||||
#-- Sanitizer
|
||||
if (USE_SANITIZER)
|
||||
@@ -93,11 +102,20 @@ if (USE_CUDA)
|
||||
message(STATUS "Configured CUDA host compiler: ${CMAKE_CUDA_HOST_COMPILER}")
|
||||
|
||||
enable_language(CUDA)
|
||||
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.0)
|
||||
message(FATAL_ERROR "CUDA version must be at least 10.0!")
|
||||
endif()
|
||||
set(GEN_CODE "")
|
||||
format_gencode_flags("${GPU_COMPUTE_VER}" GEN_CODE)
|
||||
message(STATUS "CUDA GEN_CODE: ${GEN_CODE}")
|
||||
endif (USE_CUDA)
|
||||
|
||||
if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
|
||||
((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
|
||||
(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always")
|
||||
endif()
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
if (USE_OPENMP)
|
||||
@@ -109,14 +127,28 @@ if (USE_OPENMP)
|
||||
find_package(OpenMP REQUIRED)
|
||||
endif (USE_OPENMP)
|
||||
|
||||
# core xgboost
|
||||
add_subdirectory(${xgboost_SOURCE_DIR}/src)
|
||||
|
||||
# dmlc-core
|
||||
msvc_use_static_runtime()
|
||||
add_subdirectory(${xgboost_SOURCE_DIR}/dmlc-core)
|
||||
set_target_properties(dmlc PROPERTIES
|
||||
CXX_STANDARD 11
|
||||
CXX_STANDARD 14
|
||||
CXX_STANDARD_REQUIRED ON
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
list(APPEND LINKED_LIBRARIES_PRIVATE dmlc)
|
||||
if (MSVC)
|
||||
target_compile_options(dmlc PRIVATE
|
||||
-D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE)
|
||||
if (TARGET dmlc_unit_tests)
|
||||
target_compile_options(dmlc_unit_tests PRIVATE
|
||||
-D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE)
|
||||
endif (TARGET dmlc_unit_tests)
|
||||
endif (MSVC)
|
||||
if (ENABLE_ALL_WARNINGS)
|
||||
target_compile_options(dmlc PRIVATE -Wall -Wextra)
|
||||
endif (ENABLE_ALL_WARNINGS)
|
||||
target_link_libraries(objxgboost PUBLIC dmlc)
|
||||
|
||||
# rabit
|
||||
set(RABIT_BUILD_DMLC OFF)
|
||||
@@ -125,18 +157,26 @@ set(RABIT_WITH_R_LIB ${R_LIB})
|
||||
add_subdirectory(rabit)
|
||||
|
||||
if (RABIT_MOCK)
|
||||
list(APPEND LINKED_LIBRARIES_PRIVATE rabit_mock_static)
|
||||
target_link_libraries(objxgboost PUBLIC rabit_mock_static)
|
||||
if (MSVC)
|
||||
target_compile_options(rabit_mock_static PRIVATE
|
||||
-D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE)
|
||||
endif (MSVC)
|
||||
else()
|
||||
list(APPEND LINKED_LIBRARIES_PRIVATE rabit)
|
||||
target_link_libraries(objxgboost PUBLIC rabit)
|
||||
if (MSVC)
|
||||
target_compile_options(rabit PRIVATE
|
||||
-D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE)
|
||||
endif (MSVC)
|
||||
endif(RABIT_MOCK)
|
||||
foreach(lib rabit rabit_base rabit_empty rabit_mock rabit_mock_static)
|
||||
# Explicitly link dmlc to rabit, so that configured header (build_config.h)
|
||||
# from dmlc is correctly applied to rabit.
|
||||
if (TARGET ${lib})
|
||||
target_link_libraries(${lib} dmlc ${CMAKE_THREAD_LIBS_INIT})
|
||||
if (HIDE_CXX_SYMBOLS) # Hide all C++ symbols from Rabit
|
||||
set_target_properties(${lib} PROPERTIES CXX_VISIBILITY_PRESET hidden)
|
||||
endif (HIDE_CXX_SYMBOLS)
|
||||
if (ENABLE_ALL_WARNINGS)
|
||||
target_compile_options(${lib} PRIVATE -Wall -Wextra)
|
||||
endif (ENABLE_ALL_WARNINGS)
|
||||
endif (TARGET ${lib})
|
||||
endforeach()
|
||||
|
||||
@@ -145,31 +185,32 @@ if (R_LIB)
|
||||
add_subdirectory(${xgboost_SOURCE_DIR}/R-package)
|
||||
endif (R_LIB)
|
||||
|
||||
# core xgboost
|
||||
list(APPEND LINKED_LIBRARIES_PRIVATE Threads::Threads ${CMAKE_THREAD_LIBS_INIT})
|
||||
# Plugin
|
||||
add_subdirectory(${xgboost_SOURCE_DIR}/plugin)
|
||||
add_subdirectory(${xgboost_SOURCE_DIR}/src)
|
||||
target_link_libraries(objxgboost PUBLIC dmlc)
|
||||
set(XGBOOST_OBJ_SOURCES "${XGBOOST_OBJ_SOURCES};$<TARGET_OBJECTS:objxgboost>")
|
||||
|
||||
#-- library
|
||||
if (BUILD_STATIC_LIB)
|
||||
add_library(xgboost STATIC ${XGBOOST_OBJ_SOURCES})
|
||||
add_library(xgboost STATIC)
|
||||
else (BUILD_STATIC_LIB)
|
||||
add_library(xgboost SHARED ${XGBOOST_OBJ_SOURCES})
|
||||
add_library(xgboost SHARED)
|
||||
endif (BUILD_STATIC_LIB)
|
||||
target_link_libraries(xgboost PRIVATE objxgboost)
|
||||
|
||||
if (USE_NVTX)
|
||||
enable_nvtx(xgboost)
|
||||
endif (USE_NVTX)
|
||||
|
||||
#-- Hide all C++ symbols
|
||||
if (HIDE_CXX_SYMBOLS)
|
||||
set_target_properties(objxgboost PROPERTIES CXX_VISIBILITY_PRESET hidden)
|
||||
set_target_properties(xgboost PROPERTIES CXX_VISIBILITY_PRESET hidden)
|
||||
foreach(target objxgboost xgboost dmlc rabit rabit_mock_static)
|
||||
set_target_properties(${target} PROPERTIES CXX_VISIBILITY_PRESET hidden)
|
||||
endforeach()
|
||||
endif (HIDE_CXX_SYMBOLS)
|
||||
|
||||
target_include_directories(xgboost
|
||||
INTERFACE
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/include>
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include>)
|
||||
target_link_libraries(xgboost PRIVATE ${LINKED_LIBRARIES_PRIVATE})
|
||||
|
||||
# This creates its own shared library `xgboost4j'.
|
||||
if (JVM_BINDINGS)
|
||||
@@ -178,18 +219,21 @@ endif (JVM_BINDINGS)
|
||||
#-- End shared library
|
||||
|
||||
#-- CLI for xgboost
|
||||
add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc ${XGBOOST_OBJ_SOURCES})
|
||||
add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc)
|
||||
target_link_libraries(runxgboost PRIVATE objxgboost)
|
||||
if (USE_NVTX)
|
||||
enable_nvtx(runxgboost)
|
||||
endif (USE_NVTX)
|
||||
|
||||
target_include_directories(runxgboost
|
||||
PRIVATE
|
||||
${xgboost_SOURCE_DIR}/include
|
||||
${xgboost_SOURCE_DIR}/dmlc-core/include
|
||||
${xgboost_SOURCE_DIR}/rabit/include)
|
||||
target_link_libraries(runxgboost PRIVATE ${LINKED_LIBRARIES_PRIVATE})
|
||||
set_target_properties(
|
||||
runxgboost PROPERTIES
|
||||
OUTPUT_NAME xgboost
|
||||
CXX_STANDARD 11
|
||||
CXX_STANDARD 14
|
||||
CXX_STANDARD_REQUIRED ON)
|
||||
#-- End CLI for xgboost
|
||||
|
||||
@@ -200,11 +244,12 @@ add_dependencies(xgboost runxgboost)
|
||||
|
||||
#-- Installing XGBoost
|
||||
if (R_LIB)
|
||||
include(cmake/RPackageInstallTargetSetup.cmake)
|
||||
set_target_properties(xgboost PROPERTIES PREFIX "")
|
||||
if (APPLE)
|
||||
set_target_properties(xgboost PROPERTIES SUFFIX ".so")
|
||||
endif (APPLE)
|
||||
setup_rpackage_install_target(xgboost ${CMAKE_CURRENT_BINARY_DIR})
|
||||
setup_rpackage_install_target(xgboost "${CMAKE_CURRENT_BINARY_DIR}/R-package-install")
|
||||
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/dummy_inst")
|
||||
endif (R_LIB)
|
||||
if (MINGW)
|
||||
|
||||
@@ -10,8 +10,8 @@ The Project Management Committee(PMC) consists group of active committers that m
|
||||
- Tianqi is a Ph.D. student working on large-scale machine learning. He is the creator of the project.
|
||||
* [Michael Benesty](https://github.com/pommedeterresautee)
|
||||
- Michael is a lawyer and data scientist in France. He is the creator of XGBoost interactive analysis module in R.
|
||||
* [Yuan Tang](https://github.com/terrytangyuan), Ant Financial
|
||||
- Yuan is a software engineer in Ant Financial. He contributed mostly in R and Python packages.
|
||||
* [Yuan Tang](https://github.com/terrytangyuan), Ant Group
|
||||
- Yuan is a software engineer in Ant Group. He contributed mostly in R and Python packages.
|
||||
* [Nan Zhu](https://github.com/CodingCat), Uber
|
||||
- Nan is a software engineer in Uber. He contributed mostly in JVM packages.
|
||||
* [Jiaming Yuan](https://github.com/trivialfis)
|
||||
@@ -37,6 +37,8 @@ Committers are people who have made substantial contribution to the project and
|
||||
- Sergei is a software engineer in Criteo. He contributed mostly in JVM packages.
|
||||
* [Scott Lundberg](http://scottlundberg.com/), University of Washington
|
||||
- Scott is a Ph.D. student at University of Washington. He is the creator of SHAP, a unified approach to explain the output of machine learning models such as decision tree ensembles. He also helps maintain the XGBoost Julia package.
|
||||
* [Egor Smirnov](https://github.com/SmirnovEgorRu), Intel
|
||||
- Egor has led a major effort to improve the performance of XGBoost on multi-core CPUs.
|
||||
|
||||
|
||||
Become a Committer
|
||||
|
||||
176
Jenkinsfile
vendored
176
Jenkinsfile
vendored
@@ -6,6 +6,9 @@
|
||||
// Command to run command inside a docker container
|
||||
dockerRun = 'tests/ci_build/ci_build.sh'
|
||||
|
||||
// Which CUDA version to use when building reference distribution wheel
|
||||
ref_cuda_ver = '10.0'
|
||||
|
||||
import groovy.transform.Field
|
||||
|
||||
@Field
|
||||
@@ -31,13 +34,14 @@ pipeline {
|
||||
|
||||
// Build stages
|
||||
stages {
|
||||
stage('Jenkins Linux: Get sources') {
|
||||
agent { label 'linux && cpu' }
|
||||
stage('Jenkins Linux: Initialize') {
|
||||
agent { label 'job_initializer' }
|
||||
steps {
|
||||
script {
|
||||
checkoutSrcs()
|
||||
commit_id = "${GIT_COMMIT}"
|
||||
}
|
||||
sh 'python3 tests/jenkins_get_approval.py'
|
||||
stash name: 'srcs'
|
||||
milestone ordinal: 1
|
||||
}
|
||||
@@ -48,6 +52,7 @@ pipeline {
|
||||
script {
|
||||
parallel ([
|
||||
'clang-tidy': { ClangTidy() },
|
||||
'lint': { Lint() },
|
||||
'sphinx-doc': { SphinxDoc() },
|
||||
'doxygen': { Doxygen() }
|
||||
])
|
||||
@@ -63,9 +68,15 @@ pipeline {
|
||||
'build-cpu': { BuildCPU() },
|
||||
'build-cpu-rabit-mock': { BuildCPUMock() },
|
||||
'build-cpu-non-omp': { BuildCPUNonOmp() },
|
||||
// Build reference, distribution-ready Python wheel with CUDA 10.0
|
||||
// using CentOS 6 image
|
||||
'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') },
|
||||
// The build-gpu-* builds below use Ubuntu image
|
||||
'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
|
||||
'build-jvm-packages': { BuildJVMPackages(spark_version: '2.4.3') },
|
||||
'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2') },
|
||||
'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0') },
|
||||
'build-jvm-packages-gpu-cuda10.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.0', cuda_version: '10.0') },
|
||||
'build-jvm-packages': { BuildJVMPackages(spark_version: '3.0.0') },
|
||||
'build-jvm-doc': { BuildJVMDoc() }
|
||||
])
|
||||
}
|
||||
@@ -78,13 +89,14 @@ pipeline {
|
||||
script {
|
||||
parallel ([
|
||||
'test-python-cpu': { TestPythonCPU() },
|
||||
'test-python-gpu-cuda9.0': { TestPythonGPU(cuda_version: '9.0') },
|
||||
'test-python-gpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0') },
|
||||
'test-python-gpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1') },
|
||||
'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) },
|
||||
'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') },
|
||||
'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) },
|
||||
'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') },
|
||||
'test-python-gpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2') },
|
||||
'test-python-gpu-cuda11.0-cross': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '11.0') },
|
||||
'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') },
|
||||
'test-python-mgpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.2', multi_gpu: true) },
|
||||
'test-cpp-gpu-cuda10.2': { TestCppGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2') },
|
||||
'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') },
|
||||
'test-jvm-jdk8-cuda10.0': { CrossTestJVMwithJDKGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.0') },
|
||||
'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '3.0.0') },
|
||||
'test-jvm-jdk11': { CrossTestJVMwithJDK(jdk_version: '11') },
|
||||
'test-jvm-jdk12': { CrossTestJVMwithJDK(jdk_version: '12') },
|
||||
'test-r-3.5.3': { TestR(use_r35: true) }
|
||||
@@ -98,7 +110,7 @@ pipeline {
|
||||
steps {
|
||||
script {
|
||||
parallel ([
|
||||
'deploy-jvm-packages': { DeployJVMPackages(spark_version: '2.4.3') }
|
||||
'deploy-jvm-packages': { DeployJVMPackages(spark_version: '3.0.0') }
|
||||
])
|
||||
}
|
||||
milestone ordinal: 5
|
||||
@@ -122,13 +134,17 @@ def checkoutSrcs() {
|
||||
}
|
||||
}
|
||||
|
||||
def GetCUDABuildContainerType(cuda_version) {
|
||||
return (cuda_version == ref_cuda_ver) ? 'gpu_build_centos6' : 'gpu_build'
|
||||
}
|
||||
|
||||
def ClangTidy() {
|
||||
node('linux && cpu') {
|
||||
node('linux && cpu_build') {
|
||||
unstash name: 'srcs'
|
||||
echo "Running clang-tidy job..."
|
||||
def container_type = "clang_tidy"
|
||||
def docker_binary = "docker"
|
||||
def dockerArgs = "--build-arg CUDA_VERSION=10.1"
|
||||
def dockerArgs = "--build-arg CUDA_VERSION_ARG=10.1"
|
||||
sh """
|
||||
${dockerRun} ${container_type} ${docker_binary} ${dockerArgs} python3 tests/ci_build/tidy.py
|
||||
"""
|
||||
@@ -143,7 +159,7 @@ def Lint() {
|
||||
def container_type = "cpu"
|
||||
def docker_binary = "docker"
|
||||
sh """
|
||||
${dockerRun} ${container_type} ${docker_binary} make lint
|
||||
${dockerRun} ${container_type} ${docker_binary} bash -c "source activate cpu_test && make lint"
|
||||
"""
|
||||
deleteDir()
|
||||
}
|
||||
@@ -157,7 +173,7 @@ def SphinxDoc() {
|
||||
def docker_binary = "docker"
|
||||
def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e SPHINX_GIT_BRANCH=${BRANCH_NAME}'"
|
||||
sh """#!/bin/bash
|
||||
${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} make -C doc html
|
||||
${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} bash -c "source activate cpu_test && make -C doc html"
|
||||
"""
|
||||
deleteDir()
|
||||
}
|
||||
@@ -172,8 +188,10 @@ def Doxygen() {
|
||||
sh """
|
||||
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/doxygen.sh ${BRANCH_NAME}
|
||||
"""
|
||||
echo 'Uploading doc...'
|
||||
s3Upload file: "build/${BRANCH_NAME}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "doxygen/${BRANCH_NAME}.tar.bz2"
|
||||
if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
|
||||
echo 'Uploading doc...'
|
||||
s3Upload file: "build/${BRANCH_NAME}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "doxygen/${BRANCH_NAME}.tar.bz2"
|
||||
}
|
||||
deleteDir()
|
||||
}
|
||||
}
|
||||
@@ -189,7 +207,7 @@ def BuildCPU() {
|
||||
# This step is not necessary, but here we include it, to ensure that DMLC_CORE_USE_CMAKE flag is correctly propagated
|
||||
# We want to make sure that we use the configured header build/dmlc/build_config.h instead of include/dmlc/build_config_default.h.
|
||||
# See discussion at https://github.com/dmlc/xgboost/issues/5510
|
||||
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh
|
||||
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh -DPLUGIN_LZ4=ON -DPLUGIN_DENSE_PARSER=ON
|
||||
${dockerRun} ${container_type} ${docker_binary} build/testxgboost
|
||||
"""
|
||||
// Sanitizer test
|
||||
@@ -238,26 +256,52 @@ def BuildCPUNonOmp() {
|
||||
}
|
||||
|
||||
def BuildCUDA(args) {
|
||||
node('linux && cpu') {
|
||||
node('linux && cpu_build') {
|
||||
unstash name: 'srcs'
|
||||
echo "Build with CUDA ${args.cuda_version}"
|
||||
def container_type = "gpu_build"
|
||||
def container_type = GetCUDABuildContainerType(args.cuda_version)
|
||||
def docker_binary = "docker"
|
||||
def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}"
|
||||
def docker_args = "--build-arg CUDA_VERSION_ARG=${args.cuda_version}"
|
||||
def arch_flag = ""
|
||||
if (env.BRANCH_NAME != 'master' && !(env.BRANCH_NAME.startsWith('release'))) {
|
||||
arch_flag = "-DGPU_COMPUTE_VER=75"
|
||||
}
|
||||
sh """
|
||||
${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON -DOPEN_MP:BOOL=ON -DHIDE_CXX_SYMBOLS=ON
|
||||
${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON -DOPEN_MP:BOOL=ON -DHIDE_CXX_SYMBOLS=ON ${arch_flag}
|
||||
${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
|
||||
${dockerRun} ${container_type} ${docker_binary} ${docker_args} python3 tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} manylinux2010_x86_64
|
||||
${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} manylinux2010_x86_64
|
||||
"""
|
||||
// Stash wheel for CUDA 10.0 target
|
||||
if (args.cuda_version == '10.0') {
|
||||
echo 'Stashing Python wheel...'
|
||||
stash name: 'xgboost_whl_cuda10', includes: 'python-package/dist/*.whl'
|
||||
echo 'Stashing Python wheel...'
|
||||
stash name: "xgboost_whl_cuda${args.cuda_version}", includes: 'python-package/dist/*.whl'
|
||||
if (args.cuda_version == ref_cuda_ver && (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release'))) {
|
||||
echo 'Uploading Python wheel...'
|
||||
path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/"
|
||||
s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', workingDir: 'python-package/dist', includePathPattern:'**/*.whl'
|
||||
echo 'Stashing C++ test executable (testxgboost)...'
|
||||
stash name: 'xgboost_cpp_tests', includes: 'build/testxgboost'
|
||||
}
|
||||
echo 'Stashing C++ test executable (testxgboost)...'
|
||||
stash name: "xgboost_cpp_tests_cuda${args.cuda_version}", includes: 'build/testxgboost'
|
||||
deleteDir()
|
||||
}
|
||||
}
|
||||
|
||||
def BuildJVMPackagesWithCUDA(args) {
|
||||
node('linux && mgpu') {
|
||||
unstash name: 'srcs'
|
||||
echo "Build XGBoost4J-Spark with Spark ${args.spark_version}, CUDA ${args.cuda_version}"
|
||||
def container_type = "jvm_gpu_build"
|
||||
def docker_binary = "nvidia-docker"
|
||||
def docker_args = "--build-arg CUDA_VERSION_ARG=${args.cuda_version}"
|
||||
def arch_flag = ""
|
||||
if (env.BRANCH_NAME != 'master' && !(env.BRANCH_NAME.startsWith('release'))) {
|
||||
arch_flag = "-DGPU_COMPUTE_VER=75"
|
||||
}
|
||||
// Use only 4 CPU cores
|
||||
def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='--cpuset-cpus 0-3'"
|
||||
sh """
|
||||
${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_jvm_packages.sh ${args.spark_version} -Duse.cuda=ON $arch_flag
|
||||
"""
|
||||
echo "Stashing XGBoost4J JAR with CUDA ${args.cuda_version} ..."
|
||||
stash name: 'xgboost4j_jar_gpu', includes: "jvm-packages/xgboost4j/target/*.jar,jvm-packages/xgboost4j-spark/target/*.jar,jvm-packages/xgboost4j-example/target/*.jar"
|
||||
deleteDir()
|
||||
}
|
||||
}
|
||||
@@ -288,15 +332,17 @@ def BuildJVMDoc() {
|
||||
sh """
|
||||
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_jvm_doc.sh ${BRANCH_NAME}
|
||||
"""
|
||||
echo 'Uploading doc...'
|
||||
s3Upload file: "jvm-packages/${BRANCH_NAME}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "${BRANCH_NAME}.tar.bz2"
|
||||
if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
|
||||
echo 'Uploading doc...'
|
||||
s3Upload file: "jvm-packages/${BRANCH_NAME}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "${BRANCH_NAME}.tar.bz2"
|
||||
}
|
||||
deleteDir()
|
||||
}
|
||||
}
|
||||
|
||||
def TestPythonCPU() {
|
||||
node('linux && cpu') {
|
||||
unstash name: 'xgboost_whl_cuda10'
|
||||
unstash name: "xgboost_whl_cuda${ref_cuda_ver}"
|
||||
unstash name: 'srcs'
|
||||
unstash name: 'xgboost_cli'
|
||||
echo "Test Python CPU"
|
||||
@@ -304,45 +350,35 @@ def TestPythonCPU() {
|
||||
def docker_binary = "docker"
|
||||
sh """
|
||||
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/test_python.sh cpu
|
||||
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/test_python.sh cpu-py35
|
||||
"""
|
||||
deleteDir()
|
||||
}
|
||||
}
|
||||
|
||||
def TestPythonGPU(args) {
|
||||
nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
|
||||
def nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
|
||||
def artifact_cuda_version = (args.artifact_cuda_version) ?: ref_cuda_ver
|
||||
node(nodeReq) {
|
||||
unstash name: 'xgboost_whl_cuda10'
|
||||
unstash name: "xgboost_whl_cuda${artifact_cuda_version}"
|
||||
unstash name: "xgboost_cpp_tests_cuda${artifact_cuda_version}"
|
||||
unstash name: 'srcs'
|
||||
echo "Test Python GPU: CUDA ${args.cuda_version}"
|
||||
echo "Test Python GPU: CUDA ${args.host_cuda_version}"
|
||||
def container_type = "gpu"
|
||||
def docker_binary = "nvidia-docker"
|
||||
def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}"
|
||||
def docker_args = "--build-arg CUDA_VERSION_ARG=${args.host_cuda_version}"
|
||||
if (args.multi_gpu) {
|
||||
echo "Using multiple GPUs"
|
||||
// Allocate extra space in /dev/shm to enable NCCL
|
||||
def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'"
|
||||
sh """
|
||||
${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh mgpu
|
||||
${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh mgpu
|
||||
"""
|
||||
if (args.cuda_version != '9.0') {
|
||||
echo "Running tests with cuDF..."
|
||||
sh """
|
||||
${dockerRun} cudf ${docker_binary} ${docker_args} tests/ci_build/test_python.sh mgpu-cudf
|
||||
"""
|
||||
}
|
||||
} else {
|
||||
echo "Using a single GPU"
|
||||
sh """
|
||||
${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh gpu
|
||||
"""
|
||||
if (args.cuda_version != '9.0') {
|
||||
echo "Running tests with cuDF..."
|
||||
sh """
|
||||
${dockerRun} cudf ${docker_binary} ${docker_args} tests/ci_build/test_python.sh cudf
|
||||
"""
|
||||
}
|
||||
}
|
||||
// For CUDA 10.0 target, run cuDF tests too
|
||||
deleteDir()
|
||||
}
|
||||
}
|
||||
@@ -362,21 +398,34 @@ def TestCppRabit() {
|
||||
}
|
||||
|
||||
def TestCppGPU(args) {
|
||||
nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
|
||||
def nodeReq = 'linux && mgpu'
|
||||
def artifact_cuda_version = (args.artifact_cuda_version) ?: ref_cuda_ver
|
||||
node(nodeReq) {
|
||||
unstash name: 'xgboost_cpp_tests'
|
||||
unstash name: "xgboost_cpp_tests_cuda${artifact_cuda_version}"
|
||||
unstash name: 'srcs'
|
||||
echo "Test C++, CUDA ${args.cuda_version}"
|
||||
echo "Test C++, CUDA ${args.host_cuda_version}"
|
||||
def container_type = "gpu"
|
||||
def docker_binary = "nvidia-docker"
|
||||
def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}"
|
||||
if (args.multi_gpu) {
|
||||
echo "Using multiple GPUs"
|
||||
sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost --gtest_filter=*.MGPU_*"
|
||||
def docker_args = "--build-arg CUDA_VERSION_ARG=${args.host_cuda_version}"
|
||||
sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost"
|
||||
deleteDir()
|
||||
}
|
||||
}
|
||||
|
||||
def CrossTestJVMwithJDKGPU(args) {
|
||||
def nodeReq = 'linux && mgpu'
|
||||
node(nodeReq) {
|
||||
unstash name: "xgboost4j_jar_gpu"
|
||||
unstash name: 'srcs'
|
||||
if (args.spark_version != null) {
|
||||
echo "Test XGBoost4J on a machine with JDK ${args.jdk_version}, Spark ${args.spark_version}, CUDA ${args.host_cuda_version}"
|
||||
} else {
|
||||
echo "Using a single GPU"
|
||||
sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost --gtest_filter=-*.MGPU_*"
|
||||
echo "Test XGBoost4J on a machine with JDK ${args.jdk_version}, CUDA ${args.host_cuda_version}"
|
||||
}
|
||||
def container_type = "gpu_jvm"
|
||||
def docker_binary = "nvidia-docker"
|
||||
def docker_args = "--build-arg CUDA_VERSION_ARG=${args.host_cuda_version}"
|
||||
sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_jvm_gpu_cross.sh"
|
||||
deleteDir()
|
||||
}
|
||||
}
|
||||
@@ -423,10 +472,11 @@ def DeployJVMPackages(args) {
|
||||
unstash name: 'srcs'
|
||||
if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
|
||||
echo 'Deploying to xgboost-maven-repo S3 repo...'
|
||||
def container_type = "jvm"
|
||||
def docker_binary = "docker"
|
||||
sh """
|
||||
${dockerRun} ${container_type} ${docker_binary} tests/ci_build/deploy_jvm_packages.sh ${args.spark_version}
|
||||
${dockerRun} jvm docker tests/ci_build/deploy_jvm_packages.sh ${args.spark_version} 0
|
||||
"""
|
||||
sh """
|
||||
${dockerRun} jvm_gpu_build docker --build-arg CUDA_VERSION_ARG=10.0 tests/ci_build/deploy_jvm_packages.sh ${args.spark_version} 1
|
||||
"""
|
||||
}
|
||||
deleteDir()
|
||||
|
||||
@@ -10,15 +10,25 @@ def commit_id // necessary to pass a variable from one stage to another
|
||||
|
||||
pipeline {
|
||||
agent none
|
||||
|
||||
// Setup common job properties
|
||||
options {
|
||||
timestamps()
|
||||
timeout(time: 240, unit: 'MINUTES')
|
||||
buildDiscarder(logRotator(numToKeepStr: '10'))
|
||||
preserveStashes()
|
||||
}
|
||||
|
||||
// Build stages
|
||||
stages {
|
||||
stage('Jenkins Win64: Get sources') {
|
||||
agent { label 'win64 && build' }
|
||||
stage('Jenkins Win64: Initialize') {
|
||||
agent { label 'job_initializer' }
|
||||
steps {
|
||||
script {
|
||||
checkoutSrcs()
|
||||
commit_id = "${GIT_COMMIT}"
|
||||
}
|
||||
sh 'python3 tests/jenkins_get_approval.py'
|
||||
stash name: 'srcs'
|
||||
milestone ordinal: 1
|
||||
}
|
||||
@@ -28,7 +38,7 @@ pipeline {
|
||||
steps {
|
||||
script {
|
||||
parallel ([
|
||||
'build-win64-cuda10.0': { BuildWin64() }
|
||||
'build-win64-cuda10.1': { BuildWin64() }
|
||||
])
|
||||
}
|
||||
milestone ordinal: 2
|
||||
@@ -39,9 +49,7 @@ pipeline {
|
||||
steps {
|
||||
script {
|
||||
parallel ([
|
||||
'test-win64-cpu': { TestWin64CPU() },
|
||||
'test-win64-gpu-cuda10.0': { TestWin64GPU(cuda_target: 'cuda10_0') },
|
||||
'test-win64-gpu-cuda10.1': { TestWin64GPU(cuda_target: 'cuda10_1') }
|
||||
'test-win64-cuda10.1': { TestWin64() },
|
||||
])
|
||||
}
|
||||
milestone ordinal: 3
|
||||
@@ -66,14 +74,18 @@ def checkoutSrcs() {
|
||||
}
|
||||
|
||||
def BuildWin64() {
|
||||
node('win64 && build && cuda10') {
|
||||
node('win64 && cuda10_unified') {
|
||||
unstash name: 'srcs'
|
||||
echo "Building XGBoost for Windows AMD64 target..."
|
||||
bat "nvcc --version"
|
||||
def arch_flag = ""
|
||||
if (env.BRANCH_NAME != 'master' && !(env.BRANCH_NAME.startsWith('release'))) {
|
||||
arch_flag = "-DGPU_COMPUTE_VER=75"
|
||||
}
|
||||
bat """
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -G"Visual Studio 15 2017 Win64" -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON
|
||||
cmake .. -G"Visual Studio 15 2017 Win64" -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON ${arch_flag}
|
||||
"""
|
||||
bat """
|
||||
cd build
|
||||
@@ -91,8 +103,11 @@ def BuildWin64() {
|
||||
"""
|
||||
echo 'Stashing Python wheel...'
|
||||
stash name: 'xgboost_whl', includes: 'python-package/dist/*.whl'
|
||||
path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/"
|
||||
s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', workingDir: 'python-package/dist', includePathPattern:'**/*.whl'
|
||||
if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
|
||||
echo 'Uploading Python wheel...'
|
||||
path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/"
|
||||
s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', workingDir: 'python-package/dist', includePathPattern:'**/*.whl'
|
||||
}
|
||||
echo 'Stashing C++ test executable (testxgboost)...'
|
||||
stash name: 'xgboost_cpp_tests', includes: 'build/testxgboost.exe'
|
||||
stash name: 'xgboost_cli', includes: 'xgboost.exe'
|
||||
@@ -100,51 +115,29 @@ def BuildWin64() {
|
||||
}
|
||||
}
|
||||
|
||||
def TestWin64CPU() {
|
||||
node('win64 && cpu') {
|
||||
def TestWin64() {
|
||||
node('win64 && cuda10_unified') {
|
||||
unstash name: 'srcs'
|
||||
unstash name: 'xgboost_whl'
|
||||
unstash name: 'xgboost_cli'
|
||||
echo "Test Win64 CPU"
|
||||
echo "Installing Python wheel..."
|
||||
bat "conda activate && (python -m pip uninstall -y xgboost || cd .)"
|
||||
bat """
|
||||
conda activate && for /R %%i in (python-package\\dist\\*.whl) DO python -m pip install "%%i"
|
||||
"""
|
||||
echo "Installing Python dependencies..."
|
||||
bat """
|
||||
conda activate && conda upgrade scikit-learn pandas numpy
|
||||
"""
|
||||
echo "Running Python tests..."
|
||||
bat "conda activate && python -m pytest -v -s --fulltrace tests\\python"
|
||||
bat "conda activate && python -m pip uninstall -y xgboost"
|
||||
deleteDir()
|
||||
}
|
||||
}
|
||||
|
||||
def TestWin64GPU(args) {
|
||||
node("win64 && gpu && ${args.cuda_target}") {
|
||||
unstash name: 'srcs'
|
||||
unstash name: 'xgboost_whl'
|
||||
unstash name: 'xgboost_cpp_tests'
|
||||
echo "Test Win64 GPU (${args.cuda_target})"
|
||||
echo "Test Win64"
|
||||
bat "nvcc --version"
|
||||
echo "Running C++ tests..."
|
||||
bat "build\\testxgboost.exe"
|
||||
echo "Installing Python wheel..."
|
||||
bat "conda activate && (python -m pip uninstall -y xgboost || cd .)"
|
||||
bat """
|
||||
conda activate && for /R %%i in (python-package\\dist\\*.whl) DO python -m pip install "%%i"
|
||||
"""
|
||||
echo "Installing Python dependencies..."
|
||||
def env_name = 'win64_' + UUID.randomUUID().toString().replaceAll('-', '')
|
||||
bat "conda env create -n ${env_name} --file=tests/ci_build/conda_env/win64_test.yml"
|
||||
echo "Installing Python wheel..."
|
||||
bat """
|
||||
conda activate && conda upgrade scikit-learn pandas numpy
|
||||
conda activate ${env_name} && for /R %%i in (python-package\\dist\\*.whl) DO python -m pip install "%%i"
|
||||
"""
|
||||
echo "Running Python tests..."
|
||||
bat "conda activate ${env_name} && python -m pytest -v -s -rxXs --fulltrace tests\\python"
|
||||
bat """
|
||||
conda activate && python -m pytest -v -s --fulltrace -m "(not slow) and (not mgpu)" tests\\python-gpu
|
||||
conda activate ${env_name} && python -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)" tests\\python-gpu
|
||||
"""
|
||||
bat "conda activate && python -m pip uninstall -y xgboost"
|
||||
bat "conda env remove --name ${env_name}"
|
||||
deleteDir()
|
||||
}
|
||||
}
|
||||
|
||||
7
Makefile
7
Makefile
@@ -44,7 +44,7 @@ export CXX = g++
|
||||
endif
|
||||
endif
|
||||
|
||||
export CFLAGS= -DDMLC_LOG_CUSTOMIZE=1 -std=c++11 -Wall -Wno-unknown-pragmas -Iinclude $(ADD_CFLAGS)
|
||||
export CFLAGS= -DDMLC_LOG_CUSTOMIZE=1 -std=c++14 -Wall -Wno-unknown-pragmas -Iinclude $(ADD_CFLAGS)
|
||||
CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include -I$(GTEST_PATH)/include
|
||||
|
||||
ifeq ($(TEST_COVER), 1)
|
||||
@@ -133,15 +133,16 @@ Rpack: clean_all
|
||||
sed -i -e 's/@BACKTRACE_LIB@//g' xgboost/src/Makevars.win
|
||||
sed -i -e 's/@OPENMP_LIB@//g' xgboost/src/Makevars.win
|
||||
rm -f xgboost/src/Makevars.win-e # OSX sed create this extra file; remove it
|
||||
bash R-package/remove_warning_suppression_pragma.sh
|
||||
bash xgboost/remove_warning_suppression_pragma.sh
|
||||
rm xgboost/remove_warning_suppression_pragma.sh
|
||||
rm -rfv xgboost/tests/helper_scripts/
|
||||
|
||||
Rbuild: Rpack
|
||||
R CMD build --no-build-vignettes xgboost
|
||||
rm -rf xgboost
|
||||
|
||||
Rcheck: Rbuild
|
||||
R CMD check xgboost*.tar.gz
|
||||
R CMD check --as-cran xgboost*.tar.gz
|
||||
|
||||
-include build/*.d
|
||||
-include build/*/*.d
|
||||
|
||||
200
NEWS.md
200
NEWS.md
@@ -3,6 +3,206 @@ XGBoost Change Log
|
||||
|
||||
This file records the changes in xgboost library in reverse chronological order.
|
||||
|
||||
## v1.1.0 (2020.05.17)
|
||||
|
||||
### Better performance on multi-core CPUs (#5244, #5334, #5522)
|
||||
* Poor performance scaling of the `hist` algorithm for multi-core CPUs has been under investigation (#3810). #5244 concludes the ongoing effort to improve performance scaling on multi-CPUs, in particular Intel CPUs. Roadmap: #5104
|
||||
* #5334 makes steps toward reducing memory consumption for the `hist` tree method on CPU.
|
||||
* #5522 optimizes random number generation for data sampling.
|
||||
|
||||
### Deterministic GPU algorithm for regression and classification (#5361)
|
||||
* GPU algorithm for regression and classification tasks is now deterministic.
|
||||
* Roadmap: #5023. Currently only single-GPU training is deterministic. Distributed training with multiple GPUs is not yet deterministic.
|
||||
|
||||
### Improve external memory support on GPUs (#5093, #5365)
|
||||
* Starting from 1.0.0 release, we added support for external memory on GPUs to enable training with larger datasets. Gradient-based sampling (#5093) speeds up the external memory algorithm by intelligently sampling a subset of the training data to copy into the GPU memory. [Learn more about out-of-core GPU gradient boosting.](https://arxiv.org/abs/2005.09148)
|
||||
* GPU-side data sketching now works with data from external memory (#5365).
|
||||
|
||||
### Parameter validation: detection of unused or incorrect parameters (#5477, #5569, #5508)
|
||||
* Mis-spelled training parameter is a common user mistake. In previous versions of XGBoost, mis-spelled parameters were silently ignored. Starting with 1.0.0 release, XGBoost will produce a warning message if there is any unused training parameters. The 1.1.0 release makes parameter validation available to the scikit-learn interface (#5477) and the R binding (#5569).
|
||||
|
||||
### Thread-safe, in-place prediction method (#5389, #5512)
|
||||
* Previously, the prediction method was not thread-safe (#5339). This release adds a new API function `inplace_predict()` that is thread-safe. It is now possible to serve concurrent requests for prediction using a shared model object.
|
||||
* It is now possible to compute prediction in-place for selected data formats (`numpy.ndarray` / `scipy.sparse.csr_matrix` / `cupy.ndarray` / `cudf.DataFrame` / `pd.DataFrame`) without creating a `DMatrix` object.
|
||||
|
||||
### Addition of Accelerated Failure Time objective for survival analysis (#4763, #5473, #5486, #5552, #5553)
|
||||
* Survival analysis (regression) models the time it takes for an event of interest to occur. The target label is potentially censored, i.e. the label is a range rather than a single number. We added a new objective `survival:aft` to support survival analysis. Also added is the new API to specify the ranged labels. Check out [the tutorial](https://xgboost.readthedocs.io/en/release_1.1.0/tutorials/aft_survival_analysis.html) and the [demos](https://github.com/dmlc/xgboost/tree/release_1.1.0/demo/aft_survival).
|
||||
* GPU support is work in progress (#5714).
|
||||
|
||||
### Improved installation experience on Mac OSX (#5597, #5602, #5606, #5701)
|
||||
* It only takes two commands to install the XGBoost Python package: `brew install libomp` followed by `pip install xgboost`. The installed XGBoost will use all CPU cores. Even better, starting with this release, we distribute pre-compiled binary wheels targeting Mac OSX. Now the install command `pip install xgboost` finishes instantly, as it no longer compiles the C++ source of XGBoost. The last three Mac versions (High Sierra, Mojave, Catalina) are supported.
|
||||
* R package: the 1.1.0 release fixes the error `Initializing libomp.dylib, but found libomp.dylib already initialized` (#5701)
|
||||
|
||||
### Ranking metrics are now accelerated on GPUs (#5380, #5387, #5398)
|
||||
|
||||
### GPU-side data matrix to ingest data directly from other GPU libraries (#5420, #5465)
|
||||
* Previously, data on GPU memory had to be copied back to the main memory before it could be used by XGBoost. Starting with 1.1.0 release, XGBoost provides a dedicated interface (`DeviceQuantileDMatrix`) so that it can ingest data from GPU memory directly. The result is that XGBoost interoperates better with GPU-accelerated data science libraries, such as cuDF, cuPy, and PyTorch.
|
||||
* Set device in device dmatrix. (#5596)
|
||||
|
||||
### Robust model serialization with JSON (#5123, #5217)
|
||||
* We continue efforts from the 1.0.0 release to adopt JSON as the format to save and load models robustly. Refer to the release note for 1.0.0 to learn more.
|
||||
* It is now possible to store internal configuration of the trained model (`Booster`) object in R as a JSON string (#5123, #5217).
|
||||
|
||||
### Improved integration with Dask
|
||||
* Pass through `verbose` parameter for dask fit (#5413)
|
||||
* Use `DMLC_TASK_ID`. (#5415)
|
||||
* Order the prediction result. (#5416)
|
||||
* Honor `nthreads` from dask worker. (#5414)
|
||||
* Enable grid searching with scikit-learn. (#5417)
|
||||
* Check non-equal when setting threads. (#5421)
|
||||
* Accept other inputs for prediction. (#5428)
|
||||
* Fix missing value for scikit-learn interface. (#5435)
|
||||
|
||||
### XGBoost4J-Spark: Check number of columns in the data iterator (#5202, #5303)
|
||||
* Before, the native layer in XGBoost did not know the number of columns (features) ahead of time and had to guess the number of columns by counting the feature index when ingesting data. This method has a failure more in distributed setting: if the training data is highly sparse, some features may be completely missing in one or more worker partitions. Thus, one or more workers may deduce an incorrect data shape, leading to crashes or silently wrong models.
|
||||
* Enforce correct data shape by passing the number of columns explicitly from the JVM layer into the native layer.
|
||||
|
||||
### Major refactoring of the `DMatrix` class
|
||||
* Continued from 1.0.0 release.
|
||||
* Remove update prediction cache from predictors. (#5312)
|
||||
* Predict on Ellpack. (#5327)
|
||||
* Partial rewrite EllpackPage (#5352)
|
||||
* Use ellpack for prediction only when sparsepage doesn't exist. (#5504)
|
||||
* RFC: #4354, Roadmap: #5143
|
||||
|
||||
### Breaking: XGBoost Python package now requires Pip 19.0 and higher (#5589)
|
||||
* Your Linux machine may have an old version of Pip and may attempt to install a source package, leading to long installation time. This is because we are now using `manylinux2010` tag in the binary wheel release. Ensure you have Pip 19.0 or newer by running `python3 -m pip -V` to check the version. Upgrade Pip with command
|
||||
```
|
||||
python3 -m pip install --upgrade pip
|
||||
```
|
||||
Upgrading to latest pip allows us to depend on newer versions of system libraries. [TensorFlow](https://www.tensorflow.org/install/pip) also requires Pip 19.0+.
|
||||
|
||||
### Breaking: GPU algorithm now requires CUDA 10.0 and higher (#5649)
|
||||
* CUDA 10.0 is necessary to make the GPU algorithm deterministic (#5361).
|
||||
|
||||
### Breaking: `silent` parameter is now removed (#5476)
|
||||
* Please use `verbosity` instead.
|
||||
|
||||
### Breaking: Set `output_margin` to True for custom objectives (#5564)
|
||||
* Now both R and Python interface custom objectives get un-transformed (raw) prediction outputs.
|
||||
|
||||
### Breaking: `Makefile` is now removed. We use CMake exclusively to build XGBoost (#5513)
|
||||
* Exception: the R package uses Autotools, as the CRAN ecosystem did not yet adopt CMake widely.
|
||||
|
||||
### Breaking: `distcol` updater is now removed (#5507)
|
||||
* The `distcol` updater has been long broken, and currently we lack resources to implement a working implementation from scratch.
|
||||
|
||||
### Deprecation notices
|
||||
* **Python 3.5**. This release is the last release to support Python 3.5. The following release (1.2.0) will require Python 3.6.
|
||||
* **Scala 2.11**. Currently XGBoost4J supports Scala 2.11. However, if a future release of XGBoost adopts Spark 3, it will not support Scala 2.11, as Spark 3 requires Scala 2.12+. We do not yet know which XGBoost release will adopt Spark 3.
|
||||
|
||||
### Known limitations
|
||||
* (Python package) When early stopping is activated with `early_stopping_rounds` at training time, the prediction method (`xgb.predict()`) behaves in a surprising way. If XGBoost runs for M rounds and chooses iteration N (N < M) as the best iteration, then the prediction method will use M trees by default. To use the best iteration (N trees), users will need to manually take the best iteration field `bst.best_iteration` and pass it as the `ntree_limit` argument to `xgb.predict()`. See #5209 and #4052 for additional context.
|
||||
* GPU ranking objective is currently not deterministic (#5561).
|
||||
* When training parameter `reg_lambda` is set to zero, some leaf nodes may be assigned a NaN value. (See [discussion](https://discuss.xgboost.ai/t/still-getting-unexplained-nans-new-replication-code/1383/9).) For now, please set `reg_lambda` to a nonzero value.
|
||||
|
||||
### Community and Governance
|
||||
* The XGBoost Project Management Committee (PMC) is pleased to announce a new committer: Egor Smirnov (@SmirnovEgorRu). He has led a major initiative to improve the performance of XGBoost on multi-core CPUs.
|
||||
|
||||
### Bug-fixes
|
||||
* Improved compatibility with scikit-learn (#5255, #5505, #5538)
|
||||
* Remove f-string, since it's not supported by Python 3.5 (#5330). Note that Python 3.5 support is deprecated and schedule to be dropped in the upcoming release (1.2.0).
|
||||
* Fix the pruner so that it doesn't prune the same branch twice (#5335)
|
||||
* Enforce only major version in JSON model schema (#5336). Any major revision of the model schema would bump up the major version.
|
||||
* Fix a small typo in sklearn.py that broke multiple eval metrics (#5341)
|
||||
* Restore loading model from a memory buffer (#5360)
|
||||
* Define lazy isinstance for Python compat (#5364)
|
||||
* [R] fixed uses of `class()` (#5426)
|
||||
* Force compressed buffer to be 4 bytes aligned, to keep cuda-memcheck happy (#5441)
|
||||
* Remove warning for calling host function (`std::max`) on a GPU device (#5453)
|
||||
* Fix uninitialized value bug in xgboost callback (#5463)
|
||||
* Fix model dump in CLI (#5485)
|
||||
* Fix out-of-bound array access in `WQSummary::SetPrune()` (#5493)
|
||||
* Ensure that configured `dmlc/build_config.h` is picked up by Rabit and XGBoost, to fix build on Alpine (#5514)
|
||||
* Fix a misspelled method, made in a git merge (#5509)
|
||||
* Fix a bug in binary model serialization (#5532)
|
||||
* Fix CLI model IO (#5535)
|
||||
* Don't use `uint` for threads (#5542)
|
||||
* Fix R interaction constraints to handle more than 100000 features (#5543)
|
||||
* [jvm-packages] XGBoost Spark should deal with NaN when parsing evaluation output (#5546)
|
||||
* GPU-side data sketching is now aware of query groups in learning-to-rank data (#5551)
|
||||
* Fix DMatrix slicing for newly added fields (#5552)
|
||||
* Fix configuration status with loading binary model (#5562)
|
||||
* Fix build when OpenMP is disabled (#5566)
|
||||
* R compatibility patches (#5577, #5600)
|
||||
* gpu\_hist performance fixes (#5558)
|
||||
* Don't set seed on CLI interface (#5563)
|
||||
* [R] When serializing model, preserve model attributes related to early stopping (#5573)
|
||||
* Avoid rabit calls in learner configuration (#5581)
|
||||
* Hide C++ symbols in libxgboost.so when building Python wheel (#5590). This fixes apache/incubator-tvm#4953.
|
||||
* Fix compilation on Mac OSX High Sierra (10.13) (#5597)
|
||||
* Fix build on big endian CPUs (#5617)
|
||||
* Resolve crash due to use of `vector<bool>::iterator` (#5642)
|
||||
* Validation JSON model dump using JSON schema (#5660)
|
||||
|
||||
### Performance improvements
|
||||
* Wide dataset quantile performance improvement (#5306)
|
||||
* Reduce memory usage of GPU-side data sketching (#5407)
|
||||
* Reduce span check overhead (#5464)
|
||||
* Serialise booster after training to free up GPU memory (#5484)
|
||||
* Use the maximum amount of GPU shared memory available to speed up the histogram kernel (#5491)
|
||||
* Use non-synchronising scan in Thrust (#5560)
|
||||
* Use `cudaDeviceGetAttribute()` instead of `cudaGetDeviceProperties()` for speed (#5570)
|
||||
|
||||
### API changes
|
||||
* Support importing data from a Pandas SparseArray (#5431)
|
||||
* `HostDeviceVector` (vector shared between CPU and GPU memory) now exposes `HostSpan` interface, to enable access on the CPU side with bound check (#5459)
|
||||
* Accept other gradient types for `SplitEntry` (#5467)
|
||||
|
||||
### Usability Improvements, Documentation
|
||||
* Add `JVM_CHECK_CALL` to prevent C++ exceptions from leaking into the JVM layer (#5199)
|
||||
* Updated Windows build docs (#5283)
|
||||
* Update affiliation of @hcho3 (#5292)
|
||||
* Display Sponsor button, link to OpenCollective (#5325)
|
||||
* Update docs for GPU external memory (#5332)
|
||||
* Add link to GPU documentation (#5437)
|
||||
* Small updates to GPU documentation (#5483)
|
||||
* Edits on tutorial for XGBoost job on Kubernetes (#5487)
|
||||
* Add reference to GPU external memory (#5490)
|
||||
* Fix typos (#5346, #5371, #5384, #5399, #5482, #5515)
|
||||
* Update Python doc (#5517)
|
||||
* Add Neptune and Optuna to list of examples (#5528)
|
||||
* Raise error if the number of data weights doesn't match the number of data sets (#5540)
|
||||
* Add a note about GPU ranking (#5572)
|
||||
* Clarify meaning of `training` parameter in the C API function `XGBoosterPredict()` (#5604)
|
||||
* Better error handling for situations where existing trees cannot be modified (#5406, #5418). This feature is enabled when `process_type` is set to `update`.
|
||||
|
||||
### Maintenance: testing, continuous integration, build system
|
||||
* Add C++ test coverage for data sketching (#5251)
|
||||
* Ignore gdb\_history (#5257)
|
||||
* Rewrite setup.py. (#5271, #5280)
|
||||
* Use `scikit-learn` in extra dependencies (#5310)
|
||||
* Add CMake option to build static library (#5397)
|
||||
* [R] changed FindLibR to take advantage of CMake cache (#5427)
|
||||
* [R] fixed inconsistency in R -e calls in FindLibR.cmake (#5438)
|
||||
* Refactor tests with data generator (#5439)
|
||||
* Resolve failing Travis CI (#5445)
|
||||
* Update dmlc-core. (#5466)
|
||||
* [CI] Use clang-tidy 10 (#5469)
|
||||
* De-duplicate code for checking maximum number of nodes (#5497)
|
||||
* [CI] Use Ubuntu 18.04 LTS in JVM CI, because 19.04 is EOL (#5537)
|
||||
* [jvm-packages] [CI] Create a Maven repository to host SNAPSHOT JARs (#5533)
|
||||
* [jvm-packages] [CI] Publish XGBoost4J JARs with Scala 2.11 and 2.12 (#5539)
|
||||
* [CI] Use Vault repository to re-gain access to devtoolset-4 (#5589)
|
||||
|
||||
### Maintenance: Refactor code for legibility and maintainability
|
||||
* Move prediction cache to Learner (#5220, #5302)
|
||||
* Remove SimpleCSRSource (#5315)
|
||||
* Refactor SparsePageSource, delete cache files after use (#5321)
|
||||
* Remove unnecessary DMatrix methods (#5324)
|
||||
* Split up `LearnerImpl` (#5350)
|
||||
* Move segment sorter to common (#5378)
|
||||
* Move thread local entry into Learner (#5396)
|
||||
* Split up test helpers header (#5455)
|
||||
* Requires setting leaf stat when expanding tree (#5501)
|
||||
* Purge device\_helpers.cuh (#5534)
|
||||
* Use thrust functions instead of custom functions (#5544)
|
||||
|
||||
### Acknowledgement
|
||||
**Contributors**: Nan Zhu (@CodingCat), Rory Mitchell (@RAMitchell), @ShvetsKS, Egor Smirnov (@SmirnovEgorRu), Andrew Kane (@ankane), Avinash Barnwal (@avinashbarnwal), Bart Broere (@bartbroere), Andy Adinets (@canonizer), Chen Qin (@chenqin), Daiki Katsuragawa (@daikikatsuragawa), David Díaz Vico (@daviddiazvico), Darius Kharazi (@dkharazi), Darby Payne (@dpayne), Jason E. Aten, Ph.D. (@glycerine), Philip Hyunsu Cho (@hcho3), James Lamb (@jameslamb), Jan Borchmann (@jborchma), Kamil A. Kaczmarek (@kamil-kaczmarek), Melissa Kohl (@mjkohl32), Nicolas Scozzaro (@nscozzaro), Paul Kaefer (@paulkaefer), Rong Ou (@rongou), Samrat Pandiri (@samratp), Sriram Chandramouli (@sriramch), Yuan Tang (@terrytangyuan), Jiaming Yuan (@trivialfis), Liang-Chi Hsieh (@viirya), Bobby Wang (@wbo4958), Zhang Zhang (@zhangzhang10),
|
||||
|
||||
**Reviewers**: Nan Zhu (@CodingCat), @LeZhengThu, Rory Mitchell (@RAMitchell), @ShvetsKS, Egor Smirnov (@SmirnovEgorRu), Steve Bronder (@SteveBronder), Nikita Titov (@StrikerRUS), Andrew Kane (@ankane), Avinash Barnwal (@avinashbarnwal), @brydag, Andy Adinets (@canonizer), Chandra Shekhar Reddy (@chandrureddy), Chen Qin (@chenqin), Codecov (@codecov-io), David Díaz Vico (@daviddiazvico), Darby Payne (@dpayne), Jason E. Aten, Ph.D. (@glycerine), Philip Hyunsu Cho (@hcho3), James Lamb (@jameslamb), @johnny-cat, Mu Li (@mli), Mate Soos (@msoos), @rnyak, Rong Ou (@rongou), Sriram Chandramouli (@sriramch), Toby Dylan Hocking (@tdhock), Yuan Tang (@terrytangyuan), Oleksandr Pryimak (@trams), Jiaming Yuan (@trivialfis), Liang-Chi Hsieh (@viirya), Bobby Wang (@wbo4958),
|
||||
|
||||
## v1.0.0 (2020.02.19)
|
||||
This release marks a major milestone for the XGBoost project.
|
||||
|
||||
|
||||
@@ -6,8 +6,11 @@ file(GLOB_RECURSE R_SOURCES
|
||||
${CMAKE_CURRENT_LIST_DIR}/src/*.c)
|
||||
# Use object library to expose symbols
|
||||
add_library(xgboost-r OBJECT ${R_SOURCES})
|
||||
|
||||
set(R_DEFINITIONS
|
||||
if (ENABLE_ALL_WARNINGS)
|
||||
target_compile_options(xgboost-r PRIVATE -Wall -Wextra)
|
||||
endif (ENABLE_ALL_WARNINGS)
|
||||
target_compile_definitions(xgboost-r
|
||||
PUBLIC
|
||||
-DXGBOOST_STRICT_R_MODE=1
|
||||
-DXGBOOST_CUSTOMIZE_GLOBAL_PRNG=1
|
||||
-DDMLC_LOG_BEFORE_THROW=0
|
||||
@@ -15,24 +18,27 @@ set(R_DEFINITIONS
|
||||
-DDMLC_LOG_CUSTOMIZE=1
|
||||
-DRABIT_CUSTOMIZE_MSG_
|
||||
-DRABIT_STRICT_CXX98_)
|
||||
target_compile_definitions(xgboost-r
|
||||
PRIVATE ${R_DEFINITIONS})
|
||||
target_include_directories(xgboost-r
|
||||
PRIVATE
|
||||
${LIBR_INCLUDE_DIRS}
|
||||
${PROJECT_SOURCE_DIR}/include
|
||||
${PROJECT_SOURCE_DIR}/dmlc-core/include
|
||||
${PROJECT_SOURCE_DIR}/rabit/include)
|
||||
target_link_libraries(xgboost-r PUBLIC ${LIBR_CORE_LIBRARY})
|
||||
if (USE_OPENMP)
|
||||
find_package(OpenMP REQUIRED)
|
||||
target_link_libraries(xgboost-r PUBLIC OpenMP::OpenMP_CXX OpenMP::OpenMP_C)
|
||||
endif (USE_OPENMP)
|
||||
set_target_properties(
|
||||
xgboost-r PROPERTIES
|
||||
CXX_STANDARD 11
|
||||
CXX_STANDARD 14
|
||||
CXX_STANDARD_REQUIRED ON
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
set(XGBOOST_DEFINITIONS "${XGBOOST_DEFINITIONS};${R_DEFINITIONS}" PARENT_SCOPE)
|
||||
set(XGBOOST_OBJ_SOURCES $<TARGET_OBJECTS:xgboost-r> PARENT_SCOPE)
|
||||
set(LINKED_LIBRARIES_PRIVATE ${LINKED_LIBRARIES_PRIVATE} ${LIBR_CORE_LIBRARY} PARENT_SCOPE)
|
||||
# Get compilation and link flags of xgboost-r and propagate to objxgboost
|
||||
target_link_libraries(objxgboost PUBLIC xgboost-r)
|
||||
# Add all objects of xgboost-r to objxgboost
|
||||
target_sources(objxgboost INTERFACE $<TARGET_OBJECTS:xgboost-r>)
|
||||
|
||||
if (USE_OPENMP)
|
||||
target_link_libraries(xgboost-r PRIVATE OpenMP::OpenMP_CXX)
|
||||
endif ()
|
||||
set(LIBR_HOME "${LIBR_HOME}" PARENT_SCOPE)
|
||||
set(LIBR_EXECUTABLE "${LIBR_EXECUTABLE}" PARENT_SCOPE)
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
Package: xgboost
|
||||
Type: Package
|
||||
Title: Extreme Gradient Boosting
|
||||
Version: 1.1.1.1
|
||||
Date: 2020-02-21
|
||||
Version: 1.2.0.1
|
||||
Date: 2020-08-28
|
||||
Authors@R: c(
|
||||
person("Tianqi", "Chen", role = c("aut"),
|
||||
email = "tianqi.tchen@gmail.com"),
|
||||
@@ -54,7 +54,8 @@ Suggests:
|
||||
lintr,
|
||||
igraph (>= 1.0.1),
|
||||
jsonlite,
|
||||
float
|
||||
float,
|
||||
crayon
|
||||
Depends:
|
||||
R (>= 3.3.0)
|
||||
Imports:
|
||||
@@ -63,5 +64,5 @@ Imports:
|
||||
data.table (>= 1.9.6),
|
||||
magrittr (>= 1.5),
|
||||
stringi (>= 0.5.2)
|
||||
RoxygenNote: 7.1.0
|
||||
SystemRequirements: GNU make, C++11
|
||||
RoxygenNote: 7.1.1
|
||||
SystemRequirements: GNU make, C++14
|
||||
|
||||
@@ -62,11 +62,11 @@ cb.print.evaluation <- function(period = 1, showsd = TRUE) {
|
||||
callback <- function(env = parent.frame()) {
|
||||
if (length(env$bst_evaluation) == 0 ||
|
||||
period == 0 ||
|
||||
NVL(env$rank, 0) != 0 )
|
||||
NVL(env$rank, 0) != 0)
|
||||
return()
|
||||
|
||||
i <- env$iteration
|
||||
if ((i-1) %% period == 0 ||
|
||||
if ((i - 1) %% period == 0 ||
|
||||
i == env$begin_iteration ||
|
||||
i == env$end_iteration) {
|
||||
stdev <- if (showsd) env$bst_evaluation_err else NULL
|
||||
@@ -115,7 +115,7 @@ cb.evaluation.log <- function() {
|
||||
stop("bst_evaluation must have non-empty names")
|
||||
|
||||
mnames <<- gsub('-', '_', names(env$bst_evaluation))
|
||||
if(!is.null(env$bst_evaluation_err))
|
||||
if (!is.null(env$bst_evaluation_err))
|
||||
mnames <<- c(paste0(mnames, '_mean'), paste0(mnames, '_std'))
|
||||
}
|
||||
|
||||
@@ -123,12 +123,12 @@ cb.evaluation.log <- function() {
|
||||
env$evaluation_log <- as.data.table(t(simplify2array(env$evaluation_log)))
|
||||
setnames(env$evaluation_log, c('iter', mnames))
|
||||
|
||||
if(!is.null(env$bst_evaluation_err)) {
|
||||
if (!is.null(env$bst_evaluation_err)) {
|
||||
# rearrange col order from _mean,_mean,...,_std,_std,...
|
||||
# to be _mean,_std,_mean,_std,...
|
||||
len <- length(mnames)
|
||||
means <- mnames[seq_len(len/2)]
|
||||
stds <- mnames[(len/2 + 1):len]
|
||||
means <- mnames[seq_len(len / 2)]
|
||||
stds <- mnames[(len / 2 + 1):len]
|
||||
cnames <- numeric(len)
|
||||
cnames[c(TRUE, FALSE)] <- means
|
||||
cnames[c(FALSE, TRUE)] <- stds
|
||||
@@ -144,7 +144,7 @@ cb.evaluation.log <- function() {
|
||||
return(finalizer(env))
|
||||
|
||||
ev <- env$bst_evaluation
|
||||
if(!is.null(env$bst_evaluation_err))
|
||||
if (!is.null(env$bst_evaluation_err))
|
||||
ev <- c(ev, env$bst_evaluation_err)
|
||||
env$evaluation_log <- c(env$evaluation_log,
|
||||
list(c(iter = env$iteration, ev)))
|
||||
@@ -351,13 +351,13 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
|
||||
|
||||
finalizer <- function(env) {
|
||||
if (!is.null(env$bst)) {
|
||||
attr_best_score = as.numeric(xgb.attr(env$bst$handle, 'best_score'))
|
||||
attr_best_score <- as.numeric(xgb.attr(env$bst$handle, 'best_score'))
|
||||
if (best_score != attr_best_score)
|
||||
stop("Inconsistent 'best_score' values between the closure state: ", best_score,
|
||||
" and the xgb.attr: ", attr_best_score)
|
||||
env$bst$best_iteration = best_iteration
|
||||
env$bst$best_ntreelimit = best_ntreelimit
|
||||
env$bst$best_score = best_score
|
||||
env$bst$best_iteration <- best_iteration
|
||||
env$bst$best_ntreelimit <- best_ntreelimit
|
||||
env$bst$best_score <- best_score
|
||||
} else {
|
||||
env$basket$best_iteration <- best_iteration
|
||||
env$basket$best_ntreelimit <- best_ntreelimit
|
||||
@@ -372,9 +372,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
|
||||
return(finalizer(env))
|
||||
|
||||
i <- env$iteration
|
||||
score = env$bst_evaluation[metric_idx]
|
||||
score <- env$bst_evaluation[metric_idx]
|
||||
|
||||
if (( maximize && score > best_score) ||
|
||||
if ((maximize && score > best_score) ||
|
||||
(!maximize && score < best_score)) {
|
||||
|
||||
best_msg <<- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err)
|
||||
@@ -500,7 +500,7 @@ cb.cv.predict <- function(save_models = FALSE) {
|
||||
for (fd in env$bst_folds) {
|
||||
pr <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit, reshape = TRUE)
|
||||
if (is.matrix(pred)) {
|
||||
pred[fd$index,] <- pr
|
||||
pred[fd$index, ] <- pr
|
||||
} else {
|
||||
pred[fd$index] <- pr
|
||||
}
|
||||
@@ -613,9 +613,7 @@ cb.gblinear.history <- function(sparse=FALSE) {
|
||||
|
||||
init <- function(env) {
|
||||
if (!is.null(env$bst)) { # xgb.train:
|
||||
coef_path <- list()
|
||||
} else if (!is.null(env$bst_folds)) { # xgb.cv:
|
||||
coef_path <- rep(list(), length(env$bst_folds))
|
||||
} else stop("Parent frame has neither 'bst' nor 'bst_folds'")
|
||||
}
|
||||
|
||||
@@ -705,11 +703,11 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
|
||||
if (!is_cv) {
|
||||
# extract num_class & num_feat from the internal model
|
||||
dmp <- xgb.dump(model)
|
||||
if(length(dmp) < 2 || dmp[2] != "bias:")
|
||||
if (length(dmp) < 2 || dmp[2] != "bias:")
|
||||
stop("It does not appear to be a gblinear model")
|
||||
dmp <- dmp[-c(1,2)]
|
||||
dmp <- dmp[-c(1, 2)]
|
||||
n <- which(dmp == 'weight:')
|
||||
if(length(n) != 1)
|
||||
if (length(n) != 1)
|
||||
stop("It does not appear to be a gblinear model")
|
||||
num_class <- n - 1
|
||||
num_feat <- (length(dmp) - 4) / num_class
|
||||
@@ -732,9 +730,9 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
|
||||
if (!is.null(class_index) && num_class > 1) {
|
||||
coef_path <- if (is.list(coef_path)) {
|
||||
lapply(coef_path,
|
||||
function(x) x[, seq(1 + class_index, by=num_class, length.out=num_feat)])
|
||||
function(x) x[, seq(1 + class_index, by = num_class, length.out = num_feat)])
|
||||
} else {
|
||||
coef_path <- coef_path[, seq(1 + class_index, by=num_class, length.out=num_feat)]
|
||||
coef_path <- coef_path[, seq(1 + class_index, by = num_class, length.out = num_feat)]
|
||||
}
|
||||
}
|
||||
coef_path
|
||||
|
||||
@@ -69,23 +69,23 @@ check.booster.params <- function(params, ...) {
|
||||
|
||||
if (!is.null(params[['monotone_constraints']]) &&
|
||||
typeof(params[['monotone_constraints']]) != "character") {
|
||||
vec2str = paste(params[['monotone_constraints']], collapse = ',')
|
||||
vec2str = paste0('(', vec2str, ')')
|
||||
params[['monotone_constraints']] = vec2str
|
||||
vec2str <- paste(params[['monotone_constraints']], collapse = ',')
|
||||
vec2str <- paste0('(', vec2str, ')')
|
||||
params[['monotone_constraints']] <- vec2str
|
||||
}
|
||||
|
||||
# interaction constraints parser (convert from list of column indices to string)
|
||||
if (!is.null(params[['interaction_constraints']]) &&
|
||||
typeof(params[['interaction_constraints']]) != "character"){
|
||||
# check input class
|
||||
if (!identical(class(params[['interaction_constraints']]),'list')) stop('interaction_constraints should be class list')
|
||||
if (!all(unique(sapply(params[['interaction_constraints']], class)) %in% c('numeric','integer'))) {
|
||||
if (!identical(class(params[['interaction_constraints']]), 'list')) stop('interaction_constraints should be class list')
|
||||
if (!all(unique(sapply(params[['interaction_constraints']], class)) %in% c('numeric', 'integer'))) {
|
||||
stop('interaction_constraints should be a list of numeric/integer vectors')
|
||||
}
|
||||
|
||||
# recast parameter as string
|
||||
interaction_constraints <- sapply(params[['interaction_constraints']], function(x) paste0('[', paste(x, collapse=','), ']'))
|
||||
params[['interaction_constraints']] <- paste0('[', paste(interaction_constraints, collapse=','), ']')
|
||||
interaction_constraints <- sapply(params[['interaction_constraints']], function(x) paste0('[', paste(x, collapse = ','), ']'))
|
||||
params[['interaction_constraints']] <- paste0('[', paste(interaction_constraints, collapse = ','), ']')
|
||||
}
|
||||
return(params)
|
||||
}
|
||||
@@ -145,7 +145,8 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj = NULL) {
|
||||
if (is.null(obj)) {
|
||||
.Call(XGBoosterUpdateOneIter_R, booster_handle, as.integer(iter), dtrain)
|
||||
} else {
|
||||
pred <- predict(booster_handle, dtrain, outputmargin = TRUE, training = TRUE)
|
||||
pred <- predict(booster_handle, dtrain, outputmargin = TRUE, training = TRUE,
|
||||
ntreelimit = 0)
|
||||
gpair <- obj(pred, dtrain)
|
||||
.Call(XGBoosterBoostOneIter_R, booster_handle, dtrain, gpair$grad, gpair$hess)
|
||||
}
|
||||
@@ -167,12 +168,12 @@ xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) {
|
||||
if (is.null(feval)) {
|
||||
msg <- .Call(XGBoosterEvalOneIter_R, booster_handle, as.integer(iter), watchlist, as.list(evnames))
|
||||
msg <- stri_split_regex(msg, '(\\s+|:|\\s+)')[[1]][-1]
|
||||
res <- as.numeric(msg[c(FALSE,TRUE)]) # even indices are the values
|
||||
names(res) <- msg[c(TRUE,FALSE)] # odds are the names
|
||||
res <- as.numeric(msg[c(FALSE, TRUE)]) # even indices are the values
|
||||
names(res) <- msg[c(TRUE, FALSE)] # odds are the names
|
||||
} else {
|
||||
res <- sapply(seq_along(watchlist), function(j) {
|
||||
w <- watchlist[[j]]
|
||||
preds <- predict(booster_handle, w) # predict using all trees
|
||||
preds <- predict(booster_handle, w, outputmargin = TRUE, ntreelimit = 0) # predict using all trees
|
||||
eval_res <- feval(preds, w)
|
||||
out <- eval_res$value
|
||||
names(out) <- paste0(evnames[j], "-", eval_res$metric)
|
||||
@@ -307,6 +308,68 @@ xgb.createFolds <- function(y, k = 10)
|
||||
#' @name xgboost-deprecated
|
||||
NULL
|
||||
|
||||
#' Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of
|
||||
#' models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.
|
||||
#'
|
||||
#' It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or
|
||||
#' \code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist
|
||||
#' \code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if
|
||||
#' the model is to be accessed in the future. If you train a model with the current version of
|
||||
#' XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be
|
||||
#' accessible in later releases of XGBoost. To ensure that your model can be accessed in future
|
||||
#' releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
|
||||
#'
|
||||
#' @details
|
||||
#' Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
|
||||
#' the JSON format by specifying the JSON extension. To read the model back, use
|
||||
#' \code{\link{xgb.load}}.
|
||||
#'
|
||||
#' Use \code{\link{xgb.save.raw}} to save the XGBoost model as a sequence (vector) of raw bytes
|
||||
#' in a future-proof manner. Future releases of XGBoost will be able to read the raw bytes and
|
||||
#' re-construct the corresponding model. To read the model back, use \code{\link{xgb.load.raw}}.
|
||||
#' The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
|
||||
#' as part of another R object.
|
||||
#'
|
||||
#' Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the
|
||||
#' model but also internal configurations and parameters, and its format is not stable across
|
||||
#' multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing.
|
||||
#'
|
||||
#' For more details and explanation about model persistence and archival, consult the page
|
||||
#' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.
|
||||
#'
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
|
||||
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||
#'
|
||||
#' # Save as a stand-alone file; load it with xgb.load()
|
||||
#' xgb.save(bst, 'xgb.model')
|
||||
#' bst2 <- xgb.load('xgb.model')
|
||||
#'
|
||||
#' # Save as a stand-alone file (JSON); load it with xgb.load()
|
||||
#' xgb.save(bst, 'xgb.model.json')
|
||||
#' bst2 <- xgb.load('xgb.model.json')
|
||||
#' if (file.exists('xgb.model.json')) file.remove('xgb.model.json')
|
||||
#'
|
||||
#' # Save as a raw byte vector; load it with xgb.load.raw()
|
||||
#' xgb_bytes <- xgb.save.raw(bst)
|
||||
#' bst2 <- xgb.load.raw(xgb_bytes)
|
||||
#'
|
||||
#' # Persist XGBoost model as part of another R object
|
||||
#' obj <- list(xgb_model_bytes = xgb.save.raw(bst), description = "My first XGBoost model")
|
||||
#' # Persist the R object. Here, saveRDS() is okay, since it doesn't persist
|
||||
#' # xgb.Booster directly. What's being persisted is the future-proof byte representation
|
||||
#' # as given by xgb.save.raw().
|
||||
#' saveRDS(obj, 'my_object.rds')
|
||||
#' # Read back the R object
|
||||
#' obj2 <- readRDS('my_object.rds')
|
||||
#' # Re-construct xgb.Booster object from the bytes
|
||||
#' bst2 <- xgb.load.raw(obj2$xgb_model_bytes)
|
||||
#' if (file.exists('my_object.rds')) file.remove('my_object.rds')
|
||||
#'
|
||||
#' @name a-compatibility-note-for-saveRDS-save
|
||||
NULL
|
||||
|
||||
# Lookup table for the deprecated parameters bookkeeping
|
||||
depr_par_lut <- matrix(c(
|
||||
'print.every.n', 'print_every_n',
|
||||
@@ -315,8 +378,8 @@ depr_par_lut <- matrix(c(
|
||||
'with.stats', 'with_stats',
|
||||
'numberOfClusters', 'n_clusters',
|
||||
'features.keep', 'features_keep',
|
||||
'plot.height','plot_height',
|
||||
'plot.width','plot_width',
|
||||
'plot.height', 'plot_height',
|
||||
'plot.width', 'plot_width',
|
||||
'n_first_tree', 'trees',
|
||||
'dummy', 'DUMMY'
|
||||
), ncol = 2, byrow = TRUE)
|
||||
@@ -329,20 +392,20 @@ colnames(depr_par_lut) <- c('old', 'new')
|
||||
check.deprecation <- function(..., env = parent.frame()) {
|
||||
pars <- list(...)
|
||||
# exact and partial matches
|
||||
all_match <- pmatch(names(pars), depr_par_lut[,1])
|
||||
all_match <- pmatch(names(pars), depr_par_lut[, 1])
|
||||
# indices of matched pars' names
|
||||
idx_pars <- which(!is.na(all_match))
|
||||
if (length(idx_pars) == 0) return()
|
||||
# indices of matched LUT rows
|
||||
idx_lut <- all_match[idx_pars]
|
||||
# which of idx_lut were the exact matches?
|
||||
ex_match <- depr_par_lut[idx_lut,1] %in% names(pars)
|
||||
ex_match <- depr_par_lut[idx_lut, 1] %in% names(pars)
|
||||
for (i in seq_along(idx_pars)) {
|
||||
pars_par <- names(pars)[idx_pars[i]]
|
||||
old_par <- depr_par_lut[idx_lut[i], 1]
|
||||
new_par <- depr_par_lut[idx_lut[i], 2]
|
||||
if (!ex_match[i]) {
|
||||
warning("'", pars_par, "' was partially matched to '", old_par,"'")
|
||||
warning("'", pars_par, "' was partially matched to '", old_par, "'")
|
||||
}
|
||||
.Deprecated(new_par, old = old_par, package = 'xgboost')
|
||||
if (new_par != 'NULL') {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# Construct an internal xgboost Booster and return a handle to it.
|
||||
# internal utility function
|
||||
xgb.Booster.handle <- function(params = list(), cachelist = list(), modelfile = NULL) {
|
||||
xgb.Booster.handle <- function(params = list(), cachelist = list(),
|
||||
modelfile = NULL) {
|
||||
if (typeof(cachelist) != "list" ||
|
||||
!all(vapply(cachelist, inherits, logical(1), what = 'xgb.DMatrix'))) {
|
||||
stop("cachelist must be a list of xgb.DMatrix objects")
|
||||
@@ -62,8 +63,8 @@ is.null.handle <- function(handle) {
|
||||
return(FALSE)
|
||||
}
|
||||
|
||||
# Return a verified to be valid handle out of either xgb.Booster.handle or xgb.Booster
|
||||
# internal utility function
|
||||
# Return a verified to be valid handle out of either xgb.Booster.handle or
|
||||
# xgb.Booster internal utility function
|
||||
xgb.get.handle <- function(object) {
|
||||
if (inherits(object, "xgb.Booster")) {
|
||||
handle <- object$handle
|
||||
@@ -110,6 +111,8 @@ xgb.get.handle <- function(object) {
|
||||
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||
#' saveRDS(bst, "xgb.model.rds")
|
||||
#'
|
||||
#' # Warning: The resulting RDS file is only compatible with the current XGBoost version.
|
||||
#' # Refer to the section titled "a-compatibility-note-for-saveRDS-save".
|
||||
#' bst1 <- readRDS("xgb.model.rds")
|
||||
#' if (file.exists("xgb.model.rds")) file.remove("xgb.model.rds")
|
||||
#' # the handle is invalid:
|
||||
@@ -369,8 +372,8 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
|
||||
matrix(ret, nrow = n_row, byrow = TRUE, dimnames = list(NULL, cnames))
|
||||
} else {
|
||||
arr <- array(ret, c(n_col1, n_group, n_row),
|
||||
dimnames = list(cnames, NULL, NULL)) %>% aperm(c(2,3,1)) # [group, row, col]
|
||||
lapply(seq_len(n_group), function(g) arr[g,,])
|
||||
dimnames = list(cnames, NULL, NULL)) %>% aperm(c(2, 3, 1)) # [group, row, col]
|
||||
lapply(seq_len(n_group), function(g) arr[g, , ])
|
||||
}
|
||||
} else if (predinteraction) {
|
||||
n_col1 <- ncol(newdata) + 1
|
||||
@@ -379,11 +382,11 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
|
||||
ret <- if (n_ret == n_row) {
|
||||
matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
|
||||
} else if (n_group == 1) {
|
||||
array(ret, c(n_col1, n_col1, n_row), dimnames = list(cnames, cnames, NULL)) %>% aperm(c(3,1,2))
|
||||
array(ret, c(n_col1, n_col1, n_row), dimnames = list(cnames, cnames, NULL)) %>% aperm(c(3, 1, 2))
|
||||
} else {
|
||||
arr <- array(ret, c(n_col1, n_col1, n_group, n_row),
|
||||
dimnames = list(cnames, cnames, NULL, NULL)) %>% aperm(c(3,4,1,2)) # [group, row, col1, col2]
|
||||
lapply(seq_len(n_group), function(g) arr[g,,,])
|
||||
dimnames = list(cnames, cnames, NULL, NULL)) %>% aperm(c(3, 4, 1, 2)) # [group, row, col1, col2]
|
||||
lapply(seq_len(n_group), function(g) arr[g, , , ])
|
||||
}
|
||||
} else if (reshape && npred_per_case > 1) {
|
||||
ret <- matrix(ret, nrow = n_row, byrow = TRUE)
|
||||
@@ -656,7 +659,7 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) {
|
||||
|
||||
if (!is.null(x$params)) {
|
||||
cat('params (as set within xgb.train):\n')
|
||||
cat( ' ',
|
||||
cat(' ',
|
||||
paste(names(x$params),
|
||||
paste0('"', unlist(x$params), '"'),
|
||||
sep = ' = ', collapse = ', '), '\n', sep = '')
|
||||
@@ -669,9 +672,9 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) {
|
||||
if (length(attrs) > 0) {
|
||||
cat('xgb.attributes:\n')
|
||||
if (verbose) {
|
||||
cat( paste(paste0(' ',names(attrs)),
|
||||
paste0('"', unlist(attrs), '"'),
|
||||
sep = ' = ', collapse = '\n'), '\n', sep = '')
|
||||
cat(paste(paste0(' ', names(attrs)),
|
||||
paste0('"', unlist(attrs), '"'),
|
||||
sep = ' = ', collapse = '\n'), '\n', sep = '')
|
||||
} else {
|
||||
cat(' ', paste(names(attrs), collapse = ', '), '\n', sep = '')
|
||||
}
|
||||
@@ -693,7 +696,7 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) {
|
||||
#cat('ntree: ', xgb.ntree(x), '\n', sep='')
|
||||
|
||||
for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks',
|
||||
'evaluation_log','niter','feature_names'))) {
|
||||
'evaluation_log', 'niter', 'feature_names'))) {
|
||||
if (is.atomic(x[[n]])) {
|
||||
cat(n, ':', x[[n]], '\n', sep = ' ')
|
||||
} else {
|
||||
|
||||
@@ -257,8 +257,6 @@ setinfo.xgb.DMatrix <- function(object, name, info, ...) {
|
||||
return(TRUE)
|
||||
}
|
||||
if (name == "weight") {
|
||||
if (length(info) != nrow(object))
|
||||
stop("The length of weights must equal to the number of rows in the input data")
|
||||
.Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
|
||||
return(TRUE)
|
||||
}
|
||||
@@ -322,7 +320,7 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {
|
||||
for (i in seq_along(ind)) {
|
||||
obj_attr <- attr(object, nms[i])
|
||||
if (NCOL(obj_attr) > 1) {
|
||||
attr(ret, nms[i]) <- obj_attr[idxset,]
|
||||
attr(ret, nms[i]) <- obj_attr[idxset, ]
|
||||
} else {
|
||||
attr(ret, nms[i]) <- obj_attr[idxset]
|
||||
}
|
||||
@@ -360,9 +358,9 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {
|
||||
print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
|
||||
cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ')
|
||||
infos <- c()
|
||||
if(length(getinfo(x, 'label')) > 0) infos <- 'label'
|
||||
if(length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight')
|
||||
if(length(getinfo(x, 'base_margin')) > 0) infos <- c(infos, 'base_margin')
|
||||
if (length(getinfo(x, 'label')) > 0) infos <- 'label'
|
||||
if (length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight')
|
||||
if (length(getinfo(x, 'base_margin')) > 0) infos <- c(infos, 'base_margin')
|
||||
if (length(infos) == 0) infos <- 'NA'
|
||||
cat(infos)
|
||||
cnames <- colnames(x)
|
||||
|
||||
@@ -83,5 +83,5 @@ xgb.create.features <- function(model, data, ...){
|
||||
check.deprecation(...)
|
||||
pred_with_leaf <- predict(model, data, predleaf = TRUE)
|
||||
cols <- lapply(as.data.frame(pred_with_leaf), factor)
|
||||
cbind(data, sparse.model.matrix( ~ . -1, cols))
|
||||
cbind(data, sparse.model.matrix(~ . -1, cols)) # nolint
|
||||
}
|
||||
|
||||
@@ -2,12 +2,15 @@
|
||||
#'
|
||||
#' The cross validation function of xgboost
|
||||
#'
|
||||
#' @param params the list of parameters. Commonly used ones are:
|
||||
#' @param params the list of parameters. The complete list of parameters is
|
||||
#' available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
|
||||
#' is a shorter summary:
|
||||
#' \itemize{
|
||||
#' \item \code{objective} objective function, common ones are
|
||||
#' \itemize{
|
||||
#' \item \code{reg:squarederror} Regression with squared loss
|
||||
#' \item \code{binary:logistic} logistic regression for classification
|
||||
#' \item \code{reg:squarederror} Regression with squared loss.
|
||||
#' \item \code{binary:logistic} logistic regression for classification.
|
||||
#' \item See \code{\link[=xgb.train]{xgb.train}()} for complete list of objectives.
|
||||
#' }
|
||||
#' \item \code{eta} step size of each boosting step
|
||||
#' \item \code{max_depth} maximum depth of the tree
|
||||
@@ -76,7 +79,7 @@
|
||||
#'
|
||||
#' All observations are used for both training and validation.
|
||||
#'
|
||||
#' Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
|
||||
#' Adapted from \url{https://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29}
|
||||
#'
|
||||
#' @return
|
||||
#' An object of class \code{xgb.cv.synchronous} with the following elements:
|
||||
@@ -134,20 +137,20 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
|
||||
# stop("Either 'eval_metric' or 'feval' must be provided for CV")
|
||||
|
||||
# Check the labels
|
||||
if ( (inherits(data, 'xgb.DMatrix') && is.null(getinfo(data, 'label'))) ||
|
||||
(!inherits(data, 'xgb.DMatrix') && is.null(label))) {
|
||||
if ((inherits(data, 'xgb.DMatrix') && is.null(getinfo(data, 'label'))) ||
|
||||
(!inherits(data, 'xgb.DMatrix') && is.null(label))) {
|
||||
stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
|
||||
} else if (inherits(data, 'xgb.DMatrix')) {
|
||||
if (!is.null(label))
|
||||
warning("xgb.cv: label will be ignored, since data is of type xgb.DMatrix")
|
||||
cv_label = getinfo(data, 'label')
|
||||
cv_label <- getinfo(data, 'label')
|
||||
} else {
|
||||
cv_label = label
|
||||
cv_label <- label
|
||||
}
|
||||
|
||||
# CV folds
|
||||
if(!is.null(folds)) {
|
||||
if(!is.list(folds) || length(folds) < 2)
|
||||
if (!is.null(folds)) {
|
||||
if (!is.list(folds) || length(folds) < 2)
|
||||
stop("'folds' must be a list with 2 or more elements that are vectors of indices for each CV-fold")
|
||||
nfold <- length(folds)
|
||||
} else {
|
||||
@@ -162,7 +165,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
|
||||
|
||||
# verbosity & evaluation printing callback:
|
||||
params <- c(params, list(silent = 1))
|
||||
print_every_n <- max( as.integer(print_every_n), 1L)
|
||||
print_every_n <- max(as.integer(print_every_n), 1L)
|
||||
if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) {
|
||||
callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n, showsd = showsd))
|
||||
}
|
||||
@@ -193,20 +196,20 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
|
||||
bst_folds <- lapply(seq_along(folds), function(k) {
|
||||
dtest <- slice(dall, folds[[k]])
|
||||
# code originally contributed by @RolandASc on stackoverflow
|
||||
if(is.null(train_folds))
|
||||
if (is.null(train_folds))
|
||||
dtrain <- slice(dall, unlist(folds[-k]))
|
||||
else
|
||||
dtrain <- slice(dall, train_folds[[k]])
|
||||
handle <- xgb.Booster.handle(params, list(dtrain, dtest))
|
||||
list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test=dtest), index = folds[[k]])
|
||||
list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test = dtest), index = folds[[k]])
|
||||
})
|
||||
rm(dall)
|
||||
# a "basket" to collect some results from callbacks
|
||||
basket <- list()
|
||||
|
||||
# extract parameters that can affect the relationship b/w #trees and #iterations
|
||||
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
|
||||
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)
|
||||
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint
|
||||
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) # nolint
|
||||
|
||||
# those are fixed for CV (no training continuation)
|
||||
begin_iteration <- 1
|
||||
@@ -223,7 +226,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
|
||||
})
|
||||
msg <- simplify2array(msg)
|
||||
bst_evaluation <- rowMeans(msg)
|
||||
bst_evaluation_err <- sqrt(rowMeans(msg^2) - bst_evaluation^2)
|
||||
bst_evaluation_err <- sqrt(rowMeans(msg^2) - bst_evaluation^2) # nolint
|
||||
|
||||
for (f in cb$post_iter) f()
|
||||
|
||||
@@ -282,10 +285,10 @@ print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) {
|
||||
}
|
||||
if (!is.null(x$params)) {
|
||||
cat('params (as set within xgb.cv):\n')
|
||||
cat( ' ',
|
||||
paste(names(x$params),
|
||||
paste0('"', unlist(x$params), '"'),
|
||||
sep = ' = ', collapse = ', '), '\n', sep = '')
|
||||
cat(' ',
|
||||
paste(names(x$params),
|
||||
paste0('"', unlist(x$params), '"'),
|
||||
sep = ' = ', collapse = ', '), '\n', sep = '')
|
||||
}
|
||||
if (!is.null(x$callbacks) && length(x$callbacks) > 0) {
|
||||
cat('callbacks:\n')
|
||||
|
||||
@@ -74,7 +74,7 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med
|
||||
p <-
|
||||
ggplot2::ggplot(dt_depths[, max(Depth), Tree]) +
|
||||
ggplot2::geom_jitter(ggplot2::aes(x = Tree, y = V1),
|
||||
height = 0.15, alpha=0.4, size=3, stroke=0) +
|
||||
height = 0.15, alpha = 0.4, size = 3, stroke = 0) +
|
||||
ggplot2::xlab("tree #") +
|
||||
ggplot2::ylab("Max tree leaf depth")
|
||||
return(p)
|
||||
@@ -83,7 +83,7 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med
|
||||
p <-
|
||||
ggplot2::ggplot(dt_depths[, median(as.numeric(Depth)), Tree]) +
|
||||
ggplot2::geom_jitter(ggplot2::aes(x = Tree, y = V1),
|
||||
height = 0.15, alpha=0.4, size=3, stroke=0) +
|
||||
height = 0.15, alpha = 0.4, size = 3, stroke = 0) +
|
||||
ggplot2::xlab("tree #") +
|
||||
ggplot2::ylab("Median tree leaf depth")
|
||||
return(p)
|
||||
@@ -92,7 +92,7 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med
|
||||
p <-
|
||||
ggplot2::ggplot(dt_depths[, median(abs(Weight)), Tree]) +
|
||||
ggplot2::geom_point(ggplot2::aes(x = Tree, y = V1),
|
||||
alpha=0.4, size=3, stroke=0) +
|
||||
alpha = 0.4, size = 3, stroke = 0) +
|
||||
ggplot2::xlab("tree #") +
|
||||
ggplot2::ylab("Median absolute leaf weight")
|
||||
return(p)
|
||||
@@ -105,7 +105,7 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med
|
||||
# internal utility function
|
||||
multiplot <- function(..., cols = 1) {
|
||||
plots <- list(...)
|
||||
num_plots = length(plots)
|
||||
num_plots <- length(plots)
|
||||
|
||||
layout <- matrix(seq(1, cols * ceiling(num_plots / cols)),
|
||||
ncol = cols, nrow = ceiling(num_plots / cols))
|
||||
|
||||
@@ -99,13 +99,13 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
|
||||
model_text_dump <- xgb.dump(model = model, with_stats = TRUE)
|
||||
|
||||
# linear model
|
||||
if(model_text_dump[2] == "bias:"){
|
||||
if (model_text_dump[2] == "bias:"){
|
||||
weights <- which(model_text_dump == "weight:") %>%
|
||||
{model_text_dump[(. + 1):length(model_text_dump)]} %>%
|
||||
as.numeric
|
||||
|
||||
num_class <- NVL(model$params$num_class, 1)
|
||||
if(is.null(feature_names))
|
||||
if (is.null(feature_names))
|
||||
feature_names <- seq(to = length(weights) / num_class) - 1
|
||||
if (length(feature_names) * num_class != length(weights))
|
||||
stop("feature_names length does not match the number of features used in the model")
|
||||
@@ -117,18 +117,17 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
|
||||
Weight = weights,
|
||||
Class = seq_len(num_class) - 1)[order(Class, -abs(Weight))]
|
||||
}
|
||||
} else {
|
||||
# tree model
|
||||
result <- xgb.model.dt.tree(feature_names = feature_names,
|
||||
text = model_text_dump,
|
||||
trees = trees)[
|
||||
Feature != "Leaf", .(Gain = sum(Quality),
|
||||
Cover = sum(Cover),
|
||||
Frequency = .N), by = Feature][
|
||||
,`:=`(Gain = Gain / sum(Gain),
|
||||
Cover = Cover / sum(Cover),
|
||||
Frequency = Frequency / sum(Frequency))][
|
||||
order(Gain, decreasing = TRUE)]
|
||||
} else { # tree model
|
||||
result <- xgb.model.dt.tree(feature_names = feature_names,
|
||||
text = model_text_dump,
|
||||
trees = trees)[
|
||||
Feature != "Leaf", .(Gain = sum(Quality),
|
||||
Cover = sum(Cover),
|
||||
Frequency = .N), by = Feature][
|
||||
, `:=`(Gain = Gain / sum(Gain),
|
||||
Cover = Cover / sum(Cover),
|
||||
Frequency = Frequency / sum(Frequency))][
|
||||
order(Gain, decreasing = TRUE)]
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
@@ -108,7 +108,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
|
||||
}
|
||||
td <- td[Tree %in% trees & !grepl('^booster', t)]
|
||||
|
||||
td[, Node := stri_match_first_regex(t, "(\\d+):")[,2] %>% as.integer ]
|
||||
td[, Node := stri_match_first_regex(t, "(\\d+):")[, 2] %>% as.integer]
|
||||
if (!use_int_id) td[, ID := add.tree.id(Node, Tree)]
|
||||
td[, isLeaf := !is.na(stri_match_first_regex(t, "leaf"))]
|
||||
|
||||
@@ -119,15 +119,15 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
|
||||
td[isLeaf == FALSE,
|
||||
(branch_cols) := {
|
||||
# skip some indices with spurious capture groups from anynumber_regex
|
||||
xtr <- stri_match_first_regex(t, branch_rx)[, c(2,3,5,6,7,8,10), drop = FALSE]
|
||||
xtr <- stri_match_first_regex(t, branch_rx)[, c(2, 3, 5, 6, 7, 8, 10), drop = FALSE]
|
||||
xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree)
|
||||
lapply(seq_len(ncol(xtr)), function(i) xtr[,i])
|
||||
lapply(seq_len(ncol(xtr)), function(i) xtr[, i])
|
||||
}]
|
||||
# assign feature_names when available
|
||||
if (!is.null(feature_names)) {
|
||||
if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE))
|
||||
stop("feature_names has less elements than there are features used in the model")
|
||||
td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1] ]
|
||||
td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1]]
|
||||
}
|
||||
|
||||
# parse leaf lines
|
||||
@@ -135,8 +135,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
|
||||
leaf_cols <- c("Feature", "Quality", "Cover")
|
||||
td[isLeaf == TRUE,
|
||||
(leaf_cols) := {
|
||||
xtr <- stri_match_first_regex(t, leaf_rx)[, c(2,4)]
|
||||
c("Leaf", lapply(seq_len(ncol(xtr)), function(i) xtr[,i]))
|
||||
xtr <- stri_match_first_regex(t, leaf_rx)[, c(2, 4)]
|
||||
c("Leaf", lapply(seq_len(ncol(xtr)), function(i) xtr[, i]))
|
||||
}]
|
||||
|
||||
# convert some columns to numeric
|
||||
@@ -156,4 +156,4 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
|
||||
# Avoid error messages during CRAN check.
|
||||
# The reason is that these variables are never declared
|
||||
# They are mainly column names inferred by Data.table...
|
||||
globalVariables(c("Tree", "Node", "ID", "Feature", "t", "isLeaf",".SD", ".SDcols"))
|
||||
globalVariables(c("Tree", "Node", "ID", "Feature", "t", "isLeaf", ".SD", ".SDcols"))
|
||||
|
||||
@@ -89,9 +89,9 @@ xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.d
|
||||
if (plot) {
|
||||
if (which == "2x1") {
|
||||
op <- par(no.readonly = TRUE)
|
||||
par(mfrow = c(2,1),
|
||||
oma = c(3,1,3,1) + 0.1,
|
||||
mar = c(1,4,1,0) + 0.1)
|
||||
par(mfrow = c(2, 1),
|
||||
oma = c(3, 1, 3, 1) + 0.1,
|
||||
mar = c(1, 4, 1, 0) + 0.1)
|
||||
|
||||
dt_summaries[, barplot(N, border = NA, ylab = 'Number of leafs', ...)]
|
||||
|
||||
@@ -130,7 +130,7 @@ get.leaf.depth <- function(dt_tree) {
|
||||
dt_edges[is.na(Leaf), Leaf := FALSE]
|
||||
|
||||
dt_edges[, {
|
||||
graph <- igraph::graph_from_data_frame(.SD[,.(ID, To)])
|
||||
graph <- igraph::graph_from_data_frame(.SD[, .(ID, To)])
|
||||
# min(ID) in a tree is a root node
|
||||
paths_tmp <- igraph::shortest_paths(graph, from = min(ID), to = To[Leaf == TRUE])
|
||||
# list of paths to each leaf in a tree
|
||||
|
||||
@@ -92,10 +92,10 @@ xgb.plot.importance <- function(importance_matrix = NULL, top_n = NULL, measure
|
||||
importance_matrix <- head(importance_matrix, top_n)
|
||||
}
|
||||
if (rel_to_first) {
|
||||
importance_matrix[, Importance := Importance/max(abs(Importance))]
|
||||
importance_matrix[, Importance := Importance / max(abs(Importance))]
|
||||
}
|
||||
if (is.null(cex)) {
|
||||
cex <- 2.5/log2(1 + nrow(importance_matrix))
|
||||
cex <- 2.5 / log2(1 + nrow(importance_matrix))
|
||||
}
|
||||
|
||||
if (plot) {
|
||||
|
||||
@@ -72,7 +72,7 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
|
||||
|
||||
precedent.nodes <- root.nodes
|
||||
|
||||
while(tree.matrix[,sum(is.na(abs.node.position))] > 0) {
|
||||
while (tree.matrix[, sum(is.na(abs.node.position))] > 0) {
|
||||
yes.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(Yes)]
|
||||
no.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(No)]
|
||||
yes.nodes.abs.pos <- yes.row.nodes[, abs.node.position] %>% paste0("_0")
|
||||
@@ -88,35 +88,35 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
|
||||
|
||||
remove.tree <- . %>% stri_replace_first_regex(pattern = "^\\d+-", replacement = "")
|
||||
|
||||
tree.matrix[,`:=`(abs.node.position = remove.tree(abs.node.position),
|
||||
Yes = remove.tree(Yes),
|
||||
No = remove.tree(No))]
|
||||
tree.matrix[, `:=`(abs.node.position = remove.tree(abs.node.position),
|
||||
Yes = remove.tree(Yes),
|
||||
No = remove.tree(No))]
|
||||
|
||||
nodes.dt <- tree.matrix[
|
||||
, .(Quality = sum(Quality))
|
||||
, by = .(abs.node.position, Feature)
|
||||
][, .(Text = paste0(Feature[1:min(length(Feature), features_keep)],
|
||||
" (",
|
||||
format(Quality[1:min(length(Quality), features_keep)], digits=5),
|
||||
format(Quality[1:min(length(Quality), features_keep)], digits = 5),
|
||||
")") %>%
|
||||
paste0(collapse = "\n"))
|
||||
, by = abs.node.position]
|
||||
|
||||
edges.dt <- tree.matrix[Feature != "Leaf", .(abs.node.position, Yes)] %>%
|
||||
list(tree.matrix[Feature != "Leaf",.(abs.node.position, No)]) %>%
|
||||
list(tree.matrix[Feature != "Leaf", .(abs.node.position, No)]) %>%
|
||||
rbindlist() %>%
|
||||
setnames(c("From", "To")) %>%
|
||||
.[, .N, .(From, To)] %>%
|
||||
.[, N:=NULL]
|
||||
.[, N := NULL]
|
||||
|
||||
nodes <- DiagrammeR::create_node_df(
|
||||
n = nrow(nodes.dt),
|
||||
label = nodes.dt[,Text]
|
||||
label = nodes.dt[, Text]
|
||||
)
|
||||
|
||||
edges <- DiagrammeR::create_edge_df(
|
||||
from = match(edges.dt[,From], nodes.dt[,abs.node.position]),
|
||||
to = match(edges.dt[,To], nodes.dt[,abs.node.position]),
|
||||
from = match(edges.dt[, From], nodes.dt[, abs.node.position]),
|
||||
to = match(edges.dt[, To], nodes.dt[, abs.node.position]),
|
||||
rel = "leading_to")
|
||||
|
||||
graph <- DiagrammeR::create_graph(
|
||||
|
||||
@@ -125,12 +125,12 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
|
||||
|
||||
nsample <- if (is.null(subsample)) min(100000, nrow(data)) else as.integer(subsample * nrow(data))
|
||||
idx <- sample(1:nrow(data), nsample)
|
||||
data <- data[idx,]
|
||||
data <- data[idx, ]
|
||||
|
||||
if (is.null(shap_contrib)) {
|
||||
shap_contrib <- predict(model, data, predcontrib = TRUE, approxcontrib = approxcontrib)
|
||||
} else {
|
||||
shap_contrib <- shap_contrib[idx,]
|
||||
shap_contrib <- shap_contrib[idx, ]
|
||||
}
|
||||
|
||||
which <- match.arg(which)
|
||||
@@ -168,8 +168,8 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
|
||||
|
||||
if (plot && which == "1d") {
|
||||
op <- par(mfrow = c(ceiling(length(features) / n_col), n_col),
|
||||
oma = c(0,0,0,0) + 0.2,
|
||||
mar = c(3.5,3.5,0,0) + 0.1,
|
||||
oma = c(0, 0, 0, 0) + 0.2,
|
||||
mar = c(3.5, 3.5, 0, 0) + 0.1,
|
||||
mgp = c(1.7, 0.6, 0))
|
||||
for (f in cols) {
|
||||
ord <- order(data[, f])
|
||||
@@ -192,7 +192,7 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
|
||||
grid()
|
||||
if (plot_loess) {
|
||||
# compress x to 3 digits, and mean-aggredate y
|
||||
zz <- data.table(x = signif(x, 3), y)[, .(.N, y=mean(y)), x]
|
||||
zz <- data.table(x = signif(x, 3), y)[, .(.N, y = mean(y)), x]
|
||||
if (nrow(zz) <= 5) {
|
||||
lines(zz$x, zz$y, col = col_loess)
|
||||
} else {
|
||||
|
||||
@@ -80,12 +80,12 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
|
||||
|
||||
dt <- xgb.model.dt.tree(feature_names = feature_names, model = model, trees = trees)
|
||||
|
||||
dt[, label:= paste0(Feature, "\nCover: ", Cover, ifelse(Feature == "Leaf", "\nValue: ", "\nGain: "), Quality)]
|
||||
dt[, label := paste0(Feature, "\nCover: ", Cover, ifelse(Feature == "Leaf", "\nValue: ", "\nGain: "), Quality)]
|
||||
if (show_node_id)
|
||||
dt[, label := paste0(ID, ": ", label)]
|
||||
dt[Node == 0, label := paste0("Tree ", Tree, "\n", label)]
|
||||
dt[, shape:= "rectangle"][Feature == "Leaf", shape:= "oval"]
|
||||
dt[, filledcolor:= "Beige"][Feature == "Leaf", filledcolor:= "Khaki"]
|
||||
dt[, shape := "rectangle"][Feature == "Leaf", shape := "oval"]
|
||||
dt[, filledcolor := "Beige"][Feature == "Leaf", filledcolor := "Khaki"]
|
||||
# in order to draw the first tree on top:
|
||||
dt <- dt[order(-Tree)]
|
||||
|
||||
|
||||
@@ -13,7 +13,11 @@
|
||||
#'
|
||||
#' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}}
|
||||
#' or \code{\link[base]{save}}). However, it would then only be compatible with R, and
|
||||
#' corresponding R-methods would need to be used to load it.
|
||||
#' corresponding R-methods would need to be used to load it. Moreover, persisting the model with
|
||||
#' \code{\link[base]{readRDS}} or \code{\link[base]{save}}) will cause compatibility problems in
|
||||
#' future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn
|
||||
#' how to persist models in a future-proof way, i.e. to make the model accessible in future
|
||||
#' releases of XGBoost.
|
||||
#'
|
||||
#' @seealso
|
||||
#' \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
|
||||
|
||||
@@ -3,9 +3,9 @@
|
||||
#' \code{xgb.train} is an advanced interface for training an xgboost model.
|
||||
#' The \code{xgboost} function is a simpler wrapper for \code{xgb.train}.
|
||||
#'
|
||||
#' @param params the list of parameters.
|
||||
#' The complete list of parameters is available at \url{http://xgboost.readthedocs.io/en/latest/parameter.html}.
|
||||
#' Below is a shorter summary:
|
||||
#' @param params the list of parameters. The complete list of parameters is
|
||||
#' available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
|
||||
#' is a shorter summary:
|
||||
#'
|
||||
#' 1. General Parameters
|
||||
#'
|
||||
@@ -43,13 +43,23 @@
|
||||
#' \item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below:
|
||||
#' \itemize{
|
||||
#' \item \code{reg:squarederror} Regression with squared loss (Default).
|
||||
#' \item \code{reg:squaredlogerror}: regression with squared log loss \eqn{1/2 * (log(pred + 1) - log(label + 1))^2}. All inputs are required to be greater than -1. Also, see metric rmsle for possible issue with this objective.
|
||||
#' \item \code{reg:logistic} logistic regression.
|
||||
#' \item \code{reg:pseudohubererror}: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
|
||||
#' \item \code{binary:logistic} logistic regression for binary classification. Output probability.
|
||||
#' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
|
||||
#' \item \code{num_class} set the number of classes. To use only with multiclass objectives.
|
||||
#' \item \code{binary:hinge}: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
|
||||
#' \item \code{count:poisson}: poisson regression for count data, output mean of poisson distribution. \code{max_delta_step} is set to 0.7 by default in poisson regression (used to safeguard optimization).
|
||||
#' \item \code{survival:cox}: Cox regression for right censored survival time data (negative values are considered right censored). Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional hazard function \code{h(t) = h0(t) * HR)}.
|
||||
#' \item \code{survival:aft}: Accelerated failure time model for censored survival time data. See \href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time} for details.
|
||||
#' \item \code{aft_loss_distribution}: Probabilty Density Function used by \code{survival:aft} and \code{aft-nloglik} metric.
|
||||
#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
|
||||
#' \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
|
||||
#' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
|
||||
#' \item \code{rank:ndcg}: Use LambdaMART to perform list-wise ranking where \href{https://en.wikipedia.org/wiki/Discounted_cumulative_gain}{Normalized Discounted Cumulative Gain (NDCG)} is maximized.
|
||||
#' \item \code{rank:map}: Use LambdaMART to perform list-wise ranking where \href{https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision}{Mean Average Precision (MAP)} is maximized.
|
||||
#' \item \code{reg:gamma}: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Gamma_distribution#Applications}{gamma-distributed}.
|
||||
#' \item \code{reg:tweedie}: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Tweedie_distribution#Applications}{Tweedie-distributed}.
|
||||
#' }
|
||||
#' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
|
||||
#' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
|
||||
@@ -120,16 +130,16 @@
|
||||
#' Note that when using a customized metric, only this single metric can be used.
|
||||
#' The following is the list of built-in metrics for which Xgboost provides optimized implementation:
|
||||
#' \itemize{
|
||||
#' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
|
||||
#' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
|
||||
#' \item \code{mlogloss} multiclass logloss. \url{http://wiki.fast.ai/index.php/Log_Loss}
|
||||
#' \item \code{rmse} root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error}
|
||||
#' \item \code{logloss} negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood}
|
||||
#' \item \code{mlogloss} multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html}
|
||||
#' \item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
|
||||
#' By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
|
||||
#' Different threshold (e.g., 0.) could be specified as "error@0."
|
||||
#' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
|
||||
#' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
|
||||
#' \item \code{auc} Area under the curve. \url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
|
||||
#' \item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
|
||||
#' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
|
||||
#' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG}
|
||||
#' }
|
||||
#'
|
||||
#' The following callbacks are automatically created when certain parameters are set:
|
||||
@@ -268,7 +278,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
|
||||
|
||||
# evaluation printing callback
|
||||
params <- c(params)
|
||||
print_every_n <- max( as.integer(print_every_n), 1L)
|
||||
print_every_n <- max(as.integer(print_every_n), 1L)
|
||||
if (!has.callbacks(callbacks, 'cb.print.evaluation') &&
|
||||
verbose) {
|
||||
callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n))
|
||||
@@ -318,12 +328,9 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
|
||||
niter_init <- xgb.ntree(bst) %/% (num_parallel_tree * num_class)
|
||||
}
|
||||
}
|
||||
if(is_update && nrounds > niter_init)
|
||||
if (is_update && nrounds > niter_init)
|
||||
stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)")
|
||||
|
||||
# TODO: distributed code
|
||||
rank <- 0
|
||||
|
||||
niter_skip <- ifelse(is_update, 0, niter_init)
|
||||
begin_iteration <- niter_skip + 1
|
||||
end_iteration <- niter_skip + nrounds
|
||||
@@ -335,7 +342,6 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
|
||||
|
||||
xgb.iter.update(bst$handle, dtrain, iteration - 1, obj)
|
||||
|
||||
bst_evaluation <- numeric(0)
|
||||
if (length(watchlist) > 0)
|
||||
bst_evaluation <- xgb.iter.eval(bst$handle, watchlist, iteration - 1, feval)
|
||||
|
||||
@@ -350,7 +356,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
|
||||
bst <- xgb.Booster.complete(bst, saveraw = TRUE)
|
||||
|
||||
# store the total number of boosting iterations
|
||||
bst$niter = end_iteration
|
||||
bst$niter <- end_iteration
|
||||
|
||||
# store the evaluation results
|
||||
if (length(evaluation_log) > 0 &&
|
||||
|
||||
@@ -6,7 +6,26 @@
|
||||
xgb.unserialize <- function(buffer) {
|
||||
cachelist <- list()
|
||||
handle <- .Call(XGBoosterCreate_R, cachelist)
|
||||
.Call(XGBoosterUnserializeFromBuffer_R, handle, buffer)
|
||||
tryCatch(
|
||||
.Call(XGBoosterUnserializeFromBuffer_R, handle, buffer),
|
||||
error = function(e) {
|
||||
error_msg <- conditionMessage(e)
|
||||
m <- regexec("(src[\\\\/]learner.cc:[0-9]+): Check failed: (header == serialisation_header_)",
|
||||
error_msg, perl = TRUE)
|
||||
groups <- regmatches(error_msg, m)[[1]]
|
||||
if (length(groups) == 3) {
|
||||
warning(paste("The model had been generated by XGBoost version 1.0.0 or earlier and was ",
|
||||
"loaded from a RDS file. We strongly ADVISE AGAINST using saveRDS() ",
|
||||
"function, to ensure that your model can be read in current and upcoming ",
|
||||
"XGBoost releases. Please use xgb.save() instead to preserve models for the ",
|
||||
"long term. For more details and explanation, see ",
|
||||
"https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html",
|
||||
sep = ""))
|
||||
.Call(XGBoosterLoadModelFromRaw_R, handle, buffer)
|
||||
} else {
|
||||
stop(e)
|
||||
}
|
||||
})
|
||||
class(handle) <- "xgb.Booster.handle"
|
||||
return (handle)
|
||||
}
|
||||
|
||||
16
R-package/configure
vendored
16
R-package/configure
vendored
@@ -613,6 +613,7 @@ infodir
|
||||
docdir
|
||||
oldincludedir
|
||||
includedir
|
||||
runstatedir
|
||||
localstatedir
|
||||
sharedstatedir
|
||||
sysconfdir
|
||||
@@ -682,6 +683,7 @@ datadir='${datarootdir}'
|
||||
sysconfdir='${prefix}/etc'
|
||||
sharedstatedir='${prefix}/com'
|
||||
localstatedir='${prefix}/var'
|
||||
runstatedir='${localstatedir}/run'
|
||||
includedir='${prefix}/include'
|
||||
oldincludedir='/usr/include'
|
||||
docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
|
||||
@@ -934,6 +936,15 @@ do
|
||||
| -silent | --silent | --silen | --sile | --sil)
|
||||
silent=yes ;;
|
||||
|
||||
-runstatedir | --runstatedir | --runstatedi | --runstated \
|
||||
| --runstate | --runstat | --runsta | --runst | --runs \
|
||||
| --run | --ru | --r)
|
||||
ac_prev=runstatedir ;;
|
||||
-runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
|
||||
| --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
|
||||
| --run=* | --ru=* | --r=*)
|
||||
runstatedir=$ac_optarg ;;
|
||||
|
||||
-sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
|
||||
ac_prev=sbindir ;;
|
||||
-sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
|
||||
@@ -1071,7 +1082,7 @@ fi
|
||||
for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \
|
||||
datadir sysconfdir sharedstatedir localstatedir includedir \
|
||||
oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
|
||||
libdir localedir mandir
|
||||
libdir localedir mandir runstatedir
|
||||
do
|
||||
eval ac_val=\$$ac_var
|
||||
# Remove trailing slashes.
|
||||
@@ -1224,6 +1235,7 @@ Fine tuning of the installation directories:
|
||||
--sysconfdir=DIR read-only single-machine data [PREFIX/etc]
|
||||
--sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com]
|
||||
--localstatedir=DIR modifiable single-machine data [PREFIX/var]
|
||||
--runstatedir=DIR modifiable per-process data [LOCALSTATEDIR/run]
|
||||
--libdir=DIR object code libraries [EPREFIX/lib]
|
||||
--includedir=DIR C header files [PREFIX/include]
|
||||
--oldincludedir=DIR C header files for non-gcc [/usr/include]
|
||||
@@ -2713,7 +2725,7 @@ main ()
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
${CC} -o conftest conftest.c /usr/local/lib/libomp.dylib -Xclang -fopenmp 2>/dev/null && ./conftest && ac_pkg_openmp=yes
|
||||
${CC} -o conftest conftest.c ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${ac_pkg_openmp}" >&5
|
||||
$as_echo "${ac_pkg_openmp}" >&6; }
|
||||
if test "${ac_pkg_openmp}" = no; then
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
### configure.ac -*- Autoconf -*-
|
||||
|
||||
AC_PREREQ(2.62)
|
||||
AC_PREREQ(2.69)
|
||||
|
||||
AC_INIT([xgboost],[0.6-3],[],[xgboost],[])
|
||||
|
||||
@@ -33,7 +33,7 @@ then
|
||||
ac_pkg_openmp=no
|
||||
AC_MSG_CHECKING([whether OpenMP will work in a package])
|
||||
AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <omp.h>]], [[ return (omp_get_max_threads() <= 1); ]])])
|
||||
${CC} -o conftest conftest.c /usr/local/lib/libomp.dylib -Xclang -fopenmp 2>/dev/null && ./conftest && ac_pkg_openmp=yes
|
||||
${CC} -o conftest conftest.c ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes
|
||||
AC_MSG_RESULT([${ac_pkg_openmp}])
|
||||
if test "${ac_pkg_openmp}" = no; then
|
||||
OPENMP_CXXFLAGS=''
|
||||
|
||||
@@ -17,4 +17,4 @@ Benchmarks
|
||||
Notes
|
||||
====
|
||||
* Contribution of examples, benchmarks is more than welcomed!
|
||||
* If you like to share how you use xgboost to solve your problem, send a pull request:)
|
||||
* If you like to share how you use xgboost to solve your problem, send a pull request :)
|
||||
|
||||
@@ -3,8 +3,8 @@ require(methods)
|
||||
|
||||
# we load in the agaricus dataset
|
||||
# In this example, we are aiming to predict whether a mushroom is edible
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
train <- agaricus.train
|
||||
test <- agaricus.test
|
||||
# the loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1}
|
||||
@@ -58,31 +58,31 @@ xgb.save(bst, "xgboost.model")
|
||||
bst2 <- xgb.load("xgboost.model")
|
||||
pred2 <- predict(bst2, test$data)
|
||||
# pred2 should be identical to pred
|
||||
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
|
||||
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred))))
|
||||
|
||||
# save model to R's raw vector
|
||||
raw = xgb.save.raw(bst)
|
||||
raw <- xgb.save.raw(bst)
|
||||
# load binary model to R
|
||||
bst3 <- xgb.load(raw)
|
||||
pred3 <- predict(bst3, test$data)
|
||||
# pred3 should be identical to pred
|
||||
print(paste("sum(abs(pred3-pred))=", sum(abs(pred3-pred))))
|
||||
print(paste("sum(abs(pred3-pred))=", sum(abs(pred3 - pred))))
|
||||
|
||||
#----------------Advanced features --------------
|
||||
# to use advanced features, we need to put data in xgb.DMatrix
|
||||
dtrain <- xgb.DMatrix(data = train$data, label=train$label)
|
||||
dtest <- xgb.DMatrix(data = test$data, label=test$label)
|
||||
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
|
||||
dtest <- xgb.DMatrix(data = test$data, label = test$label)
|
||||
#---------------Using watchlist----------------
|
||||
# watchlist is a list of xgb.DMatrix, each of them is tagged with name
|
||||
watchlist <- list(train=dtrain, test=dtest)
|
||||
watchlist <- list(train = dtrain, test = dtest)
|
||||
# to train with watchlist, use xgb.train, which contains more advanced features
|
||||
# watchlist allows us to monitor the evaluation result on all data in the list
|
||||
print("Train xgboost using xgb.train with watchlist")
|
||||
bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nrounds=2, watchlist=watchlist,
|
||||
bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist,
|
||||
nthread = 2, objective = "binary:logistic")
|
||||
# we can change evaluation metrics, or use multiple evaluation metrics
|
||||
print("train xgboost using xgb.train with watchlist, watch logloss and error")
|
||||
bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nrounds=2, watchlist=watchlist,
|
||||
bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist,
|
||||
eval_metric = "error", eval_metric = "logloss",
|
||||
nthread = 2, objective = "binary:logistic")
|
||||
|
||||
@@ -90,17 +90,17 @@ bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nrounds=2, watchlist=watchlist
|
||||
xgb.DMatrix.save(dtrain, "dtrain.buffer")
|
||||
# to load it in, simply call xgb.DMatrix
|
||||
dtrain2 <- xgb.DMatrix("dtrain.buffer")
|
||||
bst <- xgb.train(data=dtrain2, max_depth=2, eta=1, nrounds=2, watchlist=watchlist,
|
||||
bst <- xgb.train(data = dtrain2, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist,
|
||||
nthread = 2, objective = "binary:logistic")
|
||||
# information can be extracted from xgb.DMatrix using getinfo
|
||||
label = getinfo(dtest, "label")
|
||||
label <- getinfo(dtest, "label")
|
||||
pred <- predict(bst, dtest)
|
||||
err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
|
||||
err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
|
||||
print(paste("test-error=", err))
|
||||
|
||||
# You can dump the tree you learned using xgb.dump into a text file
|
||||
dump_path = file.path(tempdir(), 'dump.raw.txt')
|
||||
xgb.dump(bst, dump_path, with_stats = T)
|
||||
dump_path <- file.path(tempdir(), 'dump.raw.txt')
|
||||
xgb.dump(bst, dump_path, with_stats = TRUE)
|
||||
|
||||
# Finally, you can check which features are the most important.
|
||||
print("Most important features (look at column Gain):")
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
require(xgboost)
|
||||
# load in the agaricus dataset
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
@@ -11,12 +11,12 @@ watchlist <- list(eval = dtest, train = dtrain)
|
||||
#
|
||||
print('start running example to start from a initial prediction')
|
||||
# train xgboost for 1 round
|
||||
param <- list(max_depth=2, eta=1, nthread = 2, silent=1, objective='binary:logistic')
|
||||
param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic')
|
||||
bst <- xgb.train(param, dtrain, 1, watchlist)
|
||||
# Note: we need the margin value instead of transformed prediction in set_base_margin
|
||||
# do predict with output_margin=TRUE, will always give you margin values before logistic transformation
|
||||
ptrain <- predict(bst, dtrain, outputmargin=TRUE)
|
||||
ptest <- predict(bst, dtest, outputmargin=TRUE)
|
||||
ptrain <- predict(bst, dtrain, outputmargin = TRUE)
|
||||
ptest <- predict(bst, dtest, outputmargin = TRUE)
|
||||
# set the base_margin property of dtrain and dtest
|
||||
# base margin is the base prediction we will boost from
|
||||
setinfo(dtrain, "base_margin", ptrain)
|
||||
|
||||
@@ -9,17 +9,17 @@ require(e1071)
|
||||
# Load Arthritis dataset in memory.
|
||||
data(Arthritis)
|
||||
# Create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance are really good).
|
||||
df <- data.table(Arthritis, keep.rownames = F)
|
||||
df <- data.table(Arthritis, keep.rownames = FALSE)
|
||||
|
||||
# Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features.
|
||||
# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treat them as independant values.
|
||||
df[,AgeDiscret:= as.factor(round(Age/10,0))]
|
||||
df[, AgeDiscret := as.factor(round(Age / 10, 0))]
|
||||
|
||||
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
|
||||
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
|
||||
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
|
||||
|
||||
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
|
||||
df[,ID:=NULL]
|
||||
df[, ID := NULL]
|
||||
|
||||
#-------------Basic Training using XGBoost in caret Library-----------------
|
||||
# Set up control parameters for caret::train
|
||||
|
||||
@@ -19,7 +19,7 @@ if (!require(vcd)) {
|
||||
data(Arthritis)
|
||||
|
||||
# create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance are really good).
|
||||
df <- data.table(Arthritis, keep.rownames = F)
|
||||
df <- data.table(Arthritis, keep.rownames = FALSE)
|
||||
|
||||
# Let's have a look to the data.table
|
||||
cat("Print the dataset\n")
|
||||
@@ -32,17 +32,17 @@ str(df)
|
||||
# Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features.
|
||||
|
||||
# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treat them as independant values.
|
||||
df[,AgeDiscret:= as.factor(round(Age/10,0))]
|
||||
df[, AgeDiscret := as.factor(round(Age / 10, 0))]
|
||||
|
||||
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
|
||||
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
|
||||
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
|
||||
|
||||
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
|
||||
df[,ID:=NULL]
|
||||
df[, ID := NULL]
|
||||
|
||||
# List the different values for the column Treatment: Placebo, Treated.
|
||||
cat("Values of the categorical feature Treatment\n")
|
||||
print(levels(df[,Treatment]))
|
||||
print(levels(df[, Treatment]))
|
||||
|
||||
# Next step, we will transform the categorical data to dummy variables.
|
||||
# This method is also called one hot encoding.
|
||||
@@ -52,7 +52,7 @@ print(levels(df[,Treatment]))
|
||||
#
|
||||
# Formulae Improved~.-1 used below means transform all categorical features but column Improved to binary values.
|
||||
# Column Improved is excluded because it will be our output column, the one we want to predict.
|
||||
sparse_matrix = sparse.model.matrix(Improved~.-1, data = df)
|
||||
sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df)
|
||||
|
||||
cat("Encoding of the sparse Matrix\n")
|
||||
print(sparse_matrix)
|
||||
@@ -61,7 +61,7 @@ print(sparse_matrix)
|
||||
# 1. Set, for all rows, field in Y column to 0;
|
||||
# 2. set Y to 1 when Improved == Marked;
|
||||
# 3. Return Y column
|
||||
output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
|
||||
output_vector <- df[, Y := 0][Improved == "Marked", Y := 1][, Y]
|
||||
|
||||
# Following is the same process as other demo
|
||||
cat("Learning...\n")
|
||||
|
||||
@@ -1,25 +1,25 @@
|
||||
require(xgboost)
|
||||
# load in the agaricus dataset
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
nrounds <- 2
|
||||
param <- list(max_depth=2, eta=1, silent=1, nthread=2, objective='binary:logistic')
|
||||
param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic')
|
||||
|
||||
cat('running cross validation\n')
|
||||
# do cross validation, this will print result out as
|
||||
# [iteration] metric_name:mean_value+std_value
|
||||
# std_value is standard deviation of the metric
|
||||
xgb.cv(param, dtrain, nrounds, nfold=5, metrics={'error'})
|
||||
xgb.cv(param, dtrain, nrounds, nfold = 5, metrics = {'error'})
|
||||
|
||||
cat('running cross validation, disable standard deviation display\n')
|
||||
# do cross validation, this will print result out as
|
||||
# [iteration] metric_name:mean_value+std_value
|
||||
# std_value is standard deviation of the metric
|
||||
xgb.cv(param, dtrain, nrounds, nfold=5,
|
||||
metrics='error', showsd = FALSE)
|
||||
xgb.cv(param, dtrain, nrounds, nfold = 5,
|
||||
metrics = 'error', showsd = FALSE)
|
||||
|
||||
###
|
||||
# you can also do cross validation with cutomized loss function
|
||||
@@ -29,18 +29,18 @@ print ('running cross validation, with cutomsized loss function')
|
||||
|
||||
logregobj <- function(preds, dtrain) {
|
||||
labels <- getinfo(dtrain, "label")
|
||||
preds <- 1/(1 + exp(-preds))
|
||||
preds <- 1 / (1 + exp(-preds))
|
||||
grad <- preds - labels
|
||||
hess <- preds * (1 - preds)
|
||||
return(list(grad = grad, hess = hess))
|
||||
}
|
||||
evalerror <- function(preds, dtrain) {
|
||||
labels <- getinfo(dtrain, "label")
|
||||
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
|
||||
err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
|
||||
return(list(metric = "error", value = err))
|
||||
}
|
||||
|
||||
param <- list(max_depth=2, eta=1, silent=1,
|
||||
param <- list(max_depth = 2, eta = 1,
|
||||
objective = logregobj, eval_metric = evalerror)
|
||||
# train with customized objective
|
||||
xgb.cv(params = param, data = dtrain, nrounds = nrounds, nfold = 5)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
require(xgboost)
|
||||
# load in the agaricus dataset
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
@@ -15,7 +15,7 @@ num_round <- 2
|
||||
# this is loglikelihood loss
|
||||
logregobj <- function(preds, dtrain) {
|
||||
labels <- getinfo(dtrain, "label")
|
||||
preds <- 1/(1 + exp(-preds))
|
||||
preds <- 1 / (1 + exp(-preds))
|
||||
grad <- preds - labels
|
||||
hess <- preds * (1 - preds)
|
||||
return(list(grad = grad, hess = hess))
|
||||
@@ -29,12 +29,12 @@ logregobj <- function(preds, dtrain) {
|
||||
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
|
||||
evalerror <- function(preds, dtrain) {
|
||||
labels <- getinfo(dtrain, "label")
|
||||
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
|
||||
err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
|
||||
return(list(metric = "error", value = err))
|
||||
}
|
||||
|
||||
param <- list(max_depth=2, eta=1, nthread = 2, verbosity=0,
|
||||
objective=logregobj, eval_metric=evalerror)
|
||||
param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0,
|
||||
objective = logregobj, eval_metric = evalerror)
|
||||
print ('start training with user customized objective')
|
||||
# training with customized objective, we can also do step by step training
|
||||
# simply look at xgboost.py's implementation of train
|
||||
@@ -52,13 +52,13 @@ attr(dtrain, 'label') <- getinfo(dtrain, 'label')
|
||||
logregobjattr <- function(preds, dtrain) {
|
||||
# now you can access the attribute in customized function
|
||||
labels <- attr(dtrain, 'label')
|
||||
preds <- 1/(1 + exp(-preds))
|
||||
preds <- 1 / (1 + exp(-preds))
|
||||
grad <- preds - labels
|
||||
hess <- preds * (1 - preds)
|
||||
return(list(grad = grad, hess = hess))
|
||||
}
|
||||
param <- list(max_depth=2, eta=1, nthread = 2, verbosity=0,
|
||||
objective=logregobjattr, eval_metric=evalerror)
|
||||
param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0,
|
||||
objective = logregobjattr, eval_metric = evalerror)
|
||||
print ('start training with user customized objective, with additional attributes in DMatrix')
|
||||
# training with customized objective, we can also do step by step training
|
||||
# simply look at xgboost.py's implementation of train
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
require(xgboost)
|
||||
# load in the agaricus dataset
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
# note: for customized objective function, we leave objective as default
|
||||
# note: what we are getting is margin value in prediction
|
||||
# you must know what you are doing
|
||||
param <- list(max_depth=2, eta=1, nthread=2, verbosity=0)
|
||||
param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0)
|
||||
watchlist <- list(eval = dtest)
|
||||
num_round <- 20
|
||||
# user define objective function, given prediction, return gradient and second order gradient
|
||||
# this is loglikelihood loss
|
||||
logregobj <- function(preds, dtrain) {
|
||||
labels <- getinfo(dtrain, "label")
|
||||
preds <- 1/(1 + exp(-preds))
|
||||
preds <- 1 / (1 + exp(-preds))
|
||||
grad <- preds - labels
|
||||
hess <- preds * (1 - preds)
|
||||
return(list(grad = grad, hess = hess))
|
||||
@@ -27,7 +27,7 @@ logregobj <- function(preds, dtrain) {
|
||||
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
|
||||
evalerror <- function(preds, dtrain) {
|
||||
labels <- getinfo(dtrain, "label")
|
||||
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
|
||||
err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
|
||||
return(list(metric = "error", value = err))
|
||||
}
|
||||
print ('start training with early Stopping setting')
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
require(xgboost)
|
||||
# load in the agaricus dataset
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
##
|
||||
@@ -30,5 +30,4 @@ num_round <- 2
|
||||
bst <- xgb.train(param, dtrain, num_round, watchlist)
|
||||
ypred <- predict(bst, dtest)
|
||||
labels <- getinfo(dtest, 'label')
|
||||
cat('error of preds=', mean(as.numeric(ypred>0.5)!=labels),'\n')
|
||||
|
||||
cat('error of preds=', mean(as.numeric(ypred > 0.5) != labels), '\n')
|
||||
|
||||
@@ -21,8 +21,8 @@ m <- X[, sel] %*% betas - 1 + rnorm(N)
|
||||
y <- rbinom(N, 1, plogis(m))
|
||||
|
||||
tr <- sample.int(N, N * 0.75)
|
||||
dtrain <- xgb.DMatrix(X[tr,], label = y[tr])
|
||||
dtest <- xgb.DMatrix(X[-tr,], label = y[-tr])
|
||||
dtrain <- xgb.DMatrix(X[tr, ], label = y[tr])
|
||||
dtest <- xgb.DMatrix(X[-tr, ], label = y[-tr])
|
||||
wl <- list(train = dtrain, test = dtest)
|
||||
|
||||
# An example of running 'gpu_hist' algorithm
|
||||
|
||||
@@ -4,33 +4,38 @@ library(data.table)
|
||||
set.seed(1024)
|
||||
|
||||
# Function to obtain a list of interactions fitted in trees, requires input of maximum depth
|
||||
treeInteractions <- function(input_tree, input_max_depth){
|
||||
trees <- copy(input_tree) # copy tree input to prevent overwriting
|
||||
treeInteractions <- function(input_tree, input_max_depth) {
|
||||
ID_merge <- i.id <- i.feature <- NULL # Suppress warning "no visible binding for global variable"
|
||||
|
||||
trees <- data.table::copy(input_tree) # copy tree input to prevent overwriting
|
||||
if (input_max_depth < 2) return(list()) # no interactions if max depth < 2
|
||||
if (nrow(input_tree) == 1) return(list())
|
||||
|
||||
# Attach parent nodes
|
||||
for (i in 2:input_max_depth){
|
||||
if (i == 2) trees[, ID_merge:=ID] else trees[, ID_merge:=get(paste0('parent_',i-2))]
|
||||
parents_left <- trees[!is.na(Split), list(i.id=ID, i.feature=Feature, ID_merge=Yes)]
|
||||
parents_right <- trees[!is.na(Split), list(i.id=ID, i.feature=Feature, ID_merge=No)]
|
||||
for (i in 2:input_max_depth) {
|
||||
if (i == 2) trees[, ID_merge := ID] else trees[, ID_merge := get(paste0('parent_', i - 2))]
|
||||
parents_left <- trees[!is.na(Split), list(i.id = ID, i.feature = Feature, ID_merge = Yes)]
|
||||
parents_right <- trees[!is.na(Split), list(i.id = ID, i.feature = Feature, ID_merge = No)]
|
||||
|
||||
setorderv(trees, 'ID_merge')
|
||||
setorderv(parents_left, 'ID_merge')
|
||||
setorderv(parents_right, 'ID_merge')
|
||||
data.table::setorderv(trees, 'ID_merge')
|
||||
data.table::setorderv(parents_left, 'ID_merge')
|
||||
data.table::setorderv(parents_right, 'ID_merge')
|
||||
|
||||
trees <- merge(trees, parents_left, by='ID_merge', all.x=T)
|
||||
trees[!is.na(i.id), c(paste0('parent_', i-1), paste0('parent_feat_', i-1)):=list(i.id, i.feature)]
|
||||
trees[, c('i.id','i.feature'):=NULL]
|
||||
trees <- merge(trees, parents_left, by = 'ID_merge', all.x = TRUE)
|
||||
trees[!is.na(i.id), c(paste0('parent_', i - 1), paste0('parent_feat_', i - 1))
|
||||
:= list(i.id, i.feature)]
|
||||
trees[, c('i.id', 'i.feature') := NULL]
|
||||
|
||||
trees <- merge(trees, parents_right, by='ID_merge', all.x=T)
|
||||
trees[!is.na(i.id), c(paste0('parent_', i-1), paste0('parent_feat_', i-1)):=list(i.id, i.feature)]
|
||||
trees[, c('i.id','i.feature'):=NULL]
|
||||
trees <- merge(trees, parents_right, by = 'ID_merge', all.x = TRUE)
|
||||
trees[!is.na(i.id), c(paste0('parent_', i - 1), paste0('parent_feat_', i - 1))
|
||||
:= list(i.id, i.feature)]
|
||||
trees[, c('i.id', 'i.feature') := NULL]
|
||||
}
|
||||
|
||||
# Extract nodes with interactions
|
||||
interaction_trees <- trees[!is.na(Split) & !is.na(parent_1),
|
||||
c('Feature',paste0('parent_feat_',1:(input_max_depth-1))), with=F]
|
||||
c('Feature', paste0('parent_feat_', 1:(input_max_depth - 1))),
|
||||
with = FALSE]
|
||||
interaction_trees_split <- split(interaction_trees, 1:nrow(interaction_trees))
|
||||
interaction_list <- lapply(interaction_trees_split, as.character)
|
||||
|
||||
@@ -47,56 +52,59 @@ treeInteractions <- function(input_tree, input_max_depth){
|
||||
|
||||
# Generate sample data
|
||||
x <- list()
|
||||
for (i in 1:10){
|
||||
x[[i]] = i*rnorm(1000, 10)
|
||||
for (i in 1:10) {
|
||||
x[[i]] <- i * rnorm(1000, 10)
|
||||
}
|
||||
x <- as.data.table(x)
|
||||
|
||||
y = -1*x[, rowSums(.SD)] + x[['V1']]*x[['V2']] + x[['V3']]*x[['V4']]*x[['V5']] + rnorm(1000, 0.001) + 3*sin(x[['V7']])
|
||||
y <- -1 * x[, rowSums(.SD)] + x[['V1']] * x[['V2']] + x[['V3']] * x[['V4']] * x[['V5']]
|
||||
+ rnorm(1000, 0.001) + 3 * sin(x[['V7']])
|
||||
|
||||
train = as.matrix(x)
|
||||
train <- as.matrix(x)
|
||||
|
||||
# Interaction constraint list (column names form)
|
||||
interaction_list <- list(c('V1','V2'),c('V3','V4','V5'))
|
||||
interaction_list <- list(c('V1', 'V2'), c('V3', 'V4', 'V5'))
|
||||
|
||||
# Convert interaction constraint list into feature index form
|
||||
cols2ids <- function(object, col_names) {
|
||||
LUT <- seq_along(col_names) - 1
|
||||
names(LUT) <- col_names
|
||||
rapply(object, function(x) LUT[x], classes="character", how="replace")
|
||||
rapply(object, function(x) LUT[x], classes = "character", how = "replace")
|
||||
}
|
||||
interaction_list_fid = cols2ids(interaction_list, colnames(train))
|
||||
interaction_list_fid <- cols2ids(interaction_list, colnames(train))
|
||||
|
||||
# Fit model with interaction constraints
|
||||
bst = xgboost(data = train, label = y, max_depth = 4,
|
||||
eta = 0.1, nthread = 2, nrounds = 1000,
|
||||
interaction_constraints = interaction_list_fid)
|
||||
bst <- xgboost(data = train, label = y, max_depth = 4,
|
||||
eta = 0.1, nthread = 2, nrounds = 1000,
|
||||
interaction_constraints = interaction_list_fid)
|
||||
|
||||
bst_tree <- xgb.model.dt.tree(colnames(train), bst)
|
||||
bst_interactions <- treeInteractions(bst_tree, 4) # interactions constrained to combinations of V1*V2 and V3*V4*V5
|
||||
bst_interactions <- treeInteractions(bst_tree, 4)
|
||||
# interactions constrained to combinations of V1*V2 and V3*V4*V5
|
||||
|
||||
# Fit model without interaction constraints
|
||||
bst2 = xgboost(data = train, label = y, max_depth = 4,
|
||||
eta = 0.1, nthread = 2, nrounds = 1000)
|
||||
bst2 <- xgboost(data = train, label = y, max_depth = 4,
|
||||
eta = 0.1, nthread = 2, nrounds = 1000)
|
||||
|
||||
bst2_tree <- xgb.model.dt.tree(colnames(train), bst2)
|
||||
bst2_interactions <- treeInteractions(bst2_tree, 4) # much more interactions
|
||||
|
||||
# Fit model with both interaction and monotonicity constraints
|
||||
bst3 = xgboost(data = train, label = y, max_depth = 4,
|
||||
eta = 0.1, nthread = 2, nrounds = 1000,
|
||||
interaction_constraints = interaction_list_fid,
|
||||
monotone_constraints = c(-1,0,0,0,0,0,0,0,0,0))
|
||||
bst3 <- xgboost(data = train, label = y, max_depth = 4,
|
||||
eta = 0.1, nthread = 2, nrounds = 1000,
|
||||
interaction_constraints = interaction_list_fid,
|
||||
monotone_constraints = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0))
|
||||
|
||||
bst3_tree <- xgb.model.dt.tree(colnames(train), bst3)
|
||||
bst3_interactions <- treeInteractions(bst3_tree, 4) # interactions still constrained to combinations of V1*V2 and V3*V4*V5
|
||||
bst3_interactions <- treeInteractions(bst3_tree, 4)
|
||||
# interactions still constrained to combinations of V1*V2 and V3*V4*V5
|
||||
|
||||
# Show monotonic constraints still apply by checking scores after incrementing V1
|
||||
x1 <- sort(unique(x[['V1']]))
|
||||
for (i in 1:length(x1)){
|
||||
testdata <- copy(x[, -c('V1')])
|
||||
testdata[['V1']] <- x1[i]
|
||||
testdata <- testdata[, paste0('V',1:10), with=F]
|
||||
testdata <- testdata[, paste0('V', 1:10), with = FALSE]
|
||||
pred <- predict(bst3, as.matrix(testdata))
|
||||
|
||||
# Should not print out anything due to monotonic constraints
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
data(mtcars)
|
||||
head(mtcars)
|
||||
bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11],
|
||||
objective='count:poisson',nrounds=5)
|
||||
pred = predict(bst,as.matrix(mtcars[,-11]))
|
||||
sqrt(mean((pred-mtcars[,11])^2))
|
||||
|
||||
bst <- xgboost(data = as.matrix(mtcars[, -11]), label = mtcars[, 11],
|
||||
objective = 'count:poisson', nrounds = 5)
|
||||
pred <- predict(bst, as.matrix(mtcars[, -11]))
|
||||
sqrt(mean((pred - mtcars[, 11]) ^ 2))
|
||||
|
||||
@@ -1,23 +1,23 @@
|
||||
require(xgboost)
|
||||
# load in the agaricus dataset
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
|
||||
param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic')
|
||||
watchlist <- list(eval = dtest, train = dtrain)
|
||||
nrounds = 2
|
||||
nrounds <- 2
|
||||
|
||||
# training the model for two rounds
|
||||
bst = xgb.train(param, dtrain, nrounds, nthread = 2, watchlist)
|
||||
bst <- xgb.train(param, dtrain, nrounds, nthread = 2, watchlist)
|
||||
cat('start testing prediction from first n trees\n')
|
||||
labels <- getinfo(dtest,'label')
|
||||
labels <- getinfo(dtest, 'label')
|
||||
|
||||
### predict using first 1 tree
|
||||
ypred1 = predict(bst, dtest, ntreelimit=1)
|
||||
ypred1 <- predict(bst, dtest, ntreelimit = 1)
|
||||
# by default, we predict using all the trees
|
||||
ypred2 = predict(bst, dtest)
|
||||
ypred2 <- predict(bst, dtest)
|
||||
|
||||
cat('error of ypred1=', mean(as.numeric(ypred1>0.5)!=labels),'\n')
|
||||
cat('error of ypred2=', mean(as.numeric(ypred2>0.5)!=labels),'\n')
|
||||
cat('error of ypred1=', mean(as.numeric(ypred1 > 0.5) != labels), '\n')
|
||||
cat('error of ypred2=', mean(as.numeric(ypred2 > 0.5) != labels), '\n')
|
||||
|
||||
@@ -5,34 +5,34 @@ require(Matrix)
|
||||
set.seed(1982)
|
||||
|
||||
# load in the agaricus dataset
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
|
||||
nrounds = 4
|
||||
param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic')
|
||||
nrounds <- 4
|
||||
|
||||
# training the model for two rounds
|
||||
bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)
|
||||
bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)
|
||||
|
||||
# Model accuracy without new features
|
||||
accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
|
||||
accuracy.before <- (sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label)
|
||||
/ length(agaricus.test$label))
|
||||
|
||||
# by default, we predict using all the trees
|
||||
|
||||
pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
|
||||
pred_with_leaf <- predict(bst, dtest, predleaf = TRUE)
|
||||
head(pred_with_leaf)
|
||||
|
||||
create.new.tree.features <- function(model, original.features){
|
||||
pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
|
||||
cols <- list()
|
||||
for(i in 1:model$niter){
|
||||
for (i in 1:model$niter) {
|
||||
# max is not the real max but it s not important for the purpose of adding features
|
||||
leaf.id <- sort(unique(pred_with_leaf[,i]))
|
||||
cols[[i]] <- factor(x = pred_with_leaf[,i], level = leaf.id)
|
||||
leaf.id <- sort(unique(pred_with_leaf[, i]))
|
||||
cols[[i]] <- factor(x = pred_with_leaf[, i], level = leaf.id)
|
||||
}
|
||||
cbind(original.features, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
|
||||
cbind(original.features, sparse.model.matrix(~ . - 1, as.data.frame(cols)))
|
||||
}
|
||||
|
||||
# Convert previous features to one hot encoding
|
||||
@@ -47,7 +47,9 @@ watchlist <- list(train = new.dtrain)
|
||||
bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2)
|
||||
|
||||
# Model accuracy with new features
|
||||
accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
|
||||
accuracy.after <- (sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label)
|
||||
/ length(agaricus.test$label))
|
||||
|
||||
# Here the accuracy was already good and is now perfect.
|
||||
cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
|
||||
cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now",
|
||||
accuracy.after, "!\n"))
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
# running all scripts in demo folder
|
||||
demo(basic_walkthrough)
|
||||
demo(custom_objective)
|
||||
demo(boost_from_prediction)
|
||||
demo(predict_first_ntree)
|
||||
demo(generalized_linear_model)
|
||||
demo(cross_validation)
|
||||
demo(create_sparse_matrix)
|
||||
demo(predict_leaf_indices)
|
||||
demo(early_stopping)
|
||||
demo(poisson_regression)
|
||||
demo(caret_wrapper)
|
||||
demo(tweedie_regression)
|
||||
#demo(gpu_accelerated) # can only run when built with GPU support
|
||||
demo(basic_walkthrough, package = 'xgboost')
|
||||
demo(custom_objective, package = 'xgboost')
|
||||
demo(boost_from_prediction, package = 'xgboost')
|
||||
demo(predict_first_ntree, package = 'xgboost')
|
||||
demo(generalized_linear_model, package = 'xgboost')
|
||||
demo(cross_validation, package = 'xgboost')
|
||||
demo(create_sparse_matrix, package = 'xgboost')
|
||||
demo(predict_leaf_indices, package = 'xgboost')
|
||||
demo(early_stopping, package = 'xgboost')
|
||||
demo(poisson_regression, package = 'xgboost')
|
||||
demo(caret_wrapper, package = 'xgboost')
|
||||
demo(tweedie_regression, package = 'xgboost')
|
||||
#demo(gpu_accelerated, package = 'xgboost') # can only run when built with GPU support
|
||||
|
||||
6
R-package/demo/tweedie_regression.R
Executable file → Normal file
6
R-package/demo/tweedie_regression.R
Executable file → Normal file
@@ -8,12 +8,12 @@ data(AutoClaim)
|
||||
dt <- data.table(AutoClaim)
|
||||
|
||||
# exclude these columns from the model matrix
|
||||
exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY')
|
||||
exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY')
|
||||
|
||||
# retains the missing values
|
||||
# NOTE: this dataset is comes ready out of the box
|
||||
options(na.action = 'na.pass')
|
||||
x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = F])
|
||||
x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = FALSE])
|
||||
options(na.action = 'na.omit')
|
||||
|
||||
# response
|
||||
@@ -46,4 +46,4 @@ var_imp <- xgb.importance(attr(x, 'Dimnames')[[2]], model = bst)
|
||||
|
||||
preds <- predict(bst, d_train)
|
||||
|
||||
rmse <- sqrt(sum(mean((y - preds)^2)))
|
||||
rmse <- sqrt(sum(mean((y - preds) ^ 2)))
|
||||
|
||||
96
R-package/inst/make-r-def.R
Normal file
96
R-package/inst/make-r-def.R
Normal file
@@ -0,0 +1,96 @@
|
||||
# [description]
|
||||
# Create a definition file (.def) from a .dll file, using objdump. This
|
||||
# is used by FindLibR.cmake when building the R package with MSVC.
|
||||
#
|
||||
# [usage]
|
||||
#
|
||||
# Rscript make-r-def.R something.dll something.def
|
||||
#
|
||||
# [references]
|
||||
# * https://www.cs.colorado.edu/~main/cs1300/doc/mingwfaq.html
|
||||
|
||||
args <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
IN_DLL_FILE <- args[[1L]]
|
||||
OUT_DEF_FILE <- args[[2L]]
|
||||
DLL_BASE_NAME <- basename(IN_DLL_FILE)
|
||||
|
||||
message(sprintf("Creating '%s' from '%s'", OUT_DEF_FILE, IN_DLL_FILE))
|
||||
|
||||
# system() will not raise an R exception if the process called
|
||||
# fails. Wrapping it here to get that behavior.
|
||||
#
|
||||
# system() introduces a lot of overhead, at least on Windows,
|
||||
# so trying processx if it is available
|
||||
.pipe_shell_command_to_stdout <- function(command, args, out_file) {
|
||||
has_processx <- suppressMessages({
|
||||
suppressWarnings({
|
||||
require("processx") # nolint
|
||||
})
|
||||
})
|
||||
if (has_processx) {
|
||||
p <- processx::process$new(
|
||||
command = command
|
||||
, args = args
|
||||
, stdout = out_file
|
||||
, windows_verbatim_args = FALSE
|
||||
)
|
||||
invisible(p$wait())
|
||||
} else {
|
||||
message(paste0(
|
||||
"Using system2() to run shell commands. Installing "
|
||||
, "'processx' with install.packages('processx') might "
|
||||
, "make this faster."
|
||||
))
|
||||
exit_code <- system2(
|
||||
command = command
|
||||
, args = shQuote(args)
|
||||
, stdout = out_file
|
||||
)
|
||||
if (exit_code != 0L) {
|
||||
stop(paste0("Command failed with exit code: ", exit_code))
|
||||
}
|
||||
}
|
||||
return(invisible(NULL))
|
||||
}
|
||||
|
||||
# use objdump to dump all the symbols
|
||||
OBJDUMP_FILE <- "objdump-out.txt"
|
||||
.pipe_shell_command_to_stdout(
|
||||
command = "objdump"
|
||||
, args = c("-p", IN_DLL_FILE)
|
||||
, out_file = OBJDUMP_FILE
|
||||
)
|
||||
|
||||
objdump_results <- readLines(OBJDUMP_FILE)
|
||||
result <- file.remove(OBJDUMP_FILE)
|
||||
|
||||
# Only one table in the objdump results matters for our purposes,
|
||||
# see https://www.cs.colorado.edu/~main/cs1300/doc/mingwfaq.html
|
||||
start_index <- which(
|
||||
grepl(
|
||||
pattern = "[Ordinal/Name Pointer] Table"
|
||||
, x = objdump_results
|
||||
, fixed = TRUE
|
||||
)
|
||||
)
|
||||
empty_lines <- which(objdump_results == "")
|
||||
end_of_table <- empty_lines[empty_lines > start_index][1L]
|
||||
|
||||
# Read the contents of the table
|
||||
exported_symbols <- objdump_results[(start_index + 1L):end_of_table]
|
||||
exported_symbols <- gsub("\t", "", exported_symbols)
|
||||
exported_symbols <- gsub(".*\\] ", "", exported_symbols)
|
||||
exported_symbols <- gsub(" ", "", exported_symbols)
|
||||
|
||||
# Write R.def file
|
||||
writeLines(
|
||||
text = c(
|
||||
paste0("LIBRARY \"", DLL_BASE_NAME, "\"")
|
||||
, "EXPORTS"
|
||||
, exported_symbols
|
||||
)
|
||||
, con = OUT_DEF_FILE
|
||||
, sep = "\n"
|
||||
)
|
||||
message(sprintf("Successfully created '%s'", OUT_DEF_FILE))
|
||||
64
R-package/man/a-compatibility-note-for-saveRDS-save.Rd
Normal file
64
R-package/man/a-compatibility-note-for-saveRDS-save.Rd
Normal file
@@ -0,0 +1,64 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/utils.R
|
||||
\name{a-compatibility-note-for-saveRDS-save}
|
||||
\alias{a-compatibility-note-for-saveRDS-save}
|
||||
\title{Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of
|
||||
models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.}
|
||||
\description{
|
||||
It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or
|
||||
\code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist
|
||||
\code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if
|
||||
the model is to be accessed in the future. If you train a model with the current version of
|
||||
XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be
|
||||
accessible in later releases of XGBoost. To ensure that your model can be accessed in future
|
||||
releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
|
||||
}
|
||||
\details{
|
||||
Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
|
||||
the JSON format by specifying the JSON extension. To read the model back, use
|
||||
\code{\link{xgb.load}}.
|
||||
|
||||
Use \code{\link{xgb.save.raw}} to save the XGBoost model as a sequence (vector) of raw bytes
|
||||
in a future-proof manner. Future releases of XGBoost will be able to read the raw bytes and
|
||||
re-construct the corresponding model. To read the model back, use \code{\link{xgb.load.raw}}.
|
||||
The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
|
||||
as part of another R object.
|
||||
|
||||
Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the
|
||||
model but also internal configurations and parameters, and its format is not stable across
|
||||
multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing.
|
||||
|
||||
For more details and explanation about model persistence and archival, consult the page
|
||||
\url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.
|
||||
}
|
||||
\examples{
|
||||
data(agaricus.train, package='xgboost')
|
||||
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
|
||||
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||
|
||||
# Save as a stand-alone file; load it with xgb.load()
|
||||
xgb.save(bst, 'xgb.model')
|
||||
bst2 <- xgb.load('xgb.model')
|
||||
|
||||
# Save as a stand-alone file (JSON); load it with xgb.load()
|
||||
xgb.save(bst, 'xgb.model.json')
|
||||
bst2 <- xgb.load('xgb.model.json')
|
||||
if (file.exists('xgb.model.json')) file.remove('xgb.model.json')
|
||||
|
||||
# Save as a raw byte vector; load it with xgb.load.raw()
|
||||
xgb_bytes <- xgb.save.raw(bst)
|
||||
bst2 <- xgb.load.raw(xgb_bytes)
|
||||
|
||||
# Persist XGBoost model as part of another R object
|
||||
obj <- list(xgb_model_bytes = xgb.save.raw(bst), description = "My first XGBoost model")
|
||||
# Persist the R object. Here, saveRDS() is okay, since it doesn't persist
|
||||
# xgb.Booster directly. What's being persisted is the future-proof byte representation
|
||||
# as given by xgb.save.raw().
|
||||
saveRDS(obj, 'my_object.rds')
|
||||
# Read back the R object
|
||||
obj2 <- readRDS('my_object.rds')
|
||||
# Re-construct xgb.Booster object from the bytes
|
||||
bst2 <- xgb.load.raw(obj2$xgb_model_bytes)
|
||||
if (file.exists('my_object.rds')) file.remove('my_object.rds')
|
||||
|
||||
}
|
||||
@@ -38,6 +38,8 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_dep
|
||||
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||
saveRDS(bst, "xgb.model.rds")
|
||||
|
||||
# Warning: The resulting RDS file is only compatible with the current XGBoost version.
|
||||
# Refer to the section titled "a-compatibility-note-for-saveRDS-save".
|
||||
bst1 <- readRDS("xgb.model.rds")
|
||||
if (file.exists("xgb.model.rds")) file.remove("xgb.model.rds")
|
||||
# the handle is invalid:
|
||||
|
||||
@@ -28,12 +28,15 @@ xgb.cv(
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{params}{the list of parameters. Commonly used ones are:
|
||||
\item{params}{the list of parameters. The complete list of parameters is
|
||||
available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
|
||||
is a shorter summary:
|
||||
\itemize{
|
||||
\item \code{objective} objective function, common ones are
|
||||
\itemize{
|
||||
\item \code{reg:squarederror} Regression with squared loss
|
||||
\item \code{binary:logistic} logistic regression for classification
|
||||
\item \code{reg:squarederror} Regression with squared loss.
|
||||
\item \code{binary:logistic} logistic regression for classification.
|
||||
\item See \code{\link[=xgb.train]{xgb.train}()} for complete list of objectives.
|
||||
}
|
||||
\item \code{eta} step size of each boosting step
|
||||
\item \code{max_depth} maximum depth of the tree
|
||||
@@ -151,7 +154,7 @@ The cross-validation process is then repeated \code{nrounds} times, with each of
|
||||
|
||||
All observations are used for both training and validation.
|
||||
|
||||
Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
|
||||
Adapted from \url{https://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29}
|
||||
}
|
||||
\examples{
|
||||
data(agaricus.train, package='xgboost')
|
||||
|
||||
@@ -22,7 +22,11 @@ of \code{\link{xgb.train}}.
|
||||
|
||||
Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}}
|
||||
or \code{\link[base]{save}}). However, it would then only be compatible with R, and
|
||||
corresponding R-methods would need to be used to load it.
|
||||
corresponding R-methods would need to be used to load it. Moreover, persisting the model with
|
||||
\code{\link[base]{readRDS}} or \code{\link[base]{save}}) will cause compatibility problems in
|
||||
future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn
|
||||
how to persist models in a future-proof way, i.e. to make the model accessible in future
|
||||
releases of XGBoost.
|
||||
}
|
||||
\examples{
|
||||
data(agaricus.train, package='xgboost')
|
||||
|
||||
@@ -42,9 +42,9 @@ xgboost(
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{params}{the list of parameters.
|
||||
The complete list of parameters is available at \url{http://xgboost.readthedocs.io/en/latest/parameter.html}.
|
||||
Below is a shorter summary:
|
||||
\item{params}{the list of parameters. The complete list of parameters is
|
||||
available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
|
||||
is a shorter summary:
|
||||
|
||||
1. General Parameters
|
||||
|
||||
@@ -82,13 +82,23 @@ xgboost(
|
||||
\item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below:
|
||||
\itemize{
|
||||
\item \code{reg:squarederror} Regression with squared loss (Default).
|
||||
\item \code{reg:squaredlogerror}: regression with squared log loss \eqn{1/2 * (log(pred + 1) - log(label + 1))^2}. All inputs are required to be greater than -1. Also, see metric rmsle for possible issue with this objective.
|
||||
\item \code{reg:logistic} logistic regression.
|
||||
\item \code{reg:pseudohubererror}: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
|
||||
\item \code{binary:logistic} logistic regression for binary classification. Output probability.
|
||||
\item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
|
||||
\item \code{num_class} set the number of classes. To use only with multiclass objectives.
|
||||
\item \code{binary:hinge}: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
|
||||
\item \code{count:poisson}: poisson regression for count data, output mean of poisson distribution. \code{max_delta_step} is set to 0.7 by default in poisson regression (used to safeguard optimization).
|
||||
\item \code{survival:cox}: Cox regression for right censored survival time data (negative values are considered right censored). Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional hazard function \code{h(t) = h0(t) * HR)}.
|
||||
\item \code{survival:aft}: Accelerated failure time model for censored survival time data. See \href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time} for details.
|
||||
\item \code{aft_loss_distribution}: Probabilty Density Function used by \code{survival:aft} and \code{aft-nloglik} metric.
|
||||
\item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
|
||||
\item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
|
||||
\item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
|
||||
\item \code{rank:ndcg}: Use LambdaMART to perform list-wise ranking where \href{https://en.wikipedia.org/wiki/Discounted_cumulative_gain}{Normalized Discounted Cumulative Gain (NDCG)} is maximized.
|
||||
\item \code{rank:map}: Use LambdaMART to perform list-wise ranking where \href{https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision}{Mean Average Precision (MAP)} is maximized.
|
||||
\item \code{reg:gamma}: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Gamma_distribution#Applications}{gamma-distributed}.
|
||||
\item \code{reg:tweedie}: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Tweedie_distribution#Applications}{Tweedie-distributed}.
|
||||
}
|
||||
\item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
|
||||
\item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
|
||||
@@ -205,16 +215,16 @@ User may set one or several \code{eval_metric} parameters.
|
||||
Note that when using a customized metric, only this single metric can be used.
|
||||
The following is the list of built-in metrics for which Xgboost provides optimized implementation:
|
||||
\itemize{
|
||||
\item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
|
||||
\item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
|
||||
\item \code{mlogloss} multiclass logloss. \url{http://wiki.fast.ai/index.php/Log_Loss}
|
||||
\item \code{rmse} root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error}
|
||||
\item \code{logloss} negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood}
|
||||
\item \code{mlogloss} multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html}
|
||||
\item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
|
||||
By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
|
||||
Different threshold (e.g., 0.) could be specified as "error@0."
|
||||
\item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
|
||||
\item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
|
||||
\item \code{auc} Area under the curve. \url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
|
||||
\item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
|
||||
\item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
|
||||
\item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG}
|
||||
}
|
||||
|
||||
The following callbacks are automatically created when certain parameters are set:
|
||||
|
||||
@@ -3,7 +3,7 @@ PKGROOT=../../
|
||||
ENABLE_STD_THREAD=1
|
||||
# _*_ mode: Makefile; _*_
|
||||
|
||||
CXX_STD = CXX11
|
||||
CXX_STD = CXX14
|
||||
|
||||
XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
|
||||
-DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
|
||||
|
||||
@@ -15,7 +15,7 @@ xgblib:
|
||||
cp -r ../../include .
|
||||
cp -r ../../amalgamation .
|
||||
|
||||
CXX_STD = CXX11
|
||||
CXX_STD = CXX14
|
||||
|
||||
XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
|
||||
-DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
|
||||
|
||||
@@ -375,7 +375,7 @@ SEXP XGBoosterSaveJsonConfig_R(SEXP handle) {
|
||||
|
||||
SEXP XGBoosterLoadJsonConfig_R(SEXP handle, SEXP value) {
|
||||
R_API_BEGIN();
|
||||
XGBoosterLoadJsonConfig(R_ExternalPtrAddr(handle), CHAR(asChar(value)));
|
||||
CHECK_CALL(XGBoosterLoadJsonConfig(R_ExternalPtrAddr(handle), CHAR(asChar(value))));
|
||||
R_API_END();
|
||||
return R_NilValue;
|
||||
}
|
||||
@@ -397,9 +397,9 @@ SEXP XGBoosterSerializeToBuffer_R(SEXP handle) {
|
||||
|
||||
SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw) {
|
||||
R_API_BEGIN();
|
||||
XGBoosterUnserializeFromBuffer(R_ExternalPtrAddr(handle),
|
||||
CHECK_CALL(XGBoosterUnserializeFromBuffer(R_ExternalPtrAddr(handle),
|
||||
RAW(raw),
|
||||
length(raw));
|
||||
length(raw)));
|
||||
R_API_END();
|
||||
return R_NilValue;
|
||||
}
|
||||
|
||||
101
R-package/tests/helper_scripts/generate_models.R
Normal file
101
R-package/tests/helper_scripts/generate_models.R
Normal file
@@ -0,0 +1,101 @@
|
||||
# Script to generate reference models. The reference models are used to test backward compatibility
|
||||
# of saved model files from XGBoost version 0.90 and 1.0.x.
|
||||
library(xgboost)
|
||||
library(Matrix)
|
||||
source('./generate_models_params.R')
|
||||
|
||||
set.seed(0)
|
||||
metadata <- list(
|
||||
kRounds = 2,
|
||||
kRows = 1000,
|
||||
kCols = 4,
|
||||
kForests = 2,
|
||||
kMaxDepth = 2,
|
||||
kClasses = 3
|
||||
)
|
||||
X <- Matrix(data = rnorm(metadata$kRows * metadata$kCols), nrow = metadata$kRows,
|
||||
ncol = metadata$kCols, sparse = TRUE)
|
||||
w <- runif(metadata$kRows)
|
||||
|
||||
version <- packageVersion('xgboost')
|
||||
target_dir <- 'models'
|
||||
|
||||
save_booster <- function (booster, model_name) {
|
||||
booster_bin <- function (model_name) {
|
||||
return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.bin', sep = '')))
|
||||
}
|
||||
booster_json <- function (model_name) {
|
||||
return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.json', sep = '')))
|
||||
}
|
||||
booster_rds <- function (model_name) {
|
||||
return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.rds', sep = '')))
|
||||
}
|
||||
xgb.save(booster, booster_bin(model_name))
|
||||
saveRDS(booster, booster_rds(model_name))
|
||||
if (version >= '1.0.0') {
|
||||
xgb.save(booster, booster_json(model_name))
|
||||
}
|
||||
}
|
||||
|
||||
generate_regression_model <- function () {
|
||||
print('Regression')
|
||||
y <- rnorm(metadata$kRows)
|
||||
|
||||
data <- xgb.DMatrix(X, label = y)
|
||||
params <- list(tree_method = 'hist', num_parallel_tree = metadata$kForests,
|
||||
max_depth = metadata$kMaxDepth)
|
||||
booster <- xgb.train(params, data, nrounds = metadata$kRounds)
|
||||
save_booster(booster, 'reg')
|
||||
}
|
||||
|
||||
generate_logistic_model <- function () {
|
||||
print('Binary classification with logistic loss')
|
||||
y <- sample(0:1, size = metadata$kRows, replace = TRUE)
|
||||
stopifnot(max(y) == 1, min(y) == 0)
|
||||
|
||||
data <- xgb.DMatrix(X, label = y, weight = w)
|
||||
params <- list(tree_method = 'hist', num_parallel_tree = metadata$kForests,
|
||||
max_depth = metadata$kMaxDepth, objective = 'binary:logistic')
|
||||
booster <- xgb.train(params, data, nrounds = metadata$kRounds)
|
||||
save_booster(booster, 'logit')
|
||||
}
|
||||
|
||||
generate_classification_model <- function () {
|
||||
print('Multi-class classification')
|
||||
y <- sample(0:(metadata$kClasses - 1), size = metadata$kRows, replace = TRUE)
|
||||
stopifnot(max(y) == metadata$kClasses - 1, min(y) == 0)
|
||||
|
||||
data <- xgb.DMatrix(X, label = y, weight = w)
|
||||
params <- list(num_class = metadata$kClasses, tree_method = 'hist',
|
||||
num_parallel_tree = metadata$kForests, max_depth = metadata$kMaxDepth,
|
||||
objective = 'multi:softmax')
|
||||
booster <- xgb.train(params, data, nrounds = metadata$kRounds)
|
||||
save_booster(booster, 'cls')
|
||||
}
|
||||
|
||||
generate_ranking_model <- function () {
|
||||
print('Learning to rank')
|
||||
y <- sample(0:4, size = metadata$kRows, replace = TRUE)
|
||||
stopifnot(max(y) == 4, min(y) == 0)
|
||||
kGroups <- 20
|
||||
w <- runif(kGroups)
|
||||
g <- rep(50, times = kGroups)
|
||||
|
||||
data <- xgb.DMatrix(X, label = y, group = g)
|
||||
# setinfo(data, 'weight', w)
|
||||
# ^^^ does not work in version <= 1.1.0; see https://github.com/dmlc/xgboost/issues/5942
|
||||
# So call low-level function XGDMatrixSetInfo_R directly. Since this function is not an exported
|
||||
# symbol, use the triple-colon operator.
|
||||
.Call(xgboost:::XGDMatrixSetInfo_R, data, 'weight', as.numeric(w))
|
||||
params <- list(objective = 'rank:ndcg', num_parallel_tree = metadata$kForests,
|
||||
tree_method = 'hist', max_depth = metadata$kMaxDepth)
|
||||
booster <- xgb.train(params, data, nrounds = metadata$kRounds)
|
||||
save_booster(booster, 'ltr')
|
||||
}
|
||||
|
||||
dir.create(target_dir)
|
||||
|
||||
invisible(generate_regression_model())
|
||||
invisible(generate_logistic_model())
|
||||
invisible(generate_classification_model())
|
||||
invisible(generate_ranking_model())
|
||||
71
R-package/tests/helper_scripts/run_lint.R
Normal file
71
R-package/tests/helper_scripts/run_lint.R
Normal file
@@ -0,0 +1,71 @@
|
||||
library(lintr)
|
||||
library(crayon)
|
||||
|
||||
my_linters <- list(
|
||||
absolute_path_linter = lintr::absolute_path_linter,
|
||||
assignment_linter = lintr::assignment_linter,
|
||||
closed_curly_linter = lintr::closed_curly_linter,
|
||||
commas_linter = lintr::commas_linter,
|
||||
# commented_code_linter = lintr::commented_code_linter,
|
||||
infix_spaces_linter = lintr::infix_spaces_linter,
|
||||
line_length_linter = lintr::line_length_linter,
|
||||
no_tab_linter = lintr::no_tab_linter,
|
||||
object_usage_linter = lintr::object_usage_linter,
|
||||
# snake_case_linter = lintr::snake_case_linter,
|
||||
# multiple_dots_linter = lintr::multiple_dots_linter,
|
||||
object_length_linter = lintr::object_length_linter,
|
||||
open_curly_linter = lintr::open_curly_linter,
|
||||
# single_quotes_linter = lintr::single_quotes_linter,
|
||||
spaces_inside_linter = lintr::spaces_inside_linter,
|
||||
spaces_left_parentheses_linter = lintr::spaces_left_parentheses_linter,
|
||||
trailing_blank_lines_linter = lintr::trailing_blank_lines_linter,
|
||||
trailing_whitespace_linter = lintr::trailing_whitespace_linter,
|
||||
true_false = lintr::T_and_F_symbol_linter
|
||||
)
|
||||
|
||||
results <- lapply(
|
||||
list.files(path = '.', pattern = '\\.[Rr]$', recursive = TRUE),
|
||||
function (r_file) {
|
||||
cat(sprintf("Processing %s ...\n", r_file))
|
||||
list(r_file = r_file,
|
||||
output = lintr::lint(filename = r_file, linters = my_linters))
|
||||
})
|
||||
num_issue <- Reduce(sum, lapply(results, function (e) length(e$output)))
|
||||
|
||||
lint2str <- function(lint_entry) {
|
||||
color <- function(type) {
|
||||
switch(type,
|
||||
"warning" = crayon::magenta,
|
||||
"error" = crayon::red,
|
||||
"style" = crayon::blue,
|
||||
crayon::bold
|
||||
)
|
||||
}
|
||||
|
||||
paste0(
|
||||
lapply(lint_entry$output,
|
||||
function (lint_line) {
|
||||
paste0(
|
||||
crayon::bold(lint_entry$r_file, ":",
|
||||
as.character(lint_line$line_number), ":",
|
||||
as.character(lint_line$column_number), ": ", sep = ""),
|
||||
color(lint_line$type)(lint_line$type, ": ", sep = ""),
|
||||
crayon::bold(lint_line$message), "\n",
|
||||
lint_line$line, "\n",
|
||||
lintr:::highlight_string(lint_line$message, lint_line$column_number, lint_line$ranges),
|
||||
"\n",
|
||||
collapse = "")
|
||||
}),
|
||||
collapse = "")
|
||||
}
|
||||
|
||||
if (num_issue > 0) {
|
||||
cat(sprintf('R linters found %d issues:\n', num_issue))
|
||||
for (entry in results) {
|
||||
if (length(entry$output)) {
|
||||
cat(paste0('**** ', crayon::bold(entry$r_file), '\n'))
|
||||
cat(paste0(lint2str(entry), collapse = ''))
|
||||
}
|
||||
}
|
||||
quit(save = 'no', status = 1) # Signal error to parent shell
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
library(testthat)
|
||||
library(xgboost)
|
||||
|
||||
test_check("xgboost")
|
||||
test_check("xgboost", reporter = ProgressReporter)
|
||||
|
||||
@@ -2,19 +2,19 @@ require(xgboost)
|
||||
|
||||
context("basic functions")
|
||||
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
train <- agaricus.train
|
||||
test <- agaricus.test
|
||||
set.seed(1994)
|
||||
|
||||
# disable some tests for Win32
|
||||
windows_flag = .Platform$OS.type == "windows" &&
|
||||
windows_flag <- .Platform$OS.type == "windows" &&
|
||||
.Machine$sizeof.pointer != 8
|
||||
solaris_flag = (Sys.info()['sysname'] == "SunOS")
|
||||
solaris_flag <- (Sys.info()['sysname'] == "SunOS")
|
||||
|
||||
test_that("train and predict binary classification", {
|
||||
nrounds = 2
|
||||
nrounds <- 2
|
||||
expect_output(
|
||||
bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
|
||||
eta = 1, nthread = 2, nrounds = nrounds, objective = "binary:logistic")
|
||||
@@ -30,24 +30,24 @@ test_that("train and predict binary classification", {
|
||||
|
||||
pred1 <- predict(bst, train$data, ntreelimit = 1)
|
||||
expect_length(pred1, 6513)
|
||||
err_pred1 <- sum((pred1 > 0.5) != train$label)/length(train$label)
|
||||
err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label)
|
||||
err_log <- bst$evaluation_log[1, train_error]
|
||||
expect_lt(abs(err_pred1 - err_log), 10e-6)
|
||||
})
|
||||
|
||||
test_that("parameter validation works", {
|
||||
p <- list(foo = "bar")
|
||||
nrounds = 1
|
||||
nrounds <- 1
|
||||
set.seed(1994)
|
||||
|
||||
d <- cbind(
|
||||
x1 = rnorm(10),
|
||||
x2 = rnorm(10),
|
||||
x3 = rnorm(10))
|
||||
y <- d[,"x1"] + d[,"x2"]^2 +
|
||||
ifelse(d[,"x3"] > .5, d[,"x3"]^2, 2^d[,"x3"]) +
|
||||
y <- d[, "x1"] + d[, "x2"]^2 +
|
||||
ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
|
||||
rnorm(10)
|
||||
dtrain <- xgb.DMatrix(data=d, info = list(label=y))
|
||||
dtrain <- xgb.DMatrix(data = d, info = list(label = y))
|
||||
|
||||
correct <- function() {
|
||||
params <- list(max_depth = 2, booster = "dart",
|
||||
@@ -70,15 +70,15 @@ test_that("parameter validation works", {
|
||||
|
||||
|
||||
test_that("dart prediction works", {
|
||||
nrounds = 32
|
||||
nrounds <- 32
|
||||
set.seed(1994)
|
||||
|
||||
d <- cbind(
|
||||
x1 = rnorm(100),
|
||||
x2 = rnorm(100),
|
||||
x3 = rnorm(100))
|
||||
y <- d[,"x1"] + d[,"x2"]^2 +
|
||||
ifelse(d[,"x3"] > .5, d[,"x3"]^2, 2^d[,"x3"]) +
|
||||
y <- d[, "x1"] + d[, "x2"]^2 +
|
||||
ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
|
||||
rnorm(100)
|
||||
|
||||
set.seed(1994)
|
||||
@@ -87,23 +87,23 @@ test_that("dart prediction works", {
|
||||
eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror")
|
||||
pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
|
||||
pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
|
||||
expect_true(all(matrix(pred_by_xgboost_0, byrow=TRUE) == matrix(pred_by_xgboost_1, byrow=TRUE)))
|
||||
expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
|
||||
|
||||
pred_by_xgboost_2 <- predict(booster_by_xgboost, newdata = d, training = TRUE)
|
||||
expect_false(all(matrix(pred_by_xgboost_0, byrow=TRUE) == matrix(pred_by_xgboost_2, byrow=TRUE)))
|
||||
expect_false(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE)))
|
||||
|
||||
set.seed(1994)
|
||||
dtrain <- xgb.DMatrix(data=d, info = list(label=y))
|
||||
booster_by_train <- xgb.train( params = list(
|
||||
booster = "dart",
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
rate_drop = 0.5,
|
||||
one_drop = TRUE,
|
||||
nthread = 1,
|
||||
tree_method= "exact",
|
||||
objective = "reg:squarederror"
|
||||
),
|
||||
dtrain <- xgb.DMatrix(data = d, info = list(label = y))
|
||||
booster_by_train <- xgb.train(params = list(
|
||||
booster = "dart",
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
rate_drop = 0.5,
|
||||
one_drop = TRUE,
|
||||
nthread = 1,
|
||||
tree_method = "exact",
|
||||
objective = "reg:squarederror"
|
||||
),
|
||||
data = dtrain,
|
||||
nrounds = nrounds
|
||||
)
|
||||
@@ -111,9 +111,9 @@ test_that("dart prediction works", {
|
||||
pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
|
||||
pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
|
||||
|
||||
expect_true(all(matrix(pred_by_train_0, byrow=TRUE) == matrix(pred_by_xgboost_0, byrow=TRUE)))
|
||||
expect_true(all(matrix(pred_by_train_1, byrow=TRUE) == matrix(pred_by_xgboost_1, byrow=TRUE)))
|
||||
expect_true(all(matrix(pred_by_train_2, byrow=TRUE) == matrix(pred_by_xgboost_2, byrow=TRUE)))
|
||||
expect_true(all(matrix(pred_by_train_0, byrow = TRUE) == matrix(pred_by_xgboost_0, byrow = TRUE)))
|
||||
expect_true(all(matrix(pred_by_train_1, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
|
||||
expect_true(all(matrix(pred_by_train_2, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE)))
|
||||
})
|
||||
|
||||
test_that("train and predict softprob", {
|
||||
@@ -122,7 +122,7 @@ test_that("train and predict softprob", {
|
||||
expect_output(
|
||||
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
|
||||
max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
|
||||
objective = "multi:softprob", num_class=3)
|
||||
objective = "multi:softprob", num_class = 3)
|
||||
, "train-merror")
|
||||
expect_false(is.null(bst$evaluation_log))
|
||||
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
|
||||
@@ -130,17 +130,17 @@ test_that("train and predict softprob", {
|
||||
pred <- predict(bst, as.matrix(iris[, -5]))
|
||||
expect_length(pred, nrow(iris) * 3)
|
||||
# row sums add up to total probability of 1:
|
||||
expect_equal(rowSums(matrix(pred, ncol=3, byrow=TRUE)), rep(1, nrow(iris)), tolerance = 1e-7)
|
||||
expect_equal(rowSums(matrix(pred, ncol = 3, byrow = TRUE)), rep(1, nrow(iris)), tolerance = 1e-7)
|
||||
# manually calculate error at the last iteration:
|
||||
mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE)
|
||||
expect_equal(as.numeric(t(mpred)), pred)
|
||||
pred_labels <- max.col(mpred) - 1
|
||||
err <- sum(pred_labels != lb)/length(lb)
|
||||
err <- sum(pred_labels != lb) / length(lb)
|
||||
expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6)
|
||||
# manually calculate error at the 1st iteration:
|
||||
mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 1)
|
||||
pred_labels <- max.col(mpred) - 1
|
||||
err <- sum(pred_labels != lb)/length(lb)
|
||||
err <- sum(pred_labels != lb) / length(lb)
|
||||
expect_equal(bst$evaluation_log[1, train_merror], err, tolerance = 5e-6)
|
||||
})
|
||||
|
||||
@@ -150,7 +150,7 @@ test_that("train and predict softmax", {
|
||||
expect_output(
|
||||
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
|
||||
max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
|
||||
objective = "multi:softmax", num_class=3)
|
||||
objective = "multi:softmax", num_class = 3)
|
||||
, "train-merror")
|
||||
expect_false(is.null(bst$evaluation_log))
|
||||
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
|
||||
@@ -158,7 +158,7 @@ test_that("train and predict softmax", {
|
||||
|
||||
pred <- predict(bst, as.matrix(iris[, -5]))
|
||||
expect_length(pred, nrow(iris))
|
||||
err <- sum(pred != lb)/length(lb)
|
||||
err <- sum(pred != lb) / length(lb)
|
||||
expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6)
|
||||
})
|
||||
|
||||
@@ -173,12 +173,12 @@ test_that("train and predict RF", {
|
||||
expect_equal(xgb.ntree(bst), 20)
|
||||
|
||||
pred <- predict(bst, train$data)
|
||||
pred_err <- sum((pred > 0.5) != lb)/length(lb)
|
||||
pred_err <- sum((pred > 0.5) != lb) / length(lb)
|
||||
expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6)
|
||||
#expect_lt(pred_err, 0.03)
|
||||
|
||||
pred <- predict(bst, train$data, ntreelimit = 20)
|
||||
pred_err_20 <- sum((pred > 0.5) != lb)/length(lb)
|
||||
pred_err_20 <- sum((pred > 0.5) != lb) / length(lb)
|
||||
expect_equal(pred_err_20, pred_err)
|
||||
|
||||
#pred <- predict(bst, train$data, ntreelimit = 1)
|
||||
@@ -193,19 +193,19 @@ test_that("train and predict RF with softprob", {
|
||||
set.seed(11)
|
||||
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
|
||||
max_depth = 3, eta = 0.9, nthread = 2, nrounds = nrounds,
|
||||
objective = "multi:softprob", num_class=3, verbose = 0,
|
||||
objective = "multi:softprob", num_class = 3, verbose = 0,
|
||||
num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5)
|
||||
expect_equal(bst$niter, 15)
|
||||
expect_equal(xgb.ntree(bst), 15*3*4)
|
||||
expect_equal(xgb.ntree(bst), 15 * 3 * 4)
|
||||
# predict for all iterations:
|
||||
pred <- predict(bst, as.matrix(iris[, -5]), reshape=TRUE)
|
||||
pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE)
|
||||
expect_equal(dim(pred), c(nrow(iris), 3))
|
||||
pred_labels <- max.col(pred) - 1
|
||||
err <- sum(pred_labels != lb)/length(lb)
|
||||
err <- sum(pred_labels != lb) / length(lb)
|
||||
expect_equal(bst$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6)
|
||||
# predict for 7 iterations and adjust for 4 parallel trees per iteration
|
||||
pred <- predict(bst, as.matrix(iris[, -5]), reshape=TRUE, ntreelimit = 7 * 4)
|
||||
err <- sum((max.col(pred) - 1) != lb)/length(lb)
|
||||
pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 7 * 4)
|
||||
err <- sum((max.col(pred) - 1) != lb) / length(lb)
|
||||
expect_equal(bst$evaluation_log[7, train_merror], err, tolerance = 5e-6)
|
||||
})
|
||||
|
||||
@@ -223,7 +223,7 @@ test_that("use of multiple eval metrics works", {
|
||||
|
||||
test_that("training continuation works", {
|
||||
dtrain <- xgb.DMatrix(train$data, label = train$label)
|
||||
watchlist = list(train=dtrain)
|
||||
watchlist <- list(train = dtrain)
|
||||
param <- list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2)
|
||||
|
||||
# for the reference, use 4 iterations at once:
|
||||
@@ -255,7 +255,7 @@ test_that("training continuation works", {
|
||||
test_that("model serialization works", {
|
||||
out_path <- "model_serialization"
|
||||
dtrain <- xgb.DMatrix(train$data, label = train$label)
|
||||
watchlist = list(train=dtrain)
|
||||
watchlist <- list(train = dtrain)
|
||||
param <- list(objective = "binary:logistic")
|
||||
booster <- xgb.train(param, dtrain, nrounds = 4, watchlist)
|
||||
raw <- xgb.serialize(booster)
|
||||
@@ -273,7 +273,7 @@ test_that("xgb.cv works", {
|
||||
expect_output(
|
||||
cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5,
|
||||
eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
|
||||
verbose=TRUE)
|
||||
verbose = TRUE)
|
||||
, "train-error:")
|
||||
expect_is(cv, 'xgb.cv.synchronous')
|
||||
expect_false(is.null(cv$evaluation_log))
|
||||
@@ -292,11 +292,11 @@ test_that("xgb.cv works with stratified folds", {
|
||||
set.seed(314159)
|
||||
cv <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5,
|
||||
eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
|
||||
verbose=TRUE, stratified = FALSE)
|
||||
verbose = TRUE, stratified = FALSE)
|
||||
set.seed(314159)
|
||||
cv2 <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5,
|
||||
eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
|
||||
verbose=TRUE, stratified = TRUE)
|
||||
verbose = TRUE, stratified = TRUE)
|
||||
# Stratified folds should result in a different evaluation logs
|
||||
expect_true(all(cv$evaluation_log[, test_error_mean] != cv2$evaluation_log[, test_error_mean]))
|
||||
})
|
||||
@@ -319,7 +319,7 @@ test_that("train and predict with non-strict classes", {
|
||||
expect_equal(pr0, pr)
|
||||
|
||||
# dense matrix-like input of non-matrix class with some inheritance
|
||||
class(train_dense) <- c('pphmatrix','shmatrix')
|
||||
class(train_dense) <- c('pphmatrix', 'shmatrix')
|
||||
expect_true(is.matrix(train_dense))
|
||||
expect_error(
|
||||
bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
|
||||
@@ -337,15 +337,15 @@ test_that("train and predict with non-strict classes", {
|
||||
test_that("max_delta_step works", {
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
watchlist <- list(train = dtrain)
|
||||
param <- list(objective = "binary:logistic", eval_metric="logloss", max_depth = 2, nthread = 2, eta = 0.5)
|
||||
nrounds = 5
|
||||
param <- list(objective = "binary:logistic", eval_metric = "logloss", max_depth = 2, nthread = 2, eta = 0.5)
|
||||
nrounds <- 5
|
||||
# model with no restriction on max_delta_step
|
||||
bst1 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1)
|
||||
# model with restricted max_delta_step
|
||||
bst2 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1, max_delta_step = 1)
|
||||
# the no-restriction model is expected to have consistently lower loss during the initial interations
|
||||
expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss))
|
||||
expect_lt(mean(bst1$evaluation_log$train_logloss)/mean(bst2$evaluation_log$train_logloss), 0.8)
|
||||
expect_lt(mean(bst1$evaluation_log$train_logloss) / mean(bst2$evaluation_log$train_logloss), 0.8)
|
||||
})
|
||||
|
||||
test_that("colsample_bytree works", {
|
||||
|
||||
@@ -5,8 +5,8 @@ require(data.table)
|
||||
|
||||
context("callbacks")
|
||||
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
train <- agaricus.train
|
||||
test <- agaricus.test
|
||||
|
||||
@@ -21,24 +21,24 @@ ltrain <- add.noise(train$label, 0.2)
|
||||
ltest <- add.noise(test$label, 0.2)
|
||||
dtrain <- xgb.DMatrix(train$data, label = ltrain)
|
||||
dtest <- xgb.DMatrix(test$data, label = ltest)
|
||||
watchlist = list(train=dtrain, test=dtest)
|
||||
watchlist <- list(train = dtrain, test = dtest)
|
||||
|
||||
|
||||
err <- function(label, pr) sum((pr > 0.5) != label)/length(label)
|
||||
err <- function(label, pr) sum((pr > 0.5) != label) / length(label)
|
||||
|
||||
param <- list(objective = "binary:logistic", max_depth = 2, nthread = 2)
|
||||
|
||||
|
||||
test_that("cb.print.evaluation works as expected", {
|
||||
|
||||
bst_evaluation <- c('train-auc'=0.9, 'test-auc'=0.8)
|
||||
bst_evaluation <- c('train-auc' = 0.9, 'test-auc' = 0.8)
|
||||
bst_evaluation_err <- NULL
|
||||
begin_iteration <- 1
|
||||
end_iteration <- 7
|
||||
|
||||
f0 <- cb.print.evaluation(period=0)
|
||||
f1 <- cb.print.evaluation(period=1)
|
||||
f5 <- cb.print.evaluation(period=5)
|
||||
f0 <- cb.print.evaluation(period = 0)
|
||||
f1 <- cb.print.evaluation(period = 1)
|
||||
f5 <- cb.print.evaluation(period = 5)
|
||||
|
||||
expect_false(is.null(attr(f1, 'call')))
|
||||
expect_equal(attr(f1, 'name'), 'cb.print.evaluation')
|
||||
@@ -57,13 +57,13 @@ test_that("cb.print.evaluation works as expected", {
|
||||
expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
|
||||
expect_output(f5(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
|
||||
|
||||
bst_evaluation_err <- c('train-auc'=0.1, 'test-auc'=0.2)
|
||||
bst_evaluation_err <- c('train-auc' = 0.1, 'test-auc' = 0.2)
|
||||
expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\\+0.100000\ttest-auc:0.800000\\+0.200000")
|
||||
})
|
||||
|
||||
test_that("cb.evaluation.log works as expected", {
|
||||
|
||||
bst_evaluation <- c('train-auc'=0.9, 'test-auc'=0.8)
|
||||
bst_evaluation <- c('train-auc' = 0.9, 'test-auc' = 0.8)
|
||||
bst_evaluation_err <- NULL
|
||||
|
||||
evaluation_log <- list()
|
||||
@@ -75,33 +75,33 @@ test_that("cb.evaluation.log works as expected", {
|
||||
iteration <- 1
|
||||
expect_silent(f())
|
||||
expect_equal(evaluation_log,
|
||||
list(c(iter=1, bst_evaluation)))
|
||||
list(c(iter = 1, bst_evaluation)))
|
||||
iteration <- 2
|
||||
expect_silent(f())
|
||||
expect_equal(evaluation_log,
|
||||
list(c(iter=1, bst_evaluation), c(iter=2, bst_evaluation)))
|
||||
list(c(iter = 1, bst_evaluation), c(iter = 2, bst_evaluation)))
|
||||
expect_silent(f(finalize = TRUE))
|
||||
expect_equal(evaluation_log,
|
||||
data.table(iter=1:2, train_auc=c(0.9,0.9), test_auc=c(0.8,0.8)))
|
||||
data.table(iter = 1:2, train_auc = c(0.9, 0.9), test_auc = c(0.8, 0.8)))
|
||||
|
||||
bst_evaluation_err <- c('train-auc'=0.1, 'test-auc'=0.2)
|
||||
bst_evaluation_err <- c('train-auc' = 0.1, 'test-auc' = 0.2)
|
||||
evaluation_log <- list()
|
||||
f <- cb.evaluation.log()
|
||||
|
||||
iteration <- 1
|
||||
expect_silent(f())
|
||||
expect_equal(evaluation_log,
|
||||
list(c(iter=1, c(bst_evaluation, bst_evaluation_err))))
|
||||
list(c(iter = 1, c(bst_evaluation, bst_evaluation_err))))
|
||||
iteration <- 2
|
||||
expect_silent(f())
|
||||
expect_equal(evaluation_log,
|
||||
list(c(iter=1, c(bst_evaluation, bst_evaluation_err)),
|
||||
c(iter=2, c(bst_evaluation, bst_evaluation_err))))
|
||||
list(c(iter = 1, c(bst_evaluation, bst_evaluation_err)),
|
||||
c(iter = 2, c(bst_evaluation, bst_evaluation_err))))
|
||||
expect_silent(f(finalize = TRUE))
|
||||
expect_equal(evaluation_log,
|
||||
data.table(iter=1:2,
|
||||
train_auc_mean=c(0.9,0.9), train_auc_std=c(0.1,0.1),
|
||||
test_auc_mean=c(0.8,0.8), test_auc_std=c(0.2,0.2)))
|
||||
data.table(iter = 1:2,
|
||||
train_auc_mean = c(0.9, 0.9), train_auc_std = c(0.1, 0.1),
|
||||
test_auc_mean = c(0.8, 0.8), test_auc_std = c(0.2, 0.2)))
|
||||
})
|
||||
|
||||
|
||||
@@ -237,7 +237,7 @@ test_that("early stopping using a specific metric works", {
|
||||
set.seed(11)
|
||||
expect_output(
|
||||
bst <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.6,
|
||||
eval_metric="logloss", eval_metric="auc",
|
||||
eval_metric = "logloss", eval_metric = "auc",
|
||||
callbacks = list(cb.early.stop(stopping_rounds = 3, maximize = FALSE,
|
||||
metric_name = 'test_logloss')))
|
||||
, "Stopping. Best iteration")
|
||||
@@ -267,12 +267,12 @@ test_that("early stopping xgb.cv works", {
|
||||
|
||||
test_that("prediction in xgb.cv works", {
|
||||
set.seed(11)
|
||||
nrounds = 4
|
||||
nrounds <- 4
|
||||
cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0)
|
||||
expect_false(is.null(cv$evaluation_log))
|
||||
expect_false(is.null(cv$pred))
|
||||
expect_length(cv$pred, nrow(train$data))
|
||||
err_pred <- mean( sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f]))) )
|
||||
err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f]))))
|
||||
err_log <- cv$evaluation_log[nrounds, test_error_mean]
|
||||
expect_equal(err_pred, err_log, tolerance = 1e-6)
|
||||
|
||||
@@ -308,7 +308,7 @@ test_that("prediction in early-stopping xgb.cv works", {
|
||||
expect_false(is.null(cv$pred))
|
||||
expect_length(cv$pred, nrow(train$data))
|
||||
|
||||
err_pred <- mean( sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f]))) )
|
||||
err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f]))))
|
||||
err_log <- cv$evaluation_log[cv$best_iteration, test_error_mean]
|
||||
expect_equal(err_pred, err_log, tolerance = 1e-6)
|
||||
err_log_last <- cv$evaluation_log[cv$niter, test_error_mean]
|
||||
|
||||
@@ -4,8 +4,8 @@ require(xgboost)
|
||||
|
||||
set.seed(1994)
|
||||
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
watchlist <- list(eval = dtest, train = dtrain)
|
||||
@@ -20,12 +20,12 @@ logregobj <- function(preds, dtrain) {
|
||||
|
||||
evalerror <- function(preds, dtrain) {
|
||||
labels <- getinfo(dtrain, "label")
|
||||
err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
|
||||
err <- as.numeric(sum(labels != (preds > 0.5))) / length(labels)
|
||||
return(list(metric = "error", value = err))
|
||||
}
|
||||
|
||||
param <- list(max_depth=2, eta=1, nthread = 2,
|
||||
objective=logregobj, eval_metric=evalerror)
|
||||
param <- list(max_depth = 2, eta = 1, nthread = 2,
|
||||
objective = logregobj, eval_metric = evalerror)
|
||||
num_round <- 2
|
||||
|
||||
test_that("custom objective works", {
|
||||
@@ -37,12 +37,19 @@ test_that("custom objective works", {
|
||||
})
|
||||
|
||||
test_that("custom objective in CV works", {
|
||||
cv <- xgb.cv(param, dtrain, num_round, nfold=10, verbose=FALSE)
|
||||
cv <- xgb.cv(param, dtrain, num_round, nfold = 10, verbose = FALSE)
|
||||
expect_false(is.null(cv$evaluation_log))
|
||||
expect_equal(dim(cv$evaluation_log), c(2, 5))
|
||||
expect_lt(cv$evaluation_log[num_round, test_error_mean], 0.03)
|
||||
})
|
||||
|
||||
test_that("custom objective with early stop works", {
|
||||
bst <- xgb.train(param, dtrain, 10, watchlist)
|
||||
expect_equal(class(bst), "xgb.Booster")
|
||||
train_log <- bst$evaluation_log$train_error
|
||||
expect_true(all(diff(train_log)) <= 0)
|
||||
})
|
||||
|
||||
test_that("custom objective using DMatrix attr works", {
|
||||
|
||||
attr(dtrain, 'label') <- getinfo(dtrain, 'label')
|
||||
@@ -54,14 +61,14 @@ test_that("custom objective using DMatrix attr works", {
|
||||
hess <- preds * (1 - preds)
|
||||
return(list(grad = grad, hess = hess))
|
||||
}
|
||||
param$objective = logregobjattr
|
||||
param$objective <- logregobjattr
|
||||
bst <- xgb.train(param, dtrain, num_round, watchlist)
|
||||
expect_equal(class(bst), "xgb.Booster")
|
||||
})
|
||||
|
||||
test_that("custom objective with multi-class works", {
|
||||
data = as.matrix(iris[, -5])
|
||||
label = as.numeric(iris$Species) - 1
|
||||
data <- as.matrix(iris[, -5])
|
||||
label <- as.numeric(iris$Species) - 1
|
||||
dtrain <- xgb.DMatrix(data = data, label = label)
|
||||
nclasses <- 3
|
||||
|
||||
@@ -72,6 +79,10 @@ test_that("custom objective with multi-class works", {
|
||||
hess <- rnorm(dim(as.matrix(preds))[1])
|
||||
return (list(grad = grad, hess = hess))
|
||||
}
|
||||
param$objective = fake_softprob
|
||||
bst <- xgb.train(param, dtrain, 1, num_class=nclasses)
|
||||
fake_merror <- function(preds, dtrain) {
|
||||
expect_equal(dim(data)[1] * nclasses, dim(as.matrix(preds))[1])
|
||||
}
|
||||
param$objective <- fake_softprob
|
||||
param$eval_metric <- fake_merror
|
||||
bst <- xgb.train(param, dtrain, 1, num_class = nclasses)
|
||||
})
|
||||
|
||||
@@ -3,29 +3,29 @@ require(Matrix)
|
||||
|
||||
context("testing xgb.DMatrix functionality")
|
||||
|
||||
data(agaricus.test, package='xgboost')
|
||||
test_data <- agaricus.test$data[1:100,]
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
test_data <- agaricus.test$data[1:100, ]
|
||||
test_label <- agaricus.test$label[1:100]
|
||||
|
||||
test_that("xgb.DMatrix: basic construction", {
|
||||
# from sparse matrix
|
||||
dtest1 <- xgb.DMatrix(test_data, label=test_label)
|
||||
dtest1 <- xgb.DMatrix(test_data, label = test_label)
|
||||
|
||||
# from dense matrix
|
||||
dtest2 <- xgb.DMatrix(as.matrix(test_data), label=test_label)
|
||||
dtest2 <- xgb.DMatrix(as.matrix(test_data), label = test_label)
|
||||
expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label'))
|
||||
expect_equal(dim(dtest1), dim(dtest2))
|
||||
|
||||
#from dense integer matrix
|
||||
int_data <- as.matrix(test_data)
|
||||
storage.mode(int_data) <- "integer"
|
||||
dtest3 <- xgb.DMatrix(int_data, label=test_label)
|
||||
dtest3 <- xgb.DMatrix(int_data, label = test_label)
|
||||
expect_equal(dim(dtest1), dim(dtest3))
|
||||
})
|
||||
|
||||
test_that("xgb.DMatrix: saving, loading", {
|
||||
# save to a local file
|
||||
dtest1 <- xgb.DMatrix(test_data, label=test_label)
|
||||
dtest1 <- xgb.DMatrix(test_data, label = test_label)
|
||||
tmp_file <- tempfile('xgb.DMatrix_')
|
||||
expect_true(xgb.DMatrix.save(dtest1, tmp_file))
|
||||
# read from a local file
|
||||
@@ -35,12 +35,12 @@ test_that("xgb.DMatrix: saving, loading", {
|
||||
expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label'))
|
||||
|
||||
# from a libsvm text file
|
||||
tmp <- c("0 1:1 2:1","1 3:1","0 1:1")
|
||||
tmp <- c("0 1:1 2:1", "1 3:1", "0 1:1")
|
||||
tmp_file <- 'tmp.libsvm'
|
||||
writeLines(tmp, tmp_file)
|
||||
dtest4 <- xgb.DMatrix(tmp_file, silent = TRUE)
|
||||
expect_equal(dim(dtest4), c(3, 4))
|
||||
expect_equal(getinfo(dtest4, 'label'), c(0,1,0))
|
||||
expect_equal(getinfo(dtest4, 'label'), c(0, 1, 0))
|
||||
unlink(tmp_file)
|
||||
})
|
||||
|
||||
@@ -61,7 +61,7 @@ test_that("xgb.DMatrix: getinfo & setinfo", {
|
||||
|
||||
expect_true(setinfo(dtest, 'weight', test_label))
|
||||
expect_true(setinfo(dtest, 'base_margin', test_label))
|
||||
expect_true(setinfo(dtest, 'group', c(50,50)))
|
||||
expect_true(setinfo(dtest, 'group', c(50, 50)))
|
||||
expect_error(setinfo(dtest, 'group', test_label))
|
||||
|
||||
# providing character values will give a warning
|
||||
@@ -72,35 +72,35 @@ test_that("xgb.DMatrix: getinfo & setinfo", {
|
||||
})
|
||||
|
||||
test_that("xgb.DMatrix: slice, dim", {
|
||||
dtest <- xgb.DMatrix(test_data, label=test_label)
|
||||
dtest <- xgb.DMatrix(test_data, label = test_label)
|
||||
expect_equal(dim(dtest), dim(test_data))
|
||||
dsub1 <- slice(dtest, 1:42)
|
||||
expect_equal(nrow(dsub1), 42)
|
||||
expect_equal(ncol(dsub1), ncol(test_data))
|
||||
|
||||
dsub2 <- dtest[1:42,]
|
||||
dsub2 <- dtest[1:42, ]
|
||||
expect_equal(dim(dtest), dim(test_data))
|
||||
expect_equal(getinfo(dsub1, 'label'), getinfo(dsub2, 'label'))
|
||||
})
|
||||
|
||||
test_that("xgb.DMatrix: slice, trailing empty rows", {
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
train_data <- agaricus.train$data
|
||||
train_label <- agaricus.train$label
|
||||
dtrain <- xgb.DMatrix(data=train_data, label=train_label)
|
||||
dtrain <- xgb.DMatrix(data = train_data, label = train_label)
|
||||
slice(dtrain, 6513L)
|
||||
train_data[6513, ] <- 0
|
||||
dtrain <- xgb.DMatrix(data=train_data, label=train_label)
|
||||
dtrain <- xgb.DMatrix(data = train_data, label = train_label)
|
||||
slice(dtrain, 6513L)
|
||||
expect_equal(nrow(dtrain), 6513)
|
||||
})
|
||||
|
||||
test_that("xgb.DMatrix: colnames", {
|
||||
dtest <- xgb.DMatrix(test_data, label=test_label)
|
||||
dtest <- xgb.DMatrix(test_data, label = test_label)
|
||||
expect_equal(colnames(dtest), colnames(test_data))
|
||||
expect_error( colnames(dtest) <- 'asdf')
|
||||
expect_error(colnames(dtest) <- 'asdf')
|
||||
new_names <- make.names(1:ncol(test_data))
|
||||
expect_silent( colnames(dtest) <- new_names)
|
||||
expect_silent(colnames(dtest) <- new_names)
|
||||
expect_equal(colnames(dtest), new_names)
|
||||
expect_silent(colnames(dtest) <- NULL)
|
||||
expect_null(colnames(dtest))
|
||||
@@ -109,7 +109,7 @@ test_that("xgb.DMatrix: colnames", {
|
||||
test_that("xgb.DMatrix: nrow is correct for a very sparse matrix", {
|
||||
set.seed(123)
|
||||
nr <- 1000
|
||||
x <- rsparsematrix(nr, 100, density=0.0005)
|
||||
x <- rsparsematrix(nr, 100, density = 0.0005)
|
||||
# we want it very sparse, so that last rows are empty
|
||||
expect_lt(max(x@i), nr)
|
||||
dtest <- xgb.DMatrix(x)
|
||||
|
||||
@@ -3,8 +3,8 @@ require(xgboost)
|
||||
context("Garbage Collection Safety Check")
|
||||
|
||||
test_that("train and prediction when gctorture is on", {
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
train <- agaricus.train
|
||||
test <- agaricus.test
|
||||
gctorture(TRUE)
|
||||
|
||||
@@ -3,8 +3,8 @@ context('Test generalized linear models')
|
||||
require(xgboost)
|
||||
|
||||
test_that("gblinear works", {
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
@@ -16,7 +16,7 @@ test_that("gblinear works", {
|
||||
ERR_UL <- 0.005 # upper limit for the test set error
|
||||
VERB <- 0 # chatterbox switch
|
||||
|
||||
param$updater = 'shotgun'
|
||||
param$updater <- 'shotgun'
|
||||
bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle')
|
||||
ypred <- predict(bst, dtest)
|
||||
expect_equal(length(getinfo(dtest, 'label')), 1611)
|
||||
@@ -29,7 +29,7 @@ test_that("gblinear works", {
|
||||
expect_equal(dim(h), c(n, ncol(dtrain) + 1))
|
||||
expect_is(h, "matrix")
|
||||
|
||||
param$updater = 'coord_descent'
|
||||
param$updater <- 'coord_descent'
|
||||
bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic')
|
||||
expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
|
||||
|
||||
|
||||
@@ -5,18 +5,18 @@ require(data.table)
|
||||
require(Matrix)
|
||||
require(vcd, quietly = TRUE)
|
||||
|
||||
float_tolerance = 5e-6
|
||||
float_tolerance <- 5e-6
|
||||
|
||||
# disable some tests for 32-bit environment
|
||||
flag_32bit = .Machine$sizeof.pointer != 8
|
||||
flag_32bit <- .Machine$sizeof.pointer != 8
|
||||
|
||||
set.seed(1982)
|
||||
data(Arthritis)
|
||||
df <- data.table(Arthritis, keep.rownames = F)
|
||||
df[,AgeDiscret := as.factor(round(Age / 10,0))]
|
||||
df[,AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
|
||||
df[,ID := NULL]
|
||||
sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
|
||||
df <- data.table(Arthritis, keep.rownames = FALSE)
|
||||
df[, AgeDiscret := as.factor(round(Age / 10, 0))]
|
||||
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
|
||||
df[, ID := NULL]
|
||||
sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) # nolint
|
||||
label <- df[, ifelse(Improved == "Marked", 1, 0)]
|
||||
|
||||
# binary
|
||||
@@ -46,8 +46,8 @@ mbst.GLM <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
|
||||
test_that("xgb.dump works", {
|
||||
if (!flag_32bit)
|
||||
expect_length(xgb.dump(bst.Tree), 200)
|
||||
dump_file = file.path(tempdir(), 'xgb.model.dump')
|
||||
expect_true(xgb.dump(bst.Tree, dump_file, with_stats = T))
|
||||
dump_file <- file.path(tempdir(), 'xgb.model.dump')
|
||||
expect_true(xgb.dump(bst.Tree, dump_file, with_stats = TRUE))
|
||||
expect_true(file.exists(dump_file))
|
||||
expect_gt(file.size(dump_file), 8000)
|
||||
|
||||
@@ -63,7 +63,7 @@ test_that("xgb.dump works for gblinear", {
|
||||
# also make sure that it works properly for a sparse model where some coefficients
|
||||
# are 0 from setting large L1 regularization:
|
||||
bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1,
|
||||
alpha=2, objective = "binary:logistic", booster = "gblinear")
|
||||
alpha = 2, objective = "binary:logistic", booster = "gblinear")
|
||||
d.sp <- xgb.dump(bst.GLM.sp)
|
||||
expect_length(d.sp, 14)
|
||||
expect_gt(sum(d.sp == "0"), 0)
|
||||
@@ -110,9 +110,9 @@ test_that("predict feature contributions works", {
|
||||
pred <- predict(bst.GLM, sparse_matrix, outputmargin = TRUE)
|
||||
expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
|
||||
# manual calculation of linear terms
|
||||
coefs <- xgb.dump(bst.GLM)[-c(1,2,4)] %>% as.numeric
|
||||
coefs <- xgb.dump(bst.GLM)[-c(1, 2, 4)] %>% as.numeric
|
||||
coefs <- c(coefs[-1], coefs[1]) # intercept must be the last
|
||||
pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN="*")
|
||||
pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN = "*")
|
||||
expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual),
|
||||
tolerance = float_tolerance)
|
||||
|
||||
@@ -130,13 +130,13 @@ test_that("predict feature contributions works", {
|
||||
pred <- predict(mbst.GLM, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
|
||||
pred_contr <- predict(mbst.GLM, as.matrix(iris[, -5]), predcontrib = TRUE)
|
||||
expect_length(pred_contr, 3)
|
||||
coefs_all <- xgb.dump(mbst.GLM)[-c(1,2,6)] %>% as.numeric %>% matrix(ncol = 3, byrow = TRUE)
|
||||
coefs_all <- xgb.dump(mbst.GLM)[-c(1, 2, 6)] %>% as.numeric %>% matrix(ncol = 3, byrow = TRUE)
|
||||
for (g in seq_along(pred_contr)) {
|
||||
expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
|
||||
expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), float_tolerance)
|
||||
# manual calculation of linear terms
|
||||
coefs <- c(coefs_all[-1, g], coefs_all[1, g]) # intercept needs to be the last
|
||||
pred_contr_manual <- sweep(as.matrix(cbind(iris[,-5], 1)), 2, coefs, FUN="*")
|
||||
pred_contr_manual <- sweep(as.matrix(cbind(iris[, -5], 1)), 2, coefs, FUN = "*")
|
||||
expect_equal(as.numeric(pred_contr[[g]]), as.numeric(pred_contr_manual),
|
||||
tolerance = float_tolerance)
|
||||
}
|
||||
@@ -147,8 +147,8 @@ test_that("SHAPs sum to predictions, with or without DART", {
|
||||
x1 = rnorm(100),
|
||||
x2 = rnorm(100),
|
||||
x3 = rnorm(100))
|
||||
y <- d[,"x1"] + d[,"x2"]^2 +
|
||||
ifelse(d[,"x3"] > .5, d[,"x3"]^2, 2^d[,"x3"]) +
|
||||
y <- d[, "x1"] + d[, "x2"]^2 +
|
||||
ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
|
||||
rnorm(100)
|
||||
nrounds <- 30
|
||||
|
||||
@@ -160,7 +160,7 @@ test_that("SHAPs sum to predictions, with or without DART", {
|
||||
objective = "reg:squarederror",
|
||||
eval_metric = "rmse"),
|
||||
if (booster == "dart")
|
||||
list(rate_drop = .01, one_drop = T)),
|
||||
list(rate_drop = .01, one_drop = TRUE)),
|
||||
data = d,
|
||||
label = y,
|
||||
nrounds = nrounds)
|
||||
@@ -168,21 +168,21 @@ test_that("SHAPs sum to predictions, with or without DART", {
|
||||
pr <- function(...)
|
||||
predict(fit, newdata = d, ...)
|
||||
pred <- pr()
|
||||
shap <- pr(predcontrib = T)
|
||||
shapi <- pr(predinteraction = T)
|
||||
tol = 1e-5
|
||||
shap <- pr(predcontrib = TRUE)
|
||||
shapi <- pr(predinteraction = TRUE)
|
||||
tol <- 1e-5
|
||||
|
||||
expect_equal(rowSums(shap), pred, tol = tol)
|
||||
expect_equal(apply(shapi, 1, sum), pred, tol = tol)
|
||||
for (i in 1 : nrow(d))
|
||||
for (f in list(rowSums, colSums))
|
||||
expect_equal(f(shapi[i,,]), shap[i,], tol = tol)
|
||||
expect_equal(f(shapi[i, , ]), shap[i, ], tol = tol)
|
||||
}
|
||||
})
|
||||
|
||||
test_that("xgb-attribute functionality", {
|
||||
val <- "my attribute value"
|
||||
list.val <- list(my_attr=val, a=123, b='ok')
|
||||
list.val <- list(my_attr = val, a = 123, b = 'ok')
|
||||
list.ch <- list.val[order(names(list.val))]
|
||||
list.ch <- lapply(list.ch, as.character)
|
||||
# note: iter is 0-index in xgb attributes
|
||||
@@ -208,9 +208,9 @@ test_that("xgb-attribute functionality", {
|
||||
xgb.attr(bst, "my_attr") <- NULL
|
||||
expect_null(xgb.attr(bst, "my_attr"))
|
||||
expect_equal(xgb.attributes(bst), list.ch[c("a", "b", "niter")])
|
||||
xgb.attributes(bst) <- list(a=NULL, b=NULL)
|
||||
xgb.attributes(bst) <- list(a = NULL, b = NULL)
|
||||
expect_equal(xgb.attributes(bst), list.default)
|
||||
xgb.attributes(bst) <- list(niter=NULL)
|
||||
xgb.attributes(bst) <- list(niter = NULL)
|
||||
expect_null(xgb.attributes(bst))
|
||||
})
|
||||
|
||||
@@ -268,7 +268,7 @@ test_that("xgb.model.dt.tree works with and without feature names", {
|
||||
bst.Tree.x$feature_names <- NULL
|
||||
dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x)
|
||||
expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
|
||||
expect_equal(dt.tree[, -4, with=FALSE], dt.tree.x[, -4, with=FALSE])
|
||||
expect_equal(dt.tree[, -4, with = FALSE], dt.tree.x[, -4, with = FALSE])
|
||||
|
||||
# using integer node ID instead of character
|
||||
dt.tree.int <- xgb.model.dt.tree(model = bst.Tree, use_int_id = TRUE)
|
||||
@@ -295,7 +295,7 @@ test_that("xgb.importance works with and without feature names", {
|
||||
bst.Tree.x <- bst.Tree
|
||||
bst.Tree.x$feature_names <- NULL
|
||||
importance.Tree.x <- xgb.importance(model = bst.Tree)
|
||||
expect_equal(importance.Tree[, -1, with=FALSE], importance.Tree.x[, -1, with=FALSE],
|
||||
expect_equal(importance.Tree[, -1, with = FALSE], importance.Tree.x[, -1, with = FALSE],
|
||||
tolerance = float_tolerance)
|
||||
|
||||
imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree)
|
||||
@@ -305,7 +305,7 @@ test_that("xgb.importance works with and without feature names", {
|
||||
# for multiclass
|
||||
imp.Tree <- xgb.importance(model = mbst.Tree)
|
||||
expect_equal(dim(imp.Tree), c(4, 4))
|
||||
xgb.importance(model = mbst.Tree, trees = seq(from=0, by=nclass, length.out=nrounds))
|
||||
xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds))
|
||||
})
|
||||
|
||||
test_that("xgb.importance works with GLM model", {
|
||||
@@ -320,7 +320,7 @@ test_that("xgb.importance works with GLM model", {
|
||||
# for multiclass
|
||||
imp.GLM <- xgb.importance(model = mbst.GLM)
|
||||
expect_equal(dim(imp.GLM), c(12, 3))
|
||||
expect_equal(imp.GLM$Class, rep(0:2, each=4))
|
||||
expect_equal(imp.GLM$Class, rep(0:2, each = 4))
|
||||
})
|
||||
|
||||
test_that("xgb.model.dt.tree and xgb.importance work with a single split model", {
|
||||
|
||||
@@ -5,20 +5,20 @@ context("interaction constraints")
|
||||
set.seed(1024)
|
||||
x1 <- rnorm(1000, 1)
|
||||
x2 <- rnorm(1000, 1)
|
||||
x3 <- sample(c(1,2,3), size=1000, replace=TRUE)
|
||||
y <- x1 + x2 + x3 + x1*x2*x3 + rnorm(1000, 0.001) + 3*sin(x1)
|
||||
train <- matrix(c(x1,x2,x3), ncol = 3)
|
||||
x3 <- sample(c(1, 2, 3), size = 1000, replace = TRUE)
|
||||
y <- x1 + x2 + x3 + x1 * x2 * x3 + rnorm(1000, 0.001) + 3 * sin(x1)
|
||||
train <- matrix(c(x1, x2, x3), ncol = 3)
|
||||
|
||||
test_that("interaction constraints for regression", {
|
||||
# Fit a model that only allows interaction between x1 and x2
|
||||
bst <- xgboost(data = train, label = y, max_depth = 3,
|
||||
eta = 0.1, nthread = 2, nrounds = 100, verbose = 0,
|
||||
interaction_constraints = list(c(0,1)))
|
||||
interaction_constraints = list(c(0, 1)))
|
||||
|
||||
# Set all observations to have the same x3 values then increment
|
||||
# by the same amount
|
||||
preds <- lapply(c(1,2,3), function(x){
|
||||
tmat <- matrix(c(x1,x2,rep(x,1000)), ncol=3)
|
||||
preds <- lapply(c(1, 2, 3), function(x){
|
||||
tmat <- matrix(c(x1, x2, rep(x, 1000)), ncol = 3)
|
||||
return(predict(bst, tmat))
|
||||
})
|
||||
|
||||
@@ -40,16 +40,16 @@ test_that("interaction constraints scientific representation", {
|
||||
rows <- 10
|
||||
## When number exceeds 1e5, R paste function uses scientific representation.
|
||||
## See: https://github.com/dmlc/xgboost/issues/5179
|
||||
cols <- 1e5+10
|
||||
cols <- 1e5 + 10
|
||||
|
||||
d <- matrix(rexp(rows, rate=.1), nrow=rows, ncol=cols)
|
||||
d <- matrix(rexp(rows, rate = .1), nrow = rows, ncol = cols)
|
||||
y <- rnorm(rows)
|
||||
|
||||
dtrain <- xgb.DMatrix(data=d, info = list(label=y))
|
||||
dtrain <- xgb.DMatrix(data = d, info = list(label = y))
|
||||
inc <- list(c(seq.int(from = 0, to = cols, by = 1)))
|
||||
|
||||
with_inc <- xgb.train(data=dtrain, tree_method='hist',
|
||||
interaction_constraints=inc, nrounds=10)
|
||||
without_inc <- xgb.train(data=dtrain, tree_method='hist', nrounds=10)
|
||||
with_inc <- xgb.train(data = dtrain, tree_method = 'hist',
|
||||
interaction_constraints = inc, nrounds = 10)
|
||||
without_inc <- xgb.train(data = dtrain, tree_method = 'hist', nrounds = 10)
|
||||
expect_equal(xgb.save.raw(with_inc), xgb.save.raw(without_inc))
|
||||
})
|
||||
|
||||
@@ -9,9 +9,9 @@ test_that("predict feature interactions works", {
|
||||
# simulate some binary data and a linear outcome with an interaction term
|
||||
N <- 1000
|
||||
P <- 5
|
||||
X <- matrix(rbinom(N * P, 1, 0.5), ncol=P, dimnames = list(NULL, letters[1:P]))
|
||||
X <- matrix(rbinom(N * P, 1, 0.5), ncol = P, dimnames = list(NULL, letters[1:P]))
|
||||
# center the data (as contributions are computed WRT feature means)
|
||||
X <- scale(X, scale=FALSE)
|
||||
X <- scale(X, scale = FALSE)
|
||||
|
||||
# outcome without any interactions, without any noise:
|
||||
f <- function(x) 2 * x[, 1] - 3 * x[, 2]
|
||||
@@ -23,14 +23,14 @@ test_that("predict feature interactions works", {
|
||||
y <- f_int(X)
|
||||
|
||||
dm <- xgb.DMatrix(X, label = y)
|
||||
param <- list(eta=0.1, max_depth=4, base_score=mean(y), lambda=0, nthread=2)
|
||||
param <- list(eta = 0.1, max_depth = 4, base_score = mean(y), lambda = 0, nthread = 2)
|
||||
b <- xgb.train(param, dm, 100)
|
||||
|
||||
pred = predict(b, dm, outputmargin=TRUE)
|
||||
pred <- predict(b, dm, outputmargin = TRUE)
|
||||
|
||||
# SHAP contributions:
|
||||
cont <- predict(b, dm, predcontrib=TRUE)
|
||||
expect_equal(dim(cont), c(N, P+1))
|
||||
cont <- predict(b, dm, predcontrib = TRUE)
|
||||
expect_equal(dim(cont), c(N, P + 1))
|
||||
# make sure for each row they add up to marginal predictions
|
||||
max(abs(rowSums(cont) - pred)) %>% expect_lt(0.001)
|
||||
# Hand-construct the 'ground truth' feature contributions:
|
||||
@@ -39,43 +39,43 @@ test_that("predict feature interactions works", {
|
||||
-3. * X[, 2] + 1. * X[, 2] * X[, 3], # attribute a HALF of the interaction term to feature #2
|
||||
1. * X[, 2] * X[, 3] # and another HALF of the interaction term to feature #3
|
||||
)
|
||||
gt_cont <- cbind(gt_cont, matrix(0, nrow=N, ncol=P + 1 - 3))
|
||||
gt_cont <- cbind(gt_cont, matrix(0, nrow = N, ncol = P + 1 - 3))
|
||||
# These should be relatively close:
|
||||
expect_lt(max(abs(cont - gt_cont)), 0.05)
|
||||
|
||||
|
||||
# SHAP interaction contributions:
|
||||
intr <- predict(b, dm, predinteraction=TRUE)
|
||||
expect_equal(dim(intr), c(N, P+1, P+1))
|
||||
intr <- predict(b, dm, predinteraction = TRUE)
|
||||
expect_equal(dim(intr), c(N, P + 1, P + 1))
|
||||
# check assigned colnames
|
||||
cn <- c(letters[1:P], "BIAS")
|
||||
expect_equal(dimnames(intr), list(NULL, cn, cn))
|
||||
|
||||
# check the symmetry
|
||||
max(abs(aperm(intr, c(1,3,2)) - intr)) %>% expect_lt(0.00001)
|
||||
max(abs(aperm(intr, c(1, 3, 2)) - intr)) %>% expect_lt(0.00001)
|
||||
|
||||
# sums WRT columns must be close to feature contributions
|
||||
max(abs(apply(intr, c(1,2), sum) - cont)) %>% expect_lt(0.00001)
|
||||
max(abs(apply(intr, c(1, 2), sum) - cont)) %>% expect_lt(0.00001)
|
||||
|
||||
# diagonal terms for features 3,4,5 must be close to zero
|
||||
Reduce(max, sapply(3:P, function(i) max(abs(intr[, i, i])))) %>% expect_lt(0.05)
|
||||
|
||||
# BIAS must have no interactions
|
||||
max(abs(intr[, 1:P, P+1])) %>% expect_lt(0.00001)
|
||||
max(abs(intr[, 1:P, P + 1])) %>% expect_lt(0.00001)
|
||||
|
||||
# interactions other than 2 x 3 must be close to zero
|
||||
intr23 <- intr
|
||||
intr23[,2,3] <- 0
|
||||
Reduce(max, sapply(1:P, function(i) max(abs(intr23[, i, (i+1):(P+1)])))) %>% expect_lt(0.05)
|
||||
intr23[, 2, 3] <- 0
|
||||
Reduce(max, sapply(1:P, function(i) max(abs(intr23[, i, (i + 1):(P + 1)])))) %>% expect_lt(0.05)
|
||||
|
||||
# Construct the 'ground truth' contributions of interactions directly from the linear terms:
|
||||
gt_intr <- array(0, c(N, P+1, P+1))
|
||||
gt_intr[,2,3] <- 1. * X[, 2] * X[, 3] # attribute a HALF of the interaction term to each symmetric element
|
||||
gt_intr[,3,2] <- gt_intr[, 2, 3]
|
||||
gt_intr <- array(0, c(N, P + 1, P + 1))
|
||||
gt_intr[, 2, 3] <- 1. * X[, 2] * X[, 3] # attribute a HALF of the interaction term to each symmetric element
|
||||
gt_intr[, 3, 2] <- gt_intr[, 2, 3]
|
||||
# merge-in the diagonal based on 'ground truth' feature contributions
|
||||
intr_diag = gt_cont - apply(gt_intr, c(1,2), sum)
|
||||
for(j in seq_len(P)) {
|
||||
gt_intr[,j,j] = intr_diag[,j]
|
||||
intr_diag <- gt_cont - apply(gt_intr, c(1, 2), sum)
|
||||
for (j in seq_len(P)) {
|
||||
gt_intr[, j, j] <- intr_diag[, j]
|
||||
}
|
||||
# These should be relatively close:
|
||||
expect_lt(max(abs(intr - gt_intr)), 0.1)
|
||||
@@ -107,7 +107,7 @@ test_that("SHAP contribution values are not NAN", {
|
||||
|
||||
shaps <- as.data.frame(predict(fit,
|
||||
newdata = as.matrix(subset(d, fold == 1)[, ivs]),
|
||||
predcontrib = T))
|
||||
predcontrib = TRUE))
|
||||
result <- cbind(shaps, sum = rowSums(shaps), pred = predict(fit,
|
||||
newdata = as.matrix(subset(d, fold == 1)[, ivs])))
|
||||
|
||||
@@ -116,26 +116,26 @@ test_that("SHAP contribution values are not NAN", {
|
||||
|
||||
|
||||
test_that("multiclass feature interactions work", {
|
||||
dm <- xgb.DMatrix(as.matrix(iris[,-5]), label=as.numeric(iris$Species)-1)
|
||||
param <- list(eta=0.1, max_depth=4, objective='multi:softprob', num_class=3)
|
||||
dm <- xgb.DMatrix(as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1)
|
||||
param <- list(eta = 0.1, max_depth = 4, objective = 'multi:softprob', num_class = 3)
|
||||
b <- xgb.train(param, dm, 40)
|
||||
pred = predict(b, dm, outputmargin=TRUE) %>% array(c(3, 150)) %>% t
|
||||
pred <- predict(b, dm, outputmargin = TRUE) %>% array(c(3, 150)) %>% t
|
||||
|
||||
# SHAP contributions:
|
||||
cont <- predict(b, dm, predcontrib=TRUE)
|
||||
cont <- predict(b, dm, predcontrib = TRUE)
|
||||
expect_length(cont, 3)
|
||||
# rewrap them as a 3d array
|
||||
cont <- unlist(cont) %>% array(c(150, 5, 3))
|
||||
# make sure for each row they add up to marginal predictions
|
||||
max(abs(apply(cont, c(1,3), sum) - pred)) %>% expect_lt(0.001)
|
||||
max(abs(apply(cont, c(1, 3), sum) - pred)) %>% expect_lt(0.001)
|
||||
|
||||
# SHAP interaction contributions:
|
||||
intr <- predict(b, dm, predinteraction=TRUE)
|
||||
intr <- predict(b, dm, predinteraction = TRUE)
|
||||
expect_length(intr, 3)
|
||||
# rewrap them as a 4d array
|
||||
intr <- unlist(intr) %>% array(c(150, 5, 5, 3)) %>% aperm(c(4, 1, 2, 3)) # [grp, row, col, col]
|
||||
# check the symmetry
|
||||
max(abs(aperm(intr, c(1,2,4,3)) - intr)) %>% expect_lt(0.00001)
|
||||
max(abs(aperm(intr, c(1, 2, 4, 3)) - intr)) %>% expect_lt(0.00001)
|
||||
# sums WRT columns must be close to feature contributions
|
||||
max(abs(apply(intr, c(1,2,3), sum) - aperm(cont, c(3,1,2)))) %>% expect_lt(0.00001)
|
||||
max(abs(apply(intr, c(1, 2, 3), sum) - aperm(cont, c(3, 1, 2)))) %>% expect_lt(0.00001)
|
||||
})
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
context("Code is of high quality and lint free")
|
||||
test_that("Code Lint", {
|
||||
skip_on_cran()
|
||||
skip_on_travis()
|
||||
skip_if_not_installed("lintr")
|
||||
my_linters <- list(
|
||||
absolute_paths_linter=lintr::absolute_paths_linter,
|
||||
assignment_linter=lintr::assignment_linter,
|
||||
closed_curly_linter=lintr::closed_curly_linter,
|
||||
commas_linter=lintr::commas_linter,
|
||||
# commented_code_linter=lintr::commented_code_linter,
|
||||
infix_spaces_linter=lintr::infix_spaces_linter,
|
||||
line_length_linter=lintr::line_length_linter,
|
||||
no_tab_linter=lintr::no_tab_linter,
|
||||
object_usage_linter=lintr::object_usage_linter,
|
||||
# snake_case_linter=lintr::snake_case_linter,
|
||||
# multiple_dots_linter=lintr::multiple_dots_linter,
|
||||
object_length_linter=lintr::object_length_linter,
|
||||
open_curly_linter=lintr::open_curly_linter,
|
||||
# single_quotes_linter=lintr::single_quotes_linter,
|
||||
spaces_inside_linter=lintr::spaces_inside_linter,
|
||||
spaces_left_parentheses_linter=lintr::spaces_left_parentheses_linter,
|
||||
trailing_blank_lines_linter=lintr::trailing_blank_lines_linter,
|
||||
trailing_whitespace_linter=lintr::trailing_whitespace_linter
|
||||
)
|
||||
# lintr::expect_lint_free(linters=my_linters) # uncomment this if you want to check code quality
|
||||
})
|
||||
84
R-package/tests/testthat/test_model_compatibility.R
Normal file
84
R-package/tests/testthat/test_model_compatibility.R
Normal file
@@ -0,0 +1,84 @@
|
||||
require(xgboost)
|
||||
require(jsonlite)
|
||||
|
||||
context("Models from previous versions of XGBoost can be loaded")
|
||||
|
||||
metadata <- list(
|
||||
kRounds = 2,
|
||||
kRows = 1000,
|
||||
kCols = 4,
|
||||
kForests = 2,
|
||||
kMaxDepth = 2,
|
||||
kClasses = 3
|
||||
)
|
||||
|
||||
run_model_param_check <- function (config) {
|
||||
testthat::expect_equal(config$learner$learner_model_param$num_feature, '4')
|
||||
testthat::expect_equal(config$learner$learner_train_param$booster, 'gbtree')
|
||||
}
|
||||
|
||||
get_num_tree <- function (booster) {
|
||||
dump <- xgb.dump(booster)
|
||||
m <- regexec('booster\\[[0-9]+\\]', dump, perl = TRUE)
|
||||
m <- regmatches(dump, m)
|
||||
num_tree <- Reduce('+', lapply(m, length))
|
||||
return (num_tree)
|
||||
}
|
||||
|
||||
run_booster_check <- function (booster, name) {
|
||||
# If given a handle, we need to call xgb.Booster.complete() prior to using xgb.config().
|
||||
if (inherits(booster, "xgb.Booster") && xgboost:::is.null.handle(booster$handle)) {
|
||||
booster <- xgb.Booster.complete(booster)
|
||||
}
|
||||
config <- jsonlite::fromJSON(xgb.config(booster))
|
||||
run_model_param_check(config)
|
||||
if (name == 'cls') {
|
||||
testthat::expect_equal(get_num_tree(booster),
|
||||
metadata$kForests * metadata$kRounds * metadata$kClasses)
|
||||
testthat::expect_equal(as.numeric(config$learner$learner_model_param$base_score), 0.5)
|
||||
testthat::expect_equal(config$learner$learner_train_param$objective, 'multi:softmax')
|
||||
testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class),
|
||||
metadata$kClasses)
|
||||
} else if (name == 'logit') {
|
||||
testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
|
||||
testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class), 0)
|
||||
testthat::expect_equal(config$learner$learner_train_param$objective, 'binary:logistic')
|
||||
} else if (name == 'ltr') {
|
||||
testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
|
||||
testthat::expect_equal(config$learner$learner_train_param$objective, 'rank:ndcg')
|
||||
} else {
|
||||
testthat::expect_equal(name, 'reg')
|
||||
testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
|
||||
testthat::expect_equal(as.numeric(config$learner$learner_model_param$base_score), 0.5)
|
||||
testthat::expect_equal(config$learner$learner_train_param$objective, 'reg:squarederror')
|
||||
}
|
||||
}
|
||||
|
||||
test_that("Models from previous versions of XGBoost can be loaded", {
|
||||
bucket <- 'xgboost-ci-jenkins-artifacts'
|
||||
region <- 'us-west-2'
|
||||
file_name <- 'xgboost_r_model_compatibility_test.zip'
|
||||
zipfile <- file.path(getwd(), file_name)
|
||||
model_dir <- file.path(getwd(), 'models')
|
||||
download.file(paste('https://', bucket, '.s3-', region, '.amazonaws.com/', file_name, sep = ''),
|
||||
destfile = zipfile, mode = 'wb')
|
||||
unzip(zipfile, overwrite = TRUE)
|
||||
|
||||
pred_data <- xgb.DMatrix(matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4))
|
||||
|
||||
lapply(list.files(model_dir), function (x) {
|
||||
model_file <- file.path(model_dir, x)
|
||||
m <- regexec("xgboost-([0-9\\.]+)\\.([a-z]+)\\.[a-z]+", model_file, perl = TRUE)
|
||||
m <- regmatches(model_file, m)[[1]]
|
||||
model_xgb_ver <- m[2]
|
||||
name <- m[3]
|
||||
|
||||
if (endsWith(model_file, '.rds')) {
|
||||
booster <- readRDS(model_file)
|
||||
} else {
|
||||
booster <- xgb.load(model_file)
|
||||
}
|
||||
predict(booster, newdata = pred_data)
|
||||
run_booster_check(booster, name)
|
||||
})
|
||||
})
|
||||
@@ -3,22 +3,21 @@ require(xgboost)
|
||||
context("monotone constraints")
|
||||
|
||||
set.seed(1024)
|
||||
x = rnorm(1000, 10)
|
||||
y = -1*x + rnorm(1000, 0.001) + 3*sin(x)
|
||||
train = matrix(x, ncol = 1)
|
||||
x <- rnorm(1000, 10)
|
||||
y <- -1 * x + rnorm(1000, 0.001) + 3 * sin(x)
|
||||
train <- matrix(x, ncol = 1)
|
||||
|
||||
|
||||
test_that("monotone constraints for regression", {
|
||||
bst = xgboost(data = train, label = y, max_depth = 2,
|
||||
eta = 0.1, nthread = 2, nrounds = 100, verbose = 0,
|
||||
monotone_constraints = -1)
|
||||
bst <- xgboost(data = train, label = y, max_depth = 2,
|
||||
eta = 0.1, nthread = 2, nrounds = 100, verbose = 0,
|
||||
monotone_constraints = -1)
|
||||
|
||||
pred = predict(bst, train)
|
||||
|
||||
ind = order(train[,1])
|
||||
pred.ord = pred[ind]
|
||||
expect_true({
|
||||
!any(diff(pred.ord) > 0)
|
||||
}, "Monotone Contraint Satisfied")
|
||||
pred <- predict(bst, train)
|
||||
|
||||
ind <- order(train[, 1])
|
||||
pred.ord <- pred[ind]
|
||||
expect_true({
|
||||
!any(diff(pred.ord) > 0)
|
||||
}, "Monotone Contraint Satisfied")
|
||||
})
|
||||
|
||||
@@ -2,8 +2,8 @@ context('Test model params and call are exposed to R')
|
||||
|
||||
require(xgboost)
|
||||
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
data(agaricus.train, package = 'xgboost')
|
||||
data(agaricus.test, package = 'xgboost')
|
||||
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
@@ -5,10 +5,10 @@ set.seed(1994)
|
||||
|
||||
test_that("poisson regression works", {
|
||||
data(mtcars)
|
||||
bst <- xgboost(data = as.matrix(mtcars[,-11]), label = mtcars[,11],
|
||||
objective = 'count:poisson', nrounds=10, verbose=0)
|
||||
bst <- xgboost(data = as.matrix(mtcars[, -11]), label = mtcars[, 11],
|
||||
objective = 'count:poisson', nrounds = 10, verbose = 0)
|
||||
expect_equal(class(bst), "xgb.Booster")
|
||||
pred <- predict(bst, as.matrix(mtcars[, -11]))
|
||||
expect_equal(length(pred), 32)
|
||||
expect_lt(sqrt(mean( (pred - mtcars[,11])^2 )), 1.2)
|
||||
expect_lt(sqrt(mean((pred - mtcars[, 11])^2)), 1.2)
|
||||
})
|
||||
|
||||
51
R-package/tests/testthat/test_ranking.R
Normal file
51
R-package/tests/testthat/test_ranking.R
Normal file
@@ -0,0 +1,51 @@
|
||||
require(xgboost)
|
||||
require(Matrix)
|
||||
|
||||
context('Learning to rank')
|
||||
|
||||
test_that('Test ranking with unweighted data', {
|
||||
X <- sparseMatrix(i = c(2, 3, 7, 9, 12, 15, 17, 18),
|
||||
j = c(1, 1, 2, 2, 3, 3, 4, 4),
|
||||
x = rep(1.0, 8), dims = c(20, 4))
|
||||
y <- c(0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0)
|
||||
group <- c(5, 5, 5, 5)
|
||||
dtrain <- xgb.DMatrix(X, label = y, group = group)
|
||||
|
||||
params <- list(eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1,
|
||||
eval_metric = 'auc', eval_metric = 'aucpr')
|
||||
bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
|
||||
# Check if the metric is monotone increasing
|
||||
expect_true(all(diff(bst$evaluation_log$train_auc) >= 0))
|
||||
expect_true(all(diff(bst$evaluation_log$train_aucpr) >= 0))
|
||||
})
|
||||
|
||||
test_that('Test ranking with weighted data', {
|
||||
X <- sparseMatrix(i = c(2, 3, 7, 9, 12, 15, 17, 18),
|
||||
j = c(1, 1, 2, 2, 3, 3, 4, 4),
|
||||
x = rep(1.0, 8), dims = c(20, 4))
|
||||
y <- c(0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0)
|
||||
group <- c(5, 5, 5, 5)
|
||||
weight <- c(1.0, 2.0, 3.0, 4.0)
|
||||
dtrain <- xgb.DMatrix(X, label = y, group = group, weight = weight)
|
||||
|
||||
params <- list(eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1,
|
||||
eval_metric = 'auc', eval_metric = 'aucpr')
|
||||
bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
|
||||
# Check if the metric is monotone increasing
|
||||
expect_true(all(diff(bst$evaluation_log$train_auc) >= 0))
|
||||
expect_true(all(diff(bst$evaluation_log$train_aucpr) >= 0))
|
||||
for (i in 1:10) {
|
||||
pred <- predict(bst, newdata = dtrain, ntreelimit = i)
|
||||
# is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
|
||||
is_sorted <- lapply(seq(1, 20, by = 5),
|
||||
function (k) {
|
||||
ind <- order(-pred[k:(k + 4)])
|
||||
z <- y[ind + (k - 1)]
|
||||
all(diff(z) <= 0) # Check if z is monotone decreasing
|
||||
})
|
||||
# Since we give weights 1, 2, 3, 4 to the four query groups,
|
||||
# the ranking predictor will first try to correctly sort the last query group
|
||||
# before correctly sorting other groups.
|
||||
expect_true(all(diff(as.numeric(is_sorted)) >= 0))
|
||||
}
|
||||
})
|
||||
@@ -9,10 +9,10 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
# Disable flaky tests for 32-bit Windows.
|
||||
# See https://github.com/dmlc/xgboost/issues/3720
|
||||
win32_flag = .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8
|
||||
win32_flag <- .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8
|
||||
|
||||
test_that("updating the model works", {
|
||||
watchlist = list(train = dtrain, test = dtest)
|
||||
watchlist <- list(train = dtrain, test = dtest)
|
||||
|
||||
# no-subsampling
|
||||
p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
|
||||
@@ -95,7 +95,7 @@ test_that("updating works for multiclass & multitree", {
|
||||
tr0 <- xgb.model.dt.tree(model = bst0)
|
||||
|
||||
# run update process for an original model with subsampling
|
||||
p0u <- modifyList(p0, list(process_type='update', updater='refresh', refresh_leaf=FALSE))
|
||||
p0u <- modifyList(p0, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
|
||||
bst0u <- xgb.train(p0u, dtr, nrounds = bst0$niter, watchlist, xgb_model = bst0, verbose = 0)
|
||||
tr0u <- xgb.model.dt.tree(model = bst0u)
|
||||
|
||||
|
||||
@@ -57,16 +57,16 @@ To answer the question above we will convert *categorical* variables to `numeric
|
||||
|
||||
In this Vignette we will see how to transform a *dense* `data.frame` (*dense* = few zeroes in the matrix) with *categorical* variables to a very *sparse* matrix (*sparse* = lots of zero in the matrix) of `numeric` features.
|
||||
|
||||
The method we are going to see is usually called [one-hot encoding](http://en.wikipedia.org/wiki/One-hot).
|
||||
The method we are going to see is usually called [one-hot encoding](https://en.wikipedia.org/wiki/One-hot).
|
||||
|
||||
The first step is to load `Arthritis` dataset in memory and wrap it with `data.table` package.
|
||||
|
||||
```{r, results='hide'}
|
||||
data(Arthritis)
|
||||
df <- data.table(Arthritis, keep.rownames = F)
|
||||
df <- data.table(Arthritis, keep.rownames = FALSE)
|
||||
```
|
||||
|
||||
> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](http://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **Xgboost** **R** package use `data.table`.
|
||||
> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](https://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **Xgboost** **R** package use `data.table`.
|
||||
|
||||
The first thing we want to do is to have a look to the first few lines of the `data.table`:
|
||||
|
||||
@@ -137,8 +137,8 @@ levels(df[,Treatment])
|
||||
#### Encoding categorical features
|
||||
|
||||
Next step, we will transform the categorical data to dummy variables.
|
||||
Several encoding methods exist, e.g., [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) is a common approach.
|
||||
We will use the [dummy contrast coding](http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm#dummy) which is popular because it produces "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)).
|
||||
Several encoding methods exist, e.g., [one-hot encoding](https://en.wikipedia.org/wiki/One-hot) is a common approach.
|
||||
We will use the [dummy contrast coding](https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/) which is popular because it produces "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)).
|
||||
|
||||
The purpose is to transform each value of each *categorical* feature into a *binary* feature `{0, 1}`.
|
||||
|
||||
@@ -176,7 +176,7 @@ bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4,
|
||||
|
||||
You can see some `train-error: 0.XXXXX` lines followed by a number. It decreases. Each line shows how well the model explains your data. Lower is better.
|
||||
|
||||
A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitting) (meaning it copy/paste too much the past, and won't be that good to predict the future).
|
||||
A model which fits too well may [overfit](https://en.wikipedia.org/wiki/Overfitting) (meaning it copy/paste too much the past, and won't be that good to predict the future).
|
||||
|
||||
> Here you can see the numbers decrease until line 7 and then increase.
|
||||
>
|
||||
@@ -304,7 +304,7 @@ Linear model may not be that smart in this scenario.
|
||||
Special Note: What about Random Forests™?
|
||||
-----------------------------------------
|
||||
|
||||
As you may know, [Random Forests™](http://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](http://en.wikipedia.org/wiki/Ensemble_learning) family.
|
||||
As you may know, [Random Forests™](https://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](https://en.wikipedia.org/wiki/Ensemble_learning) family.
|
||||
|
||||
Both trains several decision trees for one dataset. The *main* difference is that in Random Forests™, trees are independent and in boosting, the tree `N+1` focus its learning on the loss (<=> what has not been well modeled by the tree `N`).
|
||||
|
||||
|
||||
@@ -163,7 +163,7 @@ evalerror <- function(preds, dtrain) {
|
||||
|
||||
dtest <- xgb.DMatrix(test$data, label = test$label)
|
||||
watchlist <- list(eval = dtest, train = dtrain)
|
||||
param <- list(max_depth = 2, eta = 1, silent = 1)
|
||||
param <- list(max_depth = 2, eta = 1)
|
||||
|
||||
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, logregobj, evalerror, maximize = FALSE)
|
||||
@
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
author = "K. Bache and M. Lichman",
|
||||
year = "2013",
|
||||
title = "{UCI} Machine Learning Repository",
|
||||
url = "http://archive.ics.uci.edu/ml",
|
||||
url = "http://archive.ics.uci.edu/ml/",
|
||||
institution = "University of California, Irvine, School of Information and Computer Sciences"
|
||||
}
|
||||
|
||||
|
||||
@@ -68,7 +68,7 @@ The version 0.4-2 is on CRAN, and you can install it by:
|
||||
install.packages("xgboost")
|
||||
```
|
||||
|
||||
Formerly available versions can be obtained from the CRAN [archive](https://cran.r-project.org/src/contrib/Archive/xgboost)
|
||||
Formerly available versions can be obtained from the CRAN [archive](https://cran.r-project.org/src/contrib/Archive/xgboost/)
|
||||
|
||||
## Learning
|
||||
|
||||
@@ -363,7 +363,7 @@ xgb.plot.importance(importance_matrix = importance_matrix)
|
||||
You can dump the tree you learned using `xgb.dump` into a text file.
|
||||
|
||||
```{r dump, message=T, warning=F}
|
||||
xgb.dump(bst, with_stats = T)
|
||||
xgb.dump(bst, with_stats = TRUE)
|
||||
```
|
||||
|
||||
You can plot the trees from your model using ```xgb.plot.tree``
|
||||
|
||||
@@ -69,13 +69,13 @@
|
||||
#include "../src/learner.cc"
|
||||
#include "../src/logging.cc"
|
||||
#include "../src/common/common.cc"
|
||||
#include "../src/common/charconv.cc"
|
||||
#include "../src/common/timer.cc"
|
||||
#include "../src/common/host_device_vector.cc"
|
||||
#include "../src/common/hist_util.cc"
|
||||
#include "../src/common/json.cc"
|
||||
#include "../src/common/io.cc"
|
||||
#include "../src/common/survival_util.cc"
|
||||
#include "../src/common/probability_distribution.cc"
|
||||
#include "../src/common/version.cc"
|
||||
|
||||
// c_api
|
||||
|
||||
64
appveyor.yml
64
appveyor.yml
@@ -1,6 +1,4 @@
|
||||
environment:
|
||||
R_ARCH: x64
|
||||
USE_RTOOLS: true
|
||||
matrix:
|
||||
- target: msvc
|
||||
ver: 2015
|
||||
@@ -12,13 +10,6 @@ environment:
|
||||
configuration: Release
|
||||
- target: mingw
|
||||
generator: "Unix Makefiles"
|
||||
- target: jvm
|
||||
- target: rmsvc
|
||||
ver: 2015
|
||||
generator: "Visual Studio 14 2015 Win64"
|
||||
configuration: Release
|
||||
- target: rmingw
|
||||
generator: "Unix Makefiles"
|
||||
|
||||
#matrix:
|
||||
# fast_finish: true
|
||||
@@ -44,21 +35,9 @@ install:
|
||||
- if /i "%DO_PYTHON%" == "on" (
|
||||
conda config --set always_yes true &&
|
||||
conda update -q conda &&
|
||||
conda install -y numpy scipy pandas matplotlib pytest scikit-learn graphviz python-graphviz
|
||||
conda install -y numpy scipy pandas matplotlib pytest scikit-learn graphviz python-graphviz hypothesis
|
||||
)
|
||||
- set PATH=C:\Miniconda3-x64\Library\bin\graphviz;%PATH%
|
||||
# R: based on https://github.com/krlmlr/r-appveyor
|
||||
- ps: |
|
||||
if($env:target -eq 'rmingw' -or $env:target -eq 'rmsvc') {
|
||||
#$ErrorActionPreference = "Stop"
|
||||
Invoke-WebRequest https://raw.githubusercontent.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "$Env:TEMP\appveyor-tool.ps1"
|
||||
Import-Module "$Env:TEMP\appveyor-tool.ps1"
|
||||
Bootstrap
|
||||
$BINARY_DEPS = "c('XML','igraph')"
|
||||
cmd.exe /c "R.exe -q -e ""install.packages($BINARY_DEPS, repos='$CRAN', type='win.binary')"" 2>&1"
|
||||
$DEPS = "c('data.table','magrittr','stringi','ggplot2','DiagrammeR','Ckmeans.1d.dp','vcd','testthat','lintr','knitr','rmarkdown')"
|
||||
cmd.exe /c "R.exe -q -e ""install.packages($DEPS, repos='$CRAN', type='both')"" 2>&1"
|
||||
}
|
||||
|
||||
build_script:
|
||||
- cd %APPVEYOR_BUILD_FOLDER%
|
||||
@@ -81,53 +60,12 @@ build_script:
|
||||
mkdir wheel &&
|
||||
python setup.py bdist_wheel --universal --plat-name win-amd64 -d wheel
|
||||
)
|
||||
# R package: make + mingw standard CRAN packaging (only x64 for now)
|
||||
- if /i "%target%" == "rmingw" (
|
||||
make Rbuild &&
|
||||
ls -l &&
|
||||
R.exe CMD INSTALL xgboost*.tar.gz
|
||||
)
|
||||
# R package: cmake + VC2015
|
||||
- if /i "%target%" == "rmsvc" (
|
||||
mkdir build_rmsvc%ver% &&
|
||||
cd build_rmsvc%ver% &&
|
||||
cmake .. -G"%generator%" -DCMAKE_CONFIGURATION_TYPES="Release" -DR_LIB=ON &&
|
||||
cmake --build . --target install --config Release
|
||||
)
|
||||
- if /i "%target%" == "jvm" cd jvm-packages && mvn test -pl :xgboost4j_2.12
|
||||
|
||||
test_script:
|
||||
- cd %APPVEYOR_BUILD_FOLDER%
|
||||
- if /i "%DO_PYTHON%" == "on" python -m pytest tests/python
|
||||
# mingw R package: run the R check (which includes unit tests), and also keep the built binary package
|
||||
- if /i "%target%" == "rmingw" (
|
||||
set _R_CHECK_CRAN_INCOMING_=FALSE&&
|
||||
set _R_CHECK_FORCE_SUGGESTS_=FALSE&&
|
||||
R.exe CMD check xgboost*.tar.gz --no-manual --no-build-vignettes --as-cran --install-args=--build
|
||||
)
|
||||
# MSVC R package: run only the unit tests
|
||||
- if /i "%target%" == "rmsvc" (
|
||||
cd build_rmsvc%ver%\R-package &&
|
||||
R.exe -q -e "library(testthat); setwd('tests'); source('testthat.R')"
|
||||
)
|
||||
|
||||
on_failure:
|
||||
# keep the whole output of R check
|
||||
- if /i "%target%" == "rmingw" (
|
||||
7z a failure.zip *.Rcheck\* &&
|
||||
appveyor PushArtifact failure.zip
|
||||
)
|
||||
|
||||
artifacts:
|
||||
# log from R check
|
||||
- path: '*.Rcheck\**\*.log'
|
||||
name: Logs
|
||||
# source R-package
|
||||
- path: '\xgboost_*.tar.gz'
|
||||
name: Bits
|
||||
# binary R-package
|
||||
- path: '**\xgboost_*.zip'
|
||||
name: Bits
|
||||
# binary Python wheel package
|
||||
- path: '**\*.whl'
|
||||
name: Bits
|
||||
|
||||
34
cmake/RPackageInstall.cmake.in
Normal file
34
cmake/RPackageInstall.cmake.in
Normal file
@@ -0,0 +1,34 @@
|
||||
# Commands to install the R package as a CMake install target
|
||||
|
||||
function(check_call)
|
||||
set(cmd COMMAND)
|
||||
cmake_parse_arguments(
|
||||
PARSE_ARGV 0
|
||||
CALL_ARG "" "" "${cmd}"
|
||||
)
|
||||
string(REPLACE ";" " " commands "${CALL_ARG_COMMAND}")
|
||||
message("Command: ${commands}")
|
||||
execute_process(COMMAND ${CALL_ARG_COMMAND}
|
||||
OUTPUT_VARIABLE _out
|
||||
ERROR_VARIABLE _err
|
||||
RESULT_VARIABLE _res)
|
||||
if(NOT "${_res}" EQUAL "0")
|
||||
message(FATAL_ERROR "out: ${_out}, err: ${_err}, res: ${_res}")
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
# Important paths
|
||||
set(build_dir "@build_dir@")
|
||||
set(LIBR_EXECUTABLE "@LIBR_EXECUTABLE@")
|
||||
|
||||
# Back up cmake_install.cmake
|
||||
file(WRITE "${build_dir}/R-package/src/Makevars" "all:")
|
||||
file(WRITE "${build_dir}/R-package/src/Makevars.win" "all:")
|
||||
|
||||
# Install dependencies
|
||||
set(XGB_DEPS_SCRIPT
|
||||
"deps = setdiff(c('data.table', 'magrittr', 'stringi'), rownames(installed.packages())); if(length(deps)>0) install.packages(deps, repo = 'https://cloud.r-project.org/')")
|
||||
check_call(COMMAND "${LIBR_EXECUTABLE}" -q -e "${XGB_DEPS_SCRIPT}")
|
||||
|
||||
# Install the XGBoost R package
|
||||
check_call(COMMAND "${LIBR_EXECUTABLE}" CMD INSTALL --no-multiarch --build "${build_dir}/R-package")
|
||||
16
cmake/RPackageInstallTargetSetup.cmake
Normal file
16
cmake/RPackageInstallTargetSetup.cmake
Normal file
@@ -0,0 +1,16 @@
|
||||
# Assembles the R-package files in build_dir;
|
||||
# if necessary, installs the main R package dependencies;
|
||||
# runs R CMD INSTALL.
|
||||
function(setup_rpackage_install_target rlib_target build_dir)
|
||||
configure_file(${PROJECT_SOURCE_DIR}/cmake/RPackageInstall.cmake.in ${PROJECT_BINARY_DIR}/RPackageInstall.cmake @ONLY)
|
||||
install(
|
||||
DIRECTORY "${xgboost_SOURCE_DIR}/R-package"
|
||||
DESTINATION "${build_dir}"
|
||||
REGEX "src/*" EXCLUDE
|
||||
REGEX "R-package/configure" EXCLUDE
|
||||
)
|
||||
install(TARGETS ${rlib_target}
|
||||
LIBRARY DESTINATION "${build_dir}/R-package/src/"
|
||||
RUNTIME DESTINATION "${build_dir}/R-package/src/")
|
||||
install(SCRIPT ${PROJECT_BINARY_DIR}/RPackageInstall.cmake)
|
||||
endfunction()
|
||||
@@ -110,34 +110,9 @@ function(format_gencode_flags flags out)
|
||||
set(${out} "${${out}}" PARENT_SCOPE)
|
||||
endfunction(format_gencode_flags flags)
|
||||
|
||||
# Assembles the R-package files in build_dir;
|
||||
# if necessary, installs the main R package dependencies;
|
||||
# runs R CMD INSTALL.
|
||||
function(setup_rpackage_install_target rlib_target build_dir)
|
||||
# backup cmake_install.cmake
|
||||
install(CODE "file(COPY \"${build_dir}/R-package/cmake_install.cmake\"
|
||||
DESTINATION \"${build_dir}/bak\")")
|
||||
|
||||
install(CODE "file(REMOVE_RECURSE \"${build_dir}/R-package\")")
|
||||
install(
|
||||
DIRECTORY "${xgboost_SOURCE_DIR}/R-package"
|
||||
DESTINATION "${build_dir}"
|
||||
REGEX "src/*" EXCLUDE
|
||||
REGEX "R-package/configure" EXCLUDE
|
||||
)
|
||||
install(TARGETS ${rlib_target}
|
||||
LIBRARY DESTINATION "${build_dir}/R-package/src/"
|
||||
RUNTIME DESTINATION "${build_dir}/R-package/src/")
|
||||
install(CODE "file(WRITE \"${build_dir}/R-package/src/Makevars\" \"all:\")")
|
||||
install(CODE "file(WRITE \"${build_dir}/R-package/src/Makevars.win\" \"all:\")")
|
||||
set(XGB_DEPS_SCRIPT
|
||||
"deps = setdiff(c('data.table', 'magrittr', 'stringi'), rownames(installed.packages()));\
|
||||
if(length(deps)>0) install.packages(deps, repo = 'https://cloud.r-project.org/')")
|
||||
install(CODE "execute_process(COMMAND \"${LIBR_EXECUTABLE}\" \"-q\" \"-e\" \"${XGB_DEPS_SCRIPT}\")")
|
||||
install(CODE "execute_process(COMMAND \"${LIBR_EXECUTABLE}\" CMD INSTALL\
|
||||
\"--no-multiarch\" \"--build\" \"${build_dir}/R-package\")")
|
||||
|
||||
# restore cmake_install.cmake
|
||||
install(CODE "file(RENAME \"${build_dir}/bak/cmake_install.cmake\"
|
||||
\"${build_dir}/R-package/cmake_install.cmake\")")
|
||||
endfunction(setup_rpackage_install_target)
|
||||
macro(enable_nvtx target)
|
||||
find_package(NVTX REQUIRED)
|
||||
target_include_directories(${target} PRIVATE "${NVTX_INCLUDE_DIR}")
|
||||
target_link_libraries(${target} PRIVATE "${NVTX_LIBRARY}")
|
||||
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
|
||||
endmacro()
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
|
||||
# Windows users might want to change this to their R version:
|
||||
if(NOT R_VERSION)
|
||||
set(R_VERSION "3.4.1")
|
||||
set(R_VERSION "4.0.0")
|
||||
endif()
|
||||
if(NOT R_ARCH)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "4")
|
||||
@@ -37,22 +37,32 @@ endif()
|
||||
# Creates R.lib and R.def in the build directory for linking with MSVC
|
||||
function(create_rlib_for_msvc)
|
||||
# various checks and warnings
|
||||
if(NOT WIN32 OR NOT MSVC)
|
||||
message(FATAL_ERROR "create_rlib_for_msvc() can only be used with MSVC")
|
||||
if(NOT WIN32 OR (NOT MSVC AND NOT MINGW))
|
||||
message(FATAL_ERROR "create_rlib_for_msvc() can only be used with MSVC or MINGW")
|
||||
endif()
|
||||
if(NOT EXISTS "${LIBR_LIB_DIR}")
|
||||
message(FATAL_ERROR "LIBR_LIB_DIR was not set!")
|
||||
endif()
|
||||
find_program(GENDEF_EXE gendef)
|
||||
find_program(DLLTOOL_EXE dlltool)
|
||||
if(NOT GENDEF_EXE OR NOT DLLTOOL_EXE)
|
||||
message(FATAL_ERROR "\nEither gendef.exe or dlltool.exe not found!\
|
||||
if(NOT DLLTOOL_EXE)
|
||||
message(FATAL_ERROR "\ndlltool.exe not found!\
|
||||
\nDo you have Rtools installed with its MinGW's bin/ in PATH?")
|
||||
endif()
|
||||
|
||||
# extract symbols from R.dll into R.def and R.lib import library
|
||||
execute_process(COMMAND ${GENDEF_EXE}
|
||||
"-" "${LIBR_LIB_DIR}/R.dll"
|
||||
OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/R.def")
|
||||
get_filename_component(
|
||||
LIBR_RSCRIPT_EXECUTABLE_DIR
|
||||
${LIBR_EXECUTABLE}
|
||||
DIRECTORY
|
||||
)
|
||||
set(LIBR_RSCRIPT_EXECUTABLE "${LIBR_RSCRIPT_EXECUTABLE_DIR}/Rscript")
|
||||
|
||||
execute_process(
|
||||
COMMAND ${LIBR_RSCRIPT_EXECUTABLE}
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/../../R-package/inst/make-r-def.R"
|
||||
"${LIBR_LIB_DIR}/R.dll" "${CMAKE_CURRENT_BINARY_DIR}/R.def"
|
||||
)
|
||||
|
||||
execute_process(COMMAND ${DLLTOOL_EXE}
|
||||
"--input-def" "${CMAKE_CURRENT_BINARY_DIR}/R.def"
|
||||
"--output-lib" "${CMAKE_CURRENT_BINARY_DIR}/R.lib")
|
||||
@@ -148,7 +158,7 @@ message(STATUS "LIBR_CORE_LIBRARY [${LIBR_CORE_LIBRARY}]")
|
||||
|
||||
endif()
|
||||
|
||||
if(WIN32 AND MSVC)
|
||||
if((WIN32 AND MSVC) OR (WIN32 AND MINGW))
|
||||
# create a local R.lib import library for R.dll if it doesn't exist
|
||||
if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/R.lib")
|
||||
create_rlib_for_msvc()
|
||||
|
||||
26
cmake/modules/FindNVTX.cmake
Normal file
26
cmake/modules/FindNVTX.cmake
Normal file
@@ -0,0 +1,26 @@
|
||||
if (NVTX_LIBRARY)
|
||||
unset(NVTX_LIBRARY CACHE)
|
||||
endif (NVTX_LIBRARY)
|
||||
|
||||
set(NVTX_LIB_NAME nvToolsExt)
|
||||
|
||||
|
||||
find_path(NVTX_INCLUDE_DIR
|
||||
NAMES nvToolsExt.h
|
||||
PATHS ${CUDA_HOME}/include ${CUDA_INCLUDE} /usr/local/cuda/include)
|
||||
|
||||
|
||||
find_library(NVTX_LIBRARY
|
||||
NAMES nvToolsExt
|
||||
PATHS ${CUDA_HOME}/lib64 /usr/local/cuda/lib64)
|
||||
|
||||
message(STATUS "Using nvtx library: ${NVTX_LIBRARY}")
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(NVTX DEFAULT_MSG
|
||||
NVTX_INCLUDE_DIR NVTX_LIBRARY)
|
||||
|
||||
mark_as_advanced(
|
||||
NVTX_INCLUDE_DIR
|
||||
NVTX_LIBRARY
|
||||
)
|
||||
2
cub
2
cub
Submodule cub updated: b20808b1b0...c3cceac115
@@ -1,6 +1,7 @@
|
||||
"""
|
||||
Demo for survival analysis (regression) using Accelerated Failure Time (AFT) model
|
||||
"""
|
||||
import os
|
||||
from sklearn.model_selection import ShuffleSplit
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
@@ -8,7 +9,8 @@ import xgboost as xgb
|
||||
|
||||
# The Veterans' Administration Lung Cancer Trial
|
||||
# The Statistical Analysis of Failure Time Data by Kalbfleisch J. and Prentice R (1980)
|
||||
df = pd.read_csv('../data/veterans_lung_cancer.csv')
|
||||
CURRENT_DIR = os.path.dirname(__file__)
|
||||
df = pd.read_csv(os.path.join(CURRENT_DIR, '../data/veterans_lung_cancer.csv'))
|
||||
print('Training data:')
|
||||
print(df)
|
||||
|
||||
|
||||
1
demo/c-api/.gitignore
vendored
Normal file
1
demo/c-api/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
c-api-demo
|
||||
@@ -60,6 +60,10 @@ int main(int argc, char** argv) {
|
||||
printf("%s\n", eval_result);
|
||||
}
|
||||
|
||||
bst_ulong num_feature = 0;
|
||||
safe_xgboost(XGBoosterGetNumFeature(booster, &num_feature));
|
||||
printf("num_feature: %llu\n", num_feature);
|
||||
|
||||
// predict
|
||||
bst_ulong out_len = 0;
|
||||
const float* out_result = NULL;
|
||||
|
||||
@@ -14,5 +14,5 @@ data$STATE = as.factor(data$STATE)
|
||||
data$CLASS = as.factor(data$CLASS)
|
||||
data$GENDER = as.factor(data$GENDER)
|
||||
|
||||
data.dummy <- dummy.data.frame(data, dummy.class='factor', omit.constants=T);
|
||||
data.dummy <- dummy.data.frame(data, dummy.class='factor', omit.constants=TRUE);
|
||||
write.table(data.dummy, 'autoclaims.csv', sep=',', row.names=F, col.names=F, quote=F)
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
# GPU Acceleration Demo
|
||||
|
||||
`cover_type.py` shows how to train a model on the [forest cover type](https://archive.ics.uci.edu/ml/datasets/covertype) dataset using GPU acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it time consuming to process. We compare the run-time and accuracy of the GPU and CPU histogram algorithms.
|
||||
|
||||
`memory.py` shows how to repeatedly train xgboost models while freeing memory between iterations.
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import xgboost as xgb
|
||||
import numpy as np
|
||||
from sklearn.datasets import fetch_covtype
|
||||
from sklearn.model_selection import train_test_split
|
||||
import time
|
||||
|
||||
@@ -1,51 +0,0 @@
|
||||
import xgboost as xgb
|
||||
import numpy as np
|
||||
import time
|
||||
import pickle
|
||||
import GPUtil
|
||||
|
||||
n = 10000
|
||||
m = 1000
|
||||
X = np.random.random((n, m))
|
||||
y = np.random.random(n)
|
||||
|
||||
param = {'objective': 'binary:logistic',
|
||||
'tree_method': 'gpu_hist'
|
||||
}
|
||||
iterations = 5
|
||||
dtrain = xgb.DMatrix(X, label=y)
|
||||
|
||||
# High memory usage
|
||||
# active bst objects with device memory persist across iterations
|
||||
boosters = []
|
||||
for i in range(iterations):
|
||||
bst = xgb.train(param, dtrain)
|
||||
boosters.append(bst)
|
||||
|
||||
print("Example 1")
|
||||
GPUtil.showUtilization()
|
||||
del boosters
|
||||
|
||||
# Better memory usage
|
||||
# The bst object can be destroyed by the python gc, freeing device memory
|
||||
# The gc may not immediately free the object, so more than one booster can be allocated at a time
|
||||
boosters = []
|
||||
for i in range(iterations):
|
||||
bst = xgb.train(param, dtrain)
|
||||
boosters.append(pickle.dumps(bst))
|
||||
|
||||
print("Example 2")
|
||||
GPUtil.showUtilization()
|
||||
del boosters
|
||||
|
||||
# Best memory usage
|
||||
# The gc explicitly frees the booster before starting the next iteration
|
||||
boosters = []
|
||||
for i in range(iterations):
|
||||
bst = xgb.train(param, dtrain)
|
||||
boosters.append(pickle.dumps(bst))
|
||||
del bst
|
||||
|
||||
print("Example 3")
|
||||
GPUtil.showUtilization()
|
||||
del boosters
|
||||
1
demo/guide-python/basic_walkthrough.py
Executable file → Normal file
1
demo/guide-python/basic_walkthrough.py
Executable file → Normal file
@@ -1,4 +1,3 @@
|
||||
#!/usr/bin/env python
|
||||
import numpy as np
|
||||
import scipy.sparse
|
||||
import pickle
|
||||
|
||||
10
demo/guide-python/boost_from_prediction.py
Executable file → Normal file
10
demo/guide-python/boost_from_prediction.py
Executable file → Normal file
@@ -1,15 +1,17 @@
|
||||
#!/usr/bin/python
|
||||
import os
|
||||
import xgboost as xgb
|
||||
|
||||
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
|
||||
dtest = xgb.DMatrix('../data/agaricus.txt.test')
|
||||
|
||||
CURRENT_DIR = os.path.dirname(__file__)
|
||||
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
|
||||
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
###
|
||||
# advanced: start from a initial base prediction
|
||||
#
|
||||
print('start running example to start from a initial prediction')
|
||||
# specify parameters via map, definition are same as c++ version
|
||||
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
|
||||
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
|
||||
# train xgboost for 1 round
|
||||
bst = xgb.train(param, dtrain, 1, watchlist)
|
||||
# Note: we need the margin value instead of transformed prediction in
|
||||
|
||||
11
demo/guide-python/cross_validation.py
Executable file → Normal file
11
demo/guide-python/cross_validation.py
Executable file → Normal file
@@ -1,10 +1,11 @@
|
||||
#!/usr/bin/python
|
||||
import os
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
|
||||
### load data in do training
|
||||
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
|
||||
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
|
||||
# load data in do training
|
||||
CURRENT_DIR = os.path.dirname(__file__)
|
||||
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
|
||||
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic'}
|
||||
num_round = 2
|
||||
|
||||
print('running cross validation')
|
||||
@@ -56,7 +57,7 @@ def evalerror(preds, dtrain):
|
||||
labels = dtrain.get_label()
|
||||
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
|
||||
|
||||
param = {'max_depth':2, 'eta':1, 'silent':1}
|
||||
param = {'max_depth':2, 'eta':1}
|
||||
# train with customized objective
|
||||
xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
|
||||
obj=logregobj, feval=evalerror)
|
||||
|
||||
9
demo/guide-python/custom_objective.py
Executable file → Normal file
9
demo/guide-python/custom_objective.py
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/python
|
||||
import os
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
###
|
||||
@@ -6,13 +6,14 @@ import xgboost as xgb
|
||||
#
|
||||
print('start running example to used customized objective function')
|
||||
|
||||
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
|
||||
dtest = xgb.DMatrix('../data/agaricus.txt.test')
|
||||
CURRENT_DIR = os.path.dirname(__file__)
|
||||
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
|
||||
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
|
||||
|
||||
# note: for customized objective function, we leave objective as default
|
||||
# note: what we are getting is margin value in prediction
|
||||
# you must know what you are doing
|
||||
param = {'max_depth': 2, 'eta': 1, 'silent': 1}
|
||||
param = {'max_depth': 2, 'eta': 1}
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
num_round = 2
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ def softprob_obj(predt: np.ndarray, data: xgb.DMatrix):
|
||||
return grad, hess
|
||||
|
||||
|
||||
def predict(booster, X):
|
||||
def predict(booster: xgb.Booster, X):
|
||||
'''A customized prediction function that converts raw prediction to
|
||||
target class.
|
||||
|
||||
@@ -93,15 +93,34 @@ def predict(booster, X):
|
||||
return out
|
||||
|
||||
|
||||
def merror(predt: np.ndarray, dtrain: xgb.DMatrix):
|
||||
y = dtrain.get_label()
|
||||
# Like custom objective, the predt is untransformed leaf weight
|
||||
assert predt.shape == (kRows, kClasses)
|
||||
out = np.zeros(kRows)
|
||||
for r in range(predt.shape[0]):
|
||||
i = np.argmax(predt[r])
|
||||
out[r] = i
|
||||
|
||||
assert y.shape == out.shape
|
||||
|
||||
errors = np.zeros(kRows)
|
||||
errors[y != out] = 1.0
|
||||
return 'PyMError', np.sum(errors) / kRows
|
||||
|
||||
|
||||
def plot_history(custom_results, native_results):
|
||||
fig, axs = plt.subplots(2, 1)
|
||||
ax0 = axs[0]
|
||||
ax1 = axs[1]
|
||||
|
||||
pymerror = custom_results['train']['PyMError']
|
||||
merror = native_results['train']['merror']
|
||||
|
||||
x = np.arange(0, kRounds, 1)
|
||||
ax0.plot(x, custom_results['train']['merror'], label='Custom objective')
|
||||
ax0.plot(x, pymerror, label='Custom objective')
|
||||
ax0.legend()
|
||||
ax1.plot(x, native_results['train']['merror'], label='multi:softmax')
|
||||
ax1.plot(x, merror, label='multi:softmax')
|
||||
ax1.legend()
|
||||
|
||||
plt.show()
|
||||
@@ -110,10 +129,12 @@ def plot_history(custom_results, native_results):
|
||||
def main(args):
|
||||
custom_results = {}
|
||||
# Use our custom objective function
|
||||
booster_custom = xgb.train({'num_class': kClasses},
|
||||
booster_custom = xgb.train({'num_class': kClasses,
|
||||
'disable_default_eval_metric': True},
|
||||
m,
|
||||
num_boost_round=kRounds,
|
||||
obj=softprob_obj,
|
||||
feval=merror,
|
||||
evals_result=custom_results,
|
||||
evals=[(m, 'train')])
|
||||
|
||||
@@ -131,6 +152,8 @@ def main(args):
|
||||
# We are reimplementing the loss function in XGBoost, so it should
|
||||
# be the same for normal cases.
|
||||
assert np.all(predt_custom == predt_native)
|
||||
np.testing.assert_allclose(custom_results['train']['PyMError'],
|
||||
native_results['train']['merror'])
|
||||
|
||||
if args.plot != 0:
|
||||
plot_history(custom_results, native_results)
|
||||
|
||||
109
demo/guide-python/data_iterator.py
Normal file
109
demo/guide-python/data_iterator.py
Normal file
@@ -0,0 +1,109 @@
|
||||
'''A demo for defining data iterator.
|
||||
|
||||
The demo that defines a customized iterator for passing batches of data into
|
||||
`xgboost.DeviceQuantileDMatrix` and use this `DeviceQuantileDMatrix` for
|
||||
training. The feature is used primarily designed to reduce the required GPU
|
||||
memory for training on distributed environment.
|
||||
|
||||
Aftering going through the demo, one might ask why don't we use more native
|
||||
Python iterator? That's because XGBoost requires a `reset` function, while
|
||||
using `itertools.tee` might incur significant memory usage according to:
|
||||
|
||||
https://docs.python.org/3/library/itertools.html#itertools.tee.
|
||||
|
||||
'''
|
||||
|
||||
import xgboost
|
||||
import cupy
|
||||
import numpy
|
||||
|
||||
COLS = 64
|
||||
ROWS_PER_BATCH = 1000 # data is splited by rows
|
||||
BATCHES = 32
|
||||
|
||||
|
||||
class IterForDMatrixDemo(xgboost.core.DataIter):
|
||||
'''A data iterator for XGBoost DMatrix.
|
||||
|
||||
`reset` and `next` are required for any data iterator, other functions here
|
||||
are utilites for demonstration's purpose.
|
||||
|
||||
'''
|
||||
def __init__(self):
|
||||
'''Generate some random data for demostration.
|
||||
|
||||
Actual data can be anything that is currently supported by XGBoost.
|
||||
'''
|
||||
self.rows = ROWS_PER_BATCH
|
||||
self.cols = COLS
|
||||
rng = cupy.random.RandomState(1994)
|
||||
self._data = [rng.randn(self.rows, self.cols)] * BATCHES
|
||||
self._labels = [rng.randn(self.rows)] * BATCHES
|
||||
self._weights = [rng.randn(self.rows)] * BATCHES
|
||||
|
||||
self.it = 0 # set iterator to 0
|
||||
super().__init__()
|
||||
|
||||
def as_array(self):
|
||||
return cupy.concatenate(self._data)
|
||||
|
||||
def as_array_labels(self):
|
||||
return cupy.concatenate(self._labels)
|
||||
|
||||
def as_array_weights(self):
|
||||
return cupy.concatenate(self._weights)
|
||||
|
||||
def data(self):
|
||||
'''Utility function for obtaining current batch of data.'''
|
||||
return self._data[self.it]
|
||||
|
||||
def labels(self):
|
||||
'''Utility function for obtaining current batch of label.'''
|
||||
return self._labels[self.it]
|
||||
|
||||
def weights(self):
|
||||
return self._weights[self.it]
|
||||
|
||||
def reset(self):
|
||||
'''Reset the iterator'''
|
||||
self.it = 0
|
||||
|
||||
def next(self, input_data):
|
||||
'''Yield next batch of data.'''
|
||||
if self.it == len(self._data):
|
||||
# Return 0 when there's no more batch.
|
||||
return 0
|
||||
input_data(data=self.data(), label=self.labels(),
|
||||
weight=self.weights())
|
||||
self.it += 1
|
||||
return 1
|
||||
|
||||
|
||||
def main():
|
||||
rounds = 100
|
||||
it = IterForDMatrixDemo()
|
||||
|
||||
# Use iterator, must be `DeviceQuantileDMatrix`
|
||||
m_with_it = xgboost.DeviceQuantileDMatrix(it)
|
||||
|
||||
# Use regular DMatrix.
|
||||
m = xgboost.DMatrix(it.as_array(), it.as_array_labels(),
|
||||
weight=it.as_array_weights())
|
||||
|
||||
assert m_with_it.num_col() == m.num_col()
|
||||
assert m_with_it.num_row() == m.num_row()
|
||||
|
||||
reg_with_it = xgboost.train({'tree_method': 'gpu_hist'}, m_with_it,
|
||||
num_boost_round=rounds)
|
||||
predict_with_it = reg_with_it.predict(m_with_it)
|
||||
|
||||
reg = xgboost.train({'tree_method': 'gpu_hist'}, m,
|
||||
num_boost_round=rounds)
|
||||
predict = reg.predict(m)
|
||||
|
||||
numpy.testing.assert_allclose(predict_with_it, predict,
|
||||
rtol=1e6)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,10 +1,12 @@
|
||||
##
|
||||
# This script demonstrate how to access the eval metrics in xgboost
|
||||
##
|
||||
|
||||
import os
|
||||
import xgboost as xgb
|
||||
dtrain = xgb.DMatrix('../data/agaricus.txt.train', silent=True)
|
||||
dtest = xgb.DMatrix('../data/agaricus.txt.test', silent=True)
|
||||
|
||||
CURRENT_DIR = os.path.dirname(__file__)
|
||||
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
|
||||
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
|
||||
|
||||
param = [('max_depth', 2), ('objective', 'binary:logistic'), ('eval_metric', 'logloss'), ('eval_metric', 'error')]
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user