enable ROCm on latest XGBoost

Author: Hui Liu
Date:   2023-10-23 11:07:08 -07:00
Commit: 15421e40d9

328 changed files with 8028 additions and 3642 deletions


@@ -141,18 +141,14 @@ jobs:
          architecture: 'x64'
      - name: Install Python packages
        run: |
-          python -m pip install wheel setuptools cpplint pylint
+          python -m pip install wheel setuptools cmakelint cpplint pylint
      - name: Run lint
        run: |
-          python3 dmlc-core/scripts/lint.py xgboost cpp R-package/src
-          python3 dmlc-core/scripts/lint.py --exclude_path \
-            python-package/xgboost/dmlc-core \
-            python-package/xgboost/include \
-            python-package/xgboost/lib \
-            python-package/xgboost/rabit \
-            python-package/xgboost/src \
-            --pylint-rc python-package/.pylintrc \
-            xgboost \
-            cpp \
-            include src python-package
+          python3 tests/ci_build/lint_cpp.py xgboost cpp R-package/src
+          python3 tests/ci_build/lint_cpp.py xgboost cpp include src python-package \
+            --exclude_path python-package/xgboost/dmlc-core python-package/xgboost/include \
+            python-package/xgboost/lib python-package/xgboost/rabit \
+            python-package/xgboost/src
+          sh ./tests/ci_build/lint_cmake.sh || true


@@ -190,7 +190,7 @@ jobs:
        run: |
          mkdir build_msvc
          cd build_msvc
-          cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON
+          cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON
          cmake --build . --config Release --parallel $(nproc)
      - name: Install Python package


@@ -8,9 +8,9 @@ cmake_policy(SET CMP0076 NEW)
 set(CMAKE_POLICY_DEFAULT_CMP0063 NEW)
 cmake_policy(SET CMP0063 NEW)
-if ((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUAL 3.13))
+if((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUAL 3.13))
   cmake_policy(SET CMP0077 NEW)
-endif ((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUAL 3.13))
+endif()
 message(STATUS "CMake version ${CMAKE_VERSION}")
@@ -90,108 +90,99 @@ option(PLUGIN_UPDATER_ONEAPI "DPC++ updater" OFF)
 option(ADD_PKGCONFIG "Add xgboost.pc into system." ON)
 
 #-- Checks for building XGBoost
-if (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
+if(USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
   message(SEND_ERROR "Do not enable `USE_DEBUG_OUTPUT' with release build.")
-endif (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
-if (USE_NCCL AND NOT (USE_CUDA))
+endif()
+if(USE_NCCL AND NOT (USE_CUDA))
   message(SEND_ERROR "`USE_NCCL` must be enabled with `USE_CUDA` flag.")
-endif (USE_NCCL AND NOT (USE_CUDA))
-if (USE_DEVICE_DEBUG AND NOT (USE_CUDA))
+endif()
+if(USE_DEVICE_DEBUG AND NOT (USE_CUDA))
   message(SEND_ERROR "`USE_DEVICE_DEBUG` must be enabled with `USE_CUDA` flag.")
-endif (USE_DEVICE_DEBUG AND NOT (USE_CUDA))
-if (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
+endif()
+if(BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
   message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.")
-endif (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
-if (USE_RCCL AND NOT (USE_HIP))
+endif()
+if(USE_RCCL AND NOT (USE_HIP))
   message(SEND_ERROR "`USE_RCCL` must be enabled with `USE_HIP` flag.")
-endif (USE_RCCL AND NOT (USE_HIP))
-if (USE_DEVICE_DEBUG AND NOT (USE_HIP))
-  message(SEND_ERROR "`USE_DEVICE_DEBUG` must be enabled with `USE_HIP` flag.")
-endif (USE_DEVICE_DEBUG AND NOT (USE_HIP))
-if (BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL))
+endif()
+if(BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL))
   message(SEND_ERROR "Build XGBoost with -DUSE_RCCL=ON to enable BUILD_WITH_SHARED_RCCL.")
-endif (BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL))
-if (JVM_BINDINGS AND R_LIB)
+endif()
+if(JVM_BINDINGS AND R_LIB)
   message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.")
-endif (JVM_BINDINGS AND R_LIB)
-if (R_LIB AND GOOGLE_TEST)
-  message(WARNING "Some C++ unittests will fail with `R_LIB` enabled,
-  as R package redirects some functions to R runtime implementation.")
-endif (R_LIB AND GOOGLE_TEST)
-if (PLUGIN_RMM AND NOT (USE_CUDA))
+endif()
+if(R_LIB AND GOOGLE_TEST)
+  message(
+    WARNING
+    "Some C++ tests will fail with `R_LIB` enabled, as R package redirects some functions to R runtime implementation."
+  )
+endif()
+if(PLUGIN_RMM AND NOT (USE_CUDA))
   message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.")
-endif (PLUGIN_RMM AND NOT (USE_CUDA))
-if (PLUGIN_RMM AND NOT (USE_HIP))
-  message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_HIP` flag.")
-endif (PLUGIN_RMM AND NOT (USE_HIP))
-if (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
+endif()
+if(PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
   message(SEND_ERROR "`PLUGIN_RMM` must be used with GCC or Clang compiler.")
-endif (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
-if (PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux"))
+endif()
+if(PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux"))
   message(SEND_ERROR "`PLUGIN_RMM` must be used with Linux.")
-endif (PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux"))
-if (ENABLE_ALL_WARNINGS)
-  if ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
+endif()
+if(ENABLE_ALL_WARNINGS)
+  if((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
     message(SEND_ERROR "ENABLE_ALL_WARNINGS is only available for Clang and GCC.")
-  endif ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
-endif (ENABLE_ALL_WARNINGS)
-if (BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS))
+  endif()
+endif()
+if(BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS))
   message(SEND_ERROR "Cannot build a static library libxgboost.a when R or JVM packages are enabled.")
-endif (BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS))
-if (PLUGIN_FEDERATED)
-  if (CMAKE_CROSSCOMPILING)
+endif()
+if(PLUGIN_FEDERATED)
+  if(CMAKE_CROSSCOMPILING)
     message(SEND_ERROR "Cannot cross compile with federated learning support")
-  endif ()
-  if (BUILD_STATIC_LIB)
+  endif()
+  if(BUILD_STATIC_LIB)
     message(SEND_ERROR "Cannot build static lib with federated learning support")
-  endif ()
-  if (R_LIB OR JVM_BINDINGS)
+  endif()
+  if(R_LIB OR JVM_BINDINGS)
     message(SEND_ERROR "Cannot enable federated learning support when R or JVM packages are enabled.")
-  endif ()
-  if (WIN32)
+  endif()
+  if(WIN32)
     message(SEND_ERROR "Federated learning not supported for Windows platform")
-  endif ()
-endif ()
+  endif()
+endif()
 
 #-- Removed options
-if (USE_AVX)
+if(USE_AVX)
   message(SEND_ERROR "The option `USE_AVX` is deprecated as experimental AVX features have been removed from XGBoost.")
-endif (USE_AVX)
-if (PLUGIN_LZ4)
+endif()
+if(PLUGIN_LZ4)
   message(SEND_ERROR "The option `PLUGIN_LZ4` is removed from XGBoost.")
-endif (PLUGIN_LZ4)
-if (RABIT_BUILD_MPI)
+endif()
+if(RABIT_BUILD_MPI)
   message(SEND_ERROR "The option `RABIT_BUILD_MPI` has been removed from XGBoost.")
-endif (RABIT_BUILD_MPI)
-if (USE_S3)
+endif()
+if(USE_S3)
   message(SEND_ERROR "The option `USE_S3` has been removed from XGBoost")
-endif (USE_S3)
-if (USE_AZURE)
+endif()
+if(USE_AZURE)
   message(SEND_ERROR "The option `USE_AZURE` has been removed from XGBoost")
-endif (USE_AZURE)
-if (USE_HDFS)
+endif()
+if(USE_HDFS)
   message(SEND_ERROR "The option `USE_HDFS` has been removed from XGBoost")
-endif (USE_HDFS)
+endif()
 
 #-- Sanitizer
-if (USE_SANITIZER)
+if(USE_SANITIZER)
   include(cmake/Sanitizer.cmake)
   enable_sanitizers("${ENABLED_SANITIZERS}")
-endif (USE_SANITIZER)
+endif()
 
-if (USE_CUDA)
+if(USE_CUDA)
   set(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE)
   # `export CXX=' is ignored by CMake CUDA.
   set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
   message(STATUS "Configured CUDA host compiler: ${CMAKE_CUDA_HOST_COMPILER}")
   enable_language(CUDA)
-  if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.0)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.0)
     message(FATAL_ERROR "CUDA version must be at least 11.0!")
   endif()
   set(GEN_CODE "")
@@ -199,7 +190,7 @@ if (USE_CUDA)
   add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap)
 
   find_package(CUDAToolkit REQUIRED)
-endif (USE_CUDA)
+endif()
 
 if (USE_HIP)
   set(USE_OPENMP ON CACHE BOOL "HIP requires OpenMP" FORCE)
@@ -218,7 +209,7 @@ if (USE_HIP)
   add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
 endif (USE_HIP)
 
-if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
+if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
     ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
     (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always")
@@ -226,10 +217,10 @@ endif()
 find_package(Threads REQUIRED)
 
-if (USE_OPENMP)
-  if (APPLE)
+if(USE_OPENMP)
+  if(APPLE)
     find_package(OpenMP)
-    if (NOT OpenMP_FOUND)
+    if(NOT OpenMP_FOUND)
       # Try again with extra path info; required for libomp 15+ from Homebrew
       execute_process(COMMAND brew --prefix libomp
                       OUTPUT_VARIABLE HOMEBREW_LIBOMP_PREFIX
@@ -242,20 +233,20 @@ if (USE_OPENMP)
       set(OpenMP_CXX_LIB_NAMES omp)
       set(OpenMP_omp_LIBRARY ${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib)
       find_package(OpenMP REQUIRED)
-    endif ()
-  else ()
+    endif()
+  else()
     find_package(OpenMP REQUIRED)
-  endif ()
-endif (USE_OPENMP)
+  endif()
+endif()
 
 #Add for IBM i
-if (${CMAKE_SYSTEM_NAME} MATCHES "OS400")
+if(${CMAKE_SYSTEM_NAME} MATCHES "OS400")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
   set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> -X64 qc <TARGET> <OBJECTS>")
 endif()
 
-if (USE_NCCL)
+if(USE_NCCL)
   find_package(Nccl REQUIRED)
-endif (USE_NCCL)
+endif()
 
 if (USE_RCCL)
   find_package(rccl REQUIRED)
@@ -263,17 +254,19 @@ endif (USE_RCCL)
 
 # dmlc-core
 msvc_use_static_runtime()
-if (FORCE_SHARED_CRT)
+if(FORCE_SHARED_CRT)
   set(DMLC_FORCE_SHARED_CRT ON)
-endif ()
+endif()
 add_subdirectory(${xgboost_SOURCE_DIR}/dmlc-core)
 
-if (MSVC)
-  if (TARGET dmlc_unit_tests)
-    target_compile_options(dmlc_unit_tests PRIVATE
-      -D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE)
-  endif (TARGET dmlc_unit_tests)
-endif (MSVC)
+if(MSVC)
+  if(TARGET dmlc_unit_tests)
+    target_compile_options(
+      dmlc_unit_tests PRIVATE
+      -D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE
+    )
+  endif()
+endif()
 
 # rabit
 add_subdirectory(rabit)
@@ -282,20 +275,25 @@ add_subdirectory(rabit)
 add_subdirectory(${xgboost_SOURCE_DIR}/src)
 target_link_libraries(objxgboost PUBLIC dmlc)
 
+# Link -lstdc++fs for GCC 8.x
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.0")
+  target_link_libraries(objxgboost PUBLIC stdc++fs)
+endif()
+
 # Exports some R specific definitions and objects
-if (R_LIB)
+if(R_LIB)
   add_subdirectory(${xgboost_SOURCE_DIR}/R-package)
-endif (R_LIB)
+endif()
 
 # This creates its own shared library `xgboost4j'.
-if (JVM_BINDINGS)
+if(JVM_BINDINGS)
   add_subdirectory(${xgboost_SOURCE_DIR}/jvm-packages)
-endif (JVM_BINDINGS)
+endif()
 
 # Plugin
 add_subdirectory(${xgboost_SOURCE_DIR}/plugin)
 
-if (PLUGIN_RMM)
+if(PLUGIN_RMM)
   find_package(rmm REQUIRED)
 
   # Patch the rmm targets so they reference the static cudart
@@ -306,14 +304,14 @@ if (PLUGIN_RMM)
   list(APPEND rmm_link_libs CUDA::cudart_static)
   set_target_properties(rmm::rmm PROPERTIES INTERFACE_LINK_LIBRARIES "${rmm_link_libs}")
   get_target_property(rmm_link_libs rmm::rmm INTERFACE_LINK_LIBRARIES)
-endif (PLUGIN_RMM)
+endif()
 
 #-- library
-if (BUILD_STATIC_LIB)
+if(BUILD_STATIC_LIB)
   add_library(xgboost STATIC)
-else (BUILD_STATIC_LIB)
+else()
   add_library(xgboost SHARED)
-endif (BUILD_STATIC_LIB)
+endif()
 target_link_libraries(xgboost PRIVATE objxgboost)
 target_include_directories(xgboost
   INTERFACE
@@ -322,7 +320,7 @@ target_include_directories(xgboost
 #-- End shared library
 
 #-- CLI for xgboost
-if (BUILD_DEPRECATED_CLI)
+if(BUILD_DEPRECATED_CLI)
   add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc)
   target_link_libraries(runxgboost PRIVATE objxgboost)
   target_include_directories(runxgboost
@@ -336,12 +334,12 @@ if (BUILD_DEPRECATED_CLI)
   xgboost_target_link_libraries(runxgboost)
   xgboost_target_defs(runxgboost)
 
-  if (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
+  if(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
     set_output_directory(runxgboost ${xgboost_BINARY_DIR})
-  else ()
+  else()
     set_output_directory(runxgboost ${xgboost_SOURCE_DIR})
-  endif (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
-endif (BUILD_DEPRECATED_CLI)
+  endif()
+endif()
 #-- End CLI for xgboost
 
 # Common setup for all targets
@@ -351,41 +349,41 @@ foreach(target xgboost objxgboost dmlc)
   xgboost_target_defs(${target})
 endforeach()
 
-if (JVM_BINDINGS)
+if(JVM_BINDINGS)
   xgboost_target_properties(xgboost4j)
   xgboost_target_link_libraries(xgboost4j)
   xgboost_target_defs(xgboost4j)
-endif (JVM_BINDINGS)
+endif()
 
-if (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
+if(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
   set_output_directory(xgboost ${xgboost_BINARY_DIR}/lib)
-else ()
+else()
   set_output_directory(xgboost ${xgboost_SOURCE_DIR}/lib)
-endif ()
+endif()
 
 # Ensure these two targets do not build simultaneously, as they produce outputs with conflicting names
-if (BUILD_DEPRECATED_CLI)
+if(BUILD_DEPRECATED_CLI)
   add_dependencies(xgboost runxgboost)
-endif (BUILD_DEPRECATED_CLI)
+endif()
 
 #-- Installing XGBoost
-if (R_LIB)
+if(R_LIB)
   include(cmake/RPackageInstallTargetSetup.cmake)
   set_target_properties(xgboost PROPERTIES PREFIX "")
-  if (APPLE)
+  if(APPLE)
     set_target_properties(xgboost PROPERTIES SUFFIX ".so")
-  endif (APPLE)
+  endif()
   setup_rpackage_install_target(xgboost "${CMAKE_CURRENT_BINARY_DIR}/R-package-install")
   set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/dummy_inst")
-endif (R_LIB)
+endif()
 
-if (MINGW)
+if(MINGW)
   set_target_properties(xgboost PROPERTIES PREFIX "")
-endif (MINGW)
+endif()
 
-if (BUILD_C_DOC)
+if(BUILD_C_DOC)
   include(cmake/Doc.cmake)
   run_doxygen()
-endif (BUILD_C_DOC)
+endif()
 
 include(CPack)
@@ -401,19 +399,19 @@ install(DIRECTORY ${xgboost_SOURCE_DIR}/include/xgboost
 # > in any export set.
 #
 # https://github.com/dmlc/xgboost/issues/6085
-if (BUILD_STATIC_LIB)
-  if (BUILD_DEPRECATED_CLI)
+if(BUILD_STATIC_LIB)
+  if(BUILD_DEPRECATED_CLI)
     set(INSTALL_TARGETS xgboost runxgboost objxgboost dmlc)
   else()
     set(INSTALL_TARGETS xgboost objxgboost dmlc)
-  endif (BUILD_DEPRECATED_CLI)
-else (BUILD_STATIC_LIB)
-  if (BUILD_DEPRECATED_CLI)
+  endif()
+else()
+  if(BUILD_DEPRECATED_CLI)
     set(INSTALL_TARGETS xgboost runxgboost)
-  else(BUILD_DEPRECATED_CLI)
+  else()
     set(INSTALL_TARGETS xgboost)
-  endif (BUILD_DEPRECATED_CLI)
-endif (BUILD_STATIC_LIB)
+  endif()
+endif()
 
 install(TARGETS ${INSTALL_TARGETS}
   EXPORT XGBoostTargets
@@ -442,7 +440,7 @@ install(
   DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/xgboost)
 
 #-- Test
-if (GOOGLE_TEST)
+if(GOOGLE_TEST)
   enable_testing()
   # Unittests.
   add_executable(testxgboost)
@@ -462,7 +460,7 @@ if (GOOGLE_TEST)
     ${xgboost_SOURCE_DIR}/tests/cli/machine.conf.in
     ${xgboost_BINARY_DIR}/tests/cli/machine.conf
     @ONLY)
-  if (BUILD_DEPRECATED_CLI)
+  if(BUILD_DEPRECATED_CLI)
     add_test(
       NAME TestXGBoostCLI
       COMMAND runxgboost ${xgboost_BINARY_DIR}/tests/cli/machine.conf
@@ -470,8 +468,8 @@ if (GOOGLE_TEST)
     set_tests_properties(TestXGBoostCLI
       PROPERTIES
       PASS_REGULAR_EXPRESSION ".*test-rmse:0.087.*")
-  endif (BUILD_DEPRECATED_CLI)
-endif (GOOGLE_TEST)
+  endif()
+endif()
 
 # For MSVC: Call msvc_use_static_runtime() once again to completely
 # replace /MD with /MT. See https://github.com/dmlc/xgboost/issues/4462
@@ -479,10 +477,10 @@ endif (GOOGLE_TEST)
 msvc_use_static_runtime()
 
 # Add xgboost.pc
-if (ADD_PKGCONFIG)
+if(ADD_PKGCONFIG)
   configure_file(${xgboost_SOURCE_DIR}/cmake/xgboost.pc.in ${xgboost_BINARY_DIR}/xgboost.pc @ONLY)
   install(
     FILES ${xgboost_BINARY_DIR}/xgboost.pc
     DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
-endif (ADD_PKGCONFIG)
+endif()

NEWS.md

@@ -3,6 +3,207 @@ XGBoost Change Log

This file records the changes in xgboost library in reverse chronological order.
## 2.0.0 (2023 Aug 16)
We are excited to announce the release of XGBoost 2.0. This note will begin by covering some overall changes and then highlight specific updates to the package.
### Initial work on multi-target trees with vector-leaf outputs
We have been working on vector-leaf tree models for multi-target regression, multi-label classification, and multi-class classification in version 2.0. Previously, XGBoost would build a separate model for each target. However, with this new feature that's still being developed, XGBoost can build one tree for all targets. The feature has multiple benefits and trade-offs compared to the existing approach. It can help prevent overfitting, produce smaller models, and build trees that consider the correlation between targets. In addition, users can combine vector leaf and scalar leaf trees during a training session using a callback. Please note that the feature is still a work in progress, and many parts are not yet available. See #9043 for the current status. Related PRs: (#8538, #8697, #8902, #8884, #8895, #8898, #8612, #8652, #8698, #8908, #8928, #8968, #8616, #8922, #8890, #8872, #8889, #9509) Please note that only the `hist` (default) tree method on CPU can be used for building vector leaf trees at the moment.
### New `device` parameter
A new `device` parameter is set to replace the existing `gpu_id`, `gpu_hist`, `gpu_predictor`, `cpu_predictor`, `gpu_coord_descent`, and the PySpark-specific parameter `use_gpu`. From now on, users need only the `device` parameter to select which device to run on, along with the ordinal of the device. For more information, please see our documentation page (https://xgboost.readthedocs.io/en/stable/parameter.html#general-parameters). For example, with `device="cuda", tree_method="hist"`, XGBoost will run the `hist` tree method on GPU. (#9363, #8528, #8604, #9354, #9274, #9243, #8896, #9129, #9362, #9402, #9385, #9398, #9390, #9386, #9412, #9507, #9536). The old behavior of ``gpu_hist`` is preserved but deprecated. In addition, the `predictor` parameter is removed.
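As an illustration of the `device` parameter described above (this snippet is not part of the release notes; the data is synthetic and a CUDA-enabled build is assumed):

```python
import numpy as np
import xgboost as xgb

X = np.random.rand(1000, 10)
y = np.random.randint(0, 2, size=1000)
dtrain = xgb.DMatrix(X, label=y)

# Previously: {"tree_method": "gpu_hist", "gpu_id": 0}
# Now the device (and optionally its ordinal) is selected separately.
booster = xgb.train(
    {"device": "cuda:0", "tree_method": "hist", "objective": "binary:logistic"},
    dtrain,
    num_boost_round=10,
)
```

Swapping `"cuda:0"` for `"cpu"` runs the same configuration on the CPU.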
### `hist` is now the default tree method
Starting from 2.0, the `hist` tree method is the default. In previous versions, XGBoost chose `approx` or `exact` depending on the input data and training environment. The new default can help XGBoost train models more efficiently and consistently. (#9320, #9353)
### GPU-based approx tree method
There's initial support for using the `approx` tree method on GPU. The performance of the `approx` method is not yet well optimized, but it is feature complete except for the JVM packages. It can be accessed through the parameter combination `device="cuda", tree_method="approx"`. (#9414, #9399, #9478). Please note that the Scala-based Spark interface is not yet supported.
### Optimize and bound the size of the histogram on CPU, to control memory footprint
XGBoost has a new parameter `max_cached_hist_node` for users to limit the CPU cache size for histograms. It can help prevent XGBoost from caching histograms too aggressively. Without the cache, performance is likely to decrease. However, the size of the cache grows exponentially with the depth of the tree. The limit can be crucial when growing deep trees. In most cases, users need not configure this parameter as it does not affect the model's accuracy. (#9455, #9441, #9440, #9427, #9400).
Along with the cache limit, XGBoost also reduces the memory usage of the `hist` and `approx` tree method on distributed systems by cutting the size of the cache by half. (#9433)
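A minimal sketch of how the new limit can be set (not from the release notes; the data is synthetic and the value shown is illustrative only):

```python
import numpy as np
import xgboost as xgb

X = np.random.rand(1000, 20)
y = np.random.rand(1000)

booster = xgb.train(
    {
        "tree_method": "hist",
        "max_depth": 12,
        # Bound the histogram cache when growing deep trees.
        "max_cached_hist_node": 65536,
    },
    xgb.DMatrix(X, label=y),
    num_boost_round=10,
)
```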
### Improved external memory support
There is some exciting development around external memory support in XGBoost. It's still an experimental feature, but the performance has been significantly improved with the default `hist` tree method. We replaced the old file IO logic with memory map. In addition to performance, we have reduced CPU memory usage and added extensive documentation. Beginning from 2.0.0, we encourage users to try it with the `hist` tree method when the memory saving by `QuantileDMatrix` is not sufficient. (#9361, #9317, #9282, #9315, #8457)
### Learning to rank
We created a brand-new implementation for the learning-to-rank task. With the latest version, XGBoost gained a set of new features for the ranking task, including:
- A new parameter `lambdarank_pair_method` for choosing the pair construction strategy.
- A new parameter `lambdarank_num_pair_per_sample` for controlling the number of samples for each group.
- An experimental implementation of unbiased learning-to-rank, which can be accessed using the `lambdarank_unbiased` parameter.
- Support for custom gain function with `NDCG` using the `ndcg_exp_gain` parameter.
- Deterministic GPU computation for all objectives and metrics.
- `NDCG` is now the default objective function.
- Improved performance of metrics using caches.
- Support scikit-learn utilities for `XGBRanker`.
- Extensive documentation on how learning-to-rank works with XGBoost.
For more information, please see the [tutorial](https://xgboost.readthedocs.io/en/latest/tutorials/learning_to_rank.html). Related PRs: (#8771, #8692, #8783, #8789, #8790, #8859, #8887, #8893, #8906, #8931, #9075, #9015, #9381, #9336, #8822, #9222, #8984, #8785, #8786, #8768)
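A short sketch of the new ranking parameters listed above, on synthetic query groups (the parameter values are illustrative, not recommendations):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 8))
y = rng.integers(0, 5, size=200)      # graded relevance labels
qid = np.repeat(np.arange(20), 10)    # 20 query groups, 10 documents each

ranker = xgb.XGBRanker(
    objective="rank:ndcg",             # NDCG is now the default objective
    lambdarank_pair_method="topk",     # pair construction strategy
    lambdarank_num_pair_per_sample=8,  # pairs sampled per document
    lambdarank_unbiased=False,         # experimental unbiased LTR switch
    n_estimators=10,
)
ranker.fit(X, y, qid=qid)
scores = ranker.predict(X)
```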
### Automatically estimated intercept
In the previous version, `base_score` was a constant that could be set as a training parameter. In the new version, XGBoost can automatically estimate this parameter based on input labels for optimal accuracy. (#8539, #8498, #8272, #8793, #8607)
### Quantile regression
The XGBoost algorithm now supports quantile regression, which involves minimizing the quantile loss (also called "pinball loss"). Furthermore, XGBoost allows for training with multiple target quantiles simultaneously with one tree per quantile. (#8775, #8761, #8760, #8758, #8750)
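For instance, training several quantiles at once with the new objective looks roughly like this (synthetic data; not an excerpt from the release notes):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(512, 4))
y = X @ rng.normal(size=4) + rng.normal(size=512)

booster = xgb.train(
    {
        "objective": "reg:quantileerror",     # pinball (quantile) loss
        "quantile_alpha": [0.05, 0.5, 0.95],  # one tree per target quantile
        "tree_method": "hist",
    },
    xgb.DMatrix(X, label=y),
    num_boost_round=32,
)
# One prediction column per requested quantile.
pred = booster.predict(xgb.DMatrix(X))
```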
### L1 and Quantile regression now support learning rate
Both objectives use adaptive trees due to the lack of proper Hessian values. In the new version, XGBoost can scale the leaf value with the learning rate accordingly. (#8866)
### Export cut value
Using the Python or the C package, users can export the quantile values (not to be confused with quantile regression) used for the `hist` tree method. (#9356)
### Column-based split and federated learning
We made progress on column-based split for federated learning. In 2.0, the `approx`, `hist`, and `hist` with vector leaf tree methods can all work with column-based data split, along with support for vertical federated learning. Work on GPU support is still ongoing; stay tuned. (#8576, #8468, #8442, #8847, #8811, #8985, #8623, #8568, #8828, #8932, #9081, #9102, #9103, #9124, #9120, #9367, #9370, #9343, #9171, #9346, #9270, #9244, #8494, #8434, #8742, #8804, #8710, #8676, #9020, #9002, #9058, #9037, #9018, #9295, #9006, #9300, #8765, #9365, #9060)
### PySpark
After the initial introduction of the PySpark interface, it has gained some new features and optimizations in 2.0.
- GPU-based prediction. (#9292, #9542)
- Optimization for data initialization by avoiding the stack operation. (#9088)
- Support predict feature contribution. (#8633)
- Python typing support. (#9156, #9172, #9079, #8375)
- `use_gpu` is deprecated. The `device` parameter is preferred (see the sketch after this list).
- Update eval_metric validation to support list of strings (#8826)
- Improved logs for training (#9449)
- Maintenance, including refactoring and document updates (#8324, #8465, #8605, #9202, #9460, #9302, #8385, #8630, #8525, #8496)
- Fix for GPU setup. (#9495)
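As a sketch of the `use_gpu` deprecation noted in the list above (the estimator comes from the `xgboost.spark` module; an existing Spark session and a prepared DataFrame `train_df` are assumed and not shown):

```python
from xgboost.spark import SparkXGBClassifier

# Previously: SparkXGBClassifier(use_gpu=True)
clf = SparkXGBClassifier(
    device="cuda",        # preferred over the deprecated use_gpu flag
    num_workers=4,
    label_col="label",
    features_col="features",
)
# model = clf.fit(train_df)  # train_df: a Spark DataFrame prepared elsewhere
```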
### Other General New Features
Here's a list of new features that don't have their own section and yet are general to all language bindings.
- Use array interface for CSC matrix. This helps XGBoost to use a consistent number of threads and align the interface of the CSC matrix with other interfaces. In addition, memory usage is likely to decrease with CSC input thanks to on-the-fly type conversion. (#8672)
- CUDA compute 90 is now part of the default build. (#9397)
### Other General Optimization
These optimizations are general to all language bindings. For language-specific optimization, please visit the corresponding sections.
- Performance for input with `array_interface` on CPU (like `numpy`) is significantly improved. (#9090)
- Some optimization with CUDA for data initialization. (#9199, #9209, #9144)
- Use the latest thrust policy to prevent synchronizing GPU devices. (#9212)
- XGBoost now uses a per-thread CUDA stream, which prevents synchronization with other streams. (#9416, #9396, #9413)
### Notable breaking change
Other than the aforementioned change with the `device` parameter, here's a list of breaking changes affecting all packages.
- Users must specify the format for text input (#9077). However, we suggest using third-party data structures such as `numpy.ndarray` instead of relying on text inputs. See https://github.com/dmlc/xgboost/issues/9472 for more info.
### Notable bug fixes
Some noteworthy bug fixes that are not related to specific language bindings are listed in this section.
- Some language environments use a different thread to perform garbage collection, which breaks the thread-local cache used in XGBoost. XGBoost 2.0 implements a new thread-safe cache using a lightweight lock to replace the thread-local cache. (#8851)
- Fix model IO by clearing the prediction cache. (#8904)
- `inf` is checked during data construction. (#8911)
- Preserve order of saved updaters configuration. Usually, this is not an issue unless the `updater` parameter is used instead of the `tree_method` parameter (#9355)
- Fix GPU memory allocation issue with categorical splits. (#9529)
- Handle escape sequence like `\t\n` in feature names for JSON model dump. (#9474)
- Normalize file path for model IO and text input. This handles short paths on Windows and paths that contain `~` on Unix (#9463). In addition, all path inputs are required to be encoded in UTF-8 (#9448, #9443)
- Fix integer overflow on H100. (#9380)
- Fix weighted sketching on GPU with categorical features. (#9341)
- Fix metric serialization. The bug might cause some of the metrics to be dropped during evaluation. (#9405)
- Fixes compilation errors on MSVC x86 targets (#8823)
- Pick up the dmlc-core fix for the CSV parser. (#8897)
### Documentation
Aside from documents for new features, we have many smaller updates to improve user experience, from troubleshooting guides to typo fixes.
- Explain CPU/GPU interop. (#8450)
- Guide to troubleshoot NCCL errors. (#8943, #9206)
- Add a note for rabit port selection. (#8879)
- How to build the docs using conda (#9276)
- Explain how to obtain reproducible results on distributed systems. (#8903)
* Fixes and small updates to document and demonstration scripts. (#8626, #8436, #8995, #8907, #8923, #8926, #9358, #9232, #9201, #9469, #9462, #9458, #8543, #8597, #8401, #8784, #9213, #9098, #9008, #9223, #9333, #9434, #9435, #9415, #8773, #8752, #9291, #9549)
### Python package
* New Features and Improvements
- Support primitive types of pyarrow-backed pandas dataframe. (#8653)
- Warning messages emitted by XGBoost are now emitted using Python warnings. (#9387)
- User can now format the value printed near the bars on the `plot_importance` plot (#8540); see the sketch after this list.
- XGBoost has improved half-type support (float16) with pandas, cupy, and cuDF. With GPU input, the handling is through CUDA `__half` type, and no data copy is made. (#8487, #9207, #8481)
- Support `Series` and Python primitive types in `inplace_predict` and `QuantileDMatrix` (#8547, #8542)
- Support all pandas' nullable integer types. (#8480)
- Custom metric with the scikit-learn interface now supports `sample_weight`. (#8706)
- Enable Installation of Python Package with System lib in a Virtual Environment (#9349)
- Raise if expected workers are not alive in `xgboost.dask.train` (#9421)
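For the `plot_importance` formatting item above, a small sketch (the `values_format` argument name follows the current Python API; matplotlib is required and the data is synthetic):

```python
import numpy as np
import xgboost as xgb

X = np.random.rand(500, 5)
y = np.random.randint(0, 2, size=500)
bst = xgb.train({"objective": "binary:logistic"}, xgb.DMatrix(X, label=y), num_boost_round=10)

# Print one decimal place next to each bar instead of the raw value.
ax = xgb.plot_importance(bst, values_format="{v:.1f}")
ax.figure.savefig("importance.png")
```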
* Optimization
- Cache transformed data in `QuantileDMatrix` for efficiency. (#8666, #9445)
- Take datatable as row-major input. (#8472)
- Remove unnecessary conversions between data structures (#8546)
* Adopt modern Python packaging conventions (PEP 517, PEP 518, PEP 621)
- XGBoost adopted the modern Python packaging conventions. The old setup script `setup.py` is now replaced with the new configuration file `pyproject.toml`. Along with this, XGBoost now supports Python 3.11. (#9021, #9112, #9114, #9115) Consult the latest documentation for the updated instructions to build and install XGBoost.
* Fixes
- `DataIter` now accepts only keyword arguments. (#9431)
- Fix empty DMatrix with categorical features. (#8739)
- Convert ``DaskXGBClassifier.classes_`` to an array (#8452)
- Define `best_iteration` only if early stopping is used to be consistent with documented behavior. (#9403)
- Make feature validation immutable. (#9388)
* Breaking changes
- Discussed in the new `device` parameter section, the `predictor` parameter is now removed. (#9129)
- Remove support for single-string feature info. Feature type and names should be a sequence of strings (#9401)
- Remove parameters in the `save_model` call for the scikit-learn interface. (#8963)
- Remove the `ntree_limit` in the python package. This has been deprecated in previous versions. (#8345)
* Maintenance including formatting and refactoring along with type hints.
- More consistent use of `black` and `isort` for code formatting (#8420, #8748, #8867)
- Improved type support. Most of the type changes happen in the PySpark module; here, we list the remaining changes. (#8444, #8617, #9197, #9005)
- Set `enable_categorical` to True in predict. (#8592)
- Some refactoring and updates for tests (#8395, #8372, #8557, #8379, #8702, #9459, #9316, #8446, #8695, #8409, #8993, #9480)
* Documentation
- Add introduction and notes for the sklearn interface. (#8948)
- Demo for using dask for hyper-parameter optimization. (#8891)
- Document all supported Python input types. (#8643)
- Other documentation updates (#8944, #9304)
### R package
- Use the new data consumption interface for CSR and CSC. This provides better control for the number of threads and improves performance. (#8455, #8673)
- Accept multiple evaluation metrics during training. (#8657)
- Fix integer inputs with `NA`. (#9522)
- Some refactoring for the R package (#8545, #8430, #8614, #8624, #8613, #9457, #8689, #8563, #9461, #8647, #8564, #8565, #8736, #8610, #8609, #8599, #8704, #9456, #9450, #9476, #9477, #9481). Special thanks to @jameslamb.
- Document updates (#8886, #9323, #9437, #8998)
### JVM packages
Following are changes specific to various JVM-based packages.
- Stop using Rabit in prediction (#9054)
- Set feature_names and feature_types in jvm-packages. This is to prepare support for categorical features (#9364)
- Scala 2.13 support. (#9099)
- Change training stage from `ResultStage` to `ShuffleMapStage` (#9423)
- Automatically set the max/min direction for the best score during early stopping. (#9404)
* Revised support for `flink` (#9046)
* Breaking changes
- Scala-based tracker is removed. (#9078, #9045)
- Change `DeviceQuantileDmatrix` into `QuantileDMatrix` (#8461)
* Maintenance (#9253, #9166, #9395, #9389, #9224, #9233, #9351, #9479)
* CI bot PRs
We employed GitHub's Dependabot to help us keep the dependencies up to date for JVM packages. With the bot's help, we have cleared up all the dependencies that were lagging behind (#8501, #8507).
Here's a list of dependency update PRs including those made by dependent bots (#8456, #8560, #8571, #8561, #8562, #8600, #8594, #8524, #8509, #8548, #8549, #8533, #8521, #8534, #8532, #8516, #8503, #8531, #8530, #8518, #8512, #8515, #8517, #8506, #8504, #8502, #8629, #8815, #8813, #8814, #8877, #8876, #8875, #8874, #8873, #9049, #9070, #9073, #9039, #9083, #8917, #8952, #8980, #8973, #8962, #9252, #9208, #9131, #9136, #9219, #9160, #9158, #9163, #9184, #9192, #9265, #9268, #8882, #8837, #8662, #8661, #8390, #9056, #8508, #8925, #8920, #9149, #9230, #9097, #8648, #9203, #8593).
### Maintenance
Maintenance work includes refactoring, fixing small issues that don't affect end users. (#9256, #8627, #8756, #8735, #8966, #8864, #8747, #8892, #9057, #8921, #8949, #8941, #8942, #9108, #9125, #9155, #9153, #9176, #9447, #9444, #9436, #9438, #9430, #9200, #9210, #9055, #9014, #9004, #8999, #9154, #9148, #9283, #9246, #8888, #8900, #8871, #8861, #8858, #8791, #8807, #8751, #8703, #8696, #8693, #8677, #8686, #8665, #8660, #8386, #8371, #8410, #8578, #8574, #8483, #8443, #8454, #8733)
### CI
- Build pip wheel with RMM support (#9383)
- Other CI updates including updating dependencies and work on the CI infrastructure. (#9464, #9428, #8767, #9394, #9278, #9214, #9234, #9205, #9034, #9104, #8878, #9294, #8625, #8806, #8741, #8707, #8381, #8382, #8388, #8402, #8397, #8445, #8602, #8628, #8583, #8460, #9544)
## 1.7.6 (2023 Jun 16)
This is a patch release for bug fixes. The CRAN package for the R binding is kept at 1.7.5.


@@ -4,3 +4,5 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 README.md
+^doc$
+^Meta$


@@ -70,7 +70,7 @@ cb.print.evaluation <- function(period = 1, showsd = TRUE) {
         i == env$begin_iteration ||
         i == env$end_iteration) {
       stdev <- if (showsd) env$bst_evaluation_err else NULL
-      msg <- format.eval.string(i, env$bst_evaluation, stdev)
+      msg <- .format_eval_string(i, env$bst_evaluation, stdev)
       cat(msg, '\n')
     }
   }
@@ -380,7 +380,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
     if ((maximize && score > best_score) ||
         (!maximize && score < best_score)) {
-      best_msg <<- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err)
+      best_msg <<- .format_eval_string(
+        i, env$bst_evaluation, env$bst_evaluation_err
+      )
       best_score <<- score
       best_iteration <<- i
       best_ntreelimit <<- best_iteration * env$num_parallel_tree
@@ -555,14 +557,18 @@ cb.cv.predict <- function(save_models = FALSE) {
 #'
 #' @examples
 #' #### Binary classification:
-#' #
+#'
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
+#'
 #' # In the iris dataset, it is hard to linearly separate Versicolor class from the rest
 #' # without considering the 2nd order interactions:
 #' x <- model.matrix(Species ~ .^2, iris)[,-1]
 #' colnames(x)
-#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = 2)
+#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread)
 #' param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc",
-#'               lambda = 0.0003, alpha = 0.0003, nthread = 2)
+#'               lambda = 0.0003, alpha = 0.0003, nthread = nthread)
 #' # For 'shotgun', which is a default linear updater, using high eta values may result in
 #' # unstable behaviour in some datasets. With this simple dataset, however, the high learning
 #' # rate does not break the convergence, but allows us to illustrate the typical pattern of
@@ -592,9 +598,9 @@ cb.cv.predict <- function(save_models = FALSE) {
 #'
 #' #### Multiclass classification:
 #' #
-#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 1)
+#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread)
 #' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
-#'               lambda = 0.0003, alpha = 0.0003, nthread = 1)
+#'               lambda = 0.0003, alpha = 0.0003, nthread = nthread)
 #' # For the default linear updater 'shotgun' it sometimes is helpful
 #' # to use smaller eta to reduce instability
 #' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5,
@@ -754,7 +760,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
 #
 # Format the evaluation metric string
-format.eval.string <- function(iter, eval_res, eval_err = NULL) {
+.format_eval_string <- function(iter, eval_res, eval_err = NULL) {
   if (length(eval_res) == 0)
     stop('no evaluation results')
   enames <- names(eval_res)


@@ -21,13 +21,13 @@ xgb.Booster.handle <- function(params, cachelist, modelfile, handle) {
       ## A memory buffer
       bst <- xgb.unserialize(modelfile, handle)
       xgb.parameters(bst) <- params
-      return (bst)
+      return(bst)
     } else if (inherits(modelfile, "xgb.Booster")) {
       ## A booster object
       bst <- xgb.Booster.complete(modelfile, saveraw = TRUE)
       bst <- xgb.unserialize(bst$raw)
       xgb.parameters(bst) <- params
-      return (bst)
+      return(bst)
     } else {
       stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object")
     }
@@ -267,11 +267,16 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #'
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
+#'
+#' ## Keep the number of threads to 2 for examples
+#' nthread <- 2
+#' data.table::setDTthreads(nthread)
+#'
 #' train <- agaricus.train
 #' test <- agaricus.test
 #'
 #' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-#'                eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")
+#'                eta = 0.5, nthread = nthread, nrounds = 5, objective = "binary:logistic")
 #' # use all trees by default
 #' pred <- predict(bst, test$data)
 #' # use only the 1st tree
@@ -337,8 +342,14 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
                                 reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
   object <- xgb.Booster.complete(object, saveraw = FALSE)
-  if (!inherits(newdata, "xgb.DMatrix"))
-    newdata <- xgb.DMatrix(newdata, missing = missing, nthread = NVL(object$params[["nthread"]], -1))
+  if (!inherits(newdata, "xgb.DMatrix")) {
+    config <- jsonlite::fromJSON(xgb.config(object))
+    nthread <- strtoi(config$learner$generic_param$nthread)
+    newdata <- xgb.DMatrix(
+      newdata,
+      missing = missing, nthread = NVL(nthread, -1)
+    )
+  }
   if (!is.null(object[["feature_names"]]) &&
       !is.null(colnames(newdata)) &&
       !identical(object[["feature_names"]], colnames(newdata)))
@@ -371,7 +382,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
       cval[0] <- val
       return(cval)
     }
-    return (val)
+    return(val)
   }
 
   ## We set strict_shape to TRUE then drop the dimensions conditionally
@@ -628,10 +639,15 @@ xgb.attributes <- function(object) {
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
 #' train <- agaricus.train
 #'
-#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+#' bst <- xgboost(
+#'   data = train$data, label = train$label, max_depth = 2,
+#'   eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic"
+#' )
 #' config <- xgb.config(bst)
 #'
 #' @rdname xgb.config


@@ -18,7 +18,12 @@
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
-#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
+#' dtrain <- with(
+#'   agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
+#' )
 #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 #' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 #' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
@@ -112,7 +117,7 @@ xgb.get.DMatrix <- function(data, label, missing, weight, nthread) {
       stop("xgboost: invalid input data")
     }
   }
-  return (dtrain)
+  return(dtrain)
 }


@@ -22,14 +22,23 @@
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
+#'
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
+#'
 #' train <- agaricus.train
 #' test <- agaricus.test
-#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-#'                eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+#' bst <- xgboost(
+#'   data = train$data, label = train$label, max_depth = 2, eta = 1,
+#'   nthread = nthread,
+#'   nrounds = 2,
+#'   objective = "binary:logistic"
+#' )
+#'
 #' xgb.save(bst, 'xgb.model')
 #' bst <- xgb.load('xgb.model')
 #' if (file.exists('xgb.model')) file.remove('xgb.model')
+#' pred <- predict(bst, test$data)
 #' @export
 xgb.load <- function(modelfile) {
   if (is.null(modelfile))


@@ -18,6 +18,6 @@ xgb.load.raw <- function(buffer, as_booster = FALSE) {
     booster <- xgb.Booster.complete(booster, saveraw = TRUE)
     return(booster)
   } else {
-    return (handle)
+    return(handle)
   }
 }


@@ -46,9 +46,12 @@
 #' # Basic use:
 #'
 #' data(agaricus.train, package='xgboost')
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
 #'
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
-#'                eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+#'                eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")
 #'
 #' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
 #'


@@ -45,10 +45,13 @@
 #' @examples
 #'
 #' data(agaricus.train, package='xgboost')
+#' ## Keep the number of threads to 2 for examples
+#' nthread <- 2
+#' data.table::setDTthreads(nthread)
 #'
-#' # Change max_depth to a higher number to get a more significant result
+#' ## Change max_depth to a higher number to get a more significant result
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6,
-#'                eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic",
+#'                eta = 0.1, nthread = nthread, nrounds = 50, objective = "binary:logistic",
 #'                subsample = 0.5, min_child_weight = 2)
 #'
 #' xgb.plot.deepness(bst)


@@ -45,9 +45,14 @@
 #'
 #' @examples
 #' data(agaricus.train)
+#' ## Keep the number of threads to 2 for examples
+#' nthread <- 2
+#' data.table::setDTthreads(nthread)
 #'
-#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
-#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+#' bst <- xgboost(
+#'   data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
+#'   eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic"
+#' )
 #'
 #' importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
 #'


@@ -43,10 +43,15 @@
 #' @examples
 #'
 #' data(agaricus.train, package='xgboost')
+#' ## Keep the number of threads to 2 for examples
+#' nthread <- 2
+#' data.table::setDTthreads(nthread)
 #'
-#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
-#'                eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
-#'                min_child_weight = 50, verbose = 0)
+#' bst <- xgboost(
+#'   data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
+#'   eta = 1, nthread = nthread, nrounds = 30, objective = "binary:logistic",
+#'   min_child_weight = 50, verbose = 0
+#' )
 #'
 #' p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
 #' print(p)


@@ -74,9 +74,9 @@
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #'
-#' bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
+#' nrounds <- 20
+#'
+#' bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = nrounds,
 #'                eta = 0.1, max_depth = 3, subsample = .5,
-#'                method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0)
+#'                method = "hist", objective = "binary:logistic", nthread = nthread, verbose = 0)
 #'
 #' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
 #' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
@@ -85,12 +90,11 @@
 #'
 #' # multiclass example - plots for each class separately:
 #' nclass <- 3
-#' nrounds <- 20
 #' x <- as.matrix(iris[, -5])
 #' set.seed(123)
 #' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
 #' mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
-#'                 max_depth = 2, eta = 0.3, subsample = .5, nthread = 2,
+#'                 max_depth = 2, eta = 0.3, subsample = .5, nthread = nthread,
 #'                 objective = "multi:softprob", num_class = nclass, verbose = 0)
 #' trees0 <- seq(from=0, by=nclass, length.out=nrounds)
 #' col <- rgb(0, 0, 1, 0.5)


@@ -25,14 +25,22 @@
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
+#'
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
+#'
 #' train <- agaricus.train
 #' test <- agaricus.test
-#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-#'                eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+#' bst <- xgboost(
+#'   data = train$data, label = train$label, max_depth = 2, eta = 1,
+#'   nthread = nthread,
+#'   nrounds = 2,
+#'   objective = "binary:logistic"
+#' )
 #' xgb.save(bst, 'xgb.model')
 #' bst <- xgb.load('xgb.model')
 #' if (file.exists('xgb.model')) file.remove('xgb.model')
+#' pred <- predict(bst, test$data)
 #' @export
 xgb.save <- function(model, fname) {
   if (typeof(fname) != "character")


@@ -16,13 +16,18 @@
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
+#'
+#' ## Keep the number of threads to 2 for examples
+#' nthread <- 2
+#' data.table::setDTthreads(nthread)
+#'
 #' train <- agaricus.train
 #' test <- agaricus.test
 #' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-#'                eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+#'                eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")
+#'
 #' raw <- xgb.save.raw(bst)
 #' bst <- xgb.load.raw(raw)
+#' pred <- predict(bst, test$data)
 #'
 #' @export
 xgb.save.raw <- function(model, raw_format = "deprecated") {

View File

@ -168,7 +168,8 @@
#' than the \code{xgboost} interface. #' than the \code{xgboost} interface.
#' #'
#' Parallelization is automatically enabled if \code{OpenMP} is present. #' Parallelization is automatically enabled if \code{OpenMP} is present.
#' Number of threads can also be manually specified via \code{nthread} parameter. #' Number of threads can also be manually specified via the \code{nthread}
#' parameter.
#' #'
#' The evaluation metric is chosen automatically by XGBoost (according to the objective) #' The evaluation metric is chosen automatically by XGBoost (according to the objective)
#' when the \code{eval_metric} parameter is not provided. #' when the \code{eval_metric} parameter is not provided.
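The documentation text above is easiest to read with a concrete run in mind. A minimal sketch, not part of this diff, assuming the standard xgboost() R interface: when eval_metric is omitted, the training log falls back to the objective's default metric (logloss for binary:logistic on recent releases), while nthread caps the OpenMP worker count.
## Minimal sketch of the documented defaults (assumed behaviour, not from this commit)
library(xgboost)
data(agaricus.train, package = "xgboost")
nthread <- 1                          # manual thread count; otherwise OpenMP decides
bst <- xgboost(
  data = agaricus.train$data, label = agaricus.train$label,
  max_depth = 2, eta = 1, nrounds = 2,
  nthread = nthread,
  objective = "binary:logistic"       # no eval_metric given on purpose
)
## The log reports train-logloss, i.e. the default metric derived from the objective.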
@ -237,17 +238,25 @@
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost') #' data(agaricus.test, package='xgboost')
#' #'
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) #' ## Keep the number of threads to 1 for examples
#' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2)) #' nthread <- 1
#' data.table::setDTthreads(nthread)
#'
#' dtrain <- with(
#' agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
#' )
#' dtest <- with(
#' agaricus.test, xgb.DMatrix(data, label = label, nthread = nthread)
#' )
#' watchlist <- list(train = dtrain, eval = dtest) #' watchlist <- list(train = dtrain, eval = dtest)
#' #'
#' ## A simple xgb.train example: #' ## A simple xgb.train example:
#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2, #' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
#' objective = "binary:logistic", eval_metric = "auc") #' objective = "binary:logistic", eval_metric = "auc")
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist) #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#' #'
#' #' ## An xgb.train example where custom objective and evaluation metric are
#' ## An xgb.train example where custom objective and evaluation metric are used: #' ## used:
#' logregobj <- function(preds, dtrain) { #' logregobj <- function(preds, dtrain) {
#' labels <- getinfo(dtrain, "label") #' labels <- getinfo(dtrain, "label")
#' preds <- 1/(1 + exp(-preds)) #' preds <- 1/(1 + exp(-preds))
@ -263,12 +272,12 @@
#' #'
#' # These functions could be used by passing them either: #' # These functions could be used by passing them either:
#' # as 'objective' and 'eval_metric' parameters in the params list: #' # as 'objective' and 'eval_metric' parameters in the params list:
#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2, #' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
#' objective = logregobj, eval_metric = evalerror) #' objective = logregobj, eval_metric = evalerror)
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist) #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#' #'
#' # or through the ... arguments: #' # or through the ... arguments:
#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2) #' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread)
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
#' objective = logregobj, eval_metric = evalerror) #' objective = logregobj, eval_metric = evalerror)
#' #'
@ -278,7 +287,7 @@
#' #'
#' #'
#' ## An xgb.train example of using variable learning rates at each iteration: #' ## An xgb.train example of using variable learning rates at each iteration:
#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2, #' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
#' objective = "binary:logistic", eval_metric = "auc") #' objective = "binary:logistic", eval_metric = "auc")
#' my_etas <- list(eta = c(0.5, 0.1)) #' my_etas <- list(eta = c(0.5, 0.1))
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
@ -290,7 +299,7 @@
#' #'
#' ## An 'xgboost' interface example: #' ## An 'xgboost' interface example:
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
#' max_depth = 2, eta = 1, nthread = 2, nrounds = 2, #' max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
#' objective = "binary:logistic") #' objective = "binary:logistic")
#' pred <- predict(bst, agaricus.test$data) #' pred <- predict(bst, agaricus.test$data)
#' #'

View File

@ -37,5 +37,5 @@ xgb.unserialize <- function(buffer, handle = NULL) {
} }
}) })
class(handle) <- "xgb.Booster.handle" class(handle) <- "xgb.Booster.handle"
return (handle) return(handle)
} }

View File

@ -24,7 +24,7 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
early_stopping_rounds = early_stopping_rounds, maximize = maximize, early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, save_period = save_period, save_name = save_name,
xgb_model = xgb_model, callbacks = callbacks, ...) xgb_model = xgb_model, callbacks = callbacks, ...)
return (bst) return(bst)
} }
#' Training part from Mushroom Data Set #' Training part from Mushroom Data Set

View File

@ -25,7 +25,7 @@ xgb.cv(param, dtrain, nrounds, nfold = 5,
# you can also do cross validation with customized loss function # you can also do cross validation with customized loss function
# See custom_objective.R # See custom_objective.R
## ##
print ('running cross validation, with customized loss function') print('running cross validation, with customized loss function')
logregobj <- function(preds, dtrain) { logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label") labels <- getinfo(dtrain, "label")

View File

@ -35,7 +35,7 @@ evalerror <- function(preds, dtrain) {
param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0, param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0,
objective = logregobj, eval_metric = evalerror) objective = logregobj, eval_metric = evalerror)
print ('start training with user customized objective') print('start training with user customized objective')
# training with customized objective, we can also do step by step training # training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train # simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, num_round, watchlist) bst <- xgb.train(param, dtrain, num_round, watchlist)
@ -59,7 +59,7 @@ logregobjattr <- function(preds, dtrain) {
} }
param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0, param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0,
objective = logregobjattr, eval_metric = evalerror) objective = logregobjattr, eval_metric = evalerror)
print ('start training with user customized objective, with additional attributes in DMatrix') print('start training with user customized objective, with additional attributes in DMatrix')
# training with customized objective, we can also do step by step training # training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train # simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, num_round, watchlist) bst <- xgb.train(param, dtrain, num_round, watchlist)

View File

@ -30,7 +30,7 @@ evalerror <- function(preds, dtrain) {
err <- as.numeric(sum(labels != (preds > 0))) / length(labels) err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
return(list(metric = "error", value = err)) return(list(metric = "error", value = err))
} }
print ('start training with early Stopping setting') print('start training with early Stopping setting')
bst <- xgb.train(param, dtrain, num_round, watchlist, bst <- xgb.train(param, dtrain, num_round, watchlist,
objective = logregobj, eval_metric = evalerror, maximize = FALSE, objective = logregobj, eval_metric = evalerror, maximize = FALSE,

View File

@ -35,14 +35,18 @@ Callback function expects the following values to be set in its calling frame:
} }
\examples{ \examples{
#### Binary classification: #### Binary classification:
#
## Keep the number of threads to 1 for examples
nthread <- 1
data.table::setDTthreads(nthread)
# In the iris dataset, it is hard to linearly separate the Versicolor class from the rest # In the iris dataset, it is hard to linearly separate the Versicolor class from the rest
# without considering the 2nd order interactions: # without considering the 2nd order interactions:
x <- model.matrix(Species ~ .^2, iris)[,-1] x <- model.matrix(Species ~ .^2, iris)[,-1]
colnames(x) colnames(x)
dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = 2) dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread)
param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc", param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc",
lambda = 0.0003, alpha = 0.0003, nthread = 2) lambda = 0.0003, alpha = 0.0003, nthread = nthread)
# For 'shotgun', which is the default linear updater, using high eta values may result in # For 'shotgun', which is the default linear updater, using high eta values may result in
# unstable behaviour in some datasets. With this simple dataset, however, the high learning # unstable behaviour in some datasets. With this simple dataset, however, the high learning
# rate does not break the convergence, but allows us to illustrate the typical pattern of # rate does not break the convergence, but allows us to illustrate the typical pattern of
@ -72,9 +76,9 @@ matplot(xgb.gblinear.history(bst)[[3]], type = 'l')
#### Multiclass classification: #### Multiclass classification:
# #
dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 1) dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread)
param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3, param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
lambda = 0.0003, alpha = 0.0003, nthread = 1) lambda = 0.0003, alpha = 0.0003, nthread = nthread)
# For the default linear updater 'shotgun' it sometimes is helpful # For the default linear updater 'shotgun' it sometimes is helpful
# to use smaller eta to reduce instability # to use smaller eta to reduce instability
bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5, bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5,

View File

@ -132,11 +132,16 @@ Note also that converting a matrix to \code{\link{xgb.DMatrix}} uses multiple th
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost') data(agaricus.test, package='xgboost')
## Keep the number of threads to 2 for examples
nthread <- 2
data.table::setDTthreads(nthread)
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic") eta = 0.5, nthread = nthread, nrounds = 5, objective = "binary:logistic")
# use all trees by default # use all trees by default
pred <- predict(bst, test$data) pred <- predict(bst, test$data)
# use only the 1st tree # use only the 1st tree

View File

@ -38,7 +38,12 @@ Supported input file formats are either a LIBSVM text file or a binary file that
} }
\examples{ \examples{
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) ## Keep the number of threads to 1 for examples
nthread <- 1
data.table::setDTthreads(nthread)
dtrain <- with(
agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
)
xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data') xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
dtrain <- xgb.DMatrix('xgb.DMatrix.data') dtrain <- xgb.DMatrix('xgb.DMatrix.data')
if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data') if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')

View File

@ -19,10 +19,15 @@ Accessors for model parameters as JSON string.
} }
\examples{ \examples{
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
## Keep the number of threads to 1 for examples
nthread <- 1
data.table::setDTthreads(nthread)
train <- agaricus.train train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, bst <- xgboost(
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") data = train$data, label = train$label, max_depth = 2,
eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic"
)
config <- xgb.config(bst) config <- xgb.config(bst)
} }

View File

@ -27,14 +27,23 @@ not \code{xgb.load}.
\examples{ \examples{
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost') data(agaricus.test, package='xgboost')
## Keep the number of threads to 1 for examples
nthread <- 1
data.table::setDTthreads(nthread)
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, bst <- xgboost(
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") data = train$data, label = train$label, max_depth = 2, eta = 1,
nthread = nthread,
nrounds = 2,
objective = "binary:logistic"
)
xgb.save(bst, 'xgb.model') xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model') bst <- xgb.load('xgb.model')
if (file.exists('xgb.model')) file.remove('xgb.model') if (file.exists('xgb.model')) file.remove('xgb.model')
pred <- predict(bst, test$data)
} }
\seealso{ \seealso{
\code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}. \code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}.

View File

@ -66,9 +66,12 @@ Parse a boosted tree model text dump into a \code{data.table} structure.
# Basic use: # Basic use:
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
## Keep the number of threads to 1 for examples
nthread <- 1
data.table::setDTthreads(nthread)
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")
(dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst)) (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))

View File

@ -61,10 +61,13 @@ This function was inspired by the blog post
\examples{ \examples{
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
## Keep the number of threads to 2 for examples
nthread <- 2
data.table::setDTthreads(nthread)
# Change max_depth to a higher number to get a more significant result ## Change max_depth to a higher number to get a more significant result
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6, bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6,
eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic", eta = 0.1, nthread = nthread, nrounds = 50, objective = "binary:logistic",
subsample = 0.5, min_child_weight = 2) subsample = 0.5, min_child_weight = 2)
xgb.plot.deepness(bst) xgb.plot.deepness(bst)

View File

@ -77,9 +77,14 @@ with bar colors corresponding to different clusters that have somewhat similar i
} }
\examples{ \examples{
data(agaricus.train) data(agaricus.train)
## Keep the number of threads to 2 for examples
nthread <- 2
data.table::setDTthreads(nthread)
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3, bst <- xgboost(
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic"
)
importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst) importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)

View File

@ -63,10 +63,15 @@ This function is inspired by this blog post:
\examples{ \examples{
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
## Keep the number of threads to 2 for examples
nthread <- 2
data.table::setDTthreads(nthread)
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15, bst <- xgboost(
eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic", data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
min_child_weight = 50, verbose = 0) eta = 1, nthread = nthread, nrounds = 30, objective = "binary:logistic",
min_child_weight = 50, verbose = 0
)
p <- xgb.plot.multi.trees(model = bst, features_keep = 3) p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
print(p) print(p)

View File

@ -124,9 +124,14 @@ a meaningful thing to do.
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost') data(agaricus.test, package='xgboost')
bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50, ## Keep the number of threads to 1 for examples
nthread <- 1
data.table::setDTthreads(nthread)
nrounds <- 20
bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = nrounds,
eta = 0.1, max_depth = 3, subsample = .5, eta = 0.1, max_depth = 3, subsample = .5,
method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0) method = "hist", objective = "binary:logistic", nthread = nthread, verbose = 0)
xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none") xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
contr <- predict(bst, agaricus.test$data, predcontrib = TRUE) contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
@ -135,12 +140,11 @@ xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12) # S
# multiclass example - plots for each class separately: # multiclass example - plots for each class separately:
nclass <- 3 nclass <- 3
nrounds <- 20
x <- as.matrix(iris[, -5]) x <- as.matrix(iris[, -5])
set.seed(123) set.seed(123)
is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds, mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
max_depth = 2, eta = 0.3, subsample = .5, nthread = 2, max_depth = 2, eta = 0.3, subsample = .5, nthread = nthread,
objective = "multi:softprob", num_class = nclass, verbose = 0) objective = "multi:softprob", num_class = nclass, verbose = 0)
trees0 <- seq(from=0, by=nclass, length.out=nrounds) trees0 <- seq(from=0, by=nclass, length.out=nrounds)
col <- rgb(0, 0, 1, 0.5) col <- rgb(0, 0, 1, 0.5)

View File

@ -31,14 +31,22 @@ releases of XGBoost.
\examples{ \examples{
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost') data(agaricus.test, package='xgboost')
## Keep the number of threads to 1 for examples
nthread <- 1
data.table::setDTthreads(nthread)
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, bst <- xgboost(
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") data = train$data, label = train$label, max_depth = 2, eta = 1,
nthread = nthread,
nrounds = 2,
objective = "binary:logistic"
)
xgb.save(bst, 'xgb.model') xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model') bst <- xgb.load('xgb.model')
if (file.exists('xgb.model')) file.remove('xgb.model') if (file.exists('xgb.model')) file.remove('xgb.model')
pred <- predict(bst, test$data)
} }
\seealso{ \seealso{
\code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}. \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.

View File

@ -25,12 +25,17 @@ Save xgboost model from xgboost or xgb.train
\examples{ \examples{
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost') data(agaricus.test, package='xgboost')
## Keep the number of threads to 2 for examples
nthread <- 2
data.table::setDTthreads(nthread)
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")
raw <- xgb.save.raw(bst) raw <- xgb.save.raw(bst)
bst <- xgb.load.raw(raw) bst <- xgb.load.raw(raw)
pred <- predict(bst, test$data)
} }

View File

@ -250,7 +250,8 @@ customized objective and evaluation metric functions, therefore it is more flexi
than the \code{xgboost} interface. than the \code{xgboost} interface.
Parallelization is automatically enabled if \code{OpenMP} is present. Parallelization is automatically enabled if \code{OpenMP} is present.
Number of threads can also be manually specified via \code{nthread} parameter. Number of threads can also be manually specified via the \code{nthread}
parameter.
The evaluation metric is chosen automatically by XGBoost (according to the objective) The evaluation metric is chosen automatically by XGBoost (according to the objective)
when the \code{eval_metric} parameter is not provided. when the \code{eval_metric} parameter is not provided.
@ -286,17 +287,25 @@ The following callbacks are automatically created when certain parameters are se
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost') data(agaricus.test, package='xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) ## Keep the number of threads to 1 for examples
dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2)) nthread <- 1
data.table::setDTthreads(nthread)
dtrain <- with(
agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
)
dtest <- with(
agaricus.test, xgb.DMatrix(data, label = label, nthread = nthread)
)
watchlist <- list(train = dtrain, eval = dtest) watchlist <- list(train = dtrain, eval = dtest)
## A simple xgb.train example: ## A simple xgb.train example:
param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2, param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
objective = "binary:logistic", eval_metric = "auc") objective = "binary:logistic", eval_metric = "auc")
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist) bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
## An xgb.train example where custom objective and evaluation metric are
## An xgb.train example where custom objective and evaluation metric are used: ## used:
logregobj <- function(preds, dtrain) { logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label") labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds)) preds <- 1/(1 + exp(-preds))
@ -312,12 +321,12 @@ evalerror <- function(preds, dtrain) {
# These functions could be used by passing them either: # These functions could be used by passing them either:
# as 'objective' and 'eval_metric' parameters in the params list: # as 'objective' and 'eval_metric' parameters in the params list:
param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2, param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
objective = logregobj, eval_metric = evalerror) objective = logregobj, eval_metric = evalerror)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist) bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
# or through the ... arguments: # or through the ... arguments:
param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2) param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
objective = logregobj, eval_metric = evalerror) objective = logregobj, eval_metric = evalerror)
@ -327,7 +336,7 @@ bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
## An xgb.train example of using variable learning rates at each iteration: ## An xgb.train example of using variable learning rates at each iteration:
param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2, param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
objective = "binary:logistic", eval_metric = "auc") objective = "binary:logistic", eval_metric = "auc")
my_etas <- list(eta = c(0.5, 0.1)) my_etas <- list(eta = c(0.5, 0.1))
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
@ -339,7 +348,7 @@ bst <- xgb.train(param, dtrain, nrounds = 25, watchlist,
## An 'xgboost' interface example: ## An 'xgboost' interface example:
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
max_depth = 2, eta = 1, nthread = 2, nrounds = 2, max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
objective = "binary:logistic") objective = "binary:logistic")
pred <- predict(bst, agaricus.test$data) pred <- predict(bst, agaricus.test$data)

View File

@ -62,6 +62,7 @@ OBJECTS= \
$(PKGROOT)/src/gbm/gbtree_model.o \ $(PKGROOT)/src/gbm/gbtree_model.o \
$(PKGROOT)/src/gbm/gblinear.o \ $(PKGROOT)/src/gbm/gblinear.o \
$(PKGROOT)/src/gbm/gblinear_model.o \ $(PKGROOT)/src/gbm/gblinear_model.o \
$(PKGROOT)/src/data/adapter.o \
$(PKGROOT)/src/data/simple_dmatrix.o \ $(PKGROOT)/src/data/simple_dmatrix.o \
$(PKGROOT)/src/data/data.o \ $(PKGROOT)/src/data/data.o \
$(PKGROOT)/src/data/sparse_page_raw_format.o \ $(PKGROOT)/src/data/sparse_page_raw_format.o \
@ -97,9 +98,15 @@ OBJECTS= \
$(PKGROOT)/src/context.o \ $(PKGROOT)/src/context.o \
$(PKGROOT)/src/logging.o \ $(PKGROOT)/src/logging.o \
$(PKGROOT)/src/global_config.o \ $(PKGROOT)/src/global_config.o \
$(PKGROOT)/src/collective/allgather.o \
$(PKGROOT)/src/collective/allreduce.o \
$(PKGROOT)/src/collective/broadcast.o \
$(PKGROOT)/src/collective/comm.o \
$(PKGROOT)/src/collective/tracker.o \
$(PKGROOT)/src/collective/communicator.o \ $(PKGROOT)/src/collective/communicator.o \
$(PKGROOT)/src/collective/in_memory_communicator.o \ $(PKGROOT)/src/collective/in_memory_communicator.o \
$(PKGROOT)/src/collective/in_memory_handler.o \ $(PKGROOT)/src/collective/in_memory_handler.o \
$(PKGROOT)/src/collective/loop.o \
$(PKGROOT)/src/collective/socket.o \ $(PKGROOT)/src/collective/socket.o \
$(PKGROOT)/src/common/charconv.o \ $(PKGROOT)/src/common/charconv.o \
$(PKGROOT)/src/common/column_matrix.o \ $(PKGROOT)/src/common/column_matrix.o \

View File

@ -62,6 +62,7 @@ OBJECTS= \
$(PKGROOT)/src/gbm/gbtree_model.o \ $(PKGROOT)/src/gbm/gbtree_model.o \
$(PKGROOT)/src/gbm/gblinear.o \ $(PKGROOT)/src/gbm/gblinear.o \
$(PKGROOT)/src/gbm/gblinear_model.o \ $(PKGROOT)/src/gbm/gblinear_model.o \
$(PKGROOT)/src/data/adapter.o \
$(PKGROOT)/src/data/simple_dmatrix.o \ $(PKGROOT)/src/data/simple_dmatrix.o \
$(PKGROOT)/src/data/data.o \ $(PKGROOT)/src/data/data.o \
$(PKGROOT)/src/data/sparse_page_raw_format.o \ $(PKGROOT)/src/data/sparse_page_raw_format.o \
@ -97,9 +98,15 @@ OBJECTS= \
$(PKGROOT)/src/context.o \ $(PKGROOT)/src/context.o \
$(PKGROOT)/src/logging.o \ $(PKGROOT)/src/logging.o \
$(PKGROOT)/src/global_config.o \ $(PKGROOT)/src/global_config.o \
$(PKGROOT)/src/collective/allgather.o \
$(PKGROOT)/src/collective/allreduce.o \
$(PKGROOT)/src/collective/broadcast.o \
$(PKGROOT)/src/collective/comm.o \
$(PKGROOT)/src/collective/tracker.o \
$(PKGROOT)/src/collective/communicator.o \ $(PKGROOT)/src/collective/communicator.o \
$(PKGROOT)/src/collective/in_memory_communicator.o \ $(PKGROOT)/src/collective/in_memory_communicator.o \
$(PKGROOT)/src/collective/in_memory_handler.o \ $(PKGROOT)/src/collective/in_memory_handler.o \
$(PKGROOT)/src/collective/loop.o \
$(PKGROOT)/src/collective/socket.o \ $(PKGROOT)/src/collective/socket.o \
$(PKGROOT)/src/common/charconv.o \ $(PKGROOT)/src/common/charconv.o \
$(PKGROOT)/src/common/column_matrix.o \ $(PKGROOT)/src/common/column_matrix.o \

View File

@ -5,7 +5,6 @@
* and edited to conform to xgboost C linter requirements. For details, see * and edited to conform to xgboost C linter requirements. For details, see
* https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Registering-native-routines * https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Registering-native-routines
*/ */
#include <R.h>
#include <Rinternals.h> #include <Rinternals.h>
#include <stdlib.h> #include <stdlib.h>
#include <R_ext/Rdynload.h> #include <R_ext/Rdynload.h>

View File

@ -20,7 +20,6 @@
#include "../../src/common/threading_utils.h" #include "../../src/common/threading_utils.h"
#include "./xgboost_R.h" // Must follow other includes. #include "./xgboost_R.h" // Must follow other includes.
#include "Rinternals.h"
/*! /*!
* \brief macro to annotate begin of api * \brief macro to annotate begin of api

View File

@ -19,15 +19,15 @@ w <- runif(metadata$kRows)
version <- packageVersion('xgboost') version <- packageVersion('xgboost')
target_dir <- 'models' target_dir <- 'models'
save_booster <- function (booster, model_name) { save_booster <- function(booster, model_name) {
booster_bin <- function (model_name) { booster_bin <- function(model_name) {
return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.bin', sep = ''))) return(file.path(target_dir, paste('xgboost-', version, '.', model_name, '.bin', sep = '')))
} }
booster_json <- function (model_name) { booster_json <- function(model_name) {
return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.json', sep = ''))) return(file.path(target_dir, paste('xgboost-', version, '.', model_name, '.json', sep = '')))
} }
booster_rds <- function (model_name) { booster_rds <- function(model_name) {
return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.rds', sep = ''))) return(file.path(target_dir, paste('xgboost-', version, '.', model_name, '.rds', sep = '')))
} }
xgb.save(booster, booster_bin(model_name)) xgb.save(booster, booster_bin(model_name))
saveRDS(booster, booster_rds(model_name)) saveRDS(booster, booster_rds(model_name))
@ -36,7 +36,7 @@ save_booster <- function (booster, model_name) {
} }
} }
generate_regression_model <- function () { generate_regression_model <- function() {
print('Regression') print('Regression')
y <- rnorm(metadata$kRows) y <- rnorm(metadata$kRows)
@ -47,7 +47,7 @@ generate_regression_model <- function () {
save_booster(booster, 'reg') save_booster(booster, 'reg')
} }
generate_logistic_model <- function () { generate_logistic_model <- function() {
print('Binary classification with logistic loss') print('Binary classification with logistic loss')
y <- sample(0:1, size = metadata$kRows, replace = TRUE) y <- sample(0:1, size = metadata$kRows, replace = TRUE)
stopifnot(max(y) == 1, min(y) == 0) stopifnot(max(y) == 1, min(y) == 0)
@ -64,7 +64,7 @@ generate_logistic_model <- function () {
} }
} }
generate_classification_model <- function () { generate_classification_model <- function() {
print('Multi-class classification') print('Multi-class classification')
y <- sample(0:(metadata$kClasses - 1), size = metadata$kRows, replace = TRUE) y <- sample(0:(metadata$kClasses - 1), size = metadata$kRows, replace = TRUE)
stopifnot(max(y) == metadata$kClasses - 1, min(y) == 0) stopifnot(max(y) == metadata$kClasses - 1, min(y) == 0)
@ -77,7 +77,7 @@ generate_classification_model <- function () {
save_booster(booster, 'cls') save_booster(booster, 'cls')
} }
generate_ranking_model <- function () { generate_ranking_model <- function() {
print('Learning to rank') print('Learning to rank')
y <- sample(0:4, size = metadata$kRows, replace = TRUE) y <- sample(0:4, size = metadata$kRows, replace = TRUE)
stopifnot(max(y) == 4, min(y) == 0) stopifnot(max(y) == 4, min(y) == 0)

View File

@ -0,0 +1,25 @@
## Helper script for running individual examples.
library(pkgload)
library(xgboost)
files <- list.files("./man")
run_example_timeit <- function(f) {
path <- paste("./man/", f, sep = "")
print(paste("Test", f))
flush.console()
t0 <- proc.time()
run_example(path)
t1 <- proc.time()
list(file = f, time = t1 - t0)
}
timings <- lapply(files, run_example_timeit)
for (t in timings) {
ratio <- t$time[1] / t$time[3]
if (!is.na(ratio) && !is.infinite(ratio) && ratio >= 2.5) {
print(paste("Offending example:", t$file, ratio))
}
}
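The ratio check above compares CPU time with wall-clock time: proc.time() returns user, system and elapsed seconds, so t$time[1] / t$time[3] is roughly the average number of busy cores, and a value at or above 2.5 suggests the example ignored the single-thread convention. A small illustration with made-up timings (the entry below is hypothetical, not taken from any real run):
## Hypothetical timing entry, for illustration only
t <- list(file = "xgb.train.Rd",
          time = c(user.self = 6.0, sys.self = 0.2, elapsed = 2.0))
ratio <- t$time[1] / t$time[3]  # 3.0: CPU time is three times wall time
ratio >= 2.5                    # TRUE, so this example would be printed as offending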

View File

@ -1,7 +1,7 @@
context("basic functions") context("basic functions")
data(agaricus.train, package = 'xgboost') data(agaricus.train, package = "xgboost")
data(agaricus.test, package = 'xgboost') data(agaricus.test, package = "xgboost")
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
set.seed(1994) set.seed(1994)
@ -9,15 +9,20 @@ set.seed(1994)
# disable some tests for Win32 # disable some tests for Win32
windows_flag <- .Platform$OS.type == "windows" && windows_flag <- .Platform$OS.type == "windows" &&
.Machine$sizeof.pointer != 8 .Machine$sizeof.pointer != 8
solaris_flag <- (Sys.info()['sysname'] == "SunOS") solaris_flag <- (Sys.info()["sysname"] == "SunOS")
n_threads <- 1
test_that("train and predict binary classification", { test_that("train and predict binary classification", {
nrounds <- 2 nrounds <- 2
expect_output( expect_output(
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, bst <- xgboost(
eta = 1, nthread = 2, nrounds = nrounds, objective = "binary:logistic", data = train$data, label = train$label, max_depth = 2,
eval_metric = "error") eta = 1, nthread = n_threads, nrounds = nrounds,
, "train-error") objective = "binary:logistic", eval_metric = "error"
),
"train-error"
)
expect_equal(class(bst), "xgb.Booster") expect_equal(class(bst), "xgb.Booster")
expect_equal(bst$niter, nrounds) expect_equal(bst$niter, nrounds)
expect_false(is.null(bst$evaluation_log)) expect_false(is.null(bst$evaluation_log))
@ -46,26 +51,39 @@ test_that("parameter validation works", {
d <- cbind( d <- cbind(
x1 = rnorm(10), x1 = rnorm(10),
x2 = rnorm(10), x2 = rnorm(10),
x3 = rnorm(10)) x3 = rnorm(10)
)
y <- d[, "x1"] + d[, "x2"]^2 + y <- d[, "x1"] + d[, "x2"]^2 +
ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) + ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
rnorm(10) rnorm(10)
dtrain <- xgb.DMatrix(data = d, info = list(label = y)) dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
correct <- function() { correct <- function() {
params <- list(max_depth = 2, booster = "dart", params <- list(
rate_drop = 0.5, one_drop = TRUE, max_depth = 2,
objective = "reg:squarederror") booster = "dart",
rate_drop = 0.5,
one_drop = TRUE,
nthread = n_threads,
objective = "reg:squarederror"
)
xgb.train(params = params, data = dtrain, nrounds = nrounds) xgb.train(params = params, data = dtrain, nrounds = nrounds)
} }
expect_silent(correct()) expect_silent(correct())
incorrect <- function() { incorrect <- function() {
params <- list(max_depth = 2, booster = "dart", params <- list(
rate_drop = 0.5, one_drop = TRUE, max_depth = 2,
booster = "dart",
rate_drop = 0.5,
one_drop = TRUE,
objective = "reg:squarederror", objective = "reg:squarederror",
foo = "bar", bar = "foo") nthread = n_threads,
foo = "bar",
bar = "foo"
)
output <- capture.output( output <- capture.output(
xgb.train(params = params, data = dtrain, nrounds = nrounds)) xgb.train(params = params, data = dtrain, nrounds = nrounds)
)
print(output) print(output)
} }
expect_output(incorrect(), '\\\\"bar\\\\", \\\\"foo\\\\"') expect_output(incorrect(), '\\\\"bar\\\\", \\\\"foo\\\\"')
@ -79,7 +97,8 @@ test_that("dart prediction works", {
d <- cbind( d <- cbind(
x1 = rnorm(100), x1 = rnorm(100),
x2 = rnorm(100), x2 = rnorm(100),
x3 = rnorm(100)) x3 = rnorm(100)
)
y <- d[, "x1"] + d[, "x2"]^2 + y <- d[, "x1"] + d[, "x2"]^2 +
ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) + ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
rnorm(100) rnorm(100)
@ -93,7 +112,7 @@ test_that("dart prediction works", {
rate_drop = 0.5, rate_drop = 0.5,
one_drop = TRUE, one_drop = TRUE,
eta = 1, eta = 1,
nthread = 2, nthread = n_threads,
nrounds = nrounds, nrounds = nrounds,
objective = "reg:squarederror" objective = "reg:squarederror"
) )
@ -105,7 +124,7 @@ test_that("dart prediction works", {
expect_false(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE))) expect_false(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE)))
set.seed(1994) set.seed(1994)
dtrain <- xgb.DMatrix(data = d, info = list(label = y)) dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
booster_by_train <- xgb.train( booster_by_train <- xgb.train(
params = list( params = list(
booster = "dart", booster = "dart",
@ -113,7 +132,7 @@ test_that("dart prediction works", {
eta = 1, eta = 1,
rate_drop = 0.5, rate_drop = 0.5,
one_drop = TRUE, one_drop = TRUE,
nthread = 1, nthread = n_threads,
objective = "reg:squarederror" objective = "reg:squarederror"
), ),
data = dtrain, data = dtrain,
@ -132,10 +151,13 @@ test_that("train and predict softprob", {
lb <- as.numeric(iris$Species) - 1 lb <- as.numeric(iris$Species) - 1
set.seed(11) set.seed(11)
expect_output( expect_output(
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, bst <- xgboost(
max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5, data = as.matrix(iris[, -5]), label = lb,
objective = "multi:softprob", num_class = 3, eval_metric = "merror") max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5,
, "train-merror") objective = "multi:softprob", num_class = 3, eval_metric = "merror"
),
"train-merror"
)
expect_false(is.null(bst$evaluation_log)) expect_false(is.null(bst$evaluation_log))
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025) expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
expect_equal(bst$niter * 3, xgb.ntree(bst)) expect_equal(bst$niter * 3, xgb.ntree(bst))
@ -164,9 +186,10 @@ test_that("train and predict softprob", {
x3 = rnorm(100) x3 = rnorm(100)
) )
y <- sample.int(10, 100, replace = TRUE) - 1 y <- sample.int(10, 100, replace = TRUE) - 1
dtrain <- xgb.DMatrix(data = d, info = list(label = y)) dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
booster <- xgb.train( booster <- xgb.train(
params = list(tree_method = "hist"), data = dtrain, nrounds = 4, num_class = 10, params = list(tree_method = "hist", nthread = n_threads),
data = dtrain, nrounds = 4, num_class = 10,
objective = "multi:softprob" objective = "multi:softprob"
) )
predt <- predict(booster, as.matrix(d), reshape = TRUE, strict_shape = FALSE) predt <- predict(booster, as.matrix(d), reshape = TRUE, strict_shape = FALSE)
@ -178,10 +201,13 @@ test_that("train and predict softmax", {
lb <- as.numeric(iris$Species) - 1 lb <- as.numeric(iris$Species) - 1
set.seed(11) set.seed(11)
expect_output( expect_output(
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, bst <- xgboost(
max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5, data = as.matrix(iris[, -5]), label = lb,
objective = "multi:softmax", num_class = 3, eval_metric = "merror") max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5,
, "train-merror") objective = "multi:softmax", num_class = 3, eval_metric = "merror"
),
"train-merror"
)
expect_false(is.null(bst$evaluation_log)) expect_false(is.null(bst$evaluation_log))
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025) expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
expect_equal(bst$niter * 3, xgb.ntree(bst)) expect_equal(bst$niter * 3, xgb.ntree(bst))
@ -196,16 +222,19 @@ test_that("train and predict RF", {
set.seed(11) set.seed(11)
lb <- train$label lb <- train$label
# single iteration # single iteration
bst <- xgboost(data = train$data, label = lb, max_depth = 5, bst <- xgboost(
nthread = 2, nrounds = 1, objective = "binary:logistic", eval_metric = "error", data = train$data, label = lb, max_depth = 5,
num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1) nthread = n_threads,
nrounds = 1, objective = "binary:logistic", eval_metric = "error",
num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1
)
expect_equal(bst$niter, 1) expect_equal(bst$niter, 1)
expect_equal(xgb.ntree(bst), 20) expect_equal(xgb.ntree(bst), 20)
pred <- predict(bst, train$data) pred <- predict(bst, train$data)
pred_err <- sum((pred > 0.5) != lb) / length(lb) pred_err <- sum((pred > 0.5) != lb) / length(lb)
expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6) expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6)
#expect_lt(pred_err, 0.03) # expect_lt(pred_err, 0.03)
pred <- predict(bst, train$data, ntreelimit = 20) pred <- predict(bst, train$data, ntreelimit = 20)
pred_err_20 <- sum((pred > 0.5) != lb) / length(lb) pred_err_20 <- sum((pred > 0.5) != lb) / length(lb)
@ -219,11 +248,13 @@ test_that("train and predict RF with softprob", {
lb <- as.numeric(iris$Species) - 1 lb <- as.numeric(iris$Species) - 1
nrounds <- 15 nrounds <- 15
set.seed(11) set.seed(11)
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, bst <- xgboost(
max_depth = 3, eta = 0.9, nthread = 2, nrounds = nrounds, data = as.matrix(iris[, -5]), label = lb,
max_depth = 3, eta = 0.9, nthread = n_threads, nrounds = nrounds,
objective = "multi:softprob", eval_metric = "merror", objective = "multi:softprob", eval_metric = "merror",
num_class = 3, verbose = 0, num_class = 3, verbose = 0,
num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5) num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5
)
expect_equal(bst$niter, 15) expect_equal(bst$niter, 15)
expect_equal(xgb.ntree(bst), 15 * 3 * 4) expect_equal(xgb.ntree(bst), 15 * 3 * 4)
# predict for all iterations: # predict for all iterations:
@ -240,18 +271,24 @@ test_that("train and predict RF with softprob", {
test_that("use of multiple eval metrics works", { test_that("use of multiple eval metrics works", {
expect_output( expect_output(
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, bst <- xgboost(
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", data = train$data, label = train$label, max_depth = 2,
eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss") eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
, "train-error.*train-auc.*train-logloss") eval_metric = "error", eval_metric = "auc", eval_metric = "logloss"
),
"train-error.*train-auc.*train-logloss"
)
expect_false(is.null(bst$evaluation_log)) expect_false(is.null(bst$evaluation_log))
expect_equal(dim(bst$evaluation_log), c(2, 4)) expect_equal(dim(bst$evaluation_log), c(2, 4))
expect_equal(colnames(bst$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss")) expect_equal(colnames(bst$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
expect_output( expect_output(
bst2 <- xgboost(data = train$data, label = train$label, max_depth = 2, bst2 <- xgboost(
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", data = train$data, label = train$label, max_depth = 2,
eval_metric = list("error", "auc", "logloss")) eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
, "train-error.*train-auc.*train-logloss") eval_metric = list("error", "auc", "logloss")
),
"train-error.*train-auc.*train-logloss"
)
expect_false(is.null(bst2$evaluation_log)) expect_false(is.null(bst2$evaluation_log))
expect_equal(dim(bst2$evaluation_log), c(2, 4)) expect_equal(dim(bst2$evaluation_log), c(2, 4))
expect_equal(colnames(bst2$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss")) expect_equal(colnames(bst2$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
@ -259,9 +296,11 @@ test_that("use of multiple eval metrics works", {
test_that("training continuation works", { test_that("training continuation works", {
dtrain <- xgb.DMatrix(train$data, label = train$label) dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads)
watchlist <- list(train = dtrain) watchlist <- list(train = dtrain)
param <- list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2) param <- list(
objective = "binary:logistic", max_depth = 2, eta = 1, nthread = n_threads
)
# for the reference, use 4 iterations at once: # for the reference, use 4 iterations at once:
set.seed(11) set.seed(11)
@ -271,30 +310,33 @@ test_that("training continuation works", {
bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0) bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
# continue for two more: # continue for two more:
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1) bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
if (!windows_flag && !solaris_flag) if (!windows_flag && !solaris_flag) {
expect_equal(bst$raw, bst2$raw) expect_equal(bst$raw, bst2$raw)
}
expect_false(is.null(bst2$evaluation_log)) expect_false(is.null(bst2$evaluation_log))
expect_equal(dim(bst2$evaluation_log), c(4, 2)) expect_equal(dim(bst2$evaluation_log), c(4, 2))
expect_equal(bst2$evaluation_log, bst$evaluation_log) expect_equal(bst2$evaluation_log, bst$evaluation_log)
# test continuing from raw model data # test continuing from raw model data
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw) bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw)
if (!windows_flag && !solaris_flag) if (!windows_flag && !solaris_flag) {
expect_equal(bst$raw, bst2$raw) expect_equal(bst$raw, bst2$raw)
}
expect_equal(dim(bst2$evaluation_log), c(2, 2)) expect_equal(dim(bst2$evaluation_log), c(2, 2))
# test continuing from a model in file # test continuing from a model in file
xgb.save(bst1, "xgboost.json") xgb.save(bst1, "xgboost.json")
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.json") bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.json")
if (!windows_flag && !solaris_flag) if (!windows_flag && !solaris_flag) {
expect_equal(bst$raw, bst2$raw) expect_equal(bst$raw, bst2$raw)
}
expect_equal(dim(bst2$evaluation_log), c(2, 2)) expect_equal(dim(bst2$evaluation_log), c(2, 2))
file.remove("xgboost.json") file.remove("xgboost.json")
}) })
test_that("model serialization works", { test_that("model serialization works", {
out_path <- "model_serialization" out_path <- "model_serialization"
dtrain <- xgb.DMatrix(train$data, label = train$label) dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads)
watchlist <- list(train = dtrain) watchlist <- list(train = dtrain)
param <- list(objective = "binary:logistic") param <- list(objective = "binary:logistic", nthread = n_threads)
booster <- xgb.train(param, dtrain, nrounds = 4, watchlist) booster <- xgb.train(param, dtrain, nrounds = 4, watchlist)
raw <- xgb.serialize(booster) raw <- xgb.serialize(booster)
saveRDS(raw, out_path) saveRDS(raw, out_path)
@ -309,11 +351,14 @@ test_that("model serialization works", {
test_that("xgb.cv works", { test_that("xgb.cv works", {
set.seed(11) set.seed(11)
expect_output( expect_output(
cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5, cv <- xgb.cv(
eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic", data = train$data, label = train$label, max_depth = 2, nfold = 5,
eval_metric = "error", verbose = TRUE) eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic",
, "train-error:") eval_metric = "error", verbose = TRUE
expect_is(cv, 'xgb.cv.synchronous') ),
"train-error:"
)
expect_is(cv, "xgb.cv.synchronous")
expect_false(is.null(cv$evaluation_log)) expect_false(is.null(cv$evaluation_log))
expect_lt(cv$evaluation_log[, min(test_error_mean)], 0.03) expect_lt(cv$evaluation_log[, min(test_error_mean)], 0.03)
expect_lt(cv$evaluation_log[, min(test_error_std)], 0.008) expect_lt(cv$evaluation_log[, min(test_error_std)], 0.008)
@ -326,15 +371,19 @@ test_that("xgb.cv works", {
}) })
test_that("xgb.cv works with stratified folds", { test_that("xgb.cv works with stratified folds", {
dtrain <- xgb.DMatrix(train$data, label = train$label) dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads)
set.seed(314159) set.seed(314159)
cv <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5, cv <- xgb.cv(
eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic", data = dtrain, max_depth = 2, nfold = 5,
verbose = TRUE, stratified = FALSE) eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic",
verbose = TRUE, stratified = FALSE
)
set.seed(314159) set.seed(314159)
cv2 <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5, cv2 <- xgb.cv(
eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic", data = dtrain, max_depth = 2, nfold = 5,
verbose = TRUE, stratified = TRUE) eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic",
verbose = TRUE, stratified = TRUE
)
# Stratified folds should result in different evaluation logs # Stratified folds should result in different evaluation logs
expect_true(all(cv$evaluation_log[, test_logloss_mean] != cv2$evaluation_log[, test_logloss_mean])) expect_true(all(cv$evaluation_log[, test_logloss_mean] != cv2$evaluation_log[, test_logloss_mean]))
}) })
@ -342,40 +391,57 @@ test_that("xgb.cv works with stratified folds", {
test_that("train and predict with non-strict classes", { test_that("train and predict with non-strict classes", {
# standard dense matrix input # standard dense matrix input
train_dense <- as.matrix(train$data) train_dense <- as.matrix(train$data)
bst <- xgboost(data = train_dense, label = train$label, max_depth = 2, bst <- xgboost(
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0) data = train_dense, label = train$label, max_depth = 2,
eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
verbose = 0
)
pr0 <- predict(bst, train_dense) pr0 <- predict(bst, train_dense)
# dense matrix-like input of non-matrix class # dense matrix-like input of non-matrix class
class(train_dense) <- 'shmatrix' class(train_dense) <- "shmatrix"
expect_true(is.matrix(train_dense)) expect_true(is.matrix(train_dense))
expect_error( expect_error(
bst <- xgboost(data = train_dense, label = train$label, max_depth = 2, bst <- xgboost(
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0) data = train_dense, label = train$label, max_depth = 2,
, regexp = NA) eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
verbose = 0
),
regexp = NA
)
expect_error(pr <- predict(bst, train_dense), regexp = NA) expect_error(pr <- predict(bst, train_dense), regexp = NA)
expect_equal(pr0, pr) expect_equal(pr0, pr)
# dense matrix-like input of non-matrix class with some inheritance # dense matrix-like input of non-matrix class with some inheritance
class(train_dense) <- c('pphmatrix', 'shmatrix') class(train_dense) <- c("pphmatrix", "shmatrix")
expect_true(is.matrix(train_dense)) expect_true(is.matrix(train_dense))
expect_error( expect_error(
bst <- xgboost(data = train_dense, label = train$label, max_depth = 2, bst <- xgboost(
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0) data = train_dense, label = train$label, max_depth = 2,
, regexp = NA) eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
verbose = 0
),
regexp = NA
)
expect_error(pr <- predict(bst, train_dense), regexp = NA) expect_error(pr <- predict(bst, train_dense), regexp = NA)
expect_equal(pr0, pr) expect_equal(pr0, pr)
# when someone inherits from xgb.Booster, it should still be possible to use it as xgb.Booster # when someone inherits from xgb.Booster, it should still be possible to use it as xgb.Booster
class(bst) <- c('super.Booster', 'xgb.Booster') class(bst) <- c("super.Booster", "xgb.Booster")
expect_error(pr <- predict(bst, train_dense), regexp = NA) expect_error(pr <- predict(bst, train_dense), regexp = NA)
expect_equal(pr0, pr) expect_equal(pr0, pr)
}) })
test_that("max_delta_step works", { test_that("max_delta_step works", {
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(
agaricus.train$data, label = agaricus.train$label, nthread = n_threads
)
watchlist <- list(train = dtrain) watchlist <- list(train = dtrain)
param <- list(objective = "binary:logistic", eval_metric = "logloss", max_depth = 2, nthread = 2, eta = 0.5) param <- list(
objective = "binary:logistic", eval_metric = "logloss", max_depth = 2,
nthread = n_threads,
eta = 0.5
)
nrounds <- 5 nrounds <- 5
# model with no restriction on max_delta_step # model with no restriction on max_delta_step
bst1 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1) bst1 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1)
@ -395,14 +461,16 @@ test_that("colsample_bytree works", {
test_y <- as.numeric(rowSums(test_x) > 0) test_y <- as.numeric(rowSums(test_x) > 0)
colnames(train_x) <- paste0("Feature_", sprintf("%03d", 1:100)) colnames(train_x) <- paste0("Feature_", sprintf("%03d", 1:100))
colnames(test_x) <- paste0("Feature_", sprintf("%03d", 1:100)) colnames(test_x) <- paste0("Feature_", sprintf("%03d", 1:100))
dtrain <- xgb.DMatrix(train_x, label = train_y) dtrain <- xgb.DMatrix(train_x, label = train_y, nthread = n_threads)
dtest <- xgb.DMatrix(test_x, label = test_y) dtest <- xgb.DMatrix(test_x, label = test_y, nthread = n_threads)
watchlist <- list(train = dtrain, eval = dtest) watchlist <- list(train = dtrain, eval = dtest)
## Use colsample_bytree = 0.01, so that roughly one out of 100 features is chosen for ## Use colsample_bytree = 0.01, so that roughly one out of 100 features is chosen for
## each tree ## each tree
param <- list(max_depth = 2, eta = 0, nthread = 2, param <- list(
max_depth = 2, eta = 0, nthread = n_threads,
colsample_bytree = 0.01, objective = "binary:logistic", colsample_bytree = 0.01, objective = "binary:logistic",
eval_metric = "auc") eval_metric = "auc"
)
set.seed(2) set.seed(2)
bst <- xgb.train(param, dtrain, nrounds = 100, watchlist, verbose = 0) bst <- xgb.train(param, dtrain, nrounds = 100, watchlist, verbose = 0)
xgb.importance(model = bst) xgb.importance(model = bst)
@ -412,9 +480,11 @@ test_that("colsample_bytree works", {
}) })
test_that("Configuration works", { test_that("Configuration works", {
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, bst <- xgboost(
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", data = train$data, label = train$label, max_depth = 2,
eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss") eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
eval_metric = "error", eval_metric = "auc", eval_metric = "logloss"
)
config <- xgb.config(bst) config <- xgb.config(bst)
xgb.config(bst) <- config xgb.config(bst) <- config
reloaded_config <- xgb.config(bst) reloaded_config <- xgb.config(bst)
@ -451,22 +521,26 @@ test_that("strict_shape works", {
y <- as.numeric(iris$Species) - 1 y <- as.numeric(iris$Species) - 1
X <- as.matrix(iris[, -5]) X <- as.matrix(iris[, -5])
bst <- xgboost(data = X, label = y, bst <- xgboost(
max_depth = 2, nrounds = n_rounds, data = X, label = y,
objective = "multi:softprob", num_class = 3, eval_metric = "merror") max_depth = 2, nrounds = n_rounds, nthread = n_threads,
objective = "multi:softprob", num_class = 3, eval_metric = "merror"
)
test_strict_shape(bst, X, 3) test_strict_shape(bst, X, 3)
} }
test_agaricus <- function() { test_agaricus <- function() {
data(agaricus.train, package = 'xgboost') data(agaricus.train, package = "xgboost")
X <- agaricus.train$data X <- agaricus.train$data
y <- agaricus.train$label y <- agaricus.train$label
bst <- xgboost(data = X, label = y, max_depth = 2, bst <- xgboost(
data = X, label = y, max_depth = 2, nthread = n_threads,
nrounds = n_rounds, objective = "binary:logistic", nrounds = n_rounds, objective = "binary:logistic",
eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss") eval_metric = "error", eval_metric = "auc", eval_metric = "logloss"
)
test_strict_shape(bst, X, 1) test_strict_shape(bst, X, 1)
} }
@ -481,8 +555,10 @@ test_that("'predict' accepts CSR data", {
x_csc <- as(X[1L, , drop = FALSE], "CsparseMatrix") x_csc <- as(X[1L, , drop = FALSE], "CsparseMatrix")
x_csr <- as(x_csc, "RsparseMatrix") x_csr <- as(x_csc, "RsparseMatrix")
x_spv <- as(x_csc, "sparseVector") x_spv <- as(x_csc, "sparseVector")
bst <- xgboost(data = X, label = y, objective = "binary:logistic", bst <- xgboost(
nrounds = 5L, verbose = FALSE) data = X, label = y, objective = "binary:logistic",
nrounds = 5L, verbose = FALSE, nthread = n_threads,
)
p_csc <- predict(bst, x_csc) p_csc <- predict(bst, x_csc)
p_csr <- predict(bst, x_csr) p_csr <- predict(bst, x_csr)
p_spv <- predict(bst, x_spv) p_spv <- predict(bst, x_spv)


@ -6,6 +6,8 @@ data(agaricus.test, package = 'xgboost')
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
n_threads <- 2
# add some label noise for early stopping tests # add some label noise for early stopping tests
add.noise <- function(label, frac) { add.noise <- function(label, frac) {
inoise <- sample(length(label), length(label) * frac) inoise <- sample(length(label), length(label) * frac)
@ -15,15 +17,15 @@ add.noise <- function(label, frac) {
set.seed(11) set.seed(11)
ltrain <- add.noise(train$label, 0.2) ltrain <- add.noise(train$label, 0.2)
ltest <- add.noise(test$label, 0.2) ltest <- add.noise(test$label, 0.2)
dtrain <- xgb.DMatrix(train$data, label = ltrain) dtrain <- xgb.DMatrix(train$data, label = ltrain, nthread = n_threads)
dtest <- xgb.DMatrix(test$data, label = ltest) dtest <- xgb.DMatrix(test$data, label = ltest, nthread = n_threads)
watchlist <- list(train = dtrain, test = dtest) watchlist <- list(train = dtrain, test = dtest)
err <- function(label, pr) sum((pr > 0.5) != label) / length(label) err <- function(label, pr) sum((pr > 0.5) != label) / length(label)
param <- list(objective = "binary:logistic", eval_metric = "error", param <- list(objective = "binary:logistic", eval_metric = "error",
max_depth = 2, nthread = 2) max_depth = 2, nthread = n_threads)
test_that("cb.print.evaluation works as expected", { test_that("cb.print.evaluation works as expected", {
@ -103,7 +105,7 @@ test_that("cb.evaluation.log works as expected", {
param <- list(objective = "binary:logistic", eval_metric = "error", param <- list(objective = "binary:logistic", eval_metric = "error",
max_depth = 4, nthread = 2) max_depth = 4, nthread = n_threads)
test_that("can store evaluation_log without printing", { test_that("can store evaluation_log without printing", {
expect_silent( expect_silent(
@ -179,8 +181,10 @@ test_that("cb.save.model works as expected", {
expect_true(file.exists('xgboost_01.json')) expect_true(file.exists('xgboost_01.json'))
expect_true(file.exists('xgboost_02.json')) expect_true(file.exists('xgboost_02.json'))
b1 <- xgb.load('xgboost_01.json') b1 <- xgb.load('xgboost_01.json')
xgb.parameters(b1) <- list(nthread = 2)
expect_equal(xgb.ntree(b1), 1) expect_equal(xgb.ntree(b1), 1)
b2 <- xgb.load('xgboost_02.json') b2 <- xgb.load('xgboost_02.json')
xgb.parameters(b2) <- list(nthread = 2)
expect_equal(xgb.ntree(b2), 2) expect_equal(xgb.ntree(b2), 2)
xgb.config(b2) <- xgb.config(bst) xgb.config(b2) <- xgb.config(bst)
@ -267,7 +271,8 @@ test_that("early stopping works with titanic", {
objective = "binary:logistic", objective = "binary:logistic",
eval_metric = "auc", eval_metric = "auc",
nrounds = 100, nrounds = 100,
early_stopping_rounds = 3 early_stopping_rounds = 3,
nthread = n_threads
) )
expect_true(TRUE) # should not crash expect_true(TRUE) # should not crash
@ -308,7 +313,7 @@ test_that("prediction in xgb.cv works", {
test_that("prediction in xgb.cv works for gblinear too", { test_that("prediction in xgb.cv works for gblinear too", {
set.seed(11) set.seed(11)
p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = 2) p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = n_threads)
cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE, verbose = 0) cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE, verbose = 0)
expect_false(is.null(cv$evaluation_log)) expect_false(is.null(cv$evaluation_log))
expect_false(is.null(cv$pred)) expect_false(is.null(cv$pred))
@ -341,7 +346,7 @@ test_that("prediction in xgb.cv for softprob works", {
set.seed(11) set.seed(11)
expect_warning( expect_warning(
cv <- xgb.cv(data = as.matrix(iris[, -5]), label = lb, nfold = 4, cv <- xgb.cv(data = as.matrix(iris[, -5]), label = lb, nfold = 4,
eta = 0.5, nrounds = 5, max_depth = 3, nthread = 2, eta = 0.5, nrounds = 5, max_depth = 3, nthread = n_threads,
subsample = 0.8, gamma = 2, verbose = 0, subsample = 0.8, gamma = 2, verbose = 0,
prediction = TRUE, objective = "multi:softprob", num_class = 3) prediction = TRUE, objective = "multi:softprob", num_class = 3)
, NA) , NA)
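For reference only (not part of this changeset): a standalone sketch of out-of-fold prediction with xgb.cv, mirroring the tests above; the agaricus data ships with the package, the other values are illustrative.

```r
library(xgboost)

n_threads <- 2
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(
  agaricus.train$data, label = agaricus.train$label, nthread = n_threads
)
params <- list(
  objective = "binary:logistic", eval_metric = "error",
  max_depth = 2, eta = 0.5, nthread = n_threads
)
cv <- xgb.cv(params, dtrain, nfold = 5, nrounds = 2, prediction = TRUE, verbose = 0)
head(cv$evaluation_log)  # per-round train/test metrics
length(cv$pred)          # one out-of-fold prediction per training row
```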


@ -2,10 +2,16 @@ context('Test models with custom objective')
set.seed(1994) set.seed(1994)
n_threads <- 2
data(agaricus.train, package = 'xgboost') data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) agaricus.train$data, label = agaricus.train$label, nthread = n_threads
)
dtest <- xgb.DMatrix(
agaricus.test$data, label = agaricus.test$label, nthread = n_threads
)
watchlist <- list(eval = dtest, train = dtrain) watchlist <- list(eval = dtest, train = dtrain)
logregobj <- function(preds, dtrain) { logregobj <- function(preds, dtrain) {
@ -22,7 +28,7 @@ evalerror <- function(preds, dtrain) {
return(list(metric = "error", value = err)) return(list(metric = "error", value = err))
} }
param <- list(max_depth = 2, eta = 1, nthread = 2, param <- list(max_depth = 2, eta = 1, nthread = n_threads,
objective = logregobj, eval_metric = evalerror) objective = logregobj, eval_metric = evalerror)
num_round <- 2 num_round <- 2
@ -67,7 +73,7 @@ test_that("custom objective using DMatrix attr works", {
test_that("custom objective with multi-class shape", { test_that("custom objective with multi-class shape", {
data <- as.matrix(iris[, -5]) data <- as.matrix(iris[, -5])
label <- as.numeric(iris$Species) - 1 label <- as.numeric(iris$Species) - 1
dtrain <- xgb.DMatrix(data = data, label = label) dtrain <- xgb.DMatrix(data = data, label = label, nthread = n_threads)
n_classes <- 3 n_classes <- 3
fake_softprob <- function(preds, dtrain) { fake_softprob <- function(preds, dtrain) {
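For reference only (not part of this changeset): the bodies of `logregobj` and `evalerror` are not shown in this diff; the sketch below follows the standard xgboost custom-objective demo and should be treated as illustrative.

```r
library(xgboost)

n_threads <- 2
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(
  agaricus.train$data, label = agaricus.train$label, nthread = n_threads
)

# Logistic loss: predictions arrive as raw margins, so apply the sigmoid first.
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  list(grad = grad, hess = hess)
}

# Classification error on the margin scale (margin > 0 means class 1).
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  list(metric = "error", value = err)
}

param <- list(max_depth = 2, eta = 1, nthread = n_threads,
              objective = logregobj, eval_metric = evalerror)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist = list(train = dtrain))
```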


@ -5,19 +5,21 @@ data(agaricus.test, package = "xgboost")
test_data <- agaricus.test$data[1:100, ] test_data <- agaricus.test$data[1:100, ]
test_label <- agaricus.test$label[1:100] test_label <- agaricus.test$label[1:100]
n_threads <- 2
test_that("xgb.DMatrix: basic construction", { test_that("xgb.DMatrix: basic construction", {
# from sparse matrix # from sparse matrix
dtest1 <- xgb.DMatrix(test_data, label = test_label) dtest1 <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
# from dense matrix # from dense matrix
dtest2 <- xgb.DMatrix(as.matrix(test_data), label = test_label) dtest2 <- xgb.DMatrix(as.matrix(test_data), label = test_label, nthread = n_threads)
expect_equal(getinfo(dtest1, "label"), getinfo(dtest2, "label")) expect_equal(getinfo(dtest1, "label"), getinfo(dtest2, "label"))
expect_equal(dim(dtest1), dim(dtest2)) expect_equal(dim(dtest1), dim(dtest2))
# from dense integer matrix # from dense integer matrix
int_data <- as.matrix(test_data) int_data <- as.matrix(test_data)
storage.mode(int_data) <- "integer" storage.mode(int_data) <- "integer"
dtest3 <- xgb.DMatrix(int_data, label = test_label) dtest3 <- xgb.DMatrix(int_data, label = test_label, nthread = n_threads)
expect_equal(dim(dtest1), dim(dtest3)) expect_equal(dim(dtest1), dim(dtest3))
n_samples <- 100 n_samples <- 100
@ -29,15 +31,15 @@ test_that("xgb.DMatrix: basic construction", {
X <- matrix(X, nrow = n_samples) X <- matrix(X, nrow = n_samples)
y <- rbinom(n = n_samples, size = 1, prob = 1 / 2) y <- rbinom(n = n_samples, size = 1, prob = 1 / 2)
fd <- xgb.DMatrix(X, label = y, missing = 1) fd <- xgb.DMatrix(X, label = y, missing = 1, nthread = n_threads)
dgc <- as(X, "dgCMatrix") dgc <- as(X, "dgCMatrix")
fdgc <- xgb.DMatrix(dgc, label = y, missing = 1.0) fdgc <- xgb.DMatrix(dgc, label = y, missing = 1.0, nthread = n_threads)
dgr <- as(X, "dgRMatrix") dgr <- as(X, "dgRMatrix")
fdgr <- xgb.DMatrix(dgr, label = y, missing = 1) fdgr <- xgb.DMatrix(dgr, label = y, missing = 1, nthread = n_threads)
params <- list(tree_method = "hist") params <- list(tree_method = "hist", nthread = n_threads)
bst_fd <- xgb.train( bst_fd <- xgb.train(
params, nrounds = 8, fd, watchlist = list(train = fd) params, nrounds = 8, fd, watchlist = list(train = fd)
) )
@ -64,12 +66,12 @@ test_that("xgb.DMatrix: NA", {
) )
x[1, "x1"] <- NA x[1, "x1"] <- NA
m <- xgb.DMatrix(x) m <- xgb.DMatrix(x, nthread = n_threads)
xgb.DMatrix.save(m, "int.dmatrix") xgb.DMatrix.save(m, "int.dmatrix")
x <- matrix(as.numeric(x), nrow = n_samples, ncol = 2) x <- matrix(as.numeric(x), nrow = n_samples, ncol = 2)
colnames(x) <- c("x1", "x2") colnames(x) <- c("x1", "x2")
m <- xgb.DMatrix(x) m <- xgb.DMatrix(x, nthread = n_threads)
xgb.DMatrix.save(m, "float.dmatrix") xgb.DMatrix.save(m, "float.dmatrix")
@ -94,7 +96,7 @@ test_that("xgb.DMatrix: NA", {
test_that("xgb.DMatrix: saving, loading", { test_that("xgb.DMatrix: saving, loading", {
# save to a local file # save to a local file
dtest1 <- xgb.DMatrix(test_data, label = test_label) dtest1 <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
tmp_file <- tempfile('xgb.DMatrix_') tmp_file <- tempfile('xgb.DMatrix_')
on.exit(unlink(tmp_file)) on.exit(unlink(tmp_file))
expect_true(xgb.DMatrix.save(dtest1, tmp_file)) expect_true(xgb.DMatrix.save(dtest1, tmp_file))
@ -109,13 +111,17 @@ test_that("xgb.DMatrix: saving, loading", {
tmp_file <- tempfile(fileext = ".libsvm") tmp_file <- tempfile(fileext = ".libsvm")
writeLines(tmp, tmp_file) writeLines(tmp, tmp_file)
expect_true(file.exists(tmp_file)) expect_true(file.exists(tmp_file))
dtest4 <- xgb.DMatrix(paste(tmp_file, "?format=libsvm", sep = ""), silent = TRUE) dtest4 <- xgb.DMatrix(
paste(tmp_file, "?format=libsvm", sep = ""), silent = TRUE, nthread = n_threads
)
expect_equal(dim(dtest4), c(3, 4)) expect_equal(dim(dtest4), c(3, 4))
expect_equal(getinfo(dtest4, 'label'), c(0, 1, 0)) expect_equal(getinfo(dtest4, 'label'), c(0, 1, 0))
# check that feature info is saved # check that feature info is saved
data(agaricus.train, package = 'xgboost') data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(
data = agaricus.train$data, label = agaricus.train$label, nthread = n_threads
)
cnames <- colnames(dtrain) cnames <- colnames(dtrain)
expect_equal(length(cnames), 126) expect_equal(length(cnames), 126)
tmp_file <- tempfile('xgb.DMatrix_') tmp_file <- tempfile('xgb.DMatrix_')
@ -129,7 +135,7 @@ test_that("xgb.DMatrix: saving, loading", {
}) })
test_that("xgb.DMatrix: getinfo & setinfo", { test_that("xgb.DMatrix: getinfo & setinfo", {
dtest <- xgb.DMatrix(test_data) dtest <- xgb.DMatrix(test_data, nthread = n_threads)
expect_true(setinfo(dtest, 'label', test_label)) expect_true(setinfo(dtest, 'label', test_label))
labels <- getinfo(dtest, 'label') labels <- getinfo(dtest, 'label')
expect_equal(test_label, getinfo(dtest, 'label')) expect_equal(test_label, getinfo(dtest, 'label'))
@ -156,7 +162,7 @@ test_that("xgb.DMatrix: getinfo & setinfo", {
}) })
test_that("xgb.DMatrix: slice, dim", { test_that("xgb.DMatrix: slice, dim", {
dtest <- xgb.DMatrix(test_data, label = test_label) dtest <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
expect_equal(dim(dtest), dim(test_data)) expect_equal(dim(dtest), dim(test_data))
dsub1 <- slice(dtest, 1:42) dsub1 <- slice(dtest, 1:42)
expect_equal(nrow(dsub1), 42) expect_equal(nrow(dsub1), 42)
@ -171,16 +177,20 @@ test_that("xgb.DMatrix: slice, trailing empty rows", {
data(agaricus.train, package = 'xgboost') data(agaricus.train, package = 'xgboost')
train_data <- agaricus.train$data train_data <- agaricus.train$data
train_label <- agaricus.train$label train_label <- agaricus.train$label
dtrain <- xgb.DMatrix(data = train_data, label = train_label) dtrain <- xgb.DMatrix(
data = train_data, label = train_label, nthread = n_threads
)
slice(dtrain, 6513L) slice(dtrain, 6513L)
train_data[6513, ] <- 0 train_data[6513, ] <- 0
dtrain <- xgb.DMatrix(data = train_data, label = train_label) dtrain <- xgb.DMatrix(
data = train_data, label = train_label, nthread = n_threads
)
slice(dtrain, 6513L) slice(dtrain, 6513L)
expect_equal(nrow(dtrain), 6513) expect_equal(nrow(dtrain), 6513)
}) })
test_that("xgb.DMatrix: colnames", { test_that("xgb.DMatrix: colnames", {
dtest <- xgb.DMatrix(test_data, label = test_label) dtest <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
expect_equal(colnames(dtest), colnames(test_data)) expect_equal(colnames(dtest), colnames(test_data))
expect_error(colnames(dtest) <- 'asdf') expect_error(colnames(dtest) <- 'asdf')
new_names <- make.names(seq_len(ncol(test_data))) new_names <- make.names(seq_len(ncol(test_data)))
@ -196,7 +206,7 @@ test_that("xgb.DMatrix: nrow is correct for a very sparse matrix", {
x <- Matrix::rsparsematrix(nr, 100, density = 0.0005) x <- Matrix::rsparsematrix(nr, 100, density = 0.0005)
# we want it very sparse, so that last rows are empty # we want it very sparse, so that last rows are empty
expect_lt(max(x@i), nr) expect_lt(max(x@i), nr)
dtest <- xgb.DMatrix(x) dtest <- xgb.DMatrix(x, nthread = n_threads)
expect_equal(dim(dtest), dim(x)) expect_equal(dim(dtest), dim(x))
}) })
@ -205,8 +215,8 @@ test_that("xgb.DMatrix: print", {
# core DMatrix with just data and labels # core DMatrix with just data and labels
dtrain <- xgb.DMatrix( dtrain <- xgb.DMatrix(
data = agaricus.train$data data = agaricus.train$data, label = agaricus.train$label,
, label = agaricus.train$label nthread = n_threads
) )
txt <- capture.output({ txt <- capture.output({
print(dtrain) print(dtrain)
@ -222,10 +232,11 @@ test_that("xgb.DMatrix: print", {
# DMatrix with weights and base_margin # DMatrix with weights and base_margin
dtrain <- xgb.DMatrix( dtrain <- xgb.DMatrix(
data = agaricus.train$data data = agaricus.train$data,
, label = agaricus.train$label label = agaricus.train$label,
, weight = seq_along(agaricus.train$label) weight = seq_along(agaricus.train$label),
, base_margin = agaricus.train$label base_margin = agaricus.train$label,
nthread = n_threads
) )
txt <- capture.output({ txt <- capture.output({
print(dtrain) print(dtrain)
@ -234,7 +245,8 @@ test_that("xgb.DMatrix: print", {
# DMatrix with just features # DMatrix with just features
dtrain <- xgb.DMatrix( dtrain <- xgb.DMatrix(
data = agaricus.train$data data = agaricus.train$data,
nthread = n_threads
) )
txt <- capture.output({ txt <- capture.output({
print(dtrain) print(dtrain)
@ -245,7 +257,8 @@ test_that("xgb.DMatrix: print", {
data_no_colnames <- agaricus.train$data data_no_colnames <- agaricus.train$data
colnames(data_no_colnames) <- NULL colnames(data_no_colnames) <- NULL
dtrain <- xgb.DMatrix( dtrain <- xgb.DMatrix(
data = data_no_colnames data = data_no_colnames,
nthread = n_threads
) )
txt <- capture.output({ txt <- capture.output({
print(dtrain) print(dtrain)
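For reference only (not part of this changeset): a standalone sketch of the xgb.DMatrix operations exercised above — construction with an explicit nthread, getinfo/setinfo, and slice; the toy data is an illustrative assumption.

```r
library(xgboost)

n_threads <- 2
set.seed(1)
x <- matrix(rnorm(100 * 10), nrow = 100)
y <- rbinom(100, 1, 0.5)

dm <- xgb.DMatrix(x, label = y, nthread = n_threads)
dim(dm)                            # 100 x 10
setinfo(dm, "weight", rep(1, 100))
head(getinfo(dm, "label"))
dsub <- slice(dm, 1:42)            # keep the first 42 rows
nrow(dsub)
```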


@ -1,5 +1,7 @@
context("feature weights") context("feature weights")
n_threads <- 2
test_that("training with feature weights works", { test_that("training with feature weights works", {
nrows <- 1000 nrows <- 1000
ncols <- 9 ncols <- 9
@ -10,8 +12,12 @@ test_that("training with feature weights works", {
test <- function(tm) { test <- function(tm) {
names <- paste0("f", 1:ncols) names <- paste0("f", 1:ncols)
xy <- xgb.DMatrix(data = x, label = y, feature_weights = weights) xy <- xgb.DMatrix(
params <- list(colsample_bynode = 0.4, tree_method = tm, nthread = 1) data = x, label = y, feature_weights = weights, nthread = n_threads
)
params <- list(
colsample_bynode = 0.4, tree_method = tm, nthread = n_threads
)
model <- xgb.train(params = params, data = xy, nrounds = 32) model <- xgb.train(params = params, data = xy, nrounds = 32)
importance <- xgb.importance(model = model, feature_names = names) importance <- xgb.importance(model = model, feature_names = names)
expect_equal(dim(importance), c(ncols, 4)) expect_equal(dim(importance), c(ncols, 4))
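For reference only (not part of this changeset): a standalone sketch of column sampling biased by feature_weights, mirroring the test above; the simulated data and weight values are illustrative.

```r
library(xgboost)

n_threads <- 2
set.seed(1)
nrows <- 1000
ncols <- 9
x <- matrix(rnorm(nrows * ncols), nrow = nrows)
y <- as.numeric(rowSums(x[, 5:9]) > 0)
weights <- seq_len(ncols)  # later columns get proportionally higher sampling weight

dm <- xgb.DMatrix(data = x, label = y, feature_weights = weights, nthread = n_threads)
params <- list(colsample_bynode = 0.4, tree_method = "hist", nthread = n_threads)
model <- xgb.train(params = params, data = dm, nrounds = 32)
xgb.importance(model = model, feature_names = paste0("f", 1:ncols))
```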


@ -1,13 +1,19 @@
context('Test generalized linear models') context('Test generalized linear models')
n_threads <- 2
test_that("gblinear works", { test_that("gblinear works", {
data(agaricus.train, package = 'xgboost') data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) agaricus.train$data, label = agaricus.train$label, nthread = n_threads
)
dtest <- xgb.DMatrix(
agaricus.test$data, label = agaricus.test$label, nthread = n_threads
)
param <- list(objective = "binary:logistic", eval_metric = "error", booster = "gblinear", param <- list(objective = "binary:logistic", eval_metric = "error", booster = "gblinear",
nthread = 2, eta = 0.8, alpha = 0.0001, lambda = 0.0001) nthread = n_threads, eta = 0.8, alpha = 0.0001, lambda = 0.0001)
watchlist <- list(eval = dtest, train = dtrain) watchlist <- list(eval = dtest, train = dtrain)
n <- 5 # iterations n <- 5 # iterations
@ -48,12 +54,16 @@ test_that("gblinear works", {
test_that("gblinear early stopping works", { test_that("gblinear early stopping works", {
data(agaricus.train, package = 'xgboost') data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) agaricus.train$data, label = agaricus.train$label, nthread = n_threads
)
dtest <- xgb.DMatrix(
agaricus.test$data, label = agaricus.test$label, nthread = n_threads
)
param <- list( param <- list(
objective = "binary:logistic", eval_metric = "error", booster = "gblinear", objective = "binary:logistic", eval_metric = "error", booster = "gblinear",
nthread = 2, eta = 0.8, alpha = 0.0001, lambda = 0.0001, nthread = n_threads, eta = 0.8, alpha = 0.0001, lambda = 0.0001,
updater = "coord_descent" updater = "coord_descent"
) )
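For reference only (not part of this changeset): a standalone sketch of gblinear with coordinate descent and early stopping, matching the parameter set in the test above; the round counts are illustrative.

```r
library(xgboost)

n_threads <- 2
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
dtrain <- xgb.DMatrix(
  agaricus.train$data, label = agaricus.train$label, nthread = n_threads
)
dtest <- xgb.DMatrix(
  agaricus.test$data, label = agaricus.test$label, nthread = n_threads
)
param <- list(
  objective = "binary:logistic", eval_metric = "error", booster = "gblinear",
  nthread = n_threads, eta = 0.8, alpha = 0.0001, lambda = 0.0001,
  updater = "coord_descent"
)
bst <- xgb.train(
  param, dtrain, nrounds = 50,
  watchlist = list(eval = dtest, train = dtrain),
  early_stopping_rounds = 5, verbose = 0
)
bst$best_iteration
```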


@ -171,6 +171,7 @@ test_that("SHAPs sum to predictions, with or without DART", {
fit <- xgboost( fit <- xgboost(
params = c( params = c(
list( list(
nthread = 2,
booster = booster, booster = booster,
objective = "reg:squarederror", objective = "reg:squarederror",
eval_metric = "rmse"), eval_metric = "rmse"),
@ -257,7 +258,7 @@ test_that("xgb.Booster serializing as R object works", {
.skip_if_vcd_not_available() .skip_if_vcd_not_available()
saveRDS(bst.Tree, 'xgb.model.rds') saveRDS(bst.Tree, 'xgb.model.rds')
bst <- readRDS('xgb.model.rds') bst <- readRDS('xgb.model.rds')
dtrain <- xgb.DMatrix(sparse_matrix, label = label) dtrain <- xgb.DMatrix(sparse_matrix, label = label, nthread = 2)
expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance) expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
expect_equal(xgb.dump(bst.Tree), xgb.dump(bst)) expect_equal(xgb.dump(bst.Tree), xgb.dump(bst))
xgb.save(bst, 'xgb.model') xgb.save(bst, 'xgb.model')
@ -363,7 +364,8 @@ test_that("xgb.importance works with and without feature names", {
data = as.matrix(data.frame(x = c(0, 1))), data = as.matrix(data.frame(x = c(0, 1))),
label = c(1, 2), label = c(1, 2),
nrounds = 1, nrounds = 1,
base_score = 0.5 base_score = 0.5,
nthread = 2
) )
df <- xgb.model.dt.tree(model = m) df <- xgb.model.dt.tree(model = m)
expect_equal(df$Feature, "Leaf") expect_equal(df$Feature, "Leaf")


@ -2,6 +2,8 @@ require(xgboost)
context("interaction constraints") context("interaction constraints")
n_threads <- 2
set.seed(1024) set.seed(1024)
x1 <- rnorm(1000, 1) x1 <- rnorm(1000, 1)
x2 <- rnorm(1000, 1) x2 <- rnorm(1000, 1)
@ -45,11 +47,18 @@ test_that("interaction constraints scientific representation", {
d <- matrix(rexp(rows, rate = .1), nrow = rows, ncol = cols) d <- matrix(rexp(rows, rate = .1), nrow = rows, ncol = cols)
y <- rnorm(rows) y <- rnorm(rows)
dtrain <- xgb.DMatrix(data = d, info = list(label = y)) dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
inc <- list(c(seq.int(from = 0, to = cols, by = 1))) inc <- list(c(seq.int(from = 0, to = cols, by = 1)))
with_inc <- xgb.train(data = dtrain, tree_method = 'hist', with_inc <- xgb.train(
interaction_constraints = inc, nrounds = 10) data = dtrain,
without_inc <- xgb.train(data = dtrain, tree_method = 'hist', nrounds = 10) tree_method = 'hist',
interaction_constraints = inc,
nrounds = 10,
nthread = n_threads
)
without_inc <- xgb.train(
data = dtrain, tree_method = 'hist', nrounds = 10, nthread = n_threads
)
expect_equal(xgb.save.raw(with_inc), xgb.save.raw(without_inc)) expect_equal(xgb.save.raw(with_inc), xgb.save.raw(without_inc))
}) })
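For reference only (not part of this changeset): a standalone sketch of interaction_constraints, which takes a list of 0-based feature-index groups; the simulated data below is illustrative.

```r
library(xgboost)

n_threads <- 2
set.seed(1024)
x1 <- rnorm(1000)
x2 <- rnorm(1000)
x3 <- rnorm(1000)
y <- x1 + x2 + x1 * x2 + rnorm(1000, sd = 0.1)
dtrain <- xgb.DMatrix(cbind(x1, x2, x3), label = y, nthread = n_threads)

# Features are addressed by 0-based index: x1 and x2 may interact,
# while x3 is confined to its own group.
constraints <- list(c(0, 1), c(2))
bst <- xgb.train(
  data = dtrain, tree_method = "hist", nthread = n_threads,
  interaction_constraints = constraints, nrounds = 10
)
```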


@ -1,6 +1,7 @@
context('Test prediction of feature interactions') context('Test prediction of feature interactions')
set.seed(123) set.seed(123)
n_threads <- 2
test_that("predict feature interactions works", { test_that("predict feature interactions works", {
# simulate some binary data and a linear outcome with an interaction term # simulate some binary data and a linear outcome with an interaction term
@ -19,8 +20,10 @@ test_that("predict feature interactions works", {
y <- f_int(X) y <- f_int(X)
dm <- xgb.DMatrix(X, label = y) dm <- xgb.DMatrix(X, label = y, nthread = n_threads)
param <- list(eta = 0.1, max_depth = 4, base_score = mean(y), lambda = 0, nthread = 2) param <- list(
eta = 0.1, max_depth = 4, base_score = mean(y), lambda = 0, nthread = n_threads
)
b <- xgb.train(param, dm, 100) b <- xgb.train(param, dm, 100)
pred <- predict(b, dm, outputmargin = TRUE) pred <- predict(b, dm, outputmargin = TRUE)
@ -99,11 +102,13 @@ test_that("SHAP contribution values are not NAN", {
verbose = 0, verbose = 0,
params = list( params = list(
objective = "reg:squarederror", objective = "reg:squarederror",
eval_metric = "rmse"), eval_metric = "rmse",
nthread = n_threads
),
data = as.matrix(subset(d, fold == 2)[, ivs]), data = as.matrix(subset(d, fold == 2)[, ivs]),
label = subset(d, fold == 2)$y, label = subset(d, fold == 2)$y,
nthread = 1, nrounds = 3
nrounds = 3) )
shaps <- as.data.frame(predict(fit, shaps <- as.data.frame(predict(fit,
newdata = as.matrix(subset(d, fold == 1)[, ivs]), newdata = as.matrix(subset(d, fold == 1)[, ivs]),
@ -116,8 +121,12 @@ test_that("SHAP contribution values are not NAN", {
test_that("multiclass feature interactions work", { test_that("multiclass feature interactions work", {
dm <- xgb.DMatrix(as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1) dm <- xgb.DMatrix(
param <- list(eta = 0.1, max_depth = 4, objective = 'multi:softprob', num_class = 3) as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1, nthread = n_threads
)
param <- list(
eta = 0.1, max_depth = 4, objective = 'multi:softprob', num_class = 3, nthread = n_threads
)
b <- xgb.train(param, dm, 40) b <- xgb.train(param, dm, 40)
pred <- t( pred <- t(
array( array(
@ -166,6 +175,7 @@ test_that("SHAP single sample works", {
max_depth = 2, max_depth = 2,
nrounds = 4, nrounds = 4,
objective = "binary:logistic", objective = "binary:logistic",
nthread = n_threads
) )
predt <- predict( predt <- predict(
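For reference only (not part of this changeset): a standalone sketch of the property behind the SHAP tests above — per-feature contributions (plus the BIAS column) sum to the margin prediction; the agaricus data ships with the package, the parameters are illustrative.

```r
library(xgboost)

n_threads <- 2
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(
  agaricus.train$data, label = agaricus.train$label, nthread = n_threads
)
param <- list(objective = "binary:logistic", max_depth = 2, nthread = n_threads)
bst <- xgb.train(param, dtrain, nrounds = 4)

contrib <- predict(bst, dtrain, predcontrib = TRUE)  # features + BIAS column
margin <- predict(bst, dtrain, outputmargin = TRUE)
all.equal(as.numeric(rowSums(contrib)), margin, tolerance = 1e-5)
```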


@ -9,7 +9,8 @@ test_that("load/save raw works", {
nrounds <- 8 nrounds <- 8
booster <- xgboost( booster <- xgboost(
data = train$data, label = train$label, data = train$data, label = train$label,
nrounds = nrounds, objective = "binary:logistic" nrounds = nrounds, objective = "binary:logistic",
nthread = 2
) )
json_bytes <- xgb.save.raw(booster, raw_format = "json") json_bytes <- xgb.save.raw(booster, raw_format = "json")


@ -9,20 +9,20 @@ metadata <- list(
kClasses = 3 kClasses = 3
) )
run_model_param_check <- function (config) { run_model_param_check <- function(config) {
testthat::expect_equal(config$learner$learner_model_param$num_feature, '4') testthat::expect_equal(config$learner$learner_model_param$num_feature, '4')
testthat::expect_equal(config$learner$learner_train_param$booster, 'gbtree') testthat::expect_equal(config$learner$learner_train_param$booster, 'gbtree')
} }
get_num_tree <- function (booster) { get_num_tree <- function(booster) {
dump <- xgb.dump(booster) dump <- xgb.dump(booster)
m <- regexec('booster\\[[0-9]+\\]', dump, perl = TRUE) m <- regexec('booster\\[[0-9]+\\]', dump, perl = TRUE)
m <- regmatches(dump, m) m <- regmatches(dump, m)
num_tree <- Reduce('+', lapply(m, length)) num_tree <- Reduce('+', lapply(m, length))
return (num_tree) return(num_tree)
} }
run_booster_check <- function (booster, name) { run_booster_check <- function(booster, name) {
# If given a handle, we need to call xgb.Booster.complete() prior to using xgb.config(). # If given a handle, we need to call xgb.Booster.complete() prior to using xgb.config().
if (inherits(booster, "xgb.Booster") && xgboost:::is.null.handle(booster$handle)) { if (inherits(booster, "xgb.Booster") && xgboost:::is.null.handle(booster$handle)) {
booster <- xgb.Booster.complete(booster) booster <- xgb.Booster.complete(booster)
@ -66,9 +66,9 @@ test_that("Models from previous versions of XGBoost can be loaded", {
unzip(zipfile, exdir = extract_dir, overwrite = TRUE) unzip(zipfile, exdir = extract_dir, overwrite = TRUE)
model_dir <- file.path(extract_dir, 'models') model_dir <- file.path(extract_dir, 'models')
pred_data <- xgb.DMatrix(matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4)) pred_data <- xgb.DMatrix(matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4), nthread = 2)
lapply(list.files(model_dir), function (x) { lapply(list.files(model_dir), function(x) {
model_file <- file.path(model_dir, x) model_file <- file.path(model_dir, x)
m <- regexec("xgboost-([0-9\\.]+)\\.([a-z]+)\\.[a-z]+", model_file, perl = TRUE) m <- regexec("xgboost-([0-9\\.]+)\\.([a-z]+)\\.[a-z]+", model_file, perl = TRUE)
m <- regmatches(model_file, m)[[1]] m <- regmatches(model_file, m)[[1]]
@ -87,6 +87,7 @@ test_that("Models from previous versions of XGBoost can be loaded", {
booster <- readRDS(model_file) booster <- readRDS(model_file)
} else { } else {
booster <- xgb.load(model_file) booster <- xgb.load(model_file)
xgb.parameters(booster) <- list(nthread = 2)
} }
predict(booster, newdata = pred_data) predict(booster, newdata = pred_data)
run_booster_check(booster, name) run_booster_check(booster, name)


@ -3,8 +3,12 @@ context('Test model params and call are exposed to R')
data(agaricus.train, package = 'xgboost') data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtrain <- xgb.DMatrix(
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) agaricus.train$data, label = agaricus.train$label, nthread = 2
)
dtest <- xgb.DMatrix(
agaricus.test$data, label = agaricus.test$label, nthread = 2
)
bst <- xgboost(data = dtrain, bst <- xgboost(data = dtrain,
max_depth = 2, max_depth = 2,


@ -4,8 +4,10 @@ set.seed(1994)
test_that("Poisson regression works", { test_that("Poisson regression works", {
data(mtcars) data(mtcars)
bst <- xgboost(data = as.matrix(mtcars[, -11]), label = mtcars[, 11], bst <- xgboost(
objective = 'count:poisson', nrounds = 10, verbose = 0) data = as.matrix(mtcars[, -11]), label = mtcars[, 11],
objective = 'count:poisson', nrounds = 10, verbose = 0, nthread = 2
)
expect_equal(class(bst), "xgb.Booster") expect_equal(class(bst), "xgb.Booster")
pred <- predict(bst, as.matrix(mtcars[, -11])) pred <- predict(bst, as.matrix(mtcars[, -11]))
expect_equal(length(pred), 32) expect_equal(length(pred), 32)


@ -1,5 +1,7 @@
context('Learning to rank') context('Learning to rank')
n_threads <- 2
test_that('Test ranking with unweighted data', { test_that('Test ranking with unweighted data', {
X <- Matrix::sparseMatrix( X <- Matrix::sparseMatrix(
i = c(2, 3, 7, 9, 12, 15, 17, 18) i = c(2, 3, 7, 9, 12, 15, 17, 18)
@ -9,10 +11,10 @@ test_that('Test ranking with unweighted data', {
) )
y <- c(0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0) y <- c(0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0)
group <- c(5, 5, 5, 5) group <- c(5, 5, 5, 5)
dtrain <- xgb.DMatrix(X, label = y, group = group) dtrain <- xgb.DMatrix(X, label = y, group = group, nthread = n_threads)
params <- list(eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1, params <- list(eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1,
eval_metric = 'auc', eval_metric = 'aucpr') eval_metric = 'auc', eval_metric = 'aucpr', nthread = n_threads)
bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain)) bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
# Check if the metric is monotone increasing # Check if the metric is monotone increasing
expect_true(all(diff(bst$evaluation_log$train_auc) >= 0)) expect_true(all(diff(bst$evaluation_log$train_auc) >= 0))
@ -29,10 +31,14 @@ test_that('Test ranking with weighted data', {
y <- c(0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0) y <- c(0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0)
group <- c(5, 5, 5, 5) group <- c(5, 5, 5, 5)
weight <- c(1.0, 2.0, 3.0, 4.0) weight <- c(1.0, 2.0, 3.0, 4.0)
dtrain <- xgb.DMatrix(X, label = y, group = group, weight = weight) dtrain <- xgb.DMatrix(
X, label = y, group = group, weight = weight, nthread = n_threads
)
params <- list(eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1, params <- list(
eval_metric = 'auc', eval_metric = 'aucpr') eta = 1, tree_method = "exact", objective = "rank:pairwise", max_depth = 1,
eval_metric = "auc", eval_metric = "aucpr", nthread = n_threads
)
bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain)) bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
# Check if the metric is monotone increasing # Check if the metric is monotone increasing
expect_true(all(diff(bst$evaluation_log$train_auc) >= 0)) expect_true(all(diff(bst$evaluation_log$train_auc) >= 0))
@ -41,7 +47,7 @@ test_that('Test ranking with weighted data', {
pred <- predict(bst, newdata = dtrain, ntreelimit = i) pred <- predict(bst, newdata = dtrain, ntreelimit = i)
# is_sorted[i]: is i-th group correctly sorted by the ranking predictor? # is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
is_sorted <- lapply(seq(1, 20, by = 5), is_sorted <- lapply(seq(1, 20, by = 5),
function (k) { function(k) {
ind <- order(-pred[k:(k + 4)]) ind <- order(-pred[k:(k + 4)])
z <- y[ind + (k - 1)] z <- y[ind + (k - 1)]
all(diff(z) <= 0) # Check if z is monotone decreasing all(diff(z) <= 0) # Check if z is monotone decreasing
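For reference only (not part of this changeset): a standalone sketch of training a pairwise ranker with query groups, following the parameter set in the tests above; the toy data is illustrative.

```r
library(xgboost)

n_threads <- 2
set.seed(1)
X <- matrix(rnorm(20 * 4), nrow = 20)
y <- rbinom(20, 1, 0.5)
group <- c(5, 5, 5, 5)  # four query groups of five documents each

dtrain <- xgb.DMatrix(X, label = y, group = group, nthread = n_threads)
params <- list(
  eta = 1, tree_method = "exact", objective = "rank:pairwise",
  max_depth = 1, eval_metric = "auc", nthread = n_threads
)
bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
head(bst$evaluation_log)
```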


@ -16,6 +16,7 @@ test_that("Can save and load models with Unicode paths", {
path <- file.path(tmpdir, x) path <- file.path(tmpdir, x)
xgb.save(bst, path) xgb.save(bst, path)
bst2 <- xgb.load(path) bst2 <- xgb.load(path)
xgb.parameters(bst2) <- list(nthread = 2)
expect_equal(predict(bst, test$data), predict(bst2, test$data)) expect_equal(predict(bst, test$data), predict(bst2, test$data))
}) })
}) })
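For reference only (not part of this changeset): the recurring pattern in this changeset is that a booster loaded with xgb.load() is given an explicit nthread via xgb.parameters() before it is used. A standalone sketch, with an illustrative temporary file name:

```r
library(xgboost)

n_threads <- 2
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
bst <- xgboost(
  data = agaricus.train$data, label = agaricus.train$label,
  max_depth = 2, eta = 1, nthread = n_threads, nrounds = 2,
  objective = "binary:logistic"
)

path <- file.path(tempdir(), "model.json")
xgb.save(bst, path)
bst2 <- xgb.load(path)
# The freshly loaded booster carries no thread setting; configure it before use.
xgb.parameters(bst2) <- list(nthread = n_threads)
all.equal(predict(bst, agaricus.test$data), predict(bst2, agaricus.test$data))
```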


@ -2,8 +2,15 @@ context("update trees in an existing model")
data(agaricus.train, package = 'xgboost') data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost') data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) n_threads <- 1
dtrain <- xgb.DMatrix(
agaricus.train$data, label = agaricus.train$label, nthread = n_threads
)
dtest <- xgb.DMatrix(
agaricus.test$data, label = agaricus.test$label, nthread = n_threads
)
# Disable flaky tests for 32-bit Windows. # Disable flaky tests for 32-bit Windows.
# See https://github.com/dmlc/xgboost/issues/3720 # See https://github.com/dmlc/xgboost/issues/3720
@ -14,7 +21,7 @@ test_that("updating the model works", {
# no-subsampling # no-subsampling
p1 <- list( p1 <- list(
objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2, objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = n_threads,
updater = "grow_colmaker,prune" updater = "grow_colmaker,prune"
) )
set.seed(11) set.seed(11)
@ -86,9 +93,11 @@ test_that("updating the model works", {
}) })
test_that("updating works for multiclass & multitree", { test_that("updating works for multiclass & multitree", {
dtr <- xgb.DMatrix(as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1) dtr <- xgb.DMatrix(
as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1, nthread = n_threads
)
watchlist <- list(train = dtr) watchlist <- list(train = dtr)
p0 <- list(max_depth = 2, eta = 0.5, nthread = 2, subsample = 0.6, p0 <- list(max_depth = 2, eta = 0.5, nthread = n_threads, subsample = 0.6,
objective = "multi:softprob", num_class = 3, num_parallel_tree = 2, objective = "multi:softprob", num_class = 3, num_parallel_tree = 2,
base_score = 0) base_score = 0)
set.seed(121) set.seed(121)
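For reference only (not part of this changeset): a sketch of the tree-refresh flow these tests build on. The exact test bodies are not shown in the diff; process_type = "update" with the refresh updater and an xgb_model argument is the documented way to re-fit leaf statistics of existing trees, but treat the parameter values here as illustrative.

```r
library(xgboost)

n_threads <- 1
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(
  agaricus.train$data, label = agaricus.train$label, nthread = n_threads
)
params <- list(
  objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = n_threads,
  updater = "grow_colmaker,prune"
)
bst <- xgb.train(params, dtrain, nrounds = 10,
                 watchlist = list(train = dtrain), verbose = 0)

# Refresh the leaf statistics of the existing trees against (possibly new) data,
# instead of growing additional trees.
params_update <- modifyList(
  params, list(process_type = "update", updater = "refresh", refresh_leaf = FALSE)
)
bst_refreshed <- xgb.train(
  params_update, dtrain, nrounds = 10,
  watchlist = list(train = dtrain), verbose = 0, xgb_model = bst
)
```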


@ -31,6 +31,8 @@ require(data.table)
if (!require('vcd')) { if (!require('vcd')) {
install.packages('vcd') install.packages('vcd')
} }
data.table::setDTthreads(2)
``` ```
> **VCD** package is used for one of its embedded dataset only. > **VCD** package is used for one of its embedded dataset only.
@ -297,23 +299,25 @@ test <- agaricus.test
#Random Forest - 1000 trees #Random Forest - 1000 trees
bst <- xgboost( bst <- xgboost(
data = train$data data = train$data,
, label = train$label label = train$label,
, max_depth = 4 max_depth = 4,
, num_parallel_tree = 1000 num_parallel_tree = 1000,
, subsample = 0.5 subsample = 0.5,
, colsample_bytree = 0.5 colsample_bytree = 0.5,
, nrounds = 1 nrounds = 1,
, objective = "binary:logistic" objective = "binary:logistic",
nthread = 2
) )
#Boosting - 3 rounds #Boosting - 3 rounds
bst <- xgboost( bst <- xgboost(
data = train$data data = train$data,
, label = train$label label = train$label,
, max_depth = 4 max_depth = 4,
, nrounds = 3 nrounds = 3,
, objective = "binary:logistic" objective = "binary:logistic",
nthread = 2
) )
``` ```


@ -86,9 +86,10 @@ data(agaricus.test, package='xgboost')
train <- agaricus.train train <- agaricus.train
test <- agaricus.test test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1, bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1,
nrounds = 2, objective = "binary:logistic") nrounds = 2, objective = "binary:logistic", nthread = 2)
xgb.save(bst, 'model.save') xgb.save(bst, 'model.save')
bst = xgb.load('model.save') bst = xgb.load('model.save')
xgb.parameters(bst) <- list(nthread = 2)
pred <- predict(bst, test$data) pred <- predict(bst, test$data)
@ @
@ -127,7 +128,7 @@ training from initial prediction value, weighted training instance.
We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object: We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
<<xgb.DMatrix>>= <<xgb.DMatrix>>=
dtrain <- xgb.DMatrix(train$data, label = train$label) dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2)
class(dtrain) class(dtrain)
head(getinfo(dtrain,'label')) head(getinfo(dtrain,'label'))
@ @
@ -161,9 +162,9 @@ evalerror <- function(preds, dtrain) {
return(list(metric = "MSE", value = err)) return(list(metric = "MSE", value = err))
} }
dtest <- xgb.DMatrix(test$data, label = test$label) dtest <- xgb.DMatrix(test$data, label = test$label, nthread = 2)
watchlist <- list(eval = dtest, train = dtrain) watchlist <- list(eval = dtest, train = dtrain)
param <- list(max_depth = 2, eta = 1) param <- list(max_depth = 2, eta = 1, nthread = 2)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, logregobj, evalerror, maximize = FALSE) bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, logregobj, evalerror, maximize = FALSE)
@ @


@ -173,13 +173,13 @@ Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R**
```{r trainingDense, message=F, warning=F} ```{r trainingDense, message=F, warning=F}
bstDense <- xgboost( bstDense <- xgboost(
data = as.matrix(train$data) data = as.matrix(train$data),
, label = train$label label = train$label,
, max_depth = 2 max_depth = 2,
, eta = 1 eta = 1,
, nthread = 2 nthread = 2,
, nrounds = 2 nrounds = 2,
, objective = "binary:logistic" objective = "binary:logistic"
) )
``` ```
@ -188,14 +188,14 @@ bstDense <- xgboost(
**XGBoost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be useful for the most advanced features we will discover later. **XGBoost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be useful for the most advanced features we will discover later.
```{r trainingDmatrix, message=F, warning=F} ```{r trainingDmatrix, message=F, warning=F}
dtrain <- xgb.DMatrix(data = train$data, label = train$label) dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2)
bstDMatrix <- xgboost( bstDMatrix <- xgboost(
data = dtrain data = dtrain,
, max_depth = 2 max_depth = 2,
, eta = 1 eta = 1,
, nthread = 2 nthread = 2,
, nrounds = 2 nrounds = 2,
, objective = "binary:logistic" objective = "binary:logistic"
) )
``` ```
@ -314,8 +314,8 @@ Most of the features below have been implemented to help you to improve your mod
For the following advanced features, we need to put data in `xgb.DMatrix` as explained above. For the following advanced features, we need to put data in `xgb.DMatrix` as explained above.
```{r DMatrix, message=F, warning=F} ```{r DMatrix, message=F, warning=F}
dtrain <- xgb.DMatrix(data = train$data, label = train$label) dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2)
dtest <- xgb.DMatrix(data = test$data, label = test$label) dtest <- xgb.DMatrix(data = test$data, label = test$label, nthread = 2)
``` ```
### Measure learning progress with xgb.train ### Measure learning progress with xgb.train
@ -476,6 +476,7 @@ An interesting test to see how identical our saved model is to the original one
```{r loadModel, message=F, warning=F} ```{r loadModel, message=F, warning=F}
# load binary model to R # load binary model to R
bst2 <- xgb.load("xgboost.model") bst2 <- xgb.load("xgboost.model")
xgb.parameters(bst2) <- list(nthread = 2)
pred2 <- predict(bst2, test$data) pred2 <- predict(bst2, test$data)
# And now the test # And now the test
@ -500,6 +501,7 @@ print(class(rawVec))
# load binary model to R # load binary model to R
bst3 <- xgb.load(rawVec) bst3 <- xgb.load(rawVec)
xgb.parameters(bst3) <- list(nthread = 2)
pred3 <- predict(bst3, test$data) pred3 <- predict(bst3, test$data)
# pred2 should be identical to pred # pred2 should be identical to pred


@ -1,16 +1,17 @@
function (run_doxygen) function(run_doxygen)
find_package(Doxygen REQUIRED) find_package(Doxygen REQUIRED)
if (NOT DOXYGEN_DOT_FOUND) if(NOT DOXYGEN_DOT_FOUND)
message(FATAL_ERROR "Command `dot` not found. Please install graphviz.") message(FATAL_ERROR "Command `dot` not found. Please install graphviz.")
endif (NOT DOXYGEN_DOT_FOUND) endif()
configure_file( configure_file(
${xgboost_SOURCE_DIR}/doc/Doxyfile.in ${xgboost_SOURCE_DIR}/doc/Doxyfile.in
${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY) ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY)
add_custom_target( doc_doxygen ALL add_custom_target(
doc_doxygen ALL
COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
COMMENT "Generate C APIs documentation." COMMENT "Generate C APIs documentation."
VERBATIM) VERBATIM)
endfunction (run_doxygen) endfunction()


@ -1,4 +1,4 @@
function (find_prefetch_intrinsics) function(find_prefetch_intrinsics)
include(CheckCXXSourceCompiles) include(CheckCXXSourceCompiles)
check_cxx_source_compiles(" check_cxx_source_compiles("
#include <xmmintrin.h> #include <xmmintrin.h>
@ -19,4 +19,4 @@ function (find_prefetch_intrinsics)
" XGBOOST_BUILTIN_PREFETCH_PRESENT) " XGBOOST_BUILTIN_PREFETCH_PRESENT)
set(XGBOOST_MM_PREFETCH_PRESENT ${XGBOOST_MM_PREFETCH_PRESENT} PARENT_SCOPE) set(XGBOOST_MM_PREFETCH_PRESENT ${XGBOOST_MM_PREFETCH_PRESENT} PARENT_SCOPE)
set(XGBOOST_BUILTIN_PREFETCH_PRESENT ${XGBOOST_BUILTIN_PREFETCH_PRESENT} PARENT_SCOPE) set(XGBOOST_BUILTIN_PREFETCH_PRESENT ${XGBOOST_BUILTIN_PREFETCH_PRESENT} PARENT_SCOPE)
endfunction (find_prefetch_intrinsics) endfunction()


@ -12,9 +12,9 @@ macro(enable_sanitizer sanitizer)
elseif(${sanitizer} MATCHES "thread") elseif(${sanitizer} MATCHES "thread")
find_package(TSan) find_package(TSan)
set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=thread") set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=thread")
if (TSan_FOUND) if(TSan_FOUND)
link_libraries(${TSan_LIBRARY}) link_libraries(${TSan_LIBRARY})
endif (TSan_FOUND) endif()
elseif(${sanitizer} MATCHES "leak") elseif(${sanitizer} MATCHES "leak")
find_package(LSan) find_package(LSan)
@ -33,16 +33,16 @@ macro(enable_sanitizers SANITIZERS)
# Check sanitizers compatibility. # Check sanitizers compatibility.
# Idealy, we should use if(san IN_LIST SANITIZERS) ... endif() # Idealy, we should use if(san IN_LIST SANITIZERS) ... endif()
# But I haven't figure out how to make it work. # But I haven't figure out how to make it work.
foreach ( _san ${SANITIZERS} ) foreach( _san ${SANITIZERS} )
string(TOLOWER ${_san} _san) string(TOLOWER ${_san} _san)
if (_san MATCHES "thread") if(_san MATCHES "thread")
if (${_use_other_sanitizers}) if(${_use_other_sanitizers})
message(FATAL_ERROR message(FATAL_ERROR
"thread sanitizer is not compatible with ${_san} sanitizer.") "thread sanitizer is not compatible with ${_san} sanitizer.")
endif() endif()
set(_use_thread_sanitizer 1) set(_use_thread_sanitizer 1)
else () else()
if (${_use_thread_sanitizer}) if(${_use_thread_sanitizer})
message(FATAL_ERROR message(FATAL_ERROR
"${_san} sanitizer is not compatible with thread sanitizer.") "${_san} sanitizer is not compatible with thread sanitizer.")
endif() endif()


@ -11,7 +11,7 @@ function(auto_source_group SOURCES)
source_group("${GROUP}" FILES "${FILE}") source_group("${GROUP}" FILES "${FILE}")
endforeach() endforeach()
endfunction(auto_source_group) endfunction()
# Force static runtime for MSVC # Force static runtime for MSVC
function(msvc_use_static_runtime) function(msvc_use_static_runtime)
@ -50,7 +50,7 @@ function(msvc_use_static_runtime)
endif() endif()
endforeach() endforeach()
endif() endif()
endfunction(msvc_use_static_runtime) endfunction()
# Set output directory of target, ignoring debug or release # Set output directory of target, ignoring debug or release
function(set_output_directory target dir) function(set_output_directory target dir)
@ -70,7 +70,7 @@ function(set_output_directory target dir)
ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${dir} ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${dir}
ARCHIVE_OUTPUT_DIRECTORY_RELWITHDEBINFO ${dir} ARCHIVE_OUTPUT_DIRECTORY_RELWITHDEBINFO ${dir}
ARCHIVE_OUTPUT_DIRECTORY_MINSIZEREL ${dir}) ARCHIVE_OUTPUT_DIRECTORY_MINSIZEREL ${dir})
endfunction(set_output_directory) endfunction()
# Set a default build type to release if none was specified # Set a default build type to release if none was specified
function(set_default_configuration_release) function(set_default_configuration_release)
@ -78,9 +78,9 @@ function(set_default_configuration_release)
set(CMAKE_CONFIGURATION_TYPES Release CACHE STRING "" FORCE) set(CMAKE_CONFIGURATION_TYPES Release CACHE STRING "" FORCE)
elseif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) elseif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
message(STATUS "Setting build type to 'Release' as none was specified.") message(STATUS "Setting build type to 'Release' as none was specified.")
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE ) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
endif() endif()
endfunction(set_default_configuration_release) endfunction()
# Generate nvcc compiler flags given a list of architectures # Generate nvcc compiler flags given a list of architectures
# Also generates PTX for the most recent architecture for forwards compatibility # Also generates PTX for the most recent architecture for forwards compatibility
@ -90,9 +90,9 @@ function(format_gencode_flags flags out)
endif() endif()
# Set up architecture flags # Set up architecture flags
if(NOT flags) if(NOT flags)
if (CUDA_VERSION VERSION_GREATER_EQUAL "11.8") if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8")
set(flags "50;60;70;80;90") set(flags "50;60;70;80;90")
elseif (CUDA_VERSION VERSION_GREATER_EQUAL "11.0") elseif(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
set(flags "50;60;70;80") set(flags "50;60;70;80")
elseif(CUDA_VERSION VERSION_GREATER_EQUAL "10.0") elseif(CUDA_VERSION VERSION_GREATER_EQUAL "10.0")
set(flags "35;50;60;70") set(flags "35;50;60;70")
@ -103,7 +103,7 @@ function(format_gencode_flags flags out)
endif() endif()
endif() endif()
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
cmake_policy(SET CMP0104 NEW) cmake_policy(SET CMP0104 NEW)
list(GET flags -1 latest_arch) list(GET flags -1 latest_arch)
list(TRANSFORM flags APPEND "-real") list(TRANSFORM flags APPEND "-real")
@ -121,8 +121,8 @@ function(format_gencode_flags flags out)
set(${out} "${${out}}--generate-code=arch=compute_${ver},code=compute_${ver};") set(${out} "${${out}}--generate-code=arch=compute_${ver},code=compute_${ver};")
set(${out} "${${out}}" PARENT_SCOPE) set(${out} "${${out}}" PARENT_SCOPE)
message(STATUS "CUDA GEN_CODE: ${GEN_CODE}") message(STATUS "CUDA GEN_CODE: ${GEN_CODE}")
endif (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") endif()
endfunction(format_gencode_flags flags) endfunction()
# Set CUDA related flags to target. Must be used after code `format_gencode_flags`. # Set CUDA related flags to target. Must be used after code `format_gencode_flags`.
function(xgboost_set_cuda_flags target) function(xgboost_set_cuda_flags target)
@ -133,35 +133,35 @@ function(xgboost_set_cuda_flags target)
$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${OpenMP_CXX_FLAGS}> $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${OpenMP_CXX_FLAGS}>
$<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all>) $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all>)
if (USE_PER_THREAD_DEFAULT_STREAM) if(USE_PER_THREAD_DEFAULT_STREAM)
target_compile_options(${target} PRIVATE target_compile_options(${target} PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--default-stream per-thread>) $<$<COMPILE_LANGUAGE:CUDA>:--default-stream per-thread>)
endif (USE_PER_THREAD_DEFAULT_STREAM) endif()
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
set_property(TARGET ${target} PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) set_property(TARGET ${target} PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES})
endif (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") endif()
if (FORCE_COLORED_OUTPUT) if(FORCE_COLORED_OUTPUT)
if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
(CMAKE_CXX_COMPILER_ID STREQUAL "Clang"))) (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
target_compile_options(${target} PRIVATE target_compile_options(${target} PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fdiagnostics-color=always>) $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fdiagnostics-color=always>)
endif() endif()
endif (FORCE_COLORED_OUTPUT) endif()
if (USE_DEVICE_DEBUG) if(USE_DEVICE_DEBUG)
target_compile_options(${target} PRIVATE target_compile_options(${target} PRIVATE
$<$<AND:$<CONFIG:DEBUG>,$<COMPILE_LANGUAGE:CUDA>>:-G;-src-in-ptx>) $<$<AND:$<CONFIG:DEBUG>,$<COMPILE_LANGUAGE:CUDA>>:-G;-src-in-ptx>)
else (USE_DEVICE_DEBUG) else()
target_compile_options(${target} PRIVATE target_compile_options(${target} PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>) $<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>)
endif (USE_DEVICE_DEBUG) endif()
if (USE_NVTX) if(USE_NVTX)
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1) target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
endif (USE_NVTX) endif()
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1) target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1)
target_include_directories( target_include_directories(
@ -169,17 +169,17 @@ function(xgboost_set_cuda_flags target)
${xgboost_SOURCE_DIR}/gputreeshap ${xgboost_SOURCE_DIR}/gputreeshap
${CUDAToolkit_INCLUDE_DIRS}) ${CUDAToolkit_INCLUDE_DIRS})
if (MSVC) if(MSVC)
target_compile_options(${target} PRIVATE target_compile_options(${target} PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>) $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>)
endif (MSVC) endif()
set_target_properties(${target} PROPERTIES set_target_properties(${target} PROPERTIES
CUDA_STANDARD 17 CUDA_STANDARD 17
CUDA_STANDARD_REQUIRED ON CUDA_STANDARD_REQUIRED ON
CUDA_SEPARABLE_COMPILATION OFF CUDA_SEPARABLE_COMPILATION OFF
CUDA_RUNTIME_LIBRARY Static) CUDA_RUNTIME_LIBRARY Static)
endfunction(xgboost_set_cuda_flags) endfunction()
# Set HIP related flags to target. # Set HIP related flags to target.
function(xgboost_set_hip_flags target) function(xgboost_set_hip_flags target)
@ -199,16 +199,16 @@ function(xgboost_set_hip_flags target)
endfunction(xgboost_set_hip_flags) endfunction(xgboost_set_hip_flags)
macro(xgboost_link_nccl target) macro(xgboost_link_nccl target)
if (BUILD_STATIC_LIB) if(BUILD_STATIC_LIB)
target_include_directories(${target} PUBLIC ${NCCL_INCLUDE_DIR}) target_include_directories(${target} PUBLIC ${NCCL_INCLUDE_DIR})
target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_NCCL=1) target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_NCCL=1)
target_link_libraries(${target} PUBLIC ${NCCL_LIBRARY}) target_link_libraries(${target} PUBLIC ${NCCL_LIBRARY})
else () else()
target_include_directories(${target} PRIVATE ${NCCL_INCLUDE_DIR}) target_include_directories(${target} PRIVATE ${NCCL_INCLUDE_DIR})
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NCCL=1) target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NCCL=1)
target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY}) target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY})
endif (BUILD_STATIC_LIB) endif()
endmacro(xgboost_link_nccl) endmacro()
# compile options # compile options
macro(xgboost_target_properties target) macro(xgboost_target_properties target)
@ -217,110 +217,106 @@ macro(xgboost_target_properties target)
CXX_STANDARD_REQUIRED ON CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON) POSITION_INDEPENDENT_CODE ON)
if (HIDE_CXX_SYMBOLS) if(HIDE_CXX_SYMBOLS)
#-- Hide all C++ symbols #-- Hide all C++ symbols
set_target_properties(${target} PROPERTIES set_target_properties(${target} PROPERTIES
C_VISIBILITY_PRESET hidden C_VISIBILITY_PRESET hidden
CXX_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden
CUDA_VISIBILITY_PRESET hidden CUDA_VISIBILITY_PRESET hidden
) )
endif (HIDE_CXX_SYMBOLS) endif()
if (ENABLE_ALL_WARNINGS) if(ENABLE_ALL_WARNINGS)
target_compile_options(${target} PUBLIC target_compile_options(${target} PUBLIC
$<IF:$<COMPILE_LANGUAGE:CUDA>, $<IF:$<COMPILE_LANGUAGE:CUDA>,
-Xcompiler=-Wall -Xcompiler=-Wextra -Xcompiler=-Wno-expansion-to-defined, -Xcompiler=-Wall -Xcompiler=-Wextra -Xcompiler=-Wno-expansion-to-defined,
-Wall -Wextra -Wno-expansion-to-defined> -Wall -Wextra -Wno-expansion-to-defined>
) )
target_compile_options(${target} PUBLIC endif()
$<IF:$<COMPILE_LANGUAGE:HIP>,
-Wall -Wextra >
)
endif(ENABLE_ALL_WARNINGS)
target_compile_options(${target} target_compile_options(${target}
PRIVATE PRIVATE
$<$<AND:$<CXX_COMPILER_ID:MSVC>,$<COMPILE_LANGUAGE:CXX>>:/MP> $<$<AND:$<CXX_COMPILER_ID:MSVC>,$<COMPILE_LANGUAGE:CXX>>:/MP>
$<$<AND:$<NOT:$<CXX_COMPILER_ID:MSVC>>,$<COMPILE_LANGUAGE:CXX>>:-funroll-loops>) $<$<AND:$<NOT:$<CXX_COMPILER_ID:MSVC>>,$<COMPILE_LANGUAGE:CXX>>:-funroll-loops>)
if (MSVC) if(MSVC)
target_compile_options(${target} PRIVATE target_compile_options(${target} PRIVATE
$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8> $<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>
-D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_WARNINGS
-D_CRT_SECURE_NO_DEPRECATE -D_CRT_SECURE_NO_DEPRECATE
) )
endif (MSVC) endif()
if (WIN32 AND MINGW) if(WIN32 AND MINGW)
target_compile_options(${target} PUBLIC -static-libstdc++) target_compile_options(${target} PUBLIC -static-libstdc++)
endif (WIN32 AND MINGW) endif()
endmacro(xgboost_target_properties) endmacro()
# Custom definitions used in xgboost. # Custom definitions used in xgboost.
macro(xgboost_target_defs target) macro(xgboost_target_defs target)
if (NOT ${target} STREQUAL "dmlc") # skip dmlc core for custom logging. if(NOT ${target} STREQUAL "dmlc") # skip dmlc core for custom logging.
target_compile_definitions(${target} target_compile_definitions(${target}
PRIVATE PRIVATE
-DDMLC_LOG_CUSTOMIZE=1 -DDMLC_LOG_CUSTOMIZE=1
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:_MWAITXINTRIN_H_INCLUDED>) $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:_MWAITXINTRIN_H_INCLUDED>)
endif () endif()
if (USE_DEBUG_OUTPUT) if(USE_DEBUG_OUTPUT)
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_DEBUG_OUTPUT=1) target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_DEBUG_OUTPUT=1)
endif (USE_DEBUG_OUTPUT) endif()
if (XGBOOST_MM_PREFETCH_PRESENT) if(XGBOOST_MM_PREFETCH_PRESENT)
target_compile_definitions(${target} target_compile_definitions(${target}
PRIVATE PRIVATE
-DXGBOOST_MM_PREFETCH_PRESENT=1) -DXGBOOST_MM_PREFETCH_PRESENT=1)
endif(XGBOOST_MM_PREFETCH_PRESENT) endif()
if (XGBOOST_BUILTIN_PREFETCH_PRESENT) if(XGBOOST_BUILTIN_PREFETCH_PRESENT)
target_compile_definitions(${target} target_compile_definitions(${target}
PRIVATE PRIVATE
-DXGBOOST_BUILTIN_PREFETCH_PRESENT=1) -DXGBOOST_BUILTIN_PREFETCH_PRESENT=1)
endif (XGBOOST_BUILTIN_PREFETCH_PRESENT) endif()
if (PLUGIN_RMM) if(PLUGIN_RMM)
target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1) target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1)
endif (PLUGIN_RMM) endif()
endmacro(xgboost_target_defs) endmacro()
# handles dependencies # handles dependencies
macro(xgboost_target_link_libraries target) macro(xgboost_target_link_libraries target)
if (BUILD_STATIC_LIB) if(BUILD_STATIC_LIB)
target_link_libraries(${target} PUBLIC Threads::Threads ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${target} PUBLIC Threads::Threads ${CMAKE_THREAD_LIBS_INIT})
else() else()
target_link_libraries(${target} PRIVATE Threads::Threads ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${target} PRIVATE Threads::Threads ${CMAKE_THREAD_LIBS_INIT})
endif (BUILD_STATIC_LIB) endif()
if (USE_OPENMP) if(USE_OPENMP)
if (BUILD_STATIC_LIB) if(BUILD_STATIC_LIB)
target_link_libraries(${target} PUBLIC OpenMP::OpenMP_CXX) target_link_libraries(${target} PUBLIC OpenMP::OpenMP_CXX)
else() else()
target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX)
endif (BUILD_STATIC_LIB) endif()
endif (USE_OPENMP) endif()
if (USE_CUDA) if(USE_CUDA)
xgboost_set_cuda_flags(${target}) xgboost_set_cuda_flags(${target})
target_link_libraries(${target} PUBLIC CUDA::cudart_static) target_link_libraries(${target} PUBLIC CUDA::cudart_static)
endif (USE_CUDA) endif()
if (USE_HIP) if (USE_HIP)
xgboost_set_hip_flags(${target}) xgboost_set_hip_flags(${target})
endif (USE_HIP) endif (USE_HIP)
if (PLUGIN_RMM) if(PLUGIN_RMM)
target_link_libraries(${target} PRIVATE rmm::rmm) target_link_libraries(${target} PRIVATE rmm::rmm)
endif (PLUGIN_RMM) endif()
if (USE_NCCL) if(USE_NCCL)
xgboost_link_nccl(${target}) xgboost_link_nccl(${target})
endif (USE_NCCL) endif()
if (USE_NVTX) if(USE_NVTX)
target_link_libraries(${target} PRIVATE CUDA::nvToolsExt) target_link_libraries(${target} PRIVATE CUDA::nvToolsExt)
endif (USE_NVTX) endif()
if (MINGW) if(MINGW)
target_link_libraries(${target} PRIVATE wsock32 ws2_32) target_link_libraries(${target} PRIVATE wsock32 ws2_32)
endif (MINGW) endif()
endmacro(xgboost_target_link_libraries) endmacro()

View File

@ -1,6 +1,6 @@
function (write_version) function(write_version)
message(STATUS "xgboost VERSION: ${xgboost_VERSION}") message(STATUS "xgboost VERSION: ${xgboost_VERSION}")
configure_file( configure_file(
${xgboost_SOURCE_DIR}/cmake/version_config.h.in ${xgboost_SOURCE_DIR}/cmake/version_config.h.in
${xgboost_SOURCE_DIR}/include/xgboost/version_config.h @ONLY) ${xgboost_SOURCE_DIR}/include/xgboost/version_config.h @ONLY)
endfunction (write_version) endfunction()

View File

@ -66,7 +66,7 @@ function(create_rlib_for_msvc)
execute_process(COMMAND ${DLLTOOL_EXE} execute_process(COMMAND ${DLLTOOL_EXE}
"--input-def" "${CMAKE_CURRENT_BINARY_DIR}/R.def" "--input-def" "${CMAKE_CURRENT_BINARY_DIR}/R.def"
"--output-lib" "${CMAKE_CURRENT_BINARY_DIR}/R.lib") "--output-lib" "${CMAKE_CURRENT_BINARY_DIR}/R.lib")
endfunction(create_rlib_for_msvc) endfunction()
# detection for OSX # detection for OSX

View File

@ -1,6 +1,6 @@
if (NVML_LIBRARY) if(NVML_LIBRARY)
unset(NVML_LIBRARY CACHE) unset(NVML_LIBRARY CACHE)
endif(NVML_LIBRARY) endif()
set(NVML_LIB_NAME nvml) set(NVML_LIB_NAME nvml)

View File

@ -35,20 +35,20 @@
# #
# This module assumes that the user has already called find_package(CUDA) # This module assumes that the user has already called find_package(CUDA)
if (NCCL_LIBRARY) if(NCCL_LIBRARY)
if(NOT USE_NCCL_LIB_PATH) if(NOT USE_NCCL_LIB_PATH)
# Don't cache NCCL_LIBRARY to enable switching between static and shared. # Don't cache NCCL_LIBRARY to enable switching between static and shared.
unset(NCCL_LIBRARY CACHE) unset(NCCL_LIBRARY CACHE)
endif(NOT USE_NCCL_LIB_PATH) endif()
endif() endif()
if (BUILD_WITH_SHARED_NCCL) if(BUILD_WITH_SHARED_NCCL)
# libnccl.so # libnccl.so
set(NCCL_LIB_NAME nccl) set(NCCL_LIB_NAME nccl)
else () else()
# libnccl_static.a # libnccl_static.a
set(NCCL_LIB_NAME nccl_static) set(NCCL_LIB_NAME nccl_static)
endif (BUILD_WITH_SHARED_NCCL) endif()
find_path(NCCL_INCLUDE_DIR find_path(NCCL_INCLUDE_DIR
NAMES nccl.h NAMES nccl.h

View File

@ -3,11 +3,11 @@ find_package(xgboost REQUIRED)
# xgboost is built as static libraries, all cxx dependencies need to be linked into the # xgboost is built as static libraries, all cxx dependencies need to be linked into the
# executable. # executable.
if (XGBOOST_BUILD_STATIC_LIB) if(XGBOOST_BUILD_STATIC_LIB)
enable_language(CXX) enable_language(CXX)
# find again for those cxx libraries. # find again for those cxx libraries.
find_package(xgboost REQUIRED) find_package(xgboost REQUIRED)
endif(XGBOOST_BUILD_STATIC_LIB) endif()
add_executable(api-demo c-api-demo.c) add_executable(api-demo c-api-demo.c)
target_link_libraries(api-demo PRIVATE xgboost::xgboost) target_link_libraries(api-demo PRIVATE xgboost::xgboost)

View File

@ -4,11 +4,11 @@ find_package(xgboost REQUIRED)
# xgboost is built as static libraries, all cxx dependencies need to be linked into the # xgboost is built as static libraries, all cxx dependencies need to be linked into the
# executable. # executable.
if (XGBOOST_BUILD_STATIC_LIB) if(XGBOOST_BUILD_STATIC_LIB)
enable_language(CXX) enable_language(CXX)
# find again for those cxx libraries. # find again for those cxx libraries.
find_package(xgboost REQUIRED) find_package(xgboost REQUIRED)
endif(XGBOOST_BUILD_STATIC_LIB) endif()
add_executable(inference-demo inference.c) add_executable(inference-demo inference.c)
target_link_libraries(inference-demo PRIVATE xgboost::xgboost) target_link_libraries(inference-demo PRIVATE xgboost::xgboost)

View File

@ -104,7 +104,7 @@ def check_point_callback():
# Use callback class from xgboost.callback # Use callback class from xgboost.callback
# Feel free to subclass/customize it to suit your need. # Feel free to subclass/customize it to suit your need.
check_point = xgb.callback.TrainingCheckPoint( check_point = xgb.callback.TrainingCheckPoint(
directory=tmpdir, iterations=rounds, name="model" directory=tmpdir, interval=rounds, name="model"
) )
xgb.train( xgb.train(
{"objective": "binary:logistic"}, {"objective": "binary:logistic"},
@ -118,7 +118,7 @@ def check_point_callback():
# This version of checkpoint saves everything including parameters and # This version of checkpoint saves everything including parameters and
# model. See: doc/tutorials/saving_model.rst # model. See: doc/tutorials/saving_model.rst
check_point = xgb.callback.TrainingCheckPoint( check_point = xgb.callback.TrainingCheckPoint(
directory=tmpdir, iterations=rounds, as_pickle=True, name="model" directory=tmpdir, interval=rounds, as_pickle=True, name="model"
) )
xgb.train( xgb.train(
{"objective": "binary:logistic"}, {"objective": "binary:logistic"},

View File

@ -24,8 +24,8 @@ param <- list("objective" = "binary:logitraw",
"nthread" = 16) "nthread" = 16)
watchlist <- list("train" = xgmat) watchlist <- list("train" = xgmat)
nrounds <- 120 nrounds <- 120
print ("loading data end, start to boost trees") print("loading data end, start to boost trees")
bst <- xgb.train(param, xgmat, nrounds, watchlist) bst <- xgb.train(param, xgmat, nrounds, watchlist)
# save out model # save out model
xgb.save(bst, "higgs.model") xgb.save(bst, "higgs.model")
print ('finish training') print('finish training')

View File

@ -39,11 +39,11 @@ for (i in seq_along(threads)){
"nthread" = thread) "nthread" = thread)
watchlist <- list("train" = xgmat) watchlist <- list("train" = xgmat)
nrounds <- 120 nrounds <- 120
print ("loading data end, start to boost trees") print("loading data end, start to boost trees")
bst <- xgb.train(param, xgmat, nrounds, watchlist) bst <- xgb.train(param, xgmat, nrounds, watchlist)
# save out model # save out model
xgb.save(bst, "higgs.model") xgb.save(bst, "higgs.model")
print ('finish training') print('finish training')
}) })
} }

View File

@ -85,8 +85,8 @@ shutdown server
## Training with GPUs ## Training with GPUs
To demo with Federated Learning using GPUs, make sure your machine has at least 2 GPUs. To demo with Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL Build XGBoost with the federated learning plugin enabled along with CUDA
turned off (see the [README](../../plugin/federated/README.md)). (see the [README](../../plugin/federated/README.md)).
Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps Modify `../config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
above. above.

View File

@ -67,7 +67,7 @@ class XGBoostTrainer(Executor):
dtest = xgb.DMatrix('agaricus.txt.test?format=libsvm') dtest = xgb.DMatrix('agaricus.txt.test?format=libsvm')
# Specify parameters via map, definition are same as c++ version # Specify parameters via map, definition are same as c++ version
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} param = {'tree_method': 'hist', 'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
if self._use_gpus: if self._use_gpus:
self.log_info(fl_ctx, f'Training with GPU {rank}') self.log_info(fl_ctx, f'Training with GPU {rank}')
param['device'] = f"cuda:{rank}" param['device'] = f"cuda:{rank}"

View File

@ -56,4 +56,9 @@ shutdown server
## Training with GPUs ## Training with GPUs
Currently GPUs are not yet supported by vertical federated XGBoost. To demo with Vertical Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
Build XGBoost with the federated learning plugin enabled along with CUDA
(see the [README](../../plugin/federated/README.md)).
Modify `../config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
above.

View File

@ -77,13 +77,14 @@ class XGBoostTrainer(Executor):
'gamma': 1.0, 'gamma': 1.0,
'max_depth': 8, 'max_depth': 8,
'min_child_weight': 100, 'min_child_weight': 100,
'tree_method': 'approx', 'tree_method': 'hist',
'grow_policy': 'depthwise', 'grow_policy': 'depthwise',
'objective': 'binary:logistic', 'objective': 'binary:logistic',
'eval_metric': 'auc', 'eval_metric': 'auc',
} }
if self._use_gpus: if self._use_gpus:
self.log_info(fl_ctx, 'GPUs are not currently supported by vertical federated XGBoost') self.log_info(fl_ctx, f'Training with GPU {rank}')
param['device'] = f"cuda:{rank}"
# specify validations set to watch performance # specify validations set to watch performance
watchlist = [(dtest, "eval"), (dtrain, "train")] watchlist = [(dtest, "eval"), (dtrain, "train")]

View File

@ -250,8 +250,8 @@ echo "<hash> <artifact>" | shasum -a 256 --check
``` ```
**Experimental binary packages for R with CUDA enabled** **Experimental binary packages for R with CUDA enabled**
* xgboost_r_gpu_linux_1.7.5.tar.gz: [Download]({r_gpu_linux_url}) * xgboost_r_gpu_linux_{release}.tar.gz: [Download]({r_gpu_linux_url})
* xgboost_r_gpu_win64_1.7.5.tar.gz: [Download]({r_gpu_win64_url}) * xgboost_r_gpu_win64_{release}.tar.gz: [Download]({r_gpu_win64_url})
**Source tarball** **Source tarball**
* xgboost.tar.gz: [Download]({src_tarball})""" * xgboost.tar.gz: [Download]({src_tarball})"""
@ -296,12 +296,13 @@ def main(args: argparse.Namespace) -> None:
git.submodule("update") git.submodule("update")
commit_hash = latest_hash() commit_hash = latest_hash()
if not os.path.exists(args.outdir): outdir = os.path.abspath(args.outdir)
os.mkdir(args.outdir) if not os.path.exists(outdir):
os.mkdir(outdir)
# source tarball # source tarball
hashes: List[str] = [] hashes: List[str] = []
tarname, h = make_src_package(release, args.outdir) tarname, h = make_src_package(release, outdir)
hashes.append(h) hashes.append(h)
# CUDA R packages # CUDA R packages
@ -310,18 +311,18 @@ def main(args: argparse.Namespace) -> None:
branch, branch,
"" if rc is None else rc + str(rc_ver), "" if rc is None else rc + str(rc_ver),
commit_hash, commit_hash,
args.outdir, outdir,
) )
hashes.extend(hr) hashes.extend(hr)
# Python source wheel # Python source wheel
make_pysrc_wheel(release, rc, rc_ver, args.outdir) make_pysrc_wheel(release, rc, rc_ver, outdir)
# Python binary wheels # Python binary wheels
download_py_packages(branch, major, minor, commit_hash, args.outdir) download_py_packages(branch, major, minor, commit_hash, outdir)
# Write end note # Write end note
release_note(release, hashes, urls, tarname, args.outdir) release_note(release, hashes, urls, tarname, outdir)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -80,6 +80,24 @@ R package versioning
==================== ====================
See :ref:`release`. See :ref:`release`.
Testing R package with different compilers
==========================================
You can change the default compiler of R by changing the configuration file in your home
directory. For instance, if you want to test XGBoost built with clang++ instead of g++ on
Linux, put the following in your ``~/.R/Makevars`` file:
.. code-block:: sh
CC=clang-15
CXX17=clang++-15
Be aware that the variable name should match the name used by ``R CMD``:
.. code-block:: sh
R CMD config CXX17
Registering native routines in R Registering native routines in R
================================ ================================
According to `R extension manual <https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Registering-native-routines>`_, According to `R extension manual <https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Registering-native-routines>`_,

View File

@ -35,7 +35,7 @@ Building sdists
In the case of XGBoost, an sdist contains both the Python code as well as In the case of XGBoost, an sdist contains both the Python code as well as
the C++ code, so that the core part of XGBoost can be compiled into the the C++ code, so that the core part of XGBoost can be compiled into the
shared libary ``libxgboost.so`` [#shared_lib_name]_. shared library ``libxgboost.so`` [#shared_lib_name]_.
You can obtain an sdist as follows: You can obtain an sdist as follows:

View File

@ -16,7 +16,14 @@ Adding a new unit test
Python package: pytest Python package: pytest
====================== ======================
Add your test under the directory `tests/python/ <https://github.com/dmlc/xgboost/tree/master/tests/python>`_ or `tests/python-gpu/ <https://github.com/dmlc/xgboost/tree/master/tests/python-gpu>`_ (if you are testing GPU code). Refer to `the PyTest tutorial <https://docs.pytest.org/en/latest/getting-started.html>`_ to learn how to write tests for Python code. Add your test under the directories
- `tests/python/ <https://github.com/dmlc/xgboost/tree/master/tests/python>`_
- `tests/python-gpu/ <https://github.com/dmlc/xgboost/tree/master/tests/python-gpu>`_ (if you are testing GPU code)
- `tests/test_distributed <https://github.com/dmlc/xgboost/tree/master/tests/test_distributed>`_ (if a distributed framework is used)
Refer to `the PyTest tutorial <https://docs.pytest.org/en/latest/getting-started.html>`_
to learn how to write tests for Python code.
You may try running your test by following instructions in :ref:`this section <running_pytest>`. You may try running your test by following instructions in :ref:`this section <running_pytest>`.
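For illustration, a minimal test following this layout might look like the sketch below; the file name ``test_my_feature.py`` and the toy data are hypothetical and not part of the existing suite:

.. code-block:: python

    # tests/python/test_my_feature.py  (hypothetical file name)
    import numpy as np
    import xgboost as xgb


    def test_basic_training() -> None:
        rng = np.random.RandomState(7)
        X = rng.randn(64, 4)
        y = rng.randint(0, 2, size=64)
        # Train a tiny model and make a simple assertion about the result.
        booster = xgb.train(
            {"objective": "binary:logistic"},
            xgb.DMatrix(X, label=y),
            num_boost_round=4,
        )
        assert booster.num_boosted_rounds() == 4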
@ -56,19 +63,26 @@ Run
.. code-block:: bash .. code-block:: bash
make Rcheck python ./tests/ci_build/test_r_package.py --task=check
at the root of the project directory. at the root of the project directory. The command builds and checks the XGBoost
R package. Alternatively, if you want to just run the tests, you can use the following
commands after installing XGBoost:
.. code-block:: bash
cd R-package/tests/
Rscript testthat.R
.. _running_jvm_tests: .. _running_jvm_tests:
JVM packages JVM packages
============ ============
As part of the building process, tests are run: Maven is used to run the tests:
.. code-block:: bash .. code-block:: bash
mvn package mvn test
.. _running_pytest: .. _running_pytest:
@ -99,6 +113,14 @@ In addition, to test CUDA code, run:
(For this step, you should have compiled XGBoost with CUDA enabled.) (For this step, you should have compiled XGBoost with CUDA enabled.)
For testing with distributed frameworks like ``Dask`` and ``PySpark``:
.. code:: bash
# Tell Python where to find XGBoost module
export PYTHONPATH=./python-package
pytest -v -s --fulltrace tests/test_distributed
.. _running_gtest: .. _running_gtest:
C++: Google Test C++: Google Test
@ -110,21 +132,13 @@ To build and run C++ unit tests enable tests while running CMake:
mkdir build mkdir build
cd build cd build
cmake -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON .. cmake -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_CUDA=ON -DUSE_NCCL=ON ..
make ninja
make test ./testxgboost
To enable tests for CUDA code, add ``-DUSE_CUDA=ON`` and ``-DUSE_NCCL=ON`` (CUDA toolkit required): Flags like ``USE_CUDA``, ``USE_DMLC_GTEST`` are optional. For more info about how to build
XGBoost from source, see :doc:`</build>`. One can also run all unit tests using the ctest tool
.. code-block:: bash which provides higher flexibility. For example:
mkdir build
cd build
cmake -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_CUDA=ON -DUSE_NCCL=ON ..
make
make test
One can also run all unit test using ctest tool which provides higher flexibility. For example:
.. code-block:: bash .. code-block:: bash
@ -157,14 +171,14 @@ sanitizer is not compatible with the other two sanitizers.
.. code-block:: bash .. code-block:: bash
cmake -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;leak" /path/to/xgboost cmake -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;undefined" /path/to/xgboost
By default, CMake will search regular system paths for sanitizers, you can also By default, CMake will search regular system paths for sanitizers, you can also
supply a specified SANITIZER_PATH. supply a specified SANITIZER_PATH.
.. code-block:: bash .. code-block:: bash
cmake -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;leak" \ cmake -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;undefined" \
-DSANITIZER_PATH=/path/to/sanitizers /path/to/xgboost -DSANITIZER_PATH=/path/to/sanitizers /path/to/xgboost
How to use sanitizers with CUDA support How to use sanitizers with CUDA support

View File

@ -146,3 +146,48 @@ instance we might accidentally call ``clf.set_params()`` inside a predict functi
with ThreadPoolExecutor(max_workers=10) as e: with ThreadPoolExecutor(max_workers=10) as e:
e.submit(predict_fn, ...) e.submit(predict_fn, ...)
*****************************
Privacy-Preserving Prediction
*****************************
`Concrete ML`_ is a third-party open-source library developed by `Zama`_ that provides gradient
boosting classes similar to ours, but which predict directly over encrypted data, thanks to
Fully Homomorphic Encryption. A simple example would be as follows:
.. code-block:: python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from concrete.ml.sklearn import XGBClassifier
x, y = make_classification(n_samples=100, class_sep=2, n_features=30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
x, y, test_size=10, random_state=42
)
# Train in the clear and quantize the weights
model = XGBClassifier()
model.fit(X_train, y_train)
# Simulate the predictions in the clear
y_pred_clear = model.predict(X_test)
# Compile in FHE
model.compile(X_train)
# Generate keys
model.fhe_circuit.keygen()
# Run the inference on encrypted inputs!
y_pred_fhe = model.predict(X_test, fhe="execute")
print("In clear :", y_pred_clear)
print("In FHE :", y_pred_fhe)
print(f"Similarity: {int((y_pred_fhe == y_pred_clear).mean()*100)}%")
More information and examples are given in the `Concrete ML documentation`_.
.. _Zama: https://www.zama.ai/
.. _Concrete ML: https://github.com/zama-ai/concrete-ml
.. _Concrete ML documentation: https://docs.zama.ai/concrete-ml

View File

@ -172,9 +172,8 @@ Support Matrix
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| modin.Series | NPA | FF | NPA | NPA | FF | | | modin.Series | NPA | FF | NPA | NPA | FF | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| pyarrow.Table | T | F | | NPA | FF | | | pyarrow.Table | NPA | NPA | NPA | NPA | NPA | NPA |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| pyarrow.dataset.Dataset | T | F | | | F | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| _\_array\_\_ | NPA | F | NPA | NPA | H | | | _\_array\_\_ | NPA | F | NPA | NPA | H | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+

View File

@ -30,3 +30,4 @@ See `Awesome XGBoost <https://github.com/dmlc/xgboost/tree/master/demo>`_ for mo
input_format input_format
param_tuning param_tuning
custom_metric_obj custom_metric_obj
privacy_preserving

View File

@ -58,6 +58,7 @@ Notice that the samples are sorted based on their query index in a non-decreasin
sorted_idx = np.argsort(qid) sorted_idx = np.argsort(qid)
X = X[sorted_idx, :] X = X[sorted_idx, :]
y = y[sorted_idx] y = y[sorted_idx]
qid = qid[sorted_idx]
The simplest way to train a ranking model is by using the scikit-learn estimator interface. Continuing the previous snippet, we can train a simple ranking model without tuning: The simplest way to train a ranking model is by using the scikit-learn estimator interface. Continuing the previous snippet, we can train a simple ranking model without tuning:
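A minimal sketch of such a snippet (the estimator settings below are illustrative assumptions, not the tutorial's prescribed configuration):

.. code-block:: python

    import xgboost as xgb

    # XGBRanker groups rows by the ``qid`` argument, which is why the arrays
    # above were sorted by query index first.
    ranker = xgb.XGBRanker(tree_method="hist", objective="rank:ndcg")
    ranker.fit(X, y, qid=qid)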

View File

@ -0,0 +1,97 @@
#############################################
Privacy Preserving Inference with Concrete ML
#############################################
`Concrete ML`_ is a specialized library developed by Zama that allows the execution of machine learning models on encrypted data through `Fully Homomorphic Encryption (FHE) <https://www.youtube.com/watch?v=FFox2S4uqEo>`_, thereby preserving data privacy.
To use models such as XGBClassifier, use the following import:
.. code:: python
from concrete.ml.sklearn import XGBClassifier
***************************************
Performing Privacy Preserving Inference
***************************************
Initialization of an XGBClassifier can be done as follows:
.. code:: python
classifier = XGBClassifier(n_bits=6, [other_hyperparameters])
where ``n_bits`` determines the precision of the input features. Note that a higher value of ``n_bits`` increases the precision of the input features and possibly the final model accuracy, but also results in a longer FHE execution time.
Other hyper-parameters that exist in the XGBoost library can also be used.
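For example, a minimal sketch combining ``n_bits`` with ordinary XGBoost hyper-parameters (the values below are arbitrary illustrations):

.. code:: python

    # n_bits is specific to Concrete ML; n_estimators and max_depth are the
    # usual XGBoost hyper-parameters (illustrative values only).
    classifier = XGBClassifier(n_bits=6, n_estimators=50, max_depth=4)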
******************************
Model Training and Compilation
******************************
As with scikit-learn style models, the classifier can be trained with the ``.fit()`` method.
.. code:: python
classifier.fit(X_train, y_train)
After training, the model can be compiled with a calibration dataset, potentially a subset of the training data:
.. code:: python
classifier.compile(X_calibrate)
This calibration dataset, ``X_calibrate``, is used by Concrete ML to compute the precision (bit-width) of each intermediate value in the model. This is a necessary step to optimize the equivalent FHE circuit.
****************************
FHE Simulation and Execution
****************************
To verify model accuracy in encrypted computations, you can run an FHE simulation:
.. code:: python
predictions = classifier.predict(X_test, fhe="simulate")
This simulation can be used to evaluate the model. The accuracy obtained from this simulation step is representative of actual FHE execution, without having to pay its computational cost.
When the model is ready, actual Fully Homomorphic Encryption execution can be performed:
.. code:: python
predictions = classifier.predict(X_test, fhe="execute")
Note that using ``fhe="execute"`` is a convenient way to assess the model in FHE, but for real deployment, separate functions to encrypt (on the client), run in FHE (on the server), and finally decrypt (on the client) have to be used for end-to-end privacy-preserving inference.
Concrete ML provides a deployment API to facilitate this process, ensuring end-to-end privacy.
To go further with the deployment API, you can read:
- the `deployment documentation <https://docs.zama.ai/concrete-ml/advanced-topics/client_server>`_
- the `deployment notebook <https://github.com/zama-ai/concrete-ml/blob/17779ca571d20b001caff5792eb11e76fe2c19ba/docs/advanced_examples/ClientServer.ipynb>`_
*******************************
Parameter Tuning in Concrete ML
*******************************
Concrete ML is compatible with standard scikit-learn tooling such as pipelines, ``GridSearchCV``, and other hyper-parameter tuning techniques.
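For instance, a hedged sketch of a grid search over the classifier defined above (``X_train``/``y_train`` are the training data from the previous sections; the grid values are arbitrary):

.. code:: python

    from sklearn.model_selection import GridSearchCV
    from concrete.ml.sklearn import XGBClassifier

    # Tune ordinary XGBoost hyper-parameters on the quantized classifier.
    search = GridSearchCV(
        XGBClassifier(n_bits=6),
        param_grid={"n_estimators": [20, 50], "max_depth": [2, 4]},
        cv=3,
    )
    search.fit(X_train, y_train)
    print(search.best_params_)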
******************
Examples and Demos
******************
- `Sentiment analysis (based on transformers + xgboost) <https://huggingface.co/spaces/zama-fhe/encrypted_sentiment_analysis>`_
- `XGBoost Classifier <https://github.com/zama-ai/concrete-ml/blob/6966c84b9698d5418209b346900f81d1270c64bd/docs/advanced_examples/XGBClassifier.ipynb>`_
- `XGBoost Regressor <https://github.com/zama-ai/concrete-ml/blob/6966c84b9698d5418209b346900f81d1270c64bd/docs/advanced_examples/XGBRegressor.ipynb>`_
**********
Conclusion
**********
Concrete ML provides a framework for executing privacy-preserving inferences by leveraging Fully Homomorphic Encryption, allowing secure and private computations on encrypted data.
More information and examples are given in the `Concrete ML documentation`_.
.. _Concrete ML: https://github.com/zama-ai/concrete-ml
.. _`Concrete ML documentation`: https://docs.zama.ai/concrete-ml

View File

@ -144,9 +144,7 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle
* See :doc:`/tutorials/input_format` for more info. * See :doc:`/tutorials/input_format` for more info.
* \endverbatim * \endverbatim
* - silent (optional): Whether to print message during loading. Default to true. * - silent (optional): Whether to print message during loading. Default to true.
* - data_split_mode (optional): Whether to split by row or column. In distributed mode, the * - data_split_mode (optional): Whether the file was split by row or column beforehand for distributed computing. Default to row.
* file is split accordingly; otherwise this is only an indicator on how the file was split
* beforehand. Default to row.
* \param out a loaded data matrix * \param out a loaded data matrix
* \return 0 when success, -1 when failure happens * \return 0 when success, -1 when failure happens
*/ */
@ -174,6 +172,7 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indic
* \param config JSON encoded configuration. Required values are: * \param config JSON encoded configuration. Required values are:
* - missing: Which value to represent missing value. * - missing: Which value to represent missing value.
* - nthread (optional): Number of threads used for initializing DMatrix. * - nthread (optional): Number of threads used for initializing DMatrix.
* - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row.
* \param out created dmatrix * \param out created dmatrix
* \return 0 when success, -1 when failure happens * \return 0 when success, -1 when failure happens
*/ */
@ -186,6 +185,7 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char
* \param config JSON encoded configuration. Required values are: * \param config JSON encoded configuration. Required values are:
* - missing: Which value to represent missing value. * - missing: Which value to represent missing value.
* - nthread (optional): Number of threads used for initializing DMatrix. * - nthread (optional): Number of threads used for initializing DMatrix.
* - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row.
* \param out created dmatrix * \param out created dmatrix
* \return 0 when success, -1 when failure happens * \return 0 when success, -1 when failure happens
*/ */
@ -200,6 +200,7 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data, char const *config, DMatr
* \param config JSON encoded configuration. Supported values are: * \param config JSON encoded configuration. Supported values are:
* - missing: Which value to represent missing value. * - missing: Which value to represent missing value.
* - nthread (optional): Number of threads used for initializing DMatrix. * - nthread (optional): Number of threads used for initializing DMatrix.
* - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row.
* \param out created dmatrix * \param out created dmatrix
* \return 0 when success, -1 when failure happens * \return 0 when success, -1 when failure happens
*/ */
@ -266,6 +267,7 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data,
* \param config JSON encoded configuration. Required values are: * \param config JSON encoded configuration. Required values are:
* - missing: Which value to represent missing value. * - missing: Which value to represent missing value.
* - nthread (optional): Number of threads used for initializing DMatrix. * - nthread (optional): Number of threads used for initializing DMatrix.
* - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row.
* \param out created dmatrix * \param out created dmatrix
* \return 0 when success, -1 when failure happens * \return 0 when success, -1 when failure happens
*/ */
@ -278,6 +280,7 @@ XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *data, char const *config
* \param config JSON encoded configuration. Required values are: * \param config JSON encoded configuration. Required values are:
* - missing: Which value to represent missing value. * - missing: Which value to represent missing value.
* - nthread (optional): Number of threads used for initializing DMatrix. * - nthread (optional): Number of threads used for initializing DMatrix.
* - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row.
* \param out created dmatrix * \param out created dmatrix
* \return 0 when success, -1 when failure happens * \return 0 when success, -1 when failure happens
*/ */
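As a side note, a hedged Python-level sketch of how this configuration field is typically surfaced (the ``DataSplitMode`` enum location in ``xgboost.core`` and the ``data_split_mode`` keyword on ``DMatrix`` are assumptions about recent Python bindings, not something this header guarantees):

    # Sketch only: assumes xgboost>=2.0 exposes DataSplitMode and forwards the
    # keyword to the C API's "data_split_mode" config field shown above.
    import xgboost as xgb
    from xgboost.core import DataSplitMode

    dtrain = xgb.DMatrix(
        "agaricus.txt.train?format=libsvm",
        data_split_mode=DataSplitMode.COL,  # the data was split by column beforehand
    )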
@ -552,24 +555,6 @@ XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr,
/** @} */ // End of Streaming /** @} */ // End of Streaming
XGB_DLL int XGImportArrowRecordBatch(DataIterHandle data_handle, void *ptr_array, void *ptr_schema);
/*!
* \brief Construct DMatrix from arrow using callbacks. Arrow related C API is not stable
* and subject to change in the future.
*
* \param next Callback function for fetching arrow records.
* \param config JSON encoded configuration. Required values are:
* - missing: Which value to represent missing value.
* - nbatch: Number of batches in arrow table.
* - nthread (optional): Number of threads used for initializing DMatrix.
* \param out The created DMatrix.
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char const *config,
DMatrixHandle *out);
/*! /*!
* \brief create a new dmatrix from sliced content of existing matrix * \brief create a new dmatrix from sliced content of existing matrix
* \param handle instance of data matrix to be sliced * \param handle instance of data matrix to be sliced
@ -808,6 +793,16 @@ XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, bst_ulong *out);
*/ */
XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out); XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out);
/*!
* \brief Get the data split mode from DMatrix.
*
* \param handle the handle to the DMatrix
* \param out The output of the data split mode
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixDataSplitMode(DMatrixHandle handle, bst_ulong *out);
/** /**
* \brief Get the predictors from DMatrix as CSR matrix for testing. If this is a * \brief Get the predictors from DMatrix as CSR matrix for testing. If this is a
* quantized DMatrix, quantized values are returned instead. * quantized DMatrix, quantized values are returned instead.
@ -1276,15 +1271,6 @@ XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *config, bst_ulong *out_len, XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *config, bst_ulong *out_len,
char const **out_dptr); char const **out_dptr);
/*!
* \brief Save booster to a buffer with in binary format.
*
* \deprecated since 1.6.0
* \see XGBoosterSaveModelToBuffer()
*/
XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, bst_ulong *out_len,
const char **out_dptr);
/*! /*!
* \brief Memory snapshot based serialization method. Saves everything states * \brief Memory snapshot based serialization method. Saves everything states
* into buffer. * into buffer.
@ -1308,24 +1294,6 @@ XGB_DLL int XGBoosterSerializeToBuffer(BoosterHandle handle, bst_ulong *out_len,
XGB_DLL int XGBoosterUnserializeFromBuffer(BoosterHandle handle, XGB_DLL int XGBoosterUnserializeFromBuffer(BoosterHandle handle,
const void *buf, bst_ulong len); const void *buf, bst_ulong len);
/*!
* \brief Initialize the booster from rabit checkpoint.
* This is used in distributed training API.
* \param handle handle
* \param version The output version of the model.
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterLoadRabitCheckpoint(BoosterHandle handle,
int* version);
/*!
* \brief Save the current checkpoint to rabit.
* \param handle handle
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle);
/*! /*!
* \brief Save XGBoost's internal configuration into a JSON document. Currently the * \brief Save XGBoost's internal configuration into a JSON document. Currently the
* support is experimental, function signature may change in the future without * support is experimental, function signature may change in the future without
@ -1554,29 +1522,19 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *config,
* \param config JSON encoded configuration. Accepted JSON keys are: * \param config JSON encoded configuration. Accepted JSON keys are:
* - xgboost_communicator: The type of the communicator. Can be set as an environment variable. * - xgboost_communicator: The type of the communicator. Can be set as an environment variable.
* * rabit: Use Rabit. This is the default if the type is unspecified. * * rabit: Use Rabit. This is the default if the type is unspecified.
* * mpi: Use MPI.
* * federated: Use the gRPC interface for Federated Learning. * * federated: Use the gRPC interface for Federated Learning.
* Only applicable to the Rabit communicator (these are case-sensitive): * Only applicable to the Rabit communicator (these are case-sensitive):
* - rabit_tracker_uri: Hostname of the tracker. * - rabit_tracker_uri: Hostname of the tracker.
* - rabit_tracker_port: Port number of the tracker. * - rabit_tracker_port: Port number of the tracker.
* - rabit_task_id: ID of the current task, can be used to obtain deterministic rank assignment. * - rabit_task_id: ID of the current task, can be used to obtain deterministic rank assignment.
* - rabit_world_size: Total number of workers. * - rabit_world_size: Total number of workers.
* - rabit_hadoop_mode: Enable Hadoop support.
* - rabit_tree_reduce_minsize: Minimal size for tree reduce.
* - rabit_reduce_ring_mincount: Minimal count to perform ring reduce.
* - rabit_reduce_buffer: Size of the reduce buffer.
* - rabit_bootstrap_cache: Size of the bootstrap cache.
* - rabit_debug: Enable debugging.
* - rabit_timeout: Enable timeout. * - rabit_timeout: Enable timeout.
* - rabit_timeout_sec: Timeout in seconds. * - rabit_timeout_sec: Timeout in seconds.
* - rabit_enable_tcp_no_delay: Enable TCP no delay on Unix platforms.
* Only applicable to the Rabit communicator (these are case-sensitive, and can be set as * Only applicable to the Rabit communicator (these are case-sensitive, and can be set as
* environment variables): * environment variables):
* - DMLC_TRACKER_URI: Hostname of the tracker. * - DMLC_TRACKER_URI: Hostname of the tracker.
* - DMLC_TRACKER_PORT: Port number of the tracker. * - DMLC_TRACKER_PORT: Port number of the tracker.
* - DMLC_TASK_ID: ID of the current task, can be used to obtain deterministic rank assignment. * - DMLC_TASK_ID: ID of the current task, can be used to obtain deterministic rank assignment.
* - DMLC_ROLE: Role of the current task, "worker" or "server".
* - DMLC_NUM_ATTEMPT: Number of attempts after task failure.
* - DMLC_WORKER_CONNECT_RETRY: Number of retries to connect to the tracker. * - DMLC_WORKER_CONNECT_RETRY: Number of retries to connect to the tracker.
* Only applicable to the Federated communicator (use upper case for environment variables, use * Only applicable to the Federated communicator (use upper case for environment variables, use
* lower case for runtime configuration): * lower case for runtime configuration):

View File

@ -157,4 +157,13 @@ struct Result {
[[nodiscard]] inline auto Fail(std::string msg, std::error_code errc, Result&& prev) { [[nodiscard]] inline auto Fail(std::string msg, std::error_code errc, Result&& prev) {
return Result{std::move(msg), std::move(errc), std::forward<Result>(prev)}; return Result{std::move(msg), std::move(errc), std::forward<Result>(prev)};
} }
// We don't have a monad; a simple helper will do.
template <typename Fn>
Result operator<<(Result&& r, Fn&& fn) {
if (!r.OK()) {
return std::forward<Result>(r);
}
return fn();
}
} // namespace xgboost::collective } // namespace xgboost::collective

View File

@ -215,9 +215,9 @@ class SockAddrV4 {
static SockAddrV4 Loopback(); static SockAddrV4 Loopback();
static SockAddrV4 InaddrAny(); static SockAddrV4 InaddrAny();
in_port_t Port() const { return ntohs(addr_.sin_port); } [[nodiscard]] in_port_t Port() const { return ntohs(addr_.sin_port); }
std::string Addr() const { [[nodiscard]] std::string Addr() const {
char buf[INET_ADDRSTRLEN]; char buf[INET_ADDRSTRLEN];
auto const *s = system::inet_ntop(static_cast<std::int32_t>(SockDomain::kV4), &addr_.sin_addr, auto const *s = system::inet_ntop(static_cast<std::int32_t>(SockDomain::kV4), &addr_.sin_addr,
buf, INET_ADDRSTRLEN); buf, INET_ADDRSTRLEN);
@ -226,7 +226,7 @@ class SockAddrV4 {
} }
return {buf}; return {buf};
} }
sockaddr_in const &Handle() const { return addr_; } [[nodiscard]] sockaddr_in const &Handle() const { return addr_; }
}; };
/** /**
@ -243,13 +243,13 @@ class SockAddress {
explicit SockAddress(SockAddrV6 const &addr) : v6_{addr}, domain_{SockDomain::kV6} {} explicit SockAddress(SockAddrV6 const &addr) : v6_{addr}, domain_{SockDomain::kV6} {}
explicit SockAddress(SockAddrV4 const &addr) : v4_{addr} {} explicit SockAddress(SockAddrV4 const &addr) : v4_{addr} {}
auto Domain() const { return domain_; } [[nodiscard]] auto Domain() const { return domain_; }
bool IsV4() const { return Domain() == SockDomain::kV4; } [[nodiscard]] bool IsV4() const { return Domain() == SockDomain::kV4; }
bool IsV6() const { return !IsV4(); } [[nodiscard]] bool IsV6() const { return !IsV4(); }
auto const &V4() const { return v4_; } [[nodiscard]] auto const &V4() const { return v4_; }
auto const &V6() const { return v6_; } [[nodiscard]] auto const &V6() const { return v6_; }
}; };
/** /**
@ -261,6 +261,7 @@ class TCPSocket {
private: private:
HandleT handle_{InvalidSocket()}; HandleT handle_{InvalidSocket()};
bool non_blocking_{false};
// There's no reliable way to extract the domain from a socket without first binding that // There's no reliable way to extract the domain from a socket without first binding that
// socket on macos. // socket on macos.
#if defined(__APPLE__) #if defined(__APPLE__)
@ -276,7 +277,7 @@ class TCPSocket {
/** /**
* \brief Return the socket domain. * \brief Return the socket domain.
*/ */
auto Domain() const -> SockDomain { [[nodiscard]] auto Domain() const -> SockDomain {
auto ret_iafamily = [](std::int32_t domain) { auto ret_iafamily = [](std::int32_t domain) {
switch (domain) { switch (domain) {
case AF_INET: case AF_INET:
@ -321,10 +322,10 @@ class TCPSocket {
#endif // platforms #endif // platforms
} }
bool IsClosed() const { return handle_ == InvalidSocket(); } [[nodiscard]] bool IsClosed() const { return handle_ == InvalidSocket(); }
/** \brief get last error code if any */ /** @brief get last error code if any */
Result GetSockError() const { [[nodiscard]] Result GetSockError() const {
std::int32_t optval = 0; std::int32_t optval = 0;
socklen_t len = sizeof(optval); socklen_t len = sizeof(optval);
auto ret = getsockopt(handle_, SOL_SOCKET, SO_ERROR, reinterpret_cast<char *>(&optval), &len); auto ret = getsockopt(handle_, SOL_SOCKET, SO_ERROR, reinterpret_cast<char *>(&optval), &len);
@ -340,7 +341,7 @@ class TCPSocket {
} }
/** \brief check if anything bad happens */ /** \brief check if anything bad happens */
bool BadSocket() const { [[nodiscard]] bool BadSocket() const {
if (IsClosed()) { if (IsClosed()) {
return true; return true;
} }
@ -352,24 +353,63 @@ class TCPSocket {
return false; return false;
} }
void SetNonBlock(bool non_block) { [[nodiscard]] Result NonBlocking(bool non_block) {
#if defined(_WIN32) #if defined(_WIN32)
u_long mode = non_block ? 1 : 0; u_long mode = non_block ? 1 : 0;
xgboost_CHECK_SYS_CALL(ioctlsocket(handle_, FIONBIO, &mode), NO_ERROR); if (ioctlsocket(handle_, FIONBIO, &mode) != NO_ERROR) {
return system::FailWithCode("Failed to set socket to non-blocking.");
}
#else #else
std::int32_t flag = fcntl(handle_, F_GETFL, 0); std::int32_t flag = fcntl(handle_, F_GETFL, 0);
if (flag == -1) { auto rc = flag;
system::ThrowAtError("fcntl"); if (rc == -1) {
return system::FailWithCode("Failed to get socket flag.");
} }
if (non_block) { if (non_block) {
flag |= O_NONBLOCK; flag |= O_NONBLOCK;
} else { } else {
flag &= ~O_NONBLOCK; flag &= ~O_NONBLOCK;
} }
if (fcntl(handle_, F_SETFL, flag) == -1) { rc = fcntl(handle_, F_SETFL, flag);
system::ThrowAtError("fcntl"); if (rc == -1) {
return system::FailWithCode("Failed to set socket to non-blocking.");
} }
#endif // _WIN32 #endif // _WIN32
non_blocking_ = non_block;
return Success();
}
[[nodiscard]] bool NonBlocking() const { return non_blocking_; }
[[nodiscard]] Result RecvTimeout(std::chrono::seconds timeout) {
// https://stackoverflow.com/questions/2876024/linux-is-there-a-read-or-recv-from-socket-with-timeout
#if defined(_WIN32)
DWORD tv = timeout.count() * 1000;
auto rc =
setsockopt(Handle(), SOL_SOCKET, SO_RCVTIMEO, reinterpret_cast<char *>(&tv), sizeof(tv));
#else
struct timeval tv;
tv.tv_sec = timeout.count();
tv.tv_usec = 0;
auto rc = setsockopt(Handle(), SOL_SOCKET, SO_RCVTIMEO, reinterpret_cast<char const *>(&tv),
sizeof(tv));
#endif
if (rc != 0) {
return system::FailWithCode("Failed to set timeout on recv.");
}
return Success();
}
[[nodiscard]] Result SetBufSize(std::int32_t n_bytes) {
auto rc = setsockopt(this->Handle(), SOL_SOCKET, SO_SNDBUF, reinterpret_cast<char *>(&n_bytes),
sizeof(n_bytes));
if (rc != 0) {
return system::FailWithCode("Failed to set send buffer size.");
}
rc = setsockopt(this->Handle(), SOL_SOCKET, SO_RCVBUF, reinterpret_cast<char *>(&n_bytes),
sizeof(n_bytes));
if (rc != 0) {
return system::FailWithCode("Failed to set recv buffer size.");
}
return Success();
} }
void SetKeepAlive() { void SetKeepAlive() {
@ -391,14 +431,31 @@ class TCPSocket {
* \brief Accept new connection, returns a new TCP socket for the new connection. * \brief Accept new connection, returns a new TCP socket for the new connection.
*/ */
TCPSocket Accept() { TCPSocket Accept() {
HandleT newfd = accept(handle_, nullptr, nullptr); HandleT newfd = accept(Handle(), nullptr, nullptr);
if (newfd == InvalidSocket()) { #if defined(_WIN32)
auto interrupt = WSAEINTR;
#else
auto interrupt = EINTR;
#endif
if (newfd == InvalidSocket() && system::LastError() != interrupt) {
system::ThrowAtError("accept"); system::ThrowAtError("accept");
} }
TCPSocket newsock{newfd}; TCPSocket newsock{newfd};
return newsock; return newsock;
} }
[[nodiscard]] Result Accept(TCPSocket *out, SockAddrV4 *addr) {
struct sockaddr_in caddr;
socklen_t caddr_len = sizeof(caddr);
HandleT newfd = accept(Handle(), reinterpret_cast<sockaddr *>(&caddr), &caddr_len);
if (newfd == InvalidSocket()) {
return system::FailWithCode("Failed to accept.");
}
*addr = SockAddrV4{caddr};
*out = TCPSocket{newfd};
return Success();
}
~TCPSocket() { ~TCPSocket() {
if (!IsClosed()) { if (!IsClosed()) {
Close(); Close();
@ -413,9 +470,9 @@ class TCPSocket {
return *this; return *this;
} }
/** /**
* \brief Return the native socket file descriptor. * @brief Return the native socket file descriptor.
*/ */
HandleT const &Handle() const { return handle_; } [[nodiscard]] HandleT const &Handle() const { return handle_; }
/** /**
* \brief Listen to incoming requests. Should be called after bind. * \brief Listen to incoming requests. Should be called after bind.
*/ */
@ -423,7 +480,7 @@ class TCPSocket {
/** /**
* \brief Bind socket to INADDR_ANY, return the port selected by the OS. * \brief Bind socket to INADDR_ANY, return the port selected by the OS.
*/ */
in_port_t BindHost() { [[nodiscard]] in_port_t BindHost() {
if (Domain() == SockDomain::kV6) { if (Domain() == SockDomain::kV6) {
auto addr = SockAddrV6::InaddrAny(); auto addr = SockAddrV6::InaddrAny();
auto handle = reinterpret_cast<sockaddr const *>(&addr.Handle()); auto handle = reinterpret_cast<sockaddr const *>(&addr.Handle());
@ -448,10 +505,53 @@ class TCPSocket {
return ntohs(res_addr.sin_port); return ntohs(res_addr.sin_port);
} }
} }
[[nodiscard]] auto Port() const {
if (this->Domain() == SockDomain::kV4) {
sockaddr_in res_addr;
socklen_t addrlen = sizeof(res_addr);
auto code = getsockname(handle_, reinterpret_cast<sockaddr *>(&res_addr), &addrlen);
if (code != 0) {
return std::make_pair(system::FailWithCode("getsockname"), std::int32_t{0});
}
return std::make_pair(Success(), std::int32_t{ntohs(res_addr.sin_port)});
} else {
sockaddr_in6 res_addr;
socklen_t addrlen = sizeof(res_addr);
auto code = getsockname(handle_, reinterpret_cast<sockaddr *>(&res_addr), &addrlen);
if (code != 0) {
return std::make_pair(system::FailWithCode("getsockname"), std::int32_t{0});
}
return std::make_pair(Success(), std::int32_t{ntohs(res_addr.sin6_port)});
}
}
[[nodiscard]] Result Bind(StringView ip, std::int32_t *port) {
// bind socket handle_ to ip
auto addr = MakeSockAddress(ip, 0);
std::int32_t errc{0};
if (addr.IsV4()) {
auto handle = reinterpret_cast<sockaddr const *>(&addr.V4().Handle());
errc = bind(handle_, handle, sizeof(std::remove_reference_t<decltype(addr.V4().Handle())>));
} else {
auto handle = reinterpret_cast<sockaddr const *>(&addr.V6().Handle());
errc = bind(handle_, handle, sizeof(std::remove_reference_t<decltype(addr.V6().Handle())>));
}
if (errc != 0) {
return system::FailWithCode("Failed to bind socket.");
}
auto [rc, new_port] = this->Port();
if (!rc.OK()) {
return std::move(rc);
}
*port = new_port;
return Success();
}
/** /**
* \brief Send data, without error then all data should be sent. * \brief Send data, without error then all data should be sent.
*/ */
auto SendAll(void const *buf, std::size_t len) { [[nodiscard]] auto SendAll(void const *buf, std::size_t len) {
char const *_buf = reinterpret_cast<const char *>(buf); char const *_buf = reinterpret_cast<const char *>(buf);
std::size_t ndone = 0; std::size_t ndone = 0;
while (ndone < len) { while (ndone < len) {
@ -470,7 +570,7 @@ class TCPSocket {
/** /**
* \brief Receive data, without error then all data should be received. * \brief Receive data, without error then all data should be received.
*/ */
auto RecvAll(void *buf, std::size_t len) { [[nodiscard]] auto RecvAll(void *buf, std::size_t len) {
char *_buf = reinterpret_cast<char *>(buf); char *_buf = reinterpret_cast<char *>(buf);
std::size_t ndone = 0; std::size_t ndone = 0;
while (ndone < len) { while (ndone < len) {
@ -524,7 +624,15 @@ class TCPSocket {
*/ */
void Close() { void Close() {
if (InvalidSocket() != handle_) { if (InvalidSocket() != handle_) {
#if defined(_WIN32)
auto rc = system::CloseSocket(handle_);
// It's possible that we close TCP sockets after finalizing WSA due to a detached thread.
if (rc != 0 && system::LastError() != WSANOTINITIALISED) {
system::ThrowAtError("close", rc);
}
#else
xgboost_CHECK_SYS_CALL(system::CloseSocket(handle_), 0); xgboost_CHECK_SYS_CALL(system::CloseSocket(handle_), 0);
#endif
handle_ = InvalidSocket(); handle_ = InvalidSocket();
} }
} }
@ -546,6 +654,24 @@ class TCPSocket {
socket.domain_ = domain; socket.domain_ = domain;
#endif // defined(__APPLE__) #endif // defined(__APPLE__)
return socket; return socket;
#endif // defined(xgboost_IS_MINGW)
}
static TCPSocket *CreatePtr(SockDomain domain) {
#if defined(xgboost_IS_MINGW)
MingWError();
return nullptr;
#else
auto fd = socket(static_cast<std::int32_t>(domain), SOCK_STREAM, 0);
if (fd == InvalidSocket()) {
system::ThrowAtError("socket");
}
auto socket = new TCPSocket{fd};
#if defined(__APPLE__)
socket->domain_ = domain;
#endif // defined(__APPLE__)
return socket;
#endif // defined(xgboost_IS_MINGW) #endif // defined(xgboost_IS_MINGW)
} }
}; };
@ -567,12 +693,36 @@ class TCPSocket {
xgboost::collective::TCPSocket *out_conn); xgboost::collective::TCPSocket *out_conn);
/** /**
* \brief Get the local host name. * @brief Get the local host name.
*/ */
inline std::string GetHostName() { [[nodiscard]] Result GetHostName(std::string *p_out);
char buf[HOST_NAME_MAX];
xgboost_CHECK_SYS_CALL(gethostname(&buf[0], HOST_NAME_MAX), 0); /**
return buf; * @brief inet_ntop
*/
template <typename H>
Result INetNToP(H const &host, std::string *p_out) {
std::string &ip = *p_out;
switch (host->h_addrtype) {
case AF_INET: {
auto addr = reinterpret_cast<struct in_addr *>(host->h_addr_list[0]);
char str[INET_ADDRSTRLEN];
inet_ntop(AF_INET, addr, str, INET_ADDRSTRLEN);
ip = str;
break;
}
case AF_INET6: {
auto addr = reinterpret_cast<struct in6_addr *>(host->h_addr_list[0]);
char str[INET6_ADDRSTRLEN];
inet_ntop(AF_INET6, addr, str, INET6_ADDRSTRLEN);
ip = str;
break;
}
default: {
return Fail("Invalid address type.");
}
}
return Success();
} }
} // namespace collective } // namespace collective
} // namespace xgboost } // namespace xgboost

View File

@ -29,31 +29,37 @@ struct DeviceSym {
* viewing types like `linalg::TensorView`. * viewing types like `linalg::TensorView`.
*/ */
struct DeviceOrd { struct DeviceOrd {
// Constant representing the device ID of CPU.
static bst_d_ordinal_t constexpr CPUOrdinal() { return -1; }
static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; }
enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU}; enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU};
// CUDA device ordinal. // CUDA device ordinal.
bst_d_ordinal_t ordinal{-1}; bst_d_ordinal_t ordinal{CPUOrdinal()};
[[nodiscard]] bool IsCUDA() const { return device == kCUDA; } [[nodiscard]] bool IsCUDA() const { return device == kCUDA; }
[[nodiscard]] bool IsCPU() const { return device == kCPU; } [[nodiscard]] bool IsCPU() const { return device == kCPU; }
DeviceOrd() = default; constexpr DeviceOrd() = default;
constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {} constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {}
DeviceOrd(DeviceOrd const& that) = default; constexpr DeviceOrd(DeviceOrd const& that) = default;
DeviceOrd& operator=(DeviceOrd const& that) = default; constexpr DeviceOrd& operator=(DeviceOrd const& that) = default;
DeviceOrd(DeviceOrd&& that) = default; constexpr DeviceOrd(DeviceOrd&& that) = default;
DeviceOrd& operator=(DeviceOrd&& that) = default; constexpr DeviceOrd& operator=(DeviceOrd&& that) = default;
/** /**
* @brief Constructor for CPU. * @brief Constructor for CPU.
*/ */
[[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; } [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, CPUOrdinal()}; }
/** /**
* @brief Constructor for CUDA device. * @brief Constructor for CUDA device.
* *
* @param ordinal CUDA device ordinal. * @param ordinal CUDA device ordinal.
*/ */
[[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; } [[nodiscard]] static constexpr auto CUDA(bst_d_ordinal_t ordinal) {
return DeviceOrd{kCUDA, ordinal};
}
[[nodiscard]] bool operator==(DeviceOrd const& that) const { [[nodiscard]] bool operator==(DeviceOrd const& that) const {
return device == that.device && ordinal == that.ordinal; return device == that.device && ordinal == that.ordinal;
@ -78,25 +84,26 @@ struct DeviceOrd {
static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t)); static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t));
std::ostream& operator<<(std::ostream& os, DeviceOrd ord);
/** /**
* @brief Runtime context for XGBoost. Contains information like threads and device. * @brief Runtime context for XGBoost. Contains information like threads and device.
*/ */
struct Context : public XGBoostParameter<Context> { struct Context : public XGBoostParameter<Context> {
private: private:
// User interfacing parameter for device ordinal
std::string device{DeviceSym::CPU()}; // NOLINT std::string device{DeviceSym::CPU()}; // NOLINT
// The device object for the current context. We are in the middle of replacing the // The device ordinal set by user
// `gpu_id` with this device field.
DeviceOrd device_{DeviceOrd::CPU()}; DeviceOrd device_{DeviceOrd::CPU()};
public: public:
// Constant representing the device ID of CPU.
static bst_d_ordinal_t constexpr kCpuId = -1;
static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; }
static std::int64_t constexpr kDefaultSeed = 0; static std::int64_t constexpr kDefaultSeed = 0;
public: public:
Context(); Context();
void Init(Args const& kwargs);
template <typename Container> template <typename Container>
Args UpdateAllowUnknown(Container const& kwargs) { Args UpdateAllowUnknown(Container const& kwargs) {
auto args = XGBoostParameter<Context>::UpdateAllowUnknown(kwargs); auto args = XGBoostParameter<Context>::UpdateAllowUnknown(kwargs);
@ -104,7 +111,6 @@ struct Context : public XGBoostParameter<Context> {
return args; return args;
} }
std::int32_t gpu_id{kCpuId};
// The number of threads to use if OpenMP is enabled. If equals 0, use the system default. // The number of threads to use if OpenMP is enabled. If equals 0, use the system default.
std::int32_t nthread{0}; // NOLINT std::int32_t nthread{0}; // NOLINT
// stored random seed // stored random seed
@ -116,7 +122,8 @@ struct Context : public XGBoostParameter<Context> {
bool validate_parameters{false}; bool validate_parameters{false};
/** /**
* @brief Configure the parameter `gpu_id'. * @brief Configure the parameter `device'. Deprecated, will remove once `gpu_id` is
* removed.
* *
* @param require_gpu Whether GPU is explicitly required by the user through other * @param require_gpu Whether GPU is explicitly required by the user through other
* configurations. * configurations.
@ -212,9 +219,7 @@ struct Context : public XGBoostParameter<Context> {
private: private:
void SetDeviceOrdinal(Args const& kwargs); void SetDeviceOrdinal(Args const& kwargs);
Context& SetDevice(DeviceOrd d) { Context& SetDevice(DeviceOrd d) {
this->device_ = d; this->device = (this->device_ = d).Name();
this->gpu_id = d.ordinal; // this can be removed once we move away from `gpu_id`.
this->device = d.Name();
return *this; return *this;
} }

View File

@ -106,10 +106,10 @@ class MetaInfo {
MetaInfo& operator=(MetaInfo&& that) = default; MetaInfo& operator=(MetaInfo&& that) = default;
MetaInfo& operator=(MetaInfo const& that) = delete; MetaInfo& operator=(MetaInfo const& that) = delete;
/*! /**
* \brief Validate all metainfo. * @brief Validate all metainfo.
*/ */
void Validate(int32_t device) const; void Validate(DeviceOrd device) const;
MetaInfo Slice(common::Span<int32_t const> ridxs) const; MetaInfo Slice(common::Span<int32_t const> ridxs) const;
@ -559,8 +559,7 @@ class DMatrix {
* *
* \param uri The URI of input. * \param uri The URI of input.
* \param silent Whether print information during loading. * \param silent Whether print information during loading.
* \param data_split_mode In distributed mode, split the input according this mode; otherwise, * \param data_split_mode Indicate how the data was split beforehand.
* it's just an indicator on how the input was split beforehand.
* \return The created DMatrix. * \return The created DMatrix.
*/ */
static DMatrix* Load(const std::string& uri, bool silent = true, static DMatrix* Load(const std::string& uri, bool silent = true,

View File

@@ -88,9 +88,9 @@ class HostDeviceVector {
   static_assert(std::is_standard_layout<T>::value, "HostDeviceVector admits only POD types");

  public:
-  explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
-  HostDeviceVector(std::initializer_list<T> init, int device = -1);
-  explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
+  explicit HostDeviceVector(size_t size = 0, T v = T(), DeviceOrd device = DeviceOrd::CPU());
+  HostDeviceVector(std::initializer_list<T> init, DeviceOrd device = DeviceOrd::CPU());
+  explicit HostDeviceVector(const std::vector<T>& init, DeviceOrd device = DeviceOrd::CPU());
   ~HostDeviceVector();
   HostDeviceVector(const HostDeviceVector<T>&) = delete;
@@ -99,17 +99,9 @@ class HostDeviceVector {
   HostDeviceVector<T>& operator=(const HostDeviceVector<T>&) = delete;
   HostDeviceVector<T>& operator=(HostDeviceVector<T>&&);
-  bool Empty() const { return Size() == 0; }
-  size_t Size() const;
-  int DeviceIdx() const;
-  DeviceOrd Device() const {
-    auto idx = this->DeviceIdx();
-    if (idx == DeviceOrd::CPU().ordinal) {
-      return DeviceOrd::CPU();
-    } else {
-      return DeviceOrd::CUDA(idx);
-    }
-  }
+  [[nodiscard]] bool Empty() const { return Size() == 0; }
+  [[nodiscard]] std::size_t Size() const;
+  [[nodiscard]] DeviceOrd Device() const;
   common::Span<T> DeviceSpan();
   common::Span<const T> ConstDeviceSpan() const;
   common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); }
@@ -135,13 +127,12 @@ class HostDeviceVector {
   const std::vector<T>& ConstHostVector() const;
   const std::vector<T>& HostVector() const {return ConstHostVector(); }
-  bool HostCanRead() const;
-  bool HostCanWrite() const;
-  bool DeviceCanRead() const;
-  bool DeviceCanWrite() const;
-  GPUAccess DeviceAccess() const;
+  [[nodiscard]] bool HostCanRead() const;
+  [[nodiscard]] bool HostCanWrite() const;
+  [[nodiscard]] bool DeviceCanRead() const;
+  [[nodiscard]] bool DeviceCanWrite() const;
+  [[nodiscard]] GPUAccess DeviceAccess() const;
-  void SetDevice(int device) const;
   void SetDevice(DeviceOrd device) const;
   void Resize(size_t new_size, T v = T());
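A minimal usage sketch of the DeviceOrd-based interface declared above. The header path and namespace qualification are assumptions; the constructors, `SetDevice`, `Device`, and `DeviceSpan` are the ones shown in this hunk, and the device transfer requires a CUDA-enabled build at runtime.

#include <xgboost/host_device_vector.h>  // assumed header path
#include <vector>

void VectorExample() {
  using xgboost::DeviceOrd;
  // The device argument is now a DeviceOrd instead of an integer ordinal.
  xgboost::HostDeviceVector<float> vec{std::vector<float>{1.0f, 2.0f, 3.0f}, DeviceOrd::CPU()};
  // Retarget the storage to the first CUDA device (GPU build assumed).
  vec.SetDevice(DeviceOrd::CUDA(0));
  auto d_span = vec.DeviceSpan();  // data is copied to the device lazily
  // Query placement through the new Device() accessor.
  if (vec.Device().IsCUDA()) {
    // ... launch device code over d_span ...
    (void)d_span;
  }
}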

View File

@@ -372,6 +372,19 @@ class Json {
   /*! \brief Use your own JsonWriter. */
   static void Dump(Json json, JsonWriter* writer);
+  template <typename Container = std::string>
+  static Container Dump(Json json) {
+    if constexpr (std::is_same_v<Container, std::string>) {
+      std::string str;
+      Dump(json, &str);
+      return str;
+    } else {
+      std::vector<char> str;
+      Dump(json, &str);
+      return str;
+    }
+  }

   Json() = default;
   // number
@@ -595,44 +608,6 @@ using Boolean = JsonBoolean;
 using String = JsonString;
 using Null = JsonNull;
-// Utils tailored for XGBoost.
-namespace detail {
-template <typename Head>
-bool TypeCheckImpl(Json const& value) {
-  return IsA<Head>(value);
-}
-template <typename Head, typename... JT>
-std::enable_if_t<sizeof...(JT) != 0, bool> TypeCheckImpl(Json const& value) {
-  return IsA<Head>(value) || TypeCheckImpl<JT...>(value);
-}
-template <typename Head>
-std::string TypeCheckError() {
-  return "`" + Head{}.TypeStr() + "`";
-}
-template <typename Head, typename... JT>
-std::enable_if_t<sizeof...(JT) != 0, std::string> TypeCheckError() {
-  return "`" + Head{}.TypeStr() + "`, " + TypeCheckError<JT...>();
-}
-}  // namespace detail
-/**
- * \brief Type check for JSON-based parameters
- *
- * \tparam JT    Expected JSON types.
- * \param  value Value to be checked.
- */
-template <typename... JT>
-void TypeCheck(Json const& value, StringView name) {
-  if (!detail::TypeCheckImpl<JT...>(value)) {
-    LOG(FATAL) << "Invalid type for: `" << name << "`, expecting one of the: {`"
-               << detail::TypeCheckError<JT...>() << "}, got: `" << value.GetValue().TypeStr()
-               << "`";
-  }
-}
 /**
  * \brief Convert XGBoost parameter to JSON object.
  *
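A brief sketch of how the templated `Dump` overload added above could be used. The header path, the `Object` alias, and the `operator[]` assignment pattern are assumptions based on the aliases shown in this file; `Dump` and its default `std::string` container come from the added code.

#include <xgboost/json.h>  // assumed header path
#include <string>
#include <vector>

void DumpExample() {
  using namespace xgboost;
  Json doc{Object{}};                  // Object alias assumed alongside String/Boolean/Null
  doc["booster"] = String{"gbtree"};   // assignment pattern assumed
  // Default container is std::string.
  std::string text = Json::Dump(doc);
  // A std::vector<char> can be requested instead, e.g. for binary-safe buffers.
  std::vector<char> buffer = Json::Dump<std::vector<char>>(doc);
  (void)text;
  (void)buffer;
}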

View File

@@ -603,13 +603,13 @@ auto MakeTensorView(Context const *ctx, Order order, common::Span<T> data, S &&...
 template <typename T, typename... S>
 auto MakeTensorView(Context const *ctx, HostDeviceVector<T> *data, S &&...shape) {
-  auto span = ctx->IsCPU() ? data->HostSpan() : data->DeviceSpan();
+  auto span = ctx->IsCUDA() ? data->DeviceSpan() : data->HostSpan();
   return MakeTensorView(ctx->Device(), span, std::forward<S>(shape)...);
 }

 template <typename T, typename... S>
 auto MakeTensorView(Context const *ctx, HostDeviceVector<T> const *data, S &&...shape) {
-  auto span = ctx->IsCPU() ? data->ConstHostSpan() : data->ConstDeviceSpan();
+  auto span = ctx->IsCUDA() ? data->ConstDeviceSpan() : data->ConstHostSpan();
   return MakeTensorView(ctx->Device(), span, std::forward<S>(shape)...);
 }
@@ -659,13 +659,13 @@ auto MakeVec(T *ptr, size_t s, DeviceOrd device = DeviceOrd::CPU()) {
 template <typename T>
 auto MakeVec(HostDeviceVector<T> *data) {
-  return MakeVec(data->DeviceIdx() == -1 ? data->HostPointer() : data->DevicePointer(),
-                 data->Size(), data->Device());
+  return MakeVec(data->Device().IsCPU() ? data->HostPointer() : data->DevicePointer(), data->Size(),
+                 data->Device());
 }

 template <typename T>
 auto MakeVec(HostDeviceVector<T> const *data) {
-  return MakeVec(data->DeviceIdx() == -1 ? data->ConstHostPointer() : data->ConstDevicePointer(),
+  return MakeVec(data->Device().IsCPU() ? data->ConstHostPointer() : data->ConstDevicePointer(),
                  data->Size(), data->Device());
 }
@@ -757,13 +757,13 @@ class Tensor {
   Order order_{Order::kC};

   template <typename I, std::int32_t D>
-  void Initialize(I const (&shape)[D], std::int32_t device) {
+  void Initialize(I const (&shape)[D], DeviceOrd device) {
     static_assert(D <= kDim, "Invalid shape.");
     std::copy(shape, shape + D, shape_);
     for (auto i = D; i < kDim; ++i) {
       shape_[i] = 1;
     }
-    if (device >= 0) {
+    if (device.IsCUDA()) {
       data_.SetDevice(device);
       data_.ConstDevicePointer();  // Pull to device;
     }
@@ -780,14 +780,11 @@ class Tensor {
    * See \ref TensorView for parameters of this constructor.
    */
   template <typename I, int32_t D>
-  explicit Tensor(I const (&shape)[D], std::int32_t device, Order order = kC)
-      : Tensor{common::Span<I const, D>{shape}, device, order} {}
-  template <typename I, int32_t D>
   explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC)
-      : Tensor{common::Span<I const, D>{shape}, device.ordinal, order} {}
+      : Tensor{common::Span<I const, D>{shape}, device, order} {}
   template <typename I, size_t D>
-  explicit Tensor(common::Span<I const, D> shape, std::int32_t device, Order order = kC)
+  explicit Tensor(common::Span<I const, D> shape, DeviceOrd device, Order order = kC)
       : order_{order} {
     // No device unroll as this is a host only function.
     std::copy(shape.data(), shape.data() + D, shape_);
@@ -795,11 +792,11 @@ class Tensor {
       shape_[i] = 1;
     }
     auto size = detail::CalcSize(shape_);
-    if (device >= 0) {
+    if (device.IsCUDA()) {
       data_.SetDevice(device);
     }
     data_.Resize(size);
-    if (device >= 0) {
+    if (device.IsCUDA()) {
       data_.DevicePointer();  // Pull to device
     }
   }
@@ -807,7 +804,7 @@ class Tensor {
    * Initialize from 2 host iterators.
    */
   template <typename It, typename I, int32_t D>
-  explicit Tensor(It begin, It end, I const (&shape)[D], std::int32_t device, Order order = kC)
+  explicit Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order = kC)
       : order_{order} {
     auto &h_vec = data_.HostVector();
     h_vec.insert(h_vec.begin(), begin, end);
@@ -816,7 +813,7 @@ class Tensor {
   }
   template <typename I, int32_t D>
-  explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], std::int32_t device,
+  explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], DeviceOrd device,
                   Order order = kC)
       : order_{order} {
     auto &h_vec = data_.HostVector();
@@ -824,10 +821,6 @@ class Tensor {
     // shape
     this->Initialize(shape, device);
   }
-  template <typename I, int32_t D>
-  explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], DeviceOrd device,
-                  Order order = kC)
-      : Tensor{data, shape, device.ordinal, order} {}
   /**
    * \brief Index operator. Not thread safe, should not be used in performance critical
    *        region. For more efficient indexing, consider getting a view first.
@@ -944,9 +937,7 @@ class Tensor {
   /**
    * \brief Set device ordinal for this tensor.
    */
-  void SetDevice(int32_t device) const { data_.SetDevice(device); }
   void SetDevice(DeviceOrd device) const { data_.SetDevice(device); }
-  [[nodiscard]] int32_t DeviceIdx() const { return data_.DeviceIdx(); }
   [[nodiscard]] DeviceOrd Device() const { return data_.Device(); }
 };
@@ -962,7 +953,7 @@ using Vector = Tensor<T, 1>;
 template <typename T, typename... Index>
 auto Empty(Context const *ctx, Index &&...index) {
   Tensor<T, sizeof...(Index)> t;
-  t.SetDevice(ctx->gpu_id);
+  t.SetDevice(ctx->Device());
   t.Reshape(index...);
   return t;
 }
@@ -973,7 +964,7 @@ auto Empty(Context const *ctx, Index &&...index) {
 template <typename T, typename... Index>
 auto Constant(Context const *ctx, T v, Index &&...index) {
   Tensor<T, sizeof...(Index)> t;
-  t.SetDevice(ctx->gpu_id);
+  t.SetDevice(ctx->Device());
   t.Reshape(index...);
   t.Data()->Fill(std::move(v));
   return t;
@@ -990,8 +981,8 @@ auto Zeros(Context const *ctx, Index &&...index) {
 // Only first axis is supported for now.
 template <typename T, int32_t D>
 void Stack(Tensor<T, D> *l, Tensor<T, D> const &r) {
-  if (r.DeviceIdx() >= 0) {
-    l->SetDevice(r.DeviceIdx());
+  if (r.Device().IsCUDA()) {
+    l->SetDevice(r.Device());
   }
   l->ModifyInplace([&](HostDeviceVector<T> *data, common::Span<size_t, D> shape) {
     for (size_t i = 1; i < D; ++i) {
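A small sketch of how the device-aware tensor helpers above might be used together. The header path and the `xgboost::linalg` namespace qualification are assumptions; `Empty`, `Data()->Fill`, and the `MakeTensorView` overload for `HostDeviceVector` are the functions shown in this hunk.

#include <xgboost/linalg.h>  // assumed header path

void TensorExample(xgboost::Context const *ctx) {
  // Allocate a 3x4 tensor on whatever device the context is configured for.
  auto t = xgboost::linalg::Empty<float>(ctx, 3, 4);
  // Fill it through the underlying HostDeviceVector.
  t.Data()->Fill(1.0f);
  // Obtain a view placed on the host or the device depending on ctx->Device().
  auto view = xgboost::linalg::MakeTensorView(ctx, t.Data(), 3, 4);
  (void)view;
}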

View File

@@ -52,9 +52,9 @@ class PredictionContainer : public DMatrixCache<PredictionCacheEntry> {
  public:
   PredictionContainer() : DMatrixCache<PredictionCacheEntry>{DefaultSize()} {}
-  PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, std::int32_t device) {
+  PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, DeviceOrd device) {
     auto p_cache = this->CacheItem(m);
-    if (device != Context::kCpuId) {
+    if (device.IsCUDA()) {
       p_cache->predictions.SetDevice(device);
     }
     return *p_cache;

View File

@@ -29,7 +29,7 @@ struct StringView {
  public:
  constexpr StringView() = default;
  constexpr StringView(CharT const* str, std::size_t size) : str_{str}, size_{size} {}
-  explicit StringView(std::string const& str) : str_{str.c_str()}, size_{str.size()} {}
+  StringView(std::string const& str) : str_{str.c_str()}, size_{str.size()} {}  // NOLINT
  constexpr StringView(CharT const* str)  // NOLINT
      : str_{str}, size_{str == nullptr ? 0ul : Traits::length(str)} {}
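A sketch of what removing `explicit` from the `std::string` constructor permits. The header path and the `Consume` function are hypothetical; only the `StringView` constructor comes from the hunk above.

#include <xgboost/string_view.h>  // assumed header path
#include <string>

void Consume(xgboost::StringView sv);  // hypothetical consumer

void StringViewExample() {
  std::string name{"xgboost"};
  // With the explicit qualifier removed, std::string now converts implicitly.
  Consume(name);
}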

View File

@@ -4,16 +4,16 @@ list(APPEND JVM_SOURCES
   ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
   ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cpp)

-if (USE_CUDA)
+if(USE_CUDA)
   list(APPEND JVM_SOURCES
     ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu)
-endif (USE_CUDA)
+endif()

 add_library(xgboost4j SHARED ${JVM_SOURCES} ${XGBOOST_OBJ_SOURCES})
-if (ENABLE_ALL_WARNINGS)
+if(ENABLE_ALL_WARNINGS)
   target_compile_options(xgboost4j PUBLIC -Wall -Wextra)
-endif (ENABLE_ALL_WARNINGS)
+endif()
 target_link_libraries(xgboost4j PRIVATE objxgboost)
 target_include_directories(xgboost4j

View File

@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014-2022 by Contributors
+ Copyright (c) 2014-2023 by Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -34,55 +34,51 @@ class ExternalCheckpointManagerSuite extends AnyFunSuite with TmpFolderPerSuite
   private def createNewModels():
       (String, XGBoostClassificationModel, XGBoostClassificationModel) = {
     val tmpPath = createTmpFolder("test").toAbsolutePath.toString
-    val (model4, model8) = {
+    val (model2, model4) = {
       val training = buildDataFrame(Classification.train)
       val paramMap = produceParamMap(tmpPath, 2)
       (new XGBoostClassifier(paramMap ++ Seq("num_round" -> 2)).fit(training),
         new XGBoostClassifier(paramMap ++ Seq("num_round" -> 4)).fit(training))
     }
-    (tmpPath, model4, model8)
+    (tmpPath, model2, model4)
   }

   test("test update/load models") {
-    val (tmpPath, model4, model8) = createNewModels()
+    val (tmpPath, model2, model4) = createNewModels()
     val manager = new ExternalCheckpointManager(tmpPath, FileSystem.get(sc.hadoopConfiguration))

-    manager.updateCheckpoint(model4._booster.booster)
+    manager.updateCheckpoint(model2._booster.booster)
     var files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath))
     assert(files.length == 1)
-    assert(files.head.getPath.getName == "4.model")
-    assert(manager.loadCheckpointAsScalaBooster().getVersion == 4)
+    assert(files.head.getPath.getName == "1.model")
+    assert(manager.loadCheckpointAsScalaBooster().getNumBoostedRound == 2)

-    manager.updateCheckpoint(model8._booster)
+    manager.updateCheckpoint(model4._booster)
     files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath))
     assert(files.length == 1)
-    assert(files.head.getPath.getName == "8.model")
-    assert(manager.loadCheckpointAsScalaBooster().getVersion == 8)
+    assert(files.head.getPath.getName == "3.model")
+    assert(manager.loadCheckpointAsScalaBooster().getNumBoostedRound == 4)
   }

   test("test cleanUpHigherVersions") {
-    val (tmpPath, model4, model8) = createNewModels()
+    val (tmpPath, model2, model4) = createNewModels()
     val manager = new ExternalCheckpointManager(tmpPath, FileSystem.get(sc.hadoopConfiguration))

-    manager.updateCheckpoint(model8._booster)
-    manager.cleanUpHigherVersions(8)
-    assert(new File(s"$tmpPath/8.model").exists())
+    manager.updateCheckpoint(model4._booster)
+    manager.cleanUpHigherVersions(3)
+    assert(new File(s"$tmpPath/3.model").exists())

-    manager.cleanUpHigherVersions(4)
-    assert(!new File(s"$tmpPath/8.model").exists())
+    manager.cleanUpHigherVersions(2)
+    assert(!new File(s"$tmpPath/3.model").exists())
   }

   test("test checkpoint rounds") {
     import scala.collection.JavaConverters._
-    val (tmpPath, model4, model8) = createNewModels()
+    val (tmpPath, model2, model4) = createNewModels()
     val manager = new ExternalCheckpointManager(tmpPath, FileSystem.get(sc.hadoopConfiguration))
-    assertResult(Seq(7))(
-      manager.getCheckpointRounds(0, 7).asScala)
-    assertResult(Seq(2, 4, 6, 7))(
-      manager.getCheckpointRounds(2, 7).asScala)
-    manager.updateCheckpoint(model4._booster)
-    assertResult(Seq(4, 6, 7))(
-      manager.getCheckpointRounds(2, 7).asScala)
+    assertResult(Seq(2))(manager.getCheckpointRounds(0, 0, 3).asScala)
+    assertResult(Seq(0, 2, 4, 6))(manager.getCheckpointRounds(0, 2, 7).asScala)
+    assertResult(Seq(0, 2, 4, 6, 7))(manager.getCheckpointRounds(0, 2, 8).asScala)
   }
@@ -109,8 +105,8 @@ class ExternalCheckpointManagerSuite extends AnyFunSuite with TmpFolderPerSuite
     // Check only one model is kept after training
     val files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath))
     assert(files.length == 1)
-    assert(files.head.getPath.getName == "8.model")
-    val tmpModel = SXGBoost.loadModel(s"$tmpPath/8.model")
+    assert(files.head.getPath.getName == "4.model")
+    val tmpModel = SXGBoost.loadModel(s"$tmpPath/4.model")
     // Train next model based on prev model
     val nextModel = new XGBoostClassifier(paramMap ++ Seq("num_round" -> 8)).fit(training)
     assert(error(tmpModel) >= error(prevModel._booster))

View File

@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014-2022 by Contributors
+ Copyright (c) 2014-2023 by Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -39,6 +39,21 @@ public class Booster implements Serializable, KryoSerializable {
   // handle to the booster.
   private long handle = 0;
   private int version = 0;

+  /**
+   * Type of prediction, used for inplace_predict.
+   */
+  public enum PredictionType {
+    kValue(0),
+    kMargin(1);
+
+    private Integer ptype;
+    private PredictionType(final Integer ptype) {
+      this.ptype = ptype;
+    }
+    public Integer getPType() {
+      return ptype;
+    }
+  }

   /**
    * Create a new Booster with empty stage.
@@ -375,6 +390,97 @@ public class Booster implements Serializable, KryoSerializable {
     return predicts;
   }

+  /**
+   * Perform thread-safe prediction.
+   *
+   * @param data    Flattened input matrix of features for prediction
+   * @param nrow    The number of predictions to make (count of input matrix rows)
+   * @param ncol    The number of features in the model (count of input matrix columns)
+   * @param missing Value indicating missing element in the <code>data</code> input matrix
+   *
+   * @return predict Result matrix
+   */
+  public float[][] inplace_predict(float[] data,
+                                   int nrow,
+                                   int ncol,
+                                   float missing) throws XGBoostError {
+    int[] iteration_range = new int[2];
+    iteration_range[0] = 0;
+    iteration_range[1] = 0;
+    return this.inplace_predict(data, nrow, ncol,
+                                missing, iteration_range, PredictionType.kValue, null);
+  }
+
+  /**
+   * Perform thread-safe prediction.
+   *
+   * @param data            Flattened input matrix of features for prediction
+   * @param nrow            The number of predictions to make (count of input matrix rows)
+   * @param ncol            The number of features in the model (count of input matrix columns)
+   * @param missing         Value indicating missing element in the <code>data</code> input matrix
+   * @param iteration_range Specifies which layer of trees is used in prediction. For
+   *                        example, if a random forest is trained with 100 rounds,
+   *                        specifying `iteration_range=[10, 20)` means only the forests
+   *                        built during the [10, 20) (half-open) rounds are used in this
+   *                        prediction.
+   *
+   * @return predict Result matrix
+   */
+  public float[][] inplace_predict(float[] data,
+                                   int nrow,
+                                   int ncol,
+                                   float missing, int[] iteration_range) throws XGBoostError {
+    return this.inplace_predict(data, nrow, ncol,
+                                missing, iteration_range, PredictionType.kValue, null);
+  }
+
+  /**
+   * Perform thread-safe prediction.
+   *
+   * @param data            Flattened input matrix of features for prediction
+   * @param nrow            The number of predictions to make (count of input matrix rows)
+   * @param ncol            The number of features in the model (count of input matrix columns)
+   * @param missing         Value indicating missing element in the <code>data</code> input matrix
+   * @param iteration_range Specifies which layer of trees is used in prediction. For
+   *                        example, if a random forest is trained with 100 rounds,
+   *                        specifying `iteration_range=[10, 20)` means only the forests
+   *                        built during the [10, 20) (half-open) rounds are used in this
+   *                        prediction.
+   * @param predict_type    What kind of prediction to run.
+   * @return predict Result matrix
+   */
+  public float[][] inplace_predict(float[] data,
+                                   int nrow,
+                                   int ncol,
+                                   float missing,
+                                   int[] iteration_range,
+                                   PredictionType predict_type,
+                                   float[] base_margin) throws XGBoostError {
+    if (iteration_range.length != 2) {
+      throw new XGBoostError("Iteration range is expected to be [begin, end).");
+    }
+    int ptype = predict_type.getPType();
+
+    int begin = iteration_range[0];
+    int end = iteration_range[1];
+
+    float[][] rawPredicts = new float[1][];
+    XGBoostJNI.checkCall(XGBoostJNI.XGBoosterPredictFromDense(handle, data, nrow, ncol,
+        missing, begin, end, ptype, base_margin, rawPredicts));
+
+    int col = rawPredicts[0].length / nrow;
+    float[][] predicts = new float[nrow][col];
+    int r, c;
+    for (int i = 0; i < rawPredicts[0].length; i++) {
+      r = i / col;
+      c = i % col;
+      predicts[r][c] = rawPredicts[0][i];
+    }
+    return predicts;
+  }
+
   /**
    * Predict leaf indices given the data
    *
@@ -681,35 +787,6 @@ public class Booster implements Serializable, KryoSerializable {
     return importanceMap;
   }

-  /**
-   * Save the model as byte array representation.
-   * Write these bytes to a file will give compatible format with other xgboost bindings.
-   *
-   * If java natively support HDFS file API, use toByteArray and write the ByteArray
-   *
-   * @param withStats Controls whether the split statistics are output.
-   * @return dumped model information
-   * @throws XGBoostError native error
-   */
-  private String[] getDumpInfo(boolean withStats) throws XGBoostError {
-    int statsFlag = 0;
-    if (withStats) {
-      statsFlag = 1;
-    }
-    String[][] modelInfos = new String[1][];
-    XGBoostJNI.checkCall(XGBoostJNI.XGBoosterDumpModelEx(handle, "", statsFlag, "text",
-        modelInfos));
-    return modelInfos[0];
-  }
-
-  public int getVersion() {
-    return this.version;
-  }
-
-  public void setVersion(int version) {
-    this.version = version;
-  }
-
   /**
    * Save model into raw byte array. Currently it's using the deprecated format as
    * default, which will be changed into `ubj` in future releases.
@@ -735,29 +812,6 @@ public class Booster implements Serializable, KryoSerializable {
     return bytes[0];
   }

-  /**
-   * Load the booster model from thread-local rabit checkpoint.
-   * This is only used in distributed training.
-   * @return the stored version number of the checkpoint.
-   * @throws XGBoostError
-   */
-  int loadRabitCheckpoint() throws XGBoostError {
-    int[] out = new int[1];
-    XGBoostJNI.checkCall(XGBoostJNI.XGBoosterLoadRabitCheckpoint(this.handle, out));
-    version = out[0];
-    return version;
-  }
-
-  /**
-   * Save the booster model into thread-local rabit checkpoint and increment the version.
-   * This is only used in distributed training.
-   * @throws XGBoostError
-   */
-  void saveRabitCheckpoint() throws XGBoostError {
-    XGBoostJNI.checkCall(XGBoostJNI.XGBoosterSaveRabitCheckpoint(this.handle));
-    version += 1;
-  }
-
   /**
    * Get number of model features.
    * @return the number of features.
@@ -768,6 +822,11 @@ public class Booster implements Serializable, KryoSerializable {
     XGBoostJNI.checkCall(XGBoostJNI.XGBoosterGetNumFeature(this.handle, numFeature));
     return numFeature[0];
   }

+  public int getNumBoostedRound() throws XGBoostError {
+    int[] numRound = new int[1];
+    XGBoostJNI.checkCall(XGBoostJNI.XGBoosterGetNumBoostedRound(this.handle, numRound));
+    return numRound[0];
+  }

   /**
    * Internal initialization function.

Some files were not shown because too many files have changed in this diff.