Compare commits: `master-roc...release_1.` (31 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | a09446d12b |  |
|  | b559bfc927 |  |
|  | 742c19f3ec |  |
|  | 2245a6e9ac |  |
|  | ed8ba2150b |  |
|  | 87ddcf308e |  |
|  | 35dac8af1d |  |
|  | 1311a20f49 |  |
|  | 328d1e18db |  |
|  | 3e2d7519a6 |  |
|  | afb9dfd421 |  |
|  | eb69c6110a |  |
|  | 0f9ffcdc16 |  |
|  | 9bbd00a49f |  |
|  | 7e239f229c |  |
|  | a013942649 |  |
|  | 4d2ea0d4ef |  |
|  | d1052b5cfe |  |
|  | 14c56f05da |  |
|  | 11f8b5cfcd |  |
|  | e7ac2486eb |  |
|  | a3d195e73e |  |
|  | fab3c05ced |  |
|  | 584b45a9cc |  |
|  | 30c1b5c54c |  |
|  | 36e247aca4 |  |
|  | c4aff733bb |  |
|  | cdbfd21d31 |  |
|  | 508a0b0dbd |  |
|  | e04e773f9f |  |
|  | 1debabb321 |  |
**.github/workflows/jvm_tests.yml** (vendored; 2 changed lines)

```diff
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, ubuntu-latest]
+        os: [windows-latest, ubuntu-latest, macos-10.15]

     steps:
     - uses: actions/checkout@v2
```
**.github/workflows/python_tests.yml** (vendored; 80 changed lines)

```diff
@@ -45,13 +45,13 @@ jobs:
         cd ..
         python -c 'import xgboost'

-  python-tests:
+  python-tests-on-win:
     name: Test XGBoost Python package on ${{ matrix.config.os }}
     runs-on: ${{ matrix.config.os }}
     strategy:
       matrix:
         config:
-        - {os: windows-2016, compiler: 'msvc', python-version: '3.8'}
+        - {os: windows-2016, python-version: '3.8'}

     steps:
     - uses: actions/checkout@v2
@@ -62,7 +62,7 @@ jobs:
       with:
         auto-update-conda: true
         python-version: ${{ matrix.config.python-version }}
-        activate-environment: win64_test
+        activate-environment: win64_env
        environment-file: tests/ci_build/conda_env/win64_cpu_test.yml

    - name: Display Conda env
@@ -71,9 +71,8 @@ jobs:
         conda info
         conda list

-    - name: Build XGBoost with msvc
+    - name: Build XGBoost on Windows
       shell: bash -l {0}
-      if: matrix.config.compiler == 'msvc'
       run: |
         mkdir build_msvc
         cd build_msvc
@@ -92,3 +91,74 @@ jobs:
       shell: bash -l {0}
       run: |
         pytest -s -v ./tests/python
+
+  python-tests-on-macos:
+    name: Test XGBoost Python package on ${{ matrix.config.os }}
+    runs-on: ${{ matrix.config.os }}
+    strategy:
+      matrix:
+        config:
+        - {os: macos-10.15, python-version: "3.8" }
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: 'true'
+
+    - uses: conda-incubator/setup-miniconda@v2
+      with:
+        auto-update-conda: true
+        python-version: ${{ matrix.config.python-version }}
+        activate-environment: macos_test
+        environment-file: tests/ci_build/conda_env/macos_cpu_test.yml
+
+    - name: Display Conda env
+      shell: bash -l {0}
+      run: |
+        conda info
+        conda list
+
+    - name: Build XGBoost on macos
+      run: |
+        wget https://raw.githubusercontent.com/Homebrew/homebrew-core/679923b4eb48a8dc7ecc1f05d06063cd79b3fc00/Formula/libomp.rb -O $(find $(brew --repository) -name libomp.rb)
+        brew install ninja libomp
+        brew pin libomp
+
+        mkdir build
+        cd build
+        cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON
+        ninja
+
+    - name: Install Python package
+      shell: bash -l {0}
+      run: |
+        cd python-package
+        python --version
+        python setup.py bdist_wheel --universal
+        pip install ./dist/*.whl
+
+    - name: Test Python package
+      shell: bash -l {0}
+      run: |
+        pytest -s -v ./tests/python
+
+    - name: Rename Python wheel
+      shell: bash -l {0}
+      run: |
+        TAG=macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64
+        python tests/ci_build/rename_whl.py python-package/dist/*.whl ${{ github.sha }} ${TAG}
+
+    - name: Extract branch name
+      shell: bash
+      run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
+      id: extract_branch
+      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
+
+    - name: Upload Python wheel
+      shell: bash -l {0}
+      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
+      run: |
+        python -m awscli s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read
+      env:
+        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
+        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
```

(The `python-version: "3.8"` colon is restored here; the scraped page dropped it, and the inline YAML map is invalid without it.)
**.github/workflows/r_tests.yml** (vendored; 17 changed lines)

```diff
@@ -3,7 +3,7 @@ name: XGBoost-R-Tests
 on: [push, pull_request]

 env:
-  R_PACKAGES: c('XML', 'igraph', 'data.table', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm', 'devtools', 'float', 'titanic')
+  R_PACKAGES: c('XML', 'data.table', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm', 'devtools', 'float', 'titanic')
   GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

 jobs:
@@ -40,6 +40,11 @@ jobs:
         install.packages(${{ env.R_PACKAGES }},
                          repos = 'http://cloud.r-project.org',
                          dependencies = c('Depends', 'Imports', 'LinkingTo'))
+    - name: Install igraph on Windows
+      shell: Rscript {0}
+      if: matrix.config.os == 'windows-latest'
+      run: |
+        install.packages('igraph', type='binary')

     - name: Run lintr
       run: |
@@ -83,6 +88,11 @@ jobs:
         install.packages(${{ env.R_PACKAGES }},
                          repos = 'http://cloud.r-project.org',
                          dependencies = c('Depends', 'Imports', 'LinkingTo'))
+    - name: Install igraph on Windows
+      shell: Rscript {0}
+      if: matrix.config.os == 'windows-2016'
+      run: |
+        install.packages('igraph', type='binary', dependencies = c('Depends', 'Imports', 'LinkingTo'))

     - uses: actions/setup-python@v2
       with:
@@ -91,7 +101,7 @@ jobs:

     - name: Test R
       run: |
-        python tests/ci_build/test_r_package.py --compiler="${{ matrix.config.compiler }}" --build-tool="${{ matrix.config.build }}"
+        python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool='${{ matrix.config.build }}'

   test-R-CRAN:
     runs-on: ubuntu-latest
@@ -115,7 +125,7 @@ jobs:

     - name: Install system packages
       run: |
-        sudo apt-get update && sudo apt-get install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev pandoc pandoc-citeproc
+        sudo apt-get update && sudo apt-get install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev pandoc pandoc-citeproc libglpk-dev

     - name: Cache R packages
       uses: actions/cache@v2
@@ -130,6 +140,7 @@ jobs:
         install.packages(${{ env.R_PACKAGES }},
                          repos = 'http://cloud.r-project.org',
                          dependencies = c('Depends', 'Imports', 'LinkingTo'))
+        install.packages('igraph', repos = 'http://cloud.r-project.org', dependencies = c('Depends', 'Imports', 'LinkingTo'))

     - name: Check R Package
       run: |
```
**.travis.yml** (10 changed lines)

```diff
@@ -10,14 +10,6 @@ env:

 jobs:
   include:
-    - os: osx
-      arch: amd64
-      osx_image: xcode10.2
-      env: TASK=python_test
-    - os: osx
-      arch: amd64
-      osx_image: xcode10.2
-      env: TASK=java_test
     - os: linux
       arch: s390x
       env: TASK=s390x_test
@@ -33,8 +25,6 @@ addons:

 before_install:
   - source tests/travis/travis_setup_env.sh
-  - if [ "${TASK}" != "python_sdist_test" ]; then export PYTHONPATH=${PYTHONPATH}:${PWD}/python-package; fi
-  - echo "MAVEN_OPTS='-Xmx2g -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=512m -Dorg.slf4j.simpleLogger.defaultLogLevel=error'" > ~/.mavenrc

 install:
   - source tests/travis/setup.sh
```
**CMakeLists.txt**

```diff
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-project(xgboost LANGUAGES CXX C VERSION 1.5.0)
+project(xgboost LANGUAGES CXX C VERSION 1.5.2)
 include(cmake/Utils.cmake)
 list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
 cmake_policy(SET CMP0022 NEW)
@@ -135,6 +135,10 @@ if (USE_CUDA)
   set(GEN_CODE "")
   format_gencode_flags("${GPU_COMPUTE_VER}" GEN_CODE)
   add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap)
+
+  if ((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 11.4) AND (NOT BUILD_WITH_CUDA_CUB))
+    message(SEND_ERROR "`BUILD_WITH_CUDA_CUB` should be set to `ON` for CUDA >= 11.4")
+  endif ()
 endif (USE_CUDA)

 if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
```
**R-package/DESCRIPTION**

```diff
@@ -1,8 +1,8 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 1.5.0.1
-Date: 2020-08-28
+Version: 1.5.2.1
+Date: 2022-1-17
 Authors@R: c(
   person("Tianqi", "Chen", role = c("aut"),
          email = "tianqi.tchen@gmail.com"),
@@ -26,9 +26,11 @@ Authors@R: c(
   person("Min", "Lin", role = c("aut")),
   person("Yifeng", "Geng", role = c("aut")),
   person("Yutian", "Li", role = c("aut")),
+  person("Jiaming", "Yuan", role = c("aut")),
   person("XGBoost contributors", role = c("cph"),
          comment = "base XGBoost implementation")
   )
+Maintainer: Jiaming Yuan <jm.yuan@outlook.com>
 Description: Extreme Gradient Boosting, which is an efficient implementation
     of the gradient boosting framework from Chen & Guestrin (2016) <doi:10.1145/2939672.2939785>.
     This package is its R interface. The package includes efficient linear
```
**R-package/R/xgb.Booster.R**

```diff
@@ -397,6 +397,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   shape <- predts$shape
   ret <- predts$results

+  n_ret <- length(ret)
   n_row <- nrow(newdata)
   if (n_row != shape[1]) {
     stop("Incorrect predict shape.")
@@ -405,36 +406,57 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   arr <- array(data = ret, dim = rev(shape))

   cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+  n_groups <- shape[2]
+
+  ## Needed regardless of whether strict shape is being used.
   if (predcontrib) {
     dimnames(arr) <- list(cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
-    }
   } else if (predinteraction) {
     dimnames(arr) <- list(cnames, cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
-    }
   }
+  if (strict_shape) {
+    return(arr) # strict shape is calculated by libxgboost uniformly.
+  }

-  if (!strict_shape) {
-    n_groups <- shape[2]
-    if (predleaf) {
-      arr <- matrix(arr, nrow = n_row, byrow = TRUE)
-    } else if (predcontrib && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
-    } else if (predinteraction && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
-    } else if (!reshape && n_groups != 1) {
-      arr <- ret
-    } else if (reshape && n_groups != 1) {
-      arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
-    }
-    arr <- drop(arr)
-    if (length(dim(arr)) == 1) {
-      arr <- as.vector(arr)
-    } else if (length(dim(arr)) == 2) {
-      arr <- as.matrix(arr)
-    }
+  if (predleaf) {
+    ## Predict leaf
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1)
+    } else {
+      matrix(arr, nrow = n_row, byrow = TRUE)
+    }
+  } else if (predcontrib) {
+    ## Predict contribution
+    arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , ])
+    } else {
+      ## remove the first axis (group)
+      dn <- dimnames(arr)
+      matrix(arr[1, , ], nrow = dim(arr)[2], ncol = dim(arr)[3], dimnames = c(dn[2], dn[3]))
+    }
+  } else if (predinteraction) {
+    ## Predict interaction
+    arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , , ])
+    } else {
+      ## remove the first axis (group)
+      arr <- arr[1, , , , drop = FALSE]
+      array(arr, dim = dim(arr)[2:4], dimnames(arr)[2:4])
+    }
+  } else {
+    ## Normal prediction
+    arr <- if (reshape && n_groups != 1) {
+      matrix(arr, ncol = n_groups, byrow = TRUE)
+    } else {
+      as.vector(ret)
+    }
   }
   return(arr)
```
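The reworked dispatch above changes what `predict()` returns per prediction mode. A minimal sketch of the resulting shapes, assuming the bundled `agaricus.train` demo data (not part of this compare):

```r
library(xgboost)

data(agaricus.train, package = "xgboost")
bst <- xgboost(
  data = agaricus.train$data, label = agaricus.train$label,
  max_depth = 2, nrounds = 2, objective = "binary:logistic"
)

# Normal prediction: a plain vector, one value per row.
p <- predict(bst, agaricus.train$data)
str(p)

# Contributions: a [row, feature + 1] matrix; "BIAS" is the last column.
contrib <- predict(bst, agaricus.train$data, predcontrib = TRUE)
dim(contrib)

# strict_shape = TRUE returns the untransformed array computed by libxgboost.
arr <- predict(bst, agaricus.train$data, strict_shape = TRUE)
dim(arr)
```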
**R-package/R/xgb.DMatrix.R**

```diff
@@ -11,6 +11,7 @@
 #' @param missing a float value to represents missing values in data (used only when input is a dense matrix).
 #'        It is useful when a 0 or some other extreme value represents missing values in data.
 #' @param silent whether to suppress printing an informational message after loading from a file.
+#' @param nthread Number of threads used for creating DMatrix.
 #' @param ... the \code{info} data could be passed directly as parameters, without creating an \code{info} list.
 #'
 #' @examples
```
**R-package/R/xgb.create.features.R**

```diff
@@ -18,7 +18,7 @@
 #'
 #' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
 #'
-#' \url{https://research.fb.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
+#' \url{https://research.facebook.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
 #'
 #' Extract explaining the method:
 #'
```
**R-package/R/xgb.dump.R**

```diff
@@ -6,8 +6,6 @@
 #' @param fname the name of the text file where to save the model text dump.
 #'        If not provided or set to \code{NULL}, the model is returned as a \code{character} vector.
 #' @param fmap feature map file representing feature types.
-#'        Detailed description could be found at
-#'        \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
 #'        See demo/ for walkthrough example in R, and
 #'        \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
 #'        for example Format.
```
**R-package/R/xgb.importance.R**

```diff
@@ -115,14 +115,14 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
   } else {
     concatenated <- list()
     output_names <- vector()
-    for (importance_type in c("weight", "gain", "cover")) {
-      args <- list(importance_type = importance_type, feature_names = feature_names)
+    for (importance_type in c("weight", "total_gain", "total_cover")) {
+      args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
       results <- .Call(
         XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
       )
       names(results) <- c("features", "shape", importance_type)
       concatenated[
-        switch(importance_type, "weight" = "Frequency", "gain" = "Gain", "cover" = "Cover")
+        switch(importance_type, "weight" = "Frequency", "total_gain" = "Gain", "total_cover" = "Cover")
       ] <- results[importance_type]
       output_names <- results$features
     }
```
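Since the loop above now queries `total_gain`/`total_cover` and forwards `trees`, per-tree importance subsets can be computed as in this hedged sketch (the agaricus demo data and tree indices are illustrative, not taken from this compare):

```r
library(xgboost)

data(agaricus.train, package = "xgboost")
bst <- xgboost(
  data = agaricus.train$data, label = agaricus.train$label,
  max_depth = 2, nrounds = 4, objective = "binary:logistic"
)

# Importance over all trees; Gain/Cover now come from total_gain/total_cover.
xgb.importance(model = bst)

# Importance restricted to the first two trees (0-based indices).
xgb.importance(model = bst, trees = c(0, 1))
```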
**R-package/R/xgboost.R**

```diff
@@ -9,8 +9,8 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
                     early_stopping_rounds = NULL, maximize = NULL,
                     save_period = NULL, save_name = "xgboost.model",
                     xgb_model = NULL, callbacks = list(), ...) {
-  dtrain <- xgb.get.DMatrix(data, label, missing, weight, nthread = params$nthread)
+  merged <- check.booster.params(params, ...)
+  dtrain <- xgb.get.DMatrix(data, label, missing, weight, nthread = merged$nthread)

   watchlist <- list(train = dtrain)

```
**R-package/man/xgb.DMatrix.Rd**

```diff
@@ -4,7 +4,14 @@
 \alias{xgb.DMatrix}
 \title{Construct xgb.DMatrix object}
 \usage{
-xgb.DMatrix(data, info = list(), missing = NA, silent = FALSE, ...)
+xgb.DMatrix(
+  data,
+  info = list(),
+  missing = NA,
+  silent = FALSE,
+  nthread = NULL,
+  ...
+)
 }
 \arguments{
 \item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, or a character
@@ -18,6 +25,8 @@ It is useful when a 0 or some other extreme value represents missing values in d

 \item{silent}{whether to suppress printing an informational message after loading from a file.}

+\item{nthread}{Number of threads used for creating DMatrix.}
+
 \item{...}{the \code{info} data could be passed directly as parameters, without creating an \code{info} list.}
 }
 \description{
```
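For the new `nthread` argument documented above, construction might look like the following sketch (the demo data and thread count are illustrative; `nthread = NULL` keeps the default):

```r
library(xgboost)

data(agaricus.train, package = "xgboost")
# Build the DMatrix using 2 threads.
dtrain <- xgb.DMatrix(
  agaricus.train$data,
  label = agaricus.train$label,
  nthread = 2
)
```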
**R-package/man/xgb.create.features.Rd**

```diff
@@ -29,7 +29,7 @@ Joaquin Quinonero Candela)}

 International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014

-\url{https://research.fb.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
+\url{https://research.facebook.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.

 Extract explaining the method:

```
**R-package/man/xgb.dump.Rd**

```diff
@@ -20,8 +20,6 @@ xgb.dump(
 If not provided or set to \code{NULL}, the model is returned as a \code{character} vector.}

 \item{fmap}{feature map file representing feature types.
-Detailed description could be found at
-\url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
 See demo/ for walkthrough example in R, and
 \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
 for example Format.}
```
**R-package/tests/testthat/test_glm.R**

```diff
@@ -46,3 +46,31 @@ test_that("gblinear works", {
   expect_equal(dim(h), c(n, ncol(dtrain) + 1))
   expect_s4_class(h, "dgCMatrix")
 })
+
+test_that("gblinear early stopping works", {
+  data(agaricus.train, package = 'xgboost')
+  data(agaricus.test, package = 'xgboost')
+  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+  dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+
+  param <- list(
+    objective = "binary:logistic", eval_metric = "error", booster = "gblinear",
+    nthread = 2, eta = 0.8, alpha = 0.0001, lambda = 0.0001,
+    updater = "coord_descent"
+  )
+
+  es_round <- 1
+  n <- 10
+  booster <- xgb.train(
+    param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round
+  )
+  expect_equal(booster$best_iteration, 5)
+  predt_es <- predict(booster, dtrain)
+
+  n <- booster$best_iteration + es_round
+  booster <- xgb.train(
+    param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round
+  )
+  predt <- predict(booster, dtrain)
+  expect_equal(predt_es, predt)
+})
```
**R-package/tests/testthat/test_helpers.R**

```diff
@@ -1,3 +1,4 @@
+library(testthat)
 context('Test helper functions')

 require(xgboost)
@@ -227,7 +228,7 @@ if (grepl('Windows', Sys.info()[['sysname']]) ||
   X <- 10^runif(100, -20, 20)
   if (capabilities('long.double')) {
     X2X <- as.numeric(format(X, digits = 17))
-    expect_identical(X, X2X)
+    expect_equal(X, X2X, tolerance = float_tolerance)
   }
   # retrieved attributes to be the same as written
   for (x in X) {
@@ -310,7 +311,35 @@ test_that("xgb.importance works with and without feature names", {
   # for multiclass
   imp.Tree <- xgb.importance(model = mbst.Tree)
   expect_equal(dim(imp.Tree), c(4, 4))
-  xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds))
+
+  trees <- seq(from = 0, by = 2, length.out = 2)
+  importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
+
+  importance_from_dump <- function() {
+    model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
+    imp <- xgb.model.dt.tree(
+      feature_names = feature.names,
+      text = model_text_dump,
+      trees = trees
+    )[
+      Feature != "Leaf", .(
+        Gain = sum(Quality),
+        Cover = sum(Cover),
+        Frequency = .N
+      ),
+      by = Feature
+    ][
+      , `:=`(
+        Gain = Gain / sum(Gain),
+        Cover = Cover / sum(Cover),
+        Frequency = Frequency / sum(Frequency)
+      )
+    ][
+      order(Gain, decreasing = TRUE)
+    ]
+    imp
+  }
+  expect_equal(importance_from_dump(), importance, tolerance = 1e-6)
 })

 test_that("xgb.importance works with GLM model", {
```
**R-package/tests/testthat/test_interactions.R**

```diff
@@ -157,3 +157,28 @@ test_that("multiclass feature interactions work", {
   # sums WRT columns must be close to feature contributions
   expect_lt(max(abs(apply(intr, c(1, 2, 3), sum) - aperm(cont, c(3, 1, 2)))), 0.00001)
 })
+
+
+test_that("SHAP single sample works", {
+  train <- agaricus.train
+  test <- agaricus.test
+  booster <- xgboost(
+    data = train$data,
+    label = train$label,
+    max_depth = 2,
+    nrounds = 4,
+    objective = "binary:logistic"
+  )
+
+  predt <- predict(
+    booster,
+    newdata = train$data[1, , drop = FALSE], predcontrib = TRUE
+  )
+  expect_equal(dim(predt), c(1, dim(train$data)[2] + 1))
+
+  predt <- predict(
+    booster,
+    newdata = train$data[1, , drop = FALSE], predinteraction = TRUE
+  )
+  expect_equal(dim(predt), c(1, dim(train$data)[2] + 1, dim(train$data)[2] + 1))
+})
```

(A stray trailing comma after the `objective` argument in the scraped page is dropped here, since it would make the call invalid R.)
**R-package/vignettes/discoverYourData.Rmd**

```diff
@@ -138,7 +138,7 @@ levels(df[,Treatment])

 Next step, we will transform the categorical data to dummy variables.
 Several encoding methods exist, e.g., [one-hot encoding](https://en.wikipedia.org/wiki/One-hot) is a common approach.
-We will use the [dummy contrast coding](https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/) which is popular because it produces "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)).
+We will use the [dummy contrast coding](https://stats.oarc.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/) which is popular because it produces "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)).

 The purpose is to transform each value of each *categorical* feature into a *binary* feature `{0, 1}`.

```
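In the vignette, this dummy contrast coding is typically done with a sparse model matrix; a minimal sketch, assuming a data frame `df` with an `Improved` outcome column as in the surrounding vignette text:

```r
library(Matrix)
# Expand each categorical feature into binary indicator columns;
# "- 1" drops the intercept so the encoding stays full rank.
sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df)
head(sparse_matrix)
```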
**cmake/Python_version.in**

```diff
@@ -1 +1 @@
-@xgboost_VERSION_MAJOR@.@xgboost_VERSION_MINOR@.@xgboost_VERSION_PATCH@-dev
+@xgboost_VERSION_MAJOR@.@xgboost_VERSION_MINOR@.@xgboost_VERSION_PATCH@
```
**doc/build.rst**

```diff
@@ -148,7 +148,8 @@ From the command line on Linux starting from the XGBoost directory:

   mkdir build
   cd build
-  cmake .. -DUSE_CUDA=ON
+  # For CUDA toolkit >= 11.4, `BUILD_WITH_CUDA_CUB` is required.
+  cmake .. -DUSE_CUDA=ON -DBUILD_WITH_CUDA_CUB=ON
   make -j4

 .. note:: Specifying compute capability
```
**doc/contrib/release.rst**

```diff
@@ -18,7 +18,7 @@ Making a Release

 1. Create an issue for the release, noting the estimated date and expected features or major fixes, pin that issue.
 2. Bump release version.
-   1. Modify ``CMakeLists.txt`` source tree, run CMake.
+   1. Modify ``CMakeLists.txt`` in source tree and ``cmake/Python_version.in`` if needed, run CMake.
    2. Modify ``DESCRIPTION`` in R-package.
    3. Run ``change_version.sh`` in ``jvm-packages/dev``
 3. Commit the change, create a PR on GitHub on release branch. Port the bumped version to default branch, optionally with the postfix ``SNAPSHOT``.
```
**doc/gpu/index.rst**

```diff
@@ -95,13 +95,13 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as
   shap_interaction_values = model.predict(dtrain, pred_interactions=True)

 See examples `here
-<https://github.com/dmlc/xgboost/tree/master/demo/gpu_acceleration>`_.
+<https://github.com/dmlc/xgboost/tree/master/demo/gpu_acceleration>`__.

 Multi-node Multi-GPU Training
 =============================
 XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_. For
 getting started see our tutorial :doc:`/tutorials/dask` and worked examples `here
-<https://github.com/dmlc/xgboost/tree/master/demo/dask>`_, also Python documentation
+<https://github.com/dmlc/xgboost/tree/master/demo/dask>`__, also Python documentation
 :ref:`dask_api` for complete reference.

@@ -238,7 +238,7 @@ Working memory is allocated inside the algorithm proportional to the number of r

 The quantile finding algorithm also uses some amount of working device memory. It is able to operate in batches, but is not currently well optimised for sparse data.

-If you are getting out-of-memory errors on a big dataset, try the `external memory version <../tutorials/external_memory.html>`_.
+If you are getting out-of-memory errors on a big dataset, try the :doc:`external memory version </tutorials/external_memory>`.

 Developer notes
 ===============
```
**doc/jvm/xgboost4j_spark_tutorial.rst**

```diff
@@ -79,7 +79,7 @@ The first thing in data transformation is to load the dataset as Spark's structu
       StructField("class", StringType, true)))
   val rawInput = spark.read.schema(schema).csv("input_path")

-At the first line, we create a instance of `SparkSession <http://spark.apache.org/docs/latest/sql-programming-guide.html#starting-point-sparksession>`_ which is the entry of any Spark program working with DataFrame. The ``schema`` variable defines the schema of DataFrame wrapping Iris data. With this explicitly set schema, we can define the columns' name as well as their types; otherwise the column name would be the default ones derived by Spark, such as ``_col0``, etc. Finally, we can use Spark's built-in csv reader to load Iris csv file as a DataFrame named ``rawInput``.
+At the first line, we create a instance of `SparkSession <https://spark.apache.org/docs/latest/sql-getting-started.html#starting-point-sparksession>`_ which is the entry of any Spark program working with DataFrame. The ``schema`` variable defines the schema of DataFrame wrapping Iris data. With this explicitly set schema, we can define the columns' name as well as their types; otherwise the column name would be the default ones derived by Spark, such as ``_col0``, etc. Finally, we can use Spark's built-in csv reader to load Iris csv file as a DataFrame named ``rawInput``.

 Spark also contains many built-in readers for other format. The latest version of Spark supports CSV, JSON, Parquet, and LIBSVM.

@@ -130,7 +130,7 @@ labels. A DataFrame like this (containing vector-represented features and numeri
 Dealing with missing values
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~

-XGBoost supports missing values by default (`as desribed here <https://xgboost.readthedocs.io/en/latest/faq.html#how-to-deal-with-missing-value>`_).
+XGBoost supports missing values by default (`as desribed here <https://xgboost.readthedocs.io/en/latest/faq.html#how-to-deal-with-missing-values>`_).
 If given a SparseVector, XGBoost will treat any values absent from the SparseVector as missing. You are also able to
 specify to XGBoost to treat a specific value in your Dataset as if it was a missing value. By default XGBoost will treat NaN as the value representing missing.

@@ -369,7 +369,7 @@ Then we can load this model with single node Python XGBoost:

 When interacting with other language bindings, XGBoost also supports saving-models-to and loading-models-from file systems other than the local one. You can use HDFS and S3 by prefixing the path with ``hdfs://`` and ``s3://`` respectively. However, for this capability, you must do **one** of the following:

-1. Build XGBoost4J-Spark with the steps described in `here <https://xgboost.readthedocs.io/en/latest/jvm/index.html#installation-from-source>`_, but turning `USE_HDFS <https://github.com/dmlc/xgboost/blob/e939192978a0c152ad7b49b744630e99d54cffa8/jvm-packages/create_jni.py#L18>`_ (or USE_S3, etc. in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with a HDFS path.
+1. Build XGBoost4J-Spark with the steps described in :ref:`here <install_jvm_packages>`, but turning `USE_HDFS <https://github.com/dmlc/xgboost/blob/e939192978a0c152ad7b49b744630e99d54cffa8/jvm-packages/create_jni.py#L18>`_ (or USE_S3, etc. in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with a HDFS path.

    - However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, choose the other option.

```
**doc/parameter.rst**

```diff
@@ -366,8 +366,8 @@ Specify the learning task and the corresponding learning objective. The objectiv
   - ``rank:pairwise``: Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized
   - ``rank:ndcg``: Use LambdaMART to perform list-wise ranking where `Normalized Discounted Cumulative Gain (NDCG) <http://en.wikipedia.org/wiki/NDCG>`_ is maximized
   - ``rank:map``: Use LambdaMART to perform list-wise ranking where `Mean Average Precision (MAP) <http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision>`_ is maximized
-  - ``reg:gamma``: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be `gamma-distributed <https://en.wikipedia.org/wiki/Gamma_distribution#Applications>`_.
-  - ``reg:tweedie``: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be `Tweedie-distributed <https://en.wikipedia.org/wiki/Tweedie_distribution#Applications>`_.
+  - ``reg:gamma``: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be `gamma-distributed <https://en.wikipedia.org/wiki/Gamma_distribution#Occurrence_and_applications>`_.
+  - ``reg:tweedie``: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be `Tweedie-distributed <https://en.wikipedia.org/wiki/Tweedie_distribution#Occurrence_and_applications>`_.

 * ``base_score`` [default=0.5]

@@ -390,7 +390,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
   - ``error@t``: a different than 0.5 binary classification threshold value could be specified by providing a numerical value through 't'.
   - ``merror``: Multiclass classification error rate. It is calculated as ``#(wrong cases)/#(all cases)``.
   - ``mlogloss``: `Multiclass logloss <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html>`_.
-  - ``auc``: `Receiver Operating Characteristic Area under the Curve <http://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve>`_.
+  - ``auc``: `Receiver Operating Characteristic Area under the Curve <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>`_.
     Available for classification and learning-to-rank tasks.

     - When used with binary classification, the objective should be ``binary:logistic`` or similar functions that work on probability.
```
**doc/prediction.rst**

```diff
@@ -32,8 +32,8 @@ After 1.4 release, we added a new parameter called ``strict_shape``, one can set
 - When using ``output_margin`` to avoid transformation and ``strict_shape`` is set to ``True``:

   Similar to the previous case, output is a 2-dim array, except for that ``multi:softmax``
-  has equivalent output of ``multi:softprob`` due to dropped transformation. If strict
-  shape is set to False then output can have 1 or 2 dim depending on used model.
+  has equivalent output shape of ``multi:softprob`` due to dropped transformation. If
+  strict shape is set to False then output can have 1 or 2 dim depending on used model.

 - When using ``preds_contribs`` with ``strict_shape`` set to ``True``:

```
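The same `strict_shape` switch exists in the R binding's `predict()` (as the reworked `predict.xgb.Booster` earlier in this compare shows). A hedged multiclass sketch with synthetic data, not taken from the docs themselves:

```r
library(xgboost)

set.seed(0)
x <- matrix(rnorm(100 * 4), 100, 4)
y <- sample(0:2, 100, replace = TRUE)
bst <- xgboost(
  data = x, label = y, nrounds = 4,
  objective = "multi:softprob", num_class = 3
)

# Default: with reshape, an n_samples x n_classes matrix for multi:softprob.
dim(predict(bst, x, reshape = TRUE))

# strict_shape = TRUE: the raw (class, row) array layout computed by
# libxgboost, regardless of model type.
dim(predict(bst, x, strict_shape = TRUE))
```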
**doc/tutorials/kubernetes.rst**

```diff
@@ -11,7 +11,7 @@ In order to run a XGBoost job in a Kubernetes cluster, perform the following ste

 1. Install XGBoost Operator on the Kubernetes cluster.

-   a. XGBoost Operator is designed to manage the scheduling and monitoring of XGBoost jobs. Follow `this installation guide <https://github.com/kubeflow/xgboost-operator#installing-xgboost-operator>`_ to install XGBoost Operator.
+   a. XGBoost Operator is designed to manage the scheduling and monitoring of XGBoost jobs. Follow `this installation guide <https://github.com/kubeflow/xgboost-operator#install-xgboost-operator>`_ to install XGBoost Operator.

 2. Write application code that will be executed by the XGBoost Operator.

```
**doc/tutorials/saving_model.rst**

```diff
@@ -227,15 +227,15 @@ XGBoost has a function called ``dump_model`` in Booster object, which lets you t
 the model in a readable format like ``text``, ``json`` or ``dot`` (graphviz). The primary
 use case for it is for model interpretation or visualization, and is not supposed to be
 loaded back to XGBoost. The JSON version has a `schema
-<https://github.com/dmlc/xgboost/blob/master/doc/dump.schema>`_. See next section for
+<https://github.com/dmlc/xgboost/blob/master/doc/dump.schema>`__. See next section for
 more info.

 ***********
 JSON Schema
 ***********

-Another important feature of JSON format is a documented `Schema
-<https://json-schema.org/>`_, based on which one can easily reuse the output model from
+Another important feature of JSON format is a documented `schema
+<https://json-schema.org/>`__, based on which one can easily reuse the output model from
 XGBoost. Here is the initial draft of JSON schema for the output model (not
 serialization, which will not be stable as noted above). It's subject to change due to
 the beta status. For an example of parsing XGBoost tree model, see ``/demo/json-model``.
```
**include/xgboost/data.h**

```diff
@@ -211,7 +211,7 @@ struct Entry {
  */
 struct BatchParam {
   /*! \brief The GPU device to use. */
-  int gpu_id;
+  int gpu_id {-1};
   /*! \brief Maximum number of bins per feature for histograms. */
   int max_bin{0};
   /*! \brief Hessian, used for sketching with future approx implementation. */
```
**include/xgboost/gbm.h**

```diff
@@ -183,6 +183,7 @@ class GradientBooster : public Model, public Configurable {
                          std::string format) const = 0;

   virtual void FeatureScore(std::string const& importance_type,
+                            common::Span<int32_t const> trees,
                             std::vector<bst_feature_t>* features,
                             std::vector<float>* scores) const = 0;
   /*!
```
**include/xgboost/learner.h**

```diff
@@ -156,6 +156,7 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
    * \brief Calculate feature score. See doc in C API for outputs.
    */
   virtual void CalcFeatureScore(std::string const& importance_type,
+                                common::Span<int32_t const> trees,
                                 std::vector<bst_feature_t>* features,
                                 std::vector<float>* scores) = 0;

```
**include/xgboost/version_config.h**

```diff
@@ -6,6 +6,6 @@

 #define XGBOOST_VER_MAJOR 1
 #define XGBOOST_VER_MINOR 5
-#define XGBOOST_VER_PATCH 0
+#define XGBOOST_VER_PATCH 2

 #endif  // XGBOOST_VERSION_CONFIG_H_
```
**jvm-packages/pom.xml**

```diff
@@ -6,7 +6,7 @@

     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.5.2</version>
     <packaging>pom</packaging>
     <name>XGBoost JVM Package</name>
     <description>JVM Package for XGBoost</description>
```
**jvm-packages/xgboost4j-example/pom.xml**

```diff
@@ -6,10 +6,10 @@
     <parent>
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.5.0-SNAPSHOT</version>
+        <version>1.5.2</version>
     </parent>
     <artifactId>xgboost4j-example_2.12</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.5.2</version>
     <packaging>jar</packaging>
     <build>
         <plugins>
@@ -26,7 +26,7 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
-            <version>1.5.0-SNAPSHOT</version>
+            <version>1.5.2</version>
         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
-            <version>1.5.0-SNAPSHOT</version>
+            <version>1.5.2</version>
         </dependency>
         <dependency>
             <groupId>org.apache.commons</groupId>
```
**jvm-packages/xgboost4j-flink/pom.xml**

```diff
@@ -6,10 +6,10 @@
     <parent>
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.5.0-SNAPSHOT</version>
+        <version>1.5.2</version>
     </parent>
     <artifactId>xgboost4j-flink_2.12</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.5.2</version>
     <build>
         <plugins>
             <plugin>
@@ -26,7 +26,7 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-            <version>1.5.0-SNAPSHOT</version>
+            <version>1.5.2</version>
         </dependency>
         <dependency>
             <groupId>org.apache.commons</groupId>
```
**jvm-packages/xgboost4j-gpu/pom.xml**

```diff
@@ -6,10 +6,10 @@
     <parent>
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.5.0-SNAPSHOT</version>
+        <version>1.5.2</version>
     </parent>
     <artifactId>xgboost4j-gpu_2.12</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.5.2</version>
     <packaging>jar</packaging>

     <properties>
```
**jvm-packages/xgboost4j-spark-gpu/pom.xml**

```diff
@@ -6,7 +6,7 @@
     <parent>
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.5.0-SNAPSHOT</version>
+        <version>1.5.2</version>
     </parent>
     <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
     <build>
@@ -24,7 +24,7 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
-            <version>1.5.0-SNAPSHOT</version>
+            <version>1.5.2</version>
         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
```
**jvm-packages/xgboost4j-spark/pom.xml**

```diff
@@ -6,7 +6,7 @@
     <parent>
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.5.0-SNAPSHOT</version>
+        <version>1.5.2</version>
     </parent>
     <artifactId>xgboost4j-spark_2.12</artifactId>
     <build>
@@ -24,7 +24,7 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-            <version>1.5.0-SNAPSHOT</version>
+            <version>1.5.2</version>
         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
```
@ -17,11 +17,13 @@
|
|||||||
package ml.dmlc.xgboost4j.scala.spark.params
|
package ml.dmlc.xgboost4j.scala.spark.params
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path
|
import org.apache.hadoop.fs.Path
|
||||||
|
|
||||||
import org.apache.spark.SparkContext
|
import org.apache.spark.SparkContext
|
||||||
import org.apache.spark.ml.param.{ParamPair, Params}
|
import org.apache.spark.ml.param.{ParamPair, Params}
|
||||||
import org.json4s.JsonDSL._
|
|
||||||
import org.json4s.jackson.JsonMethods._
|
import org.json4s.jackson.JsonMethods._
|
||||||
import org.json4s.{JObject, _}
|
import org.json4s.{JArray, JBool, JDouble, JField, JInt, JNothing, JObject, JString, JValue}
|
||||||
|
|
||||||
|
import JsonDSLXGBoost._
|
||||||
|
|
||||||
// This originates from apache-spark DefaultPramsWriter copy paste
|
// This originates from apache-spark DefaultPramsWriter copy paste
|
||||||
private[spark] object DefaultXGBoostParamsWriter {
|
private[spark] object DefaultXGBoostParamsWriter {
|
||||||
@ -87,3 +89,62 @@ private[spark] object DefaultXGBoostParamsWriter {
|
|||||||
metadataJson
|
metadataJson
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fix json4s bin-incompatible issue.
|
||||||
|
// This originates from org.json4s.JsonDSL of 3.6.6
|
||||||
|
object JsonDSLXGBoost {
|
||||||
|
|
||||||
|
implicit def seq2jvalue[A](s: Iterable[A])(implicit ev: A => JValue): JArray =
|
||||||
|
JArray(s.toList.map(ev))
|
||||||
|
|
||||||
|
implicit def map2jvalue[A](m: Map[String, A])(implicit ev: A => JValue): JObject =
|
||||||
|
JObject(m.toList.map { case (k, v) => JField(k, ev(v)) })
|
||||||
|
|
||||||
|
implicit def option2jvalue[A](opt: Option[A])(implicit ev: A => JValue): JValue = opt match {
|
||||||
|
case Some(x) => ev(x)
|
||||||
|
case None => JNothing
|
||||||
|
}
|
||||||
|
|
||||||
|
implicit def short2jvalue(x: Short): JValue = JInt(x)
|
||||||
|
implicit def byte2jvalue(x: Byte): JValue = JInt(x)
|
||||||
|
implicit def char2jvalue(x: Char): JValue = JInt(x)
|
||||||
|
implicit def int2jvalue(x: Int): JValue = JInt(x)
|
||||||
|
implicit def long2jvalue(x: Long): JValue = JInt(x)
|
||||||
|
implicit def bigint2jvalue(x: BigInt): JValue = JInt(x)
|
||||||
|
implicit def double2jvalue(x: Double): JValue = JDouble(x)
|
||||||
|
implicit def float2jvalue(x: Float): JValue = JDouble(x.toDouble)
|
||||||
|
implicit def bigdecimal2jvalue(x: BigDecimal): JValue = JDouble(x.doubleValue)
|
||||||
|
implicit def boolean2jvalue(x: Boolean): JValue = JBool(x)
|
||||||
|
implicit def string2jvalue(x: String): JValue = JString(x)
|
||||||
|
|
||||||
|
implicit def symbol2jvalue(x: Symbol): JString = JString(x.name)
|
||||||
|
implicit def pair2jvalue[A](t: (String, A))(implicit ev: A => JValue): JObject =
|
||||||
|
JObject(List(JField(t._1, ev(t._2))))
|
||||||
|
implicit def list2jvalue(l: List[JField]): JObject = JObject(l)
|
||||||
|
implicit def jobject2assoc(o: JObject): JsonListAssoc = new JsonListAssoc(o.obj)
|
||||||
|
implicit def pair2Assoc[A](t: (String, A))(implicit ev: A => JValue): JsonAssoc[A] =
|
||||||
|
new JsonAssoc(t)
|
||||||
|
}
|
||||||
|
|
||||||
|
final class JsonAssoc[A](private val left: (String, A)) extends AnyVal {
|
||||||
|
def ~[B](right: (String, B))(implicit ev1: A => JValue, ev2: B => JValue): JObject = {
|
||||||
|
val l: JValue = ev1(left._2)
|
||||||
|
val r: JValue = ev2(right._2)
|
||||||
|
JObject(JField(left._1, l) :: JField(right._1, r) :: Nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
def ~(right: JObject)(implicit ev: A => JValue): JObject = {
|
||||||
|
val l: JValue = ev(left._2)
|
||||||
|
JObject(JField(left._1, l) :: right.obj)
|
||||||
|
}
|
||||||
|
def ~~[B](right: (String, B))(implicit ev1: A => JValue, ev2: B => JValue): JObject =
|
||||||
|
this.~(right)
|
||||||
|
def ~~(right: JObject)(implicit ev: A => JValue): JObject = this.~(right)
|
||||||
|
}
|
||||||
|
|
||||||
|
final class JsonListAssoc(private val left: List[JField]) extends AnyVal {
|
||||||
|
def ~(right: (String, JValue)): JObject = JObject(left ::: List(JField(right._1, right._2)))
|
||||||
|
def ~(right: JObject): JObject = JObject(left ::: right.obj)
|
||||||
|
def ~~(right: (String, JValue)): JObject = this.~(right)
|
||||||
|
def ~~(right: JObject): JObject = this.~(right)
|
||||||
|
}
|
||||||
|
@@ -6,10 +6,10 @@
 <parent>
   <groupId>ml.dmlc</groupId>
   <artifactId>xgboost-jvm_2.12</artifactId>
-  <version>1.5.0-SNAPSHOT</version>
+  <version>1.5.2</version>
 </parent>
 <artifactId>xgboost4j_2.12</artifactId>
-<version>1.5.0-SNAPSHOT</version>
+<version>1.5.2</version>
 <packaging>jar</packaging>

 <dependencies>
@@ -4,6 +4,7 @@ import shutil
 import subprocess
 import logging
 import distutils
+from typing import Optional, List
 import sys
 from platform import system
 from setuptools import setup, find_packages, Extension
@@ -36,7 +37,7 @@ NEED_CLEAN_FILE = set()
 BUILD_TEMP_DIR = None


-def lib_name():
+def lib_name() -> str:
     '''Return platform dependent shared object name.'''
     if system() == 'Linux' or system().upper().endswith('BSD'):
         name = 'libxgboost.so'
@@ -47,13 +48,13 @@ def lib_name():
     return name


-def copy_tree(src_dir, target_dir):
+def copy_tree(src_dir: str, target_dir: str) -> None:
     '''Copy source tree into build directory.'''
-    def clean_copy_tree(src, dst):
+    def clean_copy_tree(src: str, dst: str) -> None:
         distutils.dir_util.copy_tree(src, dst)
         NEED_CLEAN_TREE.add(os.path.abspath(dst))

-    def clean_copy_file(src, dst):
+    def clean_copy_file(src: str, dst: str) -> None:
         distutils.file_util.copy_file(src, dst)
         NEED_CLEAN_FILE.add(os.path.abspath(dst))

@@ -77,7 +78,7 @@ def copy_tree(src_dir, target_dir):
     clean_copy_file(lic, os.path.join(target_dir, 'LICENSE'))


-def clean_up():
+def clean_up() -> None:
     '''Removed copied files.'''
     for path in NEED_CLEAN_TREE:
         shutil.rmtree(path)
@@ -87,7 +88,7 @@ def clean_up():

 class CMakeExtension(Extension):  # pylint: disable=too-few-public-methods
     '''Wrapper for extension'''
-    def __init__(self, name):
+    def __init__(self, name: str) -> None:
         super().__init__(name=name, sources=[])


@@ -97,7 +98,14 @@ class BuildExt(build_ext.build_ext):  # pylint: disable=too-many-ancestors
     logger = logging.getLogger('XGBoost build_ext')

     # pylint: disable=too-many-arguments,no-self-use
-    def build(self, src_dir, build_dir, generator, build_tool=None, use_omp=1):
+    def build(
+        self,
+        src_dir: str,
+        build_dir: str,
+        generator: str,
+        build_tool: Optional[str] = None,
+        use_omp: int = 1,
+    ) -> None:
         '''Build the core library with CMake.'''
         cmake_cmd = ['cmake', src_dir, generator]

@@ -116,13 +124,14 @@ class BuildExt(build_ext.build_ext):  # pylint: disable=too-many-ancestors

         if system() != 'Windows':
             nproc = os.cpu_count()
+            assert build_tool is not None
             subprocess.check_call([build_tool, '-j' + str(nproc)],
                                   cwd=build_dir)
         else:
             subprocess.check_call(['cmake', '--build', '.',
                                    '--config', 'Release'], cwd=build_dir)

-    def build_cmake_extension(self):
+    def build_cmake_extension(self) -> None:
         '''Configure and build using CMake'''
         if USER_OPTIONS['use-system-libxgboost'][2]:
             self.logger.info('Using system libxgboost.')
@@ -174,14 +183,14 @@ class BuildExt(build_ext.build_ext):  # pylint: disable=too-many-ancestors
             self.logger.warning('Disabling OpenMP support.')
             self.build(src_dir, build_dir, gen, build_tool, use_omp=0)

-    def build_extension(self, ext):
+    def build_extension(self, ext: Extension) -> None:
         '''Override the method for dispatching.'''
         if isinstance(ext, CMakeExtension):
             self.build_cmake_extension()
         else:
             super().build_extension(ext)

-    def copy_extensions_to_source(self):
+    def copy_extensions_to_source(self) -> None:
         '''Dummy override. Invoked during editable installation. Our binary
         should available in `lib`.

@@ -196,7 +205,7 @@ class Sdist(sdist.sdist):  # pylint: disable=too-many-ancestors
     '''Copy c++ source into Python directory.'''
     logger = logging.getLogger('xgboost sdist')

-    def run(self):
+    def run(self) -> None:
         copy_tree(os.path.join(CURRENT_DIR, os.path.pardir),
                   os.path.join(CURRENT_DIR, 'xgboost'))
         libxgboost = os.path.join(
@@ -213,7 +222,7 @@ class InstallLib(install_lib.install_lib):
     '''Copy shared object into installation directory.'''
     logger = logging.getLogger('xgboost install_lib')

-    def install(self):
+    def install(self) -> List[str]:
         outfiles = super().install()

         if USER_OPTIONS['use-system-libxgboost'][2] != 0:
@@ -255,7 +264,7 @@ class Install(install.install):  # pylint: disable=too-many-instance-attributes
     user_options = install.install.user_options + list(
         (k, v[0], v[1]) for k, v in USER_OPTIONS.items())

-    def initialize_options(self):
+    def initialize_options(self) -> None:
         super().initialize_options()
         self.use_openmp = 1
         self.use_cuda = 0
@@ -271,7 +280,7 @@ class Install(install.install):  # pylint: disable=too-many-instance-attributes

         self.use_system_libxgboost = 0

-    def run(self):
+    def run(self) -> None:
         # setuptools will configure the options according to user supplied command line
         # arguments, then here we propagate them into `USER_OPTIONS` for visibility to
         # other sub-commands like `build_ext`.
@@ -341,7 +350,9 @@ if __name__ == '__main__':
             'Programming Language :: Python :: 3',
             'Programming Language :: Python :: 3.6',
             'Programming Language :: Python :: 3.7',
-            'Programming Language :: Python :: 3.8'],
+            'Programming Language :: Python :: 3.8',
+            'Programming Language :: Python :: 3.9',
+            'Programming Language :: Python :: 3.10'],
         python_requires='>=3.6',
         url='https://github.com/dmlc/xgboost')
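The annotations above lean on one non-obvious idiom: `assert build_tool is not None` is not just a runtime guard, it narrows `Optional[str]` to `str` for static checkers once the signatures are typed. A minimal sketch of that narrowing, assuming a mypy-style checker runs over setup.py (the function name here is invented for illustration):

    from typing import Optional

    def run_build(build_tool: Optional[str]) -> None:
        # The assert documents that non-Windows callers always pass a build
        # tool, and narrows Optional[str] to str for the type checker.
        assert build_tool is not None
        print(" ".join([build_tool, "-j4"]))

    run_build("ninja")  # hypothetical invocation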
@@ -1 +1 @@
-1.5.0-dev
+1.5.2
@@ -229,7 +229,7 @@ def _numpy2ctypes_type(dtype):
     }
     if np.intc is not np.int32:  # Windows
         _NUMPY_TO_CTYPES_MAPPING[np.intc] = _NUMPY_TO_CTYPES_MAPPING[np.int32]
-    if dtype not in _NUMPY_TO_CTYPES_MAPPING.keys():
+    if dtype not in _NUMPY_TO_CTYPES_MAPPING:
         raise TypeError(
             f"Supported types: {_NUMPY_TO_CTYPES_MAPPING.keys()}, got: {dtype}"
         )
@@ -266,7 +266,7 @@ def ctypes2cupy(cptr, length, dtype):
     from cupy.cuda.memory import UnownedMemory

     CUPY_TO_CTYPES_MAPPING = {cupy.float32: ctypes.c_float, cupy.uint32: ctypes.c_uint}
-    if dtype not in CUPY_TO_CTYPES_MAPPING.keys():
+    if dtype not in CUPY_TO_CTYPES_MAPPING:
         raise RuntimeError(f"Supported types: {CUPY_TO_CTYPES_MAPPING.keys()}")
     addr = ctypes.cast(cptr, ctypes.c_void_p).value
     # pylint: disable=c-extension-no-member,no-member
@@ -386,7 +386,7 @@ class DataIter:  # pylint: disable=too-many-instance-attributes
         raise exc  # pylint: disable=raising-bad-type

     def __del__(self) -> None:
-        assert self._temporary_data is None, self._temporary_data
+        assert self._temporary_data is None
         assert self._exception is None

     def _reset_wrapper(self, this: None) -> None:  # pylint: disable=unused-argument
@@ -410,19 +410,19 @@ class DataIter:  # pylint: disable=too-many-instance-attributes
         feature_names: Optional[List[str]] = None,
         feature_types: Optional[List[str]] = None,
         **kwargs: Any,
-    ):
+    ) -> None:
         from .data import dispatch_proxy_set_data
         from .data import _proxy_transform

-        transformed, feature_names, feature_types = _proxy_transform(
+        new, cat_codes, feature_names, feature_types = _proxy_transform(
             data,
             feature_names,
             feature_types,
             self._enable_categorical,
         )
         # Stage the data, meta info are copied inside C++ MetaInfo.
-        self._temporary_data = transformed
-        dispatch_proxy_set_data(self.proxy, transformed, self._allow_host)
+        self._temporary_data = (new, cat_codes)
+        dispatch_proxy_set_data(self.proxy, new, cat_codes, self._allow_host)
         self.proxy.set_info(
             feature_names=feature_names,
             feature_types=feature_types,
@@ -1103,7 +1103,7 @@ class _ProxyDMatrix(DMatrix):
         self.handle = ctypes.c_void_p()
         _check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))

-    def _set_data_from_cuda_interface(self, data):
+    def _set_data_from_cuda_interface(self, data) -> None:
         """Set data from CUDA array interface."""
         interface = data.__cuda_array_interface__
         interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
@@ -1111,11 +1111,11 @@ class _ProxyDMatrix(DMatrix):
             _LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str)
         )

-    def _set_data_from_cuda_columnar(self, data):
+    def _set_data_from_cuda_columnar(self, data, cat_codes: list) -> None:
         """Set data from CUDA columnar format."""
         from .data import _cudf_array_interfaces

-        _, interfaces_str = _cudf_array_interfaces(data)
+        interfaces_str = _cudf_array_interfaces(data, cat_codes)
         _check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str))

     def _set_data_from_array(self, data: np.ndarray):
@@ -1805,7 +1805,7 @@ class Booster(object):
         .. note::

             See `Prediction
-            <https://xgboost.readthedocs.io/en/latest/tutorials/prediction.html>`_
+            <https://xgboost.readthedocs.io/en/latest/prediction.html>`_
             for issues like thread safety and a summary of outputs from this function.

         Parameters
@@ -1986,13 +1986,6 @@ class Booster(object):
         preds = ctypes.POINTER(ctypes.c_float)()

         # once caching is supported, we can pass id(data) as cache id.
-        try:
-            import pandas as pd
-
-            if isinstance(data, pd.DataFrame):
-                data = data.values
-        except ImportError:
-            pass
         args = {
             "type": 0,
             "training": False,
@@ -2027,7 +2020,20 @@ class Booster(object):
                     f"got {data.shape[1]}"
                 )

+        from .data import _is_pandas_df, _transform_pandas_df
         from .data import _array_interface
+        if (
+            _is_pandas_df(data)
+            or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
+        ):
+            ft = self.feature_types
+            if ft is None:
+                enable_categorical = False
+            else:
+                enable_categorical = any(f == "c" for f in ft)
+        if _is_pandas_df(data):
+            data, _, _ = _transform_pandas_df(data, enable_categorical)

         if isinstance(data, np.ndarray):
             from .data import _ensure_np_dtype
             data, _ = _ensure_np_dtype(data, data.dtype)
@@ -2080,9 +2086,11 @@ class Booster(object):
             )
             return _prediction_output(shape, dims, preds, True)
         if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
-            from .data import _cudf_array_interfaces
-            _, interfaces_str = _cudf_array_interfaces(data)
+            from .data import _cudf_array_interfaces, _transform_cudf_df
+            data, cat_codes, _, _ = _transform_cudf_df(
+                data, None, None, enable_categorical
+            )
+            interfaces_str = _cudf_array_interfaces(data, cat_codes)
             _check_call(
                 _LIB.XGBoosterPredictFromCudaColumnar(
                     self.handle,
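The inplace-predict hunk derives `enable_categorical` from the booster's stored feature types, where the flag `"c"` marks a categorical column. A standalone sketch of that derivation; the helper name and sample type lists are invented for illustration:

    from typing import List, Optional

    def derive_enable_categorical(feature_types: Optional[List[str]]) -> bool:
        # No stored feature types means no categorical columns.
        if feature_types is None:
            return False
        # "c" is the categorical marker, mirroring `any(f == "c" for f in ft)`.
        return any(f == "c" for f in feature_types)

    print(derive_enable_categorical(None))             # False
    print(derive_enable_categorical(["q", "c", "q"]))  # True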
@@ -1606,8 +1606,9 @@ class DaskScikitLearnBase(XGBModel):
         should use `worker_client' instead of default client.

         """
-        asynchronous = getattr(self, "_asynchronous", False)
         if self._client is None:
+            asynchronous = getattr(self, "_asynchronous", False)
             try:
                 distributed.get_worker()
                 in_worker = True
@@ -1620,7 +1621,7 @@ class DaskScikitLearnBase(XGBModel):
                     return ret
             return ret

-        return self.client.sync(func, **kwargs, asynchronous=asynchronous)
+        return self.client.sync(func, **kwargs, asynchronous=self.client.asynchronous)


 @xgboost_model_doc(
@@ -1833,7 +1834,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
         vstack = update_wrapper(
             partial(da.vstack, allow_unknown_chunksizes=True), da.vstack
         )
-        return _cls_predict_proba(getattr(self, "n_classes_", None), predts, vstack)
+        return _cls_predict_proba(getattr(self, "n_classes_", 0), predts, vstack)

     # pylint: disable=missing-function-docstring
     def predict_proba(
@@ -1,4 +1,4 @@
-# pylint: disable=too-many-arguments, too-many-branches
+# pylint: disable=too-many-arguments, too-many-branches, too-many-lines
 # pylint: disable=too-many-return-statements, import-error
 '''Data dispatching for DMatrix.'''
 import ctypes
@@ -12,7 +12,7 @@ import numpy as np
 from .core import c_array, _LIB, _check_call, c_str
 from .core import _cuda_array_interface
 from .core import DataIter, _ProxyDMatrix, DMatrix
-from .compat import lazy_isinstance
+from .compat import lazy_isinstance, DataFrame

 c_bst_ulong = ctypes.c_uint64  # pylint: disable=invalid-name

@@ -217,36 +217,48 @@ _pandas_dtype_mapper = {
 }


-def _transform_pandas_df(
-    data,
-    enable_categorical: bool,
-    feature_names: Optional[List[str]] = None,
-    feature_types: Optional[List[str]] = None,
-    meta=None,
-    meta_type=None,
-):
-    import pandas as pd
-    from pandas.api.types import is_sparse, is_categorical_dtype
-
-    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
-               (is_categorical_dtype(dtype) and enable_categorical)
-               for dtype in data.dtypes):
-        bad_fields = [
-            str(data.columns[i]) for i, dtype in enumerate(data.dtypes)
-            if dtype.name not in _pandas_dtype_mapper
-        ]
-        msg = """DataFrame.dtypes for data must be int, float, bool or category. When
-        categorical type is supplied, DMatrix parameter `enable_categorical` must
-        be set to `True`."""
-        raise ValueError(msg + ', '.join(bad_fields))
+def _invalid_dataframe_dtype(data) -> None:
+    # pandas series has `dtypes` but it's just a single object
+    # cudf series doesn't have `dtypes`.
+    if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
+        bad_fields = [
+            str(data.columns[i])
+            for i, dtype in enumerate(data.dtypes)
+            if dtype.name not in _pandas_dtype_mapper
+        ]
+        err = " Invalid columns:" + ", ".join(bad_fields)
+    else:
+        err = ""
+
+    msg = """DataFrame.dtypes for data must be int, float, bool or category. When
+    categorical type is supplied, DMatrix parameter `enable_categorical` must
+    be set to `True`.""" + err
+    raise ValueError(msg)
+
+
+def _transform_pandas_df(
+    data: DataFrame,
+    enable_categorical: bool,
+    feature_names: Optional[List[str]] = None,
+    feature_types: Optional[List[str]] = None,
+    meta: Optional[str] = None,
+    meta_type: Optional[str] = None,
+) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
+    import pandas as pd
+    from pandas.api.types import is_sparse, is_categorical_dtype
+
+    if not all(
+        dtype.name in _pandas_dtype_mapper
+        or is_sparse(dtype)
+        or (is_categorical_dtype(dtype) and enable_categorical)
+        for dtype in data.dtypes
+    ):
+        _invalid_dataframe_dtype(data)

     # handle feature names
     if feature_names is None and meta is None:
         if isinstance(data.columns, pd.MultiIndex):
-            feature_names = [
-                ' '.join([str(x) for x in i]) for i in data.columns
-            ]
+            feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
         elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
             feature_names = list(map(str, data.columns))
         else:
@@ -263,21 +275,24 @@ def _transform_pandas_df(
         else:
             feature_types.append(_pandas_dtype_mapper[dtype.name])

-    # handle categorical codes.
+    # handle category codes.
     transformed = pd.DataFrame()
     if enable_categorical:
         for i, dtype in enumerate(data.dtypes):
             if is_categorical_dtype(dtype):
-                transformed[data.columns[i]] = data[data.columns[i]].cat.codes
+                # pandas uses -1 as default missing value for categorical data
+                transformed[data.columns[i]] = (
+                    data[data.columns[i]]
+                    .cat.codes.astype(np.float32)
+                    .replace(-1.0, np.NaN)
+                )
             else:
                 transformed[data.columns[i]] = data[data.columns[i]]
     else:
         transformed = data

     if meta and len(data.columns) > 1:
-        raise ValueError(
-            f"DataFrame for {meta} cannot have multiple columns"
-        )
+        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

     dtype = meta_type if meta_type else np.float32
     arr = transformed.values
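The rewritten categorical branch exists because pandas encodes a missing category as code -1; casting the codes to float32 and replacing -1.0 with NaN lets the usual missing-value handling apply downstream. A small demonstration with plain pandas:

    import numpy as np
    import pandas as pd

    s = pd.Series(["a", "b", None, "a"], dtype="category")
    print(s.cat.codes.tolist())  # [0, 1, -1, 0]; -1 marks the missing entry

    # The transformation from the hunk: float codes with NaN for missing.
    codes = s.cat.codes.astype(np.float32).replace(-1.0, np.nan)
    print(codes.tolist())  # [0.0, 1.0, nan, 0.0]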
@@ -287,7 +302,7 @@ def _transform_pandas_df(


 def _from_pandas_df(
-    data,
+    data: DataFrame,
     enable_categorical: bool,
     missing,
     nthread,
@@ -300,6 +315,7 @@ def _from_pandas_df(
                                feature_types)

+
 def _is_pandas_series(data):
     try:
         import pandas as pd
@@ -318,13 +334,26 @@ def _is_modin_series(data):

 def _from_pandas_series(
     data,
-    missing,
-    nthread,
+    missing: float,
+    nthread: int,
+    enable_categorical: bool,
     feature_names: Optional[List[str]],
     feature_types: Optional[List[str]],
 ):
+    from pandas.api.types import is_categorical_dtype
+
+    if (data.dtype.name not in _pandas_dtype_mapper) and not (
+        is_categorical_dtype(data.dtype) and enable_categorical
+    ):
+        _invalid_dataframe_dtype(data)
+    if enable_categorical and is_categorical_dtype(data.dtype):
+        data = data.cat.codes
     return _from_numpy_array(
-        data.values.astype("float"), missing, nthread, feature_names, feature_types
+        data.values.reshape(data.shape[0], 1).astype("float"),
+        missing,
+        nthread,
+        feature_names,
+        feature_types,
     )
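Two details of the new Series path are easy to miss: a categorical Series is reduced to its integer codes first, and the values are reshaped into an explicit single-column matrix before the numpy route. A sketch of both steps, independent of XGBoost:

    import pandas as pd

    s = pd.Series(["low", "high", "low"], dtype="category")
    codes = s.cat.codes  # categorical Series -> integer codes
    col = codes.values.reshape(s.shape[0], 1).astype("float")
    print(col.shape)  # (3, 1): one feature column, as _from_numpy_array expects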
@@ -428,7 +457,7 @@ def _is_cudf_df(data):
     return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)


-def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
+def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
     """Extract CuDF __cuda_array_interface__. This is special as it returns a new list of
     data and a list of array interfaces. The data is list of categorical codes that
     caller can safely ignore, but have to keep their reference alive until usage of array
@@ -440,23 +469,27 @@ def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
     except ImportError:
         from cudf.utils.dtypes import is_categorical_dtype

-    cat_codes = []
     interfaces = []
     if _is_cudf_ser(data):
-        interfaces.append(data.__cuda_array_interface__)
+        if is_categorical_dtype(data.dtype):
+            interface = cat_codes[0].__cuda_array_interface__
+        else:
+            interface = data.__cuda_array_interface__
+        if "mask" in interface:
+            interface["mask"] = interface["mask"].__cuda_array_interface__
+        interfaces.append(interface)
     else:
-        for col in data:
+        for i, col in enumerate(data):
             if is_categorical_dtype(data[col].dtype):
-                codes = data[col].cat.codes
+                codes = cat_codes[i]
                 interface = codes.__cuda_array_interface__
-                cat_codes.append(codes)
             else:
                 interface = data[col].__cuda_array_interface__
             if "mask" in interface:
                 interface["mask"] = interface["mask"].__cuda_array_interface__
             interfaces.append(interface)
     interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8")
-    return cat_codes, interfaces_str
+    return interfaces_str


 def _transform_cudf_df(
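With the new signature the categorical codes are allocated once in `_transform_cudf_df` and passed in, so every interface built here points at device memory the caller still holds. The zero-copy pattern is the same one numpy exposes on the CPU; a sketch using `__array_interface__` as a stand-in, since cuDF is not needed to show the idea:

    import json
    import numpy as np

    codes = np.array([0, 1, 0, 2], dtype=np.int8)  # stands in for one column of codes
    interface = codes.__array_interface__
    # "data" holds a raw pointer that is only valid while `codes` stays alive,
    # which is why the diff keeps cat_codes referenced by the caller.
    print(json.dumps(interface, indent=2))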
@@ -470,25 +503,57 @@ def _transform_cudf_df(
     except ImportError:
         from cudf.utils.dtypes import is_categorical_dtype

+    if _is_cudf_ser(data):
+        dtypes = [data.dtype]
+    else:
+        dtypes = data.dtypes
+
+    if not all(
+        dtype.name in _pandas_dtype_mapper
+        or (is_categorical_dtype(dtype) and enable_categorical)
+        for dtype in dtypes
+    ):
+        _invalid_dataframe_dtype(data)
+
+    # handle feature names
     if feature_names is None:
         if _is_cudf_ser(data):
             feature_names = [data.name]
         elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
             feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
+        elif (
+            lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
+            or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
+            # Unique to cuDF, no equivalence in pandas 1.3.3
+            or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")
+        ):
+            feature_names = list(map(str, data.columns))
         else:
             feature_names = data.columns.format()

+    # handle feature types
     if feature_types is None:
         feature_types = []
-        if _is_cudf_ser(data):
-            dtypes = [data.dtype]
-        else:
-            dtypes = data.dtypes
         for dtype in dtypes:
             if is_categorical_dtype(dtype) and enable_categorical:
                 feature_types.append(CAT_T)
             else:
                 feature_types.append(_pandas_dtype_mapper[dtype.name])
-    return data, feature_names, feature_types
+
+    # handle categorical data
+    cat_codes = []
+    if _is_cudf_ser(data):
+        # unlike pandas, cuDF uses NA for missing data.
+        if is_categorical_dtype(data.dtype) and enable_categorical:
+            codes = data.cat.codes
+            cat_codes.append(codes)
+    else:
+        for col in data:
+            if is_categorical_dtype(data[col].dtype) and enable_categorical:
+                codes = data[col].cat.codes
+                cat_codes.append(codes)
+
+    return data, cat_codes, feature_names, feature_types


 def _from_cudf_df(
@@ -499,10 +564,10 @@ def _from_cudf_df(
     feature_types: Optional[List[str]],
     enable_categorical: bool,
 ) -> Tuple[ctypes.c_void_p, Any, Any]:
-    data, feature_names, feature_types = _transform_cudf_df(
+    data, cat_codes, feature_names, feature_types = _transform_cudf_df(
         data, feature_names, feature_types, enable_categorical
     )
-    _, interfaces_str = _cudf_array_interfaces(data)
+    interfaces_str = _cudf_array_interfaces(data, cat_codes)
     handle = ctypes.c_void_p()
     config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
     _check_call(
@@ -707,8 +772,9 @@ def dispatch_data_backend(
         return _from_pandas_df(data, enable_categorical, missing, threads,
                                feature_names, feature_types)
     if _is_pandas_series(data):
-        return _from_pandas_series(data, missing, threads, feature_names,
-                                   feature_types)
+        return _from_pandas_series(
+            data, missing, threads, enable_categorical, feature_names, feature_types
+        )
     if _is_cudf_df(data) or _is_cudf_ser(data):
         return _from_cudf_df(
             data, missing, threads, feature_names, feature_types, enable_categorical
@@ -732,8 +798,9 @@ def dispatch_data_backend(
         return _from_pandas_df(data, enable_categorical, missing, threads,
                                feature_names, feature_types)
     if _is_modin_series(data):
-        return _from_pandas_series(data, missing, threads, feature_names,
-                                   feature_types)
+        return _from_pandas_series(
+            data, missing, threads, enable_categorical, feature_names, feature_types
+        )
     if _has_array_protocol(data):
         array = np.asarray(data)
         return _from_numpy_array(array, missing, threads, feature_names, feature_types)
@@ -747,7 +814,7 @@ def dispatch_data_backend(

 def _to_data_type(dtype: str, name: str):
     dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4}
-    if dtype not in dtype_map.keys():
+    if dtype not in dtype_map:
         raise TypeError(
             f'Expecting float32, float64, uint32, uint64, got {dtype} ' +
             f'for {name}.')
@@ -866,8 +933,7 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
         _meta_from_dt(data, name, dtype, handle)
         return
     if _is_modin_df(data):
-        data, _, _ = _transform_pandas_df(
-            data, False, meta=name, meta_type=dtype)
+        data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
         _meta_from_numpy(data, name, dtype, handle)
         return
     if _is_modin_series(data):
@@ -917,30 +983,38 @@ def _proxy_transform(
         )
     if _is_cupy_array(data):
         data = _transform_cupy_array(data)
-        return data, feature_names, feature_types
+        return data, None, feature_names, feature_types
     if _is_dlpack(data):
-        return _transform_dlpack(data), feature_names, feature_types
+        return _transform_dlpack(data), None, feature_names, feature_types
     if _is_numpy_array(data):
-        return data, feature_names, feature_types
+        return data, None, feature_names, feature_types
     if _is_scipy_csr(data):
-        return data, feature_names, feature_types
+        return data, None, feature_names, feature_types
     if _is_pandas_df(data):
         arr, feature_names, feature_types = _transform_pandas_df(
             data, enable_categorical, feature_names, feature_types
         )
-        return arr, feature_names, feature_types
+        return arr, None, feature_names, feature_types
     raise TypeError("Value type is not supported for data iterator:" + str(type(data)))


-def dispatch_proxy_set_data(proxy: _ProxyDMatrix, data: Any, allow_host: bool) -> None:
+def dispatch_proxy_set_data(
+    proxy: _ProxyDMatrix,
+    data: Any,
+    cat_codes: Optional[list],
+    allow_host: bool,
+) -> None:
     """Dispatch for DeviceQuantileDMatrix."""
     if not _is_cudf_ser(data) and not _is_pandas_series(data):
         _check_data_shape(data)

     if _is_cudf_df(data):
-        proxy._set_data_from_cuda_columnar(data)  # pylint: disable=W0212
+        # pylint: disable=W0212
+        proxy._set_data_from_cuda_columnar(data, cat_codes)
         return
     if _is_cudf_ser(data):
-        proxy._set_data_from_cuda_columnar(data)  # pylint: disable=W0212
+        # pylint: disable=W0212
+        proxy._set_data_from_cuda_columnar(data, cat_codes)
         return
     if _is_cupy_array(data):
         proxy._set_data_from_cuda_interface(data)  # pylint: disable=W0212
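Every `_proxy_transform` branch now returns a four-tuple, with the `cat_codes` slot left as `None` for anything that is not a cuDF input, and `dispatch_proxy_set_data` simply forwards that slot to the columnar setter. A toy sketch of the contract (names invented, cuDF branch omitted):

    from typing import Any, List, Optional, Tuple

    def proxy_transform(data: Any) -> Tuple[Any, Optional[List], list, list]:
        # Non-cuDF inputs carry no categorical codes, hence the None slot.
        return data, None, ["f0"], ["float"]

    data, cat_codes, names, types = proxy_transform([[1.0], [2.0]])
    assert cat_codes is None  # only the cuDF branch fills this in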
@@ -174,7 +174,9 @@ __model_doc = f'''
         Device ordinal.
     validate_parameters : Optional[bool]
         Give warnings for unknown parameter.
+    predictor : Optional[str]
+        Force XGBoost to use specific predictor, available choices are [cpu_predictor,
+        gpu_predictor].
     enable_categorical : bool

         .. versionadded:: 1.5.0
@@ -807,7 +809,11 @@ class XGBModel(XGBModelBase):
         # Inplace predict doesn't handle as many data types as DMatrix, but it's
         # sufficient for dask interface where input is simpiler.
         predictor = self.get_params().get("predictor", None)
-        if predictor in ("auto", None) and self.booster != "gblinear":
+        if (
+            not self.enable_categorical
+            and predictor in ("auto", None)
+            and self.booster != "gblinear"
+        ):
             return True
         return False

@@ -834,7 +840,9 @@ class XGBModel(XGBModelBase):
         iteration_range: Optional[Tuple[int, int]] = None,
     ) -> np.ndarray:
         """Predict with `X`. If the model is trained with early stopping, then `best_iteration`
-        is used automatically.
+        is used automatically. For tree models, when data is on GPU, like cupy array or
+        cuDF dataframe and `predictor` is not specified, the prediction is run on GPU
+        automatically, otherwise it will run on CPU.

         .. note:: This function is only thread safe for `gbtree` and `dart`.

@@ -862,6 +870,7 @@ class XGBModel(XGBModelBase):
         Returns
         -------
         prediction
+
         """
         iteration_range = _convert_ntree_limit(
             self.get_booster(), ntree_limit, iteration_range
@@ -886,7 +895,10 @@ class XGBModel(XGBModelBase):
             pass

         test = DMatrix(
-            X, base_margin=base_margin, missing=self.missing, nthread=self.n_jobs
+            X, base_margin=base_margin,
+            missing=self.missing,
+            nthread=self.n_jobs,
+            enable_categorical=self.enable_categorical
         )
         return self.get_booster().predict(
             data=test,
@@ -1342,9 +1354,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
             iteration_range=iteration_range
         )
         # If model is loaded from a raw booster there's no `n_classes_`
-        return _cls_predict_proba(
-            getattr(self, "n_classes_", None), class_probs, np.vstack
-        )
+        return _cls_predict_proba(getattr(self, "n_classes_", 0), class_probs, np.vstack)

     def evals_result(self) -> TrainingCallback.EvalsLog:
         """Return the evaluation results.
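The inplace-predict gate now also refuses categorical input, because only the fallback `DMatrix` route (built with `enable_categorical=self.enable_categorical` later in the same hunk) understands category dtypes. The decision condensed into a free function, with invented sample calls:

    def can_use_inplace_predict(enable_categorical: bool, predictor, booster: str) -> bool:
        # Mirrors the hunk: inplace predict only without categorical features,
        # with the predictor left to "auto"/None, and not for gblinear.
        return (
            not enable_categorical
            and predictor in ("auto", None)
            and booster != "gblinear"
        )

    print(can_use_inplace_predict(False, None, "gbtree"))  # True
    print(can_use_inplace_predict(True, None, "gbtree"))   # False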
@@ -144,7 +144,7 @@ class RabitTracker(object):
     """

     def __init__(
-        self, hostIP, nslave, port=9091, port_end=9999, use_logger: bool = True
+        self, hostIP, nslave, port=9091, port_end=9999, use_logger: bool = False
     ) -> None:
         """A Python implementation of RABIT tracker.

@@ -384,16 +384,17 @@ def start_rabit_tracker(args):
     ----------
     args: arguments to start the rabit tracker.
     """
-    envs = {'DMLC_NUM_WORKER': args.num_workers,
-            'DMLC_NUM_SERVER': args.num_servers}
-    rabit = RabitTracker(hostIP=get_host_ip(args.host_ip), nslave=args.num_workers)
+    envs = {"DMLC_NUM_WORKER": args.num_workers, "DMLC_NUM_SERVER": args.num_servers}
+    rabit = RabitTracker(
+        hostIP=get_host_ip(args.host_ip), nslave=args.num_workers, use_logger=True
+    )
     envs.update(rabit.slave_envs())
     rabit.start(args.num_workers)
-    sys.stdout.write('DMLC_TRACKER_ENV_START\n')
+    sys.stdout.write("DMLC_TRACKER_ENV_START\n")
     # simply write configuration to stdout
     for k, v in envs.items():
         sys.stdout.write(f"{k}={v}\n")
-    sys.stdout.write('DMLC_TRACKER_ENV_END\n')
+    sys.stdout.write("DMLC_TRACKER_ENV_END\n")
     sys.stdout.flush()
     rabit.join()
@@ -472,13 +472,15 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
     if is_new_callback:
         assert all(isinstance(c, callback.TrainingCallback)
                    for c in callbacks), "You can't mix new and old callback styles."
-        if isinstance(verbose_eval, bool) and verbose_eval:
+        if verbose_eval:
             verbose_eval = 1 if verbose_eval is True else verbose_eval
-            callbacks.append(callback.EvaluationMonitor(period=verbose_eval,
-                                                        show_stdv=show_stdv))
+            callbacks.append(
+                callback.EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv)
+            )
         if early_stopping_rounds:
-            callbacks.append(callback.EarlyStopping(
-                rounds=early_stopping_rounds, maximize=maximize))
+            callbacks.append(
+                callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
+            )
         callbacks = callback.CallbackContainer(callbacks, metric=feval, is_cv=True)
     else:
         callbacks = _configure_deprecated_callbacks(
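Dropping the `isinstance(verbose_eval, bool)` guard means an integer now also reaches the `EvaluationMonitor` as a logging period, while `True` is still normalized to a period of 1. A sketch of the normalization with a stand-in monitor class:

    class EvaluationMonitor:  # stand-in for xgboost.callback.EvaluationMonitor
        def __init__(self, period: int, show_stdv: bool) -> None:
            self.period = period
            self.show_stdv = show_stdv

    def make_monitor(verbose_eval, show_stdv: bool):
        if verbose_eval:  # truthy: True or a positive integer period
            period = 1 if verbose_eval is True else verbose_eval
            return EvaluationMonitor(period=period, show_stdv=show_stdv)
        return None

    print(make_monitor(True, False).period)  # 1
    print(make_monitor(5, False).period)     # 5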
@@ -1159,9 +1159,17 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *json_config,
     custom_feature_names = get<Array const>(config["feature_names"]);
   }

+  std::vector<int32_t> tree_idx;
+  if (!IsA<Null>(config["tree_idx"])) {
+    auto j_tree_idx = get<Array const>(config["tree_idx"]);
+    for (auto const &idx : j_tree_idx) {
+      tree_idx.push_back(get<Integer const>(idx));
+    }
+  }
+
   auto &scores = learner->GetThreadLocal().ret_vec_float;
   std::vector<bst_feature_t> features;
-  learner->CalcFeatureScore(importance, &features, &scores);
+  learner->CalcFeatureScore(importance, common::Span<int32_t const>(tree_idx), &features, &scores);

   auto n_features = learner->GetNumFeature();
   GenerateFeatureMap(learner, custom_feature_names, n_features, &feature_map);
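On the caller side, the `config` JSON handed to `XGBoosterFeatureScore` gains an optional `tree_idx` array that restricts scoring to a subset of trees. A hypothetical sketch of such a config document; the `feature_names` and `tree_idx` keys come straight from the hunk, while the `importance_type` key and how a binding passes the string to the C API are assumptions here:

    import json

    config = {
        "importance_type": "gain",      # assumed key, not shown in this hunk
        "feature_names": ["f0", "f1"],  # read via config["feature_names"] above
        "tree_idx": [0, 2, 5],          # new: score only these trees
    }
    config_str = bytes(json.dumps(config), "utf-8")  # C API expects UTF-8 JSON
    print(config_str)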
@@ -58,14 +58,15 @@ __forceinline__ __device__ BitFieldAtomicType AtomicAnd(BitFieldAtomicType* addr
 template <typename VT, typename Direction, bool IsConst = false>
 struct BitFieldContainer {
   using value_type = std::conditional_t<IsConst, VT const, VT>;  // NOLINT
+  using index_type = size_t;  // NOLINT
   using pointer = value_type*;  // NOLINT

-  static value_type constexpr kValueSize = sizeof(value_type) * 8;
-  static value_type constexpr kOne = 1;  // force correct type.
+  static index_type constexpr kValueSize = sizeof(value_type) * 8;
+  static index_type constexpr kOne = 1;  // force correct type.

   struct Pos {
-    std::remove_const_t<value_type> int_pos {0};
-    std::remove_const_t<value_type> bit_pos {0};
+    index_type int_pos{0};
+    index_type bit_pos{0};
   };

  private:
@@ -73,7 +74,7 @@ struct BitFieldContainer {
   static_assert(!std::is_signed<VT>::value, "Must use unsiged type as underlying storage.");

  public:
-  XGBOOST_DEVICE static Pos ToBitPos(value_type pos) {
+  XGBOOST_DEVICE static Pos ToBitPos(index_type pos) {
     Pos pos_v;
     if (pos == 0) {
       return pos_v;
@@ -96,7 +97,7 @@ struct BitFieldContainer {
   /*\brief Compute the size of needed memory allocation. The returned value is in terms
    * of number of elements with `BitFieldContainer::value_type'.
    */
-  XGBOOST_DEVICE static size_t ComputeStorageSize(size_t size) {
+  XGBOOST_DEVICE static size_t ComputeStorageSize(index_type size) {
     return common::DivRoundUp(size, kValueSize);
   }
 #if defined(__CUDA_ARCH__)
@@ -138,14 +139,14 @@ struct BitFieldContainer {
 #endif  // defined(__CUDA_ARCH__)

 #if defined(__CUDA_ARCH__)
-  __device__ auto Set(value_type pos) {
+  __device__ auto Set(index_type pos) {
     Pos pos_v = Direction::Shift(ToBitPos(pos));
     value_type& value = bits_[pos_v.int_pos];
     value_type set_bit = kOne << pos_v.bit_pos;
     using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
     atomicOr(reinterpret_cast<Type *>(&value), set_bit);
   }
-  __device__ void Clear(value_type pos) {
+  __device__ void Clear(index_type pos) {
     Pos pos_v = Direction::Shift(ToBitPos(pos));
     value_type& value = bits_[pos_v.int_pos];
     value_type clear_bit = ~(kOne << pos_v.bit_pos);
@@ -153,13 +154,13 @@ struct BitFieldContainer {
     atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
   }
 #else
-  void Set(value_type pos) {
+  void Set(index_type pos) {
     Pos pos_v = Direction::Shift(ToBitPos(pos));
     value_type& value = bits_[pos_v.int_pos];
     value_type set_bit = kOne << pos_v.bit_pos;
     value |= set_bit;
   }
-  void Clear(value_type pos) {
+  void Clear(index_type pos) {
     Pos pos_v = Direction::Shift(ToBitPos(pos));
     value_type& value = bits_[pos_v.int_pos];
     value_type clear_bit = ~(kOne << pos_v.bit_pos);
@@ -175,7 +176,7 @@ struct BitFieldContainer {
     value_type result = test_bit & value;
     return static_cast<bool>(result);
   }
-  XGBOOST_DEVICE bool Check(value_type pos) const {
+  XGBOOST_DEVICE bool Check(index_type pos) const {
     Pos pos_v = ToBitPos(pos);
     return Check(pos_v);
   }
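The new `index_type` (always `size_t`) decouples bit positions from the storage word type, so a bitfield over 32-bit words can be indexed beyond what the word type itself could count. The position arithmetic is unchanged; the same math in a short Python sketch over 32-bit words:

    WORD_BITS = 32  # kValueSize for a uint32 storage word

    def to_bit_pos(pos: int):
        # Mirrors ToBitPos: which word, and which bit inside that word.
        return pos // WORD_BITS, pos % WORD_BITS

    def set_bit(words: list, pos: int) -> None:
        int_pos, bit_pos = to_bit_pos(pos)
        words[int_pos] |= 1 << bit_pos

    def check_bit(words: list, pos: int) -> bool:
        int_pos, bit_pos = to_bit_pos(pos)
        return bool(words[int_pos] & (1 << bit_pos))

    words = [0, 0]
    set_bit(words, 33)
    print(check_bit(words, 33), check_bit(words, 1))  # True False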
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2020 by XGBoost Contributors
+ * Copyright 2020-2021 by XGBoost Contributors
  * \file categorical.h
  */
 #ifndef XGBOOST_COMMON_CATEGORICAL_H_
@@ -42,6 +42,11 @@ inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, bst_cat_t
   return !s_cats.Check(cat);
 }

+inline void CheckCat(bst_cat_t cat) {
+  CHECK_GE(cat, 0) << "Invalid categorical value detected. Categorical value "
+                      "should be non-negative.";
+}
+
 struct IsCatOp {
   XGBOOST_DEVICE bool operator()(FeatureType ft) {
     return ft == FeatureType::kCategorical;
@@ -711,6 +711,12 @@ constexpr std::pair<int, int> CUDAVersion() {
 constexpr std::pair<int32_t, int32_t> ThrustVersion() {
   return std::make_pair(THRUST_MAJOR_VERSION, THRUST_MINOR_VERSION);
 }

+// Whether do we have thrust 1.x with x >= minor
+template <int32_t minor>
+constexpr bool HasThrustMinorVer() {
+  return (ThrustVersion().first == 1 && ThrustVersion().second >= minor) ||
+         ThrustVersion().first > 1;
+}
+
 namespace detail {
 template <typename T>
@@ -725,10 +731,8 @@ class TypedDiscard : public thrust::discard_iterator<T> {

 template <typename T>
 using TypedDiscard =
-    std::conditional_t<((ThrustVersion().first == 1 &&
-                         ThrustVersion().second >= 12) ||
-                        ThrustVersion().first > 1),
-                       detail::TypedDiscardCTK114<T>, detail::TypedDiscard<T>>;
+    std::conditional_t<HasThrustMinorVer<12>(), detail::TypedDiscardCTK114<T>,
+                       detail::TypedDiscard<T>>;

 /**
  * \class AllReducer
@ -1442,24 +1446,39 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_i
|
|||||||
namespace detail {
|
namespace detail {
|
||||||
// Wrapper around cub sort for easier `descending` sort.
|
// Wrapper around cub sort for easier `descending` sort.
|
||||||
template <bool descending, typename KeyT, typename ValueT,
|
template <bool descending, typename KeyT, typename ValueT,
|
||||||
typename OffsetIteratorT>
|
typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
||||||
void DeviceSegmentedRadixSortPair(
|
void DeviceSegmentedRadixSortPair(
|
||||||
void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT
|
void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT
|
||||||
KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out,
|
KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out,
|
||||||
size_t num_items, size_t num_segments, OffsetIteratorT d_begin_offsets,
|
size_t num_items, size_t num_segments, BeginOffsetIteratorT d_begin_offsets,
|
||||||
OffsetIteratorT d_end_offsets, int begin_bit = 0,
|
EndOffsetIteratorT d_end_offsets, int begin_bit = 0,
|
||||||
int end_bit = sizeof(KeyT) * 8) {
|
int end_bit = sizeof(KeyT) * 8) {
|
||||||
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
|
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
|
||||||
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
|
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
|
||||||
d_values_out);
|
d_values_out);
|
||||||
using OffsetT = int32_t; // num items in dispatch is also int32_t, no way to change.
|
// In old version of cub, num_items in dispatch is also int32_t, no way to change.
|
||||||
CHECK_LE(num_items, std::numeric_limits<int32_t>::max());
|
using OffsetT =
|
||||||
|
std::conditional_t<BuildWithCUDACub() && HasThrustMinorVer<13>(), size_t,
|
||||||
|
int32_t>;
|
||||||
|
CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
|
||||||
|
// For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation
|
||||||
|
|
||||||
|
#if (THRUST_MAJOR_VERSION == 1 && THRUST_MINOR_VERSION >= 13) || THRUST_MAJOR_VERSION > 1
|
||||||
safe_cuda((cub::DispatchSegmentedRadixSort<
|
safe_cuda((cub::DispatchSegmentedRadixSort<
|
||||||
descending, KeyT, ValueT, OffsetIteratorT,
|
descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
|
||||||
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
|
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
|
||||||
d_values, num_items, num_segments,
|
d_values, num_items, num_segments,
|
||||||
d_begin_offsets, d_end_offsets, begin_bit,
|
d_begin_offsets, d_end_offsets, begin_bit,
|
||||||
end_bit, false, nullptr, false)));
|
end_bit, false, nullptr, false)));
|
||||||
|
#else
|
||||||
|
safe_cuda((cub::DispatchSegmentedRadixSort<
|
||||||
|
descending, KeyT, ValueT, BeginOffsetIteratorT,
|
||||||
|
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
|
||||||
|
d_values, num_items, num_segments,
|
||||||
|
d_begin_offsets, d_end_offsets, begin_bit,
|
||||||
|
end_bit, false, nullptr, false)));
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
} // namespace detail
|
} // namespace detail
|
||||||
|
|
||||||
|
|||||||
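The new `HasThrustMinorVer` helper centralizes the version arithmetic that was previously inlined at each use site (see the `TypedDiscard` hunk above). A self-contained sketch of the same compile-time gating pattern, with hypothetical `LIB_MAJOR`/`LIB_MINOR` macros standing in for Thrust's version macros:

```cpp
#include <cstdint>
#include <type_traits>
#include <utility>

#define LIB_MAJOR 1   // hypothetical stand-ins for THRUST_MAJOR_VERSION
#define LIB_MINOR 13  // and THRUST_MINOR_VERSION

constexpr std::pair<int32_t, int32_t> Version() {
  return std::make_pair(LIB_MAJOR, LIB_MINOR);
}

template <int32_t minor>
constexpr bool HasMinorVer() {
  // True for 1.x with x >= minor, or any major version above 1.
  return (Version().first == 1 && Version().second >= minor) ||
         Version().first > 1;
}

struct OldImpl {};
struct NewImpl {};
// Compile-time selection, mirroring the TypedDiscard alias above.
using Impl = std::conditional_t<HasMinorVer<12>(), NewImpl, OldImpl>;

static_assert(std::is_same<Impl, NewImpl>::value, "1.13 satisfies >= 1.12");

int main() { return 0; }
```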
@@ -133,6 +133,7 @@ void RemoveDuplicatedCategories(
     int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
     dh::device_vector<Entry> *p_sorted_entries,
     dh::caching_device_vector<size_t> *p_column_sizes_scan) {
+  info.feature_types.SetDevice(device);
   auto d_feature_types = info.feature_types.ConstDeviceSpan();
   CHECK(!d_feature_types.empty());
   auto &column_sizes_scan = *p_column_sizes_scan;
@@ -124,6 +124,11 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter,

 void SortByWeight(dh::device_vector<float>* weights,
                   dh::device_vector<Entry>* sorted_entries);
+
+void RemoveDuplicatedCategories(
+    int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
+    dh::device_vector<Entry> *p_sorted_entries,
+    dh::caching_device_vector<size_t> *p_column_sizes_scan);
 }  // namespace detail

 // Compute sketch on DMatrix.
@@ -132,9 +137,10 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
                            size_t sketch_batch_num_elements = 0);

 template <typename AdapterBatch>
-void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
-                          size_t begin, size_t end, float missing,
-                          SketchContainer* sketch_container, int num_cuts) {
+void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
+                          int device, size_t columns, size_t begin, size_t end,
+                          float missing, SketchContainer *sketch_container,
+                          int num_cuts) {
   // Copy current subset of valid elements into temporary storage and sort
   dh::device_vector<Entry> sorted_entries;
   dh::caching_device_vector<size_t> column_sizes_scan;
@@ -142,6 +148,7 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
       thrust::make_counting_iterator(0llu),
       [=] __device__(size_t idx) { return batch.GetElement(idx); });
   HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
+  cuts_ptr.SetDevice(device);
   detail::MakeEntriesFromAdapter(batch, batch_iter, {begin, end}, missing,
                                  columns, num_cuts, device,
                                  &cuts_ptr,
@@ -151,8 +158,14 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
   thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
                sorted_entries.end(), detail::EntryCompareOp());

-  auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
-  auto d_cuts_ptr = cuts_ptr.DeviceSpan();
+  if (sketch_container->HasCategorical()) {
+    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
+    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
+                                       &sorted_entries, &column_sizes_scan);
+  }
+
+  auto d_cuts_ptr = cuts_ptr.DeviceSpan();
+  auto const &h_cuts_ptr = cuts_ptr.HostVector();
   // Extract the cuts from all columns concurrently
   sketch_container->Push(dh::ToSpan(sorted_entries),
                          dh::ToSpan(column_sizes_scan), d_cuts_ptr,
@@ -222,6 +235,12 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,

   detail::SortByWeight(&temp_weights, &sorted_entries);

+  if (sketch_container->HasCategorical()) {
+    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
+    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
+                                       &sorted_entries, &column_sizes_scan);
+  }
+
   auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
   auto d_cuts_ptr = cuts_ptr.DeviceSpan();

@@ -274,8 +293,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
                          device, num_cuts_per_feature, false);
     for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
       size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
-      ProcessSlidingWindow(batch, device, num_cols,
-                           begin, end, missing, sketch_container, num_cuts_per_feature);
+      ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
+                           sketch_container, num_cuts_per_feature);
     }
   }
 }
@@ -7,11 +7,28 @@
 #define XGBOOST_COMMON_THREADING_UTILS_H_

 #include <dmlc/common.h>
-#include <vector>
+#include <dmlc/omp.h>

 #include <algorithm>
+#include <limits>
 #include <type_traits>  // std::is_signed
+#include <vector>

 #include "xgboost/logging.h"

+#if !defined(_OPENMP)
+extern "C" {
+inline int32_t omp_get_thread_limit() __GOMP_NOTHROW { return 1; }  // NOLINT
+}
+#endif  // !defined(_OPENMP)
+
+// MSVC doesn't implement the thread limit.
+#if defined(_OPENMP) && defined(_MSC_VER)
+extern "C" {
+inline int32_t omp_get_thread_limit() { return std::numeric_limits<int32_t>::max(); }  // NOLINT
+}
+#endif  // defined(_MSC_VER)
+
 namespace xgboost {
 namespace common {

@@ -153,7 +170,7 @@ struct Sched {
 };

 template <typename Index, typename Func>
-void ParallelFor(Index size, size_t n_threads, Sched sched, Func fn) {
+void ParallelFor(Index size, int32_t n_threads, Sched sched, Func fn) {
 #if defined(_MSC_VER)
   // msvc doesn't support unsigned integer as openmp index.
   using OmpInd = std::conditional_t<std::is_signed<Index>::value, Index, omp_ulong>;
@@ -220,6 +237,13 @@ void ParallelFor(Index size, size_t n_threads, Func fn) {
 template <typename Index, typename Func>
 void ParallelFor(Index size, Func fn) {
   ParallelFor(size, omp_get_max_threads(), Sched::Static(), fn);
+}  // !defined(_OPENMP)
+
+
+inline int32_t OmpGetThreadLimit() {
+  int32_t limit = omp_get_thread_limit();
+  CHECK_GE(limit, 1) << "Invalid thread limit for OpenMP.";
+  return limit;
 }

 /* \brief Configure parallel threads.
@@ -235,15 +259,18 @@ inline int32_t OmpSetNumThreads(int32_t* p_threads) {
   if (threads <= 0) {
     threads = omp_get_num_procs();
   }
+  threads = std::min(threads, OmpGetThreadLimit());
   omp_set_num_threads(threads);
   return nthread_original;
 }

 inline int32_t OmpSetNumThreadsWithoutHT(int32_t* p_threads) {
   auto& threads = *p_threads;
   int32_t nthread_original = omp_get_max_threads();
   if (threads <= 0) {
     threads = nthread_original;
   }
+  threads = std::min(threads, OmpGetThreadLimit());
   omp_set_num_threads(threads);
   return nthread_original;
 }
@@ -252,6 +279,7 @@ inline int32_t OmpGetNumThreads(int32_t n_threads) {
   if (n_threads <= 0) {
     n_threads = omp_get_num_procs();
   }
+  n_threads = std::min(n_threads, OmpGetThreadLimit());
   return n_threads;
 }
 }  // namespace common
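The effect of `OmpGetThreadLimit` is to clamp every computed thread count by OpenMP's `omp_get_thread_limit()` (stubbed to 1 without OpenMP and to `INT32_MAX` on MSVC, per the shims above). A minimal sketch of the clamp, with a hypothetical `GetThreadLimit` standing in for the OpenMP call:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Stand-in for omp_get_thread_limit(); OpenMP reports INT32_MAX when
// OMP_THREAD_LIMIT is unset, or the configured cap otherwise.
int32_t GetThreadLimit() { return 2; }  // pretend OMP_THREAD_LIMIT=2

int32_t ClampThreads(int32_t requested, int32_t n_procs) {
  if (requested <= 0) requested = n_procs;      // "use all cores" default
  return std::min(requested, GetThreadLimit()); // never exceed the limit
}

int main() {
  std::printf("%d\n", ClampThreads(0, 16));  // prints 2, not 16
  return 0;
}
```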
@@ -21,6 +21,7 @@

 #include "array_interface.h"
 #include "../c_api/c_api_error.h"
+#include "../common/math.h"

 namespace xgboost {
 namespace data {
@@ -80,6 +81,24 @@ struct COOTuple {
   float value{0};
 };

+struct IsValidFunctor {
+  float missing;
+
+  XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
+
+  XGBOOST_DEVICE bool operator()(float value) const {
+    return !(common::CheckNAN(value) || value == missing);
+  }
+
+  XGBOOST_DEVICE bool operator()(const data::COOTuple& e) const {
+    return !(common::CheckNAN(e.value) || e.value == missing);
+  }
+
+  XGBOOST_DEVICE bool operator()(const Entry& e) const {
+    return !(common::CheckNAN(e.fvalue) || e.fvalue == missing);
+  }
+};
+
 namespace detail {

 /**
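Hoisting `IsValidFunctor` out of the CUDA adapter header makes the same missing-value predicate usable from host code (the `SparsePage::Push` hunk below is the first consumer). A host-only sketch of the predicate, with `std::isnan` standing in for `common::CheckNAN`:

```cpp
#include <cassert>
#include <cmath>

struct IsValidFunctor {
  float missing;
  explicit IsValidFunctor(float missing) : missing(missing) {}
  bool operator()(float value) const {
    // An element is valid unless it is NaN or equals the missing sentinel.
    return !(std::isnan(value) || value == missing);
  }
};

int main() {
  IsValidFunctor is_valid{-999.0f};
  assert(is_valid(1.5f));
  assert(!is_valid(-999.0f));  // sentinel filtered out
  assert(!is_valid(NAN));      // NaN handled uniformly
  return 0;
}
```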
@@ -987,6 +987,7 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread) {

   // Second pass over batch, placing elements in correct position

+  auto is_valid = data::IsValidFunctor{missing};
 #pragma omp parallel num_threads(nthread)
   {
     exec.Run([&]() {
@@ -998,7 +999,7 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread) {
         for (auto j = 0ull; j < line.Size(); j++) {
           auto element = line.GetElement(j);
           const size_t key = (element.row_idx - base_rowid);
-          if (!common::CheckNAN(element.value) && element.value != missing) {
+          if (is_valid(element)) {
             builder.Push(key, Entry(element.column_idx, element.value), tid);
           }
         }
@@ -15,29 +15,6 @@
 namespace xgboost {
 namespace data {

-struct IsValidFunctor : public thrust::unary_function<Entry, bool> {
-  float missing;
-
-  XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
-
-  __device__ bool operator()(float value) const {
-    return !(common::CheckNAN(value) || value == missing);
-  }
-
-  __device__ bool operator()(const data::COOTuple& e) const {
-    if (common::CheckNAN(e.value) || e.value == missing) {
-      return false;
-    }
-    return true;
-  }
-  __device__ bool operator()(const Entry& e) const {
-    if (common::CheckNAN(e.fvalue) || e.fvalue == missing) {
-      return false;
-    }
-    return true;
-  }
-};
-
 class CudfAdapterBatch : public detail::NoMetaInfo {
   friend class CudfAdapter;
@@ -152,6 +152,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missing) {

   if (batches == 1) {
     this->info_ = std::move(proxy->Info());
+    this->info_.num_nonzero_ = nnz;
     CHECK_EQ(proxy->Info().labels_.Size(), 0);
   }

@@ -49,10 +49,10 @@ class SimpleDMatrix : public DMatrix {
   MetaInfo info_;
   // Primary storage type
   std::shared_ptr<SparsePage> sparse_page_ = std::make_shared<SparsePage>();
-  std::shared_ptr<CSCPage> column_page_;
-  std::shared_ptr<SortedCSCPage> sorted_column_page_;
-  std::shared_ptr<EllpackPage> ellpack_page_;
-  std::shared_ptr<GHistIndexMatrix> gradient_index_;
+  std::shared_ptr<CSCPage> column_page_{nullptr};
+  std::shared_ptr<SortedCSCPage> sorted_column_page_{nullptr};
+  std::shared_ptr<EllpackPage> ellpack_page_{nullptr};
+  std::shared_ptr<GHistIndexMatrix> gradient_index_{nullptr};
   BatchParam batch_param_;

   bool EllpackExists() const override {
@@ -62,9 +62,8 @@ struct GBLinearTrainParam : public XGBoostParameter<GBLinearTrainParam> {
   }
 };

-void LinearCheckLayer(unsigned layer_begin, unsigned layer_end) {
+void LinearCheckLayer(unsigned layer_begin) {
   CHECK_EQ(layer_begin, 0) << "Linear booster does not support prediction range.";
-  CHECK_EQ(layer_end, 0) << "Linear booster does not support prediction range.";
 }

 /*!
@@ -152,7 +151,7 @@ class GBLinear : public GradientBooster {
   void PredictBatch(DMatrix *p_fmat, PredictionCacheEntry *predts,
                     bool training, unsigned layer_begin, unsigned layer_end) override {
     monitor_.Start("PredictBatch");
-    LinearCheckLayer(layer_begin, layer_end);
+    LinearCheckLayer(layer_begin);
     auto* out_preds = &predts->predictions;
     this->PredictBatchInternal(p_fmat, &out_preds->HostVector());
     monitor_.Stop("PredictBatch");
@@ -161,7 +160,7 @@ class GBLinear : public GradientBooster {
   void PredictInstance(const SparsePage::Inst &inst,
                        std::vector<bst_float> *out_preds,
                        unsigned layer_begin, unsigned layer_end) override {
-    LinearCheckLayer(layer_begin, layer_end);
+    LinearCheckLayer(layer_begin);
     const int ngroup = model_.learner_model_param->num_output_group;
     for (int gid = 0; gid < ngroup; ++gid) {
       this->Pred(inst, dmlc::BeginPtr(*out_preds), gid,
@@ -177,7 +176,7 @@ class GBLinear : public GradientBooster {
                             HostDeviceVector<bst_float>* out_contribs,
                             unsigned layer_begin, unsigned layer_end, bool, int, unsigned) override {
     model_.LazyInitModel();
-    LinearCheckLayer(layer_begin, layer_end);
+    LinearCheckLayer(layer_begin);
     const auto &base_margin = p_fmat->Info().base_margin_.ConstHostVector();
     const int ngroup = model_.learner_model_param->num_output_group;
     const size_t ncolumns = model_.learner_model_param->num_feature + 1;
@@ -214,7 +213,7 @@ class GBLinear : public GradientBooster {
   void PredictInteractionContributions(DMatrix* p_fmat,
                                        HostDeviceVector<bst_float>* out_contribs,
                                        unsigned layer_begin, unsigned layer_end, bool) override {
-    LinearCheckLayer(layer_begin, layer_end);
+    LinearCheckLayer(layer_begin);
     std::vector<bst_float>& contribs = out_contribs->HostVector();

     // linear models have no interaction effects
@@ -232,9 +231,11 @@ class GBLinear : public GradientBooster {
   }

   void FeatureScore(std::string const &importance_type,
+                    common::Span<int32_t const> trees,
                     std::vector<bst_feature_t> *out_features,
                     std::vector<float> *out_scores) const override {
     CHECK(!model_.weight.empty()) << "Model is not initialized";
+    CHECK(trees.empty()) << "gblinear doesn't support number of trees for feature importance.";
     CHECK_EQ(importance_type, "weight")
         << "gblinear only has `weight` defined for feature importance.";
     out_features->resize(this->learner_model_param_->num_feature, 0);
@@ -18,6 +18,7 @@ void GBLinearModel::SaveModel(Json* p_out) const {
     j_weights[i] = weight[i];
   }
   out["weights"] = std::move(j_weights);
+  out["boosted_rounds"] = Json{this->num_boosted_rounds};
 }

 void GBLinearModel::LoadModel(Json const& in) {
@@ -27,6 +28,13 @@ void GBLinearModel::LoadModel(Json const& in) {
   for (size_t i = 0; i < n_weights; ++i) {
     weight[i] = get<Number const>(j_weights[i]);
   }
+  auto const& obj = get<Object const>(in);
+  auto boosted_rounds = obj.find("boosted_rounds");
+  if (boosted_rounds != obj.cend()) {
+    this->num_boosted_rounds = get<Integer const>(boosted_rounds->second);
+  } else {
+    this->num_boosted_rounds = 0;
+  }
 }

 DMLC_REGISTER_PARAMETER(DeprecatedGBLinearModelParam);
|||||||
@ -273,6 +273,7 @@ class GBTree : public GradientBooster {
|
|||||||
uint32_t tree_begin, tree_end;
|
uint32_t tree_begin, tree_end;
|
||||||
std::tie(tree_begin, tree_end) =
|
std::tie(tree_begin, tree_end) =
|
||||||
detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
|
detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
|
||||||
|
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
|
||||||
std::vector<Predictor const *> predictors{
|
std::vector<Predictor const *> predictors{
|
||||||
cpu_predictor_.get(),
|
cpu_predictor_.get(),
|
||||||
#if defined(XGBOOST_USE_CUDA)
|
#if defined(XGBOOST_USE_CUDA)
|
||||||
@ -300,7 +301,7 @@ class GBTree : public GradientBooster {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void FeatureScore(std::string const &importance_type,
|
void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
|
||||||
std::vector<bst_feature_t>* features,
|
std::vector<bst_feature_t>* features,
|
||||||
std::vector<float>* scores) const override {
|
std::vector<float>* scores) const override {
|
||||||
// Because feature with no importance doesn't appear in the return value so
|
// Because feature with no importance doesn't appear in the return value so
|
||||||
@ -308,8 +309,18 @@ class GBTree : public GradientBooster {
|
|||||||
// computation.
|
// computation.
|
||||||
std::vector<size_t> split_counts(this->model_.learner_model_param->num_feature, 0);
|
std::vector<size_t> split_counts(this->model_.learner_model_param->num_feature, 0);
|
||||||
std::vector<float> gain_map(this->model_.learner_model_param->num_feature, 0);
|
std::vector<float> gain_map(this->model_.learner_model_param->num_feature, 0);
|
||||||
|
std::vector<int32_t> tree_idx;
|
||||||
|
if (trees.empty()) {
|
||||||
|
tree_idx.resize(this->model_.trees.size());
|
||||||
|
std::iota(tree_idx.begin(), tree_idx.end(), 0);
|
||||||
|
trees = common::Span<int32_t const>(tree_idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto total_n_trees = model_.trees.size();
|
||||||
auto add_score = [&](auto fn) {
|
auto add_score = [&](auto fn) {
|
||||||
for (auto const &p_tree : model_.trees) {
|
for (auto idx : trees) {
|
||||||
|
CHECK_LE(idx, total_n_trees) << "Invalid tree index.";
|
||||||
|
auto const& p_tree = model_.trees[idx];
|
||||||
p_tree->WalkTree([&](bst_node_t nidx) {
|
p_tree->WalkTree([&](bst_node_t nidx) {
|
||||||
auto const& node = (*p_tree)[nidx];
|
auto const& node = (*p_tree)[nidx];
|
||||||
if (!node.IsLeaf()) {
|
if (!node.IsLeaf()) {
|
||||||
|
|||||||
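The convention introduced here is that an empty `trees` span means "use every tree": the span is re-pointed at a freshly `std::iota`-filled index vector. A standalone sketch of that defaulting idiom, with a pointer/length pair standing in for `common::Span`:

```cpp
#include <cstdio>
#include <numeric>
#include <vector>

// Sum selected elements; an empty selection means "all of them",
// mirroring FeatureScore's handling of an empty `trees` span.
int SumSelected(const std::vector<int>& data, const int* sel, size_t n_sel) {
  std::vector<int> all_idx;
  if (n_sel == 0) {
    all_idx.resize(data.size());
    std::iota(all_idx.begin(), all_idx.end(), 0);  // 0, 1, 2, ...
    sel = all_idx.data();
    n_sel = all_idx.size();
  }
  int total = 0;
  for (size_t i = 0; i < n_sel; ++i) total += data[sel[i]];
  return total;
}

int main() {
  std::vector<int> v{10, 20, 30};
  int subset[] = {0, 2};
  int all = SumSelected(v, nullptr, 0);  // 60: defaulted to every index
  int some = SumSelected(v, subset, 2);  // 40: indices 0 and 2 only
  std::printf("%d %d\n", all, some);
  return 0;
}
```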
@@ -1214,11 +1214,10 @@ class LearnerImpl : public LearnerIO {
     *out_preds = &out_predictions.predictions;
   }

-  void CalcFeatureScore(std::string const &importance_type,
-                        std::vector<bst_feature_t> *features,
-                        std::vector<float> *scores) override {
+  void CalcFeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
+                        std::vector<bst_feature_t>* features, std::vector<float>* scores) override {
     this->Configure();
-    gbm_->FeatureScore(importance_type, features, scores);
+    gbm_->FeatureScore(importance_type, trees, features, scores);
   }

   const std::map<std::string, std::string>& GetConfigurationArguments() const override {
|||||||
@ -291,7 +291,7 @@ float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info
|
|||||||
// labels is a vector of size n_samples.
|
// labels is a vector of size n_samples.
|
||||||
float label = labels[idx % n_samples] == class_id;
|
float label = labels[idx % n_samples] == class_id;
|
||||||
|
|
||||||
float w = get_weight(i % n_samples);
|
float w = weights.empty() ? 1.0f : weights[d_sorted_idx[i] % n_samples];
|
||||||
float fp = (1.0 - label) * w;
|
float fp = (1.0 - label) * w;
|
||||||
float tp = label * w;
|
float tp = label * w;
|
||||||
return thrust::make_pair(fp, tp);
|
return thrust::make_pair(fp, tp);
|
||||||
|
|||||||
@ -309,9 +309,8 @@ struct EvalGammaNLogLik {
|
|||||||
float constexpr kPsi = 1.0;
|
float constexpr kPsi = 1.0;
|
||||||
bst_float theta = -1. / py;
|
bst_float theta = -1. / py;
|
||||||
bst_float a = kPsi;
|
bst_float a = kPsi;
|
||||||
// b = -std::log(-theta);
|
float b = -std::log(-theta);
|
||||||
float b = 1.0f;
|
// c = 1. / kPsi^2 * std::log(y/kPsi) - std::log(y) - common::LogGamma(1. / kPsi);
|
||||||
// c = 1. / kPsi * std::log(y/kPsi) - std::log(y) - common::LogGamma(1. / kPsi);
|
|
||||||
// = 1.0f * std::log(y) - std::log(y) - 0 = 0
|
// = 1.0f * std::log(y) - std::log(y) - 0 = 0
|
||||||
float c = 0;
|
float c = 0;
|
||||||
// general form for exponential family.
|
// general form for exponential family.
|
||||||
|
|||||||
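Spelling out the algebra behind this fix: the metric follows the exponential-family form named in the surrounding comments, with dispersion $\psi = \mathrm{kPsi} = 1$ and canonical parameter $\theta = -1/\hat{y}$,

$$
\operatorname{nll}(y, \hat{y}) = -\frac{y\,\theta - b(\theta)}{\psi} + c(y, \psi),
\qquad b(\theta) = -\log(-\theta) = \log \hat{y},
$$

so the log-partition term $b$ must vary with the prediction; pinning it at `1.0f` dropped the $\log\hat{y}$ contribution from every sample. The `test_gamma_lik` case added further down in this change set validates the corrected metric against `scipy.stats.gamma.logpdf`.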
@@ -109,10 +109,9 @@ class ColMaker: public TreeUpdater {
     interaction_constraints_.Configure(param_, dmat->Info().num_row_);
     // build tree
     for (auto tree : trees) {
-      Builder builder(
-          param_,
-          colmaker_param_,
-          interaction_constraints_, column_densities_);
+      CHECK(tparam_);
+      Builder builder(param_, colmaker_param_, interaction_constraints_, tparam_,
+                      column_densities_);
       builder.Update(gpair->ConstHostVector(), dmat, tree);
     }
     param_.learning_rate = lr;
@@ -154,12 +153,12 @@ class ColMaker: public TreeUpdater {
   class Builder {
    public:
    // constructor
-    explicit Builder(const TrainParam& param,
-                     const ColMakerTrainParam& colmaker_train_param,
+    explicit Builder(const TrainParam &param, const ColMakerTrainParam &colmaker_train_param,
                      FeatureInteractionConstraintHost _interaction_constraints,
-                     const std::vector<float> &column_densities)
-        : param_(param), colmaker_train_param_{colmaker_train_param},
-          nthread_(omp_get_max_threads()),
+                     GenericParameter const *ctx, const std::vector<float> &column_densities)
+        : param_(param),
+          colmaker_train_param_{colmaker_train_param},
+          ctx_{ctx},
          tree_evaluator_(param_, column_densities.size(), GenericParameter::kCpuId),
          interaction_constraints_{std::move(_interaction_constraints)},
          column_densities_(column_densities) {}
@@ -238,7 +237,7 @@ class ColMaker: public TreeUpdater {
      // setup temp space for each thread
      // reserve a small space
      stemp_.clear();
-      stemp_.resize(this->nthread_, std::vector<ThreadEntry>());
+      stemp_.resize(this->ctx_->Threads(), std::vector<ThreadEntry>());
      for (auto& i : stemp_) {
        i.clear(); i.reserve(256);
      }
@@ -451,8 +450,9 @@ class ColMaker: public TreeUpdater {
      // start enumeration
      const auto num_features = static_cast<bst_omp_uint>(feat_set.size());
 #if defined(_OPENMP)
+      CHECK(this->ctx_);
      const int batch_size =  // NOLINT
-          std::max(static_cast<int>(num_features / this->nthread_ / 32), 1);
+          std::max(static_cast<int>(num_features / this->ctx_->Threads() / 32), 1);
 #endif  // defined(_OPENMP)
      {
        auto page = batch.GetView();
@@ -553,7 +553,8 @@ class ColMaker: public TreeUpdater {
    virtual void SyncBestSolution(const std::vector<int> &qexpand) {
      for (int nid : qexpand) {
        NodeEntry &e = snode_[nid];
-        for (int tid = 0; tid < this->nthread_; ++tid) {
+        CHECK(this->ctx_);
+        for (int tid = 0; tid < this->ctx_->Threads(); ++tid) {
          e.best.Update(stemp_[tid][nid].best);
        }
      }
@@ -609,7 +610,7 @@ class ColMaker: public TreeUpdater {
    const TrainParam& param_;
    const ColMakerTrainParam& colmaker_train_param_;
    // number of omp thread used during training
-    const int nthread_;
+    GenericParameter const* ctx_;
    common::ColumnSampler column_sampler_;
    // Instance Data: current node position in the tree of each instance
    std::vector<int> position_;
@@ -585,6 +585,7 @@ struct GPUHistMakerDevice {
     CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
         << "Categorical feature value too large.";
     auto cat = common::AsCat(candidate.split.fvalue);
+    common::CheckCat(cat);
     std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
     LBitField32 cats_bits(split_cats);
     cats_bits.Set(cat);
@@ -115,9 +115,6 @@ bool QuantileHistMaker::UpdatePredictionCache(
   }
 }

-template <typename GradientSumT>
-QuantileHistMaker::Builder<GradientSumT>::~Builder() = default;
-
 template <typename GradientSumT>
 template <bool any_missing>
@@ -204,7 +204,6 @@ class QuantileHistMaker: public TreeUpdater {
           new HistogramBuilder<GradientSumT, CPUExpandEntry>} {
       builder_monitor_.Init("Quantile::Builder");
     }
-    ~Builder();
     // update one tree, growing
     virtual void Update(const GHistIndexMatrix& gmat,
                         const ColumnMatrix& column_matrix,
@@ -1,4 +1,4 @@
-name: cpu_test
+name: macos_test
 channels:
 - conda-forge
 dependencies:
@@ -38,6 +38,14 @@ TEST(BitField, Check) {
       ASSERT_FALSE(bits.Check(i));
     }
   }
+
+  {
+    // regression test for correct index type.
+    std::vector<RBitField8::value_type> storage(33, 0);
+    storage[32] = static_cast<uint8_t>(1);
+    auto bits = RBitField8({storage.data(), storage.size()});
+    ASSERT_TRUE(bits.Check(256));
+  }
 }

 template <typename BitFieldT, typename VT = typename BitFieldT::value_type>
@@ -392,6 +392,52 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
   EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required);
 }

+void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
+                                  int32_t num_bins, bool weighted) {
+  auto h_x = GenerateRandomCategoricalSingleColumn(n, num_categories);
+  thrust::device_vector<float> x(h_x);
+  auto adapter = AdapterFromData(x, n, 1);
+  MetaInfo info;
+  info.num_row_ = n;
+  info.num_col_ = 1;
+  info.feature_types.HostVector().push_back(FeatureType::kCategorical);
+
+  if (weighted) {
+    std::vector<float> weights(n, 0);
+    SimpleLCG lcg;
+    SimpleRealUniformDistribution<float> dist(0, 1);
+    for (auto& v : weights) {
+      v = dist(&lcg);
+    }
+    info.weights_.HostVector() = weights;
+  }
+
+  ASSERT_EQ(info.feature_types.Size(), 1);
+  SketchContainer container(info.feature_types, num_bins, 1, n, 0);
+  AdapterDeviceSketch(adapter.Value(), num_bins, info,
+                      std::numeric_limits<float>::quiet_NaN(), &container);
+  HistogramCuts cuts;
+  container.MakeCuts(&cuts);
+
+  thrust::sort(x.begin(), x.end());
+  auto n_uniques = thrust::unique(x.begin(), x.end()) - x.begin();
+  ASSERT_NE(n_uniques, x.size());
+  ASSERT_EQ(cuts.TotalBins(), n_uniques);
+  ASSERT_EQ(n_uniques, num_categories);
+
+  auto& values = cuts.cut_values_.HostVector();
+  ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
+  auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
+  ASSERT_TRUE(is_unique);
+
+  x.resize(n_uniques);
+  h_x.resize(n_uniques);
+  thrust::copy(x.begin(), x.end(), h_x.begin());
+  for (decltype(n_uniques) i = 0; i < n_uniques; ++i) {
+    ASSERT_EQ(h_x[i], values[i]);
+  }
+}
+
 TEST(HistUtil, AdapterDeviceSketchCategorical) {
   int categorical_sizes[] = {2, 6, 8, 12};
   int num_bins = 256;
@@ -404,6 +450,8 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) {
     auto adapter = AdapterFromData(x_device, n, 1);
     ValidateBatchedCuts(adapter, num_bins, adapter.NumColumns(),
                         adapter.NumRows(), dmat.get());
+    TestCategoricalSketchAdapter(n, num_categories, num_bins, true);
+    TestCategoricalSketchAdapter(n, num_categories, num_bins, false);
   }
 }
 }
@@ -430,7 +430,7 @@ TEST(GBTree, FeatureScore) {

   std::vector<bst_feature_t> features_weight;
   std::vector<float> scores_weight;
-  learner->CalcFeatureScore("weight", &features_weight, &scores_weight);
+  learner->CalcFeatureScore("weight", {}, &features_weight, &scores_weight);
   ASSERT_EQ(features_weight.size(), scores_weight.size());
   ASSERT_LE(features_weight.size(), learner->GetNumFeature());
   ASSERT_TRUE(std::is_sorted(features_weight.begin(), features_weight.end()));
@@ -438,11 +438,11 @@ TEST(GBTree, FeatureScore) {
   auto test_eq = [&learner, &scores_weight](std::string type) {
     std::vector<bst_feature_t> features;
     std::vector<float> scores;
-    learner->CalcFeatureScore(type, &features, &scores);
+    learner->CalcFeatureScore(type, {}, &features, &scores);

     std::vector<bst_feature_t> features_total;
     std::vector<float> scores_total;
-    learner->CalcFeatureScore("total_" + type, &features_total, &scores_total);
+    learner->CalcFeatureScore("total_" + type, {}, &features_total, &scores_total);

     for (size_t i = 0; i < scores_weight.size(); ++i) {
       ASSERT_LE(RelError(scores_total[i] / scores[i], scores_weight[i]), kRtEps);
@@ -452,4 +452,47 @@ TEST(GBTree, FeatureScore) {
   test_eq("gain");
   test_eq("cover");
 }
+
+TEST(GBTree, PredictRange) {
+  size_t n_samples = 1000, n_features = 10, n_classes = 4;
+  auto m = RandomDataGenerator{n_samples, n_features, 0.5}.GenerateDMatrix(true, false, n_classes);
+
+  std::unique_ptr<Learner> learner{Learner::Create({m})};
+  learner->SetParam("num_class", std::to_string(n_classes));
+
+  learner->Configure();
+  for (size_t i = 0; i < 2; ++i) {
+    learner->UpdateOneIter(i, m);
+  }
+  HostDeviceVector<float> out_predt;
+  ASSERT_THROW(learner->Predict(m, false, &out_predt, 0, 3), dmlc::Error);
+
+  auto m_1 =
+      RandomDataGenerator{n_samples, n_features, 0.5}.GenerateDMatrix(true, false, n_classes);
+  HostDeviceVector<float> out_predt_full;
+  learner->Predict(m_1, false, &out_predt_full, 0, 0);
+  ASSERT_TRUE(std::equal(out_predt.HostVector().begin(), out_predt.HostVector().end(),
+                         out_predt_full.HostVector().begin()));
+
+  {
+    // inplace predict
+    HostDeviceVector<float> raw_storage;
+    auto raw = RandomDataGenerator{n_samples, n_features, 0.5}.GenerateArrayInterface(&raw_storage);
+    std::shared_ptr<data::ArrayAdapter> x{new data::ArrayAdapter{StringView{raw}}};
+
+    HostDeviceVector<float>* out_predt;
+    learner->InplacePredict(x, nullptr, PredictionType::kValue,
+                            std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 2);
+    auto h_out_predt = out_predt->HostVector();
+    learner->InplacePredict(x, nullptr, PredictionType::kValue,
+                            std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 0);
+    auto h_out_predt_full = out_predt->HostVector();
+
+    ASSERT_TRUE(std::equal(h_out_predt.begin(), h_out_predt.end(), h_out_predt_full.begin()));
+
+    ASSERT_THROW(learner->InplacePredict(x, nullptr, PredictionType::kValue,
+                                         std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 3),
+                 dmlc::Error);
+  }
+}
 }  // namespace xgboost
@@ -143,7 +143,7 @@ void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
 }

 xgboost::bst_float GetMetricEval(xgboost::Metric * metric,
-                                 xgboost::HostDeviceVector<xgboost::bst_float> preds,
+                                 xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
                                  std::vector<xgboost::bst_float> labels,
                                  std::vector<xgboost::bst_float> weights,
                                  std::vector<xgboost::bst_uint> groups) {
@@ -86,7 +86,7 @@ void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,

 xgboost::bst_float GetMetricEval(
     xgboost::Metric * metric,
-    xgboost::HostDeviceVector<xgboost::bst_float> preds,
+    xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
     std::vector<xgboost::bst_float> labels,
     std::vector<xgboost::bst_float> weights = std::vector<xgboost::bst_float>(),
     std::vector<xgboost::bst_uint> groups = std::vector<xgboost::bst_uint>());
@@ -90,6 +90,16 @@ TEST(Metric, DeclareUnifiedTest(MultiAUC)) {
                       },
                       {0, 1, 1});  // no class 2.
   EXPECT_TRUE(std::isnan(auc)) << auc;
+
+  HostDeviceVector<float> predts{
+      0.0f, 1.0f, 0.0f,
+      1.0f, 0.0f, 0.0f,
+      0.0f, 0.0f, 1.0f,
+      0.0f, 0.0f, 1.0f,
+  };
+  std::vector<float> labels {1.0f, 0.0f, 2.0f, 1.0f};
+  auc = GetMetricEval(metric, predts, labels, {1.0f, 2.0f, 3.0f, 4.0f});
+  ASSERT_GT(auc, 0.714);
 }

 TEST(Metric, DeclareUnifiedTest(RankingAUC)) {
@@ -186,6 +186,37 @@ Arrow specification.'''
     assert len(Xy.feature_types) == X.shape[1]
     assert all(t == "c" for t in Xy.feature_types)

+    # test missing value
+    X = cudf.DataFrame({"f0": ["a", "b", np.NaN]})
+    X["f0"] = X["f0"].astype("category")
+    df, cat_codes, _, _ = xgb.data._transform_cudf_df(
+        X, None, None, enable_categorical=True
+    )
+    for col in cat_codes:
+        assert col.has_nulls
+
+    y = [0, 1, 2]
+    with pytest.raises(ValueError):
+        xgb.DMatrix(X, y)
+    Xy = xgb.DMatrix(X, y, enable_categorical=True)
+    assert Xy.num_row() == 3
+    assert Xy.num_col() == 1
+
+    with pytest.raises(ValueError):
+        xgb.DeviceQuantileDMatrix(X, y)
+
+    Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
+    assert Xy.num_row() == 3
+    assert Xy.num_col() == 1
+
+    X = X["f0"]
+    with pytest.raises(ValueError):
+        xgb.DMatrix(X, y)
+
+    Xy = xgb.DMatrix(X, y, enable_categorical=True)
+    assert Xy.num_row() == 3
+    assert Xy.num_col() == 1
+
+
 @pytest.mark.skipif(**tm.no_cudf())
 @pytest.mark.skipif(**tm.no_cupy())
@@ -13,9 +13,11 @@ class TestGPUEvalMetrics:
     def test_roc_auc_binary(self, n_samples):
         self.cpu_test.run_roc_auc_binary("gpu_hist", n_samples)

-    @pytest.mark.parametrize("n_samples", [4, 100, 1000])
-    def test_roc_auc_multi(self, n_samples):
-        self.cpu_test.run_roc_auc_multi("gpu_hist", n_samples)
+    @pytest.mark.parametrize(
+        "n_samples,weighted", [(4, False), (100, False), (1000, False), (1000, True)]
+    )
+    def test_roc_auc_multi(self, n_samples, weighted):
+        self.cpu_test.run_roc_auc_multi("gpu_hist", n_samples, weighted)

     @pytest.mark.parametrize("n_samples", [4, 100, 1000])
     def test_roc_auc_ltr(self, n_samples):
@@ -1,7 +1,6 @@
 import sys
-from hypothesis import strategies, given, settings, assume
+from hypothesis import strategies, given, settings, assume, note
 import pytest
-import numpy
 import xgboost as xgb
 sys.path.append("tests/python")
 import testing as tm
@@ -17,10 +16,14 @@ parameter_strategy = strategies.fixed_dictionaries({
     'top_k': strategies.integers(1, 10),
 })

+
 def train_result(param, dmat, num_rounds):
     result = {}
-    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
-              evals_result=result)
+    booster = xgb.train(
+        param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
+        evals_result=result
+    )
+    assert booster.num_boosted_rounds() == num_rounds
     return result

@@ -33,6 +36,7 @@ class TestGPULinear:
         param['updater'] = 'gpu_coord_descent'
         param = dataset.set_params(param)
         result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        note(result)
         assert tm.non_increasing(result)

     # Loss is not guaranteed to always decrease because of regularisation parameters
@@ -49,6 +53,7 @@ class TestGPULinear:
         param['lambda'] = lambd
         param = dataset.set_params(param)
         result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        note(result)
         assert tm.non_increasing([result[0], result[-1]])

     @pytest.mark.skipif(**tm.no_cupy())
@@ -59,6 +59,7 @@ def test_categorical():
     )
     X = pd.DataFrame(X.todense()).astype("category")
     clf.fit(X, y)
+    assert not clf._can_use_inplace_predict()

     with tempfile.TemporaryDirectory() as tempdir:
         model = os.path.join(tempdir, "categorial.json")
@@ -1,3 +1,4 @@
+from typing import Union
 import xgboost as xgb
 import pytest
 import os
@@ -22,17 +23,14 @@ class TestCallbacks:
         cls.X_valid = X[split:, ...]
         cls.y_valid = y[split:, ...]

-    def run_evaluation_monitor(self, D_train, D_valid, rounds, verbose_eval):
-        evals_result = {}
-        with tm.captured_output() as (out, err):
-            xgb.train({'objective': 'binary:logistic',
-                       'eval_metric': 'error'}, D_train,
-                      evals=[(D_train, 'Train'), (D_valid, 'Valid')],
-                      num_boost_round=rounds,
-                      evals_result=evals_result,
-                      verbose_eval=verbose_eval)
-            output: str = out.getvalue().strip()
-
+    def run_evaluation_monitor(
+        self,
+        D_train: xgb.DMatrix,
+        D_valid: xgb.DMatrix,
+        rounds: int,
+        verbose_eval: Union[bool, int]
+    ):
+        def check_output(output: str) -> None:
             if int(verbose_eval) == 1:
                 # Should print each iteration info
                 assert len(output.split('\n')) == rounds
@@ -40,11 +38,32 @@ class TestCallbacks:
                 # Should print first and latest iteration info
                 assert len(output.split('\n')) == 2
             else:
-                # Should print info by each period additionaly to first and latest iteration
+                # Should print info by each period additionally to first and latest
+                # iteration
                 num_periods = rounds // int(verbose_eval)
                 # Extra information is required for latest iteration
                 is_extra_info_required = num_periods * int(verbose_eval) < (rounds - 1)
-                assert len(output.split('\n')) == 1 + num_periods + int(is_extra_info_required)
+                assert len(output.split('\n')) == (
+                    1 + num_periods + int(is_extra_info_required)
+                )
+
+        evals_result: xgb.callback.TrainingCallback.EvalsLog = {}
+        params = {'objective': 'binary:logistic', 'eval_metric': 'error'}
+        with tm.captured_output() as (out, err):
+            xgb.train(
+                params, D_train,
+                evals=[(D_train, 'Train'), (D_valid, 'Valid')],
+                num_boost_round=rounds,
+                evals_result=evals_result,
+                verbose_eval=verbose_eval,
+            )
+            output: str = out.getvalue().strip()
+            check_output(output)
+
+        with tm.captured_output() as (out, err):
+            xgb.cv(params, D_train, num_boost_round=rounds, verbose_eval=verbose_eval)
+            output = out.getvalue().strip()
+            check_output(output)

     def test_evaluation_monitor(self):
         D_train = xgb.DMatrix(self.X_train, self.y_train)
@ -124,6 +124,35 @@ class TestEvalMetrics:
|
|||||||
skl_gamma_dev = mean_gamma_deviance(y, score)
|
skl_gamma_dev = mean_gamma_deviance(y, score)
|
||||||
np.testing.assert_allclose(gamma_dev, skl_gamma_dev, rtol=1e-6)
|
np.testing.assert_allclose(gamma_dev, skl_gamma_dev, rtol=1e-6)
|
||||||
|
|
||||||
|
@pytest.mark.skipif(**tm.no_sklearn())
|
||||||
|
def test_gamma_lik(self) -> None:
|
||||||
|
import scipy.stats as stats
|
||||||
|
rng = np.random.default_rng(1994)
|
||||||
|
n_samples = 32
|
||||||
|
n_features = 10
|
||||||
|
|
||||||
|
X = rng.normal(0, 1, size=n_samples * n_features).reshape((n_samples, n_features))
|
||||||
|
|
||||||
|
alpha, loc, beta = 5.0, 11.1, 22
|
||||||
|
y = stats.gamma.rvs(alpha, loc=loc, scale=beta, size=n_samples, random_state=rng)
|
||||||
|
reg = xgb.XGBRegressor(tree_method="hist", objective="reg:gamma", n_estimators=64)
|
||||||
|
reg.fit(X, y, eval_metric="gamma-nloglik", eval_set=[(X, y)])
|
||||||
|
|
||||||
|
score = reg.predict(X)
|
||||||
|
|
||||||
|
booster = reg.get_booster()
|
||||||
|
nloglik = float(booster.eval(xgb.DMatrix(X, y)).split(":")[1].split(":")[0])
|
||||||
|
|
||||||
|
# \beta_i = - (1 / \theta_i a)
|
||||||
|
# where \theta_i is the canonical parameter
|
||||||
|
# XGBoost uses the canonical link function of gamma in evaluation function.
|
||||||
|
# so \theta = - (1.0 / y)
|
||||||
|
# dispersion is hardcoded as 1.0, so shape (a in scipy parameter) is also 1.0
|
||||||
|
beta = - (1.0 / (- (1.0 / y))) # == y
|
||||||
|
nloglik_stats = -stats.gamma.logpdf(score, a=1.0, scale=beta)
|
||||||
|
|
||||||
|
np.testing.assert_allclose(nloglik, np.mean(nloglik_stats), rtol=1e-3)
|
||||||
|
|
||||||
def run_roc_auc_binary(self, tree_method, n_samples):
|
def run_roc_auc_binary(self, tree_method, n_samples):
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.datasets import make_classification
|
from sklearn.datasets import make_classification
|
||||||
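Unpacking the comment block in test_gamma_lik: the metric fixes the dispersion at
1.0, so the shape a is 1; the canonical parameter of the gamma family is
\theta = -1/y, and \beta = -1/(\theta a) then collapses back to y, which makes
the reference density a gamma with shape 1, i.e. an exponential. A quick numeric
check of that reduction (illustrative, not part of the test):

import numpy as np
import scipy.stats as stats

x, scale = 3.0, 2.0
# With shape a == 1 the gamma log-density is -log(scale) - x / scale.
by_hand = -np.log(scale) - x / scale
assert np.isclose(stats.gamma.logpdf(x, a=1.0, scale=scale), by_hand)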
@@ -162,11 +191,11 @@ class TestEvalMetrics:
         np.testing.assert_allclose(skl_auc, auc, rtol=1e-6)
 
     @pytest.mark.skipif(**tm.no_sklearn())
-    @pytest.mark.parametrize("n_samples", [4, 100, 1000])
+    @pytest.mark.parametrize("n_samples", [100, 1000])
     def test_roc_auc(self, n_samples):
         self.run_roc_auc_binary("hist", n_samples)
 
-    def run_roc_auc_multi(self, tree_method, n_samples):
+    def run_roc_auc_multi(self, tree_method, n_samples, weighted):
         import numpy as np
         from sklearn.datasets import make_classification
         from sklearn.metrics import roc_auc_score

@@ -184,8 +213,14 @@ class TestEvalMetrics:
             n_classes=n_classes,
             random_state=rng
         )
-        Xy = xgb.DMatrix(X, y)
+        if weighted:
+            weights = rng.randn(n_samples)
+            weights -= weights.min()
+            weights /= weights.max()
+        else:
+            weights = None
+
+        Xy = xgb.DMatrix(X, y, weight=weights)
         booster = xgb.train(
             {
                 "tree_method": tree_method,

@@ -197,16 +232,22 @@ class TestEvalMetrics:
             num_boost_round=8,
         )
         score = booster.predict(Xy)
-        skl_auc = roc_auc_score(y, score, average="weighted", multi_class="ovr")
+        skl_auc = roc_auc_score(
+            y, score, average="weighted", sample_weight=weights, multi_class="ovr"
+        )
         auc = float(booster.eval(Xy).split(":")[1])
         np.testing.assert_allclose(skl_auc, auc, rtol=1e-6)
 
         X = rng.randn(*X.shape)
-        score = booster.predict(xgb.DMatrix(X))
-        skl_auc = roc_auc_score(y, score, average="weighted", multi_class="ovr")
-        auc = float(booster.eval(xgb.DMatrix(X, y)).split(":")[1])
-        np.testing.assert_allclose(skl_auc, auc, rtol=1e-6)
+        score = booster.predict(xgb.DMatrix(X, weight=weights))
+        skl_auc = roc_auc_score(
+            y, score, average="weighted", sample_weight=weights, multi_class="ovr"
+        )
+        auc = float(booster.eval(xgb.DMatrix(X, y, weight=weights)).split(":")[1])
+        np.testing.assert_allclose(skl_auc, auc, rtol=1e-5)
 
-    @pytest.mark.parametrize("n_samples", [4, 100, 1000])
-    def test_roc_auc_multi(self, n_samples):
-        self.run_roc_auc_multi("hist", n_samples)
+    @pytest.mark.parametrize(
+        "n_samples,weighted", [(4, False), (100, False), (1000, False), (1000, True)]
+    )
+    def test_roc_auc_multi(self, n_samples, weighted):
+        self.run_roc_auc_multi("hist", n_samples, weighted)
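The weighted branch checks XGBoost's multi-class AUC against scikit-learn's
roc_auc_score under the same per-sample weights, after shifting and scaling the
weights into [0, 1]. A self-contained sketch of that comparison call; the score
matrix here is synthetic and only exercises the metric (it is not taken from the
test):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score

X, y = make_classification(n_samples=100, n_classes=4, n_informative=8,
                           random_state=0)
rng = np.random.RandomState(0)
weights = rng.randn(100)
weights -= weights.min()  # shift to be non-negative
weights /= weights.max()  # scale into [0, 1]

proba = rng.rand(100, 4)
proba /= proba.sum(axis=1, keepdims=True)  # rows sum to 1, like softprob output
auc = roc_auc_score(y, proba, average="weighted",
                    sample_weight=weights, multi_class="ovr")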
@@ -32,6 +32,7 @@ class TestLinear:
         param.update(coord_param)
         param = dataset.set_params(param)
         result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        note(result)
         assert tm.non_increasing(result, 5e-4)
 
     # Loss is not guaranteed to always decrease because of regularisation parameters
@@ -48,6 +49,7 @@ class TestLinear:
         param.update(coord_param)
         param = dataset.set_params(param)
         result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        note(result)
         assert tm.non_increasing([result[0], result[-1]])
 
     @given(parameter_strategy, strategies.integers(10, 50),
@@ -57,6 +59,7 @@ class TestLinear:
         param['updater'] = 'shotgun'
         param = dataset.set_params(param)
         result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        note(result)
         # shotgun is non-deterministic, so we relax the test by only using first and last
         # iteration.
         if len(result) > 2:
@@ -75,4 +78,5 @@ class TestLinear:
         param['lambda'] = lambd
         param = dataset.set_params(param)
         result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        note(result)
         assert tm.non_increasing([result[0], result[-1]])
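The added note(result) calls come from hypothesis, which these tests already use
through @given: note() attaches the evaluated metric series to the report of a
falsifying example, so a failed property shows the actual loss curve instead of
just the drawn parameters. A minimal illustration (the test body is
hypothetical):

from hypothesis import given, note, strategies

@given(strategies.lists(strategies.floats(0, 1), min_size=2))
def test_series_is_recorded(xs):
    note(f"series: {xs}")  # printed only if this example fails
    assert xs[0] >= 0.0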
@@ -1,6 +1,12 @@
-# -*- coding: utf-8 -*-
+import os
+import tempfile
+import subprocess
+
 import xgboost as xgb
 import numpy as np
+import pytest
+
+import testing as tm
 
 
 class TestOMP:
@@ -71,3 +77,31 @@ class TestOMP:
         assert auc_1 == auc_2 == auc_3
         assert np.array_equal(auc_1, auc_2)
         assert np.array_equal(auc_1, auc_3)
+
+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_with_omp_thread_limit(self):
+        args = [
+            "python", os.path.join(
+                tm.PROJECT_ROOT, "tests", "python", "with_omp_limit.py"
+            )
+        ]
+        results = []
+        with tempfile.TemporaryDirectory() as tmpdir:
+            for i in (1, 2, 16):
+                path = os.path.join(tmpdir, str(i))
+                with open(path, "w") as fd:
+                    fd.write("\n")
+                cp = args.copy()
+                cp.append(path)
+
+                env = os.environ.copy()
+                env["OMP_THREAD_LIMIT"] = str(i)
+
+                status = subprocess.call(cp, env=env)
+                assert status == 0
+
+                with open(path, "r") as fd:
+                    results.append(float(fd.read()))
+
+        for auc in results:
+            np.testing.assert_allclose(auc, results[0])
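OMP_THREAD_LIMIT is a standard OpenMP environment variable that caps the number
of threads a process may use; the test reruns the same training script under
several caps and requires the written AUC to be identical, i.e. the thread count
must not change the model. A stripped-down sketch of the invocation pattern, with
a hypothetical helper script that prints one metric to stdout:

import os
import subprocess

script = "train_and_print_auc.py"  # hypothetical helper
outputs = []
for limit in (1, 2, 16):
    env = os.environ.copy()
    env["OMP_THREAD_LIMIT"] = str(limit)  # OpenMP-level thread cap
    proc = subprocess.run(["python", script], env=env,
                          capture_output=True, text=True, check=True)
    outputs.append(proc.stdout.strip())

# Results must agree regardless of the thread limit.
assert all(out == outputs[0] for out in outputs)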
@@ -705,8 +705,7 @@ async def run_from_dask_array_asyncio(scheduler_address: str) -> xgb.dask.TrainReturnT:
 async def run_dask_regressor_asyncio(scheduler_address: str) -> None:
     async with Client(scheduler_address, asynchronous=True) as client:
         X, y, _ = generate_array()
-        regressor = await xgb.dask.DaskXGBRegressor(verbosity=1,
-                                                    n_estimators=2)
+        regressor = await xgb.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
         regressor.set_params(tree_method='hist')
         regressor.client = client
         await regressor.fit(X, y, eval_set=[(X, y)])
@@ -138,9 +138,22 @@ class TestPandas:
             X, enable_categorical=True
        )
 
-        assert np.issubdtype(transformed[:, 0].dtype, np.integer)
         assert transformed[:, 0].min() == 0
 
+        # test missing value
+        X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
+        X["f0"] = X["f0"].astype("category")
+        arr, _, _ = xgb.data._transform_pandas_df(X, enable_categorical=True)
+        assert not np.any(arr == -1.0)
+
+        X = X["f0"]
+        with pytest.raises(ValueError):
+            xgb.DMatrix(X, y)
+
+        Xy = xgb.DMatrix(X, y, enable_categorical=True)
+        assert Xy.num_row() == 3
+        assert Xy.num_col() == 1
+
     def test_pandas_sparse(self):
         import pandas as pd
         rows = 100
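Background for the new assert not np.any(arr == -1.0): pandas encodes a missing
value in a categorical column as code -1, so if raw category codes were passed
through unchanged, NaN would silently turn into a valid-looking category. A quick
demonstration of the underlying pandas behaviour:

import numpy as np
import pandas as pd

s = pd.Series(["a", "b", np.nan], dtype="category")
print(s.cat.codes.tolist())  # [0, 1, -1]; NaN is marked with code -1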
26
tests/python/with_omp_limit.py
Normal file
@@ -0,0 +1,26 @@
+import os
+import xgboost as xgb
+from sklearn.datasets import make_classification
+from sklearn.metrics import roc_auc_score
+import sys
+
+
+def run_omp(output_path: str):
+    X, y = make_classification(
+        n_samples=200, n_features=32, n_classes=3, n_informative=8
+    )
+    Xy = xgb.DMatrix(X, y, nthread=16)
+    booster = xgb.train(
+        {"num_class": 3, "objective": "multi:softprob", "n_jobs": 16},
+        Xy,
+        num_boost_round=8,
+    )
+    score = booster.predict(Xy)
+    auc = roc_auc_score(y, score, average="weighted", multi_class="ovr")
+    with open(output_path, "w") as fd:
+        fd.write(str(auc))
+
+
+if __name__ == "__main__":
+    out = sys.argv[1]
+    run_omp(out)
@@ -1,16 +1,5 @@
 #!/bin/bash
 
-if [ ${TRAVIS_OS_NAME} == "osx" ]; then
-  # https://travis-ci.community/t/macos-build-fails-because-of-homebrew-bundle-unknown-command/7296/27
-  # Use libomp 11.1.0: https://github.com/dmlc/xgboost/issues/7039
-  brew update  # Force update, so that update doesn't overwrite our version of libomp.rb
-  wget https://raw.githubusercontent.com/Homebrew/homebrew-core/679923b4eb48a8dc7ecc1f05d06063cd79b3fc00/Formula/libomp.rb -O $(find $(brew --repository) -name libomp.rb)
-  brew install cmake libomp
-  brew pin libomp
-fi
-
-
 if [ ${TASK} == "python_test" ] || [ ${TASK} == "python_sdist_test" ]; then
   if [ ${TRAVIS_OS_NAME} == "osx" ]; then
     wget --no-verbose -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh