Compare commits

...

20 Commits

Author SHA1 Message Date
Hyunsu Cho
eb69c6110a Bump version to 1.5.1 2021-11-22 14:29:59 -08:00
Jiaming Yuan
0f9ffcdc16 [backport] Fix R CRAN failures. (#7404) (#7451)
* Remove hist builder dtor.

* Initialize values.

* Tolerance.

* Remove the use of nthread in col maker.
2021-11-19 21:40:04 +08:00
Jiaming Yuan
9bbd00a49f [backport] Set use_logger in tracker to false. (#7438) (#7439) 2021-11-16 09:51:37 +08:00
Jiaming Yuan
7e239f229c [CI] Install igraph as binary. (#7417) (#7430) 2021-11-13 01:53:41 +08:00
Jiaming Yuan
a013942649 Check number of trees in inplace predict. (#7409) (#7424) 2021-11-12 19:31:31 +08:00
Jiaming Yuan
4d2ea0d4ef [backport] [doc] Fix broken links. (#7341) (#7418)
* Fix most of the link checks from sphinx.
* Remove duplicate explicit target name.
2021-11-11 19:33:02 +08:00
Jiaming Yuan
d1052b5cfe [jvm-packages] Fix json4s binary compatibility issue (#7376) (#7414)
Spark 3.2 depends on json4s 3.7.0-M11, which has changed the signatures of some
implicit functions. As a result, xgboost4j built against Spark 3.0/3.1
fails when saving the model.

Co-authored-by: Bobby Wang <wbo4958@gmail.com>
2021-11-10 21:25:11 +08:00
Jiaming Yuan
14c56f05da [backport] Handle missing values in dataframe with category dtype. (#7331) (#7413)
* Handle missing values in dataframe with category dtype. (#7331)

* Replace -1 in pandas initializer.
* Unify `IsValid` functor.
* Mimic pandas data handling in cuDF glue code.
* Check invalid categories.
* Fix DDM sketching.

* Fix pick error.
2021-11-10 21:24:46 +08:00
Jiaming Yuan
11f8b5cfcd [backport] Support building with CTK11.5. (#7379) (#7411)
* Support building with CTK11.5.

* Require system cub installation for CTK11.4+.
* Check thrust version for segmented sort.
2021-11-10 19:23:29 +08:00
Jiaming Yuan
e7ac2486eb [backport] [R] Fix global feature importance and predict with 1 sample. (#7394) (#7397)
* [R] Fix global feature importance.

* Add implementation for tree index.  The parameter is not documented in C API since we
should work on porting the model slicing to R instead of supporting more use of tree
index.

* Fix the difference between "gain" and "total_gain".

* debug.

* Fix prediction.
2021-11-06 00:07:36 +08:00
Jiaming Yuan
a3d195e73e Handle OMP_THREAD_LIMIT. (#7390) (#7391) 2021-11-03 20:25:51 +08:00
Jiaming Yuan
fab3c05ced Move macos test to github action. (#7382) (#7392)
Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
2021-11-03 18:39:47 +08:00
Jiaming Yuan
584b45a9cc Release 1.5.0. (#7317) 2021-10-15 12:21:04 +08:00
Jiaming Yuan
30c1b5c54c [backport] Fix prediction with cat data in sklearn interface. (#7306) (#7312)
* Specify DMatrix parameter for pre-processing dataframe.
* Add document about the behaviour of prediction.
2021-10-12 18:49:57 +08:00
Jiaming Yuan
36e247aca4 Fix weighted samples in multi-class AUC. (#7300) (#7305) 2021-10-11 18:00:36 +08:00
Jiaming Yuan
c4aff733bb [backport] Fix cv verbose_eval (#7291) (#7296) 2021-10-08 14:24:27 +08:00
Jiaming Yuan
cdbfd21d31 [backport] Fix gamma neg log likelihood. (#7275) (#7285) 2021-10-05 23:01:11 +08:00
Jiaming Yuan
508a0b0dbd [backport] [R] Fix document for nthread. (#7263) (#7269) 2021-09-28 14:41:32 +08:00
Jiaming Yuan
e04e773f9f Add RC1 tag for building packages. (#7261) 2021-09-28 11:50:18 +08:00
Jiaming Yuan
1debabb321 Change version to 1.5.0. (#7258) 2021-09-26 13:27:54 +08:00
72 changed files with 923 additions and 323 deletions

View File

@@ -9,7 +9,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
os: [windows-latest, ubuntu-latest] os: [windows-latest, ubuntu-latest, macos-10.15]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2

View File

@@ -51,7 +51,8 @@ jobs:
strategy: strategy:
matrix: matrix:
config: config:
- {os: windows-2016, compiler: 'msvc', python-version: '3.8'} - {os: windows-2016, python-version: '3.8'}
# macOS entry: `python-version` needs the `:` separator, otherwise the entry
# defines no `python-version` key for `matrix.config.python-version` to read.
- {os: macos-10.15, python-version: '3.8'}
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
@@ -71,15 +72,27 @@ jobs:
conda info conda info
conda list conda list
- name: Build XGBoost with msvc - name: Build XGBoost on Windows
shell: bash -l {0} shell: bash -l {0}
if: matrix.config.compiler == 'msvc' if: matrix.config.os == 'windows-2016'
run: | run: |
mkdir build_msvc mkdir build_msvc
cd build_msvc cd build_msvc
cmake .. -G"Visual Studio 15 2017" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON cmake .. -G"Visual Studio 15 2017" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON
cmake --build . --config Release --parallel $(nproc) cmake --build . --config Release --parallel $(nproc)
- name: Build XGBoost on macos
if: matrix.config.os == 'macos-10.15'
run: |
wget https://raw.githubusercontent.com/Homebrew/homebrew-core/679923b4eb48a8dc7ecc1f05d06063cd79b3fc00/Formula/libomp.rb -O $(find $(brew --repository) -name libomp.rb)
brew install ninja libomp
brew pin libomp
mkdir build
cd build
cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON
ninja
- name: Install Python package - name: Install Python package
shell: bash -l {0} shell: bash -l {0}
run: | run: |
@@ -92,3 +105,21 @@ jobs:
shell: bash -l {0} shell: bash -l {0}
run: | run: |
pytest -s -v ./tests/python pytest -s -v ./tests/python
- name: Rename Python wheel
shell: bash -l {0}
if: matrix.config.os == 'macos-10.15'
run: |
TAG=macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64
python tests/ci_build/rename_whl.py python-package/dist/*.whl ${{ github.sha }} ${TAG}
# Publish the renamed macOS wheel to the nightly-builds S3 bucket.
- name: Upload Python wheel
  shell: bash -l {0}
  # Only upload from master / release_* branches, and only from the macOS
  # matrix entry. The matrix in this job is keyed by `config`, so the OS must
  # be read from `matrix.config.os` (same test as the "Rename Python wheel"
  # step); `matrix.os` is not defined here and would never match.
  if: |
    (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
    matrix.config.os == 'macos-10.15'
  # NOTE(review): the destination prefix relies on a step with
  # `id: extract_branch` that is not visible in this hunk - confirm it exists
  # in this job, otherwise the branch component of the S3 path will be empty.
  run: |
    python -m awscli s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read
  env:
    AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}

View File

@@ -3,7 +3,7 @@ name: XGBoost-R-Tests
on: [push, pull_request] on: [push, pull_request]
env: env:
R_PACKAGES: c('XML', 'igraph', 'data.table', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm', 'devtools', 'float', 'titanic') R_PACKAGES: c('XML', 'data.table', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm', 'devtools', 'float', 'titanic')
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
jobs: jobs:
@@ -40,6 +40,11 @@ jobs:
install.packages(${{ env.R_PACKAGES }}, install.packages(${{ env.R_PACKAGES }},
repos = 'http://cloud.r-project.org', repos = 'http://cloud.r-project.org',
dependencies = c('Depends', 'Imports', 'LinkingTo')) dependencies = c('Depends', 'Imports', 'LinkingTo'))
- name: Install igraph on Windows
shell: Rscript {0}
if: matrix.config.os == 'windows-latest'
run: |
install.packages('igraph', type='binary')
- name: Run lintr - name: Run lintr
run: | run: |
@@ -83,6 +88,11 @@ jobs:
install.packages(${{ env.R_PACKAGES }}, install.packages(${{ env.R_PACKAGES }},
repos = 'http://cloud.r-project.org', repos = 'http://cloud.r-project.org',
dependencies = c('Depends', 'Imports', 'LinkingTo')) dependencies = c('Depends', 'Imports', 'LinkingTo'))
- name: Install igraph on Windows
shell: Rscript {0}
if: matrix.config.os == 'windows-2016'
run: |
install.packages('igraph', type='binary', dependencies = c('Depends', 'Imports', 'LinkingTo'))
- uses: actions/setup-python@v2 - uses: actions/setup-python@v2
with: with:
@@ -91,7 +101,7 @@ jobs:
- name: Test R - name: Test R
run: | run: |
python tests/ci_build/test_r_package.py --compiler="${{ matrix.config.compiler }}" --build-tool="${{ matrix.config.build }}" python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool='${{ matrix.config.build }}'
test-R-CRAN: test-R-CRAN:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@@ -115,7 +125,7 @@ jobs:
- name: Install system packages - name: Install system packages
run: | run: |
sudo apt-get update && sudo apt-get install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev pandoc pandoc-citeproc sudo apt-get update && sudo apt-get install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev pandoc pandoc-citeproc libglpk-dev
- name: Cache R packages - name: Cache R packages
uses: actions/cache@v2 uses: actions/cache@v2
@@ -130,6 +140,7 @@ jobs:
install.packages(${{ env.R_PACKAGES }}, install.packages(${{ env.R_PACKAGES }},
repos = 'http://cloud.r-project.org', repos = 'http://cloud.r-project.org',
dependencies = c('Depends', 'Imports', 'LinkingTo')) dependencies = c('Depends', 'Imports', 'LinkingTo'))
install.packages('igraph', repos = 'http://cloud.r-project.org', dependencies = c('Depends', 'Imports', 'LinkingTo'))
- name: Check R Package - name: Check R Package
run: | run: |

View File

@@ -10,14 +10,6 @@ env:
jobs: jobs:
include: include:
- os: osx
arch: amd64
osx_image: xcode10.2
env: TASK=python_test
- os: osx
arch: amd64
osx_image: xcode10.2
env: TASK=java_test
- os: linux - os: linux
arch: s390x arch: s390x
env: TASK=s390x_test env: TASK=s390x_test
@@ -33,8 +25,6 @@ addons:
before_install: before_install:
- source tests/travis/travis_setup_env.sh - source tests/travis/travis_setup_env.sh
- if [ "${TASK}" != "python_sdist_test" ]; then export PYTHONPATH=${PYTHONPATH}:${PWD}/python-package; fi
- echo "MAVEN_OPTS='-Xmx2g -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=512m -Dorg.slf4j.simpleLogger.defaultLogLevel=error'" > ~/.mavenrc
install: install:
- source tests/travis/setup.sh - source tests/travis/setup.sh

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR) cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(xgboost LANGUAGES CXX C VERSION 1.5.0) project(xgboost LANGUAGES CXX C VERSION 1.5.1)
include(cmake/Utils.cmake) include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules") list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0022 NEW)
@@ -135,6 +135,10 @@ if (USE_CUDA)
set(GEN_CODE "") set(GEN_CODE "")
format_gencode_flags("${GPU_COMPUTE_VER}" GEN_CODE) format_gencode_flags("${GPU_COMPUTE_VER}" GEN_CODE)
add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap) add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap)
if ((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 11.4) AND (NOT BUILD_WITH_CUDA_CUB))
message(SEND_ERROR "`BUILD_WITH_CUDA_CUB` should be set to `ON` for CUDA >= 11.4")
endif ()
endif (USE_CUDA) endif (USE_CUDA)
if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND

View File

@@ -1,8 +1,8 @@
Package: xgboost Package: xgboost
Type: Package Type: Package
Title: Extreme Gradient Boosting Title: Extreme Gradient Boosting
Version: 1.5.0.1 Version: 1.5.1.1
Date: 2020-08-28 Date: 2021-10-13
Authors@R: c( Authors@R: c(
person("Tianqi", "Chen", role = c("aut"), person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"), email = "tianqi.tchen@gmail.com"),

View File

@@ -397,6 +397,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
shape <- predts$shape shape <- predts$shape
ret <- predts$results ret <- predts$results
n_ret <- length(ret)
n_row <- nrow(newdata) n_row <- nrow(newdata)
if (n_row != shape[1]) { if (n_row != shape[1]) {
stop("Incorrect predict shape.") stop("Incorrect predict shape.")
@@ -405,36 +406,55 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
arr <- array(data = ret, dim = rev(shape)) arr <- array(data = ret, dim = rev(shape))
cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
n_groups <- shape[2]
## Needed regardless of whether strict shape is being used.
if (predcontrib) { if (predcontrib) {
dimnames(arr) <- list(cnames, NULL, NULL) dimnames(arr) <- list(cnames, NULL, NULL)
if (!strict_shape) {
arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
}
} else if (predinteraction) { } else if (predinteraction) {
dimnames(arr) <- list(cnames, cnames, NULL, NULL) dimnames(arr) <- list(cnames, cnames, NULL, NULL)
if (!strict_shape) { }
arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col] if (strict_shape) {
} return(arr) # strict shape is calculated by libxgboost uniformly.
} }
if (!strict_shape) { if (predleaf) {
n_groups <- shape[2] ## Predict leaf
if (predleaf) { arr <- if (n_ret == n_row) {
arr <- matrix(arr, nrow = n_row, byrow = TRUE) matrix(arr, ncol = 1)
} else if (predcontrib && n_groups != 1) { } else {
arr <- lapply(seq_len(n_groups), function(g) arr[g, , ]) matrix(arr, nrow = n_row, byrow = TRUE)
} else if (predinteraction && n_groups != 1) {
arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
} else if (!reshape && n_groups != 1) {
arr <- ret
} else if (reshape && n_groups != 1) {
arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
} }
arr <- drop(arr) } else if (predcontrib) {
if (length(dim(arr)) == 1) { ## Predict contribution
arr <- as.vector(arr) arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
} else if (length(dim(arr)) == 2) { arr <- if (n_ret == n_row) {
arr <- as.matrix(arr) matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
} else if (n_groups != 1) {
## turns array into list of matrices
lapply(seq_len(n_groups), function(g) arr[g, , ])
} else {
## remove the first axis (group)
as.matrix(arr[1, , ])
}
} else if (predinteraction) {
## Predict interaction
arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
arr <- if (n_ret == n_row) {
matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
} else if (n_groups != 1) {
## turns array into list of matrices
lapply(seq_len(n_groups), function(g) arr[g, , , ])
} else {
## remove the first axis (group)
arr[1, , , ]
}
} else {
## Normal prediction
arr <- if (reshape && n_groups != 1) {
matrix(arr, ncol = n_groups, byrow = TRUE)
} else {
as.vector(ret)
} }
} }
return(arr) return(arr)

View File

@@ -11,6 +11,7 @@
#' @param missing a float value to represent missing values in data (used only when input is a dense matrix). #' @param missing a float value to represent missing values in data (used only when input is a dense matrix).
#' It is useful when a 0 or some other extreme value represents missing values in data. #' It is useful when a 0 or some other extreme value represents missing values in data.
#' @param silent whether to suppress printing an informational message after loading from a file. #' @param silent whether to suppress printing an informational message after loading from a file.
#' @param nthread Number of threads used for creating DMatrix.
#' @param ... the \code{info} data could be passed directly as parameters, without creating an \code{info} list. #' @param ... the \code{info} data could be passed directly as parameters, without creating an \code{info} list.
#' #'
#' @examples #' @examples

View File

@@ -115,14 +115,14 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
} else { } else {
concatenated <- list() concatenated <- list()
output_names <- vector() output_names <- vector()
for (importance_type in c("weight", "gain", "cover")) { for (importance_type in c("weight", "total_gain", "total_cover")) {
args <- list(importance_type = importance_type, feature_names = feature_names) args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
results <- .Call( results <- .Call(
XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null") XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
) )
names(results) <- c("features", "shape", importance_type) names(results) <- c("features", "shape", importance_type)
concatenated[ concatenated[
switch(importance_type, "weight" = "Frequency", "gain" = "Gain", "cover" = "Cover") switch(importance_type, "weight" = "Frequency", "total_gain" = "Gain", "total_cover" = "Cover")
] <- results[importance_type] ] <- results[importance_type]
output_names <- results$features output_names <- results$features
} }

View File

@@ -9,8 +9,8 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
early_stopping_rounds = NULL, maximize = NULL, early_stopping_rounds = NULL, maximize = NULL,
save_period = NULL, save_name = "xgboost.model", save_period = NULL, save_name = "xgboost.model",
xgb_model = NULL, callbacks = list(), ...) { xgb_model = NULL, callbacks = list(), ...) {
merged <- check.booster.params(params, ...)
dtrain <- xgb.get.DMatrix(data, label, missing, weight, nthread = params$nthread) dtrain <- xgb.get.DMatrix(data, label, missing, weight, nthread = merged$nthread)
watchlist <- list(train = dtrain) watchlist <- list(train = dtrain)

View File

@@ -4,7 +4,14 @@
\alias{xgb.DMatrix} \alias{xgb.DMatrix}
\title{Construct xgb.DMatrix object} \title{Construct xgb.DMatrix object}
\usage{ \usage{
xgb.DMatrix(data, info = list(), missing = NA, silent = FALSE, ...) xgb.DMatrix(
data,
info = list(),
missing = NA,
silent = FALSE,
nthread = NULL,
...
)
} }
\arguments{ \arguments{
\item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, or a character \item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, or a character
@@ -18,6 +25,8 @@ It is useful when a 0 or some other extreme value represents missing values in d
\item{silent}{whether to suppress printing an informational message after loading from a file.} \item{silent}{whether to suppress printing an informational message after loading from a file.}
\item{nthread}{Number of threads used for creating DMatrix.}
\item{...}{the \code{info} data could be passed directly as parameters, without creating an \code{info} list.} \item{...}{the \code{info} data could be passed directly as parameters, without creating an \code{info} list.}
} }
\description{ \description{

View File

@@ -1,3 +1,4 @@
library(testthat)
context('Test helper functions') context('Test helper functions')
require(xgboost) require(xgboost)
@@ -227,7 +228,7 @@ if (grepl('Windows', Sys.info()[['sysname']]) ||
X <- 10^runif(100, -20, 20) X <- 10^runif(100, -20, 20)
if (capabilities('long.double')) { if (capabilities('long.double')) {
X2X <- as.numeric(format(X, digits = 17)) X2X <- as.numeric(format(X, digits = 17))
expect_identical(X, X2X) expect_equal(X, X2X, tolerance = float_tolerance)
} }
# retrieved attributes to be the same as written # retrieved attributes to be the same as written
for (x in X) { for (x in X) {
@@ -310,7 +311,35 @@ test_that("xgb.importance works with and without feature names", {
# for multiclass # for multiclass
imp.Tree <- xgb.importance(model = mbst.Tree) imp.Tree <- xgb.importance(model = mbst.Tree)
expect_equal(dim(imp.Tree), c(4, 4)) expect_equal(dim(imp.Tree), c(4, 4))
xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds))
trees <- seq(from = 0, by = 2, length.out = 2)
importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
importance_from_dump <- function() {
model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
imp <- xgb.model.dt.tree(
feature_names = feature.names,
text = model_text_dump,
trees = trees
)[
Feature != "Leaf", .(
Gain = sum(Quality),
Cover = sum(Cover),
Frequency = .N
),
by = Feature
][
, `:=`(
Gain = Gain / sum(Gain),
Cover = Cover / sum(Cover),
Frequency = Frequency / sum(Frequency)
)
][
order(Gain, decreasing = TRUE)
]
imp
}
expect_equal(importance_from_dump(), importance, tolerance = 1e-6)
}) })
test_that("xgb.importance works with GLM model", { test_that("xgb.importance works with GLM model", {

View File

@@ -1 +1 @@
@xgboost_VERSION_MAJOR@.@xgboost_VERSION_MINOR@.@xgboost_VERSION_PATCH@-dev @xgboost_VERSION_MAJOR@.@xgboost_VERSION_MINOR@.@xgboost_VERSION_PATCH@

View File

@@ -148,7 +148,8 @@ From the command line on Linux starting from the XGBoost directory:
mkdir build mkdir build
cd build cd build
cmake .. -DUSE_CUDA=ON # For CUDA toolkit >= 11.4, `BUILD_WITH_CUDA_CUB` is required.
cmake .. -DUSE_CUDA=ON -DBUILD_WITH_CUDA_CUB=ON
make -j4 make -j4
.. note:: Specifying compute capability .. note:: Specifying compute capability

View File

@@ -18,7 +18,7 @@ Making a Release
1. Create an issue for the release, noting the estimated date and expected features or major fixes, pin that issue. 1. Create an issue for the release, noting the estimated date and expected features or major fixes, pin that issue.
2. Bump release version. 2. Bump release version.
1. Modify ``CMakeLists.txt`` source tree, run CMake. 1. Modify ``CMakeLists.txt`` in source tree and ``cmake/Python_version.in`` if needed, run CMake.
2. Modify ``DESCRIPTION`` in R-package. 2. Modify ``DESCRIPTION`` in R-package.
3. Run ``change_version.sh`` in ``jvm-packages/dev`` 3. Run ``change_version.sh`` in ``jvm-packages/dev``
3. Commit the change, create a PR on GitHub on release branch. Port the bumped version to default branch, optionally with the postfix ``SNAPSHOT``. 3. Commit the change, create a PR on GitHub on release branch. Port the bumped version to default branch, optionally with the postfix ``SNAPSHOT``.

View File

@@ -95,13 +95,13 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as
shap_interaction_values = model.predict(dtrain, pred_interactions=True) shap_interaction_values = model.predict(dtrain, pred_interactions=True)
See examples `here See examples `here
<https://github.com/dmlc/xgboost/tree/master/demo/gpu_acceleration>`_. <https://github.com/dmlc/xgboost/tree/master/demo/gpu_acceleration>`__.
Multi-node Multi-GPU Training Multi-node Multi-GPU Training
============================= =============================
XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_. For XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_. For
getting started see our tutorial :doc:`/tutorials/dask` and worked examples `here getting started see our tutorial :doc:`/tutorials/dask` and worked examples `here
<https://github.com/dmlc/xgboost/tree/master/demo/dask>`_, also Python documentation <https://github.com/dmlc/xgboost/tree/master/demo/dask>`__, also Python documentation
:ref:`dask_api` for complete reference. :ref:`dask_api` for complete reference.
@@ -238,7 +238,7 @@ Working memory is allocated inside the algorithm proportional to the number of r
The quantile finding algorithm also uses some amount of working device memory. It is able to operate in batches, but is not currently well optimised for sparse data. The quantile finding algorithm also uses some amount of working device memory. It is able to operate in batches, but is not currently well optimised for sparse data.
If you are getting out-of-memory errors on a big dataset, try the `external memory version <../tutorials/external_memory.html>`_. If you are getting out-of-memory errors on a big dataset, try the :doc:`external memory version </tutorials/external_memory>`.
Developer notes Developer notes
=============== ===============

View File

@@ -79,7 +79,7 @@ The first thing in data transformation is to load the dataset as Spark's structu
StructField("class", StringType, true))) StructField("class", StringType, true)))
val rawInput = spark.read.schema(schema).csv("input_path") val rawInput = spark.read.schema(schema).csv("input_path")
At the first line, we create an instance of `SparkSession <http://spark.apache.org/docs/latest/sql-programming-guide.html#starting-point-sparksession>`_ which is the entry of any Spark program working with DataFrame. The ``schema`` variable defines the schema of DataFrame wrapping Iris data. With this explicitly set schema, we can define the columns' name as well as their types; otherwise the column name would be the default ones derived by Spark, such as ``_col0``, etc. Finally, we can use Spark's built-in csv reader to load Iris csv file as a DataFrame named ``rawInput``. At the first line, we create an instance of `SparkSession <https://spark.apache.org/docs/latest/sql-getting-started.html#starting-point-sparksession>`_ which is the entry of any Spark program working with DataFrame. The ``schema`` variable defines the schema of DataFrame wrapping Iris data. With this explicitly set schema, we can define the columns' name as well as their types; otherwise the column name would be the default ones derived by Spark, such as ``_col0``, etc. Finally, we can use Spark's built-in csv reader to load Iris csv file as a DataFrame named ``rawInput``.
Spark also contains many built-in readers for other format. The latest version of Spark supports CSV, JSON, Parquet, and LIBSVM. Spark also contains many built-in readers for other format. The latest version of Spark supports CSV, JSON, Parquet, and LIBSVM.
@@ -130,7 +130,7 @@ labels. A DataFrame like this (containing vector-represented features and numeri
Dealing with missing values Dealing with missing values
~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
XGBoost supports missing values by default (`as described here <https://xgboost.readthedocs.io/en/latest/faq.html#how-to-deal-with-missing-value>`_). XGBoost supports missing values by default (`as described here <https://xgboost.readthedocs.io/en/latest/faq.html#how-to-deal-with-missing-values>`_).
If given a SparseVector, XGBoost will treat any values absent from the SparseVector as missing. You are also able to If given a SparseVector, XGBoost will treat any values absent from the SparseVector as missing. You are also able to
specify to XGBoost to treat a specific value in your Dataset as if it was a missing value. By default XGBoost will treat NaN as the value representing missing. specify to XGBoost to treat a specific value in your Dataset as if it was a missing value. By default XGBoost will treat NaN as the value representing missing.
@@ -369,7 +369,7 @@ Then we can load this model with single node Python XGBoost:
When interacting with other language bindings, XGBoost also supports saving-models-to and loading-models-from file systems other than the local one. You can use HDFS and S3 by prefixing the path with ``hdfs://`` and ``s3://`` respectively. However, for this capability, you must do **one** of the following: When interacting with other language bindings, XGBoost also supports saving-models-to and loading-models-from file systems other than the local one. You can use HDFS and S3 by prefixing the path with ``hdfs://`` and ``s3://`` respectively. However, for this capability, you must do **one** of the following:
1. Build XGBoost4J-Spark with the steps described in `here <https://xgboost.readthedocs.io/en/latest/jvm/index.html#installation-from-source>`_, but turning `USE_HDFS <https://github.com/dmlc/xgboost/blob/e939192978a0c152ad7b49b744630e99d54cffa8/jvm-packages/create_jni.py#L18>`_ (or USE_S3, etc. in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with a HDFS path. 1. Build XGBoost4J-Spark with the steps described in :ref:`here <install_jvm_packages>`, but turning `USE_HDFS <https://github.com/dmlc/xgboost/blob/e939192978a0c152ad7b49b744630e99d54cffa8/jvm-packages/create_jni.py#L18>`_ (or USE_S3, etc. in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with a HDFS path.
- However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, choose the other option. - However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, choose the other option.

View File

@@ -366,8 +366,8 @@ Specify the learning task and the corresponding learning objective. The objectiv
- ``rank:pairwise``: Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized - ``rank:pairwise``: Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized
- ``rank:ndcg``: Use LambdaMART to perform list-wise ranking where `Normalized Discounted Cumulative Gain (NDCG) <http://en.wikipedia.org/wiki/NDCG>`_ is maximized - ``rank:ndcg``: Use LambdaMART to perform list-wise ranking where `Normalized Discounted Cumulative Gain (NDCG) <http://en.wikipedia.org/wiki/NDCG>`_ is maximized
- ``rank:map``: Use LambdaMART to perform list-wise ranking where `Mean Average Precision (MAP) <http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision>`_ is maximized - ``rank:map``: Use LambdaMART to perform list-wise ranking where `Mean Average Precision (MAP) <http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision>`_ is maximized
- ``reg:gamma``: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be `gamma-distributed <https://en.wikipedia.org/wiki/Gamma_distribution#Applications>`_. - ``reg:gamma``: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be `gamma-distributed <https://en.wikipedia.org/wiki/Gamma_distribution#Occurrence_and_applications>`_.
- ``reg:tweedie``: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be `Tweedie-distributed <https://en.wikipedia.org/wiki/Tweedie_distribution#Applications>`_. - ``reg:tweedie``: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be `Tweedie-distributed <https://en.wikipedia.org/wiki/Tweedie_distribution#Occurrence_and_applications>`_.
* ``base_score`` [default=0.5] * ``base_score`` [default=0.5]
@@ -390,7 +390,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
- ``error@t``: a different than 0.5 binary classification threshold value could be specified by providing a numerical value through 't'. - ``error@t``: a different than 0.5 binary classification threshold value could be specified by providing a numerical value through 't'.
- ``merror``: Multiclass classification error rate. It is calculated as ``#(wrong cases)/#(all cases)``. - ``merror``: Multiclass classification error rate. It is calculated as ``#(wrong cases)/#(all cases)``.
- ``mlogloss``: `Multiclass logloss <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html>`_. - ``mlogloss``: `Multiclass logloss <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html>`_.
- ``auc``: `Receiver Operating Characteristic Area under the Curve <http://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve>`_. - ``auc``: `Receiver Operating Characteristic Area under the Curve <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>`_.
Available for classification and learning-to-rank tasks. Available for classification and learning-to-rank tasks.
- When used with binary classification, the objective should be ``binary:logistic`` or similar functions that work on probability. - When used with binary classification, the objective should be ``binary:logistic`` or similar functions that work on probability.

View File

@@ -32,8 +32,8 @@ After 1.4 release, we added a new parameter called ``strict_shape``, one can set
- When using ``output_margin`` to avoid transformation and ``strict_shape`` is set to ``True``: - When using ``output_margin`` to avoid transformation and ``strict_shape`` is set to ``True``:
Similar to the previous case, output is a 2-dim array, except for that ``multi:softmax`` Similar to the previous case, output is a 2-dim array, except for that ``multi:softmax``
has equivalent output of ``multi:softprob`` due to dropped transformation. If strict has equivalent output shape of ``multi:softprob`` due to dropped transformation. If
shape is set to False then output can have 1 or 2 dim depending on used model. strict shape is set to False then output can have 1 or 2 dim depending on used model.
- When using ``preds_contribs`` with ``strict_shape`` set to ``True``: - When using ``preds_contribs`` with ``strict_shape`` set to ``True``:

View File

@@ -11,7 +11,7 @@ In order to run a XGBoost job in a Kubernetes cluster, perform the following ste
1. Install XGBoost Operator on the Kubernetes cluster. 1. Install XGBoost Operator on the Kubernetes cluster.
a. XGBoost Operator is designed to manage the scheduling and monitoring of XGBoost jobs. Follow `this installation guide <https://github.com/kubeflow/xgboost-operator#installing-xgboost-operator>`_ to install XGBoost Operator. a. XGBoost Operator is designed to manage the scheduling and monitoring of XGBoost jobs. Follow `this installation guide <https://github.com/kubeflow/xgboost-operator#install-xgboost-operator>`_ to install XGBoost Operator.
2. Write application code that will be executed by the XGBoost Operator. 2. Write application code that will be executed by the XGBoost Operator.

View File

@@ -227,15 +227,15 @@ XGBoost has a function called ``dump_model`` in Booster object, which lets you t
the model in a readable format like ``text``, ``json`` or ``dot`` (graphviz). The primary the model in a readable format like ``text``, ``json`` or ``dot`` (graphviz). The primary
use case for it is for model interpretation or visualization, and is not supposed to be use case for it is for model interpretation or visualization, and is not supposed to be
loaded back to XGBoost. The JSON version has a `schema loaded back to XGBoost. The JSON version has a `schema
<https://github.com/dmlc/xgboost/blob/master/doc/dump.schema>`_. See next section for <https://github.com/dmlc/xgboost/blob/master/doc/dump.schema>`__. See next section for
more info. more info.
*********** ***********
JSON Schema JSON Schema
*********** ***********
Another important feature of JSON format is a documented `Schema Another important feature of JSON format is a documented `schema
<https://json-schema.org/>`_, based on which one can easily reuse the output model from <https://json-schema.org/>`__, based on which one can easily reuse the output model from
XGBoost. Here is the initial draft of JSON schema for the output model (not XGBoost. Here is the initial draft of JSON schema for the output model (not
serialization, which will not be stable as noted above). It's subject to change due to serialization, which will not be stable as noted above). It's subject to change due to
the beta status. For an example of parsing XGBoost tree model, see ``/demo/json-model``. the beta status. For an example of parsing XGBoost tree model, see ``/demo/json-model``.

View File

@@ -211,7 +211,7 @@ struct Entry {
*/ */
struct BatchParam { struct BatchParam {
/*! \brief The GPU device to use. */ /*! \brief The GPU device to use. */
int gpu_id; int gpu_id {-1};
/*! \brief Maximum number of bins per feature for histograms. */ /*! \brief Maximum number of bins per feature for histograms. */
int max_bin{0}; int max_bin{0};
/*! \brief Hessian, used for sketching with future approx implementation. */ /*! \brief Hessian, used for sketching with future approx implementation. */

View File

@@ -182,9 +182,10 @@ class GradientBooster : public Model, public Configurable {
bool with_stats, bool with_stats,
std::string format) const = 0; std::string format) const = 0;
virtual void FeatureScore(std::string const &importance_type, virtual void FeatureScore(std::string const& importance_type,
std::vector<bst_feature_t> *features, common::Span<int32_t const> trees,
std::vector<float> *scores) const = 0; std::vector<bst_feature_t>* features,
std::vector<float>* scores) const = 0;
/*! /*!
* \brief Whether the current booster uses GPU. * \brief Whether the current booster uses GPU.
*/ */

View File

@@ -155,9 +155,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
/*! /*!
* \brief Calculate feature score. See doc in C API for outputs. * \brief Calculate feature score. See doc in C API for outputs.
*/ */
virtual void CalcFeatureScore(std::string const &importance_type, virtual void CalcFeatureScore(std::string const& importance_type,
std::vector<bst_feature_t> *features, common::Span<int32_t const> trees,
std::vector<float> *scores) = 0; std::vector<bst_feature_t>* features,
std::vector<float>* scores) = 0;
/* /*
* \brief Get number of boosted rounds from gradient booster. * \brief Get number of boosted rounds from gradient booster.

View File

@@ -6,7 +6,7 @@
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
<packaging>pom</packaging> <packaging>pom</packaging>
<name>XGBoost JVM Package</name> <name>XGBoost JVM Package</name>
<description>JVM Package for XGBoost</description> <description>JVM Package for XGBoost</description>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
</parent> </parent>
<artifactId>xgboost4j-example_2.12</artifactId> <artifactId>xgboost4j-example_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<build> <build>
<plugins> <plugins>
@@ -26,7 +26,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId> <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId> <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
</parent> </parent>
<artifactId>xgboost4j-flink_2.12</artifactId> <artifactId>xgboost4j-flink_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
<build> <build>
<plugins> <plugins>
<plugin> <plugin>
@@ -26,7 +26,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId> <artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
</parent> </parent>
<artifactId>xgboost4j-gpu_2.12</artifactId> <artifactId>xgboost4j-gpu_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<properties> <properties>

View File

@@ -6,7 +6,7 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
</parent> </parent>
<artifactId>xgboost4j-spark-gpu_2.12</artifactId> <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
<build> <build>
@@ -24,7 +24,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId> <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>

View File

@@ -6,7 +6,7 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
</parent> </parent>
<artifactId>xgboost4j-spark_2.12</artifactId> <artifactId>xgboost4j-spark_2.12</artifactId>
<build> <build>
@@ -24,7 +24,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId> <artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>

View File

@@ -17,11 +17,13 @@
package ml.dmlc.xgboost4j.scala.spark.params package ml.dmlc.xgboost4j.scala.spark.params
import org.apache.hadoop.fs.Path import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext import org.apache.spark.SparkContext
import org.apache.spark.ml.param.{ParamPair, Params} import org.apache.spark.ml.param.{ParamPair, Params}
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._ import org.json4s.jackson.JsonMethods._
import org.json4s.{JObject, _} import org.json4s.{JArray, JBool, JDouble, JField, JInt, JNothing, JObject, JString, JValue}
import JsonDSLXGBoost._
// This originates from apache-spark DefaultPramsWriter copy paste // This originates from apache-spark DefaultPramsWriter copy paste
private[spark] object DefaultXGBoostParamsWriter { private[spark] object DefaultXGBoostParamsWriter {
@@ -87,3 +89,62 @@ private[spark] object DefaultXGBoostParamsWriter {
metadataJson metadataJson
} }
} }
// Fix json4s bin-incompatible issue.
// This originates from org.json4s.JsonDSL of 3.6.6
object JsonDSLXGBoost {
implicit def seq2jvalue[A](s: Iterable[A])(implicit ev: A => JValue): JArray =
JArray(s.toList.map(ev))
implicit def map2jvalue[A](m: Map[String, A])(implicit ev: A => JValue): JObject =
JObject(m.toList.map { case (k, v) => JField(k, ev(v)) })
implicit def option2jvalue[A](opt: Option[A])(implicit ev: A => JValue): JValue = opt match {
case Some(x) => ev(x)
case None => JNothing
}
implicit def short2jvalue(x: Short): JValue = JInt(x)
implicit def byte2jvalue(x: Byte): JValue = JInt(x)
implicit def char2jvalue(x: Char): JValue = JInt(x)
implicit def int2jvalue(x: Int): JValue = JInt(x)
implicit def long2jvalue(x: Long): JValue = JInt(x)
implicit def bigint2jvalue(x: BigInt): JValue = JInt(x)
implicit def double2jvalue(x: Double): JValue = JDouble(x)
implicit def float2jvalue(x: Float): JValue = JDouble(x.toDouble)
implicit def bigdecimal2jvalue(x: BigDecimal): JValue = JDouble(x.doubleValue)
implicit def boolean2jvalue(x: Boolean): JValue = JBool(x)
implicit def string2jvalue(x: String): JValue = JString(x)
implicit def symbol2jvalue(x: Symbol): JString = JString(x.name)
implicit def pair2jvalue[A](t: (String, A))(implicit ev: A => JValue): JObject =
JObject(List(JField(t._1, ev(t._2))))
implicit def list2jvalue(l: List[JField]): JObject = JObject(l)
implicit def jobject2assoc(o: JObject): JsonListAssoc = new JsonListAssoc(o.obj)
implicit def pair2Assoc[A](t: (String, A))(implicit ev: A => JValue): JsonAssoc[A] =
new JsonAssoc(t)
}
final class JsonAssoc[A](private val left: (String, A)) extends AnyVal {
def ~[B](right: (String, B))(implicit ev1: A => JValue, ev2: B => JValue): JObject = {
val l: JValue = ev1(left._2)
val r: JValue = ev2(right._2)
JObject(JField(left._1, l) :: JField(right._1, r) :: Nil)
}
def ~(right: JObject)(implicit ev: A => JValue): JObject = {
val l: JValue = ev(left._2)
JObject(JField(left._1, l) :: right.obj)
}
def ~~[B](right: (String, B))(implicit ev1: A => JValue, ev2: B => JValue): JObject =
this.~(right)
def ~~(right: JObject)(implicit ev: A => JValue): JObject = this.~(right)
}
final class JsonListAssoc(private val left: List[JField]) extends AnyVal {
def ~(right: (String, JValue)): JObject = JObject(left ::: List(JField(right._1, right._2)))
def ~(right: JObject): JObject = JObject(left ::: right.obj)
def ~~(right: (String, JValue)): JObject = this.~(right)
def ~~(right: JObject): JObject = this.~(right)
}

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
</parent> </parent>
<artifactId>xgboost4j_2.12</artifactId> <artifactId>xgboost4j_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.1</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<dependencies> <dependencies>

View File

@@ -1 +1 @@
1.5.0-dev 1.5.1

View File

@@ -386,7 +386,7 @@ class DataIter: # pylint: disable=too-many-instance-attributes
raise exc # pylint: disable=raising-bad-type raise exc # pylint: disable=raising-bad-type
def __del__(self) -> None: def __del__(self) -> None:
assert self._temporary_data is None, self._temporary_data assert self._temporary_data is None
assert self._exception is None assert self._exception is None
def _reset_wrapper(self, this: None) -> None: # pylint: disable=unused-argument def _reset_wrapper(self, this: None) -> None: # pylint: disable=unused-argument
@@ -410,19 +410,19 @@ class DataIter: # pylint: disable=too-many-instance-attributes
feature_names: Optional[List[str]] = None, feature_names: Optional[List[str]] = None,
feature_types: Optional[List[str]] = None, feature_types: Optional[List[str]] = None,
**kwargs: Any, **kwargs: Any,
): ) -> None:
from .data import dispatch_proxy_set_data from .data import dispatch_proxy_set_data
from .data import _proxy_transform from .data import _proxy_transform
transformed, feature_names, feature_types = _proxy_transform( new, cat_codes, feature_names, feature_types = _proxy_transform(
data, data,
feature_names, feature_names,
feature_types, feature_types,
self._enable_categorical, self._enable_categorical,
) )
# Stage the data, meta info are copied inside C++ MetaInfo. # Stage the data, meta info are copied inside C++ MetaInfo.
self._temporary_data = transformed self._temporary_data = (new, cat_codes)
dispatch_proxy_set_data(self.proxy, transformed, self._allow_host) dispatch_proxy_set_data(self.proxy, new, cat_codes, self._allow_host)
self.proxy.set_info( self.proxy.set_info(
feature_names=feature_names, feature_names=feature_names,
feature_types=feature_types, feature_types=feature_types,
@@ -1103,7 +1103,7 @@ class _ProxyDMatrix(DMatrix):
self.handle = ctypes.c_void_p() self.handle = ctypes.c_void_p()
_check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle))) _check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))
def _set_data_from_cuda_interface(self, data): def _set_data_from_cuda_interface(self, data) -> None:
"""Set data from CUDA array interface.""" """Set data from CUDA array interface."""
interface = data.__cuda_array_interface__ interface = data.__cuda_array_interface__
interface_str = bytes(json.dumps(interface, indent=2), "utf-8") interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
@@ -1111,11 +1111,11 @@ class _ProxyDMatrix(DMatrix):
_LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str) _LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str)
) )
def _set_data_from_cuda_columnar(self, data): def _set_data_from_cuda_columnar(self, data, cat_codes: list) -> None:
"""Set data from CUDA columnar format.""" """Set data from CUDA columnar format."""
from .data import _cudf_array_interfaces from .data import _cudf_array_interfaces
_, interfaces_str = _cudf_array_interfaces(data) interfaces_str = _cudf_array_interfaces(data, cat_codes)
_check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str)) _check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str))
def _set_data_from_array(self, data: np.ndarray): def _set_data_from_array(self, data: np.ndarray):
@@ -1805,7 +1805,7 @@ class Booster(object):
.. note:: .. note::
See `Prediction See `Prediction
<https://xgboost.readthedocs.io/en/latest/tutorials/prediction.html>`_ <https://xgboost.readthedocs.io/en/latest/prediction.html>`_
for issues like thread safety and a summary of outputs from this function. for issues like thread safety and a summary of outputs from this function.
Parameters Parameters
@@ -1986,13 +1986,6 @@ class Booster(object):
preds = ctypes.POINTER(ctypes.c_float)() preds = ctypes.POINTER(ctypes.c_float)()
# once caching is supported, we can pass id(data) as cache id. # once caching is supported, we can pass id(data) as cache id.
try:
import pandas as pd
if isinstance(data, pd.DataFrame):
data = data.values
except ImportError:
pass
args = { args = {
"type": 0, "type": 0,
"training": False, "training": False,
@@ -2027,7 +2020,20 @@ class Booster(object):
f"got {data.shape[1]}" f"got {data.shape[1]}"
) )
from .data import _is_pandas_df, _transform_pandas_df
from .data import _array_interface from .data import _array_interface
if (
_is_pandas_df(data)
or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
):
ft = self.feature_types
if ft is None:
enable_categorical = False
else:
enable_categorical = any(f == "c" for f in ft)
if _is_pandas_df(data):
data, _, _ = _transform_pandas_df(data, enable_categorical)
if isinstance(data, np.ndarray): if isinstance(data, np.ndarray):
from .data import _ensure_np_dtype from .data import _ensure_np_dtype
data, _ = _ensure_np_dtype(data, data.dtype) data, _ = _ensure_np_dtype(data, data.dtype)
@@ -2080,9 +2086,11 @@ class Booster(object):
) )
return _prediction_output(shape, dims, preds, True) return _prediction_output(shape, dims, preds, True)
if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"): if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
from .data import _cudf_array_interfaces from .data import _cudf_array_interfaces, _transform_cudf_df
data, cat_codes, _, _ = _transform_cudf_df(
_, interfaces_str = _cudf_array_interfaces(data) data, None, None, enable_categorical
)
interfaces_str = _cudf_array_interfaces(data, cat_codes)
_check_call( _check_call(
_LIB.XGBoosterPredictFromCudaColumnar( _LIB.XGBoosterPredictFromCudaColumnar(
self.handle, self.handle,

View File

@@ -1,4 +1,4 @@
# pylint: disable=too-many-arguments, too-many-branches # pylint: disable=too-many-arguments, too-many-branches, too-many-lines
# pylint: disable=too-many-return-statements, import-error # pylint: disable=too-many-return-statements, import-error
'''Data dispatching for DMatrix.''' '''Data dispatching for DMatrix.'''
import ctypes import ctypes
@@ -12,7 +12,7 @@ import numpy as np
from .core import c_array, _LIB, _check_call, c_str from .core import c_array, _LIB, _check_call, c_str
from .core import _cuda_array_interface from .core import _cuda_array_interface
from .core import DataIter, _ProxyDMatrix, DMatrix from .core import DataIter, _ProxyDMatrix, DMatrix
from .compat import lazy_isinstance from .compat import lazy_isinstance, DataFrame
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
@@ -217,36 +217,48 @@ _pandas_dtype_mapper = {
} }
def _invalid_dataframe_dtype(data) -> None:
# pandas series has `dtypes` but it's just a single object
# cudf series doesn't have `dtypes`.
if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
bad_fields = [
str(data.columns[i])
for i, dtype in enumerate(data.dtypes)
if dtype.name not in _pandas_dtype_mapper
]
err = " Invalid columns:" + ", ".join(bad_fields)
else:
err = ""
msg = """DataFrame.dtypes for data must be int, float, bool or category. When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`.""" + err
raise ValueError(msg)
def _transform_pandas_df( def _transform_pandas_df(
data, data: DataFrame,
enable_categorical: bool, enable_categorical: bool,
feature_names: Optional[List[str]] = None, feature_names: Optional[List[str]] = None,
feature_types: Optional[List[str]] = None, feature_types: Optional[List[str]] = None,
meta=None, meta: Optional[str] = None,
meta_type=None, meta_type: Optional[str] = None,
): ) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
import pandas as pd import pandas as pd
from pandas.api.types import is_sparse, is_categorical_dtype from pandas.api.types import is_sparse, is_categorical_dtype
if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or if not all(
(is_categorical_dtype(dtype) and enable_categorical) dtype.name in _pandas_dtype_mapper
for dtype in data.dtypes): or is_sparse(dtype)
bad_fields = [ or (is_categorical_dtype(dtype) and enable_categorical)
str(data.columns[i]) for i, dtype in enumerate(data.dtypes) for dtype in data.dtypes
if dtype.name not in _pandas_dtype_mapper ):
] _invalid_dataframe_dtype(data)
msg = """DataFrame.dtypes for data must be int, float, bool or category. When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`."""
raise ValueError(msg + ', '.join(bad_fields))
# handle feature names # handle feature names
if feature_names is None and meta is None: if feature_names is None and meta is None:
if isinstance(data.columns, pd.MultiIndex): if isinstance(data.columns, pd.MultiIndex):
feature_names = [ feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
' '.join([str(x) for x in i]) for i in data.columns
]
elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)): elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
feature_names = list(map(str, data.columns)) feature_names = list(map(str, data.columns))
else: else:
@@ -263,21 +275,24 @@ def _transform_pandas_df(
else: else:
feature_types.append(_pandas_dtype_mapper[dtype.name]) feature_types.append(_pandas_dtype_mapper[dtype.name])
# handle categorical codes. # handle category codes.
transformed = pd.DataFrame() transformed = pd.DataFrame()
if enable_categorical: if enable_categorical:
for i, dtype in enumerate(data.dtypes): for i, dtype in enumerate(data.dtypes):
if is_categorical_dtype(dtype): if is_categorical_dtype(dtype):
transformed[data.columns[i]] = data[data.columns[i]].cat.codes # pandas uses -1 as default missing value for categorical data
transformed[data.columns[i]] = (
data[data.columns[i]]
.cat.codes.astype(np.float32)
.replace(-1.0, np.NaN)
)
else: else:
transformed[data.columns[i]] = data[data.columns[i]] transformed[data.columns[i]] = data[data.columns[i]]
else: else:
transformed = data transformed = data
if meta and len(data.columns) > 1: if meta and len(data.columns) > 1:
raise ValueError( raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
f"DataFrame for {meta} cannot have multiple columns"
)
dtype = meta_type if meta_type else np.float32 dtype = meta_type if meta_type else np.float32
arr = transformed.values arr = transformed.values
@@ -287,7 +302,7 @@ def _transform_pandas_df(
def _from_pandas_df( def _from_pandas_df(
data, data: DataFrame,
enable_categorical: bool, enable_categorical: bool,
missing, missing,
nthread, nthread,
@@ -300,6 +315,7 @@ def _from_pandas_df(
feature_types) feature_types)
def _is_pandas_series(data): def _is_pandas_series(data):
try: try:
import pandas as pd import pandas as pd
@@ -318,13 +334,26 @@ def _is_modin_series(data):
def _from_pandas_series( def _from_pandas_series(
data, data,
missing, missing: float,
nthread, nthread: int,
enable_categorical: bool,
feature_names: Optional[List[str]], feature_names: Optional[List[str]],
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
): ):
from pandas.api.types import is_categorical_dtype
if (data.dtype.name not in _pandas_dtype_mapper) and not (
is_categorical_dtype(data.dtype) and enable_categorical
):
_invalid_dataframe_dtype(data)
if enable_categorical and is_categorical_dtype(data.dtype):
data = data.cat.codes
return _from_numpy_array( return _from_numpy_array(
data.values.astype("float"), missing, nthread, feature_names, feature_types data.values.reshape(data.shape[0], 1).astype("float"),
missing,
nthread,
feature_names,
feature_types,
) )
@@ -428,7 +457,7 @@ def _is_cudf_df(data):
return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame) return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
def _cudf_array_interfaces(data) -> Tuple[list, bytes]: def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list of """Extract CuDF __cuda_array_interface__. This is special as it returns a new list of
data and a list of array interfaces. The data is list of categorical codes that data and a list of array interfaces. The data is list of categorical codes that
caller can safely ignore, but have to keep their reference alive until usage of array caller can safely ignore, but have to keep their reference alive until usage of array
@@ -440,23 +469,27 @@ def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
except ImportError: except ImportError:
from cudf.utils.dtypes import is_categorical_dtype from cudf.utils.dtypes import is_categorical_dtype
cat_codes = []
interfaces = [] interfaces = []
if _is_cudf_ser(data): if _is_cudf_ser(data):
interfaces.append(data.__cuda_array_interface__) if is_categorical_dtype(data.dtype):
interface = cat_codes[0].__cuda_array_interface__
else:
interface = data.__cuda_array_interface__
if "mask" in interface:
interface["mask"] = interface["mask"].__cuda_array_interface__
interfaces.append(interface)
else: else:
for col in data: for i, col in enumerate(data):
if is_categorical_dtype(data[col].dtype): if is_categorical_dtype(data[col].dtype):
codes = data[col].cat.codes codes = cat_codes[i]
interface = codes.__cuda_array_interface__ interface = codes.__cuda_array_interface__
cat_codes.append(codes)
else: else:
interface = data[col].__cuda_array_interface__ interface = data[col].__cuda_array_interface__
if "mask" in interface: if "mask" in interface:
interface["mask"] = interface["mask"].__cuda_array_interface__ interface["mask"] = interface["mask"].__cuda_array_interface__
interfaces.append(interface) interfaces.append(interface)
interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8") interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8")
return cat_codes, interfaces_str return interfaces_str
def _transform_cudf_df( def _transform_cudf_df(
@@ -470,25 +503,57 @@ def _transform_cudf_df(
except ImportError: except ImportError:
from cudf.utils.dtypes import is_categorical_dtype from cudf.utils.dtypes import is_categorical_dtype
if _is_cudf_ser(data):
dtypes = [data.dtype]
else:
dtypes = data.dtypes
if not all(
dtype.name in _pandas_dtype_mapper
or (is_categorical_dtype(dtype) and enable_categorical)
for dtype in dtypes
):
_invalid_dataframe_dtype(data)
# handle feature names
if feature_names is None: if feature_names is None:
if _is_cudf_ser(data): if _is_cudf_ser(data):
feature_names = [data.name] feature_names = [data.name]
elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"): elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
feature_names = [" ".join([str(x) for x in i]) for i in data.columns] feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
elif (
lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
# Unique to cuDF, no equivalence in pandas 1.3.3
or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")
):
feature_names = list(map(str, data.columns))
else: else:
feature_names = data.columns.format() feature_names = data.columns.format()
# handle feature types
if feature_types is None: if feature_types is None:
feature_types = [] feature_types = []
if _is_cudf_ser(data):
dtypes = [data.dtype]
else:
dtypes = data.dtypes
for dtype in dtypes: for dtype in dtypes:
if is_categorical_dtype(dtype) and enable_categorical: if is_categorical_dtype(dtype) and enable_categorical:
feature_types.append(CAT_T) feature_types.append(CAT_T)
else: else:
feature_types.append(_pandas_dtype_mapper[dtype.name]) feature_types.append(_pandas_dtype_mapper[dtype.name])
return data, feature_names, feature_types
# handle categorical data
cat_codes = []
if _is_cudf_ser(data):
# unlike pandas, cuDF uses NA for missing data.
if is_categorical_dtype(data.dtype) and enable_categorical:
codes = data.cat.codes
cat_codes.append(codes)
else:
for col in data:
if is_categorical_dtype(data[col].dtype) and enable_categorical:
codes = data[col].cat.codes
cat_codes.append(codes)
return data, cat_codes, feature_names, feature_types
def _from_cudf_df( def _from_cudf_df(
@@ -499,10 +564,10 @@ def _from_cudf_df(
feature_types: Optional[List[str]], feature_types: Optional[List[str]],
enable_categorical: bool, enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, Any, Any]: ) -> Tuple[ctypes.c_void_p, Any, Any]:
data, feature_names, feature_types = _transform_cudf_df( data, cat_codes, feature_names, feature_types = _transform_cudf_df(
data, feature_names, feature_types, enable_categorical data, feature_names, feature_types, enable_categorical
) )
_, interfaces_str = _cudf_array_interfaces(data) interfaces_str = _cudf_array_interfaces(data, cat_codes)
handle = ctypes.c_void_p() handle = ctypes.c_void_p()
config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8") config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
_check_call( _check_call(
@@ -707,8 +772,9 @@ def dispatch_data_backend(
return _from_pandas_df(data, enable_categorical, missing, threads, return _from_pandas_df(data, enable_categorical, missing, threads,
feature_names, feature_types) feature_names, feature_types)
if _is_pandas_series(data): if _is_pandas_series(data):
return _from_pandas_series(data, missing, threads, feature_names, return _from_pandas_series(
feature_types) data, missing, threads, enable_categorical, feature_names, feature_types
)
if _is_cudf_df(data) or _is_cudf_ser(data): if _is_cudf_df(data) or _is_cudf_ser(data):
return _from_cudf_df( return _from_cudf_df(
data, missing, threads, feature_names, feature_types, enable_categorical data, missing, threads, feature_names, feature_types, enable_categorical
@@ -732,8 +798,9 @@ def dispatch_data_backend(
return _from_pandas_df(data, enable_categorical, missing, threads, return _from_pandas_df(data, enable_categorical, missing, threads,
feature_names, feature_types) feature_names, feature_types)
if _is_modin_series(data): if _is_modin_series(data):
return _from_pandas_series(data, missing, threads, feature_names, return _from_pandas_series(
feature_types) data, missing, threads, enable_categorical, feature_names, feature_types
)
if _has_array_protocol(data): if _has_array_protocol(data):
array = np.asarray(data) array = np.asarray(data)
return _from_numpy_array(array, missing, threads, feature_names, feature_types) return _from_numpy_array(array, missing, threads, feature_names, feature_types)
@@ -866,8 +933,7 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
_meta_from_dt(data, name, dtype, handle) _meta_from_dt(data, name, dtype, handle)
return return
if _is_modin_df(data): if _is_modin_df(data):
data, _, _ = _transform_pandas_df( data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
data, False, meta=name, meta_type=dtype)
_meta_from_numpy(data, name, dtype, handle) _meta_from_numpy(data, name, dtype, handle)
return return
if _is_modin_series(data): if _is_modin_series(data):
@@ -917,30 +983,38 @@ def _proxy_transform(
) )
if _is_cupy_array(data): if _is_cupy_array(data):
data = _transform_cupy_array(data) data = _transform_cupy_array(data)
return data, feature_names, feature_types return data, None, feature_names, feature_types
if _is_dlpack(data): if _is_dlpack(data):
return _transform_dlpack(data), feature_names, feature_types return _transform_dlpack(data), None, feature_names, feature_types
if _is_numpy_array(data): if _is_numpy_array(data):
return data, feature_names, feature_types return data, None, feature_names, feature_types
if _is_scipy_csr(data): if _is_scipy_csr(data):
return data, feature_names, feature_types return data, None, feature_names, feature_types
if _is_pandas_df(data): if _is_pandas_df(data):
arr, feature_names, feature_types = _transform_pandas_df( arr, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types data, enable_categorical, feature_names, feature_types
) )
return arr, feature_names, feature_types return arr, None, feature_names, feature_types
raise TypeError("Value type is not supported for data iterator:" + str(type(data))) raise TypeError("Value type is not supported for data iterator:" + str(type(data)))
def dispatch_proxy_set_data(proxy: _ProxyDMatrix, data: Any, allow_host: bool) -> None: def dispatch_proxy_set_data(
proxy: _ProxyDMatrix,
data: Any,
cat_codes: Optional[list],
allow_host: bool,
) -> None:
"""Dispatch for DeviceQuantileDMatrix.""" """Dispatch for DeviceQuantileDMatrix."""
if not _is_cudf_ser(data) and not _is_pandas_series(data): if not _is_cudf_ser(data) and not _is_pandas_series(data):
_check_data_shape(data) _check_data_shape(data)
if _is_cudf_df(data): if _is_cudf_df(data):
proxy._set_data_from_cuda_columnar(data) # pylint: disable=W0212 # pylint: disable=W0212
proxy._set_data_from_cuda_columnar(data, cat_codes)
return return
if _is_cudf_ser(data): if _is_cudf_ser(data):
proxy._set_data_from_cuda_columnar(data) # pylint: disable=W0212 # pylint: disable=W0212
proxy._set_data_from_cuda_columnar(data, cat_codes)
return return
if _is_cupy_array(data): if _is_cupy_array(data):
proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212 proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212

View File

@@ -174,7 +174,9 @@ __model_doc = f'''
Device ordinal. Device ordinal.
validate_parameters : Optional[bool] validate_parameters : Optional[bool]
Give warnings for unknown parameter. Give warnings for unknown parameter.
predictor : Optional[str]
Force XGBoost to use specific predictor, available choices are [cpu_predictor,
gpu_predictor].
enable_categorical : bool enable_categorical : bool
.. versionadded:: 1.5.0 .. versionadded:: 1.5.0
@@ -807,7 +809,11 @@ class XGBModel(XGBModelBase):
# Inplace predict doesn't handle as many data types as DMatrix, but it's # Inplace predict doesn't handle as many data types as DMatrix, but it's
# sufficient for dask interface where input is simpiler. # sufficient for dask interface where input is simpiler.
predictor = self.get_params().get("predictor", None) predictor = self.get_params().get("predictor", None)
if predictor in ("auto", None) and self.booster != "gblinear": if (
not self.enable_categorical
and predictor in ("auto", None)
and self.booster != "gblinear"
):
return True return True
return False return False
@@ -834,7 +840,9 @@ class XGBModel(XGBModelBase):
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray: ) -> np.ndarray:
"""Predict with `X`. If the model is trained with early stopping, then `best_iteration` """Predict with `X`. If the model is trained with early stopping, then `best_iteration`
is used automatically. is used automatically. For tree models, when data is on GPU, like cupy array or
cuDF dataframe and `predictor` is not specified, the prediction is run on GPU
automatically, otherwise it will run on CPU.
.. note:: This function is only thread safe for `gbtree` and `dart`. .. note:: This function is only thread safe for `gbtree` and `dart`.
@@ -862,6 +870,7 @@ class XGBModel(XGBModelBase):
Returns Returns
------- -------
prediction prediction
""" """
iteration_range = _convert_ntree_limit( iteration_range = _convert_ntree_limit(
self.get_booster(), ntree_limit, iteration_range self.get_booster(), ntree_limit, iteration_range
@@ -886,7 +895,10 @@ class XGBModel(XGBModelBase):
pass pass
test = DMatrix( test = DMatrix(
X, base_margin=base_margin, missing=self.missing, nthread=self.n_jobs X, base_margin=base_margin,
missing=self.missing,
nthread=self.n_jobs,
enable_categorical=self.enable_categorical
) )
return self.get_booster().predict( return self.get_booster().predict(
data=test, data=test,

View File

@@ -144,7 +144,7 @@ class RabitTracker(object):
""" """
def __init__( def __init__(
self, hostIP, nslave, port=9091, port_end=9999, use_logger: bool = True self, hostIP, nslave, port=9091, port_end=9999, use_logger: bool = False
) -> None: ) -> None:
"""A Python implementation of RABIT tracker. """A Python implementation of RABIT tracker.
@@ -384,16 +384,17 @@ def start_rabit_tracker(args):
---------- ----------
args: arguments to start the rabit tracker. args: arguments to start the rabit tracker.
""" """
envs = {'DMLC_NUM_WORKER': args.num_workers, envs = {"DMLC_NUM_WORKER": args.num_workers, "DMLC_NUM_SERVER": args.num_servers}
'DMLC_NUM_SERVER': args.num_servers} rabit = RabitTracker(
rabit = RabitTracker(hostIP=get_host_ip(args.host_ip), nslave=args.num_workers) hostIP=get_host_ip(args.host_ip), nslave=args.num_workers, use_logger=True
)
envs.update(rabit.slave_envs()) envs.update(rabit.slave_envs())
rabit.start(args.num_workers) rabit.start(args.num_workers)
sys.stdout.write('DMLC_TRACKER_ENV_START\n') sys.stdout.write("DMLC_TRACKER_ENV_START\n")
# simply write configuration to stdout # simply write configuration to stdout
for k, v in envs.items(): for k, v in envs.items():
sys.stdout.write(f"{k}={v}\n") sys.stdout.write(f"{k}={v}\n")
sys.stdout.write('DMLC_TRACKER_ENV_END\n') sys.stdout.write("DMLC_TRACKER_ENV_END\n")
sys.stdout.flush() sys.stdout.flush()
rabit.join() rabit.join()

View File

@@ -472,13 +472,15 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
if is_new_callback: if is_new_callback:
assert all(isinstance(c, callback.TrainingCallback) assert all(isinstance(c, callback.TrainingCallback)
for c in callbacks), "You can't mix new and old callback styles." for c in callbacks), "You can't mix new and old callback styles."
if isinstance(verbose_eval, bool) and verbose_eval: if verbose_eval:
verbose_eval = 1 if verbose_eval is True else verbose_eval verbose_eval = 1 if verbose_eval is True else verbose_eval
callbacks.append(callback.EvaluationMonitor(period=verbose_eval, callbacks.append(
show_stdv=show_stdv)) callback.EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv)
)
if early_stopping_rounds: if early_stopping_rounds:
callbacks.append(callback.EarlyStopping( callbacks.append(
rounds=early_stopping_rounds, maximize=maximize)) callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
)
callbacks = callback.CallbackContainer(callbacks, metric=feval, is_cv=True) callbacks = callback.CallbackContainer(callbacks, metric=feval, is_cv=True)
else: else:
callbacks = _configure_deprecated_callbacks( callbacks = _configure_deprecated_callbacks(

View File

@@ -1159,9 +1159,17 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *json_config,
custom_feature_names = get<Array const>(config["feature_names"]); custom_feature_names = get<Array const>(config["feature_names"]);
} }
auto& scores = learner->GetThreadLocal().ret_vec_float; std::vector<int32_t> tree_idx;
if (!IsA<Null>(config["tree_idx"])) {
auto j_tree_idx = get<Array const>(config["tree_idx"]);
for (auto const &idx : j_tree_idx) {
tree_idx.push_back(get<Integer const>(idx));
}
}
auto &scores = learner->GetThreadLocal().ret_vec_float;
std::vector<bst_feature_t> features; std::vector<bst_feature_t> features;
learner->CalcFeatureScore(importance, &features, &scores); learner->CalcFeatureScore(importance, common::Span<int32_t const>(tree_idx), &features, &scores);
auto n_features = learner->GetNumFeature(); auto n_features = learner->GetNumFeature();
GenerateFeatureMap(learner, custom_feature_names, n_features, &feature_map); GenerateFeatureMap(learner, custom_feature_names, n_features, &feature_map);

View File

@@ -1,5 +1,5 @@
/*! /*!
* Copyright 2020 by XGBoost Contributors * Copyright 2020-2021 by XGBoost Contributors
* \file categorical.h * \file categorical.h
*/ */
#ifndef XGBOOST_COMMON_CATEGORICAL_H_ #ifndef XGBOOST_COMMON_CATEGORICAL_H_
@@ -42,6 +42,11 @@ inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, bst_cat_t
return !s_cats.Check(cat); return !s_cats.Check(cat);
} }
// Validate a single categorical value: asserts via CHECK_GE that `cat` is
// greater than or equal to 0.  A negative value fails the check and the
// message streamed below is attached to the failure.
inline void CheckCat(bst_cat_t cat) {
CHECK_GE(cat, 0) << "Invalid categorical value detected. Categorical value "
"should be non-negative.";
}
struct IsCatOp { struct IsCatOp {
XGBOOST_DEVICE bool operator()(FeatureType ft) { XGBOOST_DEVICE bool operator()(FeatureType ft) {
return ft == FeatureType::kCategorical; return ft == FeatureType::kCategorical;

View File

@@ -711,6 +711,12 @@ constexpr std::pair<int, int> CUDAVersion() {
constexpr std::pair<int32_t, int32_t> ThrustVersion() { constexpr std::pair<int32_t, int32_t> ThrustVersion() {
return std::make_pair(THRUST_MAJOR_VERSION, THRUST_MINOR_VERSION); return std::make_pair(THRUST_MAJOR_VERSION, THRUST_MINOR_VERSION);
} }
// Compile-time check on the Thrust version reported by ThrustVersion():
// true for Thrust 1.x when x >= `minor`, and for any major version above 1.
template <int32_t minor>
constexpr bool HasThrustMinorVer() {
  // Within the 1.x series the minor version decides; for every other major
  // version the answer only depends on whether the major is newer than 1.
  return ThrustVersion().first == 1 ? ThrustVersion().second >= minor
                                    : ThrustVersion().first > 1;
}
namespace detail { namespace detail {
template <typename T> template <typename T>
@@ -725,10 +731,8 @@ class TypedDiscard : public thrust::discard_iterator<T> {
template <typename T> template <typename T>
using TypedDiscard = using TypedDiscard =
std::conditional_t<((ThrustVersion().first == 1 && std::conditional_t<HasThrustMinorVer<12>(), detail::TypedDiscardCTK114<T>,
ThrustVersion().second >= 12) || detail::TypedDiscard<T>>;
ThrustVersion().first > 1),
detail::TypedDiscardCTK114<T>, detail::TypedDiscard<T>>;
/** /**
* \class AllReducer * \class AllReducer
@@ -1442,24 +1446,39 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_i
namespace detail { namespace detail {
// Wrapper around cub sort for easier `descending` sort. // Wrapper around cub sort for easier `descending` sort.
template <bool descending, typename KeyT, typename ValueT, template <bool descending, typename KeyT, typename ValueT,
typename OffsetIteratorT> typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
void DeviceSegmentedRadixSortPair( void DeviceSegmentedRadixSortPair(
void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT
KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out,
size_t num_items, size_t num_segments, OffsetIteratorT d_begin_offsets, size_t num_items, size_t num_segments, BeginOffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets, int begin_bit = 0, EndOffsetIteratorT d_end_offsets, int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8) { int end_bit = sizeof(KeyT) * 8) {
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out); cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
d_values_out); d_values_out);
using OffsetT = int32_t; // num items in dispatch is also int32_t, no way to change. // In old version of cub, num_items in dispatch is also int32_t, no way to change.
CHECK_LE(num_items, std::numeric_limits<int32_t>::max()); using OffsetT =
std::conditional_t<BuildWithCUDACub() && HasThrustMinorVer<13>(), size_t,
int32_t>;
CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
// For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation
#if (THRUST_MAJOR_VERSION == 1 && THRUST_MINOR_VERSION >= 13) || THRUST_MAJOR_VERSION > 1
safe_cuda((cub::DispatchSegmentedRadixSort< safe_cuda((cub::DispatchSegmentedRadixSort<
descending, KeyT, ValueT, OffsetIteratorT, descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
d_values, num_items, num_segments, d_values, num_items, num_segments,
d_begin_offsets, d_end_offsets, begin_bit, d_begin_offsets, d_end_offsets, begin_bit,
end_bit, false, nullptr, false))); end_bit, false, nullptr, false)));
#else
safe_cuda((cub::DispatchSegmentedRadixSort<
descending, KeyT, ValueT, BeginOffsetIteratorT,
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
d_values, num_items, num_segments,
d_begin_offsets, d_end_offsets, begin_bit,
end_bit, false, nullptr, false)));
#endif
} }
} // namespace detail } // namespace detail

View File

@@ -133,6 +133,7 @@ void RemoveDuplicatedCategories(
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr, int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry> *p_sorted_entries, dh::device_vector<Entry> *p_sorted_entries,
dh::caching_device_vector<size_t> *p_column_sizes_scan) { dh::caching_device_vector<size_t> *p_column_sizes_scan) {
info.feature_types.SetDevice(device);
auto d_feature_types = info.feature_types.ConstDeviceSpan(); auto d_feature_types = info.feature_types.ConstDeviceSpan();
CHECK(!d_feature_types.empty()); CHECK(!d_feature_types.empty());
auto &column_sizes_scan = *p_column_sizes_scan; auto &column_sizes_scan = *p_column_sizes_scan;

View File

@@ -124,6 +124,11 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter,
void SortByWeight(dh::device_vector<float>* weights, void SortByWeight(dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries); dh::device_vector<Entry>* sorted_entries);
// Forward declaration only; the body is not part of this header.
// The sliding-window helpers in this file call it on the already sorted
// entries, and only when the sketch container reports categorical features.
// NOTE(review): judging by the name, it drops repeated category values from
// `p_sorted_entries` and updates `p_column_sizes_scan` accordingly -- confirm
// against the definition.
void RemoveDuplicatedCategories(
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry> *p_sorted_entries,
dh::caching_device_vector<size_t> *p_column_sizes_scan);
} // namespace detail } // namespace detail
// Compute sketch on DMatrix. // Compute sketch on DMatrix.
@@ -132,9 +137,10 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
size_t sketch_batch_num_elements = 0); size_t sketch_batch_num_elements = 0);
template <typename AdapterBatch> template <typename AdapterBatch>
void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns, void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
size_t begin, size_t end, float missing, int device, size_t columns, size_t begin, size_t end,
SketchContainer* sketch_container, int num_cuts) { float missing, SketchContainer *sketch_container,
int num_cuts) {
// Copy current subset of valid elements into temporary storage and sort // Copy current subset of valid elements into temporary storage and sort
dh::device_vector<Entry> sorted_entries; dh::device_vector<Entry> sorted_entries;
dh::caching_device_vector<size_t> column_sizes_scan; dh::caching_device_vector<size_t> column_sizes_scan;
@@ -142,6 +148,7 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
thrust::make_counting_iterator(0llu), thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) { return batch.GetElement(idx); }); [=] __device__(size_t idx) { return batch.GetElement(idx); });
HostDeviceVector<SketchContainer::OffsetT> cuts_ptr; HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
cuts_ptr.SetDevice(device);
detail::MakeEntriesFromAdapter(batch, batch_iter, {begin, end}, missing, detail::MakeEntriesFromAdapter(batch, batch_iter, {begin, end}, missing,
columns, num_cuts, device, columns, num_cuts, device,
&cuts_ptr, &cuts_ptr,
@@ -151,8 +158,14 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(), thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
sorted_entries.end(), detail::EntryCompareOp()); sorted_entries.end(), detail::EntryCompareOp());
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector(); if (sketch_container->HasCategorical()) {
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
&sorted_entries, &column_sizes_scan);
}
auto d_cuts_ptr = cuts_ptr.DeviceSpan(); auto d_cuts_ptr = cuts_ptr.DeviceSpan();
auto const &h_cuts_ptr = cuts_ptr.HostVector();
// Extract the cuts from all columns concurrently // Extract the cuts from all columns concurrently
sketch_container->Push(dh::ToSpan(sorted_entries), sketch_container->Push(dh::ToSpan(sorted_entries),
dh::ToSpan(column_sizes_scan), d_cuts_ptr, dh::ToSpan(column_sizes_scan), d_cuts_ptr,
@@ -222,6 +235,12 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
detail::SortByWeight(&temp_weights, &sorted_entries); detail::SortByWeight(&temp_weights, &sorted_entries);
if (sketch_container->HasCategorical()) {
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
&sorted_entries, &column_sizes_scan);
}
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector(); auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
auto d_cuts_ptr = cuts_ptr.DeviceSpan(); auto d_cuts_ptr = cuts_ptr.DeviceSpan();
@@ -274,8 +293,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
device, num_cuts_per_feature, false); device, num_cuts_per_feature, false);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements)); size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
ProcessSlidingWindow(batch, device, num_cols, ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
begin, end, missing, sketch_container, num_cuts_per_feature); sketch_container, num_cuts_per_feature);
} }
} }
} }

View File

@@ -7,11 +7,28 @@
#define XGBOOST_COMMON_THREADING_UTILS_H_ #define XGBOOST_COMMON_THREADING_UTILS_H_
#include <dmlc/common.h> #include <dmlc/common.h>
#include <vector> #include <dmlc/omp.h>
#include <algorithm> #include <algorithm>
#include <limits>
#include <type_traits> // std::is_signed #include <type_traits> // std::is_signed
#include <vector>
#include "xgboost/logging.h" #include "xgboost/logging.h"
// Fallback for builds without OpenMP: provide omp_get_thread_limit() with C
// linkage and report a limit of exactly one thread.
#if !defined(_OPENMP)
extern "C" {
inline int32_t omp_get_thread_limit() __GOMP_NOTHROW { return 1; } // NOLINT
}
#endif // !defined(_OPENMP)
// MSVC doesn't implement the thread limit.  Supply a replacement that returns
// the largest int32_t value, so taking the minimum of a requested thread count
// and this limit leaves the requested count unchanged.
#if defined(_OPENMP) && defined(_MSC_VER)
extern "C" {
inline int32_t omp_get_thread_limit() { return std::numeric_limits<int32_t>::max(); } // NOLINT
}
#endif // defined(_OPENMP) && defined(_MSC_VER)
namespace xgboost { namespace xgboost {
namespace common { namespace common {
@@ -153,7 +170,7 @@ struct Sched {
}; };
template <typename Index, typename Func> template <typename Index, typename Func>
void ParallelFor(Index size, size_t n_threads, Sched sched, Func fn) { void ParallelFor(Index size, int32_t n_threads, Sched sched, Func fn) {
#if defined(_MSC_VER) #if defined(_MSC_VER)
// msvc doesn't support unsigned integer as openmp index. // msvc doesn't support unsigned integer as openmp index.
using OmpInd = std::conditional_t<std::is_signed<Index>::value, Index, omp_ulong>; using OmpInd = std::conditional_t<std::is_signed<Index>::value, Index, omp_ulong>;
@@ -220,6 +237,13 @@ void ParallelFor(Index size, size_t n_threads, Func fn) {
template <typename Index, typename Func> template <typename Index, typename Func>
void ParallelFor(Index size, Func fn) { void ParallelFor(Index size, Func fn) {
ParallelFor(size, omp_get_max_threads(), Sched::Static(), fn); ParallelFor(size, omp_get_max_threads(), Sched::Static(), fn);
} // !defined(_OPENMP)
inline int32_t OmpGetThreadLimit() {
int32_t limit = omp_get_thread_limit();
CHECK_GE(limit, 1) << "Invalid thread limit for OpenMP.";
return limit;
} }
/* \brief Configure parallel threads. /* \brief Configure parallel threads.
@@ -235,15 +259,18 @@ inline int32_t OmpSetNumThreads(int32_t* p_threads) {
if (threads <= 0) { if (threads <= 0) {
threads = omp_get_num_procs(); threads = omp_get_num_procs();
} }
threads = std::min(threads, OmpGetThreadLimit());
omp_set_num_threads(threads); omp_set_num_threads(threads);
return nthread_original; return nthread_original;
} }
inline int32_t OmpSetNumThreadsWithoutHT(int32_t* p_threads) { inline int32_t OmpSetNumThreadsWithoutHT(int32_t* p_threads) {
auto& threads = *p_threads; auto& threads = *p_threads;
int32_t nthread_original = omp_get_max_threads(); int32_t nthread_original = omp_get_max_threads();
if (threads <= 0) { if (threads <= 0) {
threads = nthread_original; threads = nthread_original;
} }
threads = std::min(threads, OmpGetThreadLimit());
omp_set_num_threads(threads); omp_set_num_threads(threads);
return nthread_original; return nthread_original;
} }
@@ -252,6 +279,7 @@ inline int32_t OmpGetNumThreads(int32_t n_threads) {
if (n_threads <= 0) { if (n_threads <= 0) {
n_threads = omp_get_num_procs(); n_threads = omp_get_num_procs();
} }
n_threads = std::min(n_threads, OmpGetThreadLimit());
return n_threads; return n_threads;
} }
} // namespace common } // namespace common

View File

@@ -21,6 +21,7 @@
#include "array_interface.h" #include "array_interface.h"
#include "../c_api/c_api_error.h" #include "../c_api/c_api_error.h"
#include "../common/math.h"
namespace xgboost { namespace xgboost {
namespace data { namespace data {
@@ -80,6 +81,24 @@ struct COOTuple {
float value{0}; float value{0};
}; };
struct IsValidFunctor {
float missing;
XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
XGBOOST_DEVICE bool operator()(float value) const {
return !(common::CheckNAN(value) || value == missing);
}
XGBOOST_DEVICE bool operator()(const data::COOTuple& e) const {
return !(common::CheckNAN(e.value) || e.value == missing);
}
XGBOOST_DEVICE bool operator()(const Entry& e) const {
return !(common::CheckNAN(e.fvalue) || e.fvalue == missing);
}
};
namespace detail { namespace detail {
/** /**

View File

@@ -987,18 +987,19 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
// Second pass over batch, placing elements in correct position // Second pass over batch, placing elements in correct position
auto is_valid = data::IsValidFunctor{missing};
#pragma omp parallel num_threads(nthread) #pragma omp parallel num_threads(nthread)
{ {
exec.Run([&]() { exec.Run([&]() {
int tid = omp_get_thread_num(); int tid = omp_get_thread_num();
size_t begin = tid*thread_size; size_t begin = tid * thread_size;
size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size; size_t end = tid != (nthread - 1) ? (tid + 1) * thread_size : batch_size;
for (size_t i = begin; i < end; ++i) { for (size_t i = begin; i < end; ++i) {
auto line = batch.GetLine(i); auto line = batch.GetLine(i);
for (auto j = 0ull; j < line.Size(); j++) { for (auto j = 0ull; j < line.Size(); j++) {
auto element = line.GetElement(j); auto element = line.GetElement(j);
const size_t key = (element.row_idx - base_rowid); const size_t key = (element.row_idx - base_rowid);
if (!common::CheckNAN(element.value) && element.value != missing) { if (is_valid(element)) {
builder.Push(key, Entry(element.column_idx, element.value), tid); builder.Push(key, Entry(element.column_idx, element.value), tid);
} }
} }

View File

@@ -15,29 +15,6 @@
namespace xgboost { namespace xgboost {
namespace data { namespace data {
struct IsValidFunctor : public thrust::unary_function<Entry, bool> {
float missing;
XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
__device__ bool operator()(float value) const {
return !(common::CheckNAN(value) || value == missing);
}
__device__ bool operator()(const data::COOTuple& e) const {
if (common::CheckNAN(e.value) || e.value == missing) {
return false;
}
return true;
}
__device__ bool operator()(const Entry& e) const {
if (common::CheckNAN(e.fvalue) || e.fvalue == missing) {
return false;
}
return true;
}
};
class CudfAdapterBatch : public detail::NoMetaInfo { class CudfAdapterBatch : public detail::NoMetaInfo {
friend class CudfAdapter; friend class CudfAdapter;

View File

@@ -152,6 +152,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
if (batches == 1) { if (batches == 1) {
this->info_ = std::move(proxy->Info()); this->info_ = std::move(proxy->Info());
this->info_.num_nonzero_ = nnz;
CHECK_EQ(proxy->Info().labels_.Size(), 0); CHECK_EQ(proxy->Info().labels_.Size(), 0);
} }

View File

@@ -49,10 +49,10 @@ class SimpleDMatrix : public DMatrix {
MetaInfo info_; MetaInfo info_;
// Primary storage type // Primary storage type
std::shared_ptr<SparsePage> sparse_page_ = std::make_shared<SparsePage>(); std::shared_ptr<SparsePage> sparse_page_ = std::make_shared<SparsePage>();
std::shared_ptr<CSCPage> column_page_; std::shared_ptr<CSCPage> column_page_{nullptr};
std::shared_ptr<SortedCSCPage> sorted_column_page_; std::shared_ptr<SortedCSCPage> sorted_column_page_{nullptr};
std::shared_ptr<EllpackPage> ellpack_page_; std::shared_ptr<EllpackPage> ellpack_page_{nullptr};
std::shared_ptr<GHistIndexMatrix> gradient_index_; std::shared_ptr<GHistIndexMatrix> gradient_index_{nullptr};
BatchParam batch_param_; BatchParam batch_param_;
bool EllpackExists() const override { bool EllpackExists() const override {

View File

@@ -232,9 +232,11 @@ class GBLinear : public GradientBooster {
} }
void FeatureScore(std::string const &importance_type, void FeatureScore(std::string const &importance_type,
common::Span<int32_t const> trees,
std::vector<bst_feature_t> *out_features, std::vector<bst_feature_t> *out_features,
std::vector<float> *out_scores) const override { std::vector<float> *out_scores) const override {
CHECK(!model_.weight.empty()) << "Model is not initialized"; CHECK(!model_.weight.empty()) << "Model is not initialized";
CHECK(trees.empty()) << "gblinear doesn't support number of trees for feature importance.";
CHECK_EQ(importance_type, "weight") CHECK_EQ(importance_type, "weight")
<< "gblinear only has `weight` defined for feature importance."; << "gblinear only has `weight` defined for feature importance.";
out_features->resize(this->learner_model_param_->num_feature, 0); out_features->resize(this->learner_model_param_->num_feature, 0);

View File

@@ -273,6 +273,7 @@ class GBTree : public GradientBooster {
uint32_t tree_begin, tree_end; uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) = std::tie(tree_begin, tree_end) =
detail::LayerToTree(model_, tparam_, layer_begin, layer_end); detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
std::vector<Predictor const *> predictors{ std::vector<Predictor const *> predictors{
cpu_predictor_.get(), cpu_predictor_.get(),
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
@@ -300,18 +301,28 @@ class GBTree : public GradientBooster {
} }
} }
void FeatureScore(std::string const &importance_type, void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
std::vector<bst_feature_t> *features, std::vector<bst_feature_t>* features,
std::vector<float> *scores) const override { std::vector<float>* scores) const override {
// Because feature with no importance doesn't appear in the return value so // Because feature with no importance doesn't appear in the return value so
// we need to set up another pair of vectors to store the values during // we need to set up another pair of vectors to store the values during
// computation. // computation.
std::vector<size_t> split_counts(this->model_.learner_model_param->num_feature, 0); std::vector<size_t> split_counts(this->model_.learner_model_param->num_feature, 0);
std::vector<float> gain_map(this->model_.learner_model_param->num_feature, 0); std::vector<float> gain_map(this->model_.learner_model_param->num_feature, 0);
std::vector<int32_t> tree_idx;
if (trees.empty()) {
tree_idx.resize(this->model_.trees.size());
std::iota(tree_idx.begin(), tree_idx.end(), 0);
trees = common::Span<int32_t const>(tree_idx);
}
auto total_n_trees = model_.trees.size();
auto add_score = [&](auto fn) { auto add_score = [&](auto fn) {
for (auto const &p_tree : model_.trees) { for (auto idx : trees) {
CHECK_LE(idx, total_n_trees) << "Invalid tree index.";
auto const& p_tree = model_.trees[idx];
p_tree->WalkTree([&](bst_node_t nidx) { p_tree->WalkTree([&](bst_node_t nidx) {
auto const &node = (*p_tree)[nidx]; auto const& node = (*p_tree)[nidx];
if (!node.IsLeaf()) { if (!node.IsLeaf()) {
split_counts[node.SplitIndex()]++; split_counts[node.SplitIndex()]++;
fn(p_tree, nidx, node.SplitIndex()); fn(p_tree, nidx, node.SplitIndex());

View File

@@ -1214,11 +1214,10 @@ class LearnerImpl : public LearnerIO {
*out_preds = &out_predictions.predictions; *out_preds = &out_predictions.predictions;
} }
void CalcFeatureScore(std::string const &importance_type, void CalcFeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
std::vector<bst_feature_t> *features, std::vector<bst_feature_t>* features, std::vector<float>* scores) override {
std::vector<float> *scores) override {
this->Configure(); this->Configure();
gbm_->FeatureScore(importance_type, features, scores); gbm_->FeatureScore(importance_type, trees, features, scores);
} }
const std::map<std::string, std::string>& GetConfigurationArguments() const override { const std::map<std::string, std::string>& GetConfigurationArguments() const override {

View File

@@ -291,7 +291,7 @@ float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info
// labels is a vector of size n_samples. // labels is a vector of size n_samples.
float label = labels[idx % n_samples] == class_id; float label = labels[idx % n_samples] == class_id;
float w = get_weight(i % n_samples); float w = weights.empty() ? 1.0f : weights[d_sorted_idx[i] % n_samples];
float fp = (1.0 - label) * w; float fp = (1.0 - label) * w;
float tp = label * w; float tp = label * w;
return thrust::make_pair(fp, tp); return thrust::make_pair(fp, tp);

View File

@@ -309,10 +309,9 @@ struct EvalGammaNLogLik {
float constexpr kPsi = 1.0; float constexpr kPsi = 1.0;
bst_float theta = -1. / py; bst_float theta = -1. / py;
bst_float a = kPsi; bst_float a = kPsi;
// b = -std::log(-theta); float b = -std::log(-theta);
float b = 1.0f; // c = 1. / kPsi^2 * std::log(y/kPsi) - std::log(y) - common::LogGamma(1. / kPsi);
// c = 1. / kPsi * std::log(y/kPsi) - std::log(y) - common::LogGamma(1. / kPsi); // = 1.0f * std::log(y) - std::log(y) - 0 = 0
// = 1.0f * std::log(y) - std::log(y) - 0 = 0
float c = 0; float c = 0;
// general form for exponential family. // general form for exponential family.
return -((y * theta - b) / a + c); return -((y * theta - b) / a + c);

View File

@@ -109,10 +109,9 @@ class ColMaker: public TreeUpdater {
interaction_constraints_.Configure(param_, dmat->Info().num_row_); interaction_constraints_.Configure(param_, dmat->Info().num_row_);
// build tree // build tree
for (auto tree : trees) { for (auto tree : trees) {
Builder builder( CHECK(tparam_);
param_, Builder builder(param_, colmaker_param_, interaction_constraints_, tparam_,
colmaker_param_, column_densities_);
interaction_constraints_, column_densities_);
builder.Update(gpair->ConstHostVector(), dmat, tree); builder.Update(gpair->ConstHostVector(), dmat, tree);
} }
param_.learning_rate = lr; param_.learning_rate = lr;
@@ -154,12 +153,12 @@ class ColMaker: public TreeUpdater {
class Builder { class Builder {
public: public:
// constructor // constructor
explicit Builder(const TrainParam& param, explicit Builder(const TrainParam &param, const ColMakerTrainParam &colmaker_train_param,
const ColMakerTrainParam& colmaker_train_param,
FeatureInteractionConstraintHost _interaction_constraints, FeatureInteractionConstraintHost _interaction_constraints,
const std::vector<float> &column_densities) GenericParameter const *ctx, const std::vector<float> &column_densities)
: param_(param), colmaker_train_param_{colmaker_train_param}, : param_(param),
nthread_(omp_get_max_threads()), colmaker_train_param_{colmaker_train_param},
ctx_{ctx},
tree_evaluator_(param_, column_densities.size(), GenericParameter::kCpuId), tree_evaluator_(param_, column_densities.size(), GenericParameter::kCpuId),
interaction_constraints_{std::move(_interaction_constraints)}, interaction_constraints_{std::move(_interaction_constraints)},
column_densities_(column_densities) {} column_densities_(column_densities) {}
@@ -238,7 +237,7 @@ class ColMaker: public TreeUpdater {
// setup temp space for each thread // setup temp space for each thread
// reserve a small space // reserve a small space
stemp_.clear(); stemp_.clear();
stemp_.resize(this->nthread_, std::vector<ThreadEntry>()); stemp_.resize(this->ctx_->Threads(), std::vector<ThreadEntry>());
for (auto& i : stemp_) { for (auto& i : stemp_) {
i.clear(); i.reserve(256); i.clear(); i.reserve(256);
} }
@@ -451,8 +450,9 @@ class ColMaker: public TreeUpdater {
// start enumeration // start enumeration
const auto num_features = static_cast<bst_omp_uint>(feat_set.size()); const auto num_features = static_cast<bst_omp_uint>(feat_set.size());
#if defined(_OPENMP) #if defined(_OPENMP)
CHECK(this->ctx_);
const int batch_size = // NOLINT const int batch_size = // NOLINT
std::max(static_cast<int>(num_features / this->nthread_ / 32), 1); std::max(static_cast<int>(num_features / this->ctx_->Threads() / 32), 1);
#endif // defined(_OPENMP) #endif // defined(_OPENMP)
{ {
auto page = batch.GetView(); auto page = batch.GetView();
@@ -553,7 +553,8 @@ class ColMaker: public TreeUpdater {
virtual void SyncBestSolution(const std::vector<int> &qexpand) { virtual void SyncBestSolution(const std::vector<int> &qexpand) {
for (int nid : qexpand) { for (int nid : qexpand) {
NodeEntry &e = snode_[nid]; NodeEntry &e = snode_[nid];
for (int tid = 0; tid < this->nthread_; ++tid) { CHECK(this->ctx_);
for (int tid = 0; tid < this->ctx_->Threads(); ++tid) {
e.best.Update(stemp_[tid][nid].best); e.best.Update(stemp_[tid][nid].best);
} }
} }
@@ -609,7 +610,7 @@ class ColMaker: public TreeUpdater {
const TrainParam& param_; const TrainParam& param_;
const ColMakerTrainParam& colmaker_train_param_; const ColMakerTrainParam& colmaker_train_param_;
// number of omp thread used during training // number of omp thread used during training
const int nthread_; GenericParameter const* ctx_;
common::ColumnSampler column_sampler_; common::ColumnSampler column_sampler_;
// Instance Data: current node position in the tree of each instance // Instance Data: current node position in the tree of each instance
std::vector<int> position_; std::vector<int> position_;

View File

@@ -585,6 +585,7 @@ struct GPUHistMakerDevice {
CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max()) CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
<< "Categorical feature value too large."; << "Categorical feature value too large.";
auto cat = common::AsCat(candidate.split.fvalue); auto cat = common::AsCat(candidate.split.fvalue);
common::CheckCat(cat);
std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0); std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
LBitField32 cats_bits(split_cats); LBitField32 cats_bits(split_cats);
cats_bits.Set(cat); cats_bits.Set(cat);

View File

@@ -115,9 +115,6 @@ bool QuantileHistMaker::UpdatePredictionCache(
} }
} }
template <typename GradientSumT>
QuantileHistMaker::Builder<GradientSumT>::~Builder() = default;
template <typename GradientSumT> template <typename GradientSumT>
template <bool any_missing> template <bool any_missing>

View File

@@ -204,7 +204,6 @@ class QuantileHistMaker: public TreeUpdater {
new HistogramBuilder<GradientSumT, CPUExpandEntry>} { new HistogramBuilder<GradientSumT, CPUExpandEntry>} {
builder_monitor_.Init("Quantile::Builder"); builder_monitor_.Init("Quantile::Builder");
} }
~Builder();
// update one tree, growing // update one tree, growing
virtual void Update(const GHistIndexMatrix& gmat, virtual void Update(const GHistIndexMatrix& gmat,
const ColumnMatrix& column_matrix, const ColumnMatrix& column_matrix,

View File

@@ -392,6 +392,52 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required); EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required);
} }
void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
int32_t num_bins, bool weighted) {
auto h_x = GenerateRandomCategoricalSingleColumn(n, num_categories);
thrust::device_vector<float> x(h_x);
auto adapter = AdapterFromData(x, n, 1);
MetaInfo info;
info.num_row_ = n;
info.num_col_ = 1;
info.feature_types.HostVector().push_back(FeatureType::kCategorical);
if (weighted) {
std::vector<float> weights(n, 0);
SimpleLCG lcg;
SimpleRealUniformDistribution<float> dist(0, 1);
for (auto& v : weights) {
v = dist(&lcg);
}
info.weights_.HostVector() = weights;
}
ASSERT_EQ(info.feature_types.Size(), 1);
SketchContainer container(info.feature_types, num_bins, 1, n, 0);
AdapterDeviceSketch(adapter.Value(), num_bins, info,
std::numeric_limits<float>::quiet_NaN(), &container);
HistogramCuts cuts;
container.MakeCuts(&cuts);
thrust::sort(x.begin(), x.end());
auto n_uniques = thrust::unique(x.begin(), x.end()) - x.begin();
ASSERT_NE(n_uniques, x.size());
ASSERT_EQ(cuts.TotalBins(), n_uniques);
ASSERT_EQ(n_uniques, num_categories);
auto& values = cuts.cut_values_.HostVector();
ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
ASSERT_TRUE(is_unique);
x.resize(n_uniques);
h_x.resize(n_uniques);
thrust::copy(x.begin(), x.end(), h_x.begin());
for (decltype(n_uniques) i = 0; i < n_uniques; ++i) {
ASSERT_EQ(h_x[i], values[i]);
}
}
TEST(HistUtil, AdapterDeviceSketchCategorical) { TEST(HistUtil, AdapterDeviceSketchCategorical) {
int categorical_sizes[] = {2, 6, 8, 12}; int categorical_sizes[] = {2, 6, 8, 12};
int num_bins = 256; int num_bins = 256;
@@ -404,6 +450,8 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) {
auto adapter = AdapterFromData(x_device, n, 1); auto adapter = AdapterFromData(x_device, n, 1);
ValidateBatchedCuts(adapter, num_bins, adapter.NumColumns(), ValidateBatchedCuts(adapter, num_bins, adapter.NumColumns(),
adapter.NumRows(), dmat.get()); adapter.NumRows(), dmat.get());
TestCategoricalSketchAdapter(n, num_categories, num_bins, true);
TestCategoricalSketchAdapter(n, num_categories, num_bins, false);
} }
} }
} }

View File

@@ -430,7 +430,7 @@ TEST(GBTree, FeatureScore) {
std::vector<bst_feature_t> features_weight; std::vector<bst_feature_t> features_weight;
std::vector<float> scores_weight; std::vector<float> scores_weight;
learner->CalcFeatureScore("weight", &features_weight, &scores_weight); learner->CalcFeatureScore("weight", {}, &features_weight, &scores_weight);
ASSERT_EQ(features_weight.size(), scores_weight.size()); ASSERT_EQ(features_weight.size(), scores_weight.size());
ASSERT_LE(features_weight.size(), learner->GetNumFeature()); ASSERT_LE(features_weight.size(), learner->GetNumFeature());
ASSERT_TRUE(std::is_sorted(features_weight.begin(), features_weight.end())); ASSERT_TRUE(std::is_sorted(features_weight.begin(), features_weight.end()));
@@ -438,11 +438,11 @@ TEST(GBTree, FeatureScore) {
auto test_eq = [&learner, &scores_weight](std::string type) { auto test_eq = [&learner, &scores_weight](std::string type) {
std::vector<bst_feature_t> features; std::vector<bst_feature_t> features;
std::vector<float> scores; std::vector<float> scores;
learner->CalcFeatureScore(type, &features, &scores); learner->CalcFeatureScore(type, {}, &features, &scores);
std::vector<bst_feature_t> features_total; std::vector<bst_feature_t> features_total;
std::vector<float> scores_total; std::vector<float> scores_total;
learner->CalcFeatureScore("total_" + type, &features_total, &scores_total); learner->CalcFeatureScore("total_" + type, {}, &features_total, &scores_total);
for (size_t i = 0; i < scores_weight.size(); ++i) { for (size_t i = 0; i < scores_weight.size(); ++i) {
ASSERT_LE(RelError(scores_total[i] / scores[i], scores_weight[i]), kRtEps); ASSERT_LE(RelError(scores_total[i] / scores[i], scores_weight[i]), kRtEps);
@@ -452,4 +452,47 @@ TEST(GBTree, FeatureScore) {
test_eq("gain"); test_eq("gain");
test_eq("cover"); test_eq("cover");
} }
// Checks how Learner::Predict and Learner::InplacePredict treat their two
// trailing integer arguments (presumably the [begin, end) iteration range --
// TODO confirm against the Learner interface) on a model trained for exactly
// two iterations.
TEST(GBTree, PredictRange) {
size_t n_samples = 1000, n_features = 10, n_classes = 4;
auto m = RandomDataGenerator{n_samples, n_features, 0.5}.GenerateDMatrix(true, false, n_classes);
std::unique_ptr<Learner> learner{Learner::Create({m})};
learner->SetParam("num_class", std::to_string(n_classes));
learner->Configure();
// Train two boosting iterations (i = 0, 1).
for (size_t i = 0; i < 2; ++i) {
learner->UpdateOneIter(i, m);
}
HostDeviceVector<float> out_predt;
// An end value of 3 on a model trained for 2 iterations must raise dmlc::Error.
ASSERT_THROW(learner->Predict(m, false, &out_predt, 0, 3), dmlc::Error);
// A second, independently generated matrix is used for the full prediction.
auto m_1 =
RandomDataGenerator{n_samples, n_features, 0.5}.GenerateDMatrix(true, false, n_classes);
HostDeviceVector<float> out_predt_full;
learner->Predict(m_1, false, &out_predt_full, 0, 0);
// NOTE(review): `out_predt` was only ever passed to the Predict call above
// that is expected to throw. If that call leaves it empty, this std::equal
// compares an empty range and cannot fail -- confirm whether a successful
// Predict(m_1, ..., 0, 2) into `out_predt` was intended before this check.
ASSERT_TRUE(std::equal(out_predt.HostVector().begin(), out_predt.HostVector().end(),
out_predt_full.HostVector().begin()));
{
// inplace predict: the same range checks through the adapter-based path.
HostDeviceVector<float> raw_storage;
auto raw = RandomDataGenerator{n_samples, n_features, 0.5}.GenerateArrayInterface(&raw_storage);
std::shared_ptr<data::ArrayAdapter> x{new data::ArrayAdapter{StringView{raw}}};
// Shadows the outer `out_predt`. Declared without an initializer and handed
// to InplacePredict by address, which is expected to point it at the result.
HostDeviceVector<float>* out_predt;
learner->InplacePredict(x, nullptr, PredictionType::kValue,
std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 2);
// Declared with plain `auto` (not `auto&`), so this is a separate vector
// object that the next InplacePredict call cannot modify.
auto h_out_predt = out_predt->HostVector();
learner->InplacePredict(x, nullptr, PredictionType::kValue,
std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 0);
auto h_out_predt_full = out_predt->HostVector();
// Range (0, 2) and range (0, 0) are expected to give identical predictions
// for this two-iteration model.
ASSERT_TRUE(std::equal(h_out_predt.begin(), h_out_predt.end(), h_out_predt_full.begin()));
// As with Predict above, an end value of 3 must raise dmlc::Error.
ASSERT_THROW(learner->InplacePredict(x, nullptr, PredictionType::kValue,
std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 3),
dmlc::Error);
}
}
} // namespace xgboost } // namespace xgboost

View File

@@ -143,7 +143,7 @@ void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
} }
xgboost::bst_float GetMetricEval(xgboost::Metric * metric, xgboost::bst_float GetMetricEval(xgboost::Metric * metric,
xgboost::HostDeviceVector<xgboost::bst_float> preds, xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
std::vector<xgboost::bst_float> labels, std::vector<xgboost::bst_float> labels,
std::vector<xgboost::bst_float> weights, std::vector<xgboost::bst_float> weights,
std::vector<xgboost::bst_uint> groups) { std::vector<xgboost::bst_uint> groups) {

View File

@@ -86,7 +86,7 @@ void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
xgboost::bst_float GetMetricEval( xgboost::bst_float GetMetricEval(
xgboost::Metric * metric, xgboost::Metric * metric,
xgboost::HostDeviceVector<xgboost::bst_float> preds, xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
std::vector<xgboost::bst_float> labels, std::vector<xgboost::bst_float> labels,
std::vector<xgboost::bst_float> weights = std::vector<xgboost::bst_float>(), std::vector<xgboost::bst_float> weights = std::vector<xgboost::bst_float>(),
std::vector<xgboost::bst_uint> groups = std::vector<xgboost::bst_uint>()); std::vector<xgboost::bst_uint> groups = std::vector<xgboost::bst_uint>());

View File

@@ -90,6 +90,16 @@ TEST(Metric, DeclareUnifiedTest(MultiAUC)) {
}, },
{0, 1, 1}); // no class 2. {0, 1, 1}); // no class 2.
EXPECT_TRUE(std::isnan(auc)) << auc; EXPECT_TRUE(std::isnan(auc)) << auc;
HostDeviceVector<float> predts{
0.0f, 1.0f, 0.0f,
1.0f, 0.0f, 0.0f,
0.0f, 0.0f, 1.0f,
0.0f, 0.0f, 1.0f,
};
std::vector<float> labels {1.0f, 0.0f, 2.0f, 1.0f};
auc = GetMetricEval(metric, predts, labels, {1.0f, 2.0f, 3.0f, 4.0f});
ASSERT_GT(auc, 0.714);
} }
TEST(Metric, DeclareUnifiedTest(RankingAUC)) { TEST(Metric, DeclareUnifiedTest(RankingAUC)) {

View File

@@ -186,6 +186,37 @@ Arrow specification.'''
assert len(Xy.feature_types) == X.shape[1] assert len(Xy.feature_types) == X.shape[1]
assert all(t == "c" for t in Xy.feature_types) assert all(t == "c" for t in Xy.feature_types)
# test missing value
X = cudf.DataFrame({"f0": ["a", "b", np.NaN]})
X["f0"] = X["f0"].astype("category")
df, cat_codes, _, _ = xgb.data._transform_cudf_df(
X, None, None, enable_categorical=True
)
for col in cat_codes:
assert col.has_nulls
y = [0, 1, 2]
with pytest.raises(ValueError):
xgb.DMatrix(X, y)
Xy = xgb.DMatrix(X, y, enable_categorical=True)
assert Xy.num_row() == 3
assert Xy.num_col() == 1
with pytest.raises(ValueError):
xgb.DeviceQuantileDMatrix(X, y)
Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
assert Xy.num_row() == 3
assert Xy.num_col() == 1
X = X["f0"]
with pytest.raises(ValueError):
xgb.DMatrix(X, y)
Xy = xgb.DMatrix(X, y, enable_categorical=True)
assert Xy.num_row() == 3
assert Xy.num_col() == 1
@pytest.mark.skipif(**tm.no_cudf()) @pytest.mark.skipif(**tm.no_cudf())
@pytest.mark.skipif(**tm.no_cupy()) @pytest.mark.skipif(**tm.no_cupy())

View File

@@ -13,9 +13,11 @@ class TestGPUEvalMetrics:
def test_roc_auc_binary(self, n_samples): def test_roc_auc_binary(self, n_samples):
self.cpu_test.run_roc_auc_binary("gpu_hist", n_samples) self.cpu_test.run_roc_auc_binary("gpu_hist", n_samples)
@pytest.mark.parametrize("n_samples", [4, 100, 1000]) @pytest.mark.parametrize(
def test_roc_auc_multi(self, n_samples): "n_samples,weighted", [(4, False), (100, False), (1000, False), (1000, True)]
self.cpu_test.run_roc_auc_multi("gpu_hist", n_samples) )
def test_roc_auc_multi(self, n_samples, weighted):
self.cpu_test.run_roc_auc_multi("gpu_hist", n_samples, weighted)
@pytest.mark.parametrize("n_samples", [4, 100, 1000]) @pytest.mark.parametrize("n_samples", [4, 100, 1000])
def test_roc_auc_ltr(self, n_samples): def test_roc_auc_ltr(self, n_samples):

View File

@@ -59,6 +59,7 @@ def test_categorical():
) )
X = pd.DataFrame(X.todense()).astype("category") X = pd.DataFrame(X.todense()).astype("category")
clf.fit(X, y) clf.fit(X, y)
assert not clf._can_use_inplace_predict()
with tempfile.TemporaryDirectory() as tempdir: with tempfile.TemporaryDirectory() as tempdir:
model = os.path.join(tempdir, "categorial.json") model = os.path.join(tempdir, "categorial.json")

View File

@@ -1,3 +1,4 @@
from typing import Union
import xgboost as xgb import xgboost as xgb
import pytest import pytest
import os import os
@@ -22,29 +23,47 @@ class TestCallbacks:
cls.X_valid = X[split:, ...] cls.X_valid = X[split:, ...]
cls.y_valid = y[split:, ...] cls.y_valid = y[split:, ...]
def run_evaluation_monitor(self, D_train, D_valid, rounds, verbose_eval): def run_evaluation_monitor(
evals_result = {} self,
with tm.captured_output() as (out, err): D_train: xgb.DMatrix,
xgb.train({'objective': 'binary:logistic', D_valid: xgb.DMatrix,
'eval_metric': 'error'}, D_train, rounds: int,
evals=[(D_train, 'Train'), (D_valid, 'Valid')], verbose_eval: Union[bool, int]
num_boost_round=rounds, ):
evals_result=evals_result, def check_output(output: str) -> None:
verbose_eval=verbose_eval) if int(verbose_eval) == 1:
output: str = out.getvalue().strip() # Should print each iteration info
assert len(output.split('\n')) == rounds
elif int(verbose_eval) > rounds:
# Should print first and latest iteration info
assert len(output.split('\n')) == 2
else:
# Should print info by each period additionaly to first and latest
# iteration
num_periods = rounds // int(verbose_eval)
# Extra information is required for latest iteration
is_extra_info_required = num_periods * int(verbose_eval) < (rounds - 1)
assert len(output.split('\n')) == (
1 + num_periods + int(is_extra_info_required)
)
if int(verbose_eval) == 1: evals_result: xgb.callback.TrainingCallback.EvalsLog = {}
# Should print each iteration info params = {'objective': 'binary:logistic', 'eval_metric': 'error'}
assert len(output.split('\n')) == rounds with tm.captured_output() as (out, err):
elif int(verbose_eval) > rounds: xgb.train(
# Should print first and latest iteration info params, D_train,
assert len(output.split('\n')) == 2 evals=[(D_train, 'Train'), (D_valid, 'Valid')],
else: num_boost_round=rounds,
# Should print info by each period additionaly to first and latest iteration evals_result=evals_result,
num_periods = rounds // int(verbose_eval) verbose_eval=verbose_eval,
# Extra information is required for latest iteration )
is_extra_info_required = num_periods * int(verbose_eval) < (rounds - 1) output: str = out.getvalue().strip()
assert len(output.split('\n')) == 1 + num_periods + int(is_extra_info_required) check_output(output)
with tm.captured_output() as (out, err):
xgb.cv(params, D_train, num_boost_round=rounds, verbose_eval=verbose_eval)
output = out.getvalue().strip()
check_output(output)
def test_evaluation_monitor(self): def test_evaluation_monitor(self):
D_train = xgb.DMatrix(self.X_train, self.y_train) D_train = xgb.DMatrix(self.X_train, self.y_train)

View File

@@ -124,6 +124,35 @@ class TestEvalMetrics:
skl_gamma_dev = mean_gamma_deviance(y, score) skl_gamma_dev = mean_gamma_deviance(y, score)
np.testing.assert_allclose(gamma_dev, skl_gamma_dev, rtol=1e-6) np.testing.assert_allclose(gamma_dev, skl_gamma_dev, rtol=1e-6)
@pytest.mark.skipif(**tm.no_sklearn())
def test_gamma_lik(self) -> None:
    """Compare the ``gamma-nloglik`` metric with scipy's gamma log-density.

    Fits a ``reg:gamma`` regressor on gamma-distributed targets, reads the
    metric value reported by ``Booster.eval`` and checks it against the mean
    negative log-pdf computed by ``scipy.stats.gamma`` (rtol=1e-3).
    """
    import scipy.stats as stats

    rng = np.random.default_rng(1994)
    n_samples, n_features = 32, 10
    X = rng.normal(0, 1, size=n_samples * n_features).reshape(
        (n_samples, n_features)
    )

    alpha, loc, beta = 5.0, 11.1, 22
    y = stats.gamma.rvs(
        alpha, loc=loc, scale=beta, size=n_samples, random_state=rng
    )

    reg = xgb.XGBRegressor(
        tree_method="hist", objective="reg:gamma", n_estimators=64
    )
    reg.fit(X, y, eval_metric="gamma-nloglik", eval_set=[(X, y)])

    score = reg.predict(X)
    booster = reg.get_booster()
    # The numeric value is the text after the first ":" of the eval message.
    eval_message = booster.eval(xgb.DMatrix(X, y))
    nloglik = float(eval_message.split(":")[1])

    # \beta_i = - (1 / \theta_i a)
    # where \theta_i is the canonical parameter
    # XGBoost uses the canonical link function of gamma in evaluation function.
    # so \theta = - (1.0 / y)
    # dispersion is hardcoded as 1.0, so shape (a in scipy parameter) is also 1.0
    theta = -(1.0 / y)
    beta = -(1.0 / theta)  # == y
    nloglik_stats = -stats.gamma.logpdf(score, a=1.0, scale=beta)

    np.testing.assert_allclose(nloglik, np.mean(nloglik_stats), rtol=1e-3)
def run_roc_auc_binary(self, tree_method, n_samples): def run_roc_auc_binary(self, tree_method, n_samples):
import numpy as np import numpy as np
from sklearn.datasets import make_classification from sklearn.datasets import make_classification
@@ -162,11 +191,11 @@ class TestEvalMetrics:
np.testing.assert_allclose(skl_auc, auc, rtol=1e-6) np.testing.assert_allclose(skl_auc, auc, rtol=1e-6)
@pytest.mark.skipif(**tm.no_sklearn()) @pytest.mark.skipif(**tm.no_sklearn())
@pytest.mark.parametrize("n_samples", [4, 100, 1000]) @pytest.mark.parametrize("n_samples", [100, 1000])
def test_roc_auc(self, n_samples): def test_roc_auc(self, n_samples):
self.run_roc_auc_binary("hist", n_samples) self.run_roc_auc_binary("hist", n_samples)
def run_roc_auc_multi(self, tree_method, n_samples): def run_roc_auc_multi(self, tree_method, n_samples, weighted):
import numpy as np import numpy as np
from sklearn.datasets import make_classification from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_auc_score
@@ -184,8 +213,14 @@ class TestEvalMetrics:
n_classes=n_classes, n_classes=n_classes,
random_state=rng random_state=rng
) )
if weighted:
weights = rng.randn(n_samples)
weights -= weights.min()
weights /= weights.max()
else:
weights = None
Xy = xgb.DMatrix(X, y) Xy = xgb.DMatrix(X, y, weight=weights)
booster = xgb.train( booster = xgb.train(
{ {
"tree_method": tree_method, "tree_method": tree_method,
@@ -197,16 +232,22 @@ class TestEvalMetrics:
num_boost_round=8, num_boost_round=8,
) )
score = booster.predict(Xy) score = booster.predict(Xy)
skl_auc = roc_auc_score(y, score, average="weighted", multi_class="ovr") skl_auc = roc_auc_score(
y, score, average="weighted", sample_weight=weights, multi_class="ovr"
)
auc = float(booster.eval(Xy).split(":")[1]) auc = float(booster.eval(Xy).split(":")[1])
np.testing.assert_allclose(skl_auc, auc, rtol=1e-6) np.testing.assert_allclose(skl_auc, auc, rtol=1e-6)
X = rng.randn(*X.shape) X = rng.randn(*X.shape)
score = booster.predict(xgb.DMatrix(X)) score = booster.predict(xgb.DMatrix(X, weight=weights))
skl_auc = roc_auc_score(y, score, average="weighted", multi_class="ovr") skl_auc = roc_auc_score(
auc = float(booster.eval(xgb.DMatrix(X, y)).split(":")[1]) y, score, average="weighted", sample_weight=weights, multi_class="ovr"
np.testing.assert_allclose(skl_auc, auc, rtol=1e-6) )
auc = float(booster.eval(xgb.DMatrix(X, y, weight=weights)).split(":")[1])
np.testing.assert_allclose(skl_auc, auc, rtol=1e-5)
@pytest.mark.parametrize("n_samples", [4, 100, 1000]) @pytest.mark.parametrize(
def test_roc_auc_multi(self, n_samples): "n_samples,weighted", [(4, False), (100, False), (1000, False), (1000, True)]
self.run_roc_auc_multi("hist", n_samples) )
def test_roc_auc_multi(self, n_samples, weighted):
self.run_roc_auc_multi("hist", n_samples, weighted)

View File

@@ -1,6 +1,12 @@
# -*- coding: utf-8 -*- import os
import tempfile
import subprocess
import xgboost as xgb import xgboost as xgb
import numpy as np import numpy as np
import pytest
import testing as tm
class TestOMP: class TestOMP:
@@ -71,3 +77,31 @@ class TestOMP:
assert auc_1 == auc_2 == auc_3 assert auc_1 == auc_2 == auc_3
assert np.array_equal(auc_1, auc_2) assert np.array_equal(auc_1, auc_2)
assert np.array_equal(auc_1, auc_3) assert np.array_equal(auc_1, auc_3)
@pytest.mark.skipif(**tm.no_sklearn())
def test_with_omp_thread_limit(self):
    """Run ``with_omp_limit.py`` under several ``OMP_THREAD_LIMIT`` values.

    The helper script is started once per limit (1, 2 and 16) in its own
    process and writes a single AUC value to the path it is given. Every run
    must exit with status 0 and every AUC must match the first one.
    """
    import sys

    script = os.path.join(
        tm.PROJECT_ROOT, "tests", "python", "with_omp_limit.py"
    )
    # Use the interpreter that is running this test instead of whatever
    # "python" resolves to on PATH (which may be missing, or belong to an
    # environment without xgboost installed).
    args = [sys.executable, script]

    results = []
    with tempfile.TemporaryDirectory() as tmpdir:
        for i in (1, 2, 16):
            path = os.path.join(tmpdir, str(i))
            # Pre-fill with a non-numeric placeholder: if the script exits 0
            # without writing a result, float() below raises.
            with open(path, "w") as fd:
                fd.write("\n")

            cp = args.copy()
            cp.append(path)

            env = os.environ.copy()
            env["OMP_THREAD_LIMIT"] = str(i)

            status = subprocess.call(cp, env=env)
            assert status == 0

            with open(path, "r") as fd:
                results.append(float(fd.read()))

    for auc in results:
        np.testing.assert_allclose(auc, results[0])

View File

@@ -138,9 +138,22 @@ class TestPandas:
X, enable_categorical=True X, enable_categorical=True
) )
assert np.issubdtype(transformed[:, 0].dtype, np.integer)
assert transformed[:, 0].min() == 0 assert transformed[:, 0].min() == 0
# test missing value
X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
X["f0"] = X["f0"].astype("category")
arr, _, _ = xgb.data._transform_pandas_df(X, enable_categorical=True)
assert not np.any(arr == -1.0)
X = X["f0"]
with pytest.raises(ValueError):
xgb.DMatrix(X, y)
Xy = xgb.DMatrix(X, y, enable_categorical=True)
assert Xy.num_row() == 3
assert Xy.num_col() == 1
def test_pandas_sparse(self): def test_pandas_sparse(self):
import pandas as pd import pandas as pd
rows = 100 rows = 100

View File

@@ -0,0 +1,26 @@
import os
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
import sys
def run_omp(output_path: str) -> None:
    """Train a small 3-class booster and write its training AUC to a file.

    Parameters
    ----------
    output_path :
        File that receives ``str(auc)``; any existing content is overwritten.

    The dataset is generated with a fixed seed so that separate invocations
    of this script (for example under different ``OMP_THREAD_LIMIT`` values)
    train on identical data and their AUC values can be compared directly.
    """
    X, y = make_classification(
        n_samples=200,
        n_features=32,
        n_classes=3,
        n_informative=8,
        # Without a seed each process would draw a different dataset.
        random_state=1994,
    )
    Xy = xgb.DMatrix(X, y, nthread=16)
    booster = xgb.train(
        {"num_class": 3, "objective": "multi:softprob", "n_jobs": 16},
        Xy,
        num_boost_round=8,
    )
    score = booster.predict(Xy)
    auc = roc_auc_score(y, score, average="weighted", multi_class="ovr")
    with open(output_path, "w") as fd:
        fd.write(str(auc))
# Script entry point: the first command-line argument is the output file path.
if __name__ == "__main__":
    run_omp(sys.argv[1])

View File

@@ -1,16 +1,5 @@
#!/bin/bash #!/bin/bash
if [ ${TRAVIS_OS_NAME} == "osx" ]; then
# https://travis-ci.community/t/macos-build-fails-because-of-homebrew-bundle-unknown-command/7296/27
# Use libomp 11.1.0: https://github.com/dmlc/xgboost/issues/7039
brew update # Force update, so that update doesn't overwrite our version of libomp.rb
wget https://raw.githubusercontent.com/Homebrew/homebrew-core/679923b4eb48a8dc7ecc1f05d06063cd79b3fc00/Formula/libomp.rb -O $(find $(brew --repository) -name libomp.rb)
brew install cmake libomp
brew pin libomp
fi
if [ ${TASK} == "python_test" ] || [ ${TASK} == "python_sdist_test" ]; then if [ ${TASK} == "python_test" ] || [ ${TASK} == "python_sdist_test" ]; then
if [ ${TRAVIS_OS_NAME} == "osx" ]; then if [ ${TRAVIS_OS_NAME} == "osx" ]; then
wget --no-verbose -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh wget --no-verbose -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh