Compare commits

34 Commits

Author SHA1 Message Date
Philip Hyunsu Cho
82d846bbeb
Update change_scala_version.py to also change scala.version property (#9897) 2023-12-18 23:49:41 -08:00
Philip Hyunsu Cho
71d330afdc
Bump version to 2.0.3 (#9895) 2023-12-14 17:54:05 -08:00
Philip Hyunsu Cho
3acbd8692b
[jvm-packages] Fix POM for xgboost-jvm metapackage (#9893)
* [jvm-packages] Fix POM for xgboost-jvm metapackage

* Add script for updating the Scala version
2023-12-14 16:50:34 -08:00
Philip Hyunsu Cho
ad524f76ab
[backport] [CI] Upload libxgboost4j.dylib (M1) to S3 bucket (#9887)
* [CI] Set up CI for Mac M1 (#9699)

* [CI] Improve CI for Mac M1 (#9748)

* [CI] Build libxgboost4j.dylib with CMAKE_OSX_DEPLOYMENT_TARGET (#9749)

* [CI] Upload libxgboost4j.dylib (M1) to S3 bucket (#9886)
2023-12-13 16:05:40 -08:00
Jiaming Yuan
d2d1751c03
[backport][py] Use the first found native library. (#9860) (#9879) 2023-12-13 14:20:30 +08:00
Jiaming Yuan
e4ee4e79dc
[backport][sklearn] Fix loading model attributes. (#9808) (#9880) 2023-12-13 14:20:04 +08:00
Philip Hyunsu Cho
41ce8f28b2
[jvm-packages] Add Scala version suffix to xgboost-jvm package (#9776)
* Update JVM script (#9714)

* Bump version to 2.0.2; revamp pom.xml

* Update instructions in prepare_jvm_release.py

* Fix formatting
2023-11-08 10:17:26 -08:00
Jiaming Yuan
0ffc52e05c
[backport] Fix using categorical data with the ranker. (#9753) (#9778) 2023-11-09 01:20:52 +08:00
Philip Hyunsu Cho
a408254c2f
Use sys.base_prefix instead of sys.prefix (#9711)
* Use sys.base_prefix instead of sys.prefix

* Update libpath.py too
2023-10-23 23:31:40 -07:00
Philip Hyunsu Cho
22e891dafa
[jvm-packages] Remove hard dependency on libjvm (#9698) (#9705) 2023-10-23 21:21:14 -07:00
Philip Hyunsu Cho
89530c80a7 [CI] Build libxgboost4j.dylib for Intel Mac (#9704) 2023-10-23 20:45:01 -07:00
Philip Hyunsu Cho
946ab53b57
Fix libpath logic for Windows (#9687) 2023-10-19 10:42:46 -07:00
Philip Hyunsu Cho
afd03a6934
Fix build for AppleClang 11 (#9684) 2023-10-18 09:35:59 -07:00
Jiaming Yuan
f7da938458
[backport][pyspark] Support stage-level scheduling (#9519) (#9686)
Co-authored-by: Bobby Wang <wbo4958@gmail.com>
2023-10-18 14:05:08 +08:00
Philip Hyunsu Cho
6ab6577511
Fix build for GCC 8.x (#9670) 2023-10-12 23:36:41 -07:00
Philip Hyunsu Cho
8c57558d74
[backport] [CI] Pull CentOS 7 images from NGC (#9666) (#9668) 2023-10-13 14:09:54 +08:00
Jiaming Yuan
58aa98a796
Bump version to 2.0.1. (#9660) 2023-10-13 08:47:32 +08:00
Jiaming Yuan
92273b39d8
[backport] Add support for cgroupv2. (#9651) (#9656) 2023-10-12 11:39:27 +08:00
Jiaming Yuan
e824b18bf6
[backport] Support pandas 2.1.0. (#9557) (#9655) 2023-10-12 11:29:59 +08:00
Jiaming Yuan
66ee89d8b4
[backport] Workaround Apple clang issue. (#9615) (#9636) 2023-10-08 15:42:15 +08:00
Jiaming Yuan
54d1d72d01
[backport] Use array interface for testing numpy arrays. (#9602) (#9635) 2023-10-08 11:45:49 +08:00
Jiaming Yuan
032bcc57f9
[backport][R] Fix method name. (#9577) (#9592) 2023-09-19 02:08:46 +08:00
Jiaming Yuan
ace7713201
[backport] Fix default metric configuration. (#9575) (#9590) 2023-09-18 23:40:43 +08:00
Jiaming Yuan
096047c547
Make 2.0 release. (#9567) 2023-09-12 00:20:49 +08:00
Jiaming Yuan
e75dd75bb2
[backport] [pyspark] support gpu transform (#9542) (#9559)
---------

Co-authored-by: Bobby Wang <wbo4958@gmail.com>
2023-09-07 17:21:09 +08:00
Jiaming Yuan
4d387cbfbf
[backport] [pyspark] rework transform to reuse same code (#9292) (#9558)
Co-authored-by: Bobby Wang <wbo4958@gmail.com>
2023-09-07 15:26:24 +08:00
Jiaming Yuan
3fde9361d7
[backport] Fix inplace predict with fallback when base margin is used. (#9536) (#9548)
- Copy meta info from proxy DMatrix.
- Use `std::call_once` to emit less warnings.
2023-09-05 23:38:06 +08:00
Jiaming Yuan
b67c2ed96d
[backport] [CI] bump setup-r action version. (#9544) (#9551) 2023-09-05 22:10:30 +08:00
Jiaming Yuan
177fd79864
[backport] Fix read the doc configuration. [skip ci] (#9549) 2023-09-05 17:32:00 +08:00
Jiaming Yuan
06487d3896
[backport] Fix GPU categorical split memory allocation. (#9529) (#9535) 2023-08-29 21:14:43 +08:00
Jiaming Yuan
e50ccc4d3c
[R] Fix integer inputs with NA. (#9522) (#9534) 2023-08-29 19:52:13 +08:00
Jiaming Yuan
add57f8880
[backport] Delay the check for vector leaf. (#9509) (#9533) 2023-08-29 18:25:59 +08:00
Jiaming Yuan
a0d3573c74
[backport] Fix device dispatch for linear updater. (#9507) (#9532) 2023-08-29 15:10:43 +08:00
Jiaming Yuan
4301558a57
Make 2.0.0 RC1. (#9492) 2023-08-17 16:16:51 +08:00
78 changed files with 1559 additions and 629 deletions

View File

@@ -51,14 +51,14 @@ jobs:
id: extract_branch
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
-matrix.os == 'windows-latest'
+(matrix.os == 'windows-latest' || matrix.os == 'macos-11')
- name: Publish artifact xgboost4j.dll to S3
run: |
cd lib/
Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll
dir
-python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read
+python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
matrix.os == 'windows-latest'
@@ -66,6 +66,19 @@ jobs:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
- name: Publish artifact libxgboost4j.dylib to S3
run: |
cd lib/
mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib
ls
python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
matrix.os == 'macos-11'
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
- name: Test XGBoost4J (Core, Spark, Examples)
run: |

View File

@@ -25,7 +25,7 @@ jobs:
with:
submodules: 'true'
-- uses: r-lib/actions/setup-r@50d1eae9b8da0bb3f8582c59a5b82225fa2fe7f2 # v2.3.1
+- uses: r-lib/actions/setup-r@11a22a908006c25fe054c4ef0ac0436b1de3edbe # v2.6.4
with:
r-version: ${{ matrix.config.r }}
@@ -64,7 +64,7 @@ jobs:
with:
submodules: 'true'
-- uses: r-lib/actions/setup-r@50d1eae9b8da0bb3f8582c59a5b82225fa2fe7f2 # v2.3.1
+- uses: r-lib/actions/setup-r@11a22a908006c25fe054c4ef0ac0436b1de3edbe # v2.6.4
with:
r-version: ${{ matrix.config.r }}

View File

@@ -32,4 +32,3 @@ formats:
python:
install:
- requirements: doc/requirements.txt
-system_packages: true

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
-project(xgboost LANGUAGES CXX C VERSION 2.0.0)
+project(xgboost LANGUAGES CXX C VERSION 2.0.3)
include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW)
@@ -233,6 +233,11 @@ endif (RABIT_BUILD_MPI)
add_subdirectory(${xgboost_SOURCE_DIR}/src)
target_link_libraries(objxgboost PUBLIC dmlc)
+# Link -lstdc++fs for GCC 8.x
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.0")
+  target_link_libraries(objxgboost PUBLIC stdc++fs)
+endif()
# Exports some R specific definitions and objects
if (R_LIB)
add_subdirectory(${xgboost_SOURCE_DIR}/R-package)

View File

@@ -1,8 +1,8 @@
Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
-Version: 2.0.0.1
-Date: 2022-10-18
+Version: 2.0.3.1
+Date: 2023-12-14
Authors@R: c(
person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"),

View File

@@ -70,7 +70,7 @@ cb.print.evaluation <- function(period = 1, showsd = TRUE) {
i == env$begin_iteration ||
i == env$end_iteration) {
stdev <- if (showsd) env$bst_evaluation_err else NULL
-msg <- format.eval.string(i, env$bst_evaluation, stdev)
+msg <- .format_eval_string(i, env$bst_evaluation, stdev)
cat(msg, '\n')
}
}
@@ -380,7 +380,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
if ((maximize && score > best_score) ||
(!maximize && score < best_score)) {
-best_msg <<- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err)
+best_msg <<- .format_eval_string(
+  i, env$bst_evaluation, env$bst_evaluation_err
+)
best_score <<- score
best_iteration <<- i
best_ntreelimit <<- best_iteration * env$num_parallel_tree
@@ -754,7 +756,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
#
# Format the evaluation metric string
-format.eval.string <- function(iter, eval_res, eval_err = NULL) {
+.format_eval_string <- function(iter, eval_res, eval_err = NULL) {
if (length(eval_res) == 0)
stop('no evaluation results')
enames <- names(eval_res)

R-package/configure (vendored)
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for xgboost 2.0.0.
+# Generated by GNU Autoconf 2.71 for xgboost 2.0.3.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -607,8 +607,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='xgboost'
PACKAGE_TARNAME='xgboost'
-PACKAGE_VERSION='2.0.0'
-PACKAGE_STRING='xgboost 2.0.0'
+PACKAGE_VERSION='2.0.3'
+PACKAGE_STRING='xgboost 2.0.3'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1225,7 +1225,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures xgboost 2.0.0 to adapt to many kinds of systems.
+\`configure' configures xgboost 2.0.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1287,7 +1287,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
-short | recursive ) echo "Configuration of xgboost 2.0.0:";;
+short | recursive ) echo "Configuration of xgboost 2.0.3:";;
esac
cat <<\_ACEOF
@@ -1367,7 +1367,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-xgboost configure 2.0.0
+xgboost configure 2.0.3
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1533,7 +1533,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by xgboost $as_me 2.0.0, which was
+It was created by xgboost $as_me 2.0.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3412,7 +3412,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by xgboost $as_me 2.0.0, which was
+This file was extended by xgboost $as_me 2.0.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -3467,7 +3467,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
-xgboost config.status 2.0.0
+xgboost config.status 2.0.3
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -2,7 +2,7 @@
AC_PREREQ(2.69)
-AC_INIT([xgboost],[2.0.0],[],[xgboost],[])
+AC_INIT([xgboost],[2.0.3],[],[xgboost],[])
: ${R_HOME=`R RHOME`}
if test -z "${R_HOME}"; then

View File

@@ -120,11 +120,25 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) {
  ctx.nthread = asInteger(n_threads);
  std::int32_t threads = ctx.Threads();
-  xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
-    for (size_t j = 0; j < ncol; ++j) {
-      data[i * ncol + j] = is_int ? static_cast<float>(iin[i + nrow * j]) : din[i + nrow * j];
-    }
-  });
+  if (is_int) {
+    xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
+      for (size_t j = 0; j < ncol; ++j) {
+        auto v = iin[i + nrow * j];
+        if (v == NA_INTEGER) {
+          data[i * ncol + j] = std::numeric_limits<float>::quiet_NaN();
+        } else {
+          data[i * ncol + j] = static_cast<float>(v);
+        }
+      }
+    });
+  } else {
+    xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
+      for (size_t j = 0; j < ncol; ++j) {
+        data[i * ncol + j] = din[i + nrow * j];
+      }
+    });
+  }
  DMatrixHandle handle;
  CHECK_CALL(XGDMatrixCreateFromMat_omp(BeginPtr(data), nrow, ncol,
                                        asReal(missing), &handle, threads));

View File

@@ -56,6 +56,42 @@ test_that("xgb.DMatrix: basic construction", {
expect_equal(raw_fd, raw_dgc)
})
test_that("xgb.DMatrix: NA", {
n_samples <- 3
x <- cbind(
x1 = sample(x = 4, size = n_samples, replace = TRUE),
x2 = sample(x = 4, size = n_samples, replace = TRUE)
)
x[1, "x1"] <- NA
m <- xgb.DMatrix(x)
xgb.DMatrix.save(m, "int.dmatrix")
x <- matrix(as.numeric(x), nrow = n_samples, ncol = 2)
colnames(x) <- c("x1", "x2")
m <- xgb.DMatrix(x)
xgb.DMatrix.save(m, "float.dmatrix")
iconn <- file("int.dmatrix", "rb")
fconn <- file("float.dmatrix", "rb")
expect_equal(file.size("int.dmatrix"), file.size("float.dmatrix"))
bytes <- file.size("int.dmatrix")
idmatrix <- readBin(iconn, "raw", n = bytes)
fdmatrix <- readBin(fconn, "raw", n = bytes)
expect_equal(length(idmatrix), length(fdmatrix))
expect_equal(idmatrix, fdmatrix)
close(iconn)
close(fconn)
file.remove("int.dmatrix")
file.remove("float.dmatrix")
})
test_that("xgb.DMatrix: saving, loading", { test_that("xgb.DMatrix: saving, loading", {
# save to a local file # save to a local file
dtest1 <- xgb.DMatrix(test_data, label = test_label) dtest1 <- xgb.DMatrix(test_data, label = test_label)

View File

@@ -0,0 +1,79 @@
import argparse
import pathlib
import re
import shutil
def main(args):
if args.scala_version == "2.12":
scala_ver = "2.12"
scala_patchver = "2.12.18"
elif args.scala_version == "2.13":
scala_ver = "2.13"
scala_patchver = "2.13.11"
else:
raise ValueError(f"Unsupported Scala version: {args.scala_version}")
# Clean artifacts
if args.purge_artifacts:
for target in pathlib.Path("jvm-packages/").glob("**/target"):
if target.is_dir():
print(f"Removing {target}...")
shutil.rmtree(target)
# Update pom.xml
for pom in pathlib.Path("jvm-packages/").glob("**/pom.xml"):
print(f"Updating {pom}...")
with open(pom, "r", encoding="utf-8") as f:
lines = f.readlines()
with open(pom, "w", encoding="utf-8") as f:
replaced_scalaver = False
replaced_scala_binver = False
for line in lines:
for artifact in [
"xgboost-jvm",
"xgboost4j",
"xgboost4j-gpu",
"xgboost4j-spark",
"xgboost4j-spark-gpu",
"xgboost4j-flink",
"xgboost4j-example",
]:
line = re.sub(
f"<artifactId>{artifact}_[0-9\\.]*",
f"<artifactId>{artifact}_{scala_ver}",
line,
)
# Only replace the first occurrence of scala.version
if not replaced_scalaver:
line, nsubs = re.subn(
r"<scala.version>[0-9\.]*",
f"<scala.version>{scala_patchver}",
line,
)
if nsubs > 0:
replaced_scalaver = True
# Only replace the first occurrence of scala.binary.version
if not replaced_scala_binver:
line, nsubs = re.subn(
r"<scala.binary.version>[0-9\.]*",
f"<scala.binary.version>{scala_ver}",
line,
)
if nsubs > 0:
replaced_scala_binver = True
f.write(line)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--purge-artifacts", action="store_true")
parser.add_argument(
"--scala-version",
type=str,
required=True,
help="Version of Scala to use in the JVM packages",
choices=["2.12", "2.13"],
)
parsed_args = parser.parse_args()
main(parsed_args)
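The new script rewrites only the first <scala.version> and <scala.binary.version> property it encounters in each pom.xml, using the substitution count returned by re.subn as a guard so that any later occurrence in the same file is left untouched. A minimal standalone sketch of that pattern (the sample lines below are illustrative, not taken from any real pom.xml):

import re

pom_lines = [
    "<scala.binary.version>2.12</scala.binary.version>",
    "<scala.version>2.12.18</scala.version>",
    "<scala.version>2.12.18</scala.version>",  # a later occurrence must stay untouched
]

replaced = False
rewritten = []
for line in pom_lines:
    if not replaced:
        # re.subn returns the new string plus the number of substitutions made
        line, nsubs = re.subn(r"<scala.version>[0-9\.]*", "<scala.version>2.13.11", line)
        if nsubs > 0:
            replaced = True
    rewritten.append(line)

print(rewritten)  # only the first <scala.version> entry is rewritten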

View File

@@ -2,7 +2,6 @@ import argparse
import errno
import glob
import os
-import platform
import re
import shutil
import subprocess
@@ -21,12 +20,14 @@ def normpath(path):
    else:
        return normalized

def cp(source, target):
    source = normpath(source)
    target = normpath(target)
    print("cp {0} {1}".format(source, target))
    shutil.copy(source, target)

def maybe_makedirs(path):
    path = normpath(path)
    print("mkdir -p " + path)
@@ -36,6 +37,7 @@ def maybe_makedirs(path):
        if e.errno != errno.EEXIST:
            raise

@contextmanager
def cd(path):
    path = normpath(path)
@@ -47,18 +49,22 @@ def cd(path):
    finally:
        os.chdir(cwd)

def run(command, **kwargs):
    print(command)
    subprocess.check_call(command, shell=True, **kwargs)

def get_current_git_tag():
    out = subprocess.check_output(["git", "tag", "--points-at", "HEAD"])
    return out.decode().split("\n")[0]

def get_current_commit_hash():
    out = subprocess.check_output(["git", "rev-parse", "HEAD"])
    return out.decode().split("\n")[0]

def get_current_git_branch():
    out = subprocess.check_output(["git", "log", "-n", "1", "--pretty=%d", "HEAD"])
    m = re.search(r"release_[0-9\.]+", out.decode())
@@ -66,38 +72,49 @@ def get_current_git_branch():
        raise ValueError("Expected branch name of form release_xxx")
    return m.group(0)

def retrieve(url, filename=None):
    print(f"{url} -> {filename}")
    return urlretrieve(url, filename)

def main():
    parser = argparse.ArgumentParser()
parser.add_argument("--release-version", type=str, required=True, parser.add_argument(
help="Version of the release being prepared") "--release-version",
type=str,
required=True,
help="Version of the release being prepared",
)
args = parser.parse_args() args = parser.parse_args()
if sys.platform != "darwin" or platform.machine() != "x86_64":
raise NotImplementedError("Please run this script using an Intel Mac")
    version = args.release_version
    expected_git_tag = "v" + version
    current_git_tag = get_current_git_tag()
    if current_git_tag != expected_git_tag:
        if not current_git_tag:
-            raise ValueError(f"Expected git tag {expected_git_tag} but current HEAD has no tag. "
-                             f"Run: git checkout {expected_git_tag}")
-        raise ValueError(f"Expected git tag {expected_git_tag} but current HEAD is at tag "
-                         f"{current_git_tag}. Run: git checkout {expected_git_tag}")
+            raise ValueError(
+                f"Expected git tag {expected_git_tag} but current HEAD has no tag. "
+                f"Run: git checkout {expected_git_tag}"
+            )
+        raise ValueError(
+            f"Expected git tag {expected_git_tag} but current HEAD is at tag "
+            f"{current_git_tag}. Run: git checkout {expected_git_tag}"
+        )

    commit_hash = get_current_commit_hash()
    git_branch = get_current_git_branch()
-    print(f"Using commit {commit_hash} of branch {git_branch}, git tag {current_git_tag}")
+    print(
+        f"Using commit {commit_hash} of branch {git_branch}, git tag {current_git_tag}"
+    )
with cd("jvm-packages/"): with cd("jvm-packages/"):
print("====copying pure-Python tracker====") print("====copying pure-Python tracker====")
for use_cuda in [True, False]: for use_cuda in [True, False]:
xgboost4j = "xgboost4j-gpu" if use_cuda else "xgboost4j" xgboost4j = "xgboost4j-gpu" if use_cuda else "xgboost4j"
cp("../python-package/xgboost/tracker.py", f"{xgboost4j}/src/main/resources") cp(
"../python-package/xgboost/tracker.py",
f"{xgboost4j}/src/main/resources",
)
print("====copying resources for testing====") print("====copying resources for testing====")
with cd("../demo/CLI/regression"): with cd("../demo/CLI/regression"):
@@ -115,7 +132,12 @@ def main():
            cp(file, f"{xgboost4j_spark}/src/test/resources")

        print("====Creating directories to hold native binaries====")
-        for os_ident, arch in [("linux", "x86_64"), ("windows", "x86_64"), ("macos", "x86_64")]:
+        for os_ident, arch in [
+            ("linux", "x86_64"),
+            ("windows", "x86_64"),
+            ("macos", "x86_64"),
+            ("macos", "aarch64"),
+        ]:
            output_dir = f"xgboost4j/src/main/resources/lib/{os_ident}/{arch}"
            maybe_makedirs(output_dir)
        for os_ident, arch in [("linux", "x86_64")]:
@@ -123,52 +145,98 @@ def main():
            maybe_makedirs(output_dir)

        print("====Downloading native binaries from CI====")
-        nightly_bucket_prefix = "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds"
-        maven_repo_prefix = "https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release/ml/dmlc"
+        nightly_bucket_prefix = (
+            "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds"
+        )
+        maven_repo_prefix = (
+            "https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release/ml/dmlc"
+        )
-        retrieve(url=f"{nightly_bucket_prefix}/{git_branch}/xgboost4j_{commit_hash}.dll",
-                 filename="xgboost4j/src/main/resources/lib/windows/x86_64/xgboost4j.dll")
+        retrieve(
+            url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/xgboost4j_{commit_hash}.dll",
+            filename="xgboost4j/src/main/resources/lib/windows/x86_64/xgboost4j.dll",
+        )
+        retrieve(
+            url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/libxgboost4j_{commit_hash}.dylib",
+            filename="xgboost4j/src/main/resources/lib/macos/x86_64/libxgboost4j.dylib",
+        )
+        retrieve(
+            url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/libxgboost4j_m1_{commit_hash}.dylib",
+            filename="xgboost4j/src/main/resources/lib/macos/aarch64/libxgboost4j.dylib",
+        )

        with tempfile.TemporaryDirectory() as tempdir:
            # libxgboost4j.so for Linux x86_64, CPU only
            zip_path = os.path.join(tempdir, "xgboost4j_2.12.jar")
            extract_dir = os.path.join(tempdir, "xgboost4j")
-            retrieve(url=f"{maven_repo_prefix}/xgboost4j_2.12/{version}/"
-                         f"xgboost4j_2.12-{version}.jar",
-                     filename=zip_path)
+            retrieve(
+                url=f"{maven_repo_prefix}/xgboost4j_2.12/{version}/"
+                f"xgboost4j_2.12-{version}.jar",
+                filename=zip_path,
+            )
            os.mkdir(extract_dir)
            with zipfile.ZipFile(zip_path, "r") as t:
                t.extractall(extract_dir)
-            cp(os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
-               "xgboost4j/src/main/resources/lib/linux/x86_64/libxgboost4j.so")
+            cp(
+                os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
+                "xgboost4j/src/main/resources/lib/linux/x86_64/libxgboost4j.so",
+            )

            # libxgboost4j.so for Linux x86_64, GPU support
            zip_path = os.path.join(tempdir, "xgboost4j-gpu_2.12.jar")
            extract_dir = os.path.join(tempdir, "xgboost4j-gpu")
-            retrieve(url=f"{maven_repo_prefix}/xgboost4j-gpu_2.12/{version}/"
-                         f"xgboost4j-gpu_2.12-{version}.jar",
-                     filename=zip_path)
+            retrieve(
+                url=f"{maven_repo_prefix}/xgboost4j-gpu_2.12/{version}/"
+                f"xgboost4j-gpu_2.12-{version}.jar",
+                filename=zip_path,
+            )
            os.mkdir(extract_dir)
            with zipfile.ZipFile(zip_path, "r") as t:
                t.extractall(extract_dir)
-            cp(os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
-               "xgboost4j-gpu/src/main/resources/lib/linux/x86_64/libxgboost4j.so")
+            cp(
+                os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
+                "xgboost4j-gpu/src/main/resources/lib/linux/x86_64/libxgboost4j.so",
+            )
print("====Next Steps====") print("====Next Steps====")
print("1. Gain upload right to Maven Central repo.") print("1. Gain upload right to Maven Central repo.")
print("1-1. Sign up for a JIRA account at Sonatype: ") print("1-1. Sign up for a JIRA account at Sonatype: ")
print("1-2. File a JIRA ticket: " print(
"https://issues.sonatype.org/secure/CreateIssue.jspa?issuetype=21&pid=10134. Example: " "1-2. File a JIRA ticket: "
"https://issues.sonatype.org/browse/OSSRH-67724") "https://issues.sonatype.org/secure/CreateIssue.jspa?issuetype=21&pid=10134. Example: "
print("2. Store the Sonatype credentials in .m2/settings.xml. See insturctions in " "https://issues.sonatype.org/browse/OSSRH-67724"
"https://central.sonatype.org/publish/publish-maven/") )
print("3. Now on a Mac machine, run:") print(
print(" GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests") "2. Store the Sonatype credentials in .m2/settings.xml. See insturctions in "
print("4. Log into https://oss.sonatype.org/. On the left menu panel, click Staging " "https://central.sonatype.org/publish/publish-maven/"
"Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-1085 " )
"to inspect the staged JAR files. Finally, press Release button to publish the " print(
"artifacts to the Maven Central repository.") "3. Now on a Linux machine, run the following to build Scala 2.12 artifacts. "
"Make sure to use an Internet connection with fast upload speed:"
)
print(
" # Skip native build, since we have all needed native binaries from CI\n"
" export MAVEN_SKIP_NATIVE_BUILD=1\n"
" GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests"
)
print(
"4. Log into https://oss.sonatype.org/. On the left menu panel, click Staging "
"Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-xxxx "
"to inspect the staged JAR files. Finally, press Release button to publish the "
"artifacts to the Maven Central repository. The top-level metapackage should be "
"named xgboost-jvm_2.12."
)
print(
"5. Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:\n"
" export MAVEN_SKIP_NATIVE_BUILD=1\n"
" python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts\n"
" GPG_TTY=$(tty) mvn deploy -Prelease-cpu-only,scala-2.13 -DskipTests"
)
print(
"6. Go to https://oss.sonatype.org/ to release the Scala 2.13 artifacts. "
"The top-level metapackage should be named xgboost-jvm_2.13."
)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -329,7 +329,7 @@ Parameters for Linear Booster (``booster=gblinear``)
- Choice of algorithm to fit linear model
- ``shotgun``: Parallel coordinate descent algorithm based on shotgun algorithm. Uses 'hogwild' parallelism and therefore produces a nondeterministic solution on each run.
-- ``coord_descent``: Ordinary coordinate descent algorithm. Also multithreaded but still produces a deterministic solution.
+- ``coord_descent``: Ordinary coordinate descent algorithm. Also multithreaded but still produces a deterministic solution. When the ``device`` parameter is set to ``cuda`` or ``gpu``, a GPU variant would be used.
* ``feature_selector`` [default= ``cyclic``]

View File

@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020 by Contributors
+/**
+ * Copyright 2020-2023, XGBoost Contributors
 * \file global_config.h
 * \brief Global configuration for XGBoost
 * \author Hyunsu Cho
@@ -7,24 +7,22 @@
#ifndef XGBOOST_GLOBAL_CONFIG_H_
#define XGBOOST_GLOBAL_CONFIG_H_
-#include <xgboost/parameter.h>
-#include <vector>
-#include <string>
+#include <dmlc/thread_local.h>  // for ThreadLocalStore
+#include <xgboost/parameter.h>  // for XGBoostParameter
+#include <cstdint>  // for int32_t
namespace xgboost {
class Json;
struct GlobalConfiguration : public XGBoostParameter<GlobalConfiguration> {
-  int verbosity { 1 };
-  bool use_rmm { false };
+  std::int32_t verbosity{1};
+  bool use_rmm{false};
  DMLC_DECLARE_PARAMETER(GlobalConfiguration) {
    DMLC_DECLARE_FIELD(verbosity)
        .set_range(0, 3)
        .set_default(1)  // shows only warning
        .describe("Flag to print out detailed breakdown of runtime.");
-    DMLC_DECLARE_FIELD(use_rmm)
-        .set_default(false)
-        .describe("Whether to use RAPIDS Memory Manager to allocate GPU memory in XGBoost");
+    DMLC_DECLARE_FIELD(use_rmm).set_default(false).describe(
+        "Whether to use RAPIDS Memory Manager to allocate GPU memory in XGBoost");
  }
};

View File

@@ -6,6 +6,6 @@
#define XGBOOST_VER_MAJOR 2 /* NOLINT */
#define XGBOOST_VER_MINOR 0 /* NOLINT */
-#define XGBOOST_VER_PATCH 0 /* NOLINT */
+#define XGBOOST_VER_PATCH 3 /* NOLINT */
#endif  // XGBOOST_VERSION_CONFIG_H_

View File

@@ -25,4 +25,3 @@ target_include_directories(xgboost4j
${PROJECT_SOURCE_DIR}/rabit/include)
set_output_directory(xgboost4j ${PROJECT_SOURCE_DIR}/lib)
-target_link_libraries(xgboost4j PRIVATE ${JAVA_JVM_LIBRARY})

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
-import errno
import argparse
+import errno
import glob
import os
import platform
@@ -19,11 +19,10 @@ CONFIG = {
"USE_HDFS": "OFF",
"USE_AZURE": "OFF",
"USE_S3": "OFF",
"USE_CUDA": "OFF",
"USE_NCCL": "OFF",
"JVM_BINDINGS": "ON",
-"LOG_CAPI_INVOCATION": "OFF"
+"LOG_CAPI_INVOCATION": "OFF",
}
@@ -70,26 +69,22 @@ def normpath(path):
        return normalized

-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--log-capi-invocation', type=str, choices=['ON', 'OFF'], default='OFF')
-    parser.add_argument('--use-cuda', type=str, choices=['ON', 'OFF'], default='OFF')
-    cli_args = parser.parse_args()
+def native_build(args):
    if sys.platform == "darwin":
        # Enable of your compiler supports OpenMP.
        CONFIG["USE_OPENMP"] = "OFF"
-        os.environ["JAVA_HOME"] = subprocess.check_output(
-            "/usr/libexec/java_home").strip().decode()
+        os.environ["JAVA_HOME"] = (
+            subprocess.check_output("/usr/libexec/java_home").strip().decode()
+        )

    print("building Java wrapper")
    with cd(".."):
-        build_dir = 'build-gpu' if cli_args.use_cuda == 'ON' else 'build'
+        build_dir = "build-gpu" if cli_args.use_cuda == "ON" else "build"
        maybe_makedirs(build_dir)
        with cd(build_dir):
            if sys.platform == "win32":
                # Force x64 build on Windows.
-                maybe_generator = ' -A x64'
+                maybe_generator = " -A x64"
            else:
                maybe_generator = ""
            if sys.platform == "linux":
@@ -97,12 +92,12 @@ if __name__ == "__main__":
            else:
                maybe_parallel_build = ""

-            if cli_args.log_capi_invocation == 'ON':
-                CONFIG['LOG_CAPI_INVOCATION'] = 'ON'
+            if cli_args.log_capi_invocation == "ON":
+                CONFIG["LOG_CAPI_INVOCATION"] = "ON"

-            if cli_args.use_cuda == 'ON':
-                CONFIG['USE_CUDA'] = 'ON'
-                CONFIG['USE_NCCL'] = 'ON'
+            if cli_args.use_cuda == "ON":
+                CONFIG["USE_CUDA"] = "ON"
+                CONFIG["USE_NCCL"] = "ON"

            args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]
@@ -115,7 +110,7 @@ if __name__ == "__main__":
            if gpu_arch_flag is not None:
                args.append("%s" % gpu_arch_flag)

-            lib_dir = os.path.join(os.pardir, 'lib')
+            lib_dir = os.path.join(os.pardir, "lib")
            if os.path.exists(lib_dir):
                shutil.rmtree(lib_dir)
            run("cmake .. " + " ".join(args) + maybe_generator)
@@ -125,8 +120,10 @@ if __name__ == "__main__":
        run(f'"{sys.executable}" mapfeat.py')
        run(f'"{sys.executable}" mknfold.py machine.txt 1')

-    xgboost4j = 'xgboost4j-gpu' if cli_args.use_cuda == 'ON' else 'xgboost4j'
-    xgboost4j_spark = 'xgboost4j-spark-gpu' if cli_args.use_cuda == 'ON' else 'xgboost4j-spark'
+    xgboost4j = "xgboost4j-gpu" if cli_args.use_cuda == "ON" else "xgboost4j"
+    xgboost4j_spark = (
+        "xgboost4j-spark-gpu" if cli_args.use_cuda == "ON" else "xgboost4j-spark"
+    )

    print("copying native library")
    library_name, os_folder = {
@@ -141,14 +138,19 @@ if __name__ == "__main__":
        "i86pc": "x86_64",  # on Solaris x86_64
        "sun4v": "sparc",  # on Solaris sparc
        "arm64": "aarch64",  # on macOS & Windows ARM 64-bit
-        "aarch64": "aarch64"
+        "aarch64": "aarch64",
    }[platform.machine().lower()]
-    output_folder = "{}/src/main/resources/lib/{}/{}".format(xgboost4j, os_folder, arch_folder)
+    output_folder = "{}/src/main/resources/lib/{}/{}".format(
+        xgboost4j, os_folder, arch_folder
+    )
    maybe_makedirs(output_folder)
    cp("../lib/" + library_name, output_folder)

    print("copying pure-Python tracker")
-    cp("../python-package/xgboost/tracker.py", "{}/src/main/resources".format(xgboost4j))
+    cp(
+        "../python-package/xgboost/tracker.py",
+        "{}/src/main/resources".format(xgboost4j),
+    )

    print("copying train/test files")
    maybe_makedirs("{}/src/test/resources".format(xgboost4j_spark))
@@ -164,3 +166,18 @@ if __name__ == "__main__":
    maybe_makedirs("{}/src/test/resources".format(xgboost4j))
    for file in glob.glob("../demo/data/agaricus.*"):
        cp(file, "{}/src/test/resources".format(xgboost4j))
if __name__ == "__main__":
if "MAVEN_SKIP_NATIVE_BUILD" in os.environ:
print("MAVEN_SKIP_NATIVE_BUILD is set. Skipping native build...")
else:
parser = argparse.ArgumentParser()
parser.add_argument(
"--log-capi-invocation", type=str, choices=["ON", "OFF"], default="OFF"
)
parser.add_argument(
"--use-cuda", type=str, choices=["ON", "OFF"], default="OFF"
)
cli_args = parser.parse_args()
native_build(cli_args)

View File

@@ -5,8 +5,8 @@
<modelVersion>4.0.0</modelVersion>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
<packaging>pom</packaging>
<name>XGBoost JVM Package</name>
<description>JVM Package for XGBoost</description>
@@ -189,6 +189,93 @@
</plugins>
</build>
</profile>
<profile>
<id>release-cpu-only</id>
<modules>
<module>xgboost4j</module>
<module>xgboost4j-example</module>
<module>xgboost4j-spark</module>
<module>xgboost4j-flink</module>
</modules>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.3.0</version>
<executions>
<execution>
<id>empty-javadoc-jar</id>
<phase>package</phase>
<goals>
<goal>jar</goal>
</goals>
<configuration>
<classifier>javadoc</classifier>
<classesDirectory>${basedir}/javadoc</classesDirectory>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>3.0.1</version>
<configuration>
<autoVersionSubmodules>true</autoVersionSubmodules>
<useReleaseProfile>false</useReleaseProfile>
<releaseProfiles>release</releaseProfiles>
<goals>deploy</goals>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>3.1.0</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.3.0</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.13</version>
<extensions>true</extensions>
<configuration>
<serverId>ossrh</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>false</autoReleaseAfterClose>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<skipTests>true</skipTests>
</configuration>
</plugin>
</plugins>
</build>
</profile>
<profile> <profile>
<id>assembly</id> <id>assembly</id>
<build> <build>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
</parent>
<name>xgboost4j-example</name>
-<artifactId>xgboost4j-example_${scala.binary.version}</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost4j-example_2.12</artifactId>
+<version>2.0.3</version>
<packaging>jar</packaging>
<build>
<plugins>
@@ -26,7 +26,7 @@
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j-spark_2.12</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
@@ -37,7 +37,7 @@
</dependency>
<dependency>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j-flink_2.12</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>

View File

@@ -5,13 +5,13 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
</parent>
<name>xgboost4j-flink</name>
-<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost4j-flink_2.12</artifactId>
+<version>2.0.3</version>
<properties>
<flink-ml.version>2.2.0</flink-ml.version>
</properties>
@@ -30,7 +30,7 @@
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost4j_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j_2.12</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
</parent>
-<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j-gpu_2.12</artifactId>
<name>xgboost4j-gpu</name>
-<version>2.0.0-SNAPSHOT</version>
+<version>2.0.3</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -5,11 +5,11 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
</parent>
<name>xgboost4j-spark-gpu</name>
-<artifactId>xgboost4j-spark-gpu_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j-spark-gpu_2.12</artifactId>
<build>
<plugins>
<plugin>
@@ -24,7 +24,7 @@
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j-gpu_2.12</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>

View File

@@ -5,11 +5,11 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
</parent>
<name>xgboost4j-spark</name>
-<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j-spark_2.12</artifactId>
<build>
<plugins>
<plugin>
@@ -24,7 +24,7 @@
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost4j_${scala.binary.version}</artifactId>
+<artifactId>xgboost4j_2.12</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
-<artifactId>xgboost-jvm</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost-jvm_2.12</artifactId>
+<version>2.0.3</version>
</parent>
<name>xgboost4j</name>
-<artifactId>xgboost4j_${scala.binary.version}</artifactId>
-<version>2.0.0-SNAPSHOT</version>
+<artifactId>xgboost4j_2.12</artifactId>
+<version>2.0.3</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -132,16 +132,28 @@ def locate_or_build_libxgboost(
    if build_config.use_system_libxgboost:
        # Find libxgboost from system prefix
-        sys_base_prefix = pathlib.Path(sys.base_prefix).absolute().resolve()
-        libxgboost_sys = sys_base_prefix / "lib" / _lib_name()
-        if not libxgboost_sys.exists():
-            raise RuntimeError(
-                f"use_system_libxgboost was specified but {_lib_name()} is "
-                f"not found in {libxgboost_sys.parent}"
-            )
-        logger.info("Using system XGBoost: %s", str(libxgboost_sys))
-        return libxgboost_sys
+        sys_prefix = pathlib.Path(sys.base_prefix)
+        sys_prefix_candidates = [
+            sys_prefix / "lib",
+            # Paths possibly used on Windows
+            sys_prefix / "bin",
+            sys_prefix / "Library",
+            sys_prefix / "Library" / "bin",
+            sys_prefix / "Library" / "lib",
+        ]
+        sys_prefix_candidates = [
+            p.expanduser().resolve() for p in sys_prefix_candidates
+        ]
+        for candidate_dir in sys_prefix_candidates:
+            libtreelite_sys = candidate_dir / _lib_name()
+            if libtreelite_sys.exists():
+                logger.info("Using system XGBoost: %s", str(libtreelite_sys))
+                return libtreelite_sys
+        raise RuntimeError(
+            f"use_system_libxgboost was specified but {_lib_name()} is "
+            f"not found. Paths searched (in order): \n"
+            + "\n".join([f"* {str(p)}" for p in sys_prefix_candidates])
+        )

    libxgboost = locate_local_libxgboost(toplevel_dir, logger=logger)
    if libxgboost is not None:

View File

@@ -7,7 +7,7 @@ build-backend = "packager.pep517"
[project]
name = "xgboost"
-version = "2.0.0-dev"
+version = "2.0.3"
authors = [
{ name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" },
{ name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }

View File

@@ -1 +1 @@
-2.0.0-dev
+2.0.3

View File

@@ -88,6 +88,18 @@ def is_cudf_available() -> bool:
        return False
def is_cupy_available() -> bool:
"""Check cupy package available or not"""
if importlib.util.find_spec("cupy") is None:
return False
try:
import cupy
return True
except ImportError:
return False
try:
    import scipy.sparse as scipy_sparse
    from scipy.sparse import csr_matrix as scipy_csr
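The new is_cupy_available helper mirrors is_cudf_available just above it: it checks for the module spec and then confirms that cupy actually imports. A short usage sketch; the xgboost.compat import path is an assumption here, since the diff does not name the file:

# Assumed location: the helper is added next to is_cudf_available (module path not shown in the diff).
from xgboost.compat import is_cupy_available

import numpy as np

def make_batch(n_rows: int, n_cols: int):
    # Prefer a GPU array when cupy is usable, otherwise fall back to NumPy.
    if is_cupy_available():
        import cupy as cp
        return cp.random.rand(n_rows, n_cols)
    return np.random.rand(n_rows, n_cols)

X = make_batch(100, 4)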

View File

@@ -206,6 +206,7 @@ def _load_lib() -> ctypes.CDLL:
            lib = ctypes.cdll.LoadLibrary(lib_path)
            setattr(lib, "path", os.path.normpath(lib_path))
            lib_success = True
+            break
        except OSError as e:
            os_error_list.append(str(e))
            continue
@@ -2399,6 +2400,7 @@ class Booster:
            _is_cudf_df,
            _is_cupy_array,
            _is_list,
+            _is_np_array_like,
            _is_pandas_df,
            _is_pandas_series,
            _is_tuple,
@@ -2428,7 +2430,7 @@ class Booster:
                f"got {data.shape[1]}"
            )
-        if isinstance(data, np.ndarray):
+        if _is_np_array_like(data):
            from .data import _ensure_np_dtype
            data, _ = _ensure_np_dtype(data, data.dtype)
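The isinstance(data, np.ndarray) check above is replaced by _is_np_array_like, which, as defined in the data-dispatch changes further down, accepts anything exposing the NumPy array interface rather than only np.ndarray. A small standalone sketch of the same duck-typing check:

import numpy as np

def is_np_array_like(data):
    # True for np.ndarray, np.matrix, and any other object exporting __array_interface__
    return hasattr(data, "__array_interface__")

assert is_np_array_like(np.zeros((2, 3)))
assert is_np_array_like(np.matrix([[1.0, 2.0]]))
assert not is_np_array_like([[1.0, 2.0]])  # plain Python lists are dispatched differently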

View File

@@ -78,7 +78,6 @@ from .data import _is_cudf_ser, _is_cupy_array
from .sklearn import (
    XGBClassifier,
    XGBClassifierBase,
-    XGBClassifierMixIn,
    XGBModel,
    XGBRanker,
    XGBRankerMixIn,
@@ -1854,7 +1853,7 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
    "Implementation of the scikit-learn API for XGBoost classification.",
    ["estimators", "model"],
)
-class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBase):
+class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
    # pylint: disable=missing-class-docstring
    async def _fit_async(
        self,
@@ -2036,10 +2035,6 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa
        preds = da.map_blocks(_argmax, pred_probs, drop_axis=1)
        return preds

-    def load_model(self, fname: ModelIn) -> None:
-        super().load_model(fname)
-        self._load_model_attributes(self.get_booster())

@xgboost_model_doc(
    """Implementation of the Scikit-Learn API for XGBoost Ranking.

View File

@@ -164,8 +164,8 @@ def _is_scipy_coo(data: DataType) -> bool:
    return isinstance(data, scipy.sparse.coo_matrix)

-def _is_numpy_array(data: DataType) -> bool:
-    return isinstance(data, (np.ndarray, np.matrix))
+def _is_np_array_like(data: DataType) -> bool:
+    return hasattr(data, "__array_interface__")

def _ensure_np_dtype(
@@ -317,7 +317,6 @@ def pandas_feature_info(
) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
    """Handle feature info for pandas dataframe."""
    import pandas as pd
-    from pandas.api.types import is_categorical_dtype, is_sparse

    # handle feature names
    if feature_names is None and meta is None:
@@ -332,10 +331,10 @@ def pandas_feature_info(
    if feature_types is None and meta is None:
        feature_types = []
        for dtype in data.dtypes:
-            if is_sparse(dtype):
+            if is_pd_sparse_dtype(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif (
-                is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
+                is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
            ) and enable_categorical:
                feature_types.append(CAT_T)
            else:
@@ -345,18 +344,13 @@ def pandas_feature_info(
def is_nullable_dtype(dtype: PandasDType) -> bool:
    """Whether dtype is a pandas nullable type."""
-    from pandas.api.types import (
-        is_bool_dtype,
-        is_categorical_dtype,
-        is_float_dtype,
-        is_integer_dtype,
-    )
+    from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype

    is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper
    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
    is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
    is_float = is_float_dtype(dtype) and dtype.name in pandas_nullable_mapper
-    return is_int or is_bool or is_float or is_categorical_dtype(dtype)
+    return is_int or is_bool or is_float or is_pd_cat_dtype(dtype)

def is_pa_ext_dtype(dtype: Any) -> bool:
@@ -371,17 +365,48 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool:
    )
def is_pd_cat_dtype(dtype: PandasDType) -> bool:
"""Wrapper for testing pandas category type."""
import pandas as pd
if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
Version = pd.util.version.Version
if Version(pd.__version__) >= Version("2.1.0"):
from pandas import CategoricalDtype
return isinstance(dtype, CategoricalDtype)
from pandas.api.types import is_categorical_dtype
return is_categorical_dtype(dtype)
def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
"""Wrapper for testing pandas sparse type."""
import pandas as pd
if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
Version = pd.util.version.Version
if Version(pd.__version__) >= Version("2.1.0"):
from pandas import SparseDtype
return isinstance(dtype, SparseDtype)
from pandas.api.types import is_sparse
return is_sparse(dtype)
def pandas_cat_null(data: DataFrame) -> DataFrame:
    """Handle categorical dtype and nullable extension types from pandas."""
    import pandas as pd
-    from pandas.api.types import is_categorical_dtype

    # handle category codes and nullable.
    cat_columns = []
    nul_columns = []
    # avoid an unnecessary conversion if possible
    for col, dtype in zip(data.columns, data.dtypes):
-        if is_categorical_dtype(dtype):
+        if is_pd_cat_dtype(dtype):
            cat_columns.append(col)
        elif is_pa_ext_categorical_dtype(dtype):
            raise ValueError(
@@ -398,7 +423,7 @@ def pandas_cat_null(data: DataFrame) -> DataFrame:
        transformed = data

    def cat_codes(ser: pd.Series) -> pd.Series:
-        if is_categorical_dtype(ser.dtype):
+        if is_pd_cat_dtype(ser.dtype):
            return ser.cat.codes
        assert is_pa_ext_categorical_dtype(ser.dtype)
        # Not yet supported, the index is not ordered for some reason. Alternately:
@@ -454,14 +479,12 @@ def _transform_pandas_df(
    meta: Optional[str] = None,
    meta_type: Optional[NumpyDType] = None,
) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
-    from pandas.api.types import is_categorical_dtype, is_sparse

    pyarrow_extension = False
    for dtype in data.dtypes:
        if not (
            (dtype.name in _pandas_dtype_mapper)
-            or is_sparse(dtype)
-            or (is_categorical_dtype(dtype) and enable_categorical)
+            or is_pd_sparse_dtype(dtype)
+            or (is_pd_cat_dtype(dtype) and enable_categorical)
            or is_pa_ext_dtype(dtype)
        ):
            _invalid_dataframe_dtype(data)
@@ -515,9 +538,8 @@ def _meta_from_pandas_series(
) -> None:
    """Help transform pandas series for meta data like labels"""
    data = data.values.astype("float")
-    from pandas.api.types import is_sparse

-    if is_sparse(data):
+    if is_pd_sparse_dtype(getattr(data, "dtype", data)):
        data = data.to_dense()  # type: ignore
    assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
    _meta_from_numpy(data, name, dtype, handle)
@@ -539,13 +561,11 @@ def _from_pandas_series(
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
-    from pandas.api.types import is_categorical_dtype

    if (data.dtype.name not in _pandas_dtype_mapper) and not (
-        is_categorical_dtype(data.dtype) and enable_categorical
+        is_pd_cat_dtype(data.dtype) and enable_categorical
    ):
        _invalid_dataframe_dtype(data)
-    if enable_categorical and is_categorical_dtype(data.dtype):
+    if enable_categorical and is_pd_cat_dtype(data.dtype):
        data = data.cat.codes
    return _from_numpy_array(
        data.values.reshape(data.shape[0], 1).astype("float"),
@@ -1051,7 +1071,7 @@ def dispatch_data_backend(
        return _from_scipy_csr(
            data.tocsr(), missing, threads, feature_names, feature_types
        )
-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
        return _from_numpy_array(
            data, missing, threads, feature_names, feature_types, data_split_mode
        )
@@ -1194,7 +1214,7 @@ def dispatch_meta_backend(
    if _is_tuple(data):
        _meta_from_tuple(data, name, dtype, handle)
        return
-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
        _meta_from_numpy(data, name, dtype, handle)
        return
    if _is_pandas_df(data):
@@ -1281,7 +1301,7 @@ def _proxy_transform(
        return _transform_dlpack(data), None, feature_names, feature_types
    if _is_list(data) or _is_tuple(data):
data = np.array(data) data = np.array(data)
if _is_numpy_array(data): if _is_np_array_like(data):
data, _ = _ensure_np_dtype(data, data.dtype) data, _ = _ensure_np_dtype(data, data.dtype)
return data, None, feature_names, feature_types return data, None, feature_names, feature_types
if _is_scipy_csr(data): if _is_scipy_csr(data):
@ -1331,7 +1351,7 @@ def dispatch_proxy_set_data(
if not allow_host: if not allow_host:
raise err raise err
if _is_numpy_array(data): if _is_np_array_like(data):
_check_data_shape(data) _check_data_shape(data)
proxy._set_data_from_array(data) # pylint: disable=W0212 proxy._set_data_from_array(data) # pylint: disable=W0212
return return


@ -31,16 +31,15 @@ def find_lib_path() -> List[str]:
] ]
if sys.platform == "win32": if sys.platform == "win32":
if platform.architecture()[0] == "64bit": # On Windows, Conda may install libs in different paths
dll_path.append(os.path.join(curr_path, "../../windows/x64/Release/")) dll_path.extend(
# hack for pip installation when copy all parent source [
# directory here os.path.join(sys.base_prefix, "bin"),
dll_path.append(os.path.join(curr_path, "./windows/x64/Release/")) os.path.join(sys.base_prefix, "Library"),
else: os.path.join(sys.base_prefix, "Library", "bin"),
dll_path.append(os.path.join(curr_path, "../../windows/Release/")) os.path.join(sys.base_prefix, "Library", "lib"),
# hack for pip installation when copy all parent source ]
# directory here )
dll_path.append(os.path.join(curr_path, "./windows/Release/"))
dll_path = [os.path.join(p, "xgboost.dll") for p in dll_path] dll_path = [os.path.join(p, "xgboost.dll") for p in dll_path]
elif sys.platform.startswith(("linux", "freebsd", "emscripten")): elif sys.platform.startswith(("linux", "freebsd", "emscripten")):
dll_path = [os.path.join(p, "libxgboost.so") for p in dll_path] dll_path = [os.path.join(p, "libxgboost.so") for p in dll_path]


@ -43,19 +43,6 @@ from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array, _is_pandas_df
from .training import train from .training import train
class XGBClassifierMixIn: # pylint: disable=too-few-public-methods
"""MixIn for classification."""
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
def _load_model_attributes(self, booster: Booster) -> None:
config = json.loads(booster.save_config())
self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"])
# binary classification is treated as regression in XGBoost.
self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_
class XGBRankerMixIn: # pylint: disable=too-few-public-methods class XGBRankerMixIn: # pylint: disable=too-few-public-methods
"""MixIn for ranking, defines the _estimator_type usually defined in scikit-learn """MixIn for ranking, defines the _estimator_type usually defined in scikit-learn
base classes. base classes.
@ -845,21 +832,38 @@ class XGBModel(XGBModelBase):
self.get_booster().load_model(fname) self.get_booster().load_model(fname)
meta_str = self.get_booster().attr("scikit_learn") meta_str = self.get_booster().attr("scikit_learn")
if meta_str is None: if meta_str is not None:
return meta = json.loads(meta_str)
t = meta.get("_estimator_type", None)
if t is not None and t != self._get_type():
raise TypeError(
"Loading an estimator with different type. Expecting: "
f"{self._get_type()}, got: {t}"
)
meta = json.loads(meta_str)
t = meta.get("_estimator_type", None)
if t is not None and t != self._get_type():
raise TypeError(
"Loading an estimator with different type. Expecting: "
f"{self._get_type()}, got: {t}"
)
self.feature_types = self.get_booster().feature_types self.feature_types = self.get_booster().feature_types
self.get_booster().set_attr(scikit_learn=None) self.get_booster().set_attr(scikit_learn=None)
config = json.loads(self.get_booster().save_config())
self._load_model_attributes(config)
load_model.__doc__ = f"""{Booster.load_model.__doc__}""" load_model.__doc__ = f"""{Booster.load_model.__doc__}"""
def _load_model_attributes(self, config: dict) -> None:
"""Load model attributes without hyper-parameters."""
from sklearn.base import is_classifier
booster = self.get_booster()
self.objective = config["learner"]["objective"]["name"]
self.booster = config["learner"]["gradient_booster"]["name"]
self.base_score = config["learner"]["learner_model_param"]["base_score"]
self.feature_types = booster.feature_types
if is_classifier(self):
self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"])
# binary classification is treated as regression in XGBoost.
self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_
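
With this backport, scikit-learn attributes are reconstructed from the booster's saved config on every load_model call instead of relying on the removed classifier mixin, so objective, booster, base_score, feature_types and, for classifiers, n_classes_ survive a save/load round trip. A hedged usage sketch (the file name and random data are illustrative):

import numpy as np
from xgboost import XGBClassifier

X = np.random.rand(64, 4)
y = np.random.randint(0, 3, size=64)

clf = XGBClassifier(n_estimators=5)
clf.fit(X, y)
clf.save_model("clf.json")

restored = XGBClassifier()
restored.load_model("clf.json")
print(restored.n_classes_)  # 3, taken from learner_model_param.num_class
print(restored.objective)   # e.g. "multi:softprob"
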
# pylint: disable=too-many-branches # pylint: disable=too-many-branches
def _configure_fit( def _configure_fit(
self, self,
@ -1409,7 +1413,7 @@ def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) ->
Number of boosting rounds. Number of boosting rounds.
""", """,
) )
class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase): class XGBClassifier(XGBModel, XGBClassifierBase):
# pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes # pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes
@_deprecate_positional_args @_deprecate_positional_args
def __init__( def __init__(
@ -1637,10 +1641,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
def classes_(self) -> np.ndarray: def classes_(self) -> np.ndarray:
return np.arange(self.n_classes_) return np.arange(self.n_classes_)
def load_model(self, fname: ModelIn) -> None:
super().load_model(fname)
self._load_model_attributes(self.get_booster())
@xgboost_model_doc( @xgboost_model_doc(
"scikit-learn API for XGBoost random forest classification.", "scikit-learn API for XGBoost random forest classification.",
@ -2093,7 +2093,17 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
""" """
X, qid = _get_qid(X, None) X, qid = _get_qid(X, None)
Xyq = DMatrix(X, y, qid=qid) # fixme(jiamingy): base margin and group weight is not yet supported. We might
# need to make extra special fields in the dataframe.
Xyq = DMatrix(
X,
y,
qid=qid,
missing=self.missing,
enable_categorical=self.enable_categorical,
nthread=self.n_jobs,
feature_types=self.feature_types,
)
if callable(self.eval_metric): if callable(self.eval_metric):
metric = ltr_metric_decorator(self.eval_metric, self.n_jobs) metric = ltr_metric_decorator(self.eval_metric, self.n_jobs)
result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric) result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric)


@ -22,7 +22,7 @@ from typing import (
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from pyspark import SparkContext, cloudpickle from pyspark import RDD, SparkContext, cloudpickle
from pyspark.ml import Estimator, Model from pyspark.ml import Estimator, Model
from pyspark.ml.functions import array_to_vector, vector_to_array from pyspark.ml.functions import array_to_vector, vector_to_array
from pyspark.ml.linalg import VectorUDT from pyspark.ml.linalg import VectorUDT
@ -44,6 +44,7 @@ from pyspark.ml.util import (
MLWritable, MLWritable,
MLWriter, MLWriter,
) )
from pyspark.resource import ResourceProfileBuilder, TaskResourceRequests
from pyspark.sql import Column, DataFrame from pyspark.sql import Column, DataFrame
from pyspark.sql.functions import col, countDistinct, pandas_udf, rand, struct from pyspark.sql.functions import col, countDistinct, pandas_udf, rand, struct
from pyspark.sql.types import ( from pyspark.sql.types import (
@ -59,11 +60,12 @@ from scipy.special import expit, softmax # pylint: disable=no-name-in-module
import xgboost import xgboost
from xgboost import XGBClassifier from xgboost import XGBClassifier
from xgboost.compat import is_cudf_available from xgboost.compat import is_cudf_available, is_cupy_available
from xgboost.core import Booster, _check_distributed_params from xgboost.core import Booster, _check_distributed_params
from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm
from xgboost.training import train as worker_train from xgboost.training import train as worker_train
from .._typing import ArrayLike
from .data import ( from .data import (
_read_csr_matrix_from_unwrapped_spark_vec, _read_csr_matrix_from_unwrapped_spark_vec,
alias, alias,
@ -87,6 +89,7 @@ from .utils import (
_get_rabit_args, _get_rabit_args,
_get_spark_session, _get_spark_session,
_is_local, _is_local,
_is_standalone_or_localcluster,
deserialize_booster, deserialize_booster,
deserialize_xgb_model, deserialize_xgb_model,
get_class_name, get_class_name,
@ -241,6 +244,13 @@ class _SparkXGBParams(
TypeConverters.toList, TypeConverters.toList,
) )
def set_device(self, value: str) -> "_SparkXGBParams":
"""Set device, optional value: cpu, cuda, gpu"""
_check_distributed_params({"device": value})
assert value in ("cpu", "cuda", "gpu")
self.set(self.device, value)
return self
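
A hedged usage sketch of the new setter on the PySpark estimators (SparkXGBClassifier and the column names below are just for illustration):

from xgboost.spark import SparkXGBClassifier

clf = SparkXGBClassifier(features_col="features", label_col="label", num_workers=2)
clf.set_device("cuda")  # validated against ("cpu", "cuda", "gpu")
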
@classmethod @classmethod
def _xgb_cls(cls) -> Type[XGBModel]: def _xgb_cls(cls) -> Type[XGBModel]:
""" """
@ -334,6 +344,54 @@ class _SparkXGBParams(
predict_params[param.name] = self.getOrDefault(param) predict_params[param.name] = self.getOrDefault(param)
return predict_params return predict_params
def _validate_gpu_params(self) -> None:
"""Validate the gpu parameters and gpu configurations"""
if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
ss = _get_spark_session()
sc = ss.sparkContext
if _is_local(sc):
# Supporting GPU training in Spark local mode is only for debugging
# purposes, so it's okay to print the warning below instead of
# checking the real GPU count and raising an exception.
get_logger(self.__class__.__name__).warning(
"You have enabled GPU in spark local mode. Please make sure your"
" local node has at least %d GPUs",
self.getOrDefault(self.num_workers),
)
else:
executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
if executor_gpus is None:
raise ValueError(
"The `spark.executor.resource.gpu.amount` is required for training"
" on GPU."
)
if not (ss.version >= "3.4.0" and _is_standalone_or_localcluster(sc)):
# We will enable stage-level scheduling in spark 3.4.0+ which doesn't
# require spark.task.resource.gpu.amount to be set explicitly
gpu_per_task = sc.getConf().get("spark.task.resource.gpu.amount")
if gpu_per_task is not None:
if float(gpu_per_task) < 1.0:
raise ValueError(
"XGBoost doesn't support GPU fractional configurations. "
"Please set `spark.task.resource.gpu.amount=spark.executor"
".resource.gpu.amount`"
)
if float(gpu_per_task) > 1.0:
get_logger(self.__class__.__name__).warning(
"%s GPUs for each Spark task is configured, but each "
"XGBoost training task uses only 1 GPU.",
gpu_per_task,
)
else:
raise ValueError(
"The `spark.task.resource.gpu.amount` is required for training"
" on GPU."
)
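
In non-local mode the check above requires spark.executor.resource.gpu.amount, and, when stage-level scheduling is unavailable (Spark older than 3.4 or a cluster manager other than standalone/local-cluster), a non-fractional spark.task.resource.gpu.amount as well. A hedged sketch of session settings that satisfy it (values are illustrative):

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .config("spark.executor.cores", "8")
    .config("spark.executor.resource.gpu.amount", "1")
    # Only needed when stage-level scheduling cannot be used; must not be fractional.
    .config("spark.task.resource.gpu.amount", "1")
    .getOrCreate()
)
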
def _validate_params(self) -> None: def _validate_params(self) -> None:
# pylint: disable=too-many-branches # pylint: disable=too-many-branches
init_model = self.getOrDefault("xgb_model") init_model = self.getOrDefault("xgb_model")
@ -413,53 +471,7 @@ class _SparkXGBParams(
"`pyspark.ml.linalg.Vector` type." "`pyspark.ml.linalg.Vector` type."
) )
if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu): self._validate_gpu_params()
gpu_per_task = (
_get_spark_session()
.sparkContext.getConf()
.get("spark.task.resource.gpu.amount")
)
is_local = _is_local(_get_spark_session().sparkContext)
if is_local:
# checking spark local mode.
if gpu_per_task is not None:
raise RuntimeError(
"The spark local mode does not support gpu configuration."
"Please remove spark.executor.resource.gpu.amount and "
"spark.task.resource.gpu.amount"
)
# Support GPU training in Spark local mode is just for debugging
# purposes, so it's okay for printing the below warning instead of
# checking the real gpu numbers and raising the exception.
get_logger(self.__class__.__name__).warning(
"You have enabled GPU in spark local mode. Please make sure your"
" local node has at least %d GPUs",
self.getOrDefault(self.num_workers),
)
else:
# checking spark non-local mode.
if gpu_per_task is not None:
if float(gpu_per_task) < 1.0:
raise ValueError(
"XGBoost doesn't support GPU fractional configurations. "
"Please set `spark.task.resource.gpu.amount=spark.executor"
".resource.gpu.amount`"
)
if float(gpu_per_task) > 1.0:
get_logger(self.__class__.__name__).warning(
"%s GPUs for each Spark task is configured, but each "
"XGBoost training task uses only 1 GPU.",
gpu_per_task,
)
else:
raise ValueError(
"The `spark.task.resource.gpu.amount` is required for training"
" on GPU."
)
def _validate_and_convert_feature_col_as_float_col_list( def _validate_and_convert_feature_col_as_float_col_list(
@ -584,6 +596,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
arbitrary_params_dict={}, arbitrary_params_dict={},
) )
self.logger = get_logger(self.__class__.__name__)
def setParams(self, **kwargs: Any) -> None: # pylint: disable=invalid-name def setParams(self, **kwargs: Any) -> None: # pylint: disable=invalid-name
""" """
Set params for the estimator. Set params for the estimator.
@ -886,6 +900,116 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
return booster_params, train_call_kwargs_params, dmatrix_kwargs return booster_params, train_call_kwargs_params, dmatrix_kwargs
def _skip_stage_level_scheduling(self) -> bool:
# pylint: disable=too-many-return-statements
"""Check if stage-level scheduling is not needed,
return true to skip stage-level scheduling"""
if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
ss = _get_spark_session()
sc = ss.sparkContext
if ss.version < "3.4.0":
self.logger.info(
"Stage-level scheduling in xgboost requires spark version 3.4.0+"
)
return True
if not _is_standalone_or_localcluster(sc):
self.logger.info(
"Stage-level scheduling in xgboost requires spark standalone or "
"local-cluster mode"
)
return True
executor_cores = sc.getConf().get("spark.executor.cores")
executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
if executor_cores is None or executor_gpus is None:
self.logger.info(
"Stage-level scheduling in xgboost requires spark.executor.cores, "
"spark.executor.resource.gpu.amount to be set."
)
return True
if int(executor_cores) == 1:
# there will be only 1 task running at any time.
self.logger.info(
"Stage-level scheduling in xgboost requires spark.executor.cores > 1 "
)
return True
if int(executor_gpus) > 1:
# For spark.executor.resource.gpu.amount > 1, we assume the user knows how to
# configure it so that xgboost runs successfully.
#
self.logger.info(
"Stage-level scheduling in xgboost will not work "
"when spark.executor.resource.gpu.amount>1"
)
return True
task_gpu_amount = sc.getConf().get("spark.task.resource.gpu.amount")
if task_gpu_amount is None:
# The ETL tasks will not grab a GPU when spark.task.resource.gpu.amount is not set,
# but with stage-level scheduling we can make the training tasks grab the GPU.
return False
if float(task_gpu_amount) == float(executor_gpus):
# spark.executor.resource.gpu.amount == spark.task.resource.gpu.amount
# results in only one task running at a time, which may cause a perf issue.
return True
# We can enable stage-level scheduling
return False
# CPU training doesn't require stage-level scheduling
return True
def _try_stage_level_scheduling(self, rdd: RDD) -> RDD:
"""Try to enable stage-level scheduling"""
if self._skip_stage_level_scheduling():
return rdd
ss = _get_spark_session()
# executor_cores will not be None
executor_cores = ss.sparkContext.getConf().get("spark.executor.cores")
assert executor_cores is not None
# Spark-rapids is a project to leverage GPUs to accelerate spark SQL.
# If spark-rapids is enabled, to avoid GPU OOM, we don't allow other
# ETL gpu tasks running alongside training tasks.
spark_plugins = ss.conf.get("spark.plugins", " ")
assert spark_plugins is not None
spark_rapids_sql_enabled = ss.conf.get("spark.rapids.sql.enabled", "true")
assert spark_rapids_sql_enabled is not None
task_cores = (
int(executor_cores)
if "com.nvidia.spark.SQLPlugin" in spark_plugins
and "true" == spark_rapids_sql_enabled.lower()
else (int(executor_cores) // 2) + 1
)
# Each training task requires executor_cores // 2 + 1 CPU cores, i.e. more than half
# of the executor's cores, which ensures the training tasks land on different executors.
#
# Please note that we can't use GPU to limit the concurrent tasks because of
# https://issues.apache.org/jira/browse/SPARK-45527.
task_gpus = 1.0
treqs = TaskResourceRequests().cpus(task_cores).resource("gpu", task_gpus)
rp = ResourceProfileBuilder().require(treqs).build
self.logger.info(
"XGBoost training tasks require the resource(cores=%s, gpu=%s).",
task_cores,
task_gpus,
)
return rdd.withResources(rp)
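
A worked example of the core-count arithmetic above (numbers are illustrative): with spark.executor.cores=8 and the RAPIDS SQL plugin disabled, each training task asks for 8 // 2 + 1 = 5 cores, so at most one training task fits per 8-core executor; with the plugin enabled, the task asks for all 8 cores so no ETL GPU task can run alongside it.

executor_cores = 8
rapids_sql_enabled = False
task_cores = executor_cores if rapids_sql_enabled else executor_cores // 2 + 1
print(task_cores)  # 5 -> only one 5-core training task fits in an 8-core executor
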
def _fit(self, dataset: DataFrame) -> "_SparkXGBModel": def _fit(self, dataset: DataFrame) -> "_SparkXGBModel":
# pylint: disable=too-many-statements, too-many-locals # pylint: disable=too-many-statements, too-many-locals
self._validate_params() self._validate_params()
@ -986,14 +1110,16 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
) )
def _run_job() -> Tuple[str, str]: def _run_job() -> Tuple[str, str]:
ret = ( rdd = (
dataset.mapInPandas( dataset.mapInPandas(
_train_booster, schema="config string, booster string" # type: ignore _train_booster, # type: ignore
schema="config string, booster string",
) )
.rdd.barrier() .rdd.barrier()
.mapPartitions(lambda x: x) .mapPartitions(lambda x: x)
.collect()[0]
) )
rdd_with_resource = self._try_stage_level_scheduling(rdd)
ret = rdd_with_resource.collect()[0]
return ret[0], ret[1] return ret[0], ret[1]
get_logger("XGBoost-PySpark").info( get_logger("XGBoost-PySpark").info(
@ -1117,12 +1243,111 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
) )
return features_col, feature_col_names return features_col, feature_col_names
def _get_pred_contrib_col_name(self) -> Optional[str]:
"""Return the pred_contrib_col col name"""
pred_contrib_col_name = None
if (
self.isDefined(self.pred_contrib_col)
and self.getOrDefault(self.pred_contrib_col) != ""
):
pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
return pred_contrib_col_name
def _out_schema(self) -> Tuple[bool, str]:
"""Return the bool to indicate if it's a single prediction, true is single prediction,
and the returned type of the user-defined function. The value must
be a DDL-formatted type string."""
if self._get_pred_contrib_col_name() is not None:
return False, f"{pred.prediction} double, {pred.pred_contrib} array<double>"
return True, "double"
def _get_predict_func(self) -> Callable:
"""Return the true prediction function which will be running on the executor side"""
predict_params = self._gen_predict_params_dict()
pred_contrib_col_name = self._get_pred_contrib_col_name()
def _predict(
model: XGBModel, X: ArrayLike, base_margin: Optional[ArrayLike]
) -> Union[pd.DataFrame, pd.Series]:
data = {}
preds = model.predict(
X,
base_margin=base_margin,
validate_features=False,
**predict_params,
)
data[pred.prediction] = pd.Series(preds)
if pred_contrib_col_name is not None:
contribs = pred_contribs(model, X, base_margin)
data[pred.pred_contrib] = pd.Series(list(contribs))
return pd.DataFrame(data=data)
return data[pred.prediction]
return _predict
def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame:
"""Post process of transform"""
prediction_col_name = self.getOrDefault(self.predictionCol)
single_pred, _ = self._out_schema()
if single_pred:
if prediction_col_name:
dataset = dataset.withColumn(prediction_col_name, pred_col)
else:
pred_struct_col = "_prediction_struct"
dataset = dataset.withColumn(pred_struct_col, pred_col)
if prediction_col_name:
dataset = dataset.withColumn(
prediction_col_name, getattr(col(pred_struct_col), pred.prediction)
)
pred_contrib_col_name = self._get_pred_contrib_col_name()
if pred_contrib_col_name is not None:
dataset = dataset.withColumn(
pred_contrib_col_name,
array_to_vector(getattr(col(pred_struct_col), pred.pred_contrib)),
)
dataset = dataset.drop(pred_struct_col)
return dataset
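
With the reworked transform, asking for SHAP-style contributions only changes the UDF output schema; _post_transform then unpacks the struct into the configured columns. A hedged usage sketch (the estimator arguments and data frames are illustrative and assumed to exist):

from xgboost.spark import SparkXGBRegressor

reg = SparkXGBRegressor(
    features_col="features", label_col="label", pred_contrib_col="contribs"
)
model = reg.fit(train_df)       # train_df assumed to exist
out = model.transform(test_df)  # adds "prediction" plus a vector column "contribs"
out.select("prediction", "contribs").show(3)
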
def _gpu_transform(self) -> bool:
"""If gpu is used to do the prediction, true to gpu prediction"""
if _is_local(_get_spark_session().sparkContext):
# in local mode, just follow the internal "device" setting
return use_cuda(self.getOrDefault(self.device))
gpu_per_task = (
_get_spark_session()
.sparkContext.getConf()
.get("spark.task.resource.gpu.amount")
)
# The user didn't set any GPU configuration, so just use the CPU
if gpu_per_task is None:
if use_cuda(self.getOrDefault(self.device)):
get_logger("XGBoost-PySpark").warning(
"Do the prediction on the CPUs since "
"no gpu configurations are set"
)
return False
# User already sets the gpu configurations, we just use the internal "device".
return use_cuda(self.getOrDefault(self.device))
def _transform(self, dataset: DataFrame) -> DataFrame: def _transform(self, dataset: DataFrame) -> DataFrame:
# pylint: disable=too-many-statements, too-many-locals # pylint: disable=too-many-statements, too-many-locals
# Save xgb_sklearn_model and predict_params to be local variable # Save xgb_sklearn_model and predict_params to be local variable
# to avoid the `self` object to be pickled to remote. # to avoid the `self` object to be pickled to remote.
xgb_sklearn_model = self._xgb_sklearn_model xgb_sklearn_model = self._xgb_sklearn_model
predict_params = self._gen_predict_params_dict()
has_base_margin = False has_base_margin = False
if ( if (
@ -1137,79 +1362,92 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
features_col, feature_col_names = self._get_feature_col(dataset) features_col, feature_col_names = self._get_feature_col(dataset)
enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim) enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim)
pred_contrib_col_name = None predict_func = self._get_predict_func()
if (
self.isDefined(self.pred_contrib_col)
and self.getOrDefault(self.pred_contrib_col) != ""
):
pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
single_pred = True _, schema = self._out_schema()
schema = "double"
if pred_contrib_col_name: is_local = _is_local(_get_spark_session().sparkContext)
single_pred = False run_on_gpu = self._gpu_transform()
schema = f"{pred.prediction} double, {pred.pred_contrib} array<double>"
@pandas_udf(schema) # type: ignore @pandas_udf(schema) # type: ignore
def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]: def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
assert xgb_sklearn_model is not None assert xgb_sklearn_model is not None
model = xgb_sklearn_model model = xgb_sklearn_model
from pyspark import TaskContext
context = TaskContext.get()
assert context is not None
dev_ordinal = -1
if is_cudf_available():
if is_local:
if run_on_gpu and is_cupy_available():
import cupy as cp # pylint: disable=import-error
total_gpus = cp.cuda.runtime.getDeviceCount()
if total_gpus > 0:
partition_id = context.partitionId()
# For transform local mode, default the dev_ordinal to
# (partition id) % gpus.
dev_ordinal = partition_id % total_gpus
elif run_on_gpu:
dev_ordinal = _get_gpu_id(context)
if dev_ordinal >= 0:
device = "cuda:" + str(dev_ordinal)
get_logger("XGBoost-PySpark").info(
"Do the inference with device: %s", device
)
model.set_params(device=device)
else:
get_logger("XGBoost-PySpark").info("Do the inference on the CPUs")
else:
msg = (
"CUDF is unavailable, fallback the inference on the CPUs"
if run_on_gpu
else "Do the inference on the CPUs"
)
get_logger("XGBoost-PySpark").info(msg)
def to_gpu_if_possible(data: ArrayLike) -> ArrayLike:
"""Move the data to gpu if possible"""
if dev_ordinal >= 0:
import cudf # pylint: disable=import-error
import cupy as cp # pylint: disable=import-error
# We must set the device after import cudf, which will change the device id to 0
# See https://github.com/rapidsai/cudf/issues/11386
cp.cuda.runtime.setDevice(dev_ordinal) # pylint: disable=I1101
df = cudf.DataFrame(data)
del data
return df
return data
for data in iterator: for data in iterator:
if enable_sparse_data_optim: if enable_sparse_data_optim:
X = _read_csr_matrix_from_unwrapped_spark_vec(data) X = _read_csr_matrix_from_unwrapped_spark_vec(data)
else: else:
if feature_col_names is not None: if feature_col_names is not None:
X = data[feature_col_names] tmp = data[feature_col_names]
else: else:
X = stack_series(data[alias.data]) tmp = stack_series(data[alias.data])
X = to_gpu_if_possible(tmp)
if has_base_margin: if has_base_margin:
base_margin = data[alias.margin].to_numpy() base_margin = to_gpu_if_possible(data[alias.margin])
else: else:
base_margin = None base_margin = None
data = {} yield predict_func(model, X, base_margin)
preds = model.predict(
X,
base_margin=base_margin,
validate_features=False,
**predict_params,
)
data[pred.prediction] = pd.Series(preds)
if pred_contrib_col_name:
contribs = pred_contribs(model, X, base_margin)
data[pred.pred_contrib] = pd.Series(list(contribs))
yield pd.DataFrame(data=data)
else:
yield data[pred.prediction]
if has_base_margin: if has_base_margin:
pred_col = predict_udf(struct(*features_col, base_margin_col)) pred_col = predict_udf(struct(*features_col, base_margin_col))
else: else:
pred_col = predict_udf(struct(*features_col)) pred_col = predict_udf(struct(*features_col))
prediction_col_name = self.getOrDefault(self.predictionCol) return self._post_transform(dataset, pred_col)
if single_pred:
dataset = dataset.withColumn(prediction_col_name, pred_col)
else:
pred_struct_col = "_prediction_struct"
dataset = dataset.withColumn(pred_struct_col, pred_col)
dataset = dataset.withColumn(
prediction_col_name, getattr(col(pred_struct_col), pred.prediction)
)
if pred_contrib_col_name:
dataset = dataset.withColumn(
pred_contrib_col_name,
array_to_vector(getattr(col(pred_struct_col), pred.pred_contrib)),
)
dataset = dataset.drop(pred_struct_col)
return dataset
class _ClassificationModel( # pylint: disable=abstract-method class _ClassificationModel( # pylint: disable=abstract-method
@ -1221,22 +1459,21 @@ class _ClassificationModel( # pylint: disable=abstract-method
.. Note:: This API is experimental. .. Note:: This API is experimental.
""" """
def _transform(self, dataset: DataFrame) -> DataFrame: def _out_schema(self) -> Tuple[bool, str]:
# pylint: disable=too-many-statements, too-many-locals schema = (
# Save xgb_sklearn_model and predict_params to be local variable f"{pred.raw_prediction} array<double>, {pred.prediction} double,"
# to avoid the `self` object to be pickled to remote. f" {pred.probability} array<double>"
xgb_sklearn_model = self._xgb_sklearn_model )
predict_params = self._gen_predict_params_dict() if self._get_pred_contrib_col_name() is not None:
# We will force setting strict_shape to True when predicting contribs,
# So, it will also output 3-D shape result.
schema = f"{schema}, {pred.pred_contrib} array<array<double>>"
has_base_margin = False return False, schema
if (
self.isDefined(self.base_margin_col) def _get_predict_func(self) -> Callable:
and self.getOrDefault(self.base_margin_col) != "" predict_params = self._gen_predict_params_dict()
): pred_contrib_col_name = self._get_pred_contrib_col_name()
has_base_margin = True
base_margin_col = col(self.getOrDefault(self.base_margin_col)).alias(
alias.margin
)
def transform_margin(margins: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: def transform_margin(margins: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
if margins.ndim == 1: if margins.ndim == 1:
@ -1251,76 +1488,38 @@ class _ClassificationModel( # pylint: disable=abstract-method
class_probs = softmax(raw_preds, axis=1) class_probs = softmax(raw_preds, axis=1)
return raw_preds, class_probs return raw_preds, class_probs
features_col, feature_col_names = self._get_feature_col(dataset) def _predict(
enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim) model: XGBModel, X: ArrayLike, base_margin: Optional[np.ndarray]
) -> Union[pd.DataFrame, pd.Series]:
margins = model.predict(
X,
base_margin=base_margin,
output_margin=True,
validate_features=False,
**predict_params,
)
raw_preds, class_probs = transform_margin(margins)
pred_contrib_col_name = None # It seems that they use argmax of class probs,
if ( # not of margin to get the prediction (Note: scala implementation)
self.isDefined(self.pred_contrib_col) preds = np.argmax(class_probs, axis=1)
and self.getOrDefault(self.pred_contrib_col) != "" result: Dict[str, pd.Series] = {
): pred.raw_prediction: pd.Series(list(raw_preds)),
pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col) pred.prediction: pd.Series(preds),
pred.probability: pd.Series(list(class_probs)),
}
schema = ( if pred_contrib_col_name is not None:
f"{pred.raw_prediction} array<double>, {pred.prediction} double," contribs = pred_contribs(model, X, base_margin, strict_shape=True)
f" {pred.probability} array<double>" result[pred.pred_contrib] = pd.Series(list(contribs.tolist()))
)
if pred_contrib_col_name:
# We will force setting strict_shape to True when predicting contribs,
# So, it will also output 3-D shape result.
schema = f"{schema}, {pred.pred_contrib} array<array<double>>"
@pandas_udf(schema) # type: ignore return pd.DataFrame(data=result)
def predict_udf(
iterator: Iterator[Tuple[pd.Series, ...]]
) -> Iterator[pd.DataFrame]:
assert xgb_sklearn_model is not None
model = xgb_sklearn_model
for data in iterator:
if enable_sparse_data_optim:
X = _read_csr_matrix_from_unwrapped_spark_vec(data)
else:
if feature_col_names is not None:
X = data[feature_col_names] # type: ignore
else:
X = stack_series(data[alias.data])
if has_base_margin: return _predict
base_margin = stack_series(data[alias.margin])
else:
base_margin = None
margins = model.predict(
X,
base_margin=base_margin,
output_margin=True,
validate_features=False,
**predict_params,
)
raw_preds, class_probs = transform_margin(margins)
# It seems that they use argmax of class probs,
# not of margin to get the prediction (Note: scala implementation)
preds = np.argmax(class_probs, axis=1)
result: Dict[str, pd.Series] = {
pred.raw_prediction: pd.Series(list(raw_preds)),
pred.prediction: pd.Series(preds),
pred.probability: pd.Series(list(class_probs)),
}
if pred_contrib_col_name:
contribs = pred_contribs(model, X, base_margin, strict_shape=True)
result[pred.pred_contrib] = pd.Series(list(contribs.tolist()))
yield pd.DataFrame(data=result)
if has_base_margin:
pred_struct = predict_udf(struct(*features_col, base_margin_col))
else:
pred_struct = predict_udf(struct(*features_col))
def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame:
pred_struct_col = "_prediction_struct" pred_struct_col = "_prediction_struct"
dataset = dataset.withColumn(pred_struct_col, pred_struct) dataset = dataset.withColumn(pred_struct_col, pred_col)
raw_prediction_col_name = self.getOrDefault(self.rawPredictionCol) raw_prediction_col_name = self.getOrDefault(self.rawPredictionCol)
if raw_prediction_col_name: if raw_prediction_col_name:
@ -1342,7 +1541,8 @@ class _ClassificationModel( # pylint: disable=abstract-method
array_to_vector(getattr(col(pred_struct_col), pred.probability)), array_to_vector(getattr(col(pred_struct_col), pred.probability)),
) )
if pred_contrib_col_name: pred_contrib_col_name = self._get_pred_contrib_col_name()
if pred_contrib_col_name is not None:
dataset = dataset.withColumn( dataset = dataset.withColumn(
pred_contrib_col_name, pred_contrib_col_name,
getattr(col(pred_struct_col), pred.pred_contrib), getattr(col(pred_struct_col), pred.pred_contrib),


@ -10,7 +10,7 @@ from threading import Thread
from typing import Any, Callable, Dict, Optional, Set, Type from typing import Any, Callable, Dict, Optional, Set, Type
import pyspark import pyspark
from pyspark import BarrierTaskContext, SparkContext, SparkFiles from pyspark import BarrierTaskContext, SparkContext, SparkFiles, TaskContext
from pyspark.sql.session import SparkSession from pyspark.sql.session import SparkSession
from xgboost import Booster, XGBModel, collective from xgboost import Booster, XGBModel, collective
@ -129,7 +129,14 @@ def _is_local(spark_context: SparkContext) -> bool:
return spark_context._jsc.sc().isLocal() return spark_context._jsc.sc().isLocal()
def _get_gpu_id(task_context: BarrierTaskContext) -> int: def _is_standalone_or_localcluster(spark_context: SparkContext) -> bool:
master = spark_context.getConf().get("spark.master")
return master is not None and (
master.startswith("spark://") or master.startswith("local-cluster")
)
def _get_gpu_id(task_context: TaskContext) -> int:
"""Get the gpu id from the task resources""" """Get the gpu id from the task resources"""
if task_context is None: if task_context is None:
# This is a safety check. # This is a safety check.


@ -75,3 +75,28 @@ def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:
with pytest.raises(ValueError, match="Either `group` or `qid`."): with pytest.raises(ValueError, match="Either `group` or `qid`."):
ranker.fit(df, y, eval_set=[(X, y)]) ranker.fit(df, y, eval_set=[(X, y)])
def run_ranking_categorical(device: str) -> None:
"""Test LTR with categorical features."""
from sklearn.model_selection import cross_val_score
X, y = tm.make_categorical(
n_samples=512, n_features=10, n_categories=3, onehot=False
)
rng = np.random.default_rng(1994)
qid = rng.choice(3, size=y.shape[0])
qid = np.sort(qid)
X["qid"] = qid
ltr = xgb.XGBRanker(enable_categorical=True, device=device)
ltr.fit(X, y)
score = ltr.score(X, y)
assert score > 0.9
ltr = xgb.XGBRanker(enable_categorical=True, device=device)
# test using the score function inside sklearn.
scores = cross_val_score(ltr, X, y)
for s in scores:
assert s > 0.7


@ -52,7 +52,7 @@ inline XGBOOST_DEVICE bool InvalidCat(float cat) {
* *
* Go to left if it's NOT the matching category, which matches one-hot encoding. * Go to left if it's NOT the matching category, which matches one-hot encoding.
*/ */
inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat) { inline XGBOOST_DEVICE bool Decision(common::Span<CatBitField::value_type const> cats, float cat) {
KCatBitField const s_cats(cats); KCatBitField const s_cats(cats);
if (XGBOOST_EXPECT(InvalidCat(cat), false)) { if (XGBOOST_EXPECT(InvalidCat(cat), false)) {
return true; return true;


@ -3,9 +3,11 @@
*/ */
#include "error_msg.h" #include "error_msg.h"
#include <mutex> // for call_once, once_flag
#include <sstream> // for stringstream #include <sstream> // for stringstream
#include "../collective/communicator-inl.h" // for GetRank #include "../collective/communicator-inl.h" // for GetRank
#include "xgboost/context.h" // for Context
#include "xgboost/logging.h" #include "xgboost/logging.h"
namespace xgboost::error { namespace xgboost::error {
@ -26,34 +28,43 @@ void WarnDeprecatedGPUHist() {
} }
void WarnManualUpdater() { void WarnManualUpdater() {
bool static thread_local logged{false}; static std::once_flag flag;
if (logged) { std::call_once(flag, [] {
return; LOG(WARNING)
} << "You have manually specified the `updater` parameter. The `tree_method` parameter "
LOG(WARNING) "will be ignored. Incorrect sequence of updaters will produce undefined "
<< "You have manually specified the `updater` parameter. The `tree_method` parameter " "behavior. For common uses, we recommend using `tree_method` parameter instead.";
"will be ignored. Incorrect sequence of updaters will produce undefined " });
"behavior. For common uses, we recommend using `tree_method` parameter instead.";
logged = true;
} }
void WarnDeprecatedGPUId() { void WarnDeprecatedGPUId() {
static thread_local bool logged{false}; static std::once_flag flag;
if (logged) { std::call_once(flag, [] {
return; auto msg = DeprecatedFunc("gpu_id", "2.0.0", "device");
} msg += " E.g. device=cpu/cuda/cuda:0";
auto msg = DeprecatedFunc("gpu_id", "2.0.0", "device"); LOG(WARNING) << msg;
msg += " E.g. device=cpu/cuda/cuda:0"; });
LOG(WARNING) << msg;
logged = true;
} }
void WarnEmptyDataset() { void WarnEmptyDataset() {
static thread_local bool logged{false}; static std::once_flag flag;
if (logged) { std::call_once(flag,
return; [] { LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank(); });
} }
LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
logged = true; void MismatchedDevices(Context const* booster, Context const* data) {
static std::once_flag flag;
std::call_once(flag, [&] {
LOG(WARNING)
<< "Falling back to prediction using DMatrix due to mismatched devices. This might "
"lead to higher memory usage and slower performance. XGBoost is running on: "
<< booster->DeviceName() << ", while the input data is on: " << data->DeviceName() << ".\n"
<< R"(Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
This warning will only be shown once.
)";
});
} }
} // namespace xgboost::error } // namespace xgboost::error
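
The consolidated warning now fires once per process via std::call_once rather than once per thread. From Python, a hedged illustration of the situation it describes: a CUDA booster receiving host data through inplace_predict (a CUDA device is assumed to be available):

import numpy as np
import xgboost as xgb

X = np.random.rand(128, 8)
y = np.random.rand(128)
booster = xgb.train({"device": "cuda"}, xgb.DMatrix(X, y), num_boost_round=5)

# Host (numpy) input with a CUDA booster triggers the one-time warning and the
# DMatrix-based fallback; a cupy array on the same device would avoid it.
preds = booster.inplace_predict(X)
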


@ -10,7 +10,8 @@
#include <limits> // for numeric_limits #include <limits> // for numeric_limits
#include <string> // for string #include <string> // for string
#include "xgboost/base.h" // for bst_feature_t #include "xgboost/base.h" // for bst_feature_t
#include "xgboost/context.h" // for Context
#include "xgboost/logging.h" #include "xgboost/logging.h"
#include "xgboost/string_view.h" // for StringView #include "xgboost/string_view.h" // for StringView
@ -94,5 +95,7 @@ constexpr StringView InvalidCUDAOrdinal() {
return "Invalid device. `device` is required to be CUDA and there must be at least one GPU " return "Invalid device. `device` is required to be CUDA and there must be at least one GPU "
"available for using GPU."; "available for using GPU.";
} }
void MismatchedDevices(Context const* booster, Context const* data);
} // namespace xgboost::error } // namespace xgboost::error
#endif // XGBOOST_COMMON_ERROR_MSG_H_ #endif // XGBOOST_COMMON_ERROR_MSG_H_


@ -384,7 +384,8 @@ class PrivateMmapConstStream : public AlignedResourceReadStream {
* @param length See the `length` parameter of `mmap` for details. * @param length See the `length` parameter of `mmap` for details.
*/ */
explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length) explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
: AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {} : AlignedResourceReadStream{std::shared_ptr<MmapResource>{ // NOLINT
new MmapResource{std::move(path), offset, length}}} {}
~PrivateMmapConstStream() noexcept(false) override; ~PrivateMmapConstStream() noexcept(false) override;
}; };


@ -76,7 +76,7 @@ class RefResourceView {
[[nodiscard]] size_type size() const { return size_; } // NOLINT [[nodiscard]] size_type size() const { return size_; } // NOLINT
[[nodiscard]] size_type size_bytes() const { // NOLINT [[nodiscard]] size_type size_bytes() const { // NOLINT
return Span{data(), size()}.size_bytes(); return Span<const value_type>{data(), size()}.size_bytes();
} }
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT [[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT [[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT


@ -3,14 +3,23 @@
*/ */
#include "threading_utils.h" #include "threading_utils.h"
#include <fstream> #include <algorithm> // for max
#include <string> #include <exception> // for exception
#include <filesystem> // for path, exists
#include <fstream> // for ifstream
#include <string> // for string
#include "xgboost/logging.h" #include "common.h" // for DivRoundUp
namespace xgboost { namespace xgboost::common {
namespace common { /**
int32_t GetCfsCPUCount() noexcept { * Modified from
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
*
* MIT License: Copyright (c) 2016 Domagoj Šarić
*/
std::int32_t GetCGroupV1Count(std::filesystem::path const& quota_path,
std::filesystem::path const& peroid_path) {
#if defined(__linux__) #if defined(__linux__)
// https://bugs.openjdk.java.net/browse/JDK-8146115 // https://bugs.openjdk.java.net/browse/JDK-8146115
// http://hg.openjdk.java.net/jdk/hs/rev/7f22774a5f42 // http://hg.openjdk.java.net/jdk/hs/rev/7f22774a5f42
@ -31,8 +40,8 @@ int32_t GetCfsCPUCount() noexcept {
} }
}; };
// complete fair scheduler from Linux // complete fair scheduler from Linux
auto const cfs_quota(read_int("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")); auto const cfs_quota(read_int(quota_path.c_str()));
auto const cfs_period(read_int("/sys/fs/cgroup/cpu/cpu.cfs_period_us")); auto const cfs_period(read_int(peroid_path.c_str()));
if ((cfs_quota > 0) && (cfs_period > 0)) { if ((cfs_quota > 0) && (cfs_period > 0)) {
return std::max(cfs_quota / cfs_period, 1); return std::max(cfs_quota / cfs_period, 1);
} }
@ -40,6 +49,47 @@ int32_t GetCfsCPUCount() noexcept {
return -1; return -1;
} }
std::int32_t GetCGroupV2Count(std::filesystem::path const& bandwidth_path) noexcept(true) {
std::int32_t cnt{-1};
#if defined(__linux__)
namespace fs = std::filesystem;
std::int32_t a{0}, b{0};
auto warn = [] { LOG(WARNING) << "Invalid cgroupv2 file."; };
try {
std::ifstream fin{bandwidth_path, std::ios::in};
fin >> a;
fin >> b;
} catch (std::exception const&) {
warn();
return cnt;
}
if (a > 0 && b > 0) {
cnt = std::max(common::DivRoundUp(a, b), 1);
}
#endif // defined(__linux__)
return cnt;
}
std::int32_t GetCfsCPUCount() noexcept {
namespace fs = std::filesystem;
fs::path const bandwidth_path{"/sys/fs/cgroup/cpu.max"};
auto has_v2 = fs::exists(bandwidth_path);
if (has_v2) {
return GetCGroupV2Count(bandwidth_path);
}
fs::path const quota_path{"/sys/fs/cgroup/cpu/cpu.cfs_quota_us"};
fs::path const peroid_path{"/sys/fs/cgroup/cpu/cpu.cfs_period_us"};
auto has_v1 = fs::exists(quota_path) && fs::exists(peroid_path);
if (has_v1) {
return GetCGroupV1Count(quota_path, peroid_path);
}
return -1;
}
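
The v2 path reads the quota and period fields of /sys/fs/cgroup/cpu.max and returns DivRoundUp(quota, period) clamped to at least 1; a quota of "max" fails the integer parse above and leaves the count at -1. A small sketch of the same arithmetic (Python; file contents are illustrative):

import math

def cfs_cpu_count_v2(cpu_max: str) -> int:
    quota, period = cpu_max.split()
    if not quota.isdigit():       # "max" means no limit is configured
        return -1
    return max(math.ceil(int(quota) / int(period)), 1)

print(cfs_cpu_count_v2("200000 100000"))  # 2
print(cfs_cpu_count_v2("max 100000"))     # -1
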
std::int32_t OmpGetNumThreads(std::int32_t n_threads) { std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
// Don't use parallel if we are in a parallel region. // Don't use parallel if we are in a parallel region.
if (omp_in_parallel()) { if (omp_in_parallel()) {
@ -54,5 +104,4 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
n_threads = std::max(n_threads, 1); n_threads = std::max(n_threads, 1);
return n_threads; return n_threads;
} }
} // namespace common } // namespace xgboost::common
} // namespace xgboost


@ -253,11 +253,6 @@ inline std::int32_t OmpGetThreadLimit() {
* \brief Get thread limit from CFS. * \brief Get thread limit from CFS.
* *
* This function has non-trivial overhead and should not be called repeatedly. * This function has non-trivial overhead and should not be called repeatedly.
*
* Modified from
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
*
* MIT License: Copyright (c) 2016 Domagoj Šarić
*/ */
std::int32_t GetCfsCPUCount() noexcept; std::int32_t GetCfsCPUCount() noexcept;


@ -55,6 +55,7 @@ std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
} }
CHECK(p_fmat) << "Failed to fallback."; CHECK(p_fmat) << "Failed to fallback.";
p_fmat->Info() = proxy->Info().Copy();
return p_fmat; return p_fmat;
} }
} // namespace xgboost::data } // namespace xgboost::data


@ -1,5 +1,5 @@
/*! /**
* Copyright 2014-2022 by XGBoost Contributors * Copyright 2014-2023, XGBoost Contributors
* \file gblinear.cc * \file gblinear.cc
* \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net * \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net
* the update rule is parallel coordinate descent (shotgun) * the update rule is parallel coordinate descent (shotgun)
@ -26,9 +26,9 @@
#include "../common/timer.h" #include "../common/timer.h"
#include "../common/common.h" #include "../common/common.h"
#include "../common/threading_utils.h" #include "../common/threading_utils.h"
#include "../common/error_msg.h"
namespace xgboost { namespace xgboost::gbm {
namespace gbm {
DMLC_REGISTRY_FILE_TAG(gblinear); DMLC_REGISTRY_FILE_TAG(gblinear);
@ -83,7 +83,16 @@ class GBLinear : public GradientBooster {
} }
param_.UpdateAllowUnknown(cfg); param_.UpdateAllowUnknown(cfg);
param_.CheckGPUSupport(); param_.CheckGPUSupport();
updater_.reset(LinearUpdater::Create(param_.updater, ctx_)); if (param_.updater == "gpu_coord_descent") {
LOG(WARNING) << error::DeprecatedFunc("gpu_coord_descent", "2.0.0",
R"(device="cuda", updater="coord_descent")");
}
if (param_.updater == "coord_descent" && ctx_->IsCUDA()) {
updater_.reset(LinearUpdater::Create("gpu_coord_descent", ctx_));
} else {
updater_.reset(LinearUpdater::Create(param_.updater, ctx_));
}
updater_->Configure(cfg); updater_->Configure(cfg);
monitor_.Init("GBLinear"); monitor_.Init("GBLinear");
} }
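
After this change, the GPU linear updater is selected through the device parameter; the old gpu_coord_descent name still works but emits a deprecation warning. A hedged usage sketch (X and y are assumed to exist, and CUDA is assumed to be available):

import xgboost as xgb

params = {
    "booster": "gblinear",
    "updater": "coord_descent",  # dispatched to the GPU implementation when device is CUDA
    "device": "cuda",
}
booster = xgb.train(params, xgb.DMatrix(X, y), num_boost_round=10)  # X, y assumed defined
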
@ -354,5 +363,4 @@ XGBOOST_REGISTER_GBM(GBLinear, "gblinear")
.set_body([](LearnerModelParam const* booster_config, Context const* ctx) { .set_body([](LearnerModelParam const* booster_config, Context const* ctx) {
return new GBLinear(booster_config, ctx); return new GBLinear(booster_config, ctx);
}); });
} // namespace gbm } // namespace xgboost::gbm
} // namespace xgboost


@ -85,25 +85,6 @@ bool UpdatersMatched(std::vector<std::string> updater_seq,
return name == up->Name(); return name == up->Name();
}); });
} }
void MismatchedDevices(Context const* booster, Context const* data) {
bool thread_local static logged{false};
if (logged) {
return;
}
LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. This might "
"lead to higher memory usage and slower performance. XGBoost is running on: "
<< booster->DeviceName() << ", while the input data is on: " << data->DeviceName()
<< ".\n"
<< R"(Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
This warning will only be shown once for each thread. Subsequent warnings made by the
current thread will be suppressed.
)";
logged = true;
}
} // namespace } // namespace
void GBTree::Configure(Args const& cfg) { void GBTree::Configure(Args const& cfg) {
@ -146,14 +127,6 @@ void GBTree::Configure(Args const& cfg) {
if (specified_updater_) { if (specified_updater_) {
error::WarnManualUpdater(); error::WarnManualUpdater();
} }
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf.";
}
LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method); LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
if (!specified_updater_) { if (!specified_updater_) {
@ -225,6 +198,13 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair, void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry* predt, ObjFunction const* obj) { PredictionCacheEntry* predt, ObjFunction const* obj) {
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf.";
}
TreesOneIter new_trees; TreesOneIter new_trees;
bst_target_t const n_groups = model_.learner_model_param->OutputLength(); bst_target_t const n_groups = model_.learner_model_param->OutputLength();
monitor_.Start("BoostNewTrees"); monitor_.Start("BoostNewTrees");
@ -555,7 +535,7 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees."; CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
if (p_m->Ctx()->Device() != this->ctx_->Device()) { if (p_m->Ctx()->Device() != this->ctx_->Device()) {
MismatchedDevices(this->ctx_, p_m->Ctx()); error::MismatchedDevices(this->ctx_, p_m->Ctx());
CHECK_EQ(out_preds->version, 0); CHECK_EQ(out_preds->version, 0);
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m); auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
CHECK(proxy) << error::InplacePredictProxy(); CHECK(proxy) << error::InplacePredictProxy();
@ -808,7 +788,7 @@ class Dart : public GBTree {
auto n_groups = model_.learner_model_param->num_output_group; auto n_groups = model_.learner_model_param->num_output_group;
if (ctx_->Device() != p_fmat->Ctx()->Device()) { if (ctx_->Device() != p_fmat->Ctx()->Device()) {
MismatchedDevices(ctx_, p_fmat->Ctx()); error::MismatchedDevices(ctx_, p_fmat->Ctx());
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_fmat); auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_fmat);
CHECK(proxy) << error::InplacePredictProxy(); CHECK(proxy) << error::InplacePredictProxy();
auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing); auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);


@ -1317,7 +1317,9 @@ class LearnerImpl : public LearnerIO {
if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) { if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) {
metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_)); metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_));
auto config = obj_->DefaultMetricConfig(); auto config = obj_->DefaultMetricConfig();
metrics_.back()->LoadConfig(config); if (!IsA<Null>(config)) {
metrics_.back()->LoadConfig(config);
}
metrics_.back()->Configure({cfg_.begin(), cfg_.end()}); metrics_.back()->Configure({cfg_.begin(), cfg_.end()});
} }


@ -9,8 +9,7 @@
#include "coordinate_common.h" #include "coordinate_common.h"
#include "xgboost/json.h" #include "xgboost/json.h"
namespace xgboost { namespace xgboost::linear {
namespace linear {
DMLC_REGISTER_PARAMETER(CoordinateParam); DMLC_REGISTER_PARAMETER(CoordinateParam);
DMLC_REGISTRY_FILE_TAG(updater_coordinate); DMLC_REGISTRY_FILE_TAG(updater_coordinate);
@ -39,8 +38,9 @@ class CoordinateUpdater : public LinearUpdater {
FromJson(config.at("linear_train_param"), &tparam_); FromJson(config.at("linear_train_param"), &tparam_);
FromJson(config.at("coordinate_param"), &cparam_); FromJson(config.at("coordinate_param"), &cparam_);
} }
void SaveConfig(Json* p_out) const override { void SaveConfig(Json *p_out) const override {
auto& out = *p_out; LOG(DEBUG) << "Save config for CPU updater.";
auto &out = *p_out;
out["linear_train_param"] = ToJson(tparam_); out["linear_train_param"] = ToJson(tparam_);
out["coordinate_param"] = ToJson(cparam_); out["coordinate_param"] = ToJson(cparam_);
} }
@ -99,5 +99,4 @@ class CoordinateUpdater : public LinearUpdater {
XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent") XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent")
.describe("Update linear model according to coordinate descent algorithm.") .describe("Update linear model according to coordinate descent algorithm.")
.set_body([]() { return new CoordinateUpdater(); }); .set_body([]() { return new CoordinateUpdater(); });
} // namespace linear } // namespace xgboost::linear
} // namespace xgboost


@@ -15,8 +15,7 @@
 #include "../common/timer.h"
 #include "./param.h"
-namespace xgboost {
-namespace linear {
+namespace xgboost::linear {
 DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate);
@@ -29,7 +28,7 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate);
 class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
  public:
   // set training parameter
-  void Configure(Args const& args) override {
+  void Configure(Args const &args) override {
     tparam_.UpdateAllowUnknown(args);
     coord_param_.UpdateAllowUnknown(args);
     selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
@@ -41,8 +40,9 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
     FromJson(config.at("linear_train_param"), &tparam_);
     FromJson(config.at("coordinate_param"), &coord_param_);
   }
-  void SaveConfig(Json* p_out) const override {
-    auto& out = *p_out;
+  void SaveConfig(Json *p_out) const override {
+    LOG(DEBUG) << "Save config for GPU updater.";
+    auto &out = *p_out;
     out["linear_train_param"] = ToJson(tparam_);
     out["coordinate_param"] = ToJson(coord_param_);
   }
@@ -101,10 +101,9 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
     monitor_.Stop("LazyInitDevice");
     monitor_.Start("UpdateGpair");
-    auto &in_gpair_host = in_gpair->ConstHostVector();
     // Update gpair
     if (ctx_->gpu_id >= 0) {
-      this->UpdateGpair(in_gpair_host);
+      this->UpdateGpair(in_gpair->ConstHostVector());
     }
     monitor_.Stop("UpdateGpair");
@@ -249,5 +248,4 @@ XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent")
     "Update linear model according to coordinate descent algorithm. GPU "
     "accelerated.")
     .set_body([]() { return new GPUCoordinateUpdater(); });
-}  // namespace linear
-}  // namespace xgboost
+}  // namespace xgboost::linear

View File

@@ -268,6 +268,13 @@ class PseudoHuberRegression : public FitIntercept {
     }
     FromJson(in["pseudo_huber_param"], &param_);
   }
+  [[nodiscard]] Json DefaultMetricConfig() const override {
+    CHECK(param_.GetInitialised());
+    Json config{Object{}};
+    config["name"] = String{this->DefaultEvalMetric()};
+    config["pseudo_huber_param"] = ToJson(param_);
+    return config;
+  }
 };

 XGBOOST_REGISTER_OBJECTIVE(PseudoHuberRegression, "reg:pseudohubererror")

View File

@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020-2022 by XGBoost Contributors
+/**
+ * Copyright 2020-2023, XGBoost Contributors
  */
 #include <algorithm>  // std::max
 #include <vector>
@@ -11,9 +11,7 @@
 #include "evaluate_splits.cuh"
 #include "expand_entry.cuh"
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 // With constraints
 XGBOOST_DEVICE float LossChangeMissing(const GradientPairInt64 &scan,
                                        const GradientPairInt64 &missing,
@@ -315,11 +313,11 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
                                     common::Span<common::CatBitField::value_type> out,
                                     DeviceSplitCandidate *p_out_split) {
   auto &out_split = *p_out_split;
-  out_split.split_cats = common::CatBitField{out};
+  auto out_cats = common::CatBitField{out};
   // Simple case for one hot split
   if (common::UseOneHot(shared_inputs.FeatureBins(fidx), shared_inputs.param.max_cat_to_onehot)) {
-    out_split.split_cats.Set(common::AsCat(out_split.thresh));
+    out_cats.Set(common::AsCat(out_split.thresh));
     return;
   }
@@ -339,7 +337,7 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
     assert(partition > 0 && "Invalid partition.");
     thrust::for_each(thrust::seq, beg, beg + partition, [&](size_t c) {
       auto cat = shared_inputs.feature_values[c - node_offset];
-      out_split.SetCat(cat);
+      out_cats.Set(common::AsCat(cat));
     });
   }
@@ -427,8 +425,7 @@ void GPUHistEvaluator::EvaluateSplits(
     if (split.is_cat) {
       SetCategoricalSplit(shared_inputs, d_sorted_idx, fidx, i,
-                          device_cats_accessor.GetNodeCatStorage(input.nidx),
-                          &out_splits[i]);
+                          device_cats_accessor.GetNodeCatStorage(input.nidx), &out_splits[i]);
     }
     float base_weight =
@@ -460,6 +457,4 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
                                 cudaMemcpyDeviceToHost));
   return root_entry;
 }
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree

View File

@@ -37,8 +37,8 @@ struct EvaluateSplitSharedInputs {
   common::Span<const float> feature_values;
   common::Span<const float> min_fvalue;
   bool is_dense;
-  XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
-  __device__ auto FeatureBins(bst_feature_t fidx) const {
+  [[nodiscard]] XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
+  [[nodiscard]] __device__ std::uint32_t FeatureBins(bst_feature_t fidx) const {
     return feature_segments[fidx + 1] - feature_segments[fidx];
   }
 };
@@ -102,7 +102,7 @@ class GPUHistEvaluator {
   }
   /**
-   * \brief Get device category storage of nidx for internal calculation.
+   * @brief Get device category storage of nidx for internal calculation.
    */
   auto DeviceCatStorage(const std::vector<bst_node_t> &nidx) {
     if (!has_categoricals_) return CatAccessor{};
@@ -117,8 +117,8 @@ class GPUHistEvaluator {
   /**
    * \brief Get sorted index storage based on the left node of inputs.
    */
-  auto SortedIdx(int num_nodes, bst_feature_t total_bins) {
-    if(!need_sort_histogram_) return common::Span<bst_feature_t>();
+  auto SortedIdx(int num_nodes, bst_bin_t total_bins) {
+    if (!need_sort_histogram_) return common::Span<bst_feature_t>{};
     cat_sorted_idx_.resize(num_nodes * total_bins);
     return dh::ToSpan(cat_sorted_idx_);
   }
@@ -142,12 +142,22 @@ class GPUHistEvaluator {
    * \brief Get host category storage for nidx. Different from the internal version, this
    * returns strictly 1 node.
    */
-  common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
+  [[nodiscard]] common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
     copy_stream_.View().Sync();
     auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(
         nidx * node_categorical_storage_size_, node_categorical_storage_size_);
     return cats_out;
   }
+  [[nodiscard]] auto GetDeviceNodeCats(bst_node_t nidx) {
+    copy_stream_.View().Sync();
+    if (has_categoricals_) {
+      CatAccessor accessor = {dh::ToSpan(split_cats_), node_categorical_storage_size_};
+      return common::KCatBitField{accessor.GetNodeCatStorage(nidx)};
+    } else {
+      return common::KCatBitField{};
+    }
+  }
   /**
    * \brief Add a split to the internal tree evaluator.
    */

View File

@@ -64,7 +64,6 @@ struct DeviceSplitCandidate {
   // split.
   bst_cat_t thresh{-1};
-  common::CatBitField split_cats;
   bool is_cat { false };

   GradientPairInt64 left_sum;
@@ -72,12 +71,6 @@ struct DeviceSplitCandidate {
   XGBOOST_DEVICE DeviceSplitCandidate() {}  // NOLINT
-  template <typename T>
-  XGBOOST_DEVICE void SetCat(T c) {
-    this->split_cats.Set(common::AsCat(c));
-    fvalue = std::max(this->fvalue, static_cast<float>(c));
-  }
   XGBOOST_DEVICE void Update(float loss_chg_in, DefaultDirection dir_in, float fvalue_in,
                              int findex_in, GradientPairInt64 left_sum_in,
                              GradientPairInt64 right_sum_in, bool cat,
@@ -100,22 +93,23 @@ struct DeviceSplitCandidate {
   XGBOOST_DEVICE void UpdateCat(float loss_chg_in, DefaultDirection dir_in, bst_cat_t thresh_in,
                                 bst_feature_t findex_in, GradientPairInt64 left_sum_in,
-                                GradientPairInt64 right_sum_in, GPUTrainingParam const& param, const GradientQuantiser& quantiser) {
+                                GradientPairInt64 right_sum_in, GPUTrainingParam const& param,
+                                const GradientQuantiser& quantiser) {
     if (loss_chg_in > loss_chg &&
         quantiser.ToFloatingPoint(left_sum_in).GetHess() >= param.min_child_weight &&
         quantiser.ToFloatingPoint(right_sum_in).GetHess() >= param.min_child_weight) {
       loss_chg = loss_chg_in;
       dir = dir_in;
       fvalue = std::numeric_limits<float>::quiet_NaN();
       thresh = thresh_in;
       is_cat = true;
       left_sum = left_sum_in;
       right_sum = right_sum_in;
       findex = findex_in;
     }
   }
-  XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }
+  [[nodiscard]] XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }

   friend std::ostream& operator<<(std::ostream& os, DeviceSplitCandidate const& c) {
     os << "loss_chg:" << c.loss_chg << ", "
View File

@@ -7,9 +7,9 @@
 #include <algorithm>
 #include <cmath>
-#include <limits>
-#include <memory>
-#include <utility>
+#include <cstddef>  // for size_t
+#include <memory>   // for unique_ptr, make_unique
+#include <utility>  // for move
 #include <vector>

 #include "../collective/communicator-inl.cuh"
@@ -216,9 +216,9 @@ struct GPUHistMakerDevice {
   void InitFeatureGroupsOnce() {
     if (!feature_groups) {
       CHECK(page);
-      feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
-                                             dh::MaxSharedMemoryOptin(ctx_->gpu_id),
-                                             sizeof(GradientPairPrecise)));
+      feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
+                                                       dh::MaxSharedMemoryOptin(ctx_->gpu_id),
+                                                       sizeof(GradientPairPrecise));
     }
   }
@@ -244,10 +244,10 @@ struct GPUHistMakerDevice {
     this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id);
-    quantiser.reset(new GradientQuantiser(this->gpair));
+    quantiser = std::make_unique<GradientQuantiser>(this->gpair);
     row_partitioner.reset();  // Release the device memory first before reallocating
-    row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
+    row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows);
     // Init histogram
     hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
@@ -294,7 +294,7 @@ struct GPUHistMakerDevice {
     dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
     // Store the feature set ptrs so they dont go out of scope before the kernel is called
     std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_sets;
-    for (size_t i = 0; i < candidates.size(); i++) {
+    for (std::size_t i = 0; i < candidates.size(); i++) {
       auto candidate = candidates.at(i);
       int left_nidx = tree[candidate.nid].LeftChild();
       int right_nidx = tree[candidate.nid].RightChild();
@@ -327,14 +327,13 @@ struct GPUHistMakerDevice {
         d_node_inputs.data().get(), h_node_inputs.data(),
         h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
-    this->evaluator_.EvaluateSplits(nidx, max_active_features,
-                                    dh::ToSpan(d_node_inputs), shared_inputs,
-                                    dh::ToSpan(entries));
+    this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
+                                    shared_inputs, dh::ToSpan(entries));
     dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
                                   entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
                                   cudaMemcpyDeviceToHost));
     dh::DefaultStream().Sync();
   }

   void BuildHist(int nidx) {
     auto d_node_hist = hist.GetNodeHistogram(nidx);
@@ -366,23 +365,29 @@ struct GPUHistMakerDevice {
   struct NodeSplitData {
     RegTree::Node split_node;
     FeatureType split_type;
-    common::CatBitField node_cats;
+    common::KCatBitField node_cats;
   };

-  void UpdatePosition(const std::vector<GPUExpandEntry>& candidates, RegTree* p_tree) {
-    if (candidates.empty()) return;
-    std::vector<int> nidx(candidates.size());
-    std::vector<int> left_nidx(candidates.size());
-    std::vector<int> right_nidx(candidates.size());
+  void UpdatePosition(std::vector<GPUExpandEntry> const& candidates, RegTree* p_tree) {
+    if (candidates.empty()) {
+      return;
+    }
+    std::vector<bst_node_t> nidx(candidates.size());
+    std::vector<bst_node_t> left_nidx(candidates.size());
+    std::vector<bst_node_t> right_nidx(candidates.size());
     std::vector<NodeSplitData> split_data(candidates.size());
     for (size_t i = 0; i < candidates.size(); i++) {
-      auto& e = candidates[i];
+      auto const& e = candidates[i];
       RegTree::Node split_node = (*p_tree)[e.nid];
       auto split_type = p_tree->NodeSplitType(e.nid);
       nidx.at(i) = e.nid;
       left_nidx.at(i) = split_node.LeftChild();
       right_nidx.at(i) = split_node.RightChild();
-      split_data.at(i) = NodeSplitData{split_node, split_type, e.split.split_cats};
+      split_data.at(i) = NodeSplitData{split_node, split_type, evaluator_.GetDeviceNodeCats(e.nid)};
+      CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
     }

     auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
@@ -390,7 +395,7 @@ struct GPUHistMakerDevice {
         nidx, left_nidx, right_nidx, split_data,
         [=] __device__(bst_uint ridx, const NodeSplitData& data) {
           // given a row index, returns the node id it belongs to
-          bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
+          float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
           // Missing value
           bool go_left = true;
           if (isnan(cut_value)) {
@@ -620,7 +625,6 @@ struct GPUHistMakerDevice {
     CHECK(common::CheckNAN(candidate.split.fvalue));
     std::vector<common::CatBitField::value_type> split_cats;
-    CHECK_GT(candidate.split.split_cats.Bits().size(), 0);
     auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
     auto n_bins_feature = page->Cuts().FeatureBins(candidate.split.findex);
     split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature), 0);
View File

@@ -8,13 +8,18 @@ echo "--- Build XGBoost JVM packages scala 2.12"
 tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
   ${SPARK_VERSION}

+echo "--- Stash XGBoost4J JARs (Scala 2.12)"
+buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
+buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
+buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"
+buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar"
+
 echo "--- Build XGBoost JVM packages scala 2.13"
 tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
   ${SPARK_VERSION} "" "" "true"

-echo "--- Stash XGBoost4J JARs"
+echo "--- Stash XGBoost4J JARs (Scala 2.13)"
 buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
 buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
 buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"

View File

@@ -0,0 +1,8 @@
steps:
- block: ":rocket: Run this test job"
if: build.pull_request.id != null || build.branch =~ /^dependabot\//
- label: ":macos: Build and Test XGBoost for MacOS M1 with Clang 11"
command: "tests/buildkite/test-macos-m1-clang11.sh"
key: mac-m1-appleclang11
agents:
queue: mac-mini-m1

View File

@@ -0,0 +1,50 @@
#!/bin/bash
set -euo pipefail
source tests/buildkite/conftest.sh
# Display system info
echo "--- Display system information"
set -x
system_profiler SPSoftwareDataType
sysctl -n machdep.cpu.brand_string
uname -m
set +x
# Build XGBoost4J binary
echo "--- Build libxgboost4j.dylib"
set -x
mkdir build
pushd build
export JAVA_HOME=$(/usr/libexec/java_home)
cmake .. -GNinja -DJVM_BINDINGS=ON -DUSE_OPENMP=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=10.15
ninja -v
popd
rm -rf build
set +x
echo "--- Upload Python wheel"
set -x
pushd lib
mv -v libxgboost4j.dylib libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib
buildkite-agent artifact upload libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
aws s3 cp libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib \
s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \
--acl public-read --no-progress
fi
popd
set +x
# Ensure that XGBoost can be built with Clang 11
echo "--- Build and Test XGBoost with MacOS M1, Clang 11"
set -x
LLVM11_PATH=$(brew --prefix llvm\@11)
mkdir build
pushd build
cmake .. -GNinja -DCMAKE_C_COMPILER=${LLVM11_PATH}/bin/clang \
-DCMAKE_CXX_COMPILER=${LLVM11_PATH}/bin/clang++ -DGOOGLE_TEST=ON \
-DUSE_DMLC_GTEST=ON
ninja -v

View File

@@ -1,5 +1,5 @@
 ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
+FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
 ARG CUDA_VERSION_ARG
 ARG NCCL_VERSION_ARG
 ARG RAPIDS_VERSION_ARG

View File

@@ -1,5 +1,5 @@
 ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
+FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
 ARG CUDA_VERSION_ARG

 # Install all basic requirements

View File

@@ -1,5 +1,5 @@
 ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
+FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
 ARG CUDA_VERSION_ARG
 ARG NCCL_VERSION_ARG

View File

@@ -27,6 +27,9 @@ fi
 mvn_profile_string=""
 if [ "x$use_scala213" != "x" ]; then
   export mvn_profile_string="-Pdefault,scala-2.13"
+  cd ..
+  python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
+  cd jvm-packages
 fi

 mvn --no-transfer-progress package $mvn_profile_string -Dspark.version=${spark_version} $gpu_options

View File

@@ -32,11 +32,10 @@ dependencies:
 - jsonschema
 - boto3
 - awscli
-- py-ubjson
 - cffi
 - pyarrow
 - pyspark>=3.4.0
 - cloudpickle
 - pip:
   - sphinx_rtd_theme
-  - datatable
+  - py-ubjson

View File

@@ -27,6 +27,9 @@ rm -rf ../build/
 # Deploy to S3 bucket xgboost-maven-repo
 mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests

 # Deploy scala 2.13 to S3 bucket xgboost-maven-repo
+cd ..
+python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
+cd jvm-packages/
 mvn --no-transfer-progress package deploy -P release-to-s3,default,scala-2.13 -Dspark.version=${spark_version} -DskipTests

View File

@@ -21,9 +21,18 @@ if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
 fi

 # including maven profiles for different scala versions: 2.12 is the default at the moment.
-for _maven_profile_string in "" "-Pdefault,scala-2.13"; do
+for scala_binary_version in "2.12" "2.13"; do
+  cd ..
+  python dev/change_scala_version.py --scala-version ${scala_binary_version}
+  cd jvm-packages
   scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout)
-  scala_binary_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.binary.version -q -DforceStdout)
+  if [[ "$scala_binary_version" == "2.12" ]]; then
+    _maven_profile_string=""
+  elif [[ "$scala_binary_version" == "2.13" ]]; then
+    _maven_profile_string="-Pdefault,scala-2.13"
+  else
+    echo "Unexpected scala version: $scala_version ($scala_binary_version)."
+  fi

   # Install XGBoost4J JAR into local Maven repository
   mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar

View File

@@ -148,7 +148,8 @@ TEST(IO, Resource) {
   fout << 1.0 << std::endl;
   fout.close();

-  auto resource = std::make_shared<MmapResource>(path, 0, sizeof(double));
+  auto resource = std::shared_ptr<MmapResource>{
+      new MmapResource{path, 0, sizeof(double)}};
   ASSERT_EQ(resource->Size(), sizeof(double));
   ASSERT_EQ(resource->Type(), ResourceHandler::kMmap);
   ASSERT_EQ(resource->DataAs<double>()[0], val);

View File

@@ -0,0 +1,42 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/global_config.h> // for GlobalConfigThreadLocalStore
#include <xgboost/json.h> // for Json, Object
#include <xgboost/learner.h> // for Learner
#include <algorithm> // for transform
#include <string> // for string
#include <utility> // for swap
#include "../helpers.h" // for RandomDataGenerator
namespace xgboost {
TEST(GBlinear, DispatchUpdater) {
auto verbosity = 3;
std::swap(GlobalConfigThreadLocalStore::Get()->verbosity, verbosity);
auto test = [](std::string device) {
auto p_fmat = RandomDataGenerator{10, 10, 0.0f}.GenerateDMatrix(true);
std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
learner->SetParams(
Args{{"booster", "gblinear"}, {"updater", "coord_descent"}, {"device", device}});
learner->Configure();
for (std::int32_t iter = 0; iter < 3; ++iter) {
learner->UpdateOneIter(iter, p_fmat);
}
Json config{Object{}};
::testing::internal::CaptureStderr();
learner->SaveConfig(&config);
auto str = ::testing::internal::GetCapturedStderr();
std::transform(device.cbegin(), device.cend(), device.begin(),
[](char c) { return std::toupper(c); });
ASSERT_NE(str.find(device), std::string::npos);
};
test("cpu");
test("gpu");
std::swap(GlobalConfigThreadLocalStore::Get()->verbosity, verbosity);
}
} // namespace xgboost

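For reference, the parameters the dispatch test exercises correspond to the public Python API; a minimal sketch under that assumption (synthetic data, values are illustrative only):

    import numpy as np
    import xgboost as xgb

    X = np.random.randn(32, 8)
    y = np.random.randn(32)
    # gblinear with coordinate descent; in this branch the updater is dispatched
    # to the CPU or GPU implementation based on the "device" parameter.
    booster = xgb.train(
        {"booster": "gblinear", "updater": "coord_descent", "device": "cpu"},
        xgb.DMatrix(X, label=y),
        num_boost_round=3,
    )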
View File

@@ -58,21 +58,6 @@ void TestInplaceFallback(Context const* ctx) {
   HostDeviceVector<float>* out_predt{nullptr};
   ConsoleLogger::Configure(Args{{"verbosity", "1"}});
   std::string output;
-  // test whether the warning is raised
-#if !defined(_WIN32)
-  // Windows has issue with CUDA and thread local storage. For some reason, on Windows a
-  // cudaInitializationError is raised during destruction of `HostDeviceVector`. This
-  // might be related to https://github.com/dmlc/xgboost/issues/5793
-  ::testing::internal::CaptureStderr();
-  std::thread{[&] {
-    // Launch a new thread to ensure a warning is raised as we prevent over-verbose
-    // warning by using thread-local flags.
-    learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
-                            &out_predt, 0, 0);
-  }}.join();
-  output = testing::internal::GetCapturedStderr();
-  ASSERT_NE(output.find("Falling back"), std::string::npos);
-#endif
   learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
                           &out_predt, 0, 0);

View File

@@ -6,6 +6,7 @@
 #include <xgboost/objective.h>

 #include "../helpers.h"
+#include "../objective_helpers.h"

 TEST(Objective, UnknownFunction) {
   xgboost::ObjFunction* obj = nullptr;
@@ -43,4 +44,61 @@ TEST(Objective, PredTransform) {
     ASSERT_TRUE(predts.HostCanWrite());
   }
 }
+
+class TestDefaultObjConfig : public ::testing::TestWithParam<std::string> {
+  Context ctx_;
+
+ public:
+  void Run(std::string objective) {
+    auto Xy = MakeFmatForObjTest(objective);
+    std::unique_ptr<Learner> learner{Learner::Create({Xy})};
+    std::unique_ptr<ObjFunction> objfn{ObjFunction::Create(objective, &ctx_)};
+
+    learner->SetParam("objective", objective);
+    if (objective.find("multi") != std::string::npos) {
+      learner->SetParam("num_class", "3");
+      objfn->Configure(Args{{"num_class", "3"}});
+    } else if (objective.find("quantile") != std::string::npos) {
+      learner->SetParam("quantile_alpha", "0.5");
+      objfn->Configure(Args{{"quantile_alpha", "0.5"}});
+    } else {
+      objfn->Configure(Args{});
+    }
+    learner->Configure();
+    learner->UpdateOneIter(0, Xy);
+    learner->EvalOneIter(0, {Xy}, {"train"});
+    Json config{Object{}};
+    learner->SaveConfig(&config);
+
+    auto jobj = get<Object const>(config["learner"]["objective"]);
+    ASSERT_TRUE(jobj.find("name") != jobj.cend());
+    // FIXME(jiamingy): We should have the following check, but some legacy parameter like
+    // "pos_weight", "delta_step" in objectives are not in metrics.
+    // if (jobj.size() > 1) {
+    //   ASSERT_FALSE(IsA<Null>(objfn->DefaultMetricConfig()));
+    // }
+    auto mconfig = objfn->DefaultMetricConfig();
+    if (!IsA<Null>(mconfig)) {
+      // make sure metric can handle it
+      std::unique_ptr<Metric> metricfn{Metric::Create(get<String const>(mconfig["name"]), &ctx_)};
+      metricfn->LoadConfig(mconfig);
+      Json loaded(Object{});
+      metricfn->SaveConfig(&loaded);
+      metricfn->Configure(Args{});
+      ASSERT_EQ(mconfig, loaded);
+    }
+  }
+};
+
+TEST_P(TestDefaultObjConfig, Objective) {
+  std::string objective = GetParam();
+  this->Run(objective);
+}
+
+INSTANTIATE_TEST_SUITE_P(Objective, TestDefaultObjConfig,
+                         ::testing::ValuesIn(MakeObjNamesForTest()),
+                         [](const ::testing::TestParamInfo<TestDefaultObjConfig::ParamType>& info) {
+                           return ObjTestNameGenerator(info);
+                         });
 }  // namespace xgboost

View File

@@ -0,0 +1,31 @@
/**
* Copyright (c) 2023, XGBoost contributors
*/
#include "objective_helpers.h"
#include "../../src/common/linalg_op.h" // for begin, end
#include "helpers.h" // for RandomDataGenerator
namespace xgboost {
std::shared_ptr<DMatrix> MakeFmatForObjTest(std::string const& obj) {
auto constexpr kRows = 10, kCols = 10;
auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
auto& h_upper = p_fmat->Info().labels_upper_bound_.HostVector();
auto& h_lower = p_fmat->Info().labels_lower_bound_.HostVector();
h_lower.resize(kRows);
h_upper.resize(kRows);
for (size_t i = 0; i < kRows; ++i) {
h_lower[i] = 1;
h_upper[i] = 10;
}
if (obj.find("rank:") != std::string::npos) {
auto h_label = p_fmat->Info().labels.HostView();
std::size_t k = 0;
for (auto& v : h_label) {
v = k % 2 == 0;
++k;
}
}
return p_fmat;
};
} // namespace xgboost

View File

@@ -1,6 +1,8 @@
 /**
  * Copyright (c) 2023, XGBoost contributors
  */
+#pragma once
+
 #include <dmlc/registry.h>  // for Registry
 #include <gtest/gtest.h>
 #include <xgboost/objective.h>  // for ObjFunctionReg
@@ -29,4 +31,6 @@ inline std::string ObjTestNameGenerator(const ::testing::TestParamInfo<ParamType
   }
   return name;
 };
+
+std::shared_ptr<DMatrix> MakeFmatForObjTest(std::string const& obj);
 }  // namespace xgboost

View File

@@ -655,33 +655,11 @@ TEST_F(InitBaseScore, InitWithPredict) { this->TestInitWithPredt(); }
 TEST_F(InitBaseScore, UpdateProcess) { this->TestUpdateProcess(); }

 class TestColumnSplit : public ::testing::TestWithParam<std::string> {
-  static auto MakeFmat(std::string const& obj) {
-    auto constexpr kRows = 10, kCols = 10;
-    auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
-    auto& h_upper = p_fmat->Info().labels_upper_bound_.HostVector();
-    auto& h_lower = p_fmat->Info().labels_lower_bound_.HostVector();
-    h_lower.resize(kRows);
-    h_upper.resize(kRows);
-    for (size_t i = 0; i < kRows; ++i) {
-      h_lower[i] = 1;
-      h_upper[i] = 10;
-    }
-    if (obj.find("rank:") != std::string::npos) {
-      auto h_label = p_fmat->Info().labels.HostView();
-      std::size_t k = 0;
-      for (auto& v : h_label) {
-        v = k % 2 == 0;
-        ++k;
-      }
-    }
-    return p_fmat;
-  };
-
   void TestBaseScore(std::string objective, float expected_base_score, Json expected_model) {
     auto const world_size = collective::GetWorldSize();
     auto const rank = collective::GetRank();
-    auto p_fmat = MakeFmat(objective);
+    auto p_fmat = MakeFmatForObjTest(objective);
     std::shared_ptr<DMatrix> sliced{p_fmat->SliceCol(world_size, rank)};
     std::unique_ptr<Learner> learner{Learner::Create({sliced})};
     learner->SetParam("tree_method", "approx");
@@ -705,7 +683,7 @@ class TestColumnSplit : public ::testing::TestWithParam<std::string> {
  public:
   void Run(std::string objective) {
-    auto p_fmat = MakeFmat(objective);
+    auto p_fmat = MakeFmatForObjTest(objective);
     std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
     learner->SetParam("tree_method", "approx");
     learner->SetParam("objective", objective);

View File

@@ -191,14 +191,32 @@ class TestGPUPredict:
         np.testing.assert_allclose(predt_0, predt_3)
         np.testing.assert_allclose(predt_0, predt_4)

-    def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
+    def run_inplace_base_margin(
+        self, device: int, booster: xgb.Booster, dtrain: xgb.DMatrix, X, base_margin
+    ) -> None:
         import cupy as cp

+        booster.set_param({"device": f"cuda:{device}"})
         dtrain.set_info(base_margin=base_margin)
         from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
         from_dmatrix = booster.predict(dtrain)
         cp.testing.assert_allclose(from_inplace, from_dmatrix)

+        booster = booster.copy()  # clear prediction cache.
+        booster.set_param({"device": "cpu"})
+        from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
+        from_dmatrix = booster.predict(dtrain)
+        cp.testing.assert_allclose(from_inplace, from_dmatrix)
+
+        booster = booster.copy()  # clear prediction cache.
+        base_margin = cp.asnumpy(base_margin)
+        if hasattr(X, "values"):
+            X = cp.asnumpy(X.values)
+        booster.set_param({"device": f"cuda:{device}"})
+        from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
+        from_dmatrix = booster.predict(dtrain)
+        cp.testing.assert_allclose(from_inplace, from_dmatrix, rtol=1e-6)
+
     def run_inplace_predict_cupy(self, device: int) -> None:
         import cupy as cp
@@ -244,7 +262,7 @@ class TestGPUPredict:
         run_threaded_predict(X, rows, predict_dense)

         base_margin = cp_rng.randn(rows)
-        self.run_inplace_base_margin(booster, dtrain, X, base_margin)
+        self.run_inplace_base_margin(device, booster, dtrain, X, base_margin)

         # Create a wide dataset
         X = cp_rng.randn(100, 10000)
@@ -318,7 +336,7 @@ class TestGPUPredict:
         run_threaded_predict(X, rows, predict_df)

         base_margin = cudf.Series(rng.randn(rows))
-        self.run_inplace_base_margin(booster, dtrain, X, base_margin)
+        self.run_inplace_base_margin(0, booster, dtrain, X, base_margin)

     @given(
         strategies.integers(1, 10), tm.make_dataset_strategy(), shap_parameter_strategy

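The test above checks that inplace prediction and DMatrix-based prediction agree when both receive the same base margin. A minimal CPU-only sketch of that invariant with the public API (synthetic data; the tolerance is illustrative):

    import numpy as np
    import xgboost as xgb

    X = np.random.randn(64, 4)
    y = np.random.randn(64)
    booster = xgb.train({"tree_method": "hist"}, xgb.DMatrix(X, label=y), num_boost_round=4)

    # base_margin shifts the raw prediction; both prediction paths should agree.
    margin = np.full(X.shape[0], 0.5, dtype=np.float32)
    from_inplace = booster.inplace_predict(X, base_margin=margin)
    from_dmatrix = booster.predict(xgb.DMatrix(X, base_margin=margin))
    np.testing.assert_allclose(from_inplace, from_dmatrix, rtol=1e-6)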
View File

@@ -9,7 +9,7 @@ import pytest
 import xgboost as xgb
 from xgboost import testing as tm
-from xgboost.testing.ranking import run_ranking_qid_df
+from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df

 sys.path.append("tests/python")
 import test_with_sklearn as twskl  # noqa
@@ -165,6 +165,11 @@ def test_ranking_qid_df():
     run_ranking_qid_df(cudf, "gpu_hist")


+@pytest.mark.skipif(**tm.no_pandas())
+def test_ranking_categorical() -> None:
+    run_ranking_categorical(device="cuda")
+
+
 @pytest.mark.skipif(**tm.no_cupy())
 @pytest.mark.mgpu
 def test_device_ordinal() -> None:

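run_ranking_categorical is a shared testing helper; the public-API pattern it presumably exercises (the ranker fix backported in this branch) looks roughly like the sketch below. All names, group sizes, and data are made up for illustration:

    import numpy as np
    import pandas as pd
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = pd.DataFrame({
        "f0": rng.normal(size=16),
        "f1": pd.Categorical(rng.choice(["a", "b", "c"], size=16)),
    })
    y = rng.integers(0, 2, size=16)      # binary relevance labels
    qid = np.repeat([0, 1], 8)           # two query groups, already sorted

    ranker = xgb.XGBRanker(tree_method="hist", enable_categorical=True, n_estimators=4)
    ranker.fit(X, y, qid=qid)
    scores = ranker.predict(X)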
View File

@@ -211,7 +211,7 @@ class TestPandas:
         y = np.random.randn(kRows)
         w = np.random.uniform(size=kRows).astype(np.float32)
         w_pd = pd.DataFrame(w)
-        data = xgb.DMatrix(X, y, w_pd)
+        data = xgb.DMatrix(X, y, weight=w_pd)

         assert data.num_row() == kRows
         assert data.num_col() == kCols
@@ -301,14 +301,14 @@ class TestPandas:
     @pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
     def test_nullable_type(self, DMatrixT) -> None:
-        from pandas.api.types import is_categorical_dtype
+        from xgboost.data import is_pd_cat_dtype

         for orig, df in pd_dtypes():
             if hasattr(df.dtypes, "__iter__"):
-                enable_categorical = any(is_categorical_dtype for dtype in df.dtypes)
+                enable_categorical = any(is_pd_cat_dtype(dtype) for dtype in df.dtypes)
             else:
                 # series
-                enable_categorical = is_categorical_dtype(df.dtype)
+                enable_categorical = is_pd_cat_dtype(df.dtype)

             f0_orig = orig[orig.columns[0]] if isinstance(orig, pd.DataFrame) else orig
             f0 = df[df.columns[0]] if isinstance(df, pd.DataFrame) else df

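For context, the usual way to hand pandas categorical columns to XGBoost, which the nullable-type test builds on, is sketched below (synthetic data; column names are illustrative):

    import numpy as np
    import pandas as pd
    import xgboost as xgb

    # Toy frame with one categorical column; enable_categorical lets DMatrix
    # keep the category codes instead of rejecting the dtype.
    df = pd.DataFrame({
        "f0": np.random.randn(8),
        "f1": pd.Categorical(["a", "b", "a", "c", "b", "a", "c", "b"]),
    })
    y = np.random.randn(8)
    dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)
    booster = xgb.train({"tree_method": "hist"}, dtrain, num_boost_round=2)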
View File

@@ -12,7 +12,7 @@ from sklearn.utils.estimator_checks import parametrize_with_checks
 import xgboost as xgb
 from xgboost import testing as tm
-from xgboost.testing.ranking import run_ranking_qid_df
+from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df
 from xgboost.testing.shared import get_feature_weights, validate_data_initialization
 from xgboost.testing.updater import get_basescore
@@ -173,6 +173,11 @@ def test_ranking():
     np.testing.assert_almost_equal(pred, pred_orig)


+@pytest.mark.skipif(**tm.no_pandas())
+def test_ranking_categorical() -> None:
+    run_ranking_categorical(device="cpu")
+
+
 def test_ranking_metric() -> None:
     from sklearn.metrics import roc_auc_score
@@ -935,6 +940,7 @@ def save_load_model(model_path):
     predt_0 = clf.predict(X)
     clf.save_model(model_path)
     clf.load_model(model_path)
+    assert clf.booster == "gblinear"
     predt_1 = clf.predict(X)
     np.testing.assert_allclose(predt_0, predt_1)
     assert clf.best_iteration == best_iteration
@@ -950,25 +956,26 @@ def save_load_model(model_path):
 def test_save_load_model():
     with tempfile.TemporaryDirectory() as tempdir:
-        model_path = os.path.join(tempdir, 'digits.model')
+        model_path = os.path.join(tempdir, "digits.model")
         save_load_model(model_path)

     with tempfile.TemporaryDirectory() as tempdir:
-        model_path = os.path.join(tempdir, 'digits.model.json')
+        model_path = os.path.join(tempdir, "digits.model.json")
         save_load_model(model_path)

     from sklearn.datasets import load_digits
     from sklearn.model_selection import train_test_split

     with tempfile.TemporaryDirectory() as tempdir:
-        model_path = os.path.join(tempdir, 'digits.model.ubj')
+        model_path = os.path.join(tempdir, "digits.model.ubj")
         digits = load_digits(n_class=2)
-        y = digits['target']
-        X = digits['data']
-        booster = xgb.train({'tree_method': 'hist',
-                             'objective': 'binary:logistic'},
-                            dtrain=xgb.DMatrix(X, y),
-                            num_boost_round=4)
+        y = digits["target"]
+        X = digits["data"]
+        booster = xgb.train(
+            {"tree_method": "hist", "objective": "binary:logistic"},
+            dtrain=xgb.DMatrix(X, y),
+            num_boost_round=4,
+        )
         predt_0 = booster.predict(xgb.DMatrix(X))
         booster.save_model(model_path)
         cls = xgb.XGBClassifier()
@@ -1002,6 +1009,8 @@ def test_save_load_model():
         clf = xgb.XGBClassifier()
         clf.load_model(model_path)
         assert clf.classes_.size == 10
+        assert clf.objective == "multi:softprob"
+
         np.testing.assert_equal(clf.classes_, np.arange(10))
         assert clf.n_classes_ == 10

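A minimal sketch of the save/load round trip the test exercises, assuming the attribute-loading fix backported above so that attributes such as the objective survive reloading (paths and sizes are illustrative):

    import os
    import tempfile

    import numpy as np
    import xgboost as xgb
    from sklearn.datasets import load_digits

    X, y = load_digits(n_class=2, return_X_y=True)
    clf = xgb.XGBClassifier(n_estimators=4, tree_method="hist").fit(X, y)

    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "clf.json")
        clf.save_model(path)

        loaded = xgb.XGBClassifier()
        loaded.load_model(path)
        # Estimator attributes are restored from the saved file.
        assert loaded.objective == "binary:logistic"
        np.testing.assert_allclose(clf.predict_proba(X), loaded.predict_proba(X))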
View File

@@ -2,6 +2,7 @@ import json
 import logging
 import subprocess

+import numpy as np
 import pytest
 import sklearn
@@ -13,7 +14,7 @@ from pyspark.ml.linalg import Vectors
 from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
 from pyspark.sql import SparkSession

-from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
+from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor, SparkXGBRegressorModel

 gpu_discovery_script_path = "tests/test_distributed/test_gpu_with_spark/discover_gpu.sh"
@@ -242,3 +243,33 @@ def test_sparkxgb_regressor_feature_cols_with_gpu(spark_diabetes_dataset_feature
     evaluator = RegressionEvaluator(metricName="rmse")
     rmse = evaluator.evaluate(pred_result_df)
     assert rmse <= 65.0
+
+
+def test_gpu_transform(spark_diabetes_dataset) -> None:
+    regressor = SparkXGBRegressor(device="cuda", num_workers=num_workers)
+    train_df, test_df = spark_diabetes_dataset
+    model: SparkXGBRegressorModel = regressor.fit(train_df)
+
+    # The model trained with GPUs, and transform with GPU configurations.
+    assert model._gpu_transform()
+
+    model.set_device("cpu")
+    assert not model._gpu_transform()
+    # without error
+    cpu_rows = model.transform(test_df).select("prediction").collect()
+
+    regressor = SparkXGBRegressor(device="cpu", num_workers=num_workers)
+    model = regressor.fit(train_df)
+    # The model trained with CPUs. Even with GPU configurations,
+    # still prefer transforming with CPUs
+    assert not model._gpu_transform()
+
+    # Set gpu transform explicitly.
+    model.set_device("cuda")
+    assert model._gpu_transform()
+    # without error
+    gpu_rows = model.transform(test_df).select("prediction").collect()
+
+    for cpu, gpu in zip(cpu_rows, gpu_rows):
+        np.testing.assert_allclose(cpu.prediction, gpu.prediction, atol=1e-3)

View File

@@ -1932,6 +1932,7 @@ class TestWithDask:
         cls.client = client
         cls.fit(X, y)
         predt_0 = cls.predict(X)
+        proba_0 = cls.predict_proba(X)

         with tempfile.TemporaryDirectory() as tmpdir:
             path = os.path.join(tmpdir, "model.pkl")
@@ -1941,7 +1942,9 @@ class TestWithDask:
             with open(path, "rb") as fd:
                 cls = pickle.load(fd)
             predt_1 = cls.predict(X)
+            proba_1 = cls.predict_proba(X)
             np.testing.assert_allclose(predt_0.compute(), predt_1.compute())
+            np.testing.assert_allclose(proba_0.compute(), proba_1.compute())

             path = os.path.join(tmpdir, "cls.json")
             cls.save_model(path)
@@ -1950,16 +1953,20 @@ class TestWithDask:
             cls.load_model(path)
             assert cls.n_classes_ == 10
             predt_2 = cls.predict(X)
+            proba_2 = cls.predict_proba(X)
             np.testing.assert_allclose(predt_0.compute(), predt_2.compute())
+            np.testing.assert_allclose(proba_0.compute(), proba_2.compute())

             # Use single node to load
             cls = xgb.XGBClassifier()
             cls.load_model(path)
             assert cls.n_classes_ == 10
             predt_3 = cls.predict(X_)
+            proba_3 = cls.predict_proba(X_)
             np.testing.assert_allclose(predt_0.compute(), predt_3)
+            np.testing.assert_allclose(proba_0.compute(), proba_3)


 def test_dask_unsupported_features(client: "Client") -> None:
View File

@@ -888,6 +888,34 @@ class TestPySparkLocal:
         clf = SparkXGBClassifier(device="cuda")
         clf._validate_params()

+    def test_gpu_transform(self, clf_data: ClfData) -> None:
+        """local mode"""
+        classifier = SparkXGBClassifier(device="cpu")
+        model: SparkXGBClassifierModel = classifier.fit(clf_data.cls_df_train)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = "file:" + tmpdir
+            model.write().overwrite().save(path)
+
+            # The model trained with CPU, transform defaults to cpu
+            assert not model._gpu_transform()
+            # without error
+            model.transform(clf_data.cls_df_test).collect()
+
+            model.set_device("cuda")
+            assert model._gpu_transform()
+
+            model_loaded = SparkXGBClassifierModel.load(path)
+            # The model trained with CPU, transform defaults to cpu
+            assert not model_loaded._gpu_transform()
+            # without error
+            model_loaded.transform(clf_data.cls_df_test).collect()
+
+            model_loaded.set_device("cuda")
+            assert model_loaded._gpu_transform()
+

 class XgboostLocalTest(SparkTestCase):
     def setUp(self):