Compare commits

..

23 Commits

Author SHA1 Message Date
Philip Hyunsu Cho
82d846bbeb Update change_scala_version.py to also change scala.version property (#9897) 2023-12-18 23:49:41 -08:00
Philip Hyunsu Cho
71d330afdc Bump version to 2.0.3 (#9895) 2023-12-14 17:54:05 -08:00
Philip Hyunsu Cho
3acbd8692b [jvm-packages] Fix POM for xgboost-jvm metapackage (#9893)
* [jvm-packages] Fix POM for xgboost-jvm metapackage

* Add script for updating the Scala version
2023-12-14 16:50:34 -08:00
Philip Hyunsu Cho
ad524f76ab [backport] [CI] Upload libxgboost4j.dylib (M1) to S3 bucket (#9887)
* [CI] Set up CI for Mac M1 (#9699)

* [CI] Improve CI for Mac M1 (#9748)

* [CI] Build libxgboost4j.dylib with CMAKE_OSX_DEPLOYMENT_TARGET (#9749)

* [CI] Upload libxgboost4j.dylib (M1) to S3 bucket (#9886)
2023-12-13 16:05:40 -08:00
Jiaming Yuan
d2d1751c03 [backport][py] Use the first found native library. (#9860) (#9879) 2023-12-13 14:20:30 +08:00
Jiaming Yuan
e4ee4e79dc [backport][sklearn] Fix loading model attributes. (#9808) (#9880) 2023-12-13 14:20:04 +08:00
Philip Hyunsu Cho
41ce8f28b2 [jvm-packages] Add Scala version suffix to xgboost-jvm package (#9776)
* Update JVM script (#9714)

* Bump version to 2.0.2; revamp pom.xml

* Update instructions in prepare_jvm_release.py

* Fix formatting
2023-11-08 10:17:26 -08:00
Jiaming Yuan
0ffc52e05c [backport] Fix using categorical data with the ranker. (#9753) (#9778) 2023-11-09 01:20:52 +08:00
Philip Hyunsu Cho
a408254c2f Use sys.base_prefix instead of sys.prefix (#9711)
* Use sys.base_prefix instead of sys.prefix

* Update libpath.py too
2023-10-23 23:31:40 -07:00
Philip Hyunsu Cho
22e891dafa [jvm-packages] Remove hard dependency on libjvm (#9698) (#9705) 2023-10-23 21:21:14 -07:00
Philip Hyunsu Cho
89530c80a7 [CI] Build libxgboost4j.dylib for Intel Mac (#9704) 2023-10-23 20:45:01 -07:00
Philip Hyunsu Cho
946ab53b57 Fix libpath logic for Windows (#9687) 2023-10-19 10:42:46 -07:00
Philip Hyunsu Cho
afd03a6934 Fix build for AppleClang 11 (#9684) 2023-10-18 09:35:59 -07:00
Jiaming Yuan
f7da938458 [backport][pyspark] Support stage-level scheduling (#9519) (#9686)
Co-authored-by: Bobby Wang <wbo4958@gmail.com>
2023-10-18 14:05:08 +08:00
Philip Hyunsu Cho
6ab6577511 Fix build for GCC 8.x (#9670) 2023-10-12 23:36:41 -07:00
Philip Hyunsu Cho
8c57558d74 [backport] [CI] Pull CentOS 7 images from NGC (#9666) (#9668) 2023-10-13 14:09:54 +08:00
Jiaming Yuan
58aa98a796 Bump version to 2.0.1. (#9660) 2023-10-13 08:47:32 +08:00
Jiaming Yuan
92273b39d8 [backport] Add support for cgroupv2. (#9651) (#9656) 2023-10-12 11:39:27 +08:00
Jiaming Yuan
e824b18bf6 [backport] Support pandas 2.1.0. (#9557) (#9655) 2023-10-12 11:29:59 +08:00
Jiaming Yuan
66ee89d8b4 [backport] Workaround Apple clang issue. (#9615) (#9636) 2023-10-08 15:42:15 +08:00
Jiaming Yuan
54d1d72d01 [backport] Use array interface for testing numpy arrays. (#9602) (#9635) 2023-10-08 11:45:49 +08:00
Jiaming Yuan
032bcc57f9 [backport][R] Fix method name. (#9577) (#9592) 2023-09-19 02:08:46 +08:00
Jiaming Yuan
ace7713201 [backport] Fix default metric configuration. (#9575) (#9590) 2023-09-18 23:40:43 +08:00
54 changed files with 998 additions and 316 deletions

View File

@@ -51,14 +51,14 @@ jobs:
id: extract_branch id: extract_branch
if: | if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
matrix.os == 'windows-latest' (matrix.os == 'windows-latest' || matrix.os == 'macos-11')
- name: Publish artifact xgboost4j.dll to S3 - name: Publish artifact xgboost4j.dll to S3
run: | run: |
cd lib/ cd lib/
Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll
dir dir
python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
if: | if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
matrix.os == 'windows-latest' matrix.os == 'windows-latest'
@@ -66,6 +66,19 @@ jobs:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
- name: Publish artifact libxgboost4j.dylib to S3
run: |
cd lib/
mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib
ls
python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
matrix.os == 'macos-11'
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
- name: Test XGBoost4J (Core, Spark, Examples) - name: Test XGBoost4J (Core, Spark, Examples)
run: | run: |

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR) cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(xgboost LANGUAGES CXX C VERSION 2.0.0) project(xgboost LANGUAGES CXX C VERSION 2.0.3)
include(cmake/Utils.cmake) include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules") list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0022 NEW)
@@ -233,6 +233,11 @@ endif (RABIT_BUILD_MPI)
add_subdirectory(${xgboost_SOURCE_DIR}/src) add_subdirectory(${xgboost_SOURCE_DIR}/src)
target_link_libraries(objxgboost PUBLIC dmlc) target_link_libraries(objxgboost PUBLIC dmlc)
# Link -lstdc++fs for GCC 8.x
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.0")
target_link_libraries(objxgboost PUBLIC stdc++fs)
endif()
# Exports some R specific definitions and objects # Exports some R specific definitions and objects
if (R_LIB) if (R_LIB)
add_subdirectory(${xgboost_SOURCE_DIR}/R-package) add_subdirectory(${xgboost_SOURCE_DIR}/R-package)

View File

@@ -1,8 +1,8 @@
Package: xgboost Package: xgboost
Type: Package Type: Package
Title: Extreme Gradient Boosting Title: Extreme Gradient Boosting
Version: 2.0.0.1 Version: 2.0.3.1
Date: 2023-09-11 Date: 2023-12-14
Authors@R: c( Authors@R: c(
person("Tianqi", "Chen", role = c("aut"), person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"), email = "tianqi.tchen@gmail.com"),

View File

@@ -70,7 +70,7 @@ cb.print.evaluation <- function(period = 1, showsd = TRUE) {
i == env$begin_iteration || i == env$begin_iteration ||
i == env$end_iteration) { i == env$end_iteration) {
stdev <- if (showsd) env$bst_evaluation_err else NULL stdev <- if (showsd) env$bst_evaluation_err else NULL
msg <- format.eval.string(i, env$bst_evaluation, stdev) msg <- .format_eval_string(i, env$bst_evaluation, stdev)
cat(msg, '\n') cat(msg, '\n')
} }
} }
@@ -380,7 +380,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
if ((maximize && score > best_score) || if ((maximize && score > best_score) ||
(!maximize && score < best_score)) { (!maximize && score < best_score)) {
best_msg <<- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err) best_msg <<- .format_eval_string(
i, env$bst_evaluation, env$bst_evaluation_err
)
best_score <<- score best_score <<- score
best_iteration <<- i best_iteration <<- i
best_ntreelimit <<- best_iteration * env$num_parallel_tree best_ntreelimit <<- best_iteration * env$num_parallel_tree
@@ -754,7 +756,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
# #
# Format the evaluation metric string # Format the evaluation metric string
format.eval.string <- function(iter, eval_res, eval_err = NULL) { .format_eval_string <- function(iter, eval_res, eval_err = NULL) {
if (length(eval_res) == 0) if (length(eval_res) == 0)
stop('no evaluation results') stop('no evaluation results')
enames <- names(eval_res) enames <- names(eval_res)

18
R-package/configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for xgboost 2.0.0. # Generated by GNU Autoconf 2.71 for xgboost 2.0.3.
# #
# #
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -607,8 +607,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='xgboost' PACKAGE_NAME='xgboost'
PACKAGE_TARNAME='xgboost' PACKAGE_TARNAME='xgboost'
PACKAGE_VERSION='2.0.0' PACKAGE_VERSION='2.0.3'
PACKAGE_STRING='xgboost 2.0.0' PACKAGE_STRING='xgboost 2.0.3'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1225,7 +1225,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures xgboost 2.0.0 to adapt to many kinds of systems. \`configure' configures xgboost 2.0.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1287,7 +1287,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of xgboost 2.0.0:";; short | recursive ) echo "Configuration of xgboost 2.0.3:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1367,7 +1367,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
xgboost configure 2.0.0 xgboost configure 2.0.3
generated by GNU Autoconf 2.71 generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc. Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1533,7 +1533,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by xgboost $as_me 2.0.0, which was It was created by xgboost $as_me 2.0.3, which was
generated by GNU Autoconf 2.71. Invocation command line was generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw $ $0$ac_configure_args_raw
@@ -3412,7 +3412,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by xgboost $as_me 2.0.0, which was This file was extended by xgboost $as_me 2.0.3, which was
generated by GNU Autoconf 2.71. Invocation command line was generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -3467,7 +3467,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped' ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\ ac_cs_version="\\
xgboost config.status 2.0.0 xgboost config.status 2.0.3
configured by $0, generated by GNU Autoconf 2.71, configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -2,7 +2,7 @@
AC_PREREQ(2.69) AC_PREREQ(2.69)
AC_INIT([xgboost],[2.0.0],[],[xgboost],[]) AC_INIT([xgboost],[2.0.3],[],[xgboost],[])
: ${R_HOME=`R RHOME`} : ${R_HOME=`R RHOME`}
if test -z "${R_HOME}"; then if test -z "${R_HOME}"; then

View File

@@ -0,0 +1,79 @@
import argparse
import pathlib
import re
import shutil
def _rewrite_pom_lines(lines, scala_ver, scala_patchver):
    """Return a copy of *lines* with Scala version markers rewritten.

    Three kinds of edits are applied to each line:

    * every ``<artifactId>NAME_<digits/dots>`` whose NAME is one of the
      XGBoost JVM artifacts gets its suffix replaced by ``scala_ver``;
    * the first line containing ``<scala.version>`` gets ``scala_patchver``;
    * the first line containing ``<scala.binary.version>`` gets ``scala_ver``.

    Later ``<scala.version>`` / ``<scala.binary.version>`` lines are left
    untouched.
    """
    artifacts = (
        "xgboost-jvm",
        "xgboost4j",
        "xgboost4j-gpu",
        "xgboost4j-spark",
        "xgboost4j-spark-gpu",
        "xgboost4j-flink",
        "xgboost4j-example",
    )
    # One alternation instead of seven passes per line. The mandatory "_"
    # after the name keeps "xgboost4j" from matching "xgboost4j-gpu_...".
    artifact_re = re.compile(
        "<artifactId>("
        + "|".join(re.escape(name) for name in artifacts)
        + r")_[0-9.]*"
    )
    # Dots are escaped so they match only a literal "." in the tag name.
    scalaver_re = re.compile(r"<scala\.version>[0-9.]*")
    scala_binver_re = re.compile(r"<scala\.binary\.version>[0-9.]*")

    replaced_scalaver = False
    replaced_scala_binver = False
    new_lines = []
    for line in lines:
        line = artifact_re.sub(
            lambda m: f"<artifactId>{m.group(1)}_{scala_ver}", line
        )
        # Only replace the first occurrence of scala.version
        if not replaced_scalaver:
            line, nsubs = scalaver_re.subn(
                f"<scala.version>{scala_patchver}", line
            )
            replaced_scalaver = nsubs > 0
        # Only replace the first occurrence of scala.binary.version
        if not replaced_scala_binver:
            line, nsubs = scala_binver_re.subn(
                f"<scala.binary.version>{scala_ver}", line
            )
            replaced_scala_binver = nsubs > 0
        new_lines.append(line)
    return new_lines


def main(args):
    """Point every ``pom.xml`` under ``jvm-packages/`` at a Scala version.

    Parameters
    ----------
    args :
        Object with two attributes: ``scala_version`` (``"2.12"`` or
        ``"2.13"``) and ``purge_artifacts`` (truthy to delete every
        ``target`` directory under ``jvm-packages/`` first).

    Raises
    ------
    ValueError
        If ``args.scala_version`` is not a supported version.

    Paths are resolved relative to the current working directory.
    """
    patch_versions = {"2.12": "2.12.18", "2.13": "2.13.11"}
    if args.scala_version not in patch_versions:
        raise ValueError(f"Unsupported Scala version: {args.scala_version}")
    scala_ver = args.scala_version
    scala_patchver = patch_versions[scala_ver]

    # Clean artifacts
    if args.purge_artifacts:
        # Snapshot the matches first so the tree is not modified while
        # glob() is still walking it; is_dir() then skips anything already
        # removed together with a parent "target" directory.
        for target in list(pathlib.Path("jvm-packages/").glob("**/target")):
            if target.is_dir():
                print(f"Removing {target}...")
                shutil.rmtree(target)

    # Update pom.xml
    for pom in pathlib.Path("jvm-packages/").glob("**/pom.xml"):
        print(f"Updating {pom}...")
        with open(pom, "r", encoding="utf-8") as f:
            lines = f.readlines()
        # Compute the new content before reopening for write, so a failure
        # during rewriting cannot leave a truncated pom.xml behind.
        new_lines = _rewrite_pom_lines(lines, scala_ver, scala_patchver)
        with open(pom, "w", encoding="utf-8") as f:
            f.writelines(new_lines)
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().
    parser = argparse.ArgumentParser()
    # Optional flag: also delete Maven "target" build directories.
    parser.add_argument("--purge-artifacts", action="store_true")
    parser.add_argument(
        "--scala-version",
        type=str,
        required=True,
        help="Version of Scala to use in the JVM packages",
        choices=["2.12", "2.13"],
    )
    parsed_args = parser.parse_args()
    main(parsed_args)

View File

@@ -2,7 +2,6 @@ import argparse
import errno import errno
import glob import glob
import os import os
import platform
import re import re
import shutil import shutil
import subprocess import subprocess
@@ -21,12 +20,14 @@ def normpath(path):
else: else:
return normalized return normalized
def cp(source, target): def cp(source, target):
source = normpath(source) source = normpath(source)
target = normpath(target) target = normpath(target)
print("cp {0} {1}".format(source, target)) print("cp {0} {1}".format(source, target))
shutil.copy(source, target) shutil.copy(source, target)
def maybe_makedirs(path): def maybe_makedirs(path):
path = normpath(path) path = normpath(path)
print("mkdir -p " + path) print("mkdir -p " + path)
@@ -36,6 +37,7 @@ def maybe_makedirs(path):
if e.errno != errno.EEXIST: if e.errno != errno.EEXIST:
raise raise
@contextmanager @contextmanager
def cd(path): def cd(path):
path = normpath(path) path = normpath(path)
@@ -47,18 +49,22 @@ def cd(path):
finally: finally:
os.chdir(cwd) os.chdir(cwd)
def run(command, **kwargs): def run(command, **kwargs):
print(command) print(command)
subprocess.check_call(command, shell=True, **kwargs) subprocess.check_call(command, shell=True, **kwargs)
def get_current_git_tag(): def get_current_git_tag():
out = subprocess.check_output(["git", "tag", "--points-at", "HEAD"]) out = subprocess.check_output(["git", "tag", "--points-at", "HEAD"])
return out.decode().split("\n")[0] return out.decode().split("\n")[0]
def get_current_commit_hash(): def get_current_commit_hash():
out = subprocess.check_output(["git", "rev-parse", "HEAD"]) out = subprocess.check_output(["git", "rev-parse", "HEAD"])
return out.decode().split("\n")[0] return out.decode().split("\n")[0]
def get_current_git_branch(): def get_current_git_branch():
out = subprocess.check_output(["git", "log", "-n", "1", "--pretty=%d", "HEAD"]) out = subprocess.check_output(["git", "log", "-n", "1", "--pretty=%d", "HEAD"])
m = re.search(r"release_[0-9\.]+", out.decode()) m = re.search(r"release_[0-9\.]+", out.decode())
@@ -66,38 +72,49 @@ def get_current_git_branch():
raise ValueError("Expected branch name of form release_xxx") raise ValueError("Expected branch name of form release_xxx")
return m.group(0) return m.group(0)
def retrieve(url, filename=None): def retrieve(url, filename=None):
print(f"{url} -> {filename}") print(f"{url} -> {filename}")
return urlretrieve(url, filename) return urlretrieve(url, filename)
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--release-version", type=str, required=True, parser.add_argument(
help="Version of the release being prepared") "--release-version",
type=str,
required=True,
help="Version of the release being prepared",
)
args = parser.parse_args() args = parser.parse_args()
if sys.platform != "darwin" or platform.machine() != "x86_64":
raise NotImplementedError("Please run this script using an Intel Mac")
version = args.release_version version = args.release_version
expected_git_tag = "v" + version expected_git_tag = "v" + version
current_git_tag = get_current_git_tag() current_git_tag = get_current_git_tag()
if current_git_tag != expected_git_tag: if current_git_tag != expected_git_tag:
if not current_git_tag: if not current_git_tag:
raise ValueError(f"Expected git tag {expected_git_tag} but current HEAD has no tag. " raise ValueError(
f"Run: git checkout {expected_git_tag}") f"Expected git tag {expected_git_tag} but current HEAD has no tag. "
raise ValueError(f"Expected git tag {expected_git_tag} but current HEAD is at tag " f"Run: git checkout {expected_git_tag}"
f"{current_git_tag}. Run: git checkout {expected_git_tag}") )
raise ValueError(
f"Expected git tag {expected_git_tag} but current HEAD is at tag "
f"{current_git_tag}. Run: git checkout {expected_git_tag}"
)
commit_hash = get_current_commit_hash() commit_hash = get_current_commit_hash()
git_branch = get_current_git_branch() git_branch = get_current_git_branch()
print(f"Using commit {commit_hash} of branch {git_branch}, git tag {current_git_tag}") print(
f"Using commit {commit_hash} of branch {git_branch}, git tag {current_git_tag}"
)
with cd("jvm-packages/"): with cd("jvm-packages/"):
print("====copying pure-Python tracker====") print("====copying pure-Python tracker====")
for use_cuda in [True, False]: for use_cuda in [True, False]:
xgboost4j = "xgboost4j-gpu" if use_cuda else "xgboost4j" xgboost4j = "xgboost4j-gpu" if use_cuda else "xgboost4j"
cp("../python-package/xgboost/tracker.py", f"{xgboost4j}/src/main/resources") cp(
"../python-package/xgboost/tracker.py",
f"{xgboost4j}/src/main/resources",
)
print("====copying resources for testing====") print("====copying resources for testing====")
with cd("../demo/CLI/regression"): with cd("../demo/CLI/regression"):
@@ -115,7 +132,12 @@ def main():
cp(file, f"{xgboost4j_spark}/src/test/resources") cp(file, f"{xgboost4j_spark}/src/test/resources")
print("====Creating directories to hold native binaries====") print("====Creating directories to hold native binaries====")
for os_ident, arch in [("linux", "x86_64"), ("windows", "x86_64"), ("macos", "x86_64")]: for os_ident, arch in [
("linux", "x86_64"),
("windows", "x86_64"),
("macos", "x86_64"),
("macos", "aarch64"),
]:
output_dir = f"xgboost4j/src/main/resources/lib/{os_ident}/{arch}" output_dir = f"xgboost4j/src/main/resources/lib/{os_ident}/{arch}"
maybe_makedirs(output_dir) maybe_makedirs(output_dir)
for os_ident, arch in [("linux", "x86_64")]: for os_ident, arch in [("linux", "x86_64")]:
@@ -123,52 +145,98 @@ def main():
maybe_makedirs(output_dir) maybe_makedirs(output_dir)
print("====Downloading native binaries from CI====") print("====Downloading native binaries from CI====")
nightly_bucket_prefix = "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds" nightly_bucket_prefix = (
maven_repo_prefix = "https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release/ml/dmlc" "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds"
)
maven_repo_prefix = (
"https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release/ml/dmlc"
)
retrieve(url=f"{nightly_bucket_prefix}/{git_branch}/xgboost4j_{commit_hash}.dll", retrieve(
filename="xgboost4j/src/main/resources/lib/windows/x86_64/xgboost4j.dll") url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/xgboost4j_{commit_hash}.dll",
filename="xgboost4j/src/main/resources/lib/windows/x86_64/xgboost4j.dll",
)
retrieve(
url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/libxgboost4j_{commit_hash}.dylib",
filename="xgboost4j/src/main/resources/lib/macos/x86_64/libxgboost4j.dylib",
)
retrieve(
url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/libxgboost4j_m1_{commit_hash}.dylib",
filename="xgboost4j/src/main/resources/lib/macos/aarch64/libxgboost4j.dylib",
)
with tempfile.TemporaryDirectory() as tempdir: with tempfile.TemporaryDirectory() as tempdir:
# libxgboost4j.so for Linux x86_64, CPU only # libxgboost4j.so for Linux x86_64, CPU only
zip_path = os.path.join(tempdir, "xgboost4j_2.12.jar") zip_path = os.path.join(tempdir, "xgboost4j_2.12.jar")
extract_dir = os.path.join(tempdir, "xgboost4j") extract_dir = os.path.join(tempdir, "xgboost4j")
retrieve(url=f"{maven_repo_prefix}/xgboost4j_2.12/{version}/" retrieve(
f"xgboost4j_2.12-{version}.jar", url=f"{maven_repo_prefix}/xgboost4j_2.12/{version}/"
filename=zip_path) f"xgboost4j_2.12-{version}.jar",
filename=zip_path,
)
os.mkdir(extract_dir) os.mkdir(extract_dir)
with zipfile.ZipFile(zip_path, "r") as t: with zipfile.ZipFile(zip_path, "r") as t:
t.extractall(extract_dir) t.extractall(extract_dir)
cp(os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"), cp(
"xgboost4j/src/main/resources/lib/linux/x86_64/libxgboost4j.so") os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
"xgboost4j/src/main/resources/lib/linux/x86_64/libxgboost4j.so",
)
# libxgboost4j.so for Linux x86_64, GPU support # libxgboost4j.so for Linux x86_64, GPU support
zip_path = os.path.join(tempdir, "xgboost4j-gpu_2.12.jar") zip_path = os.path.join(tempdir, "xgboost4j-gpu_2.12.jar")
extract_dir = os.path.join(tempdir, "xgboost4j-gpu") extract_dir = os.path.join(tempdir, "xgboost4j-gpu")
retrieve(url=f"{maven_repo_prefix}/xgboost4j-gpu_2.12/{version}/" retrieve(
f"xgboost4j-gpu_2.12-{version}.jar", url=f"{maven_repo_prefix}/xgboost4j-gpu_2.12/{version}/"
filename=zip_path) f"xgboost4j-gpu_2.12-{version}.jar",
filename=zip_path,
)
os.mkdir(extract_dir) os.mkdir(extract_dir)
with zipfile.ZipFile(zip_path, "r") as t: with zipfile.ZipFile(zip_path, "r") as t:
t.extractall(extract_dir) t.extractall(extract_dir)
cp(os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"), cp(
"xgboost4j-gpu/src/main/resources/lib/linux/x86_64/libxgboost4j.so") os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
"xgboost4j-gpu/src/main/resources/lib/linux/x86_64/libxgboost4j.so",
)
print("====Next Steps====") print("====Next Steps====")
print("1. Gain upload right to Maven Central repo.") print("1. Gain upload right to Maven Central repo.")
print("1-1. Sign up for a JIRA account at Sonatype: ") print("1-1. Sign up for a JIRA account at Sonatype: ")
print("1-2. File a JIRA ticket: " print(
"https://issues.sonatype.org/secure/CreateIssue.jspa?issuetype=21&pid=10134. Example: " "1-2. File a JIRA ticket: "
"https://issues.sonatype.org/browse/OSSRH-67724") "https://issues.sonatype.org/secure/CreateIssue.jspa?issuetype=21&pid=10134. Example: "
print("2. Store the Sonatype credentials in .m2/settings.xml. See insturctions in " "https://issues.sonatype.org/browse/OSSRH-67724"
"https://central.sonatype.org/publish/publish-maven/") )
print("3. Now on a Mac machine, run:") print(
print(" GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests") "2. Store the Sonatype credentials in .m2/settings.xml. See insturctions in "
print("4. Log into https://oss.sonatype.org/. On the left menu panel, click Staging " "https://central.sonatype.org/publish/publish-maven/"
"Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-1085 " )
"to inspect the staged JAR files. Finally, press Release button to publish the " print(
"artifacts to the Maven Central repository.") "3. Now on a Linux machine, run the following to build Scala 2.12 artifacts. "
"Make sure to use an Internet connection with fast upload speed:"
)
print(
" # Skip native build, since we have all needed native binaries from CI\n"
" export MAVEN_SKIP_NATIVE_BUILD=1\n"
" GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests"
)
print(
"4. Log into https://oss.sonatype.org/. On the left menu panel, click Staging "
"Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-xxxx "
"to inspect the staged JAR files. Finally, press Release button to publish the "
"artifacts to the Maven Central repository. The top-level metapackage should be "
"named xgboost-jvm_2.12."
)
print(
"5. Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:\n"
" export MAVEN_SKIP_NATIVE_BUILD=1\n"
" python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts\n"
" GPG_TTY=$(tty) mvn deploy -Prelease-cpu-only,scala-2.13 -DskipTests"
)
print(
"6. Go to https://oss.sonatype.org/ to release the Scala 2.13 artifacts. "
"The top-level metapackage should be named xgboost-jvm_2.13."
)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -6,6 +6,6 @@
#define XGBOOST_VER_MAJOR 2 /* NOLINT */ #define XGBOOST_VER_MAJOR 2 /* NOLINT */
#define XGBOOST_VER_MINOR 0 /* NOLINT */ #define XGBOOST_VER_MINOR 0 /* NOLINT */
#define XGBOOST_VER_PATCH 0 /* NOLINT */ #define XGBOOST_VER_PATCH 3 /* NOLINT */
#endif // XGBOOST_VERSION_CONFIG_H_ #endif // XGBOOST_VERSION_CONFIG_H_

View File

@@ -25,4 +25,3 @@ target_include_directories(xgboost4j
${PROJECT_SOURCE_DIR}/rabit/include) ${PROJECT_SOURCE_DIR}/rabit/include)
set_output_directory(xgboost4j ${PROJECT_SOURCE_DIR}/lib) set_output_directory(xgboost4j ${PROJECT_SOURCE_DIR}/lib)
target_link_libraries(xgboost4j PRIVATE ${JAVA_JVM_LIBRARY})

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
import errno
import argparse import argparse
import errno
import glob import glob
import os import os
import platform import platform
@@ -19,11 +19,10 @@ CONFIG = {
"USE_HDFS": "OFF", "USE_HDFS": "OFF",
"USE_AZURE": "OFF", "USE_AZURE": "OFF",
"USE_S3": "OFF", "USE_S3": "OFF",
"USE_CUDA": "OFF", "USE_CUDA": "OFF",
"USE_NCCL": "OFF", "USE_NCCL": "OFF",
"JVM_BINDINGS": "ON", "JVM_BINDINGS": "ON",
"LOG_CAPI_INVOCATION": "OFF" "LOG_CAPI_INVOCATION": "OFF",
} }
@@ -70,26 +69,22 @@ def normpath(path):
return normalized return normalized
if __name__ == "__main__": def native_build(args):
parser = argparse.ArgumentParser()
parser.add_argument('--log-capi-invocation', type=str, choices=['ON', 'OFF'], default='OFF')
parser.add_argument('--use-cuda', type=str, choices=['ON', 'OFF'], default='OFF')
cli_args = parser.parse_args()
if sys.platform == "darwin": if sys.platform == "darwin":
# Enable of your compiler supports OpenMP. # Enable of your compiler supports OpenMP.
CONFIG["USE_OPENMP"] = "OFF" CONFIG["USE_OPENMP"] = "OFF"
os.environ["JAVA_HOME"] = subprocess.check_output( os.environ["JAVA_HOME"] = (
"/usr/libexec/java_home").strip().decode() subprocess.check_output("/usr/libexec/java_home").strip().decode()
)
print("building Java wrapper") print("building Java wrapper")
with cd(".."): with cd(".."):
build_dir = 'build-gpu' if cli_args.use_cuda == 'ON' else 'build' build_dir = "build-gpu" if cli_args.use_cuda == "ON" else "build"
maybe_makedirs(build_dir) maybe_makedirs(build_dir)
with cd(build_dir): with cd(build_dir):
if sys.platform == "win32": if sys.platform == "win32":
# Force x64 build on Windows. # Force x64 build on Windows.
maybe_generator = ' -A x64' maybe_generator = " -A x64"
else: else:
maybe_generator = "" maybe_generator = ""
if sys.platform == "linux": if sys.platform == "linux":
@@ -97,12 +92,12 @@ if __name__ == "__main__":
else: else:
maybe_parallel_build = "" maybe_parallel_build = ""
if cli_args.log_capi_invocation == 'ON': if cli_args.log_capi_invocation == "ON":
CONFIG['LOG_CAPI_INVOCATION'] = 'ON' CONFIG["LOG_CAPI_INVOCATION"] = "ON"
if cli_args.use_cuda == 'ON': if cli_args.use_cuda == "ON":
CONFIG['USE_CUDA'] = 'ON' CONFIG["USE_CUDA"] = "ON"
CONFIG['USE_NCCL'] = 'ON' CONFIG["USE_NCCL"] = "ON"
args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()] args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]
@@ -115,7 +110,7 @@ if __name__ == "__main__":
if gpu_arch_flag is not None: if gpu_arch_flag is not None:
args.append("%s" % gpu_arch_flag) args.append("%s" % gpu_arch_flag)
lib_dir = os.path.join(os.pardir, 'lib') lib_dir = os.path.join(os.pardir, "lib")
if os.path.exists(lib_dir): if os.path.exists(lib_dir):
shutil.rmtree(lib_dir) shutil.rmtree(lib_dir)
run("cmake .. " + " ".join(args) + maybe_generator) run("cmake .. " + " ".join(args) + maybe_generator)
@@ -125,8 +120,10 @@ if __name__ == "__main__":
run(f'"{sys.executable}" mapfeat.py') run(f'"{sys.executable}" mapfeat.py')
run(f'"{sys.executable}" mknfold.py machine.txt 1') run(f'"{sys.executable}" mknfold.py machine.txt 1')
xgboost4j = 'xgboost4j-gpu' if cli_args.use_cuda == 'ON' else 'xgboost4j' xgboost4j = "xgboost4j-gpu" if cli_args.use_cuda == "ON" else "xgboost4j"
xgboost4j_spark = 'xgboost4j-spark-gpu' if cli_args.use_cuda == 'ON' else 'xgboost4j-spark' xgboost4j_spark = (
"xgboost4j-spark-gpu" if cli_args.use_cuda == "ON" else "xgboost4j-spark"
)
print("copying native library") print("copying native library")
library_name, os_folder = { library_name, os_folder = {
@@ -141,14 +138,19 @@ if __name__ == "__main__":
"i86pc": "x86_64", # on Solaris x86_64 "i86pc": "x86_64", # on Solaris x86_64
"sun4v": "sparc", # on Solaris sparc "sun4v": "sparc", # on Solaris sparc
"arm64": "aarch64", # on macOS & Windows ARM 64-bit "arm64": "aarch64", # on macOS & Windows ARM 64-bit
"aarch64": "aarch64" "aarch64": "aarch64",
}[platform.machine().lower()] }[platform.machine().lower()]
output_folder = "{}/src/main/resources/lib/{}/{}".format(xgboost4j, os_folder, arch_folder) output_folder = "{}/src/main/resources/lib/{}/{}".format(
xgboost4j, os_folder, arch_folder
)
maybe_makedirs(output_folder) maybe_makedirs(output_folder)
cp("../lib/" + library_name, output_folder) cp("../lib/" + library_name, output_folder)
print("copying pure-Python tracker") print("copying pure-Python tracker")
cp("../python-package/xgboost/tracker.py", "{}/src/main/resources".format(xgboost4j)) cp(
"../python-package/xgboost/tracker.py",
"{}/src/main/resources".format(xgboost4j),
)
print("copying train/test files") print("copying train/test files")
maybe_makedirs("{}/src/test/resources".format(xgboost4j_spark)) maybe_makedirs("{}/src/test/resources".format(xgboost4j_spark))
@@ -164,3 +166,18 @@ if __name__ == "__main__":
maybe_makedirs("{}/src/test/resources".format(xgboost4j)) maybe_makedirs("{}/src/test/resources".format(xgboost4j))
for file in glob.glob("../demo/data/agaricus.*"): for file in glob.glob("../demo/data/agaricus.*"):
cp(file, "{}/src/test/resources".format(xgboost4j)) cp(file, "{}/src/test/resources".format(xgboost4j))
if __name__ == "__main__":
if "MAVEN_SKIP_NATIVE_BUILD" in os.environ:
print("MAVEN_SKIP_NATIVE_BUILD is set. Skipping native build...")
else:
parser = argparse.ArgumentParser()
parser.add_argument(
"--log-capi-invocation", type=str, choices=["ON", "OFF"], default="OFF"
)
parser.add_argument(
"--use-cuda", type=str, choices=["ON", "OFF"], default="OFF"
)
cli_args = parser.parse_args()
native_build(cli_args)

View File

@@ -5,8 +5,8 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.0</version> <version>2.0.3</version>
<packaging>pom</packaging> <packaging>pom</packaging>
<name>XGBoost JVM Package</name> <name>XGBoost JVM Package</name>
<description>JVM Package for XGBoost</description> <description>JVM Package for XGBoost</description>
@@ -189,6 +189,93 @@
</plugins> </plugins>
</build> </build>
</profile> </profile>
<!--
Release profile restricted to the modules listed below (xgboost4j,
xgboost4j-example, xgboost4j-spark, xgboost4j-flink); the *-gpu modules
are not part of this profile.
-->
<profile>
<id>release-cpu-only</id>
<modules>
<module>xgboost4j</module>
<module>xgboost4j-example</module>
<module>xgboost4j-spark</module>
<module>xgboost4j-flink</module>
</modules>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.3.0</version>
<executions>
<execution>
<!-- Packages ${basedir}/javadoc as the jar with the "javadoc" classifier.
NOTE(review): presumably a placeholder javadoc jar, as the id suggests; confirm. -->
<id>empty-javadoc-jar</id>
<phase>package</phase>
<goals>
<goal>jar</goal>
</goals>
<configuration>
<classifier>javadoc</classifier>
<classesDirectory>${basedir}/javadoc</classesDirectory>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>3.0.1</version>
<configuration>
<autoVersionSubmodules>true</autoVersionSubmodules>
<useReleaseProfile>false</useReleaseProfile>
<!-- NOTE(review): this names the "release" profile, not "release-cpu-only"; confirm that is intended. -->
<releaseProfiles>release</releaseProfiles>
<goals>deploy</goals>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>3.1.0</version>
<executions>
<execution>
<!-- Signs the build artifacts during the verify phase. -->
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.3.0</version>
<executions>
<execution>
<!-- Attaches a sources jar via the jar-no-fork goal. -->
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.13</version>
<extensions>true</extensions>
<configuration>
<!-- Stages to the "ossrh" server; the staged repository is not released automatically after close. -->
<serverId>ossrh</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>false</autoReleaseAfterClose>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<!-- Tests are skipped when this profile is active. -->
<skipTests>true</skipTests>
</configuration>
</plugin>
</plugins>
</build>
</profile>
<profile> <profile>
<id>assembly</id> <id>assembly</id>
<build> <build>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.0</version> <version>2.0.3</version>
</parent> </parent>
<name>xgboost4j-example</name> <name>xgboost4j-example</name>
<artifactId>xgboost4j-example_${scala.binary.version}</artifactId> <artifactId>xgboost4j-example_2.12</artifactId>
<version>2.0.0</version> <version>2.0.3</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<build> <build>
<plugins> <plugins>
@@ -26,7 +26,7 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId> <artifactId>xgboost4j-spark_2.12</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency> <dependency>
@@ -37,7 +37,7 @@
</dependency> </dependency>
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId> <artifactId>xgboost4j-flink_2.12</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
</dependencies> </dependencies>

View File

@@ -5,13 +5,13 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.0</version> <version>2.0.3</version>
</parent> </parent>
<name>xgboost4j-flink</name> <name>xgboost4j-flink</name>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId> <artifactId>xgboost4j-flink_2.12</artifactId>
<version>2.0.0</version> <version>2.0.3</version>
<properties> <properties>
<flink-ml.version>2.2.0</flink-ml.version> <flink-ml.version>2.2.0</flink-ml.version>
</properties> </properties>
@@ -30,7 +30,7 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId> <artifactId>xgboost4j_2.12</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency> <dependency>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.0</version> <version>2.0.3</version>
</parent> </parent>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId> <artifactId>xgboost4j-gpu_2.12</artifactId>
<name>xgboost4j-gpu</name> <name>xgboost4j-gpu</name>
<version>2.0.0</version> <version>2.0.3</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<dependencies> <dependencies>

View File

@@ -5,11 +5,11 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.0</version> <version>2.0.3</version>
</parent> </parent>
<name>xgboost4j-spark-gpu</name> <name>xgboost4j-spark-gpu</name>
<artifactId>xgboost4j-spark-gpu_${scala.binary.version}</artifactId> <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
<build> <build>
<plugins> <plugins>
<plugin> <plugin>
@@ -24,7 +24,7 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId> <artifactId>xgboost4j-gpu_2.12</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency> <dependency>

View File

@@ -5,11 +5,11 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.0</version> <version>2.0.3</version>
</parent> </parent>
<name>xgboost4j-spark</name> <name>xgboost4j-spark</name>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId> <artifactId>xgboost4j-spark_2.12</artifactId>
<build> <build>
<plugins> <plugins>
<plugin> <plugin>
@@ -24,7 +24,7 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId> <artifactId>xgboost4j_2.12</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency> <dependency>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.0</version> <version>2.0.3</version>
</parent> </parent>
<name>xgboost4j</name> <name>xgboost4j</name>
<artifactId>xgboost4j_${scala.binary.version}</artifactId> <artifactId>xgboost4j_2.12</artifactId>
<version>2.0.0</version> <version>2.0.3</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<dependencies> <dependencies>

View File

@@ -132,16 +132,28 @@ def locate_or_build_libxgboost(
if build_config.use_system_libxgboost: if build_config.use_system_libxgboost:
# Find libxgboost from system prefix # Find libxgboost from system prefix
sys_base_prefix = pathlib.Path(sys.base_prefix).absolute().resolve() sys_prefix = pathlib.Path(sys.base_prefix)
libxgboost_sys = sys_base_prefix / "lib" / _lib_name() sys_prefix_candidates = [
if not libxgboost_sys.exists(): sys_prefix / "lib",
raise RuntimeError( # Paths possibly used on Windows
f"use_system_libxgboost was specified but {_lib_name()} is " sys_prefix / "bin",
f"not found in {libxgboost_sys.parent}" sys_prefix / "Library",
) sys_prefix / "Library" / "bin",
sys_prefix / "Library" / "lib",
logger.info("Using system XGBoost: %s", str(libxgboost_sys)) ]
return libxgboost_sys sys_prefix_candidates = [
p.expanduser().resolve() for p in sys_prefix_candidates
]
for candidate_dir in sys_prefix_candidates:
libtreelite_sys = candidate_dir / _lib_name()
if libtreelite_sys.exists():
logger.info("Using system XGBoost: %s", str(libtreelite_sys))
return libtreelite_sys
raise RuntimeError(
f"use_system_libxgboost was specified but {_lib_name()} is "
f"not found. Paths searched (in order): \n"
+ "\n".join([f"* {str(p)}" for p in sys_prefix_candidates])
)
libxgboost = locate_local_libxgboost(toplevel_dir, logger=logger) libxgboost = locate_local_libxgboost(toplevel_dir, logger=logger)
if libxgboost is not None: if libxgboost is not None:

View File

@@ -7,7 +7,7 @@ build-backend = "packager.pep517"
[project] [project]
name = "xgboost" name = "xgboost"
version = "2.0.0" version = "2.0.3"
authors = [ authors = [
{ name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" }, { name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" },
{ name = "Jiaming Yuan", email = "jm.yuan@outlook.com" } { name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }

View File

@@ -1 +1 @@
2.0.0 2.0.3

View File

@@ -206,6 +206,7 @@ def _load_lib() -> ctypes.CDLL:
lib = ctypes.cdll.LoadLibrary(lib_path) lib = ctypes.cdll.LoadLibrary(lib_path)
setattr(lib, "path", os.path.normpath(lib_path)) setattr(lib, "path", os.path.normpath(lib_path))
lib_success = True lib_success = True
break
except OSError as e: except OSError as e:
os_error_list.append(str(e)) os_error_list.append(str(e))
continue continue
@@ -2399,6 +2400,7 @@ class Booster:
_is_cudf_df, _is_cudf_df,
_is_cupy_array, _is_cupy_array,
_is_list, _is_list,
_is_np_array_like,
_is_pandas_df, _is_pandas_df,
_is_pandas_series, _is_pandas_series,
_is_tuple, _is_tuple,
@@ -2428,7 +2430,7 @@ class Booster:
f"got {data.shape[1]}" f"got {data.shape[1]}"
) )
if isinstance(data, np.ndarray): if _is_np_array_like(data):
from .data import _ensure_np_dtype from .data import _ensure_np_dtype
data, _ = _ensure_np_dtype(data, data.dtype) data, _ = _ensure_np_dtype(data, data.dtype)

View File

@@ -78,7 +78,6 @@ from .data import _is_cudf_ser, _is_cupy_array
from .sklearn import ( from .sklearn import (
XGBClassifier, XGBClassifier,
XGBClassifierBase, XGBClassifierBase,
XGBClassifierMixIn,
XGBModel, XGBModel,
XGBRanker, XGBRanker,
XGBRankerMixIn, XGBRankerMixIn,
@@ -1854,7 +1853,7 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
"Implementation of the scikit-learn API for XGBoost classification.", "Implementation of the scikit-learn API for XGBoost classification.",
["estimators", "model"], ["estimators", "model"],
) )
class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBase): class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
# pylint: disable=missing-class-docstring # pylint: disable=missing-class-docstring
async def _fit_async( async def _fit_async(
self, self,
@@ -2036,10 +2035,6 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa
preds = da.map_blocks(_argmax, pred_probs, drop_axis=1) preds = da.map_blocks(_argmax, pred_probs, drop_axis=1)
return preds return preds
def load_model(self, fname: ModelIn) -> None:
super().load_model(fname)
self._load_model_attributes(self.get_booster())
@xgboost_model_doc( @xgboost_model_doc(
"""Implementation of the Scikit-Learn API for XGBoost Ranking. """Implementation of the Scikit-Learn API for XGBoost Ranking.

View File

@@ -164,8 +164,8 @@ def _is_scipy_coo(data: DataType) -> bool:
return isinstance(data, scipy.sparse.coo_matrix) return isinstance(data, scipy.sparse.coo_matrix)
def _is_numpy_array(data: DataType) -> bool: def _is_np_array_like(data: DataType) -> bool:
return isinstance(data, (np.ndarray, np.matrix)) return hasattr(data, "__array_interface__")
def _ensure_np_dtype( def _ensure_np_dtype(
@@ -317,7 +317,6 @@ def pandas_feature_info(
) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]: ) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
"""Handle feature info for pandas dataframe.""" """Handle feature info for pandas dataframe."""
import pandas as pd import pandas as pd
from pandas.api.types import is_categorical_dtype, is_sparse
# handle feature names # handle feature names
if feature_names is None and meta is None: if feature_names is None and meta is None:
@@ -332,10 +331,10 @@ def pandas_feature_info(
if feature_types is None and meta is None: if feature_types is None and meta is None:
feature_types = [] feature_types = []
for dtype in data.dtypes: for dtype in data.dtypes:
if is_sparse(dtype): if is_pd_sparse_dtype(dtype):
feature_types.append(_pandas_dtype_mapper[dtype.subtype.name]) feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
elif ( elif (
is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype) is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
) and enable_categorical: ) and enable_categorical:
feature_types.append(CAT_T) feature_types.append(CAT_T)
else: else:
@@ -345,18 +344,13 @@ def pandas_feature_info(
def is_nullable_dtype(dtype: PandasDType) -> bool: def is_nullable_dtype(dtype: PandasDType) -> bool:
"""Whether dtype is a pandas nullable type.""" """Whether dtype is a pandas nullable type."""
from pandas.api.types import ( from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype
is_bool_dtype,
is_categorical_dtype,
is_float_dtype,
is_integer_dtype,
)
is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper
# np.bool has alias `bool`, while pd.BooleanDtype has `boolean`. # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
is_bool = is_bool_dtype(dtype) and dtype.name == "boolean" is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
is_float = is_float_dtype(dtype) and dtype.name in pandas_nullable_mapper is_float = is_float_dtype(dtype) and dtype.name in pandas_nullable_mapper
return is_int or is_bool or is_float or is_categorical_dtype(dtype) return is_int or is_bool or is_float or is_pd_cat_dtype(dtype)
def is_pa_ext_dtype(dtype: Any) -> bool: def is_pa_ext_dtype(dtype: Any) -> bool:
@@ -371,17 +365,48 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool:
) )
def is_pd_cat_dtype(dtype: PandasDType) -> bool:
    """Return whether ``dtype`` is a pandas categorical dtype.

    With pandas 2.1.0 or newer the check is an ``isinstance`` test against
    ``CategoricalDtype``; otherwise it falls back to
    ``pandas.api.types.is_categorical_dtype``.
    """
    import pandas as pd

    # ``pd.util.version.Version`` is probed rather than assumed to exist.
    version_module = getattr(pd.util, "version", None)
    version_cls = getattr(version_module, "Version", None)
    if version_cls is not None:
        if version_cls(pd.__version__) >= version_cls("2.1.0"):
            return isinstance(dtype, pd.CategoricalDtype)

    from pandas.api.types import is_categorical_dtype

    return is_categorical_dtype(dtype)
def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
    """Return whether ``dtype`` is a pandas sparse dtype.

    With pandas 2.1.0 or newer the check is an ``isinstance`` test against
    ``SparseDtype``; otherwise it falls back to
    ``pandas.api.types.is_sparse``.
    """
    import pandas as pd

    # ``pd.util.version.Version`` is probed rather than assumed to exist.
    version_module = getattr(pd.util, "version", None)
    version_cls = getattr(version_module, "Version", None)
    if version_cls is not None:
        if version_cls(pd.__version__) >= version_cls("2.1.0"):
            return isinstance(dtype, pd.SparseDtype)

    from pandas.api.types import is_sparse

    return is_sparse(dtype)
def pandas_cat_null(data: DataFrame) -> DataFrame: def pandas_cat_null(data: DataFrame) -> DataFrame:
"""Handle categorical dtype and nullable extension types from pandas.""" """Handle categorical dtype and nullable extension types from pandas."""
import pandas as pd import pandas as pd
from pandas.api.types import is_categorical_dtype
# handle category codes and nullable. # handle category codes and nullable.
cat_columns = [] cat_columns = []
nul_columns = [] nul_columns = []
# avoid an unnecessary conversion if possible # avoid an unnecessary conversion if possible
for col, dtype in zip(data.columns, data.dtypes): for col, dtype in zip(data.columns, data.dtypes):
if is_categorical_dtype(dtype): if is_pd_cat_dtype(dtype):
cat_columns.append(col) cat_columns.append(col)
elif is_pa_ext_categorical_dtype(dtype): elif is_pa_ext_categorical_dtype(dtype):
raise ValueError( raise ValueError(
@@ -398,7 +423,7 @@ def pandas_cat_null(data: DataFrame) -> DataFrame:
transformed = data transformed = data
def cat_codes(ser: pd.Series) -> pd.Series: def cat_codes(ser: pd.Series) -> pd.Series:
if is_categorical_dtype(ser.dtype): if is_pd_cat_dtype(ser.dtype):
return ser.cat.codes return ser.cat.codes
assert is_pa_ext_categorical_dtype(ser.dtype) assert is_pa_ext_categorical_dtype(ser.dtype)
# Not yet supported, the index is not ordered for some reason. Alternately: # Not yet supported, the index is not ordered for some reason. Alternately:
@@ -454,14 +479,12 @@ def _transform_pandas_df(
meta: Optional[str] = None, meta: Optional[str] = None,
meta_type: Optional[NumpyDType] = None, meta_type: Optional[NumpyDType] = None,
) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]: ) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
from pandas.api.types import is_categorical_dtype, is_sparse
pyarrow_extension = False pyarrow_extension = False
for dtype in data.dtypes: for dtype in data.dtypes:
if not ( if not (
(dtype.name in _pandas_dtype_mapper) (dtype.name in _pandas_dtype_mapper)
or is_sparse(dtype) or is_pd_sparse_dtype(dtype)
or (is_categorical_dtype(dtype) and enable_categorical) or (is_pd_cat_dtype(dtype) and enable_categorical)
or is_pa_ext_dtype(dtype) or is_pa_ext_dtype(dtype)
): ):
_invalid_dataframe_dtype(data) _invalid_dataframe_dtype(data)
@@ -515,9 +538,8 @@ def _meta_from_pandas_series(
) -> None: ) -> None:
"""Help transform pandas series for meta data like labels""" """Help transform pandas series for meta data like labels"""
data = data.values.astype("float") data = data.values.astype("float")
from pandas.api.types import is_sparse
if is_sparse(data): if is_pd_sparse_dtype(getattr(data, "dtype", data)):
data = data.to_dense() # type: ignore data = data.to_dense() # type: ignore
assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1 assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
_meta_from_numpy(data, name, dtype, handle) _meta_from_numpy(data, name, dtype, handle)
@@ -539,13 +561,11 @@ def _from_pandas_series(
feature_names: Optional[FeatureNames], feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes], feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType: ) -> DispatchedDataBackendReturnType:
from pandas.api.types import is_categorical_dtype
if (data.dtype.name not in _pandas_dtype_mapper) and not ( if (data.dtype.name not in _pandas_dtype_mapper) and not (
is_categorical_dtype(data.dtype) and enable_categorical is_pd_cat_dtype(data.dtype) and enable_categorical
): ):
_invalid_dataframe_dtype(data) _invalid_dataframe_dtype(data)
if enable_categorical and is_categorical_dtype(data.dtype): if enable_categorical and is_pd_cat_dtype(data.dtype):
data = data.cat.codes data = data.cat.codes
return _from_numpy_array( return _from_numpy_array(
data.values.reshape(data.shape[0], 1).astype("float"), data.values.reshape(data.shape[0], 1).astype("float"),
@@ -1051,7 +1071,7 @@ def dispatch_data_backend(
return _from_scipy_csr( return _from_scipy_csr(
data.tocsr(), missing, threads, feature_names, feature_types data.tocsr(), missing, threads, feature_names, feature_types
) )
if _is_numpy_array(data): if _is_np_array_like(data):
return _from_numpy_array( return _from_numpy_array(
data, missing, threads, feature_names, feature_types, data_split_mode data, missing, threads, feature_names, feature_types, data_split_mode
) )
@@ -1194,7 +1214,7 @@ def dispatch_meta_backend(
if _is_tuple(data): if _is_tuple(data):
_meta_from_tuple(data, name, dtype, handle) _meta_from_tuple(data, name, dtype, handle)
return return
if _is_numpy_array(data): if _is_np_array_like(data):
_meta_from_numpy(data, name, dtype, handle) _meta_from_numpy(data, name, dtype, handle)
return return
if _is_pandas_df(data): if _is_pandas_df(data):
@@ -1281,7 +1301,7 @@ def _proxy_transform(
return _transform_dlpack(data), None, feature_names, feature_types return _transform_dlpack(data), None, feature_names, feature_types
if _is_list(data) or _is_tuple(data): if _is_list(data) or _is_tuple(data):
data = np.array(data) data = np.array(data)
if _is_numpy_array(data): if _is_np_array_like(data):
data, _ = _ensure_np_dtype(data, data.dtype) data, _ = _ensure_np_dtype(data, data.dtype)
return data, None, feature_names, feature_types return data, None, feature_names, feature_types
if _is_scipy_csr(data): if _is_scipy_csr(data):
@@ -1331,7 +1351,7 @@ def dispatch_proxy_set_data(
if not allow_host: if not allow_host:
raise err raise err
if _is_numpy_array(data): if _is_np_array_like(data):
_check_data_shape(data) _check_data_shape(data)
proxy._set_data_from_array(data) # pylint: disable=W0212 proxy._set_data_from_array(data) # pylint: disable=W0212
return return

View File

@@ -31,16 +31,15 @@ def find_lib_path() -> List[str]:
] ]
if sys.platform == "win32": if sys.platform == "win32":
if platform.architecture()[0] == "64bit": # On Windows, Conda may install libs in different paths
dll_path.append(os.path.join(curr_path, "../../windows/x64/Release/")) dll_path.extend(
# hack for pip installation when copy all parent source [
# directory here os.path.join(sys.base_prefix, "bin"),
dll_path.append(os.path.join(curr_path, "./windows/x64/Release/")) os.path.join(sys.base_prefix, "Library"),
else: os.path.join(sys.base_prefix, "Library", "bin"),
dll_path.append(os.path.join(curr_path, "../../windows/Release/")) os.path.join(sys.base_prefix, "Library", "lib"),
# hack for pip installation when copy all parent source ]
# directory here )
dll_path.append(os.path.join(curr_path, "./windows/Release/"))
dll_path = [os.path.join(p, "xgboost.dll") for p in dll_path] dll_path = [os.path.join(p, "xgboost.dll") for p in dll_path]
elif sys.platform.startswith(("linux", "freebsd", "emscripten")): elif sys.platform.startswith(("linux", "freebsd", "emscripten")):
dll_path = [os.path.join(p, "libxgboost.so") for p in dll_path] dll_path = [os.path.join(p, "libxgboost.so") for p in dll_path]

View File

@@ -43,19 +43,6 @@ from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array, _is_pandas_df
from .training import train from .training import train
class XGBClassifierMixIn: # pylint: disable=too-few-public-methods
"""MixIn for classification."""
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
def _load_model_attributes(self, booster: Booster) -> None:
config = json.loads(booster.save_config())
self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"])
# binary classification is treated as regression in XGBoost.
self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_
class XGBRankerMixIn: # pylint: disable=too-few-public-methods class XGBRankerMixIn: # pylint: disable=too-few-public-methods
"""MixIn for ranking, defines the _estimator_type usually defined in scikit-learn """MixIn for ranking, defines the _estimator_type usually defined in scikit-learn
base classes. base classes.
@@ -845,21 +832,38 @@ class XGBModel(XGBModelBase):
self.get_booster().load_model(fname) self.get_booster().load_model(fname)
meta_str = self.get_booster().attr("scikit_learn") meta_str = self.get_booster().attr("scikit_learn")
if meta_str is None: if meta_str is not None:
return meta = json.loads(meta_str)
t = meta.get("_estimator_type", None)
if t is not None and t != self._get_type():
raise TypeError(
"Loading an estimator with different type. Expecting: "
f"{self._get_type()}, got: {t}"
)
meta = json.loads(meta_str)
t = meta.get("_estimator_type", None)
if t is not None and t != self._get_type():
raise TypeError(
"Loading an estimator with different type. Expecting: "
f"{self._get_type()}, got: {t}"
)
self.feature_types = self.get_booster().feature_types self.feature_types = self.get_booster().feature_types
self.get_booster().set_attr(scikit_learn=None) self.get_booster().set_attr(scikit_learn=None)
config = json.loads(self.get_booster().save_config())
self._load_model_attributes(config)
load_model.__doc__ = f"""{Booster.load_model.__doc__}""" load_model.__doc__ = f"""{Booster.load_model.__doc__}"""
def _load_model_attributes(self, config: dict) -> None:
    """Restore model attributes (not hyper-parameters) from a booster config.

    Parameters
    ----------
    config :
        Parsed booster configuration; the ``"learner"`` entry is read for the
        objective name, the gradient booster name and the learner model
        parameters.
    """
    from sklearn.base import is_classifier

    fitted_booster = self.get_booster()
    learner = config["learner"]

    self.objective = learner["objective"]["name"]
    self.booster = learner["gradient_booster"]["name"]
    self.base_score = learner["learner_model_param"]["base_score"]
    self.feature_types = fitted_booster.feature_types

    if not is_classifier(self):
        return

    num_class = int(learner["learner_model_param"]["num_class"])
    # binary classification is treated as regression in XGBoost, so a
    # reported class count below 2 means two classes.
    self.n_classes_ = num_class
    self.n_classes_ = 2 if num_class < 2 else num_class
# pylint: disable=too-many-branches # pylint: disable=too-many-branches
def _configure_fit( def _configure_fit(
self, self,
@@ -1409,7 +1413,7 @@ def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) ->
Number of boosting rounds. Number of boosting rounds.
""", """,
) )
class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase): class XGBClassifier(XGBModel, XGBClassifierBase):
# pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes # pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes
@_deprecate_positional_args @_deprecate_positional_args
def __init__( def __init__(
@@ -1637,10 +1641,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
def classes_(self) -> np.ndarray: def classes_(self) -> np.ndarray:
return np.arange(self.n_classes_) return np.arange(self.n_classes_)
def load_model(self, fname: ModelIn) -> None:
super().load_model(fname)
self._load_model_attributes(self.get_booster())
@xgboost_model_doc( @xgboost_model_doc(
"scikit-learn API for XGBoost random forest classification.", "scikit-learn API for XGBoost random forest classification.",
@@ -2093,7 +2093,17 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
""" """
X, qid = _get_qid(X, None) X, qid = _get_qid(X, None)
Xyq = DMatrix(X, y, qid=qid) # fixme(jiamingy): base margin and group weight is not yet supported. We might
# need to make extra special fields in the dataframe.
Xyq = DMatrix(
X,
y,
qid=qid,
missing=self.missing,
enable_categorical=self.enable_categorical,
nthread=self.n_jobs,
feature_types=self.feature_types,
)
if callable(self.eval_metric): if callable(self.eval_metric):
metric = ltr_metric_decorator(self.eval_metric, self.n_jobs) metric = ltr_metric_decorator(self.eval_metric, self.n_jobs)
result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric) result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric)

View File

@@ -22,7 +22,7 @@ from typing import (
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from pyspark import SparkContext, cloudpickle from pyspark import RDD, SparkContext, cloudpickle
from pyspark.ml import Estimator, Model from pyspark.ml import Estimator, Model
from pyspark.ml.functions import array_to_vector, vector_to_array from pyspark.ml.functions import array_to_vector, vector_to_array
from pyspark.ml.linalg import VectorUDT from pyspark.ml.linalg import VectorUDT
@@ -44,6 +44,7 @@ from pyspark.ml.util import (
MLWritable, MLWritable,
MLWriter, MLWriter,
) )
from pyspark.resource import ResourceProfileBuilder, TaskResourceRequests
from pyspark.sql import Column, DataFrame from pyspark.sql import Column, DataFrame
from pyspark.sql.functions import col, countDistinct, pandas_udf, rand, struct from pyspark.sql.functions import col, countDistinct, pandas_udf, rand, struct
from pyspark.sql.types import ( from pyspark.sql.types import (
@@ -88,6 +89,7 @@ from .utils import (
_get_rabit_args, _get_rabit_args,
_get_spark_session, _get_spark_session,
_is_local, _is_local,
_is_standalone_or_localcluster,
deserialize_booster, deserialize_booster,
deserialize_xgb_model, deserialize_xgb_model,
get_class_name, get_class_name,
@@ -342,6 +344,54 @@ class _SparkXGBParams(
predict_params[param.name] = self.getOrDefault(param) predict_params[param.name] = self.getOrDefault(param)
return predict_params return predict_params
def _validate_gpu_params(self) -> None:
    """Validate the GPU parameters and the Spark GPU configuration.

    Nothing is checked unless GPU training was requested through the
    ``device`` or ``use_gpu`` params.

    Raises
    ------
    ValueError
        If ``spark.executor.resource.gpu.amount`` is not set, or, when
        ``spark.task.resource.gpu.amount`` is still required, if it is
        missing or smaller than 1.
    """
    import re

    gpu_requested = use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(
        self.use_gpu
    )
    if not gpu_requested:
        return

    ss = _get_spark_session()
    sc = ss.sparkContext

    if _is_local(sc):
        # GPU training in Spark local mode is supported for debugging purposes
        # only, so a warning is printed instead of checking the real number of
        # GPUs and raising an exception.
        get_logger(self.__class__.__name__).warning(
            "You have enabled GPU in spark local mode. Please make sure your"
            " local node has at least %d GPUs",
            self.getOrDefault(self.num_workers),
        )
        return

    executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
    if executor_gpus is None:
        raise ValueError(
            "The `spark.executor.resource.gpu.amount` is required for training"
            " on GPU."
        )

    # Compare the Spark version numerically: a plain string comparison is
    # lexicographic and would rank e.g. "3.10.0" below "3.4.0".
    version_match = re.match(r"(\d+)\.(\d+)", ss.version)
    spark_version = (
        tuple(int(part) for part in version_match.groups())
        if version_match
        else (0, 0)
    )

    if spark_version >= (3, 4) and _is_standalone_or_localcluster(sc):
        # Stage-level scheduling is enabled for spark 3.4.0+ in standalone or
        # local-cluster mode, which doesn't require
        # spark.task.resource.gpu.amount to be set explicitly.
        return

    gpu_per_task = sc.getConf().get("spark.task.resource.gpu.amount")
    if gpu_per_task is None:
        raise ValueError(
            "The `spark.task.resource.gpu.amount` is required for training"
            " on GPU."
        )

    if float(gpu_per_task) < 1.0:
        raise ValueError(
            "XGBoost doesn't support GPU fractional configurations. "
            "Please set `spark.task.resource.gpu.amount=spark.executor"
            ".resource.gpu.amount`"
        )

    if float(gpu_per_task) > 1.0:
        get_logger(self.__class__.__name__).warning(
            "%s GPUs for each Spark task is configured, but each "
            "XGBoost training task uses only 1 GPU.",
            gpu_per_task,
        )
def _validate_params(self) -> None: def _validate_params(self) -> None:
# pylint: disable=too-many-branches # pylint: disable=too-many-branches
init_model = self.getOrDefault("xgb_model") init_model = self.getOrDefault("xgb_model")
@@ -421,53 +471,7 @@ class _SparkXGBParams(
"`pyspark.ml.linalg.Vector` type." "`pyspark.ml.linalg.Vector` type."
) )
if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu): self._validate_gpu_params()
gpu_per_task = (
_get_spark_session()
.sparkContext.getConf()
.get("spark.task.resource.gpu.amount")
)
is_local = _is_local(_get_spark_session().sparkContext)
if is_local:
# checking spark local mode.
if gpu_per_task is not None:
raise RuntimeError(
"The spark local mode does not support gpu configuration."
"Please remove spark.executor.resource.gpu.amount and "
"spark.task.resource.gpu.amount"
)
# Support GPU training in Spark local mode is just for debugging
# purposes, so it's okay for printing the below warning instead of
# checking the real gpu numbers and raising the exception.
get_logger(self.__class__.__name__).warning(
"You have enabled GPU in spark local mode. Please make sure your"
" local node has at least %d GPUs",
self.getOrDefault(self.num_workers),
)
else:
# checking spark non-local mode.
if gpu_per_task is not None:
if float(gpu_per_task) < 1.0:
raise ValueError(
"XGBoost doesn't support GPU fractional configurations. "
"Please set `spark.task.resource.gpu.amount=spark.executor"
".resource.gpu.amount`"
)
if float(gpu_per_task) > 1.0:
get_logger(self.__class__.__name__).warning(
"%s GPUs for each Spark task is configured, but each "
"XGBoost training task uses only 1 GPU.",
gpu_per_task,
)
else:
raise ValueError(
"The `spark.task.resource.gpu.amount` is required for training"
" on GPU."
)
def _validate_and_convert_feature_col_as_float_col_list( def _validate_and_convert_feature_col_as_float_col_list(
@@ -592,6 +596,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
arbitrary_params_dict={}, arbitrary_params_dict={},
) )
self.logger = get_logger(self.__class__.__name__)
def setParams(self, **kwargs: Any) -> None: # pylint: disable=invalid-name def setParams(self, **kwargs: Any) -> None: # pylint: disable=invalid-name
""" """
Set params for the estimator. Set params for the estimator.
@@ -894,6 +900,116 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
return booster_params, train_call_kwargs_params, dmatrix_kwargs return booster_params, train_call_kwargs_params, dmatrix_kwargs
def _skip_stage_level_scheduling(self) -> bool:
    # pylint: disable=too-many-return-statements
    """Decide whether stage-level scheduling should be skipped for training.

    Stage-level scheduling is only attempted for GPU training, and only when
    the Spark version, cluster manager and executor/task resource settings
    allow it.

    Returns
    -------
    bool
        ``True`` to skip stage-level scheduling, ``False`` when it can be
        enabled.
    """
    if not (
        use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu)
    ):
        # CPU training doesn't require stage-level scheduling
        return True

    ss = _get_spark_session()
    sc = ss.sparkContext

    # Compare (major, minor) numerically. A plain string comparison orders
    # versions lexicographically, so e.g. "3.10.0" would sort before "3.4.0".
    try:
        too_old = tuple(int(part) for part in ss.version.split(".")[:2]) < (3, 4)
    except ValueError:
        # Unrecognised version format: fall back to the string comparison.
        too_old = ss.version < "3.4.0"

    if too_old:
        self.logger.info(
            "Stage-level scheduling in xgboost requires spark version 3.4.0+"
        )
        return True

    if not _is_standalone_or_localcluster(sc):
        self.logger.info(
            "Stage-level scheduling in xgboost requires spark standalone or "
            "local-cluster mode"
        )
        return True

    executor_cores = sc.getConf().get("spark.executor.cores")
    executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
    if executor_cores is None or executor_gpus is None:
        self.logger.info(
            "Stage-level scheduling in xgboost requires spark.executor.cores, "
            "spark.executor.resource.gpu.amount to be set."
        )
        return True

    if int(executor_cores) == 1:
        # there will be only 1 task running at any time.
        self.logger.info(
            "Stage-level scheduling in xgboost requires spark.executor.cores > 1 "
        )
        return True

    if int(executor_gpus) > 1:
        # For spark.executor.resource.gpu.amount > 1, we suppose user knows how
        # to configure to make xgboost run successfully.
        self.logger.info(
            "Stage-level scheduling in xgboost will not work "
            "when spark.executor.resource.gpu.amount>1"
        )
        return True

    task_gpu_amount = sc.getConf().get("spark.task.resource.gpu.amount")
    if task_gpu_amount is None:
        # The ETL tasks will not grab a gpu when spark.task.resource.gpu.amount
        # is not set, but with stage-level scheduling, we can make training
        # task grab the gpu.
        return False

    if float(task_gpu_amount) == float(executor_gpus):
        # spark.executor.resource.gpu.amount=spark.task.resource.gpu.amount
        # results in only 1 task running at a time, which may cause perf issue.
        return True

    # We can enable stage-level scheduling
    return False
def _try_stage_level_scheduling(self, rdd: RDD) -> RDD:
    """Attach a training resource profile to ``rdd`` when stage-level
    scheduling is applicable; otherwise return ``rdd`` unchanged."""
    if self._skip_stage_level_scheduling():
        return rdd

    session = _get_spark_session()

    # The skip check returns True when spark.executor.cores is unset, so the
    # value is expected to be present here.
    executor_cores = session.sparkContext.getConf().get("spark.executor.cores")
    assert executor_cores is not None

    # Spark-rapids is a project to leverage GPUs to accelerate spark SQL.
    # When it is enabled, no other ETL gpu task may run alongside a training
    # task (to avoid GPU OOM), so a training task takes all executor cores.
    spark_plugins = session.conf.get("spark.plugins", " ")
    assert spark_plugins is not None
    spark_rapids_sql_enabled = session.conf.get("spark.rapids.sql.enabled", "true")
    assert spark_rapids_sql_enabled is not None

    rapids_sql_active = (
        "com.nvidia.spark.SQLPlugin" in spark_plugins
        and spark_rapids_sql_enabled.lower() == "true"
    )
    if rapids_sql_active:
        task_cores = int(executor_cores)
    else:
        # Requesting more than half of the executor cores per task means two
        # training tasks cannot share an executor, so they are sent to
        # different executors.
        #
        # Please note that we can't use GPU to limit the concurrent tasks
        # because of https://issues.apache.org/jira/browse/SPARK-45527.
        task_cores = (int(executor_cores) // 2) + 1

    task_gpus = 1.0
    task_requests = TaskResourceRequests().cpus(task_cores).resource("gpu", task_gpus)
    # `build` is read as an attribute here, exactly as in the original code.
    profile = ResourceProfileBuilder().require(task_requests).build

    self.logger.info(
        "XGBoost training tasks require the resource(cores=%s, gpu=%s).",
        task_cores,
        task_gpus,
    )
    return rdd.withResources(profile)
def _fit(self, dataset: DataFrame) -> "_SparkXGBModel": def _fit(self, dataset: DataFrame) -> "_SparkXGBModel":
# pylint: disable=too-many-statements, too-many-locals # pylint: disable=too-many-statements, too-many-locals
self._validate_params() self._validate_params()
@@ -994,14 +1110,16 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
) )
def _run_job() -> Tuple[str, str]: def _run_job() -> Tuple[str, str]:
ret = ( rdd = (
dataset.mapInPandas( dataset.mapInPandas(
_train_booster, schema="config string, booster string" # type: ignore _train_booster, # type: ignore
schema="config string, booster string",
) )
.rdd.barrier() .rdd.barrier()
.mapPartitions(lambda x: x) .mapPartitions(lambda x: x)
.collect()[0]
) )
rdd_with_resource = self._try_stage_level_scheduling(rdd)
ret = rdd_with_resource.collect()[0]
return ret[0], ret[1] return ret[0], ret[1]
get_logger("XGBoost-PySpark").info( get_logger("XGBoost-PySpark").info(

View File

@@ -129,6 +129,13 @@ def _is_local(spark_context: SparkContext) -> bool:
return spark_context._jsc.sc().isLocal() return spark_context._jsc.sc().isLocal()
def _is_standalone_or_localcluster(spark_context: SparkContext) -> bool:
    """Return whether ``spark.master`` starts with ``spark://`` or
    ``local-cluster``; ``False`` when ``spark.master`` is not set."""
    master = spark_context.getConf().get("spark.master")
    if master is None:
        return False
    return master.startswith(("spark://", "local-cluster"))
def _get_gpu_id(task_context: TaskContext) -> int: def _get_gpu_id(task_context: TaskContext) -> int:
"""Get the gpu id from the task resources""" """Get the gpu id from the task resources"""
if task_context is None: if task_context is None:

View File

@@ -75,3 +75,28 @@ def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:
with pytest.raises(ValueError, match="Either `group` or `qid`."): with pytest.raises(ValueError, match="Either `group` or `qid`."):
ranker.fit(df, y, eval_set=[(X, y)]) ranker.fit(df, y, eval_set=[(X, y)])
def run_ranking_categorical(device: str) -> None:
    """Test LTR with categorical features."""
    from sklearn.model_selection import cross_val_score

    X, y = tm.make_categorical(
        n_samples=512, n_features=10, n_categories=3, onehot=False
    )
    # Sorted query ids drawn from {0, 1, 2}, stored as an extra "qid" column.
    rng = np.random.default_rng(1994)
    X["qid"] = np.sort(rng.choice(3, size=y.shape[0]))

    # Fit and score on the training data.
    ranker = xgb.XGBRanker(enable_categorical=True, device=device)
    ranker.fit(X, y)
    assert ranker.score(X, y) > 0.9

    # test using the score function inside sklearn.
    ranker = xgb.XGBRanker(enable_categorical=True, device=device)
    for fold_score in cross_val_score(ranker, X, y):
        assert fold_score > 0.7

View File

@@ -384,7 +384,8 @@ class PrivateMmapConstStream : public AlignedResourceReadStream {
* @param length See the `length` parameter of `mmap` for details. * @param length See the `length` parameter of `mmap` for details.
*/ */
explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length) explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
: AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {} : AlignedResourceReadStream{std::shared_ptr<MmapResource>{ // NOLINT
new MmapResource{std::move(path), offset, length}}} {}
~PrivateMmapConstStream() noexcept(false) override; ~PrivateMmapConstStream() noexcept(false) override;
}; };

View File

@@ -76,7 +76,7 @@ class RefResourceView {
[[nodiscard]] size_type size() const { return size_; } // NOLINT [[nodiscard]] size_type size() const { return size_; } // NOLINT
[[nodiscard]] size_type size_bytes() const { // NOLINT [[nodiscard]] size_type size_bytes() const { // NOLINT
return Span{data(), size()}.size_bytes(); return Span<const value_type>{data(), size()}.size_bytes();
} }
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT [[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT [[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT

View File

@@ -3,14 +3,23 @@
*/ */
#include "threading_utils.h" #include "threading_utils.h"
#include <fstream> #include <algorithm> // for max
#include <string> #include <exception> // for exception
#include <filesystem> // for path, exists
#include <fstream> // for ifstream
#include <string> // for string
#include "xgboost/logging.h" #include "common.h" // for DivRoundUp
namespace xgboost { namespace xgboost::common {
namespace common { /**
int32_t GetCfsCPUCount() noexcept { * Modified from
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
*
* MIT License: Copyright (c) 2016 Domagoj Šarić
*/
std::int32_t GetCGroupV1Count(std::filesystem::path const& quota_path,
std::filesystem::path const& peroid_path) {
#if defined(__linux__) #if defined(__linux__)
// https://bugs.openjdk.java.net/browse/JDK-8146115 // https://bugs.openjdk.java.net/browse/JDK-8146115
// http://hg.openjdk.java.net/jdk/hs/rev/7f22774a5f42 // http://hg.openjdk.java.net/jdk/hs/rev/7f22774a5f42
@@ -31,8 +40,8 @@ int32_t GetCfsCPUCount() noexcept {
} }
}; };
// complete fair scheduler from Linux // complete fair scheduler from Linux
auto const cfs_quota(read_int("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")); auto const cfs_quota(read_int(quota_path.c_str()));
auto const cfs_period(read_int("/sys/fs/cgroup/cpu/cpu.cfs_period_us")); auto const cfs_period(read_int(peroid_path.c_str()));
if ((cfs_quota > 0) && (cfs_period > 0)) { if ((cfs_quota > 0) && (cfs_period > 0)) {
return std::max(cfs_quota / cfs_period, 1); return std::max(cfs_quota / cfs_period, 1);
} }
@@ -40,6 +49,47 @@ int32_t GetCfsCPUCount() noexcept {
return -1; return -1;
} }
/**
 * Read two integers from the file at `bandwidth_path` and derive a CPU count from them.
 *
 * Returns max(DivRoundUp(first, second), 1) when both values are positive, otherwise -1.
 * On non-Linux builds the file is never read and -1 is returned.
 */
std::int32_t GetCGroupV2Count(std::filesystem::path const& bandwidth_path) noexcept(true) {
  std::int32_t n_cpus{-1};
#if defined(__linux__)
  // NOTE(review): presumably the cgroup v2 `cpu.max` quota followed by the period -- confirm
  // against the kernel documentation.
  std::int32_t quota{0}, period{0};
  try {
    std::ifstream fin{bandwidth_path, std::ios::in};
    fin >> quota;
    fin >> period;
  } catch (std::exception const&) {
    LOG(WARNING) << "Invalid cgroupv2 file.";
    return n_cpus;
  }
  // A failed extraction leaves the values non-positive, in which case -1 is returned.
  if (quota > 0 && period > 0) {
    n_cpus = std::max(common::DivRoundUp(quota, period), 1);
  }
#endif  // defined(__linux__)
  return n_cpus;
}
/**
 * Get the CPU limit from the cgroup files, trying cgroup v2 first and then cgroup v1.
 *
 * Returns -1 when neither set of files exists or when probing the file system fails.
 */
std::int32_t GetCfsCPUCount() noexcept {
  namespace fs = std::filesystem;
  // This function is noexcept, but the throwing overload of fs::exists (and fs::path
  // construction) can raise. Catch here so that a probing failure is reported as "no
  // limit found" instead of terminating the process.
  try {
    fs::path const bandwidth_path{"/sys/fs/cgroup/cpu.max"};
    if (fs::exists(bandwidth_path)) {
      return GetCGroupV2Count(bandwidth_path);
    }

    fs::path const quota_path{"/sys/fs/cgroup/cpu/cpu.cfs_quota_us"};
    fs::path const period_path{"/sys/fs/cgroup/cpu/cpu.cfs_period_us"};
    if (fs::exists(quota_path) && fs::exists(period_path)) {
      return GetCGroupV1Count(quota_path, period_path);
    }
  } catch (std::exception const&) {
    return -1;
  }

  return -1;
}
std::int32_t OmpGetNumThreads(std::int32_t n_threads) { std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
// Don't use parallel if we are in a parallel region. // Don't use parallel if we are in a parallel region.
if (omp_in_parallel()) { if (omp_in_parallel()) {
@@ -54,5 +104,4 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
n_threads = std::max(n_threads, 1); n_threads = std::max(n_threads, 1);
return n_threads; return n_threads;
} }
} // namespace common } // namespace xgboost::common
} // namespace xgboost

View File

@@ -253,11 +253,6 @@ inline std::int32_t OmpGetThreadLimit() {
* \brief Get thread limit from CFS. * \brief Get thread limit from CFS.
* *
* This function has non-trivial overhead and should not be called repeatedly. * This function has non-trivial overhead and should not be called repeatedly.
*
* Modified from
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
*
* MIT License: Copyright (c) 2016 Domagoj Šarić
*/ */
std::int32_t GetCfsCPUCount() noexcept; std::int32_t GetCfsCPUCount() noexcept;

View File

@@ -1317,7 +1317,9 @@ class LearnerImpl : public LearnerIO {
if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) { if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) {
metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_)); metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_));
auto config = obj_->DefaultMetricConfig(); auto config = obj_->DefaultMetricConfig();
metrics_.back()->LoadConfig(config); if (!IsA<Null>(config)) {
metrics_.back()->LoadConfig(config);
}
metrics_.back()->Configure({cfg_.begin(), cfg_.end()}); metrics_.back()->Configure({cfg_.begin(), cfg_.end()});
} }

View File

@@ -268,6 +268,13 @@ class PseudoHuberRegression : public FitIntercept {
} }
FromJson(in["pseudo_huber_param"], &param_); FromJson(in["pseudo_huber_param"], &param_);
} }
// Build the configuration for the default evaluation metric: its name plus this
// objective's `pseudo_huber_param`.
[[nodiscard]] Json DefaultMetricConfig() const override {
  // The parameter is serialised below, so it must have been initialised already.
  CHECK(param_.GetInitialised());
  Json metric_config{Object{}};
  metric_config["name"] = String{this->DefaultEvalMetric()};
  metric_config["pseudo_huber_param"] = ToJson(param_);
  return metric_config;
}
}; };
XGBOOST_REGISTER_OBJECTIVE(PseudoHuberRegression, "reg:pseudohubererror") XGBOOST_REGISTER_OBJECTIVE(PseudoHuberRegression, "reg:pseudohubererror")

View File

@@ -8,13 +8,18 @@ echo "--- Build XGBoost JVM packages scala 2.12"
tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \ tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION} ${SPARK_VERSION}
echo "--- Stash XGBoost4J JARs (Scala 2.12)"
buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar"
echo "--- Build XGBoost JVM packages scala 2.13" echo "--- Build XGBoost JVM packages scala 2.13"
tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \ tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION} "" "" "true" ${SPARK_VERSION} "" "" "true"
echo "--- Stash XGBoost4J JARs" echo "--- Stash XGBoost4J JARs (Scala 2.13)"
buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar" buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar" buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar" buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"

View File

@@ -0,0 +1,8 @@
steps:
- block: ":rocket: Run this test job"
if: build.pull_request.id != null || build.branch =~ /^dependabot\//
- label: ":macos: Build and Test XGBoost for MacOS M1 with Clang 11"
command: "tests/buildkite/test-macos-m1-clang11.sh"
key: mac-m1-appleclang11
agents:
queue: mac-mini-m1

View File

@@ -0,0 +1,50 @@
#!/bin/bash
# Build libxgboost4j.dylib on a Mac M1 agent, upload it, then check that XGBoost
# builds with LLVM Clang 11.

set -euo pipefail

# NOTE(review): presumably defines is_pull_request, is_release_branch and
# BRANCH_NAME used below -- confirm against conftest.sh.
source tests/buildkite/conftest.sh

# Display system info
echo "--- Display system information"
set -x
system_profiler SPSoftwareDataType
sysctl -n machdep.cpu.brand_string
uname -m
set +x

# Build XGBoost4J binary
echo "--- Build libxgboost4j.dylib"
set -x
mkdir build
pushd build
# Assign before exporting: `export VAR=$(cmd)` reports the status of `export`,
# which would hide a failing java_home from `set -e`.
JAVA_HOME=$(/usr/libexec/java_home)
export JAVA_HOME
cmake .. -GNinja -DJVM_BINDINGS=ON -DUSE_OPENMP=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=10.15
ninja -v
popd
rm -rf build
set +x

# Upload the JVM native library (this step does not produce a Python wheel).
echo "--- Upload libxgboost4j.dylib"
set -x
pushd lib
mv -v libxgboost4j.dylib libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib
buildkite-agent artifact upload libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib
# Publish to S3 only for non-pull-request builds of a release branch.
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
  aws s3 cp libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib \
    s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \
    --acl public-read --no-progress
fi
popd
set +x

# Ensure that XGBoost can be built with Clang 11
echo "--- Build and Test XGBoost with MacOS M1, Clang 11"
set -x
LLVM11_PATH=$(brew --prefix llvm\@11)
mkdir build
pushd build
cmake .. -GNinja -DCMAKE_C_COMPILER=${LLVM11_PATH}/bin/clang \
  -DCMAKE_CXX_COMPILER=${LLVM11_PATH}/bin/clang++ -DGOOGLE_TEST=ON \
  -DUSE_DMLC_GTEST=ON
ninja -v

View File

@@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7 FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG ARG CUDA_VERSION_ARG
ARG NCCL_VERSION_ARG ARG NCCL_VERSION_ARG
ARG RAPIDS_VERSION_ARG ARG RAPIDS_VERSION_ARG

View File

@@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7 FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG ARG CUDA_VERSION_ARG
# Install all basic requirements # Install all basic requirements

View File

@@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7 FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG ARG CUDA_VERSION_ARG
ARG NCCL_VERSION_ARG ARG NCCL_VERSION_ARG

View File

@@ -27,6 +27,9 @@ fi
mvn_profile_string="" mvn_profile_string=""
if [ "x$use_scala213" != "x" ]; then if [ "x$use_scala213" != "x" ]; then
export mvn_profile_string="-Pdefault,scala-2.13" export mvn_profile_string="-Pdefault,scala-2.13"
cd ..
python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
cd jvm-packages
fi fi
mvn --no-transfer-progress package $mvn_profile_string -Dspark.version=${spark_version} $gpu_options mvn --no-transfer-progress package $mvn_profile_string -Dspark.version=${spark_version} $gpu_options

View File

@@ -32,11 +32,10 @@ dependencies:
- jsonschema - jsonschema
- boto3 - boto3
- awscli - awscli
- py-ubjson
- cffi - cffi
- pyarrow - pyarrow
- pyspark>=3.4.0 - pyspark>=3.4.0
- cloudpickle - cloudpickle
- pip: - pip:
- sphinx_rtd_theme - sphinx_rtd_theme
- datatable - py-ubjson

View File

@@ -27,6 +27,9 @@ rm -rf ../build/
# Deploy to S3 bucket xgboost-maven-repo # Deploy to S3 bucket xgboost-maven-repo
mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests
# Deploy scala 2.13 to S3 bucket xgboost-maven-repo # Deploy scala 2.13 to S3 bucket xgboost-maven-repo
cd ..
python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
cd jvm-packages/
mvn --no-transfer-progress package deploy -P release-to-s3,default,scala-2.13 -Dspark.version=${spark_version} -DskipTests mvn --no-transfer-progress package deploy -P release-to-s3,default,scala-2.13 -Dspark.version=${spark_version} -DskipTests

View File

@@ -21,9 +21,18 @@ if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
fi fi
# including maven profiles for different scala versions: 2.12 is the default at the moment. # including maven profiles for different scala versions: 2.12 is the default at the moment.
for _maven_profile_string in "" "-Pdefault,scala-2.13"; do for scala_binary_version in "2.12" "2.13"; do
cd ..
python dev/change_scala_version.py --scala-version ${scala_binary_version}
cd jvm-packages
scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout) scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout)
scala_binary_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.binary.version -q -DforceStdout) if [[ "$scala_binary_version" == "2.12" ]]; then
_maven_profile_string=""
elif [[ "$scala_binary_version" == "2.13" ]]; then
_maven_profile_string="-Pdefault,scala-2.13"
else
echo "Unexpected scala version: $scala_version ($scala_binary_version)."
fi
# Install XGBoost4J JAR into local Maven repository # Install XGBoost4J JAR into local Maven repository
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar

View File

@@ -148,7 +148,8 @@ TEST(IO, Resource) {
fout << 1.0 << std::endl; fout << 1.0 << std::endl;
fout.close(); fout.close();
auto resource = std::make_shared<MmapResource>(path, 0, sizeof(double)); auto resource = std::shared_ptr<MmapResource>{
new MmapResource{path, 0, sizeof(double)}};
ASSERT_EQ(resource->Size(), sizeof(double)); ASSERT_EQ(resource->Size(), sizeof(double));
ASSERT_EQ(resource->Type(), ResourceHandler::kMmap); ASSERT_EQ(resource->Type(), ResourceHandler::kMmap);
ASSERT_EQ(resource->DataAs<double>()[0], val); ASSERT_EQ(resource->DataAs<double>()[0], val);

View File

@@ -6,6 +6,7 @@
#include <xgboost/objective.h> #include <xgboost/objective.h>
#include "../helpers.h" #include "../helpers.h"
#include "../objective_helpers.h"
TEST(Objective, UnknownFunction) { TEST(Objective, UnknownFunction) {
xgboost::ObjFunction* obj = nullptr; xgboost::ObjFunction* obj = nullptr;
@@ -43,4 +44,61 @@ TEST(Objective, PredTransform) {
ASSERT_TRUE(predts.HostCanWrite()); ASSERT_TRUE(predts.HostCanWrite());
} }
} }
/**
 * Parameterised fixture: for the objective named by the test parameter, train one iteration
 * and check the saved objective config and the objective's default metric config.
 */
class TestDefaultObjConfig : public ::testing::TestWithParam<std::string> {
  Context ctx_;

 public:
  /**
   * Run the check for one objective.
   *
   * @param objective Name of the objective to create and train with.
   */
  void Run(std::string objective) {
    auto Xy = MakeFmatForObjTest(objective);
    std::unique_ptr<Learner> learner{Learner::Create({Xy})};
    std::unique_ptr<ObjFunction> objfn{ObjFunction::Create(objective, &ctx_)};

    learner->SetParam("objective", objective);
    // Objectives whose name contains "multi" or "quantile" get an extra parameter, set on
    // both the learner and the stand-alone objective.
    if (objective.find("multi") != std::string::npos) {
      learner->SetParam("num_class", "3");
      objfn->Configure(Args{{"num_class", "3"}});
    } else if (objective.find("quantile") != std::string::npos) {
      learner->SetParam("quantile_alpha", "0.5");
      objfn->Configure(Args{{"quantile_alpha", "0.5"}});
    } else {
      objfn->Configure(Args{});
    }
    // Train and evaluate one iteration before inspecting the saved configuration.
    learner->Configure();
    learner->UpdateOneIter(0, Xy);
    learner->EvalOneIter(0, {Xy}, {"train"});
    Json config{Object{}};
    learner->SaveConfig(&config);
    // The saved objective config must at least carry a "name" entry.
    auto jobj = get<Object const>(config["learner"]["objective"]);

    ASSERT_TRUE(jobj.find("name") != jobj.cend());
    // FIXME(jiamingy): We should have the following check, but some legacy parameter like
    // "pos_weight", "delta_step" in objectives are not in metrics.

    // if (jobj.size() > 1) {
    //   ASSERT_FALSE(IsA<Null>(objfn->DefaultMetricConfig()));
    // }
    auto mconfig = objfn->DefaultMetricConfig();
    if (!IsA<Null>(mconfig)) {
      // make sure metric can handle it: loading the config and saving it again must give
      // back an equal config.
      std::unique_ptr<Metric> metricfn{Metric::Create(get<String const>(mconfig["name"]), &ctx_)};
      metricfn->LoadConfig(mconfig);
      Json loaded(Object{});
      metricfn->SaveConfig(&loaded);
      metricfn->Configure(Args{});
      ASSERT_EQ(mconfig, loaded);
    }
  }
};
// Run the default-config check for the objective supplied as the test parameter.
TEST_P(TestDefaultObjConfig, Objective) {
  std::string objective = GetParam();
  this->Run(objective);
}

// Instantiate the suite with the objective names returned by MakeObjNamesForTest(); test
// names are produced by ObjTestNameGenerator.
INSTANTIATE_TEST_SUITE_P(Objective, TestDefaultObjConfig,
                         ::testing::ValuesIn(MakeObjNamesForTest()),
                         [](const ::testing::TestParamInfo<TestDefaultObjConfig::ParamType>& info) {
                           return ObjTestNameGenerator(info);
                         });
} // namespace xgboost } // namespace xgboost

View File

@@ -0,0 +1,31 @@
/**
* Copyright (c) 2023, XGBoost contributors
*/
#include "objective_helpers.h"
#include "../../src/common/linalg_op.h" // for begin, end
#include "helpers.h" // for RandomDataGenerator
namespace xgboost {
// Create a 10x10 random DMatrix for objective tests. Label bounds are set to [1, 10] for
// every row; when the objective name contains "rank:", labels alternate between 1 and 0.
std::shared_ptr<DMatrix> MakeFmatForObjTest(std::string const& obj) {
  auto constexpr kRows = 10, kCols = 10;
  auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);

  // Lower and upper label bounds, one pair per row.
  auto& h_upper = p_fmat->Info().labels_upper_bound_.HostVector();
  auto& h_lower = p_fmat->Info().labels_lower_bound_.HostVector();
  h_lower.resize(kRows);
  h_upper.resize(kRows);
  for (std::size_t row = 0; row < kRows; ++row) {
    h_lower[row] = 1;
    h_upper[row] = 10;
  }

  bool const is_ranking = obj.find("rank:") != std::string::npos;
  if (is_ranking) {
    // Overwrite the labels with an alternating 1, 0, 1, 0, ... pattern.
    auto h_label = p_fmat->Info().labels.HostView();
    std::size_t idx = 0;
    for (auto& label : h_label) {
      label = (idx % 2 == 0);
      ++idx;
    }
  }
  return p_fmat;
}
} // namespace xgboost

View File

@@ -1,6 +1,8 @@
/** /**
* Copyright (c) 2023, XGBoost contributors * Copyright (c) 2023, XGBoost contributors
*/ */
#pragma once
#include <dmlc/registry.h> // for Registry #include <dmlc/registry.h> // for Registry
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <xgboost/objective.h> // for ObjFunctionReg #include <xgboost/objective.h> // for ObjFunctionReg
@@ -29,4 +31,6 @@ inline std::string ObjTestNameGenerator(const ::testing::TestParamInfo<ParamType
} }
return name; return name;
}; };
std::shared_ptr<DMatrix> MakeFmatForObjTest(std::string const& obj);
} // namespace xgboost } // namespace xgboost

View File

@@ -655,33 +655,11 @@ TEST_F(InitBaseScore, InitWithPredict) { this->TestInitWithPredt(); }
TEST_F(InitBaseScore, UpdateProcess) { this->TestUpdateProcess(); } TEST_F(InitBaseScore, UpdateProcess) { this->TestUpdateProcess(); }
class TestColumnSplit : public ::testing::TestWithParam<std::string> { class TestColumnSplit : public ::testing::TestWithParam<std::string> {
static auto MakeFmat(std::string const& obj) {
auto constexpr kRows = 10, kCols = 10;
auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
auto& h_upper = p_fmat->Info().labels_upper_bound_.HostVector();
auto& h_lower = p_fmat->Info().labels_lower_bound_.HostVector();
h_lower.resize(kRows);
h_upper.resize(kRows);
for (size_t i = 0; i < kRows; ++i) {
h_lower[i] = 1;
h_upper[i] = 10;
}
if (obj.find("rank:") != std::string::npos) {
auto h_label = p_fmat->Info().labels.HostView();
std::size_t k = 0;
for (auto& v : h_label) {
v = k % 2 == 0;
++k;
}
}
return p_fmat;
};
void TestBaseScore(std::string objective, float expected_base_score, Json expected_model) { void TestBaseScore(std::string objective, float expected_base_score, Json expected_model) {
auto const world_size = collective::GetWorldSize(); auto const world_size = collective::GetWorldSize();
auto const rank = collective::GetRank(); auto const rank = collective::GetRank();
auto p_fmat = MakeFmat(objective); auto p_fmat = MakeFmatForObjTest(objective);
std::shared_ptr<DMatrix> sliced{p_fmat->SliceCol(world_size, rank)}; std::shared_ptr<DMatrix> sliced{p_fmat->SliceCol(world_size, rank)};
std::unique_ptr<Learner> learner{Learner::Create({sliced})}; std::unique_ptr<Learner> learner{Learner::Create({sliced})};
learner->SetParam("tree_method", "approx"); learner->SetParam("tree_method", "approx");
@@ -705,7 +683,7 @@ class TestColumnSplit : public ::testing::TestWithParam<std::string> {
public: public:
void Run(std::string objective) { void Run(std::string objective) {
auto p_fmat = MakeFmat(objective); auto p_fmat = MakeFmatForObjTest(objective);
std::unique_ptr<Learner> learner{Learner::Create({p_fmat})}; std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
learner->SetParam("tree_method", "approx"); learner->SetParam("tree_method", "approx");
learner->SetParam("objective", objective); learner->SetParam("objective", objective);

View File

@@ -9,7 +9,7 @@ import pytest
import xgboost as xgb import xgboost as xgb
from xgboost import testing as tm from xgboost import testing as tm
from xgboost.testing.ranking import run_ranking_qid_df from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df
sys.path.append("tests/python") sys.path.append("tests/python")
import test_with_sklearn as twskl # noqa import test_with_sklearn as twskl # noqa
@@ -165,6 +165,11 @@ def test_ranking_qid_df():
run_ranking_qid_df(cudf, "gpu_hist") run_ranking_qid_df(cudf, "gpu_hist")
@pytest.mark.skipif(**tm.no_pandas())
def test_ranking_categorical() -> None:
    """Run the shared categorical ranking test on the ``cuda`` device."""
    run_ranking_categorical("cuda")
@pytest.mark.skipif(**tm.no_cupy()) @pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.mgpu @pytest.mark.mgpu
def test_device_ordinal() -> None: def test_device_ordinal() -> None:

View File

@@ -211,7 +211,7 @@ class TestPandas:
y = np.random.randn(kRows) y = np.random.randn(kRows)
w = np.random.uniform(size=kRows).astype(np.float32) w = np.random.uniform(size=kRows).astype(np.float32)
w_pd = pd.DataFrame(w) w_pd = pd.DataFrame(w)
data = xgb.DMatrix(X, y, w_pd) data = xgb.DMatrix(X, y, weight=w_pd)
assert data.num_row() == kRows assert data.num_row() == kRows
assert data.num_col() == kCols assert data.num_col() == kCols
@@ -301,14 +301,14 @@ class TestPandas:
@pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix]) @pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
def test_nullable_type(self, DMatrixT) -> None: def test_nullable_type(self, DMatrixT) -> None:
from pandas.api.types import is_categorical_dtype from xgboost.data import is_pd_cat_dtype
for orig, df in pd_dtypes(): for orig, df in pd_dtypes():
if hasattr(df.dtypes, "__iter__"): if hasattr(df.dtypes, "__iter__"):
enable_categorical = any(is_categorical_dtype for dtype in df.dtypes) enable_categorical = any(is_pd_cat_dtype(dtype) for dtype in df.dtypes)
else: else:
# series # series
enable_categorical = is_categorical_dtype(df.dtype) enable_categorical = is_pd_cat_dtype(df.dtype)
f0_orig = orig[orig.columns[0]] if isinstance(orig, pd.DataFrame) else orig f0_orig = orig[orig.columns[0]] if isinstance(orig, pd.DataFrame) else orig
f0 = df[df.columns[0]] if isinstance(df, pd.DataFrame) else df f0 = df[df.columns[0]] if isinstance(df, pd.DataFrame) else df

View File

@@ -12,7 +12,7 @@ from sklearn.utils.estimator_checks import parametrize_with_checks
import xgboost as xgb import xgboost as xgb
from xgboost import testing as tm from xgboost import testing as tm
from xgboost.testing.ranking import run_ranking_qid_df from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df
from xgboost.testing.shared import get_feature_weights, validate_data_initialization from xgboost.testing.shared import get_feature_weights, validate_data_initialization
from xgboost.testing.updater import get_basescore from xgboost.testing.updater import get_basescore
@@ -173,6 +173,11 @@ def test_ranking():
np.testing.assert_almost_equal(pred, pred_orig) np.testing.assert_almost_equal(pred, pred_orig)
@pytest.mark.skipif(**tm.no_pandas())
def test_ranking_categorical() -> None:
    """Run the shared categorical ranking test on the ``cpu`` device."""
    run_ranking_categorical("cpu")
def test_ranking_metric() -> None: def test_ranking_metric() -> None:
from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_auc_score
@@ -935,6 +940,7 @@ def save_load_model(model_path):
predt_0 = clf.predict(X) predt_0 = clf.predict(X)
clf.save_model(model_path) clf.save_model(model_path)
clf.load_model(model_path) clf.load_model(model_path)
assert clf.booster == "gblinear"
predt_1 = clf.predict(X) predt_1 = clf.predict(X)
np.testing.assert_allclose(predt_0, predt_1) np.testing.assert_allclose(predt_0, predt_1)
assert clf.best_iteration == best_iteration assert clf.best_iteration == best_iteration
@@ -950,25 +956,26 @@ def save_load_model(model_path):
def test_save_load_model(): def test_save_load_model():
with tempfile.TemporaryDirectory() as tempdir: with tempfile.TemporaryDirectory() as tempdir:
model_path = os.path.join(tempdir, 'digits.model') model_path = os.path.join(tempdir, "digits.model")
save_load_model(model_path) save_load_model(model_path)
with tempfile.TemporaryDirectory() as tempdir: with tempfile.TemporaryDirectory() as tempdir:
model_path = os.path.join(tempdir, 'digits.model.json') model_path = os.path.join(tempdir, "digits.model.json")
save_load_model(model_path) save_load_model(model_path)
from sklearn.datasets import load_digits from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
with tempfile.TemporaryDirectory() as tempdir: with tempfile.TemporaryDirectory() as tempdir:
model_path = os.path.join(tempdir, 'digits.model.ubj') model_path = os.path.join(tempdir, "digits.model.ubj")
digits = load_digits(n_class=2) digits = load_digits(n_class=2)
y = digits['target'] y = digits["target"]
X = digits['data'] X = digits["data"]
booster = xgb.train({'tree_method': 'hist', booster = xgb.train(
'objective': 'binary:logistic'}, {"tree_method": "hist", "objective": "binary:logistic"},
dtrain=xgb.DMatrix(X, y), dtrain=xgb.DMatrix(X, y),
num_boost_round=4) num_boost_round=4,
)
predt_0 = booster.predict(xgb.DMatrix(X)) predt_0 = booster.predict(xgb.DMatrix(X))
booster.save_model(model_path) booster.save_model(model_path)
cls = xgb.XGBClassifier() cls = xgb.XGBClassifier()
@@ -1002,6 +1009,8 @@ def test_save_load_model():
clf = xgb.XGBClassifier() clf = xgb.XGBClassifier()
clf.load_model(model_path) clf.load_model(model_path)
assert clf.classes_.size == 10 assert clf.classes_.size == 10
assert clf.objective == "multi:softprob"
np.testing.assert_equal(clf.classes_, np.arange(10)) np.testing.assert_equal(clf.classes_, np.arange(10))
assert clf.n_classes_ == 10 assert clf.n_classes_ == 10

View File

@@ -1932,6 +1932,7 @@ class TestWithDask:
cls.client = client cls.client = client
cls.fit(X, y) cls.fit(X, y)
predt_0 = cls.predict(X) predt_0 = cls.predict(X)
proba_0 = cls.predict_proba(X)
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, "model.pkl") path = os.path.join(tmpdir, "model.pkl")
@@ -1941,7 +1942,9 @@ class TestWithDask:
with open(path, "rb") as fd: with open(path, "rb") as fd:
cls = pickle.load(fd) cls = pickle.load(fd)
predt_1 = cls.predict(X) predt_1 = cls.predict(X)
proba_1 = cls.predict_proba(X)
np.testing.assert_allclose(predt_0.compute(), predt_1.compute()) np.testing.assert_allclose(predt_0.compute(), predt_1.compute())
np.testing.assert_allclose(proba_0.compute(), proba_1.compute())
path = os.path.join(tmpdir, "cls.json") path = os.path.join(tmpdir, "cls.json")
cls.save_model(path) cls.save_model(path)
@@ -1950,16 +1953,20 @@ class TestWithDask:
cls.load_model(path) cls.load_model(path)
assert cls.n_classes_ == 10 assert cls.n_classes_ == 10
predt_2 = cls.predict(X) predt_2 = cls.predict(X)
proba_2 = cls.predict_proba(X)
np.testing.assert_allclose(predt_0.compute(), predt_2.compute()) np.testing.assert_allclose(predt_0.compute(), predt_2.compute())
np.testing.assert_allclose(proba_0.compute(), proba_2.compute())
# Use single node to load # Use single node to load
cls = xgb.XGBClassifier() cls = xgb.XGBClassifier()
cls.load_model(path) cls.load_model(path)
assert cls.n_classes_ == 10 assert cls.n_classes_ == 10
predt_3 = cls.predict(X_) predt_3 = cls.predict(X_)
proba_3 = cls.predict_proba(X_)
np.testing.assert_allclose(predt_0.compute(), predt_3) np.testing.assert_allclose(predt_0.compute(), predt_3)
np.testing.assert_allclose(proba_0.compute(), proba_3)
def test_dask_unsupported_features(client: "Client") -> None: def test_dask_unsupported_features(client: "Client") -> None: