Compare commits

..

23 Commits

Author SHA1 Message Date
Philip Hyunsu Cho
82d846bbeb Update change_scala_version.py to also change scala.version property (#9897) 2023-12-18 23:49:41 -08:00
Philip Hyunsu Cho
71d330afdc Bump version to 2.0.3 (#9895) 2023-12-14 17:54:05 -08:00
Philip Hyunsu Cho
3acbd8692b [jvm-packages] Fix POM for xgboost-jvm metapackage (#9893)
* [jvm-packages] Fix POM for xgboost-jvm metapackage

* Add script for updating the Scala version
2023-12-14 16:50:34 -08:00
Philip Hyunsu Cho
ad524f76ab [backport] [CI] Upload libxgboost4j.dylib (M1) to S3 bucket (#9887)
* [CI] Set up CI for Mac M1 (#9699)

* [CI] Improve CI for Mac M1 (#9748)

* [CI] Build libxgboost4j.dylib with CMAKE_OSX_DEPLOYMENT_TARGET (#9749)

* [CI] Upload libxgboost4j.dylib (M1) to S3 bucket (#9886)
2023-12-13 16:05:40 -08:00
Jiaming Yuan
d2d1751c03 [backport][py] Use the first found native library. (#9860) (#9879) 2023-12-13 14:20:30 +08:00
Jiaming Yuan
e4ee4e79dc [backport][sklearn] Fix loading model attributes. (#9808) (#9880) 2023-12-13 14:20:04 +08:00
Philip Hyunsu Cho
41ce8f28b2 [jvm-packages] Add Scala version suffix to xgboost-jvm package (#9776)
* Update JVM script (#9714)

* Bump version to 2.0.2; revamp pom.xml

* Update instructions in prepare_jvm_release.py

* Fix formatting
2023-11-08 10:17:26 -08:00
Jiaming Yuan
0ffc52e05c [backport] Fix using categorical data with the ranker. (#9753) (#9778) 2023-11-09 01:20:52 +08:00
Philip Hyunsu Cho
a408254c2f Use sys.base_prefix instead of sys.prefix (#9711)
* Use sys.base_prefix instead of sys.prefix

* Update libpath.py too
2023-10-23 23:31:40 -07:00
Philip Hyunsu Cho
22e891dafa [jvm-packages] Remove hard dependency on libjvm (#9698) (#9705) 2023-10-23 21:21:14 -07:00
Philip Hyunsu Cho
89530c80a7 [CI] Build libxgboost4j.dylib for Intel Mac (#9704) 2023-10-23 20:45:01 -07:00
Philip Hyunsu Cho
946ab53b57 Fix libpath logic for Windows (#9687) 2023-10-19 10:42:46 -07:00
Philip Hyunsu Cho
afd03a6934 Fix build for AppleClang 11 (#9684) 2023-10-18 09:35:59 -07:00
Jiaming Yuan
f7da938458 [backport][pyspark] Support stage-level scheduling (#9519) (#9686)
Co-authored-by: Bobby Wang <wbo4958@gmail.com>
2023-10-18 14:05:08 +08:00
Philip Hyunsu Cho
6ab6577511 Fix build for GCC 8.x (#9670) 2023-10-12 23:36:41 -07:00
Philip Hyunsu Cho
8c57558d74 [backport] [CI] Pull CentOS 7 images from NGC (#9666) (#9668) 2023-10-13 14:09:54 +08:00
Jiaming Yuan
58aa98a796 Bump version to 2.0.1. (#9660) 2023-10-13 08:47:32 +08:00
Jiaming Yuan
92273b39d8 [backport] Add support for cgroupv2. (#9651) (#9656) 2023-10-12 11:39:27 +08:00
Jiaming Yuan
e824b18bf6 [backport] Support pandas 2.1.0. (#9557) (#9655) 2023-10-12 11:29:59 +08:00
Jiaming Yuan
66ee89d8b4 [backport] Workaround Apple clang issue. (#9615) (#9636) 2023-10-08 15:42:15 +08:00
Jiaming Yuan
54d1d72d01 [backport] Use array interface for testing numpy arrays. (#9602) (#9635) 2023-10-08 11:45:49 +08:00
Jiaming Yuan
032bcc57f9 [backport][R] Fix method name. (#9577) (#9592) 2023-09-19 02:08:46 +08:00
Jiaming Yuan
ace7713201 [backport] Fix default metric configuration. (#9575) (#9590) 2023-09-18 23:40:43 +08:00
54 changed files with 998 additions and 316 deletions

View File

@@ -51,14 +51,14 @@ jobs:
id: extract_branch
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
matrix.os == 'windows-latest'
(matrix.os == 'windows-latest' || matrix.os == 'macos-11')
- name: Publish artifact xgboost4j.dll to S3
run: |
cd lib/
Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll
dir
python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read
python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
matrix.os == 'windows-latest'
@@ -66,6 +66,19 @@ jobs:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
- name: Publish artifact libxgboost4j.dylib to S3
run: |
cd lib/
mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib
ls
python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
matrix.os == 'macos-11'
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
- name: Test XGBoost4J (Core, Spark, Examples)
run: |

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(xgboost LANGUAGES CXX C VERSION 2.0.0)
project(xgboost LANGUAGES CXX C VERSION 2.0.3)
include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW)
@@ -233,6 +233,11 @@ endif (RABIT_BUILD_MPI)
add_subdirectory(${xgboost_SOURCE_DIR}/src)
target_link_libraries(objxgboost PUBLIC dmlc)
# Link -lstdc++fs for GCC 8.x
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.0")
target_link_libraries(objxgboost PUBLIC stdc++fs)
endif()
# Exports some R specific definitions and objects
if (R_LIB)
add_subdirectory(${xgboost_SOURCE_DIR}/R-package)

View File

@@ -1,8 +1,8 @@
Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
Version: 2.0.0.1
Date: 2023-09-11
Version: 2.0.3.1
Date: 2023-12-14
Authors@R: c(
person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"),

View File

@@ -70,7 +70,7 @@ cb.print.evaluation <- function(period = 1, showsd = TRUE) {
i == env$begin_iteration ||
i == env$end_iteration) {
stdev <- if (showsd) env$bst_evaluation_err else NULL
msg <- format.eval.string(i, env$bst_evaluation, stdev)
msg <- .format_eval_string(i, env$bst_evaluation, stdev)
cat(msg, '\n')
}
}
@@ -380,7 +380,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
if ((maximize && score > best_score) ||
(!maximize && score < best_score)) {
best_msg <<- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err)
best_msg <<- .format_eval_string(
i, env$bst_evaluation, env$bst_evaluation_err
)
best_score <<- score
best_iteration <<- i
best_ntreelimit <<- best_iteration * env$num_parallel_tree
@@ -754,7 +756,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
#
# Format the evaluation metric string
format.eval.string <- function(iter, eval_res, eval_err = NULL) {
.format_eval_string <- function(iter, eval_res, eval_err = NULL) {
if (length(eval_res) == 0)
stop('no evaluation results')
enames <- names(eval_res)

18
R-package/configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for xgboost 2.0.0.
# Generated by GNU Autoconf 2.71 for xgboost 2.0.3.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -607,8 +607,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='xgboost'
PACKAGE_TARNAME='xgboost'
PACKAGE_VERSION='2.0.0'
PACKAGE_STRING='xgboost 2.0.0'
PACKAGE_VERSION='2.0.3'
PACKAGE_STRING='xgboost 2.0.3'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1225,7 +1225,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures xgboost 2.0.0 to adapt to many kinds of systems.
\`configure' configures xgboost 2.0.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1287,7 +1287,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of xgboost 2.0.0:";;
short | recursive ) echo "Configuration of xgboost 2.0.3:";;
esac
cat <<\_ACEOF
@@ -1367,7 +1367,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
xgboost configure 2.0.0
xgboost configure 2.0.3
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1533,7 +1533,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by xgboost $as_me 2.0.0, which was
It was created by xgboost $as_me 2.0.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3412,7 +3412,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by xgboost $as_me 2.0.0, which was
This file was extended by xgboost $as_me 2.0.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -3467,7 +3467,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
xgboost config.status 2.0.0
xgboost config.status 2.0.3
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -2,7 +2,7 @@
AC_PREREQ(2.69)
AC_INIT([xgboost],[2.0.0],[],[xgboost],[])
AC_INIT([xgboost],[2.0.3],[],[xgboost],[])
: ${R_HOME=`R RHOME`}
if test -z "${R_HOME}"; then

View File

@@ -0,0 +1,79 @@
import argparse
import pathlib
import re
import shutil
def main(args):
if args.scala_version == "2.12":
scala_ver = "2.12"
scala_patchver = "2.12.18"
elif args.scala_version == "2.13":
scala_ver = "2.13"
scala_patchver = "2.13.11"
else:
raise ValueError(f"Unsupported Scala version: {args.scala_version}")
# Clean artifacts
if args.purge_artifacts:
for target in pathlib.Path("jvm-packages/").glob("**/target"):
if target.is_dir():
print(f"Removing {target}...")
shutil.rmtree(target)
# Update pom.xml
for pom in pathlib.Path("jvm-packages/").glob("**/pom.xml"):
print(f"Updating {pom}...")
with open(pom, "r", encoding="utf-8") as f:
lines = f.readlines()
with open(pom, "w", encoding="utf-8") as f:
replaced_scalaver = False
replaced_scala_binver = False
for line in lines:
for artifact in [
"xgboost-jvm",
"xgboost4j",
"xgboost4j-gpu",
"xgboost4j-spark",
"xgboost4j-spark-gpu",
"xgboost4j-flink",
"xgboost4j-example",
]:
line = re.sub(
f"<artifactId>{artifact}_[0-9\\.]*",
f"<artifactId>{artifact}_{scala_ver}",
line,
)
# Only replace the first occurrence of scala.version
if not replaced_scalaver:
line, nsubs = re.subn(
r"<scala.version>[0-9\.]*",
f"<scala.version>{scala_patchver}",
line,
)
if nsubs > 0:
replaced_scalaver = True
# Only replace the first occurrence of scala.binary.version
if not replaced_scala_binver:
line, nsubs = re.subn(
r"<scala.binary.version>[0-9\.]*",
f"<scala.binary.version>{scala_ver}",
line,
)
if nsubs > 0:
replaced_scala_binver = True
f.write(line)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--purge-artifacts", action="store_true")
parser.add_argument(
"--scala-version",
type=str,
required=True,
help="Version of Scala to use in the JVM packages",
choices=["2.12", "2.13"],
)
parsed_args = parser.parse_args()
main(parsed_args)

View File

@@ -2,7 +2,6 @@ import argparse
import errno
import glob
import os
import platform
import re
import shutil
import subprocess
@@ -21,12 +20,14 @@ def normpath(path):
else:
return normalized
def cp(source, target):
source = normpath(source)
target = normpath(target)
print("cp {0} {1}".format(source, target))
shutil.copy(source, target)
def maybe_makedirs(path):
path = normpath(path)
print("mkdir -p " + path)
@@ -36,6 +37,7 @@ def maybe_makedirs(path):
if e.errno != errno.EEXIST:
raise
@contextmanager
def cd(path):
path = normpath(path)
@@ -47,18 +49,22 @@ def cd(path):
finally:
os.chdir(cwd)
def run(command, **kwargs):
print(command)
subprocess.check_call(command, shell=True, **kwargs)
def get_current_git_tag():
out = subprocess.check_output(["git", "tag", "--points-at", "HEAD"])
return out.decode().split("\n")[0]
def get_current_commit_hash():
out = subprocess.check_output(["git", "rev-parse", "HEAD"])
return out.decode().split("\n")[0]
def get_current_git_branch():
out = subprocess.check_output(["git", "log", "-n", "1", "--pretty=%d", "HEAD"])
m = re.search(r"release_[0-9\.]+", out.decode())
@@ -66,38 +72,49 @@ def get_current_git_branch():
raise ValueError("Expected branch name of form release_xxx")
return m.group(0)
def retrieve(url, filename=None):
print(f"{url} -> {filename}")
return urlretrieve(url, filename)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--release-version", type=str, required=True,
help="Version of the release being prepared")
parser.add_argument(
"--release-version",
type=str,
required=True,
help="Version of the release being prepared",
)
args = parser.parse_args()
if sys.platform != "darwin" or platform.machine() != "x86_64":
raise NotImplementedError("Please run this script using an Intel Mac")
version = args.release_version
expected_git_tag = "v" + version
current_git_tag = get_current_git_tag()
if current_git_tag != expected_git_tag:
if not current_git_tag:
raise ValueError(f"Expected git tag {expected_git_tag} but current HEAD has no tag. "
f"Run: git checkout {expected_git_tag}")
raise ValueError(f"Expected git tag {expected_git_tag} but current HEAD is at tag "
f"{current_git_tag}. Run: git checkout {expected_git_tag}")
raise ValueError(
f"Expected git tag {expected_git_tag} but current HEAD has no tag. "
f"Run: git checkout {expected_git_tag}"
)
raise ValueError(
f"Expected git tag {expected_git_tag} but current HEAD is at tag "
f"{current_git_tag}. Run: git checkout {expected_git_tag}"
)
commit_hash = get_current_commit_hash()
git_branch = get_current_git_branch()
print(f"Using commit {commit_hash} of branch {git_branch}, git tag {current_git_tag}")
print(
f"Using commit {commit_hash} of branch {git_branch}, git tag {current_git_tag}"
)
with cd("jvm-packages/"):
print("====copying pure-Python tracker====")
for use_cuda in [True, False]:
xgboost4j = "xgboost4j-gpu" if use_cuda else "xgboost4j"
cp("../python-package/xgboost/tracker.py", f"{xgboost4j}/src/main/resources")
cp(
"../python-package/xgboost/tracker.py",
f"{xgboost4j}/src/main/resources",
)
print("====copying resources for testing====")
with cd("../demo/CLI/regression"):
@@ -115,7 +132,12 @@ def main():
cp(file, f"{xgboost4j_spark}/src/test/resources")
print("====Creating directories to hold native binaries====")
for os_ident, arch in [("linux", "x86_64"), ("windows", "x86_64"), ("macos", "x86_64")]:
for os_ident, arch in [
("linux", "x86_64"),
("windows", "x86_64"),
("macos", "x86_64"),
("macos", "aarch64"),
]:
output_dir = f"xgboost4j/src/main/resources/lib/{os_ident}/{arch}"
maybe_makedirs(output_dir)
for os_ident, arch in [("linux", "x86_64")]:
@@ -123,52 +145,98 @@ def main():
maybe_makedirs(output_dir)
print("====Downloading native binaries from CI====")
nightly_bucket_prefix = "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds"
maven_repo_prefix = "https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release/ml/dmlc"
nightly_bucket_prefix = (
"https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds"
)
maven_repo_prefix = (
"https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release/ml/dmlc"
)
retrieve(url=f"{nightly_bucket_prefix}/{git_branch}/xgboost4j_{commit_hash}.dll",
filename="xgboost4j/src/main/resources/lib/windows/x86_64/xgboost4j.dll")
retrieve(
url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/xgboost4j_{commit_hash}.dll",
filename="xgboost4j/src/main/resources/lib/windows/x86_64/xgboost4j.dll",
)
retrieve(
url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/libxgboost4j_{commit_hash}.dylib",
filename="xgboost4j/src/main/resources/lib/macos/x86_64/libxgboost4j.dylib",
)
retrieve(
url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/libxgboost4j_m1_{commit_hash}.dylib",
filename="xgboost4j/src/main/resources/lib/macos/aarch64/libxgboost4j.dylib",
)
with tempfile.TemporaryDirectory() as tempdir:
# libxgboost4j.so for Linux x86_64, CPU only
zip_path = os.path.join(tempdir, "xgboost4j_2.12.jar")
extract_dir = os.path.join(tempdir, "xgboost4j")
retrieve(url=f"{maven_repo_prefix}/xgboost4j_2.12/{version}/"
f"xgboost4j_2.12-{version}.jar",
filename=zip_path)
retrieve(
url=f"{maven_repo_prefix}/xgboost4j_2.12/{version}/"
f"xgboost4j_2.12-{version}.jar",
filename=zip_path,
)
os.mkdir(extract_dir)
with zipfile.ZipFile(zip_path, "r") as t:
t.extractall(extract_dir)
cp(os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
"xgboost4j/src/main/resources/lib/linux/x86_64/libxgboost4j.so")
cp(
os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
"xgboost4j/src/main/resources/lib/linux/x86_64/libxgboost4j.so",
)
# libxgboost4j.so for Linux x86_64, GPU support
zip_path = os.path.join(tempdir, "xgboost4j-gpu_2.12.jar")
extract_dir = os.path.join(tempdir, "xgboost4j-gpu")
retrieve(url=f"{maven_repo_prefix}/xgboost4j-gpu_2.12/{version}/"
f"xgboost4j-gpu_2.12-{version}.jar",
filename=zip_path)
retrieve(
url=f"{maven_repo_prefix}/xgboost4j-gpu_2.12/{version}/"
f"xgboost4j-gpu_2.12-{version}.jar",
filename=zip_path,
)
os.mkdir(extract_dir)
with zipfile.ZipFile(zip_path, "r") as t:
t.extractall(extract_dir)
cp(os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
"xgboost4j-gpu/src/main/resources/lib/linux/x86_64/libxgboost4j.so")
cp(
os.path.join(extract_dir, "lib", "linux", "x86_64", "libxgboost4j.so"),
"xgboost4j-gpu/src/main/resources/lib/linux/x86_64/libxgboost4j.so",
)
print("====Next Steps====")
print("1. Gain upload right to Maven Central repo.")
print("1-1. Sign up for a JIRA account at Sonatype: ")
print("1-2. File a JIRA ticket: "
"https://issues.sonatype.org/secure/CreateIssue.jspa?issuetype=21&pid=10134. Example: "
"https://issues.sonatype.org/browse/OSSRH-67724")
print("2. Store the Sonatype credentials in .m2/settings.xml. See insturctions in "
"https://central.sonatype.org/publish/publish-maven/")
print("3. Now on a Mac machine, run:")
print(" GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests")
print("4. Log into https://oss.sonatype.org/. On the left menu panel, click Staging "
"Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-1085 "
"to inspect the staged JAR files. Finally, press Release button to publish the "
"artifacts to the Maven Central repository.")
print(
"1-2. File a JIRA ticket: "
"https://issues.sonatype.org/secure/CreateIssue.jspa?issuetype=21&pid=10134. Example: "
"https://issues.sonatype.org/browse/OSSRH-67724"
)
print(
"2. Store the Sonatype credentials in .m2/settings.xml. See insturctions in "
"https://central.sonatype.org/publish/publish-maven/"
)
print(
"3. Now on a Linux machine, run the following to build Scala 2.12 artifacts. "
"Make sure to use an Internet connection with fast upload speed:"
)
print(
" # Skip native build, since we have all needed native binaries from CI\n"
" export MAVEN_SKIP_NATIVE_BUILD=1\n"
" GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests"
)
print(
"4. Log into https://oss.sonatype.org/. On the left menu panel, click Staging "
"Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-xxxx "
"to inspect the staged JAR files. Finally, press Release button to publish the "
"artifacts to the Maven Central repository. The top-level metapackage should be "
"named xgboost-jvm_2.12."
)
print(
"5. Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:\n"
" export MAVEN_SKIP_NATIVE_BUILD=1\n"
" python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts\n"
" GPG_TTY=$(tty) mvn deploy -Prelease-cpu-only,scala-2.13 -DskipTests"
)
print(
"6. Go to https://oss.sonatype.org/ to release the Scala 2.13 artifacts. "
"The top-level metapackage should be named xgboost-jvm_2.13."
)
if __name__ == "__main__":
main()

View File

@@ -6,6 +6,6 @@
#define XGBOOST_VER_MAJOR 2 /* NOLINT */
#define XGBOOST_VER_MINOR 0 /* NOLINT */
#define XGBOOST_VER_PATCH 0 /* NOLINT */
#define XGBOOST_VER_PATCH 3 /* NOLINT */
#endif // XGBOOST_VERSION_CONFIG_H_

View File

@@ -25,4 +25,3 @@ target_include_directories(xgboost4j
${PROJECT_SOURCE_DIR}/rabit/include)
set_output_directory(xgboost4j ${PROJECT_SOURCE_DIR}/lib)
target_link_libraries(xgboost4j PRIVATE ${JAVA_JVM_LIBRARY})

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
import errno
import argparse
import errno
import glob
import os
import platform
@@ -19,11 +19,10 @@ CONFIG = {
"USE_HDFS": "OFF",
"USE_AZURE": "OFF",
"USE_S3": "OFF",
"USE_CUDA": "OFF",
"USE_NCCL": "OFF",
"JVM_BINDINGS": "ON",
"LOG_CAPI_INVOCATION": "OFF"
"LOG_CAPI_INVOCATION": "OFF",
}
@@ -70,26 +69,22 @@ def normpath(path):
return normalized
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--log-capi-invocation', type=str, choices=['ON', 'OFF'], default='OFF')
parser.add_argument('--use-cuda', type=str, choices=['ON', 'OFF'], default='OFF')
cli_args = parser.parse_args()
def native_build(args):
if sys.platform == "darwin":
# Enable of your compiler supports OpenMP.
CONFIG["USE_OPENMP"] = "OFF"
os.environ["JAVA_HOME"] = subprocess.check_output(
"/usr/libexec/java_home").strip().decode()
os.environ["JAVA_HOME"] = (
subprocess.check_output("/usr/libexec/java_home").strip().decode()
)
print("building Java wrapper")
with cd(".."):
build_dir = 'build-gpu' if cli_args.use_cuda == 'ON' else 'build'
build_dir = "build-gpu" if cli_args.use_cuda == "ON" else "build"
maybe_makedirs(build_dir)
with cd(build_dir):
if sys.platform == "win32":
# Force x64 build on Windows.
maybe_generator = ' -A x64'
maybe_generator = " -A x64"
else:
maybe_generator = ""
if sys.platform == "linux":
@@ -97,12 +92,12 @@ if __name__ == "__main__":
else:
maybe_parallel_build = ""
if cli_args.log_capi_invocation == 'ON':
CONFIG['LOG_CAPI_INVOCATION'] = 'ON'
if cli_args.log_capi_invocation == "ON":
CONFIG["LOG_CAPI_INVOCATION"] = "ON"
if cli_args.use_cuda == 'ON':
CONFIG['USE_CUDA'] = 'ON'
CONFIG['USE_NCCL'] = 'ON'
if cli_args.use_cuda == "ON":
CONFIG["USE_CUDA"] = "ON"
CONFIG["USE_NCCL"] = "ON"
args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]
@@ -115,7 +110,7 @@ if __name__ == "__main__":
if gpu_arch_flag is not None:
args.append("%s" % gpu_arch_flag)
lib_dir = os.path.join(os.pardir, 'lib')
lib_dir = os.path.join(os.pardir, "lib")
if os.path.exists(lib_dir):
shutil.rmtree(lib_dir)
run("cmake .. " + " ".join(args) + maybe_generator)
@@ -125,8 +120,10 @@ if __name__ == "__main__":
run(f'"{sys.executable}" mapfeat.py')
run(f'"{sys.executable}" mknfold.py machine.txt 1')
xgboost4j = 'xgboost4j-gpu' if cli_args.use_cuda == 'ON' else 'xgboost4j'
xgboost4j_spark = 'xgboost4j-spark-gpu' if cli_args.use_cuda == 'ON' else 'xgboost4j-spark'
xgboost4j = "xgboost4j-gpu" if cli_args.use_cuda == "ON" else "xgboost4j"
xgboost4j_spark = (
"xgboost4j-spark-gpu" if cli_args.use_cuda == "ON" else "xgboost4j-spark"
)
print("copying native library")
library_name, os_folder = {
@@ -141,14 +138,19 @@ if __name__ == "__main__":
"i86pc": "x86_64", # on Solaris x86_64
"sun4v": "sparc", # on Solaris sparc
"arm64": "aarch64", # on macOS & Windows ARM 64-bit
"aarch64": "aarch64"
"aarch64": "aarch64",
}[platform.machine().lower()]
output_folder = "{}/src/main/resources/lib/{}/{}".format(xgboost4j, os_folder, arch_folder)
output_folder = "{}/src/main/resources/lib/{}/{}".format(
xgboost4j, os_folder, arch_folder
)
maybe_makedirs(output_folder)
cp("../lib/" + library_name, output_folder)
print("copying pure-Python tracker")
cp("../python-package/xgboost/tracker.py", "{}/src/main/resources".format(xgboost4j))
cp(
"../python-package/xgboost/tracker.py",
"{}/src/main/resources".format(xgboost4j),
)
print("copying train/test files")
maybe_makedirs("{}/src/test/resources".format(xgboost4j_spark))
@@ -164,3 +166,18 @@ if __name__ == "__main__":
maybe_makedirs("{}/src/test/resources".format(xgboost4j))
for file in glob.glob("../demo/data/agaricus.*"):
cp(file, "{}/src/test/resources".format(xgboost4j))
if __name__ == "__main__":
if "MAVEN_SKIP_NATIVE_BUILD" in os.environ:
print("MAVEN_SKIP_NATIVE_BUILD is set. Skipping native build...")
else:
parser = argparse.ArgumentParser()
parser.add_argument(
"--log-capi-invocation", type=str, choices=["ON", "OFF"], default="OFF"
)
parser.add_argument(
"--use-cuda", type=str, choices=["ON", "OFF"], default="OFF"
)
cli_args = parser.parse_args()
native_build(cli_args)

View File

@@ -5,8 +5,8 @@
<modelVersion>4.0.0</modelVersion>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId>
<version>2.0.0</version>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.3</version>
<packaging>pom</packaging>
<name>XGBoost JVM Package</name>
<description>JVM Package for XGBoost</description>
@@ -189,6 +189,93 @@
</plugins>
</build>
</profile>
<profile>
<id>release-cpu-only</id>
<modules>
<module>xgboost4j</module>
<module>xgboost4j-example</module>
<module>xgboost4j-spark</module>
<module>xgboost4j-flink</module>
</modules>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.3.0</version>
<executions>
<execution>
<id>empty-javadoc-jar</id>
<phase>package</phase>
<goals>
<goal>jar</goal>
</goals>
<configuration>
<classifier>javadoc</classifier>
<classesDirectory>${basedir}/javadoc</classesDirectory>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>3.0.1</version>
<configuration>
<autoVersionSubmodules>true</autoVersionSubmodules>
<useReleaseProfile>false</useReleaseProfile>
<releaseProfiles>release</releaseProfiles>
<goals>deploy</goals>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>3.1.0</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.3.0</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.13</version>
<extensions>true</extensions>
<configuration>
<serverId>ossrh</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>false</autoReleaseAfterClose>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<skipTests>true</skipTests>
</configuration>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>assembly</id>
<build>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId>
<version>2.0.0</version>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.3</version>
</parent>
<name>xgboost4j-example</name>
<artifactId>xgboost4j-example_${scala.binary.version}</artifactId>
<version>2.0.0</version>
<artifactId>xgboost4j-example_2.12</artifactId>
<version>2.0.3</version>
<packaging>jar</packaging>
<build>
<plugins>
@@ -26,7 +26,7 @@
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
<artifactId>xgboost4j-spark_2.12</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
@@ -37,7 +37,7 @@
</dependency>
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
<artifactId>xgboost4j-flink_2.12</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>

View File

@@ -5,13 +5,13 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId>
<version>2.0.0</version>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.3</version>
</parent>
<name>xgboost4j-flink</name>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
<version>2.0.0</version>
<artifactId>xgboost4j-flink_2.12</artifactId>
<version>2.0.3</version>
<properties>
<flink-ml.version>2.2.0</flink-ml.version>
</properties>
@@ -30,7 +30,7 @@
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
<artifactId>xgboost4j_2.12</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId>
<version>2.0.0</version>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.3</version>
</parent>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
<artifactId>xgboost4j-gpu_2.12</artifactId>
<name>xgboost4j-gpu</name>
<version>2.0.0</version>
<version>2.0.3</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -5,11 +5,11 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId>
<version>2.0.0</version>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.3</version>
</parent>
<name>xgboost4j-spark-gpu</name>
<artifactId>xgboost4j-spark-gpu_${scala.binary.version}</artifactId>
<artifactId>xgboost4j-spark-gpu_2.12</artifactId>
<build>
<plugins>
<plugin>
@@ -24,7 +24,7 @@
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
<artifactId>xgboost4j-gpu_2.12</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>

View File

@@ -5,11 +5,11 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId>
<version>2.0.0</version>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.3</version>
</parent>
<name>xgboost4j-spark</name>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
<artifactId>xgboost4j-spark_2.12</artifactId>
<build>
<plugins>
<plugin>
@@ -24,7 +24,7 @@
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
<artifactId>xgboost4j_2.12</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>

View File

@@ -5,12 +5,12 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm</artifactId>
<version>2.0.0</version>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>2.0.3</version>
</parent>
<name>xgboost4j</name>
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>2.0.0</version>
<artifactId>xgboost4j_2.12</artifactId>
<version>2.0.3</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -132,16 +132,28 @@ def locate_or_build_libxgboost(
if build_config.use_system_libxgboost:
# Find libxgboost from system prefix
sys_base_prefix = pathlib.Path(sys.base_prefix).absolute().resolve()
libxgboost_sys = sys_base_prefix / "lib" / _lib_name()
if not libxgboost_sys.exists():
raise RuntimeError(
f"use_system_libxgboost was specified but {_lib_name()} is "
f"not found in {libxgboost_sys.parent}"
)
logger.info("Using system XGBoost: %s", str(libxgboost_sys))
return libxgboost_sys
sys_prefix = pathlib.Path(sys.base_prefix)
sys_prefix_candidates = [
sys_prefix / "lib",
# Paths possibly used on Windows
sys_prefix / "bin",
sys_prefix / "Library",
sys_prefix / "Library" / "bin",
sys_prefix / "Library" / "lib",
]
sys_prefix_candidates = [
p.expanduser().resolve() for p in sys_prefix_candidates
]
for candidate_dir in sys_prefix_candidates:
libtreelite_sys = candidate_dir / _lib_name()
if libtreelite_sys.exists():
logger.info("Using system XGBoost: %s", str(libtreelite_sys))
return libtreelite_sys
raise RuntimeError(
f"use_system_libxgboost was specified but {_lib_name()} is "
f"not found. Paths searched (in order): \n"
+ "\n".join([f"* {str(p)}" for p in sys_prefix_candidates])
)
libxgboost = locate_local_libxgboost(toplevel_dir, logger=logger)
if libxgboost is not None:

View File

@@ -7,7 +7,7 @@ build-backend = "packager.pep517"
[project]
name = "xgboost"
version = "2.0.0"
version = "2.0.3"
authors = [
{ name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" },
{ name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }

View File

@@ -1 +1 @@
2.0.0
2.0.3

View File

@@ -206,6 +206,7 @@ def _load_lib() -> ctypes.CDLL:
lib = ctypes.cdll.LoadLibrary(lib_path)
setattr(lib, "path", os.path.normpath(lib_path))
lib_success = True
break
except OSError as e:
os_error_list.append(str(e))
continue
@@ -2399,6 +2400,7 @@ class Booster:
_is_cudf_df,
_is_cupy_array,
_is_list,
_is_np_array_like,
_is_pandas_df,
_is_pandas_series,
_is_tuple,
@@ -2428,7 +2430,7 @@ class Booster:
f"got {data.shape[1]}"
)
if isinstance(data, np.ndarray):
if _is_np_array_like(data):
from .data import _ensure_np_dtype
data, _ = _ensure_np_dtype(data, data.dtype)

View File

@@ -78,7 +78,6 @@ from .data import _is_cudf_ser, _is_cupy_array
from .sklearn import (
XGBClassifier,
XGBClassifierBase,
XGBClassifierMixIn,
XGBModel,
XGBRanker,
XGBRankerMixIn,
@@ -1854,7 +1853,7 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
"Implementation of the scikit-learn API for XGBoost classification.",
["estimators", "model"],
)
class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBase):
class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
# pylint: disable=missing-class-docstring
async def _fit_async(
self,
@@ -2036,10 +2035,6 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa
preds = da.map_blocks(_argmax, pred_probs, drop_axis=1)
return preds
def load_model(self, fname: ModelIn) -> None:
super().load_model(fname)
self._load_model_attributes(self.get_booster())
@xgboost_model_doc(
"""Implementation of the Scikit-Learn API for XGBoost Ranking.

View File

@@ -164,8 +164,8 @@ def _is_scipy_coo(data: DataType) -> bool:
return isinstance(data, scipy.sparse.coo_matrix)
def _is_numpy_array(data: DataType) -> bool:
return isinstance(data, (np.ndarray, np.matrix))
def _is_np_array_like(data: DataType) -> bool:
return hasattr(data, "__array_interface__")
def _ensure_np_dtype(
@@ -317,7 +317,6 @@ def pandas_feature_info(
) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
"""Handle feature info for pandas dataframe."""
import pandas as pd
from pandas.api.types import is_categorical_dtype, is_sparse
# handle feature names
if feature_names is None and meta is None:
@@ -332,10 +331,10 @@ def pandas_feature_info(
if feature_types is None and meta is None:
feature_types = []
for dtype in data.dtypes:
if is_sparse(dtype):
if is_pd_sparse_dtype(dtype):
feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
elif (
is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
) and enable_categorical:
feature_types.append(CAT_T)
else:
@@ -345,18 +344,13 @@ def pandas_feature_info(
def is_nullable_dtype(dtype: PandasDType) -> bool:
"""Whether dtype is a pandas nullable type."""
from pandas.api.types import (
is_bool_dtype,
is_categorical_dtype,
is_float_dtype,
is_integer_dtype,
)
from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype
is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper
# np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
is_float = is_float_dtype(dtype) and dtype.name in pandas_nullable_mapper
return is_int or is_bool or is_float or is_categorical_dtype(dtype)
return is_int or is_bool or is_float or is_pd_cat_dtype(dtype)
def is_pa_ext_dtype(dtype: Any) -> bool:
@@ -371,17 +365,48 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool:
)
def is_pd_cat_dtype(dtype: PandasDType) -> bool:
"""Wrapper for testing pandas category type."""
import pandas as pd
if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
Version = pd.util.version.Version
if Version(pd.__version__) >= Version("2.1.0"):
from pandas import CategoricalDtype
return isinstance(dtype, CategoricalDtype)
from pandas.api.types import is_categorical_dtype
return is_categorical_dtype(dtype)
def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
"""Wrapper for testing pandas sparse type."""
import pandas as pd
if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
Version = pd.util.version.Version
if Version(pd.__version__) >= Version("2.1.0"):
from pandas import SparseDtype
return isinstance(dtype, SparseDtype)
from pandas.api.types import is_sparse
return is_sparse(dtype)
def pandas_cat_null(data: DataFrame) -> DataFrame:
"""Handle categorical dtype and nullable extension types from pandas."""
import pandas as pd
from pandas.api.types import is_categorical_dtype
# handle category codes and nullable.
cat_columns = []
nul_columns = []
# avoid an unnecessary conversion if possible
for col, dtype in zip(data.columns, data.dtypes):
if is_categorical_dtype(dtype):
if is_pd_cat_dtype(dtype):
cat_columns.append(col)
elif is_pa_ext_categorical_dtype(dtype):
raise ValueError(
@@ -398,7 +423,7 @@ def pandas_cat_null(data: DataFrame) -> DataFrame:
transformed = data
def cat_codes(ser: pd.Series) -> pd.Series:
if is_categorical_dtype(ser.dtype):
if is_pd_cat_dtype(ser.dtype):
return ser.cat.codes
assert is_pa_ext_categorical_dtype(ser.dtype)
# Not yet supported, the index is not ordered for some reason. Alternately:
@@ -454,14 +479,12 @@ def _transform_pandas_df(
meta: Optional[str] = None,
meta_type: Optional[NumpyDType] = None,
) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
from pandas.api.types import is_categorical_dtype, is_sparse
pyarrow_extension = False
for dtype in data.dtypes:
if not (
(dtype.name in _pandas_dtype_mapper)
or is_sparse(dtype)
or (is_categorical_dtype(dtype) and enable_categorical)
or is_pd_sparse_dtype(dtype)
or (is_pd_cat_dtype(dtype) and enable_categorical)
or is_pa_ext_dtype(dtype)
):
_invalid_dataframe_dtype(data)
@@ -515,9 +538,8 @@ def _meta_from_pandas_series(
) -> None:
"""Help transform pandas series for meta data like labels"""
data = data.values.astype("float")
from pandas.api.types import is_sparse
if is_sparse(data):
if is_pd_sparse_dtype(getattr(data, "dtype", data)):
data = data.to_dense() # type: ignore
assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
_meta_from_numpy(data, name, dtype, handle)
@@ -539,13 +561,11 @@ def _from_pandas_series(
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
from pandas.api.types import is_categorical_dtype
if (data.dtype.name not in _pandas_dtype_mapper) and not (
is_categorical_dtype(data.dtype) and enable_categorical
is_pd_cat_dtype(data.dtype) and enable_categorical
):
_invalid_dataframe_dtype(data)
if enable_categorical and is_categorical_dtype(data.dtype):
if enable_categorical and is_pd_cat_dtype(data.dtype):
data = data.cat.codes
return _from_numpy_array(
data.values.reshape(data.shape[0], 1).astype("float"),
@@ -1051,7 +1071,7 @@ def dispatch_data_backend(
return _from_scipy_csr(
data.tocsr(), missing, threads, feature_names, feature_types
)
if _is_numpy_array(data):
if _is_np_array_like(data):
return _from_numpy_array(
data, missing, threads, feature_names, feature_types, data_split_mode
)
@@ -1194,7 +1214,7 @@ def dispatch_meta_backend(
if _is_tuple(data):
_meta_from_tuple(data, name, dtype, handle)
return
if _is_numpy_array(data):
if _is_np_array_like(data):
_meta_from_numpy(data, name, dtype, handle)
return
if _is_pandas_df(data):
@@ -1281,7 +1301,7 @@ def _proxy_transform(
return _transform_dlpack(data), None, feature_names, feature_types
if _is_list(data) or _is_tuple(data):
data = np.array(data)
if _is_numpy_array(data):
if _is_np_array_like(data):
data, _ = _ensure_np_dtype(data, data.dtype)
return data, None, feature_names, feature_types
if _is_scipy_csr(data):
@@ -1331,7 +1351,7 @@ def dispatch_proxy_set_data(
if not allow_host:
raise err
if _is_numpy_array(data):
if _is_np_array_like(data):
_check_data_shape(data)
proxy._set_data_from_array(data) # pylint: disable=W0212
return

View File

@@ -31,16 +31,15 @@ def find_lib_path() -> List[str]:
]
if sys.platform == "win32":
if platform.architecture()[0] == "64bit":
dll_path.append(os.path.join(curr_path, "../../windows/x64/Release/"))
# hack for pip installation when copy all parent source
# directory here
dll_path.append(os.path.join(curr_path, "./windows/x64/Release/"))
else:
dll_path.append(os.path.join(curr_path, "../../windows/Release/"))
# hack for pip installation when copy all parent source
# directory here
dll_path.append(os.path.join(curr_path, "./windows/Release/"))
# On Windows, Conda may install libs in different paths
dll_path.extend(
[
os.path.join(sys.base_prefix, "bin"),
os.path.join(sys.base_prefix, "Library"),
os.path.join(sys.base_prefix, "Library", "bin"),
os.path.join(sys.base_prefix, "Library", "lib"),
]
)
dll_path = [os.path.join(p, "xgboost.dll") for p in dll_path]
elif sys.platform.startswith(("linux", "freebsd", "emscripten")):
dll_path = [os.path.join(p, "libxgboost.so") for p in dll_path]

View File

@@ -43,19 +43,6 @@ from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array, _is_pandas_df
from .training import train
class XGBClassifierMixIn: # pylint: disable=too-few-public-methods
"""MixIn for classification."""
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
def _load_model_attributes(self, booster: Booster) -> None:
config = json.loads(booster.save_config())
self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"])
# binary classification is treated as regression in XGBoost.
self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_
class XGBRankerMixIn: # pylint: disable=too-few-public-methods
"""MixIn for ranking, defines the _estimator_type usually defined in scikit-learn
base classes.
@@ -845,21 +832,38 @@ class XGBModel(XGBModelBase):
self.get_booster().load_model(fname)
meta_str = self.get_booster().attr("scikit_learn")
if meta_str is None:
return
if meta_str is not None:
meta = json.loads(meta_str)
t = meta.get("_estimator_type", None)
if t is not None and t != self._get_type():
raise TypeError(
"Loading an estimator with different type. Expecting: "
f"{self._get_type()}, got: {t}"
)
meta = json.loads(meta_str)
t = meta.get("_estimator_type", None)
if t is not None and t != self._get_type():
raise TypeError(
"Loading an estimator with different type. Expecting: "
f"{self._get_type()}, got: {t}"
)
self.feature_types = self.get_booster().feature_types
self.get_booster().set_attr(scikit_learn=None)
config = json.loads(self.get_booster().save_config())
self._load_model_attributes(config)
load_model.__doc__ = f"""{Booster.load_model.__doc__}"""
def _load_model_attributes(self, config: dict) -> None:
"""Load model attributes without hyper-parameters."""
from sklearn.base import is_classifier
booster = self.get_booster()
self.objective = config["learner"]["objective"]["name"]
self.booster = config["learner"]["gradient_booster"]["name"]
self.base_score = config["learner"]["learner_model_param"]["base_score"]
self.feature_types = booster.feature_types
if is_classifier(self):
self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"])
# binary classification is treated as regression in XGBoost.
self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_
# pylint: disable=too-many-branches
def _configure_fit(
self,
@@ -1409,7 +1413,7 @@ def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) ->
Number of boosting rounds.
""",
)
class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
class XGBClassifier(XGBModel, XGBClassifierBase):
# pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes
@_deprecate_positional_args
def __init__(
@@ -1637,10 +1641,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
def classes_(self) -> np.ndarray:
return np.arange(self.n_classes_)
def load_model(self, fname: ModelIn) -> None:
super().load_model(fname)
self._load_model_attributes(self.get_booster())
@xgboost_model_doc(
"scikit-learn API for XGBoost random forest classification.",
@@ -2093,7 +2093,17 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
"""
X, qid = _get_qid(X, None)
Xyq = DMatrix(X, y, qid=qid)
# fixme(jiamingy): base margin and group weight is not yet supported. We might
# need to make extra special fields in the dataframe.
Xyq = DMatrix(
X,
y,
qid=qid,
missing=self.missing,
enable_categorical=self.enable_categorical,
nthread=self.n_jobs,
feature_types=self.feature_types,
)
if callable(self.eval_metric):
metric = ltr_metric_decorator(self.eval_metric, self.n_jobs)
result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric)

View File

@@ -22,7 +22,7 @@ from typing import (
import numpy as np
import pandas as pd
from pyspark import SparkContext, cloudpickle
from pyspark import RDD, SparkContext, cloudpickle
from pyspark.ml import Estimator, Model
from pyspark.ml.functions import array_to_vector, vector_to_array
from pyspark.ml.linalg import VectorUDT
@@ -44,6 +44,7 @@ from pyspark.ml.util import (
MLWritable,
MLWriter,
)
from pyspark.resource import ResourceProfileBuilder, TaskResourceRequests
from pyspark.sql import Column, DataFrame
from pyspark.sql.functions import col, countDistinct, pandas_udf, rand, struct
from pyspark.sql.types import (
@@ -88,6 +89,7 @@ from .utils import (
_get_rabit_args,
_get_spark_session,
_is_local,
_is_standalone_or_localcluster,
deserialize_booster,
deserialize_xgb_model,
get_class_name,
@@ -342,6 +344,54 @@ class _SparkXGBParams(
predict_params[param.name] = self.getOrDefault(param)
return predict_params
def _validate_gpu_params(self) -> None:
"""Validate the gpu parameters and gpu configurations"""
if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
ss = _get_spark_session()
sc = ss.sparkContext
if _is_local(sc):
# Support GPU training in Spark local mode is just for debugging
# purposes, so it's okay for printing the below warning instead of
# checking the real gpu numbers and raising the exception.
get_logger(self.__class__.__name__).warning(
"You have enabled GPU in spark local mode. Please make sure your"
" local node has at least %d GPUs",
self.getOrDefault(self.num_workers),
)
else:
executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
if executor_gpus is None:
raise ValueError(
"The `spark.executor.resource.gpu.amount` is required for training"
" on GPU."
)
if not (ss.version >= "3.4.0" and _is_standalone_or_localcluster(sc)):
# We will enable stage-level scheduling in spark 3.4.0+ which doesn't
# require spark.task.resource.gpu.amount to be set explicitly
gpu_per_task = sc.getConf().get("spark.task.resource.gpu.amount")
if gpu_per_task is not None:
if float(gpu_per_task) < 1.0:
raise ValueError(
"XGBoost doesn't support GPU fractional configurations. "
"Please set `spark.task.resource.gpu.amount=spark.executor"
".resource.gpu.amount`"
)
if float(gpu_per_task) > 1.0:
get_logger(self.__class__.__name__).warning(
"%s GPUs for each Spark task is configured, but each "
"XGBoost training task uses only 1 GPU.",
gpu_per_task,
)
else:
raise ValueError(
"The `spark.task.resource.gpu.amount` is required for training"
" on GPU."
)
def _validate_params(self) -> None:
# pylint: disable=too-many-branches
init_model = self.getOrDefault("xgb_model")
@@ -421,53 +471,7 @@ class _SparkXGBParams(
"`pyspark.ml.linalg.Vector` type."
)
if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
gpu_per_task = (
_get_spark_session()
.sparkContext.getConf()
.get("spark.task.resource.gpu.amount")
)
is_local = _is_local(_get_spark_session().sparkContext)
if is_local:
# checking spark local mode.
if gpu_per_task is not None:
raise RuntimeError(
"The spark local mode does not support gpu configuration."
"Please remove spark.executor.resource.gpu.amount and "
"spark.task.resource.gpu.amount"
)
# Support GPU training in Spark local mode is just for debugging
# purposes, so it's okay for printing the below warning instead of
# checking the real gpu numbers and raising the exception.
get_logger(self.__class__.__name__).warning(
"You have enabled GPU in spark local mode. Please make sure your"
" local node has at least %d GPUs",
self.getOrDefault(self.num_workers),
)
else:
# checking spark non-local mode.
if gpu_per_task is not None:
if float(gpu_per_task) < 1.0:
raise ValueError(
"XGBoost doesn't support GPU fractional configurations. "
"Please set `spark.task.resource.gpu.amount=spark.executor"
".resource.gpu.amount`"
)
if float(gpu_per_task) > 1.0:
get_logger(self.__class__.__name__).warning(
"%s GPUs for each Spark task is configured, but each "
"XGBoost training task uses only 1 GPU.",
gpu_per_task,
)
else:
raise ValueError(
"The `spark.task.resource.gpu.amount` is required for training"
" on GPU."
)
self._validate_gpu_params()
def _validate_and_convert_feature_col_as_float_col_list(
@@ -592,6 +596,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
arbitrary_params_dict={},
)
self.logger = get_logger(self.__class__.__name__)
def setParams(self, **kwargs: Any) -> None: # pylint: disable=invalid-name
"""
Set params for the estimator.
@@ -894,6 +900,116 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
return booster_params, train_call_kwargs_params, dmatrix_kwargs
def _skip_stage_level_scheduling(self) -> bool:
# pylint: disable=too-many-return-statements
"""Check if stage-level scheduling is not needed,
return true to skip stage-level scheduling"""
if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
ss = _get_spark_session()
sc = ss.sparkContext
if ss.version < "3.4.0":
self.logger.info(
"Stage-level scheduling in xgboost requires spark version 3.4.0+"
)
return True
if not _is_standalone_or_localcluster(sc):
self.logger.info(
"Stage-level scheduling in xgboost requires spark standalone or "
"local-cluster mode"
)
return True
executor_cores = sc.getConf().get("spark.executor.cores")
executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
if executor_cores is None or executor_gpus is None:
self.logger.info(
"Stage-level scheduling in xgboost requires spark.executor.cores, "
"spark.executor.resource.gpu.amount to be set."
)
return True
if int(executor_cores) == 1:
# there will be only 1 task running at any time.
self.logger.info(
"Stage-level scheduling in xgboost requires spark.executor.cores > 1 "
)
return True
if int(executor_gpus) > 1:
# For spark.executor.resource.gpu.amount > 1, we suppose user knows how to configure
# to make xgboost run successfully.
#
self.logger.info(
"Stage-level scheduling in xgboost will not work "
"when spark.executor.resource.gpu.amount>1"
)
return True
task_gpu_amount = sc.getConf().get("spark.task.resource.gpu.amount")
if task_gpu_amount is None:
# The ETL tasks will not grab a gpu when spark.task.resource.gpu.amount is not set,
# but with stage-level scheduling, we can make training task grab the gpu.
return False
if float(task_gpu_amount) == float(executor_gpus):
# spark.executor.resource.gpu.amount=spark.task.resource.gpu.amount "
# results in only 1 task running at a time, which may cause perf issue.
return True
# We can enable stage-level scheduling
return False
# CPU training doesn't require stage-level scheduling
return True
def _try_stage_level_scheduling(self, rdd: RDD) -> RDD:
"""Try to enable stage-level scheduling"""
if self._skip_stage_level_scheduling():
return rdd
ss = _get_spark_session()
# executor_cores will not be None
executor_cores = ss.sparkContext.getConf().get("spark.executor.cores")
assert executor_cores is not None
# Spark-rapids is a project to leverage GPUs to accelerate spark SQL.
# If spark-rapids is enabled, to avoid GPU OOM, we don't allow other
# ETL gpu tasks running alongside training tasks.
spark_plugins = ss.conf.get("spark.plugins", " ")
assert spark_plugins is not None
spark_rapids_sql_enabled = ss.conf.get("spark.rapids.sql.enabled", "true")
assert spark_rapids_sql_enabled is not None
task_cores = (
int(executor_cores)
if "com.nvidia.spark.SQLPlugin" in spark_plugins
and "true" == spark_rapids_sql_enabled.lower()
else (int(executor_cores) // 2) + 1
)
# Each training task requires cpu cores > total executor cores//2 + 1 which can
# make sure the tasks be sent to different executors.
#
# Please note that we can't use GPU to limit the concurrent tasks because of
# https://issues.apache.org/jira/browse/SPARK-45527.
task_gpus = 1.0
treqs = TaskResourceRequests().cpus(task_cores).resource("gpu", task_gpus)
rp = ResourceProfileBuilder().require(treqs).build
self.logger.info(
"XGBoost training tasks require the resource(cores=%s, gpu=%s).",
task_cores,
task_gpus,
)
return rdd.withResources(rp)
def _fit(self, dataset: DataFrame) -> "_SparkXGBModel":
# pylint: disable=too-many-statements, too-many-locals
self._validate_params()
@@ -994,14 +1110,16 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
)
def _run_job() -> Tuple[str, str]:
ret = (
rdd = (
dataset.mapInPandas(
_train_booster, schema="config string, booster string" # type: ignore
_train_booster, # type: ignore
schema="config string, booster string",
)
.rdd.barrier()
.mapPartitions(lambda x: x)
.collect()[0]
)
rdd_with_resource = self._try_stage_level_scheduling(rdd)
ret = rdd_with_resource.collect()[0]
return ret[0], ret[1]
get_logger("XGBoost-PySpark").info(

View File

@@ -129,6 +129,13 @@ def _is_local(spark_context: SparkContext) -> bool:
return spark_context._jsc.sc().isLocal()
def _is_standalone_or_localcluster(spark_context: SparkContext) -> bool:
master = spark_context.getConf().get("spark.master")
return master is not None and (
master.startswith("spark://") or master.startswith("local-cluster")
)
def _get_gpu_id(task_context: TaskContext) -> int:
"""Get the gpu id from the task resources"""
if task_context is None:

View File

@@ -75,3 +75,28 @@ def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:
with pytest.raises(ValueError, match="Either `group` or `qid`."):
ranker.fit(df, y, eval_set=[(X, y)])
def run_ranking_categorical(device: str) -> None:
"""Test LTR with categorical features."""
from sklearn.model_selection import cross_val_score
X, y = tm.make_categorical(
n_samples=512, n_features=10, n_categories=3, onehot=False
)
rng = np.random.default_rng(1994)
qid = rng.choice(3, size=y.shape[0])
qid = np.sort(qid)
X["qid"] = qid
ltr = xgb.XGBRanker(enable_categorical=True, device=device)
ltr.fit(X, y)
score = ltr.score(X, y)
assert score > 0.9
ltr = xgb.XGBRanker(enable_categorical=True, device=device)
# test using the score function inside sklearn.
scores = cross_val_score(ltr, X, y)
for s in scores:
assert s > 0.7

View File

@@ -384,7 +384,8 @@ class PrivateMmapConstStream : public AlignedResourceReadStream {
* @param length See the `length` parameter of `mmap` for details.
*/
explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
: AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {}
: AlignedResourceReadStream{std::shared_ptr<MmapResource>{ // NOLINT
new MmapResource{std::move(path), offset, length}}} {}
~PrivateMmapConstStream() noexcept(false) override;
};

View File

@@ -76,7 +76,7 @@ class RefResourceView {
[[nodiscard]] size_type size() const { return size_; } // NOLINT
[[nodiscard]] size_type size_bytes() const { // NOLINT
return Span{data(), size()}.size_bytes();
return Span<const value_type>{data(), size()}.size_bytes();
}
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT

View File

@@ -3,14 +3,23 @@
*/
#include "threading_utils.h"
#include <fstream>
#include <string>
#include <algorithm> // for max
#include <exception> // for exception
#include <filesystem> // for path, exists
#include <fstream> // for ifstream
#include <string> // for string
#include "xgboost/logging.h"
#include "common.h" // for DivRoundUp
namespace xgboost {
namespace common {
int32_t GetCfsCPUCount() noexcept {
namespace xgboost::common {
/**
* Modified from
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
*
* MIT License: Copyright (c) 2016 Domagoj Šarić
*/
std::int32_t GetCGroupV1Count(std::filesystem::path const& quota_path,
std::filesystem::path const& peroid_path) {
#if defined(__linux__)
// https://bugs.openjdk.java.net/browse/JDK-8146115
// http://hg.openjdk.java.net/jdk/hs/rev/7f22774a5f42
@@ -31,8 +40,8 @@ int32_t GetCfsCPUCount() noexcept {
}
};
// complete fair scheduler from Linux
auto const cfs_quota(read_int("/sys/fs/cgroup/cpu/cpu.cfs_quota_us"));
auto const cfs_period(read_int("/sys/fs/cgroup/cpu/cpu.cfs_period_us"));
auto const cfs_quota(read_int(quota_path.c_str()));
auto const cfs_period(read_int(peroid_path.c_str()));
if ((cfs_quota > 0) && (cfs_period > 0)) {
return std::max(cfs_quota / cfs_period, 1);
}
@@ -40,6 +49,47 @@ int32_t GetCfsCPUCount() noexcept {
return -1;
}
std::int32_t GetCGroupV2Count(std::filesystem::path const& bandwidth_path) noexcept(true) {
std::int32_t cnt{-1};
#if defined(__linux__)
namespace fs = std::filesystem;
std::int32_t a{0}, b{0};
auto warn = [] { LOG(WARNING) << "Invalid cgroupv2 file."; };
try {
std::ifstream fin{bandwidth_path, std::ios::in};
fin >> a;
fin >> b;
} catch (std::exception const&) {
warn();
return cnt;
}
if (a > 0 && b > 0) {
cnt = std::max(common::DivRoundUp(a, b), 1);
}
#endif // defined(__linux__)
return cnt;
}
std::int32_t GetCfsCPUCount() noexcept {
namespace fs = std::filesystem;
fs::path const bandwidth_path{"/sys/fs/cgroup/cpu.max"};
auto has_v2 = fs::exists(bandwidth_path);
if (has_v2) {
return GetCGroupV2Count(bandwidth_path);
}
fs::path const quota_path{"/sys/fs/cgroup/cpu/cpu.cfs_quota_us"};
fs::path const peroid_path{"/sys/fs/cgroup/cpu/cpu.cfs_period_us"};
auto has_v1 = fs::exists(quota_path) && fs::exists(peroid_path);
if (has_v1) {
return GetCGroupV1Count(quota_path, peroid_path);
}
return -1;
}
std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
// Don't use parallel if we are in a parallel region.
if (omp_in_parallel()) {
@@ -54,5 +104,4 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
n_threads = std::max(n_threads, 1);
return n_threads;
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -253,11 +253,6 @@ inline std::int32_t OmpGetThreadLimit() {
* \brief Get thread limit from CFS.
*
* This function has non-trivial overhead and should not be called repeatly.
*
* Modified from
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
*
* MIT License: Copyright (c) 2016 Domagoj Šarić
*/
std::int32_t GetCfsCPUCount() noexcept;

View File

@@ -1317,7 +1317,9 @@ class LearnerImpl : public LearnerIO {
if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) {
metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_));
auto config = obj_->DefaultMetricConfig();
metrics_.back()->LoadConfig(config);
if (!IsA<Null>(config)) {
metrics_.back()->LoadConfig(config);
}
metrics_.back()->Configure({cfg_.begin(), cfg_.end()});
}

View File

@@ -268,6 +268,13 @@ class PseudoHuberRegression : public FitIntercept {
}
FromJson(in["pseudo_huber_param"], &param_);
}
[[nodiscard]] Json DefaultMetricConfig() const override {
CHECK(param_.GetInitialised());
Json config{Object{}};
config["name"] = String{this->DefaultEvalMetric()};
config["pseudo_huber_param"] = ToJson(param_);
return config;
}
};
XGBOOST_REGISTER_OBJECTIVE(PseudoHuberRegression, "reg:pseudohubererror")

View File

@@ -8,13 +8,18 @@ echo "--- Build XGBoost JVM packages scala 2.12"
tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION}
echo "--- Stash XGBoost4J JARs (Scala 2.12)"
buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar"
echo "--- Build XGBoost JVM packages scala 2.13"
tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION} "" "" "true"
echo "--- Stash XGBoost4J JARs"
echo "--- Stash XGBoost4J JARs (Scala 2.13)"
buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"

View File

@@ -0,0 +1,8 @@
steps:
- block: ":rocket: Run this test job"
if: build.pull_request.id != null || build.branch =~ /^dependabot\//
- label: ":macos: Build and Test XGBoost for MacOS M1 with Clang 11"
command: "tests/buildkite/test-macos-m1-clang11.sh"
key: mac-m1-appleclang11
agents:
queue: mac-mini-m1

View File

@@ -0,0 +1,50 @@
#!/bin/bash
set -euo pipefail
source tests/buildkite/conftest.sh
# Display system info
echo "--- Display system information"
set -x
system_profiler SPSoftwareDataType
sysctl -n machdep.cpu.brand_string
uname -m
set +x
# Build XGBoost4J binary
echo "--- Build libxgboost4j.dylib"
set -x
mkdir build
pushd build
export JAVA_HOME=$(/usr/libexec/java_home)
cmake .. -GNinja -DJVM_BINDINGS=ON -DUSE_OPENMP=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=10.15
ninja -v
popd
rm -rf build
set +x
echo "--- Upload Python wheel"
set -x
pushd lib
mv -v libxgboost4j.dylib libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib
buildkite-agent artifact upload libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
aws s3 cp libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib \
s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \
--acl public-read --no-progress
fi
popd
set +x
# Ensure that XGBoost can be built with Clang 11
echo "--- Build and Test XGBoost with MacOS M1, Clang 11"
set -x
LLVM11_PATH=$(brew --prefix llvm\@11)
mkdir build
pushd build
cmake .. -GNinja -DCMAKE_C_COMPILER=${LLVM11_PATH}/bin/clang \
-DCMAKE_CXX_COMPILER=${LLVM11_PATH}/bin/clang++ -DGOOGLE_TEST=ON \
-DUSE_DMLC_GTEST=ON
ninja -v

View File

@@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
ARG NCCL_VERSION_ARG
ARG RAPIDS_VERSION_ARG

View File

@@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
# Install all basic requirements

View File

@@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
ARG NCCL_VERSION_ARG

View File

@@ -27,6 +27,9 @@ fi
mvn_profile_string=""
if [ "x$use_scala213" != "x" ]; then
export mvn_profile_string="-Pdefault,scala-2.13"
cd ..
python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
cd jvm-packages
fi
mvn --no-transfer-progress package $mvn_profile_string -Dspark.version=${spark_version} $gpu_options

View File

@@ -32,11 +32,10 @@ dependencies:
- jsonschema
- boto3
- awscli
- py-ubjson
- cffi
- pyarrow
- pyspark>=3.4.0
- cloudpickle
- pip:
- sphinx_rtd_theme
- datatable
- py-ubjson

View File

@@ -27,6 +27,9 @@ rm -rf ../build/
# Deploy to S3 bucket xgboost-maven-repo
mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests
# Deploy scala 2.13 to S3 bucket xgboost-maven-repo
cd ..
python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
cd jvm-packages/
mvn --no-transfer-progress package deploy -P release-to-s3,default,scala-2.13 -Dspark.version=${spark_version} -DskipTests

View File

@@ -21,9 +21,18 @@ if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
fi
# including maven profiles for different scala versions: 2.12 is the default at the moment.
for _maven_profile_string in "" "-Pdefault,scala-2.13"; do
for scala_binary_version in "2.12" "2.13"; do
cd ..
python dev/change_scala_version.py --scala-version ${scala_binary_version}
cd jvm-packages
scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout)
scala_binary_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.binary.version -q -DforceStdout)
if [[ "$scala_binary_version" == "2.12" ]]; then
_maven_profile_string=""
elif [[ "$scala_binary_version" == "2.13" ]]; then
_maven_profile_string="-Pdefault,scala-2.13"
else
echo "Unexpected scala version: $scala_version ($scala_binary_version)."
fi
# Install XGBoost4J JAR into local Maven repository
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar

View File

@@ -148,7 +148,8 @@ TEST(IO, Resource) {
fout << 1.0 << std::endl;
fout.close();
auto resource = std::make_shared<MmapResource>(path, 0, sizeof(double));
auto resource = std::shared_ptr<MmapResource>{
new MmapResource{path, 0, sizeof(double)}};
ASSERT_EQ(resource->Size(), sizeof(double));
ASSERT_EQ(resource->Type(), ResourceHandler::kMmap);
ASSERT_EQ(resource->DataAs<double>()[0], val);

View File

@@ -6,6 +6,7 @@
#include <xgboost/objective.h>
#include "../helpers.h"
#include "../objective_helpers.h"
TEST(Objective, UnknownFunction) {
xgboost::ObjFunction* obj = nullptr;
@@ -43,4 +44,61 @@ TEST(Objective, PredTransform) {
ASSERT_TRUE(predts.HostCanWrite());
}
}
class TestDefaultObjConfig : public ::testing::TestWithParam<std::string> {
Context ctx_;
public:
void Run(std::string objective) {
auto Xy = MakeFmatForObjTest(objective);
std::unique_ptr<Learner> learner{Learner::Create({Xy})};
std::unique_ptr<ObjFunction> objfn{ObjFunction::Create(objective, &ctx_)};
learner->SetParam("objective", objective);
if (objective.find("multi") != std::string::npos) {
learner->SetParam("num_class", "3");
objfn->Configure(Args{{"num_class", "3"}});
} else if (objective.find("quantile") != std::string::npos) {
learner->SetParam("quantile_alpha", "0.5");
objfn->Configure(Args{{"quantile_alpha", "0.5"}});
} else {
objfn->Configure(Args{});
}
learner->Configure();
learner->UpdateOneIter(0, Xy);
learner->EvalOneIter(0, {Xy}, {"train"});
Json config{Object{}};
learner->SaveConfig(&config);
auto jobj = get<Object const>(config["learner"]["objective"]);
ASSERT_TRUE(jobj.find("name") != jobj.cend());
// FIXME(jiamingy): We should have the following check, but some legacy parameter like
// "pos_weight", "delta_step" in objectives are not in metrics.
// if (jobj.size() > 1) {
// ASSERT_FALSE(IsA<Null>(objfn->DefaultMetricConfig()));
// }
auto mconfig = objfn->DefaultMetricConfig();
if (!IsA<Null>(mconfig)) {
// make sure metric can handle it
std::unique_ptr<Metric> metricfn{Metric::Create(get<String const>(mconfig["name"]), &ctx_)};
metricfn->LoadConfig(mconfig);
Json loaded(Object{});
metricfn->SaveConfig(&loaded);
metricfn->Configure(Args{});
ASSERT_EQ(mconfig, loaded);
}
}
};
TEST_P(TestDefaultObjConfig, Objective) {
std::string objective = GetParam();
this->Run(objective);
}
INSTANTIATE_TEST_SUITE_P(Objective, TestDefaultObjConfig,
::testing::ValuesIn(MakeObjNamesForTest()),
[](const ::testing::TestParamInfo<TestDefaultObjConfig::ParamType>& info) {
return ObjTestNameGenerator(info);
});
} // namespace xgboost

View File

@@ -0,0 +1,31 @@
/**
* Copyright (c) 2023, XGBoost contributors
*/
#include "objective_helpers.h"
#include "../../src/common/linalg_op.h" // for begin, end
#include "helpers.h" // for RandomDataGenerator
namespace xgboost {
std::shared_ptr<DMatrix> MakeFmatForObjTest(std::string const& obj) {
auto constexpr kRows = 10, kCols = 10;
auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
auto& h_upper = p_fmat->Info().labels_upper_bound_.HostVector();
auto& h_lower = p_fmat->Info().labels_lower_bound_.HostVector();
h_lower.resize(kRows);
h_upper.resize(kRows);
for (size_t i = 0; i < kRows; ++i) {
h_lower[i] = 1;
h_upper[i] = 10;
}
if (obj.find("rank:") != std::string::npos) {
auto h_label = p_fmat->Info().labels.HostView();
std::size_t k = 0;
for (auto& v : h_label) {
v = k % 2 == 0;
++k;
}
}
return p_fmat;
};
} // namespace xgboost

View File

@@ -1,6 +1,8 @@
/**
* Copyright (c) 2023, XGBoost contributors
*/
#pragma once
#include <dmlc/registry.h> // for Registry
#include <gtest/gtest.h>
#include <xgboost/objective.h> // for ObjFunctionReg
@@ -29,4 +31,6 @@ inline std::string ObjTestNameGenerator(const ::testing::TestParamInfo<ParamType
}
return name;
};
std::shared_ptr<DMatrix> MakeFmatForObjTest(std::string const& obj);
} // namespace xgboost

View File

@@ -655,33 +655,11 @@ TEST_F(InitBaseScore, InitWithPredict) { this->TestInitWithPredt(); }
TEST_F(InitBaseScore, UpdateProcess) { this->TestUpdateProcess(); }
class TestColumnSplit : public ::testing::TestWithParam<std::string> {
static auto MakeFmat(std::string const& obj) {
auto constexpr kRows = 10, kCols = 10;
auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
auto& h_upper = p_fmat->Info().labels_upper_bound_.HostVector();
auto& h_lower = p_fmat->Info().labels_lower_bound_.HostVector();
h_lower.resize(kRows);
h_upper.resize(kRows);
for (size_t i = 0; i < kRows; ++i) {
h_lower[i] = 1;
h_upper[i] = 10;
}
if (obj.find("rank:") != std::string::npos) {
auto h_label = p_fmat->Info().labels.HostView();
std::size_t k = 0;
for (auto& v : h_label) {
v = k % 2 == 0;
++k;
}
}
return p_fmat;
};
void TestBaseScore(std::string objective, float expected_base_score, Json expected_model) {
auto const world_size = collective::GetWorldSize();
auto const rank = collective::GetRank();
auto p_fmat = MakeFmat(objective);
auto p_fmat = MakeFmatForObjTest(objective);
std::shared_ptr<DMatrix> sliced{p_fmat->SliceCol(world_size, rank)};
std::unique_ptr<Learner> learner{Learner::Create({sliced})};
learner->SetParam("tree_method", "approx");
@@ -705,7 +683,7 @@ class TestColumnSplit : public ::testing::TestWithParam<std::string> {
public:
void Run(std::string objective) {
auto p_fmat = MakeFmat(objective);
auto p_fmat = MakeFmatForObjTest(objective);
std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
learner->SetParam("tree_method", "approx");
learner->SetParam("objective", objective);

View File

@@ -9,7 +9,7 @@ import pytest
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.ranking import run_ranking_qid_df
from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df
sys.path.append("tests/python")
import test_with_sklearn as twskl # noqa
@@ -165,6 +165,11 @@ def test_ranking_qid_df():
run_ranking_qid_df(cudf, "gpu_hist")
@pytest.mark.skipif(**tm.no_pandas())
def test_ranking_categorical() -> None:
run_ranking_categorical(device="cuda")
@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.mgpu
def test_device_ordinal() -> None:

View File

@@ -211,7 +211,7 @@ class TestPandas:
y = np.random.randn(kRows)
w = np.random.uniform(size=kRows).astype(np.float32)
w_pd = pd.DataFrame(w)
data = xgb.DMatrix(X, y, w_pd)
data = xgb.DMatrix(X, y, weight=w_pd)
assert data.num_row() == kRows
assert data.num_col() == kCols
@@ -301,14 +301,14 @@ class TestPandas:
@pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
def test_nullable_type(self, DMatrixT) -> None:
from pandas.api.types import is_categorical_dtype
from xgboost.data import is_pd_cat_dtype
for orig, df in pd_dtypes():
if hasattr(df.dtypes, "__iter__"):
enable_categorical = any(is_categorical_dtype for dtype in df.dtypes)
enable_categorical = any(is_pd_cat_dtype(dtype) for dtype in df.dtypes)
else:
# series
enable_categorical = is_categorical_dtype(df.dtype)
enable_categorical = is_pd_cat_dtype(df.dtype)
f0_orig = orig[orig.columns[0]] if isinstance(orig, pd.DataFrame) else orig
f0 = df[df.columns[0]] if isinstance(df, pd.DataFrame) else df

View File

@@ -12,7 +12,7 @@ from sklearn.utils.estimator_checks import parametrize_with_checks
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.ranking import run_ranking_qid_df
from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df
from xgboost.testing.shared import get_feature_weights, validate_data_initialization
from xgboost.testing.updater import get_basescore
@@ -173,6 +173,11 @@ def test_ranking():
np.testing.assert_almost_equal(pred, pred_orig)
@pytest.mark.skipif(**tm.no_pandas())
def test_ranking_categorical() -> None:
run_ranking_categorical(device="cpu")
def test_ranking_metric() -> None:
from sklearn.metrics import roc_auc_score
@@ -935,6 +940,7 @@ def save_load_model(model_path):
predt_0 = clf.predict(X)
clf.save_model(model_path)
clf.load_model(model_path)
assert clf.booster == "gblinear"
predt_1 = clf.predict(X)
np.testing.assert_allclose(predt_0, predt_1)
assert clf.best_iteration == best_iteration
@@ -950,25 +956,26 @@ def save_load_model(model_path):
def test_save_load_model():
with tempfile.TemporaryDirectory() as tempdir:
model_path = os.path.join(tempdir, 'digits.model')
model_path = os.path.join(tempdir, "digits.model")
save_load_model(model_path)
with tempfile.TemporaryDirectory() as tempdir:
model_path = os.path.join(tempdir, 'digits.model.json')
model_path = os.path.join(tempdir, "digits.model.json")
save_load_model(model_path)
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
with tempfile.TemporaryDirectory() as tempdir:
model_path = os.path.join(tempdir, 'digits.model.ubj')
model_path = os.path.join(tempdir, "digits.model.ubj")
digits = load_digits(n_class=2)
y = digits['target']
X = digits['data']
booster = xgb.train({'tree_method': 'hist',
'objective': 'binary:logistic'},
dtrain=xgb.DMatrix(X, y),
num_boost_round=4)
y = digits["target"]
X = digits["data"]
booster = xgb.train(
{"tree_method": "hist", "objective": "binary:logistic"},
dtrain=xgb.DMatrix(X, y),
num_boost_round=4,
)
predt_0 = booster.predict(xgb.DMatrix(X))
booster.save_model(model_path)
cls = xgb.XGBClassifier()
@@ -1002,6 +1009,8 @@ def test_save_load_model():
clf = xgb.XGBClassifier()
clf.load_model(model_path)
assert clf.classes_.size == 10
assert clf.objective == "multi:softprob"
np.testing.assert_equal(clf.classes_, np.arange(10))
assert clf.n_classes_ == 10

View File

@@ -1932,6 +1932,7 @@ class TestWithDask:
cls.client = client
cls.fit(X, y)
predt_0 = cls.predict(X)
proba_0 = cls.predict_proba(X)
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, "model.pkl")
@@ -1941,7 +1942,9 @@ class TestWithDask:
with open(path, "rb") as fd:
cls = pickle.load(fd)
predt_1 = cls.predict(X)
proba_1 = cls.predict_proba(X)
np.testing.assert_allclose(predt_0.compute(), predt_1.compute())
np.testing.assert_allclose(proba_0.compute(), proba_1.compute())
path = os.path.join(tmpdir, "cls.json")
cls.save_model(path)
@@ -1950,16 +1953,20 @@ class TestWithDask:
cls.load_model(path)
assert cls.n_classes_ == 10
predt_2 = cls.predict(X)
proba_2 = cls.predict_proba(X)
np.testing.assert_allclose(predt_0.compute(), predt_2.compute())
np.testing.assert_allclose(proba_0.compute(), proba_2.compute())
# Use single node to load
cls = xgb.XGBClassifier()
cls.load_model(path)
assert cls.n_classes_ == 10
predt_3 = cls.predict(X_)
proba_3 = cls.predict_proba(X_)
np.testing.assert_allclose(predt_0.compute(), predt_3)
np.testing.assert_allclose(proba_0.compute(), proba_3)
def test_dask_unsupported_features(client: "Client") -> None: