Merge branch 'master' into sync-condition-2023Apr11

This commit is contained in:
amdsc21 2023-04-11 19:38:38 +02:00
commit 08bc4b0c0f
56 changed files with 1912 additions and 983 deletions

20
NEWS.md
View File

@ -3,6 +3,26 @@ XGBoost Change Log
This file records the changes in xgboost library in reverse chronological order.
## 1.7.5 (2023 Mar 30)
This is a patch release for bug fixes.
* C++ requirement is updated to C++-17, along with which, CUDA 11.8 is used as the default CTK. (#8860, #8855, #8853)
* Fix import for pyspark ranker. (#8692)
* Fix Windows binary wheel to be compatible with Poetry (#8991)
* Fix GPU hist with column sampling. (#8850)
* Make sure iterative DMatrix is properly initialized. (#8997)
* [R] Update link in document. (#8998)
## 1.7.4 (2023 Feb 16)
This is a patch release for bug fixes.
* [R] Fix OpenMP detection on macOS. (#8684)
* [Python] Make sure input numpy array is aligned. (#8690)
* Fix feature interaction with column sampling in gpu_hist evaluator. (#8754)
* Fix GPU L1 error. (#8749)
* [PySpark] Fix feature types param (#8772)
* Fix ranking with quantile dmatrix and group weight. (#8762)
## 1.7.3 (2023 Jan 6)
This is a patch release for bug fixes.

View File

@ -1,4 +1,4 @@
Copyright (c) 2014 by Tianqi Chen and Contributors
Copyright (c) 2014-2023, Tianqi Chen and XBGoost Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

View File

@ -34,7 +34,7 @@
#' The branches that also used for missing values are marked as bold
#' (as in "carrying extra capacity").
#'
#' This function uses \href{http://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
#' This function uses \href{https://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
#'
#' @return
#'

View File

@ -67,7 +67,7 @@ The "Yes" branches are marked by the "< split_value" label.
The branches that also used for missing values are marked as bold
(as in "carrying extra capacity").
This function uses \href{http://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
This function uses \href{https://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
}
\examples{
data(agaricus.train, package='xgboost')

View File

@ -2,6 +2,8 @@
Quantile Regression
===================
.. versionadded:: 2.0.0
The script is inspired by this awesome example in sklearn:
https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html

343
dev/release-artifacts.py Normal file
View File

@ -0,0 +1,343 @@
"""Simple script for managing Python, R, and source release packages.
tqdm, sh are required to run this script.
"""
import argparse
import os
import shutil
import subprocess
import tarfile
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.request import urlretrieve
import tqdm
from packaging import version
from sh.contrib import git
# The package building is managed by Jenkins CI.
PREFIX = "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/release_"
ROOT = Path(__file__).absolute().parent.parent
DIST = ROOT / "python-package" / "dist"
pbar = None
class DirectoryExcursion:
def __init__(self, path: Union[os.PathLike, str]) -> None:
self.path = path
self.curdir = os.path.normpath(os.path.abspath(os.path.curdir))
def __enter__(self) -> None:
os.chdir(self.path)
def __exit__(self, *args: Any) -> None:
os.chdir(self.curdir)
def show_progress(block_num, block_size, total_size):
"Show file download progress."
global pbar
if pbar is None:
pbar = tqdm.tqdm(total=total_size / 1024, unit="kB")
downloaded = block_num * block_size
if downloaded < total_size:
upper = (total_size - downloaded) / 1024
pbar.update(min(block_size / 1024, upper))
else:
pbar.close()
pbar = None
def retrieve(url, filename=None):
print(f"{url} -> {filename}")
return urlretrieve(url, filename, reporthook=show_progress)
def latest_hash() -> str:
"Get latest commit hash."
ret = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True)
assert ret.returncode == 0, "Failed to get latest commit hash."
commit_hash = ret.stdout.decode("utf-8").strip()
return commit_hash
def download_wheels(
platforms: List[str],
dir_URL: str,
src_filename_prefix: str,
target_filename_prefix: str,
outdir: str,
) -> List[str]:
"""Download all binary wheels. dir_URL is the URL for remote directory storing the
release wheels.
"""
filenames = []
outdir = os.path.join(outdir, "dist")
if not os.path.exists(outdir):
os.mkdir(outdir)
for platform in platforms:
src_wheel = src_filename_prefix + platform + ".whl"
url = dir_URL + src_wheel
target_wheel = target_filename_prefix + platform + ".whl"
filename = os.path.join(outdir, target_wheel)
filenames.append(filename)
retrieve(url=url, filename=filename)
ret = subprocess.run(["twine", "check", filename], capture_output=True)
assert ret.returncode == 0, "Failed twine check"
stderr = ret.stderr.decode("utf-8")
stdout = ret.stdout.decode("utf-8")
assert stderr.find("warning") == -1, "Unresolved warnings:\n" + stderr
assert stdout.find("warning") == -1, "Unresolved warnings:\n" + stdout
return filenames
def make_pysrc_wheel(release: str, outdir: str) -> None:
"""Make Python source distribution."""
dist = os.path.join(outdir, "dist")
if not os.path.exists(dist):
os.mkdir(dist)
with DirectoryExcursion(os.path.join(ROOT, "python-package")):
subprocess.check_call(["python", "setup.py", "sdist"])
src = os.path.join(DIST, f"xgboost-{release}.tar.gz")
subprocess.check_call(["twine", "check", src])
shutil.move(src, os.path.join(dist, f"xgboost-{release}.tar.gz"))
def download_py_packages(
branch: str, major: int, minor: int, commit_hash: str, outdir: str
) -> None:
platforms = [
"win_amd64",
"manylinux2014_x86_64",
"manylinux2014_aarch64",
"macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64",
"macosx_12_0_arm64",
]
branch = branch.split("_")[1] # release_x.y.z
dir_URL = PREFIX + branch + "/"
src_filename_prefix = "xgboost-" + args.release + "%2B" + commit_hash + "-py3-none-"
target_filename_prefix = "xgboost-" + args.release + "-py3-none-"
if not os.path.exists(DIST):
os.mkdir(DIST)
filenames = download_wheels(
platforms, dir_URL, src_filename_prefix, target_filename_prefix, outdir
)
print("List of downloaded wheels:", filenames)
print(
"""
Following steps should be done manually:
- Upload pypi package by `python3 -m twine upload dist/<Package Name>` for all wheels.
- Check the uploaded files on `https://pypi.org/project/xgboost/<VERSION>/#files` and
`pip install xgboost==<VERSION>` """
)
def download_r_packages(
release: str, branch: str, rc: str, commit: str, outdir: str
) -> Tuple[Dict[str, str], List[str]]:
platforms = ["win64", "linux"]
dirname = os.path.join(outdir, "r-packages")
if not os.path.exists(dirname):
os.mkdir(dirname)
filenames = []
branch = branch.split("_")[1] # release_x.y.z
urls = {}
for plat in platforms:
url = f"{PREFIX}{branch}/xgboost_r_gpu_{plat}_{commit}.tar.gz"
if not rc:
filename = f"xgboost_r_gpu_{plat}_{release}.tar.gz"
else:
filename = f"xgboost_r_gpu_{plat}_{release}-{rc}.tar.gz"
target = os.path.join(dirname, filename)
retrieve(url=url, filename=target)
filenames.append(target)
urls[plat] = url
print("Finished downloading R packages:", filenames)
hashes = []
with DirectoryExcursion(os.path.join(outdir, "r-packages")):
for f in filenames:
ret = subprocess.run(["sha256sum", os.path.basename(f)], capture_output=True)
h = ret.stdout.decode().strip()
hashes.append(h)
return urls, hashes
def check_path():
root = os.path.abspath(os.path.curdir)
assert os.path.basename(root) == "xgboost", "Must be run on project root."
def make_src_package(release: str, outdir: str) -> Tuple[str, str]:
tarname = f"xgboost-{release}.tar.gz"
tarpath = os.path.join(outdir, tarname)
if os.path.exists(tarpath):
os.remove(tarpath)
with tempfile.TemporaryDirectory() as tmpdir_str:
tmpdir = Path(tmpdir_str)
shutil.copytree(os.path.curdir, tmpdir / "xgboost")
with DirectoryExcursion(tmpdir / "xgboost"):
ret = subprocess.run(
["git", "submodule", "foreach", "--quiet", "echo $sm_path"],
capture_output=True,
)
submodules = ret.stdout.decode().strip().split()
for mod in submodules:
mod_path = os.path.join(os.path.abspath(os.path.curdir), mod, ".git")
os.remove(mod_path)
shutil.rmtree(".git")
with tarfile.open(tarpath, "x:gz") as tar:
src = tmpdir / "xgboost"
tar.add(src, arcname="xgboost")
with DirectoryExcursion(os.path.dirname(tarpath)):
ret = subprocess.run(["sha256sum", tarname], capture_output=True)
h = ret.stdout.decode().strip()
return tarname, h
def release_note(
release: str,
artifact_hashes: List[str],
r_urls: Dict[str, str],
tarname: str,
outdir: str,
) -> None:
"""Generate a note for GitHub release description."""
r_gpu_linux_url = r_urls["linux"]
r_gpu_win64_url = r_urls["win64"]
src_tarball = (
f"https://github.com/dmlc/xgboost/releases/download/v{release}/{tarname}"
)
hash_note = "\n".join(artifact_hashes)
end_note = f"""
### Additional artifacts:
You can verify the downloaded packages by running the following command on your Unix shell:
``` sh
echo "<hash> <artifact>" | shasum -a 256 --check
```
```
{hash_note}
```
**Experimental binary packages for R with CUDA enabled**
* xgboost_r_gpu_linux_1.7.5.tar.gz: [Download]({r_gpu_linux_url})
* xgboost_r_gpu_win64_1.7.5.tar.gz: [Download]({r_gpu_win64_url})
**Source tarball**
* xgboost.tar.gz: [Download]({src_tarball})"""
print(end_note)
with open(os.path.join(outdir, "end_note.md"), "w") as fd:
fd.write(end_note)
def main(args: argparse.Namespace) -> None:
check_path()
rel = version.parse(args.release)
assert isinstance(rel, version.Version)
major = rel.major
minor = rel.minor
patch = rel.micro
print("Release:", rel)
if not rel.is_prerelease:
# Major release
rc: Optional[str] = None
rc_ver: Optional[int] = None
else:
# RC release
major = rel.major
minor = rel.minor
patch = rel.micro
assert rel.pre is not None
rc, rc_ver = rel.pre
assert rc == "rc"
release = str(major) + "." + str(minor) + "." + str(patch)
if args.branch is not None:
branch = args.branch
else:
branch = "release_" + str(major) + "." + str(minor) + ".0"
git.clean("-xdf")
git.checkout(branch)
git.pull("origin", branch)
git.submodule("update")
commit_hash = latest_hash()
if not os.path.exists(args.outdir):
os.mkdir(args.outdir)
# source tarball
hashes: List[str] = []
tarname, h = make_src_package(release, args.outdir)
hashes.append(h)
# CUDA R packages
urls, hr = download_r_packages(
release,
branch,
"" if rc is None else rc + str(rc_ver),
commit_hash,
args.outdir,
)
hashes.extend(hr)
# Python source wheel
make_pysrc_wheel(release, args.outdir)
# Python binary wheels
download_py_packages(branch, major, minor, commit_hash, args.outdir)
# Write end note
release_note(release, hashes, urls, tarname, args.outdir)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--release",
type=str,
required=True,
help="Version tag, e.g. '1.3.2', or '1.5.0rc1'",
)
parser.add_argument(
"--branch",
type=str,
default=None,
help=(
"Optional branch. Usually patch releases reuse the same branch of the"
" major release, but there can be exception."
),
)
parser.add_argument(
"--outdir",
type=str,
default=None,
required=True,
help="Directory to store the generated packages.",
)
args = parser.parse_args()
main(args)

View File

@ -1,200 +0,0 @@
"""Simple script for downloading and checking pypi release wheels.
tqdm, sh are required to run this script.
"""
import argparse
import os
import subprocess
from typing import List, Optional
from urllib.request import urlretrieve
import tqdm
from packaging import version
from sh.contrib import git
# The package building is managed by Jenkins CI.
PREFIX = "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/release_"
DIST = os.path.join(os.path.curdir, "python-package", "dist")
pbar = None
def show_progress(block_num, block_size, total_size):
"Show file download progress."
global pbar
if pbar is None:
pbar = tqdm.tqdm(total=total_size / 1024, unit="kB")
downloaded = block_num * block_size
if downloaded < total_size:
upper = (total_size - downloaded) / 1024
pbar.update(min(block_size / 1024, upper))
else:
pbar.close()
pbar = None
def retrieve(url, filename=None):
print(f"{url} -> {filename}")
return urlretrieve(url, filename, reporthook=show_progress)
def latest_hash() -> str:
"Get latest commit hash."
ret = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True)
assert ret.returncode == 0, "Failed to get latest commit hash."
commit_hash = ret.stdout.decode("utf-8").strip()
return commit_hash
def download_wheels(
platforms: List[str],
dir_URL: str,
src_filename_prefix: str,
target_filename_prefix: str,
) -> List[str]:
"""Download all binary wheels. dir_URL is the URL for remote directory storing the release
wheels
"""
filenames = []
for platform in platforms:
src_wheel = src_filename_prefix + platform + ".whl"
url = dir_URL + src_wheel
target_wheel = target_filename_prefix + platform + ".whl"
filename = os.path.join(DIST, target_wheel)
filenames.append(filename)
retrieve(url=url, filename=filename)
ret = subprocess.run(["twine", "check", filename], capture_output=True)
assert ret.returncode == 0, "Failed twine check"
stderr = ret.stderr.decode("utf-8")
stdout = ret.stdout.decode("utf-8")
assert stderr.find("warning") == -1, "Unresolved warnings:\n" + stderr
assert stdout.find("warning") == -1, "Unresolved warnings:\n" + stdout
return filenames
def download_py_packages(branch: str, major: int, minor: int, commit_hash: str) -> None:
platforms = [
"win_amd64",
"manylinux2014_x86_64",
"manylinux2014_aarch64",
"macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64",
"macosx_12_0_arm64"
]
branch = branch.split("_")[1] # release_x.y.z
dir_URL = PREFIX + branch + "/"
src_filename_prefix = "xgboost-" + args.release + "%2B" + commit_hash + "-py3-none-"
target_filename_prefix = "xgboost-" + args.release + "-py3-none-"
if not os.path.exists(DIST):
os.mkdir(DIST)
filenames = download_wheels(
platforms, dir_URL, src_filename_prefix, target_filename_prefix
)
print("List of downloaded wheels:", filenames)
print(
"""
Following steps should be done manually:
- Generate source package by running `python setup.py sdist`.
- Upload pypi package by `python3 -m twine upload dist/<Package Name>` for all wheels.
- Check the uploaded files on `https://pypi.org/project/xgboost/<VERSION>/#files` and `pip
install xgboost==<VERSION>` """
)
def download_r_packages(release: str, branch: str, rc: str, commit: str) -> None:
platforms = ["win64", "linux"]
dirname = "./r-packages"
if not os.path.exists(dirname):
os.mkdir(dirname)
filenames = []
branch = branch.split("_")[1] # release_x.y.z
for plat in platforms:
url = f"{PREFIX}{branch}/xgboost_r_gpu_{plat}_{commit}.tar.gz"
if not rc:
filename = f"xgboost_r_gpu_{plat}_{release}.tar.gz"
else:
filename = f"xgboost_r_gpu_{plat}_{release}-{rc}.tar.gz"
target = os.path.join(dirname, filename)
retrieve(url=url, filename=target)
filenames.append(target)
print("Finished downloading R packages:", filenames)
def check_path():
root = os.path.abspath(os.path.curdir)
assert os.path.basename(root) == "xgboost", "Must be run on project root."
def main(args: argparse.Namespace) -> None:
check_path()
rel = version.parse(args.release)
assert isinstance(rel, version.Version)
major = rel.major
minor = rel.minor
patch = rel.micro
print("Release:", rel)
if not rel.is_prerelease:
# Major release
rc: Optional[str] = None
rc_ver: Optional[int] = None
else:
# RC release
major = rel.major
minor = rel.minor
patch = rel.micro
assert rel.pre is not None
rc, rc_ver = rel.pre
assert rc == "rc"
release = str(major) + "." + str(minor) + "." + str(patch)
if args.branch is not None:
branch = args.branch
else:
branch = "release_" + str(major) + "." + str(minor) + ".0"
git.clean("-xdf")
git.checkout(branch)
git.pull("origin", branch)
git.submodule("update")
commit_hash = latest_hash()
download_r_packages(
release, branch, "" if rc is None else rc + str(rc_ver), commit_hash
)
download_py_packages(branch, major, minor, commit_hash)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--release",
type=str,
required=True,
help="Version tag, e.g. '1.3.2', or '1.5.0rc1'"
)
parser.add_argument(
"--branch",
type=str,
default=None,
help=(
"Optional branch. Usually patch releases reuse the same branch of the"
" major release, but there can be exception."
)
)
args = parser.parse_args()
main(args)

View File

@ -1,91 +0,0 @@
#!/usr/bin/env bash
# Helper script for creating release tarball.
print_usage() {
printf "Script for making release source tarball.\n"
printf "Usage:\n\trelease-tarball.sh <TAG>\n\n"
}
print_error() {
local msg=$1
printf "\u001b[31mError\u001b[0m: $msg\n\n"
print_usage
}
check_input() {
local TAG=$1
if [ -z $TAG ]; then
print_error "Empty tag argument"
exit -1
fi
}
check_curdir() {
local CUR_ABS=$1
printf "Current directory: ${CUR_ABS}\n"
local CUR=$(basename $CUR_ABS)
if [ $CUR == "dev" ]; then
cd ..
CUR=$(basename $(pwd))
fi
if [ $CUR != "xgboost" ]; then
print_error "Must be in project root or xgboost/dev. Current directory: ${CUR}"
exit -1;
fi
}
# Remove all submodules.
cleanup_git() {
local TAG=$1
check_input $TAG
git checkout $TAG || exit -1
local SUBMODULES=$(grep "path = " ./.gitmodules | cut -f 3 --delimiter=' ' -)
for module in $SUBMODULES; do
rm -rf ${module}/.git
done
rm -rf .git
}
make_tarball() {
local SRCDIR=$1
local CUR_ABS=$2
tar -czf xgboost.tar.gz xgboost
printf "Copying ${SRCDIR}/xgboost.tar.gz back to ${CUR_ABS}/xgboost.tar.gz .\n"
cp xgboost.tar.gz ${CUR_ABS}/xgboost.tar.gz
printf "Writing hash to ${CUR_ABS}/hash .\n"
sha256sum -z ${CUR_ABS}/xgboost.tar.gz | cut -f 1 --delimiter=' ' > ${CUR_ABS}/hash
}
main() {
local TAG=$1
check_input $TAG
local CUR_ABS=$(pwd)
check_curdir $CUR_ABS
local TMPDIR=$(mktemp -d)
printf "tmpdir: ${TMPDIR}\n"
git clean -xdf || exit -1
cp -R . $TMPDIR/xgboost
pushd .
cd $TMPDIR/xgboost
cleanup_git $TAG
cd ..
make_tarball $TMPDIR $CUR_ABS
popd
rm -rf $TMPDIR
}
main $1

View File

@ -23,7 +23,9 @@ Making a Release
5. Make a release on GitHub tag page, which might be done with previous step if the tag is created on GitHub.
6. Submit pip, CRAN, and Maven packages.
+ The pip package is maintained by `Hyunsu Cho <https://github.com/hcho3>`__ and `Jiaming Yuan <https://github.com/trivialfis>`__. There's a helper script for downloading pre-built wheels and R packages ``xgboost/dev/release-pypi-r.py`` along with simple instructions for using ``twine``.
There are helper scripts for automating the process in ``xgboost/dev/``.
+ The pip package is maintained by `Hyunsu Cho <https://github.com/hcho3>`__ and `Jiaming Yuan <https://github.com/trivialfis>`__.
+ The CRAN package is maintained by `Tong He <https://github.com/hetong007>`_ and `Jiaming Yuan <https://github.com/trivialfis>`__.

View File

@ -360,7 +360,13 @@ Specify the learning task and the corresponding learning objective. The objectiv
- ``reg:logistic``: logistic regression.
- ``reg:pseudohubererror``: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
- ``reg:absoluteerror``: Regression with L1 error. When tree model is used, leaf value is refreshed after tree construction. If used in distributed training, the leaf value is calculated as the mean value from all workers, which is not guaranteed to be optimal.
.. versionadded:: 1.7.0
- ``reg:quantileerror``: Quantile loss, also known as ``pinball loss``. See later sections for its parameter and :ref:`sphx_glr_python_examples_quantile_regression.py` for a worked example.
.. versionadded:: 2.0.0
- ``binary:logistic``: logistic regression for binary classification, output probability
- ``binary:logitraw``: logistic regression for binary classification, output score before logistic transformation
- ``binary:hinge``: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
@ -467,6 +473,8 @@ Parameter for using Quantile Loss (``reg:quantileerror``)
* ``quantile_alpha``: A scala or a list of targeted quantiles.
.. versionadded:: 2.0.0
Parameter for using AFT Survival Loss (``survival:aft``) and Negative Log Likelihood of AFT metric (``aft-nloglik``)
====================================================================================================================

View File

@ -66,7 +66,7 @@ on a dask cluster:
Here we first create a cluster in single-node mode with
:py:class:`distributed.LocalCluster`, then connect a :py:class:`distributed.Client` to
this cluster, setting up an environment for later computation. Notice that the cluster
construction is guared by ``__name__ == "__main__"``, which is necessary otherwise there
construction is guarded by ``__name__ == "__main__"``, which is necessary otherwise there
might be obscure errors.
We then create a :py:class:`xgboost.dask.DaskDMatrix` object and pass it to
@ -226,13 +226,9 @@ collection.
Working with other clusters
***************************
``LocalCluster`` is mostly used for testing. In real world applications some other
clusters might be preferred. Examples are like ``LocalCUDACluster`` for single node
multi-GPU instance, manually launched cluster by using command line utilities like
``dask-worker`` from ``distributed`` for not yet automated environments. Some special
clusters like ``KubeCluster`` from ``dask-kubernetes`` package are also possible. The
dask API in xgboost is orthogonal to the cluster type and can be used with any of them. A
typical testing workflow with ``KubeCluster`` looks like this:
Using Dask's ``LocalCluster`` is convenient for getting started quickly on a single-machine. Once you're ready to scale your work, though, there are a number of ways to deploy Dask on a distributed cluster. You can use `Dask-CUDA <https://docs.rapids.ai/api/dask-cuda/stable/quickstart.html>`_, for example, for GPUs and you can use Dask Cloud Provider to `deploy Dask clusters in the cloud <https://docs.dask.org/en/stable/deploying.html#cloud>`_. See the `Dask documentation for a more comprehensive list <https://docs.dask.org/en/stable/deploying.html#distributed-computing>`_.
In the example below, a ``KubeCluster`` is used for `deploying Dask on Kubernetes <https://docs.dask.org/en/stable/deploying-kubernetes.html>`_:
.. code-block:: python
@ -272,8 +268,7 @@ typical testing workflow with ``KubeCluster`` looks like this:
# main function will connect to that cluster and start training xgboost model.
main()
However, these clusters might have their subtle differences like network configuration, or
Different cluster classes might have subtle differences like network configuration, or
specific cluster implementation might contains bugs that we are not aware of. Open an
issue if such case is found and there's no documentation on how to resolve it in that
cluster implementation.

View File

@ -1,5 +1,5 @@
/*!
* Copyright (c) 2022 by XGBoost Contributors
/**
* Copyright (c) 2022-2023, XGBoost Contributors
*/
#pragma once
@ -18,7 +18,11 @@
#include <utility> // std::swap
#if !defined(xgboost_IS_MINGW)
#define xgboost_IS_MINGW() defined(__MINGW32__)
#if defined(__MINGW32__)
#define xgboost_IS_MINGW 1
#endif // defined(__MINGW32__)
#endif // xgboost_IS_MINGW
#if defined(_WIN32)
@ -32,7 +36,7 @@ using in_port_t = std::uint16_t;
#pragma comment(lib, "Ws2_32.lib")
#endif // _MSC_VER
#if !xgboost_IS_MINGW()
#if !defined(xgboost_IS_MINGW)
using ssize_t = int;
#endif // !xgboost_IS_MINGW()
@ -62,10 +66,10 @@ using ssize_t = int;
namespace xgboost {
#if xgboost_IS_MINGW()
#if defined(xgboost_IS_MINGW)
// see the dummy implementation of `poll` in rabit for more info.
inline void MingWError() { LOG(FATAL) << "Distributed training on mingw is not supported."; }
#endif // xgboost_IS_MINGW()
#endif // defined(xgboost_IS_MINGW)
namespace system {
inline std::int32_t LastError() {
@ -144,7 +148,7 @@ inline void SocketFinalize() {
#endif // defined(_WIN32)
}
#if defined(_WIN32) && xgboost_IS_MINGW()
#if defined(_WIN32) && defined(xgboost_IS_MINGW)
// dummy definition for old mysys32.
inline const char *inet_ntop(int, const void *, char *, socklen_t) { // NOLINT
MingWError();
@ -152,7 +156,7 @@ inline const char *inet_ntop(int, const void *, char *, socklen_t) { // NOLINT
}
#else
using ::inet_ntop;
#endif
#endif // defined(_WIN32) && defined(xgboost_IS_MINGW)
} // namespace system
@ -296,8 +300,7 @@ class TCPSocket {
#else
struct sockaddr sa;
socklen_t sizeofsa = sizeof(sa);
xgboost_CHECK_SYS_CALL(
getsockname(handle_, &sa, &sizeofsa), 0);
xgboost_CHECK_SYS_CALL(getsockname(handle_, &sa, &sizeofsa), 0);
if (sizeofsa < sizeof(uchar_t) * 2) {
return ret_iafamily(AF_INET);
}
@ -508,7 +511,7 @@ class TCPSocket {
* \brief Create a TCP socket on specified domain.
*/
static TCPSocket Create(SockDomain domain) {
#if xgboost_IS_MINGW()
#if defined(xgboost_IS_MINGW)
MingWError();
return {};
#else
@ -522,7 +525,7 @@ class TCPSocket {
socket.domain_ = domain;
#endif // defined(__APPLE__)
return socket;
#endif // xgboost_IS_MINGW()
#endif // defined(xgboost_IS_MINGW)
}
};
@ -544,4 +547,7 @@ inline std::string GetHostName() {
} // namespace xgboost
#undef xgboost_CHECK_SYS_CALL
#if defined(xgboost_IS_MINGW)
#undef xgboost_IS_MINGW
#endif

View File

@ -23,13 +23,7 @@ from typing import (
import numpy
from . import collective
from .core import (
Booster,
DMatrix,
XGBoostError,
_get_booster_layer_trees,
_parse_eval_str,
)
from .core import Booster, DMatrix, XGBoostError, _parse_eval_str
__all__ = [
"TrainingCallback",
@ -177,22 +171,14 @@ class CallbackContainer:
assert isinstance(model, Booster), msg
if not self.is_cv:
num_parallel_tree, _ = _get_booster_layer_trees(model)
if model.attr("best_score") is not None:
model.best_score = float(cast(str, model.attr("best_score")))
model.best_iteration = int(cast(str, model.attr("best_iteration")))
# num_class is handled internally
model.set_attr(
best_ntree_limit=str((model.best_iteration + 1) * num_parallel_tree)
)
model.best_ntree_limit = int(cast(str, model.attr("best_ntree_limit")))
else:
# Due to compatibility with version older than 1.4, these attributes are
# added to Python object even if early stopping is not used.
model.best_iteration = model.num_boosted_rounds() - 1
model.set_attr(best_iteration=str(model.best_iteration))
model.best_ntree_limit = (model.best_iteration + 1) * num_parallel_tree
model.set_attr(best_ntree_limit=str(model.best_ntree_limit))
return model

View File

@ -94,9 +94,9 @@ def from_cstr_to_pystr(data: CStrPptr, length: c_bst_ulong) -> List[str]:
Parameters
----------
data : ctypes pointer
data :
pointer to data
length : ctypes pointer
length :
pointer to length of data
"""
res = []
@ -126,31 +126,12 @@ def _parse_eval_str(result: str) -> List[Tuple[str, float]]:
IterRange = TypeVar("IterRange", Optional[Tuple[int, int]], Tuple[int, int])
def _convert_ntree_limit(
booster: "Booster", ntree_limit: Optional[int], iteration_range: IterRange
) -> IterRange:
if ntree_limit is not None and ntree_limit != 0:
warnings.warn(
"ntree_limit is deprecated, use `iteration_range` or model "
"slicing instead.",
UserWarning,
)
if iteration_range is not None and iteration_range[1] != 0:
raise ValueError(
"Only one of `iteration_range` and `ntree_limit` can be non zero."
)
num_parallel_tree, _ = _get_booster_layer_trees(booster)
num_parallel_tree = max([num_parallel_tree, 1])
iteration_range = (0, ntree_limit // num_parallel_tree)
return iteration_range
def _expect(expectations: Sequence[Type], got: Type) -> str:
"""Translate input error into string.
Parameters
----------
expectations: sequence
expectations :
a list of expected value.
got :
actual input
@ -282,7 +263,7 @@ def _check_call(ret: int) -> None:
Parameters
----------
ret : int
ret :
return value from API calls
"""
if ret != 0:
@ -290,10 +271,10 @@ def _check_call(ret: int) -> None:
def build_info() -> dict:
"""Build information of XGBoost. The returned value format is not stable. Also, please
note that build time dependency is not the same as runtime dependency. For instance,
it's possible to build XGBoost with older CUDA version but run it with the lastest
one.
"""Build information of XGBoost. The returned value format is not stable. Also,
please note that build time dependency is not the same as runtime dependency. For
instance, it's possible to build XGBoost with older CUDA version but run it with the
lastest one.
.. versionadded:: 1.6.0
@ -677,28 +658,28 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
data :
Data source of DMatrix. See :ref:`py-data` for a list of supported input
types.
label : array_like
label :
Label of the training data.
weight : array_like
weight :
Weight for each instance.
.. note:: For ranking task, weights are per-group.
.. note::
In ranking task, one weight is assigned to each group (not each
data point). This is because we only care about the relative
ordering of data points within each group, so it doesn't make
sense to assign weights to individual data points.
For ranking task, weights are per-group. In ranking task, one weight
is assigned to each group (not each data point). This is because we
only care about the relative ordering of data points within each group,
so it doesn't make sense to assign weights to individual data points.
base_margin: array_like
base_margin :
Base margin used for boosting from existing model.
missing : float, optional
Value in the input data which needs to be present as a missing
value. If None, defaults to np.nan.
silent : boolean, optional
missing :
Value in the input data which needs to be present as a missing value. If
None, defaults to np.nan.
silent :
Whether print messages during construction
feature_names : list, optional
feature_names :
Set names for features.
feature_types : FeatureTypes
feature_types :
Set types for features. When `enable_categorical` is set to `True`, string
"c" represents categorical data type while "q" represents numerical feature
@ -708,20 +689,20 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
`.cat.codes` method. This is useful when users want to specify categorical
features without having to construct a dataframe as input.
nthread : integer, optional
nthread :
Number of threads to use for loading data when parallelization is
applicable. If -1, uses maximum threads available on the system.
group : array_like
group :
Group size for all ranking group.
qid : array_like
qid :
Query ID for data samples, used for ranking.
label_lower_bound : array_like
label_lower_bound :
Lower bound for survival training.
label_upper_bound : array_like
label_upper_bound :
Upper bound for survival training.
feature_weights : array_like, optional
feature_weights :
Set feature weights for column sampling.
enable_categorical: boolean, optional
enable_categorical :
.. versionadded:: 1.3.0
@ -1508,41 +1489,6 @@ Objective = Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]]
Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]]
def _get_booster_layer_trees(model: "Booster") -> Tuple[int, int]:
"""Get number of trees added to booster per-iteration. This function will be removed
once `best_ntree_limit` is dropped in favor of `best_iteration`. Returns
`num_parallel_tree` and `num_groups`.
"""
config = json.loads(model.save_config())
booster = config["learner"]["gradient_booster"]["name"]
if booster == "gblinear":
num_parallel_tree = 0
elif booster == "dart":
num_parallel_tree = int(
config["learner"]["gradient_booster"]["gbtree"]["gbtree_model_param"][
"num_parallel_tree"
]
)
elif booster == "gbtree":
try:
num_parallel_tree = int(
config["learner"]["gradient_booster"]["gbtree_model_param"][
"num_parallel_tree"
]
)
except KeyError:
num_parallel_tree = int(
config["learner"]["gradient_booster"]["gbtree_train_param"][
"num_parallel_tree"
]
)
else:
raise ValueError(f"Unknown booster: {booster}")
num_groups = int(config["learner"]["learner_model_param"]["num_class"])
return num_parallel_tree, num_groups
def _configure_metrics(params: BoosterParam) -> BoosterParam:
if (
isinstance(params, dict)
@ -1576,11 +1522,11 @@ class Booster:
"""
Parameters
----------
params : dict
params :
Parameters for boosters.
cache : list
cache :
List of cache items.
model_file : string/os.PathLike/Booster/bytearray
model_file :
Path to the model file if it's string or PathLike.
"""
cache = cache if cache is not None else []
@ -1766,6 +1712,7 @@ class Booster:
string.
.. versionadded:: 1.0.0
"""
json_string = ctypes.c_char_p()
length = c_bst_ulong()
@ -1798,8 +1745,8 @@ class Booster:
Returns
-------
booster: `Booster`
a copied booster model
booster :
A copied booster model
"""
return copy.copy(self)
@ -1808,12 +1755,12 @@ class Booster:
Parameters
----------
key : str
key :
The key to get attribute from.
Returns
-------
value : str
value :
The attribute value of the key, returns None if attribute do not exist.
"""
ret = ctypes.c_char_p()
@ -1932,9 +1879,9 @@ class Booster:
Parameters
----------
params: dict/list/str
params :
list of key,value pairs, dict of key to value or simply str key
value: optional
value :
value of the specified parameter, when params is str key
"""
if isinstance(params, Mapping):
@ -1957,11 +1904,11 @@ class Booster:
Parameters
----------
dtrain : DMatrix
dtrain :
Training data.
iteration : int
iteration :
Current iteration number.
fobj : function
fobj :
Customized objective function.
"""
@ -2100,7 +2047,6 @@ class Booster:
self,
data: DMatrix,
output_margin: bool = False,
ntree_limit: int = 0,
pred_leaf: bool = False,
pred_contribs: bool = False,
approx_contribs: bool = False,
@ -2127,9 +2073,6 @@ class Booster:
output_margin :
Whether to output the raw untransformed margin value.
ntree_limit :
Deprecated, use `iteration_range` instead.
pred_leaf :
When this option is on, the output will be a matrix of (nsample,
ntrees) with each record indicating the predicted leaf index of
@ -2196,7 +2139,6 @@ class Booster:
raise TypeError("Expecting data to be a DMatrix object, got: ", type(data))
if validate_features:
self._validate_dmatrix_features(data)
iteration_range = _convert_ntree_limit(self, ntree_limit, iteration_range)
args = {
"type": 0,
"training": training,
@ -2264,8 +2206,7 @@ class Booster:
Parameters
----------
data : numpy.ndarray/scipy.sparse.csr_matrix/cupy.ndarray/
cudf.DataFrame/pd.DataFrame
data :
The input data, must not be a view for numpy array. Set
``predictor`` to ``gpu_predictor`` for running prediction on CuPy
array or CuDF DataFrame.
@ -2449,7 +2390,7 @@ class Booster:
Parameters
----------
fname : string or os.PathLike
fname :
Output file name
"""
@ -2522,8 +2463,6 @@ class Booster:
self.best_iteration = int(self.attr("best_iteration")) # type: ignore
if self.attr("best_score") is not None:
self.best_score = float(self.attr("best_score")) # type: ignore
if self.attr("best_ntree_limit") is not None:
self.best_ntree_limit = int(self.attr("best_ntree_limit")) # type: ignore
def num_boosted_rounds(self) -> int:
"""Get number of boosted rounds. For gblinear this is reset to 0 after
@ -2555,13 +2494,13 @@ class Booster:
Parameters
----------
fout : string or os.PathLike
fout :
Output file name.
fmap : string or os.PathLike, optional
fmap :
Name of the file containing feature map names.
with_stats : bool, optional
with_stats :
Controls whether the split statistics are output.
dump_format : string, optional
dump_format :
Format of model dump file. Can be 'text' or 'json'.
"""
if isinstance(fout, (str, os.PathLike)):
@ -2716,7 +2655,7 @@ class Booster:
Parameters
----------
fmap: str or os.PathLike (optional)
fmap :
The name of feature map file.
"""
# pylint: disable=too-many-locals
@ -2882,15 +2821,15 @@ class Booster:
Parameters
----------
feature: str
feature :
The name of the feature.
fmap: str or os.PathLike (optional)
fmap:
The name of feature map file.
bin: int, default None
bin :
The maximum number of bins.
Number of bins equals number of unique split values n_unique,
if bins == None or bins > n_unique.
as_pandas: bool, default True
as_pandas :
Return pd.DataFrame when pandas is installed.
If False or pandas is not installed, return numpy ndarray.

View File

@ -1653,14 +1653,11 @@ class DaskScikitLearnBase(XGBModel):
self,
X: _DataT,
output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[_DaskCollection] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> Any:
_assert_dask_support()
msg = "`ntree_limit` is not supported on dask, use `iteration_range` instead."
assert ntree_limit is None, msg
return self.client.sync(
self._predict_async,
X,
@ -1694,12 +1691,9 @@ class DaskScikitLearnBase(XGBModel):
def apply(
self,
X: _DataT,
ntree_limit: Optional[int] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> Any:
_assert_dask_support()
msg = "`ntree_limit` is not supported on dask, use `iteration_range` instead."
assert ntree_limit is None, msg
return self.client.sync(self._apply_async, X, iteration_range=iteration_range)
def __await__(self) -> Awaitable[Any]:
@ -1993,14 +1987,11 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa
def predict_proba(
self,
X: _DaskCollection,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[_DaskCollection] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> Any:
_assert_dask_support()
msg = "`ntree_limit` is not supported on dask, use `iteration_range` instead."
assert ntree_limit is None, msg
return self._client_sync(
self._predict_proba_async,
X=X,

View File

@ -1,10 +1,9 @@
# pylint: disable=too-many-locals, too-many-arguments, invalid-name,
# pylint: disable=too-many-branches
# coding: utf-8
"""Plotting Library."""
import json
from io import BytesIO
from typing import Any, Optional
from typing import Any, Optional, Union
import numpy as np
@ -17,7 +16,7 @@ GraphvizSource = Any # real type is graphviz.Source
def plot_importance(
booster: Booster,
booster: Union[XGBModel, Booster, dict],
ax: Optional[Axes] = None,
height: float = 0.2,
xlim: Optional[tuple] = None,
@ -37,40 +36,42 @@ def plot_importance(
Parameters
----------
booster : Booster, XGBModel or dict
booster :
Booster or XGBModel instance, or dict taken by Booster.get_fscore()
ax : matplotlib Axes, default None
ax : matplotlib Axes
Target axes instance. If None, new figure and axes will be created.
grid : bool, Turn the axes grids on or off. Default is True (On).
importance_type : str, default "weight"
grid :
Turn the axes grids on or off. Default is True (On).
importance_type :
How the importance is calculated: either "weight", "gain", or "cover"
* "weight" is the number of times a feature appears in a tree
* "gain" is the average gain of splits which use the feature
* "cover" is the average coverage of splits which use the feature
where coverage is defined as the number of samples affected by the split
max_num_features : int, default None
Maximum number of top features displayed on plot. If None, all features will be displayed.
height : float, default 0.2
max_num_features :
Maximum number of top features displayed on plot. If None, all features will be
displayed.
height :
Bar height, passed to ax.barh()
xlim : tuple, default None
xlim :
Tuple passed to axes.xlim()
ylim : tuple, default None
ylim :
Tuple passed to axes.ylim()
title : str, default "Feature importance"
title :
Axes title. To disable, pass None.
xlabel : str, default "F score"
xlabel :
X axis title label. To disable, pass None.
ylabel : str, default "Features"
ylabel :
Y axis title label. To disable, pass None.
fmap: str or os.PathLike (optional)
fmap :
The name of feature map file.
show_values : bool, default True
show_values :
Show values on plot. To disable, pass False.
values_format : str, default "{v}"
Format string for values. "v" will be replaced by the value of the feature importance.
e.g. Pass "{v:.2f}" in order to limit the number of digits after the decimal point
to two, for each value printed on the graph.
values_format :
Format string for values. "v" will be replaced by the value of the feature
importance. e.g. Pass "{v:.2f}" in order to limit the number of digits after
the decimal point to two, for each value printed on the graph.
kwargs :
Other keywords passed to ax.barh()
@ -146,7 +147,7 @@ def plot_importance(
def to_graphviz(
booster: Booster,
booster: Union[Booster, XGBModel],
fmap: PathLike = "",
num_trees: int = 0,
rankdir: Optional[str] = None,
@ -162,19 +163,19 @@ def to_graphviz(
Parameters
----------
booster : Booster, XGBModel
booster :
Booster or XGBModel instance
fmap: str (optional)
fmap :
The name of feature map file
num_trees : int, default 0
num_trees :
Specify the ordinal number of target tree
rankdir : str, default "UT"
rankdir :
Passed to graphviz via graph_attr
yes_color : str, default '#0000FF'
yes_color :
Edge color when meets the node condition.
no_color : str, default '#FF0000'
no_color :
Edge color when doesn't meet the node condition.
condition_node_params : dict, optional
condition_node_params :
Condition node configuration for for graphviz. Example:
.. code-block:: python
@ -183,7 +184,7 @@ def to_graphviz(
'style': 'filled,rounded',
'fillcolor': '#78bceb'}
leaf_node_params : dict, optional
leaf_node_params :
Leaf node configuration for graphviz. Example:
.. code-block:: python
@ -192,7 +193,7 @@ def to_graphviz(
'style': 'filled',
'fillcolor': '#e48038'}
\\*\\*kwargs: dict, optional
kwargs :
Other keywords passed to graphviz graph_attr, e.g. ``graph [ {key} = {value} ]``
Returns

View File

@ -36,7 +36,6 @@ from .core import (
Objective,
QuantileDMatrix,
XGBoostError,
_convert_ntree_limit,
_deprecate_positional_args,
_parse_eval_str,
)
@ -391,8 +390,7 @@ __model_doc = f"""
metric will be used for early stopping.
- If early stopping occurs, the model will have three additional fields:
:py:attr:`best_score`, :py:attr:`best_iteration` and
:py:attr:`best_ntree_limit`.
:py:attr:`best_score`, :py:attr:`best_iteration`.
.. note::
@ -1014,9 +1012,9 @@ class XGBModel(XGBModelBase):
verbose :
If `verbose` is True and an evaluation set is used, the evaluation metric
measured on the validation set is printed to stdout at each boosting stage.
If `verbose` is an integer, the evaluation metric is printed at each `verbose`
boosting stage. The last boosting stage / the boosting stage found by using
`early_stopping_rounds` is also printed.
If `verbose` is an integer, the evaluation metric is printed at each
`verbose` boosting stage. The last boosting stage / the boosting stage found
by using `early_stopping_rounds` is also printed.
xgb_model :
file name of stored XGBoost model or 'Booster' instance XGBoost model to be
loaded before training (allows training continuation).
@ -1117,7 +1115,6 @@ class XGBModel(XGBModelBase):
self,
X: ArrayLike,
output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
@ -1135,8 +1132,6 @@ class XGBModel(XGBModelBase):
Data to predict with.
output_margin :
Whether to output the raw untransformed margin value.
ntree_limit :
Deprecated, use `iteration_range` instead.
validate_features :
When this is True, validate that the Booster's and data's feature_names are
identical. Otherwise, it is assumed that the feature_names are the same.
@ -1156,9 +1151,6 @@ class XGBModel(XGBModelBase):
"""
with config_context(verbosity=self.verbosity):
iteration_range = _convert_ntree_limit(
self.get_booster(), ntree_limit, iteration_range
)
iteration_range = self._get_iteration_range(iteration_range)
if self._can_use_inplace_predict():
try:
@ -1197,7 +1189,6 @@ class XGBModel(XGBModelBase):
def apply(
self,
X: ArrayLike,
ntree_limit: int = 0,
iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
"""Return the predicted leaf every tree for each sample. If the model is trained
@ -1211,9 +1202,6 @@ class XGBModel(XGBModelBase):
iteration_range :
See :py:meth:`predict`.
ntree_limit :
Deprecated, use ``iteration_range`` instead.
Returns
-------
X_leaves : array_like, shape=[n_samples, n_trees]
@ -1223,9 +1211,6 @@ class XGBModel(XGBModelBase):
"""
with config_context(verbosity=self.verbosity):
iteration_range = _convert_ntree_limit(
self.get_booster(), ntree_limit, iteration_range
)
iteration_range = self._get_iteration_range(iteration_range)
test_dmatrix = DMatrix(
X,
@ -1309,10 +1294,6 @@ class XGBModel(XGBModelBase):
"""
return int(self._early_stopping_attr("best_iteration"))
@property
def best_ntree_limit(self) -> int:
return int(self._early_stopping_attr("best_ntree_limit"))
@property
def feature_importances_(self) -> np.ndarray:
"""Feature importances property, return depends on `importance_type`
@ -1562,7 +1543,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
self,
X: ArrayLike,
output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
@ -1571,7 +1551,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
class_probs = super().predict(
X=X,
output_margin=output_margin,
ntree_limit=ntree_limit,
validate_features=validate_features,
base_margin=base_margin,
iteration_range=iteration_range,
@ -1599,7 +1578,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
def predict_proba(
self,
X: ArrayLike,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
@ -1612,14 +1590,12 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
Parameters
----------
X : array_like
X :
Feature matrix. See :ref:`py-data` for a list of supported types.
ntree_limit : int
Deprecated, use `iteration_range` instead.
validate_features : bool
validate_features :
When this is True, validate that the Booster's and data's feature_names are
identical. Otherwise, it is assumed that the feature_names are the same.
base_margin : array_like
base_margin :
Margin added to prediction.
iteration_range :
Specifies which layer of trees are used in prediction. For example, if a
@ -1642,7 +1618,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
if self.objective == "multi:softmax":
raw_predt = super().predict(
X=X,
ntree_limit=ntree_limit,
validate_features=validate_features,
base_margin=base_margin,
iteration_range=iteration_range,
@ -1652,7 +1627,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
return class_prob
class_probs = super().predict(
X=X,
ntree_limit=ntree_limit,
validate_features=validate_features,
base_margin=base_margin,
iteration_range=iteration_range,
@ -1990,9 +1964,9 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
verbose :
If `verbose` is True and an evaluation set is used, the evaluation metric
measured on the validation set is printed to stdout at each boosting stage.
If `verbose` is an integer, the evaluation metric is printed at each `verbose`
boosting stage. The last boosting stage / the boosting stage found by using
`early_stopping_rounds` is also printed.
If `verbose` is an integer, the evaluation metric is printed at each
`verbose` boosting stage. The last boosting stage / the boosting stage found
by using `early_stopping_rounds` is also printed.
xgb_model :
file name of stored XGBoost model or 'Booster' instance XGBoost model to be
loaded before training (allows training continuation).
@ -2074,7 +2048,6 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
self,
X: ArrayLike,
output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
@ -2083,20 +2056,18 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
return super().predict(
X,
output_margin,
ntree_limit,
validate_features,
base_margin,
iteration_range,
iteration_range=iteration_range,
)
def apply(
self,
X: ArrayLike,
ntree_limit: int = 0,
iteration_range: Optional[Tuple[int, int]] = None,
) -> ArrayLike:
X, _ = _get_qid(X, None)
return super().apply(X, ntree_limit, iteration_range)
return super().apply(X, iteration_range)
def score(self, X: ArrayLike, y: ArrayLike) -> float:
"""Evaluate score for data using the last evaluation metric. If the model is

View File

@ -11,7 +11,6 @@ from xgboost import DataIter, DMatrix, QuantileDMatrix, XGBModel
from xgboost.compat import concat
from .._typing import ArrayLike
from ..core import _convert_ntree_limit
from .utils import get_logger # type: ignore
@ -343,8 +342,7 @@ def pred_contribs(
strict_shape: bool = False,
) -> np.ndarray:
"""Predict contributions with data with the full model."""
iteration_range = _convert_ntree_limit(model.get_booster(), None, None)
iteration_range = model._get_iteration_range(iteration_range)
iteration_range = model._get_iteration_range(None)
data_dmatrix = DMatrix(
data,
base_margin=base_margin,

View File

@ -95,7 +95,7 @@ def train(
feval :
.. deprecated:: 1.6.0
Use `custom_metric` instead.
maximize : bool
maximize :
Whether to maximize feval.
early_stopping_rounds :
Activates early stopping. Validation metric needs to improve at least once in

View File

@ -123,7 +123,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
DMLC_DECLARE_PARAMETER(LambdaRankParam) {
DMLC_DECLARE_FIELD(lambdarank_pair_method)
.set_default(PairMethod::kMean)
.set_default(PairMethod::kTopK)
.add_enum("mean", PairMethod::kMean)
.add_enum("topk", PairMethod::kTopK)
.describe("Method for constructing pairs.");

View File

@ -112,7 +112,6 @@ class PerGroupWeightPolicy {
return info.GetWeight(group_id);
}
};
} // anonymous namespace
namespace xgboost::metric {

View File

@ -85,7 +85,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
size_t n_leaf = nidx.size();
if (nptr.empty()) {
std::vector<float> quantiles;
UpdateLeafValues(&quantiles, nidx, learning_rate, p_tree);
UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree);
return;
}
@ -99,6 +99,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
auto h_predt = linalg::MakeTensorView(ctx, predt.ConstHostSpan(), info.num_row_,
predt.Size() / info.num_row_);
if (!info.IsVerticalFederated() || collective::GetRank() == 0) {
// loop over each leaf
common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
auto nidx = h_node_idx[k];
@ -130,8 +131,14 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
}
quantiles.at(k) = q;
});
}
UpdateLeafValues(&quantiles, nidx, learning_rate, p_tree);
if (info.IsVerticalFederated()) {
collective::Broadcast(static_cast<void*>(quantiles.data()), quantiles.size() * sizeof(float),
0);
}
UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree);
}
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

View File

@ -185,7 +185,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
if (nptr.Empty()) {
std::vector<float> quantiles;
UpdateLeafValues(&quantiles, nidx.ConstHostVector(), learning_rate, p_tree);
UpdateLeafValues(&quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree);
}
HostDeviceVector<float> quantiles;
@ -220,7 +220,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
w_it + d_weights.size(), &quantiles);
}
UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), learning_rate, p_tree);
UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, p_tree);
}
} // namespace detail
} // namespace obj

View File

@ -36,13 +36,15 @@ inline void FillMissingLeaf(std::vector<bst_node_t> const& maybe_missing,
}
inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_node_t> const& nidx,
float learning_rate, RegTree* p_tree) {
MetaInfo const& info, float learning_rate, RegTree* p_tree) {
auto& tree = *p_tree;
auto& quantiles = *p_quantiles;
auto const& h_node_idx = nidx;
size_t n_leaf{h_node_idx.size()};
if (info.IsRowSplit()) {
collective::Allreduce<collective::Operation::kMax>(&n_leaf, 1);
}
CHECK(quantiles.empty() || quantiles.size() == n_leaf);
if (quantiles.empty()) {
quantiles.resize(n_leaf, std::numeric_limits<float>::quiet_NaN());
@ -52,12 +54,16 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
std::vector<int32_t> n_valids(quantiles.size());
std::transform(quantiles.cbegin(), quantiles.cend(), n_valids.begin(),
[](float q) { return static_cast<int32_t>(!std::isnan(q)); });
if (info.IsRowSplit()) {
collective::Allreduce<collective::Operation::kSum>(n_valids.data(), n_valids.size());
}
// convert to 0 for all reduce
std::replace_if(
quantiles.begin(), quantiles.end(), [](float q) { return std::isnan(q); }, 0.f);
// use the mean value
if (info.IsRowSplit()) {
collective::Allreduce<collective::Operation::kSum>(quantiles.data(), quantiles.size());
}
for (size_t i = 0; i < n_leaf; ++i) {
if (n_valids[i] > 0) {
quantiles[i] /= static_cast<float>(n_valids[i]);

View File

@ -14,8 +14,7 @@
#include "xgboost/linalg.h" // Tensor,Vector
#include "xgboost/task.h" // ObjInfo
namespace xgboost {
namespace obj {
namespace xgboost::obj {
void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const {
if (this->Task().task == ObjInfo::kRegression) {
CheckInitInputs(info);
@ -31,14 +30,13 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
ObjFunction::Create(get<String const>(config["name"]), this->ctx_)};
new_obj->LoadConfig(config);
new_obj->GetGradient(dummy_predt, info, 0, &gpair);
bst_target_t n_targets = this->Targets(info);
linalg::Vector<float> leaf_weight;
tree::FitStump(this->ctx_, info, gpair, n_targets, &leaf_weight);
// workaround, we don't support multi-target due to binary model serialization for
// base margin.
common::Mean(this->ctx_, leaf_weight, base_score);
this->PredTransform(base_score->Data());
}
} // namespace obj
} // namespace xgboost
} // namespace xgboost::obj

View File

@ -7,8 +7,7 @@
#include "xgboost/linalg.h" // Tensor
#include "xgboost/objective.h" // ObjFunction
namespace xgboost {
namespace obj {
namespace xgboost::obj {
class FitIntercept : public ObjFunction {
void InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const override;
};
@ -20,6 +19,5 @@ inline void CheckInitInputs(MetaInfo const& info) {
<< "Number of weights should be equal to number of data points.";
}
}
} // namespace obj
} // namespace xgboost
} // namespace xgboost::obj
#endif // XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_

View File

@ -0,0 +1,62 @@
/**
* Copyright 2015-2023 by XGBoost contributors
*
* \brief CUDA implementation of lambdarank.
*/
#include <thrust/fill.h> // for fill_n
#include <thrust/for_each.h> // for for_each_n
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/iterator/zip_iterator.h> // for make_zip_iterator
#include <thrust/tuple.h> // for make_tuple, tuple, tie, get
#include <algorithm> // for min
#include <cassert> // for assert
#include <cmath> // for abs, log2, isinf
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <memory> // for shared_ptr
#include <utility>
#include "../common/algorithm.cuh" // for SegmentedArgSort
#include "../common/cuda_context.cuh" // for CUDAContext
#include "../common/deterministic.cuh" // for CreateRoundingFactor, TruncateWithRounding
#include "../common/device_helpers.cuh" // for SegmentId, TemporaryArray, AtomicAddGpair
#include "../common/optional_weight.h" // for MakeOptionalWeights
#include "../common/ranking_utils.h" // for NDCGCache, LambdaRankParam, rel_degree_t
#include "lambdarank_obj.cuh"
#include "lambdarank_obj.h"
#include "xgboost/base.h" // for bst_group_t, XGBOOST_DEVICE, GradientPair
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for VectorView, Range, Vector
#include "xgboost/logging.h"
#include "xgboost/span.h" // for Span
namespace xgboost::obj {
DMLC_REGISTRY_FILE_TAG(lambdarank_obj_cu);
namespace cuda_impl {
common::Span<std::size_t const> SortY(Context const* ctx, MetaInfo const& info,
common::Span<std::size_t const> d_rank,
std::shared_ptr<ltr::RankingCache> p_cache) {
auto const d_group_ptr = p_cache->DataGroupPtr(ctx);
auto label = info.labels.View(ctx->gpu_id);
// The buffer for ranked y is necessary as cub segmented sort accepts only pointer.
auto d_y_ranked = p_cache->RankedY(ctx, info.num_row_);
thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), d_y_ranked.size(),
[=] XGBOOST_DEVICE(std::size_t i) {
auto g = dh::SegmentId(d_group_ptr, i);
auto g_label =
label.Slice(linalg::Range(d_group_ptr[g], d_group_ptr[g + 1]), 0);
auto g_rank_idx = d_rank.subspan(d_group_ptr[g], g_label.Size());
i -= d_group_ptr[g];
auto g_y_ranked = d_y_ranked.subspan(d_group_ptr[g], g_label.Size());
g_y_ranked[i] = g_label(g_rank_idx[i]);
});
auto d_y_sorted_idx = p_cache->SortedIdxY(ctx, info.num_row_);
common::SegmentedArgSort<false, true>(ctx, d_y_ranked, d_group_ptr, d_y_sorted_idx);
return d_y_sorted_idx;
}
} // namespace cuda_impl
} // namespace xgboost::obj

View File

@ -0,0 +1,172 @@
/**
* Copyright 2023 XGBoost contributors
*/
#ifndef XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_CUH_
#define XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_CUH_
#include <thrust/binary_search.h> // for lower_bound, upper_bound
#include <thrust/functional.h> // for greater
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/random/linear_congruential_engine.h> // for minstd_rand
#include <thrust/random/uniform_int_distribution.h> // for uniform_int_distribution
#include <cassert> // for cassert
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <tuple> // for make_tuple, tuple
#include "../common/device_helpers.cuh" // for MakeTransformIterator
#include "../common/ranking_utils.cuh" // for PairsForGroup
#include "../common/ranking_utils.h" // for RankingCache
#include "../common/threading_utils.cuh" // for UnravelTrapeziodIdx
#include "xgboost/base.h" // for bst_group_t, GradientPair, XGBOOST_DEVICE
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/linalg.h" // for VectorView, Range, UnravelIndex
#include "xgboost/span.h" // for Span
namespace xgboost::obj::cuda_impl {
/**
* \brief Find number of elements left to the label bucket
*/
template <typename It, typename T = typename std::iterator_traits<It>::value_type>
XGBOOST_DEVICE __forceinline__ std::size_t CountNumItemsToTheLeftOf(It items, std::size_t n, T v) {
return thrust::lower_bound(thrust::seq, items, items + n, v, thrust::greater<T>{}) - items;
}
/**
* \brief Find number of elements right to the label bucket
*/
template <typename It, typename T = typename std::iterator_traits<It>::value_type>
XGBOOST_DEVICE __forceinline__ std::size_t CountNumItemsToTheRightOf(It items, std::size_t n, T v) {
return n - (thrust::upper_bound(thrust::seq, items, items + n, v, thrust::greater<T>{}) - items);
}
/**
* \brief Sort labels according to rank list for making pairs.
*/
common::Span<std::size_t const> SortY(Context const *ctx, MetaInfo const &info,
common::Span<std::size_t const> d_rank,
std::shared_ptr<ltr::RankingCache> p_cache);
/**
* \brief Parameters needed for calculating gradient
*/
struct KernelInputs {
linalg::VectorView<double const> ti_plus; // input bias ratio
linalg::VectorView<double const> tj_minus; // input bias ratio
linalg::VectorView<double> li;
linalg::VectorView<double> lj;
common::Span<bst_group_t const> d_group_ptr;
common::Span<std::size_t const> d_threads_group_ptr;
common::Span<std::size_t const> d_sorted_idx;
linalg::MatrixView<float const> labels;
common::Span<float const> predts;
common::Span<GradientPair> gpairs;
linalg::VectorView<GradientPair const> d_roundings;
double const *d_cost_rounding;
common::Span<std::size_t const> d_y_sorted_idx;
std::int32_t iter;
};
/**
* \brief Functor for generating pairs
*/
template <bool has_truncation>
struct MakePairsOp {
KernelInputs args;
/**
* \brief Make pair for the topk pair method.
*/
XGBOOST_DEVICE std::tuple<std::size_t, std::size_t> WithTruncation(std::size_t idx,
bst_group_t g) const {
auto thread_group_begin = args.d_threads_group_ptr[g];
auto idx_in_thread_group = idx - thread_group_begin;
auto data_group_begin = static_cast<std::size_t>(args.d_group_ptr[g]);
std::size_t n_data = args.d_group_ptr[g + 1] - data_group_begin;
// obtain group segment data.
auto g_label = args.labels.Slice(linalg::Range(data_group_begin, data_group_begin + n_data), 0);
auto g_sorted_idx = args.d_sorted_idx.subspan(data_group_begin, n_data);
std::size_t i = 0, j = 0;
common::UnravelTrapeziodIdx(idx_in_thread_group, n_data, &i, &j);
std::size_t rank_high = i, rank_low = j;
return std::make_tuple(rank_high, rank_low);
}
/**
* \brief Make pair for the mean pair method
*/
XGBOOST_DEVICE std::tuple<std::size_t, std::size_t> WithSampling(std::size_t idx,
bst_group_t g) const {
std::size_t n_samples = args.labels.Size();
assert(n_samples == args.predts.size());
// Constructed from ranking cache.
std::size_t n_pairs =
ltr::cuda_impl::PairsForGroup(args.d_threads_group_ptr[g + 1] - args.d_threads_group_ptr[g],
args.d_group_ptr[g + 1] - args.d_group_ptr[g]);
assert(n_pairs > 0);
auto [sample_idx, sample_pair_idx] = linalg::UnravelIndex(idx, {n_samples, n_pairs});
auto g_begin = static_cast<std::size_t>(args.d_group_ptr[g]);
std::size_t n_data = args.d_group_ptr[g + 1] - g_begin;
auto g_label = args.labels.Slice(linalg::Range(g_begin, g_begin + n_data));
auto g_rank_idx = args.d_sorted_idx.subspan(args.d_group_ptr[g], n_data);
auto g_y_sorted_idx = args.d_y_sorted_idx.subspan(g_begin, n_data);
std::size_t const i = sample_idx - g_begin;
assert(sample_pair_idx < n_samples);
assert(i <= sample_idx);
auto g_sorted_label = dh::MakeTransformIterator<float>(
thrust::make_counting_iterator(0ul),
[&](std::size_t i) { return g_label(g_rank_idx[g_y_sorted_idx[i]]); });
// Are the labels diverse enough? If they are all the same, then there is nothing to pick
// from another group - bail sooner
if (g_label.Size() == 0 || g_sorted_label[0] == g_sorted_label[n_data - 1]) {
auto z = static_cast<std::size_t>(0ul);
return std::make_tuple(z, z);
}
std::size_t n_lefts = CountNumItemsToTheLeftOf(g_sorted_label, i + 1, g_sorted_label[i]);
std::size_t n_rights =
CountNumItemsToTheRightOf(g_sorted_label + i, n_data - i, g_sorted_label[i]);
// The index pointing to the first element of the next bucket
std::size_t right_bound = n_data - n_rights;
thrust::minstd_rand rng(args.iter);
auto pair_idx = i;
rng.discard(sample_pair_idx * n_data + g + pair_idx); // fixme
thrust::uniform_int_distribution<std::size_t> dist(0, n_lefts + n_rights - 1);
auto ridx = dist(rng);
SPAN_CHECK(ridx < n_lefts + n_rights);
if (ridx >= n_lefts) {
ridx = ridx - n_lefts + right_bound; // fixme
}
auto idx0 = g_y_sorted_idx[pair_idx];
auto idx1 = g_y_sorted_idx[ridx];
return std::make_tuple(idx0, idx1);
}
/**
* \brief Generate a single pair.
*
* \param idx Pair index (CUDA thread index).
* \param g Query group index.
*/
XGBOOST_DEVICE auto operator()(std::size_t idx, bst_group_t g) const {
if (has_truncation) {
return this->WithTruncation(idx, g);
} else {
return this->WithSampling(idx, g);
}
}
};
} // namespace xgboost::obj::cuda_impl
#endif // XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_CUH_

View File

@ -0,0 +1,260 @@
/**
* Copyright 2023 XGBoost contributors
*/
#ifndef XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_H_
#define XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_H_
#include <algorithm> // for min, max
#include <cassert> // for assert
#include <cmath> // for log, abs
#include <cstddef> // for size_t
#include <functional> // for greater
#include <memory> // for shared_ptr
#include <random> // for minstd_rand, uniform_int_distribution
#include <vector> // for vector
#include "../common/algorithm.h" // for ArgSort
#include "../common/math.h" // for Sigmoid
#include "../common/ranking_utils.h" // for CalcDCGGain
#include "../common/transform_iterator.h" // for MakeIndexTransformIter
#include "xgboost/base.h" // for GradientPair, XGBOOST_DEVICE, kRtEps
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for VectorView, Vector
#include "xgboost/logging.h" // for CHECK_EQ
#include "xgboost/span.h" // for Span
namespace xgboost::obj {
template <bool exp>
XGBOOST_DEVICE double DeltaNDCG(float y_high, float y_low, std::size_t r_high, std::size_t r_low,
double inv_IDCG, common::Span<double const> discount) {
double gain_high = exp ? ltr::CalcDCGGain(y_high) : y_high;
double discount_high = discount[r_high];
double gain_low = exp ? ltr::CalcDCGGain(y_low) : y_low;
double discount_low = discount[r_low];
double original = gain_high * discount_high + gain_low * discount_low;
double changed = gain_low * discount_high + gain_high * discount_low;
double delta_NDCG = (original - changed) * inv_IDCG;
assert(delta_NDCG >= -1.0);
assert(delta_NDCG <= 1.0);
return delta_NDCG;
}
XGBOOST_DEVICE inline double DeltaMAP(float y_high, float y_low, std::size_t rank_high,
std::size_t rank_low, common::Span<double const> n_rel,
common::Span<double const> acc) {
double r_h = static_cast<double>(rank_high) + 1.0;
double r_l = static_cast<double>(rank_low) + 1.0;
double delta{0.0};
double n_total_relevances = n_rel.back();
assert(n_total_relevances > 0.0);
auto m = n_rel[rank_low];
double n = n_rel[rank_high];
if (y_high < y_low) {
auto a = m / r_l - (n + 1.0) / r_h;
auto b = acc[rank_low - 1] - acc[rank_high];
delta = (a - b) / n_total_relevances;
} else {
auto a = n / r_h - m / r_l;
auto b = acc[rank_low - 1] - acc[rank_high];
delta = (a + b) / n_total_relevances;
}
return delta;
}
template <bool unbiased, typename Delta>
XGBOOST_DEVICE GradientPair
LambdaGrad(linalg::VectorView<float const> labels, common::Span<float const> predts,
common::Span<size_t const> sorted_idx,
std::size_t rank_high, // cordiniate
std::size_t rank_low, // cordiniate
Delta delta, // delta score
linalg::VectorView<double const> t_plus, // input bias ratio
linalg::VectorView<double const> t_minus, // input bias ratio
double* p_cost) {
assert(sorted_idx.size() > 0 && "Empty sorted idx for a group.");
std::size_t idx_high = sorted_idx[rank_high];
std::size_t idx_low = sorted_idx[rank_low];
if (labels(idx_high) == labels(idx_low)) {
*p_cost = 0;
return {0.0f, 0.0f};
}
auto best_score = predts[sorted_idx.front()];
auto worst_score = predts[sorted_idx.back()];
auto y_high = labels(idx_high);
float s_high = predts[idx_high];
auto y_low = labels(idx_low);
float s_low = predts[idx_low];
// Use double whenever possible as we are working on the exp space.
double delta_score = std::abs(s_high - s_low);
double sigmoid = common::Sigmoid(s_high - s_low);
// Change in metric score like \delta NDCG or \delta MAP
double delta_metric = std::abs(delta(y_high, y_low, rank_high, rank_low));
if (best_score != worst_score) {
delta_metric /= (delta_score + kRtEps);
}
if (unbiased) {
*p_cost = std::log(1.0 / (1.0 - sigmoid)) * delta_metric;
}
constexpr double kEps = 1e-16;
auto lambda_ij = (sigmoid - 1.0) * delta_metric;
auto hessian_ij = std::max(sigmoid * (1.0 - sigmoid), kEps) * delta_metric * 2.0;
auto k = t_plus.Size();
assert(t_minus.Size() == k && "Invalid size of position bias");
if (unbiased && idx_high < k && idx_low < k) {
lambda_ij /= (t_minus(idx_low) * t_plus(idx_high) + kRtEps);
hessian_ij /= (t_minus(idx_low) * t_plus(idx_high) + kRtEps);
}
auto pg = GradientPair{static_cast<float>(lambda_ij), static_cast<float>(hessian_ij)};
return pg;
}
XGBOOST_DEVICE inline GradientPair Repulse(GradientPair pg) {
auto ng = GradientPair{-pg.GetGrad(), pg.GetHess()};
return ng;
}
namespace cuda_impl {
void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
HostDeviceVector<float> const& preds, MetaInfo const& info,
std::shared_ptr<ltr::NDCGCache> p_cache,
linalg::VectorView<double const> t_plus, // input bias ratio
linalg::VectorView<double const> t_minus, // input bias ratio
linalg::VectorView<double> li, linalg::VectorView<double> lj,
HostDeviceVector<GradientPair>* out_gpair);
/**
* \brief Generate statistic for MAP used for calculating \Delta Z in lambda mart.
*/
void MAPStat(Context const* ctx, MetaInfo const& info, common::Span<std::size_t const> d_rank_idx,
std::shared_ptr<ltr::MAPCache> p_cache);
void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter,
HostDeviceVector<float> const& predt, MetaInfo const& info,
std::shared_ptr<ltr::MAPCache> p_cache,
linalg::VectorView<double const> t_plus, // input bias ratio
linalg::VectorView<double const> t_minus, // input bias ratio
linalg::VectorView<double> li, linalg::VectorView<double> lj,
HostDeviceVector<GradientPair>* out_gpair);
void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
HostDeviceVector<float> const& predt, const MetaInfo& info,
std::shared_ptr<ltr::RankingCache> p_cache,
linalg::VectorView<double const> ti_plus, // input bias ratio
linalg::VectorView<double const> tj_minus, // input bias ratio
linalg::VectorView<double> li, linalg::VectorView<double> lj,
HostDeviceVector<GradientPair>* out_gpair);
void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double const> li_full,
linalg::VectorView<double const> lj_full,
linalg::Vector<double>* p_ti_plus,
linalg::Vector<double>* p_tj_minus, linalg::Vector<double>* p_li,
linalg::Vector<double>* p_lj,
std::shared_ptr<ltr::RankingCache> p_cache);
} // namespace cuda_impl
namespace cpu_impl {
/**
* \brief Generate statistic for MAP used for calculating \Delta Z in lambda mart.
*
* \param label Ground truth relevance label.
* \param rank_idx Sorted index of prediction.
* \param p_cache An initialized MAPCache.
*/
void MAPStat(Context const* ctx, linalg::VectorView<float const> label,
common::Span<std::size_t const> rank_idx, std::shared_ptr<ltr::MAPCache> p_cache);
} // namespace cpu_impl
/**
* \param Construct pairs on CPU
*
* \tparam Op Functor for upgrading a pair of gradients.
*
* \param ctx The global context.
* \param iter The boosting iteration.
* \param cache ltr cache.
* \param g The current query group
* \param g_label label The labels for the current query group
* \param g_rank Sorted index of model scores for the current query group.
* \param op A callable that accepts two index for a pair of documents. The index is for
* the ranked list (labels sorted according to model scores).
*/
template <typename Op>
void MakePairs(Context const* ctx, std::int32_t iter,
std::shared_ptr<ltr::RankingCache> const cache, bst_group_t g,
linalg::VectorView<float const> g_label, common::Span<std::size_t const> g_rank,
Op op) {
auto group_ptr = cache->DataGroupPtr(ctx);
ltr::position_t cnt = group_ptr[g + 1] - group_ptr[g];
if (cache->Param().HasTruncation()) {
for (std::size_t i = 0; i < std::min(cnt, cache->Param().NumPair()); ++i) {
for (std::size_t j = i + 1; j < cnt; ++j) {
op(i, j);
}
}
} else {
CHECK_EQ(g_rank.size(), g_label.Size());
std::minstd_rand rnd(iter);
rnd.discard(g); // fixme(jiamingy): honor the global seed
// sort label according to the rank list
auto it = common::MakeIndexTransformIter(
[&g_rank, &g_label](std::size_t idx) { return g_label(g_rank[idx]); });
std::vector<std::size_t> y_sorted_idx =
common::ArgSort<std::size_t>(ctx, it, it + cnt, std::greater<>{});
// permutation iterator to get the original label
auto rev_it = common::MakeIndexTransformIter(
[&](std::size_t idx) { return g_label(g_rank[y_sorted_idx[idx]]); });
for (std::size_t i = 0; i < cnt;) {
std::size_t j = i + 1;
// find the bucket boundary
while (j < cnt && rev_it[i] == rev_it[j]) {
++j;
}
// Bucket [i,j), construct n_samples pairs for each sample inside the bucket with
// another sample outside the bucket.
//
// n elements left to the bucket, and n elements right to the bucket
std::size_t n_lefts = i, n_rights = static_cast<std::size_t>(cnt - j);
if (n_lefts + n_rights == 0) {
i = j;
continue;
}
auto n_samples = cache->Param().NumPair();
// for each pair specifed by the user
while (n_samples--) {
// for each sample in the bucket
for (std::size_t pair_idx = i; pair_idx < j; ++pair_idx) {
std::size_t ridx = std::uniform_int_distribution<std::size_t>(
static_cast<std::size_t>(0), n_lefts + n_rights - 1)(rnd);
if (ridx >= n_lefts) {
ridx = ridx - i + j; // shift to the right of the bucket
}
// index that points to the rank list.
auto idx0 = y_sorted_idx[pair_idx];
auto idx1 = y_sorted_idx[ridx];
op(idx0, idx1);
}
}
i = j;
}
}
}
} // namespace xgboost::obj
#endif // XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_H_

View File

@ -35,7 +35,10 @@ class QuantileRegression : public ObjFunction {
bst_target_t Targets(MetaInfo const& info) const override {
auto const& alpha = param_.quantile_alpha.Get();
CHECK_EQ(alpha.size(), alpha_.Size()) << "The objective is not yet configured.";
CHECK_EQ(info.labels.Shape(1), 1) << "Multi-target is not yet supported by the quantile loss.";
if (!info.IsVerticalFederated() || collective::GetRank() == 0) {
CHECK_EQ(info.labels.Shape(1), 1)
<< "Multi-target is not yet supported by the quantile loss.";
}
CHECK(!alpha.empty());
// We have some placeholders for multi-target in the quantile loss. But it's not
// supported as the gbtree doesn't know how to slice the gradient and there's no 3-dim
@ -167,8 +170,10 @@ class QuantileRegression : public ObjFunction {
common::Mean(ctx_, *base_score, &temp);
double meanq = temp(0) * sw;
if (info.IsRowSplit()) {
collective::Allreduce<collective::Operation::kSum>(&meanq, 1);
collective::Allreduce<collective::Operation::kSum>(&sw, 1);
}
meanq /= (sw + kRtEps);
base_score->Reshape(1);
base_score->Data()->Fill(meanq);

View File

@ -728,8 +728,10 @@ class MeanAbsoluteError : public ObjFunction {
std::transform(linalg::cbegin(out), linalg::cend(out), linalg::begin(out),
[w](float v) { return v * w; });
if (info.IsRowSplit()) {
collective::Allreduce<collective::Operation::kSum>(out.Values().data(), out.Values().size());
collective::Allreduce<collective::Operation::kSum>(&w, 1);
}
if (common::CloseTo(w, 0.0)) {
// Mostly for handling empty dataset test.

View File

@ -31,6 +31,5 @@ dependencies:
- pyspark
- cloudpickle
- pip:
- shap
- awscli
- auditwheel

View File

@ -37,7 +37,6 @@ dependencies:
- pyarrow
- protobuf
- cloudpickle
- shap>=0.41
- modin
# TODO: Replace it with pyspark>=3.4 once 3.4 released.
# - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz

View File

@ -146,6 +146,7 @@ def main(args: argparse.Namespace) -> None:
"tests/python/test_config.py",
"tests/python/test_data_iterator.py",
"tests/python/test_dt.py",
"tests/python/test_predict.py",
"tests/python/test_quantile_dmatrix.py",
"tests/python/test_tree_regularization.py",
"tests/python-gpu/test_gpu_data_iterator.py",

View File

@ -0,0 +1,106 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#include "test_lambdarank_obj.h"
#include <gtest/gtest.h> // for Test, Message, TestPartResult, CmpHel...
#include <cstddef> // for size_t
#include <initializer_list> // for initializer_list
#include <map> // for map
#include <memory> // for unique_ptr, shared_ptr, make_shared
#include <numeric> // for iota
#include <string> // for char_traits, basic_string, string
#include <vector> // for vector
#include "../../../src/common/ranking_utils.h" // for LambdaRankParam
#include "../../../src/common/ranking_utils.h" // for NDCGCache, LambdaRankParam
#include "../helpers.h" // for CheckRankingObjFunction, CheckConfigReload
#include "xgboost/base.h" // for GradientPair, bst_group_t, Args
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo, DMatrix
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for Tensor, All, TensorView
#include "xgboost/objective.h" // for ObjFunction
#include "xgboost/span.h" // for Span
namespace xgboost::obj {
void InitMakePairTest(Context const* ctx, MetaInfo* out_info, HostDeviceVector<float>* out_predt) {
out_predt->SetDevice(ctx->gpu_id);
MetaInfo& info = *out_info;
info.num_row_ = 128;
info.labels.ModifyInplace([&](HostDeviceVector<float>* data, common::Span<std::size_t> shape) {
shape[0] = info.num_row_;
shape[1] = 1;
auto& h_data = data->HostVector();
h_data.resize(shape[0]);
for (std::size_t i = 0; i < h_data.size(); ++i) {
h_data[i] = i % 2;
}
});
std::vector<float> predt(info.num_row_);
std::iota(predt.rbegin(), predt.rend(), 0.0f);
out_predt->HostVector() = predt;
}
TEST(LambdaRank, MakePair) {
Context ctx;
MetaInfo info;
HostDeviceVector<float> predt;
InitMakePairTest(&ctx, &info, &predt);
ltr::LambdaRankParam param;
param.UpdateAllowUnknown(Args{{"lambdarank_pair_method", "topk"}});
ASSERT_TRUE(param.HasTruncation());
std::shared_ptr<ltr::RankingCache> p_cache = std::make_shared<ltr::NDCGCache>(&ctx, info, param);
auto const& h_predt = predt.ConstHostVector();
{
auto rank_idx = p_cache->SortedIdx(&ctx, h_predt);
for (std::size_t i = 0; i < h_predt.size(); ++i) {
ASSERT_EQ(rank_idx[i], static_cast<std::size_t>(*(h_predt.crbegin() + i)));
}
std::int32_t n_pairs{0};
MakePairs(&ctx, 0, p_cache, 0, info.labels.HostView().Slice(linalg::All(), 0), rank_idx,
[&](auto i, auto j) {
ASSERT_GT(j, i);
ASSERT_LT(i, p_cache->Param().NumPair());
++n_pairs;
});
ASSERT_EQ(n_pairs, 3568);
}
auto const h_label = info.labels.HostView();
{
param.UpdateAllowUnknown(Args{{"lambdarank_pair_method", "mean"}});
auto p_cache = std::make_shared<ltr::NDCGCache>(&ctx, info, param);
ASSERT_FALSE(param.HasTruncation());
std::int32_t n_pairs = 0;
auto rank_idx = p_cache->SortedIdx(&ctx, h_predt);
MakePairs(&ctx, 0, p_cache, 0, info.labels.HostView().Slice(linalg::All(), 0), rank_idx,
[&](auto i, auto j) {
++n_pairs;
// Not in the same bucket
ASSERT_NE(h_label(rank_idx[i]), h_label(rank_idx[j]));
});
ASSERT_EQ(n_pairs, info.num_row_ * param.NumPair());
}
{
param.UpdateAllowUnknown(Args{{"lambdarank_num_pair_per_sample", "2"}});
auto p_cache = std::make_shared<ltr::NDCGCache>(&ctx, info, param);
auto rank_idx = p_cache->SortedIdx(&ctx, h_predt);
std::int32_t n_pairs = 0;
MakePairs(&ctx, 0, p_cache, 0, info.labels.HostView().Slice(linalg::All(), 0), rank_idx,
[&](auto i, auto j) {
++n_pairs;
// Not in the same bucket
ASSERT_NE(h_label(rank_idx[i]), h_label(rank_idx[j]));
});
ASSERT_EQ(param.NumPair(), 2);
ASSERT_EQ(n_pairs, info.num_row_ * param.NumPair());
}
}
} // namespace xgboost::obj

View File

@ -0,0 +1,138 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/context.h> // for Context
#include <cstdint> // for uint32_t
#include <vector> // for vector
#include "../../../src/common/cuda_context.cuh" // for CUDAContext
#include "../../../src/objective/lambdarank_obj.cuh"
#include "test_lambdarank_obj.h"
namespace xgboost::obj {
void TestGPUMakePair() {
Context ctx;
ctx.gpu_id = 0;
MetaInfo info;
HostDeviceVector<float> predt;
InitMakePairTest(&ctx, &info, &predt);
ltr::LambdaRankParam param;
auto make_args = [&](std::shared_ptr<ltr::RankingCache> p_cache, auto rank_idx,
common::Span<std::size_t const> y_sorted_idx) {
linalg::Vector<double> dummy;
auto d = dummy.View(ctx.gpu_id);
linalg::Vector<GradientPair> dgpair;
auto dg = dgpair.View(ctx.gpu_id);
cuda_impl::KernelInputs args{d,
d,
d,
d,
p_cache->DataGroupPtr(&ctx),
p_cache->CUDAThreadsGroupPtr(),
rank_idx,
info.labels.View(ctx.gpu_id),
predt.ConstDeviceSpan(),
{},
dg,
nullptr,
y_sorted_idx,
0};
return args;
};
{
param.UpdateAllowUnknown(Args{{"lambdarank_pair_method", "topk"}});
auto p_cache = std::make_shared<ltr::NDCGCache>(&ctx, info, param);
auto rank_idx = p_cache->SortedIdx(&ctx, predt.ConstDeviceSpan());
ASSERT_EQ(p_cache->CUDAThreads(), 3568);
auto args = make_args(p_cache, rank_idx, {});
auto n_pairs = p_cache->Param().NumPair();
auto make_pair = cuda_impl::MakePairsOp<true>{args};
dh::LaunchN(p_cache->CUDAThreads(), ctx.CUDACtx()->Stream(),
[=] XGBOOST_DEVICE(std::size_t idx) {
auto [i, j] = make_pair(idx, 0);
SPAN_CHECK(j > i);
SPAN_CHECK(i < n_pairs);
});
}
{
param.UpdateAllowUnknown(Args{{"lambdarank_pair_method", "mean"}});
auto p_cache = std::make_shared<ltr::NDCGCache>(&ctx, info, param);
auto rank_idx = p_cache->SortedIdx(&ctx, predt.ConstDeviceSpan());
auto y_sorted_idx = cuda_impl::SortY(&ctx, info, rank_idx, p_cache);
ASSERT_FALSE(param.HasTruncation());
ASSERT_EQ(p_cache->CUDAThreads(), info.num_row_ * param.NumPair());
auto args = make_args(p_cache, rank_idx, y_sorted_idx);
auto make_pair = cuda_impl::MakePairsOp<false>{args};
auto n_pairs = p_cache->Param().NumPair();
ASSERT_EQ(n_pairs, 1);
dh::LaunchN(
p_cache->CUDAThreads(), ctx.CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t idx) {
idx = 97;
auto [i, j] = make_pair(idx, 0);
// Not in the same bucket
SPAN_CHECK(make_pair.args.labels(rank_idx[i]) != make_pair.args.labels(rank_idx[j]));
});
}
{
param.UpdateAllowUnknown(Args{{"lambdarank_num_pair_per_sample", "2"}});
auto p_cache = std::make_shared<ltr::NDCGCache>(&ctx, info, param);
auto rank_idx = p_cache->SortedIdx(&ctx, predt.ConstDeviceSpan());
auto y_sorted_idx = cuda_impl::SortY(&ctx, info, rank_idx, p_cache);
auto args = make_args(p_cache, rank_idx, y_sorted_idx);
auto make_pair = cuda_impl::MakePairsOp<false>{args};
dh::LaunchN(
p_cache->CUDAThreads(), ctx.CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t idx) {
auto [i, j] = make_pair(idx, 0);
// Not in the same bucket
SPAN_CHECK(make_pair.args.labels(rank_idx[i]) != make_pair.args.labels(rank_idx[j]));
});
ASSERT_EQ(param.NumPair(), 2);
ASSERT_EQ(p_cache->CUDAThreads(), info.num_row_ * param.NumPair());
}
}
TEST(LambdaRank, GPUMakePair) { TestGPUMakePair(); }
template <typename CountFunctor>
void RankItemCountImpl(std::vector<std::uint32_t> const &sorted_items, CountFunctor f,
std::uint32_t find_val, std::uint32_t exp_val) {
EXPECT_NE(std::find(sorted_items.begin(), sorted_items.end(), find_val), sorted_items.end());
EXPECT_EQ(f(&sorted_items[0], sorted_items.size(), find_val), exp_val);
}
TEST(LambdaRank, RankItemCountOnLeft) {
// Items sorted descendingly
std::vector<std::uint32_t> sorted_items{10, 10, 6, 4, 4, 4, 4, 1, 1, 1, 1, 1, 0};
auto wrapper = [](auto const &...args) { return cuda_impl::CountNumItemsToTheLeftOf(args...); };
RankItemCountImpl(sorted_items, wrapper, 10, static_cast<uint32_t>(0));
RankItemCountImpl(sorted_items, wrapper, 6, static_cast<uint32_t>(2));
RankItemCountImpl(sorted_items, wrapper, 4, static_cast<uint32_t>(3));
RankItemCountImpl(sorted_items, wrapper, 1, static_cast<uint32_t>(7));
RankItemCountImpl(sorted_items, wrapper, 0, static_cast<uint32_t>(12));
}
TEST(LambdaRank, RankItemCountOnRight) {
// Items sorted descendingly
std::vector<std::uint32_t> sorted_items{10, 10, 6, 4, 4, 4, 4, 1, 1, 1, 1, 1, 0};
auto wrapper = [](auto const &...args) { return cuda_impl::CountNumItemsToTheRightOf(args...); };
RankItemCountImpl(sorted_items, wrapper, 10, static_cast<uint32_t>(11));
RankItemCountImpl(sorted_items, wrapper, 6, static_cast<uint32_t>(10));
RankItemCountImpl(sorted_items, wrapper, 4, static_cast<uint32_t>(6));
RankItemCountImpl(sorted_items, wrapper, 1, static_cast<uint32_t>(1));
RankItemCountImpl(sorted_items, wrapper, 0, static_cast<uint32_t>(0));
}
} // namespace xgboost::obj

View File

@ -0,0 +1,26 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#ifndef XGBOOST_OBJECTIVE_TEST_LAMBDARANK_OBJ_H_
#define XGBOOST_OBJECTIVE_TEST_LAMBDARANK_OBJ_H_
#include <gtest/gtest.h>
#include <xgboost/data.h> // for MetaInfo
#include <xgboost/host_device_vector.h> // for HostDeviceVector
#include <xgboost/linalg.h> // for All
#include <xgboost/objective.h> // for ObjFunction
#include <memory> // for shared_ptr, make_shared
#include <numeric> // for iota
#include <vector> // for vector
#include "../../../src/common/ranking_utils.h" // for LambdaRankParam, MAPCache
#include "../../../src/objective/lambdarank_obj.h" // for MAPStat
#include "../helpers.h" // for EmptyDMatrix
namespace xgboost::obj {
/**
* \brief Initialize test data for make pair tests.
*/
void InitMakePairTest(Context const* ctx, MetaInfo* out_info, HostDeviceVector<float>* out_predt);
} // namespace xgboost::obj
#endif // XGBOOST_OBJECTIVE_TEST_LAMBDARANK_OBJ_H_

View File

@ -89,43 +89,6 @@ TEST(Objective, RankSegmentSorterAscendingTest) {
5, 4, 6});
}
using CountFunctor = uint32_t (*)(const int *, uint32_t, int);
void RankItemCountImpl(const std::vector<int> &sorted_items, CountFunctor f,
int find_val, uint32_t exp_val) {
EXPECT_NE(std::find(sorted_items.begin(), sorted_items.end(), find_val), sorted_items.end());
EXPECT_EQ(f(&sorted_items[0], sorted_items.size(), find_val), exp_val);
}
TEST(Objective, RankItemCountOnLeft) {
// Items sorted descendingly
std::vector<int> sorted_items{10, 10, 6, 4, 4, 4, 4, 1, 1, 1, 1, 1, 0};
RankItemCountImpl(sorted_items, &xgboost::obj::CountNumItemsToTheLeftOf,
10, static_cast<uint32_t>(0));
RankItemCountImpl(sorted_items, &xgboost::obj::CountNumItemsToTheLeftOf,
6, static_cast<uint32_t>(2));
RankItemCountImpl(sorted_items, &xgboost::obj::CountNumItemsToTheLeftOf,
4, static_cast<uint32_t>(3));
RankItemCountImpl(sorted_items, &xgboost::obj::CountNumItemsToTheLeftOf,
1, static_cast<uint32_t>(7));
RankItemCountImpl(sorted_items, &xgboost::obj::CountNumItemsToTheLeftOf,
0, static_cast<uint32_t>(12));
}
TEST(Objective, RankItemCountOnRight) {
// Items sorted descendingly
std::vector<int> sorted_items{10, 10, 6, 4, 4, 4, 4, 1, 1, 1, 1, 1, 0};
RankItemCountImpl(sorted_items, &xgboost::obj::CountNumItemsToTheRightOf,
10, static_cast<uint32_t>(11));
RankItemCountImpl(sorted_items, &xgboost::obj::CountNumItemsToTheRightOf,
6, static_cast<uint32_t>(10));
RankItemCountImpl(sorted_items, &xgboost::obj::CountNumItemsToTheRightOf,
4, static_cast<uint32_t>(6));
RankItemCountImpl(sorted_items, &xgboost::obj::CountNumItemsToTheRightOf,
1, static_cast<uint32_t>(1));
RankItemCountImpl(sorted_items, &xgboost::obj::CountNumItemsToTheRightOf,
0, static_cast<uint32_t>(0));
}
TEST(Objective, NDCGLambdaWeightComputerTest) {
std::vector<float> hlabels = {3.1f, 1.2f, 2.3f, 4.4f, // Labels
7.8f, 5.01f, 6.96f,

View File

@ -0,0 +1,32 @@
/**
* Copyright (c) 2023, XGBoost contributors
*/
#include <dmlc/registry.h> // for Registry
#include <gtest/gtest.h>
#include <xgboost/objective.h> // for ObjFunctionReg
#include <algorithm> // for transform
#include <iterator> // for back_insert_iterator, back_inserter
#include <string> // for string
#include <vector> // for vector
namespace xgboost {
inline auto MakeObjNamesForTest() {
auto list = ::dmlc::Registry<::xgboost::ObjFunctionReg>::List();
std::vector<std::string> names;
std::transform(list.cbegin(), list.cend(), std::back_inserter(names),
[](auto const* entry) { return entry->name; });
return names;
}
template <typename ParamType>
inline std::string ObjTestNameGenerator(const ::testing::TestParamInfo<ParamType>& info) {
auto name = std::string{info.param};
// Name must be a valid c++ symbol
auto it = std::find(name.cbegin(), name.cend(), ':');
if (it != name.cend()) {
name[std::distance(name.cbegin(), it)] = '_';
}
return name;
};
} // namespace xgboost

View File

@ -8,6 +8,7 @@
#include <xgboost/json.h>
#include <random>
#include <thread> // for thread, sleep_for
#include "../../../plugin/federated/federated_server.h"
#include "../../../src/collective/communicator-inl.h"
@ -33,13 +34,17 @@ inline std::string GetServerAddress() {
namespace xgboost {
class BaseFederatedTest : public ::testing::Test {
protected:
void SetUp() override {
class ServerForTest {
std::string server_address_;
std::unique_ptr<std::thread> server_thread_;
std::unique_ptr<grpc::Server> server_;
public:
explicit ServerForTest(std::int32_t world_size) {
server_address_ = GetServerAddress();
server_thread_.reset(new std::thread([this] {
server_thread_.reset(new std::thread([this, world_size] {
grpc::ServerBuilder builder;
xgboost::federated::FederatedService service{kWorldSize};
xgboost::federated::FederatedService service{world_size};
builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials());
builder.RegisterService(&service);
server_ = builder.BuildAndStart();
@ -47,15 +52,21 @@ class BaseFederatedTest : public ::testing::Test {
}));
}
void TearDown() override {
~ServerForTest() {
server_->Shutdown();
server_thread_->join();
}
auto Address() const { return server_address_; }
};
class BaseFederatedTest : public ::testing::Test {
protected:
void SetUp() override { server_ = std::make_unique<ServerForTest>(kWorldSize); }
void TearDown() override { server_.reset(nullptr); }
static int const kWorldSize{3};
std::string server_address_;
std::unique_ptr<std::thread> server_thread_;
std::unique_ptr<grpc::Server> server_;
std::unique_ptr<ServerForTest> server_;
};
template <typename Function, typename... Args>

View File

@ -29,7 +29,7 @@ TEST(FederatedAdapterSimpleTest, ThrowOnInvalidCommunicator) {
TEST_F(FederatedAdapterTest, DeviceAllReduceSum) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back([rank, server_address = server_address_] {
threads.emplace_back([rank, server_address = server_->Address()] {
FederatedCommunicator comm{kWorldSize, rank, server_address};
// Assign device 0 to all workers, since we run gtest in a single-GPU machine
DeviceCommunicatorAdapter adapter{0, &comm};
@ -52,7 +52,7 @@ TEST_F(FederatedAdapterTest, DeviceAllReduceSum) {
TEST_F(FederatedAdapterTest, DeviceAllGatherV) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back([rank, server_address = server_address_] {
threads.emplace_back([rank, server_address = server_->Address()] {
FederatedCommunicator comm{kWorldSize, rank, server_address};
// Assign device 0 to all workers, since we run gtest in a single-GPU machine
DeviceCommunicatorAdapter adapter{0, &comm};

View File

@ -92,7 +92,7 @@ TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeNotInteger) {
config["federated_server_address"] = server_address;
config["federated_world_size"] = std::string("1");
config["federated_rank"] = Integer(0);
auto *comm = FederatedCommunicator::Create(config);
FederatedCommunicator::Create(config);
};
EXPECT_THROW(construct(), dmlc::Error);
}
@ -104,7 +104,7 @@ TEST(FederatedCommunicatorSimpleTest, ThrowOnRankNotInteger) {
config["federated_server_address"] = server_address;
config["federated_world_size"] = 1;
config["federated_rank"] = std::string("0");
auto *comm = FederatedCommunicator::Create(config);
FederatedCommunicator::Create(config);
};
EXPECT_THROW(construct(), dmlc::Error);
}
@ -125,7 +125,7 @@ TEST(FederatedCommunicatorSimpleTest, IsDistributed) {
TEST_F(FederatedCommunicatorTest, Allgather) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(&FederatedCommunicatorTest::VerifyAllgather, rank, server_address_);
threads.emplace_back(&FederatedCommunicatorTest::VerifyAllgather, rank, server_->Address());
}
for (auto &thread : threads) {
thread.join();
@ -135,7 +135,7 @@ TEST_F(FederatedCommunicatorTest, Allgather) {
TEST_F(FederatedCommunicatorTest, Allreduce) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(&FederatedCommunicatorTest::VerifyAllreduce, rank, server_address_);
threads.emplace_back(&FederatedCommunicatorTest::VerifyAllreduce, rank, server_->Address());
}
for (auto &thread : threads) {
thread.join();
@ -145,7 +145,7 @@ TEST_F(FederatedCommunicatorTest, Allreduce) {
TEST_F(FederatedCommunicatorTest, Broadcast) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(&FederatedCommunicatorTest::VerifyBroadcast, rank, server_address_);
threads.emplace_back(&FederatedCommunicatorTest::VerifyBroadcast, rank, server_->Address());
}
for (auto &thread : threads) {
thread.join();

View File

@ -38,8 +38,8 @@ void VerifyLoadUri() {
auto index = 0;
int offsets[] = {0, 8, 17};
int offset = offsets[rank];
for (auto row = 0; row < kRows; row++) {
for (auto col = 0; col < kCols; col++) {
for (std::size_t row = 0; row < kRows; row++) {
for (std::size_t col = 0; col < kCols; col++) {
EXPECT_EQ(entries[index].index, col + offset);
index++;
}
@ -48,6 +48,6 @@ void VerifyLoadUri() {
}
TEST_F(FederatedDataTest, LoadUri) {
RunWithFederatedCommunicator(kWorldSize, server_address_, &VerifyLoadUri);
RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyLoadUri);
}
} // namespace xgboost

View File

@ -8,71 +8,113 @@
#include "../../../plugin/federated/federated_server.h"
#include "../../../src/collective/communicator-inl.h"
#include "../../../src/common/linalg_op.h"
#include "../helpers.h"
#include "../objective_helpers.h" // for MakeObjNamesForTest, ObjTestNameGenerator
#include "helpers.h"
namespace xgboost {
class FederatedLearnerTest : public BaseFederatedTest {
protected:
static auto constexpr kRows{16};
static auto constexpr kCols{16};
};
void VerifyBaseScore(size_t rows, size_t cols, float expected_base_score) {
auto const world_size = collective::GetWorldSize();
auto const rank = collective::GetRank();
std::shared_ptr<DMatrix> Xy_{RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(rank == 0)};
std::shared_ptr<DMatrix> sliced{Xy_->SliceCol(world_size, rank)};
std::unique_ptr<Learner> learner{Learner::Create({sliced})};
namespace {
auto MakeModel(std::string objective, std::shared_ptr<DMatrix> dmat) {
std::unique_ptr<Learner> learner{Learner::Create({dmat})};
learner->SetParam("tree_method", "approx");
learner->SetParam("objective", "binary:logistic");
learner->UpdateOneIter(0, sliced);
learner->SetParam("objective", objective);
if (objective.find("quantile") != std::string::npos) {
learner->SetParam("quantile_alpha", "0.5");
}
if (objective.find("multi") != std::string::npos) {
learner->SetParam("num_class", "3");
}
learner->UpdateOneIter(0, dmat);
Json config{Object{}};
learner->SaveConfig(&config);
auto base_score = GetBaseScore(config);
ASSERT_EQ(base_score, expected_base_score);
}
void VerifyModel(size_t rows, size_t cols, Json const& expected_model) {
auto const world_size = collective::GetWorldSize();
auto const rank = collective::GetRank();
std::shared_ptr<DMatrix> Xy_{RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(rank == 0)};
std::shared_ptr<DMatrix> sliced{Xy_->SliceCol(world_size, rank)};
std::unique_ptr<Learner> learner{Learner::Create({sliced})};
learner->SetParam("tree_method", "approx");
learner->SetParam("objective", "binary:logistic");
learner->UpdateOneIter(0, sliced);
Json model{Object{}};
learner->SaveModel(&model);
return model;
}
void VerifyObjective(size_t rows, size_t cols, float expected_base_score, Json expected_model,
std::string objective) {
auto const world_size = collective::GetWorldSize();
auto const rank = collective::GetRank();
std::shared_ptr<DMatrix> dmat{RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(rank == 0)};
if (rank == 0) {
auto &h_upper = dmat->Info().labels_upper_bound_.HostVector();
auto &h_lower = dmat->Info().labels_lower_bound_.HostVector();
h_lower.resize(rows);
h_upper.resize(rows);
for (size_t i = 0; i < rows; ++i) {
h_lower[i] = 1;
h_upper[i] = 10;
}
if (objective.find("rank:") != std::string::npos) {
auto h_label = dmat->Info().labels.HostView();
std::size_t k = 0;
for (auto &v : h_label) {
v = k % 2 == 0;
++k;
}
}
}
std::shared_ptr<DMatrix> sliced{dmat->SliceCol(world_size, rank)};
auto model = MakeModel(objective, sliced);
auto base_score = GetBaseScore(model);
ASSERT_EQ(base_score, expected_base_score);
ASSERT_EQ(model, expected_model);
}
} // namespace
TEST_F(FederatedLearnerTest, BaseScore) {
std::shared_ptr<DMatrix> Xy_{RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)};
std::unique_ptr<Learner> learner{Learner::Create({Xy_})};
learner->SetParam("tree_method", "approx");
learner->SetParam("objective", "binary:logistic");
learner->UpdateOneIter(0, Xy_);
Json config{Object{}};
learner->SaveConfig(&config);
auto base_score = GetBaseScore(config);
ASSERT_NE(base_score, ObjFunction::DefaultBaseScore());
class FederatedLearnerTest : public ::testing::TestWithParam<std::string> {
std::unique_ptr<ServerForTest> server_;
static int const kWorldSize{3};
RunWithFederatedCommunicator(kWorldSize, server_address_, &VerifyBaseScore, kRows, kCols,
base_score);
protected:
void SetUp() override { server_ = std::make_unique<ServerForTest>(kWorldSize); }
void TearDown() override { server_.reset(nullptr); }
void Run(std::string objective) {
static auto constexpr kRows{16};
static auto constexpr kCols{16};
std::shared_ptr<DMatrix> dmat{RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)};
auto &h_upper = dmat->Info().labels_upper_bound_.HostVector();
auto &h_lower = dmat->Info().labels_lower_bound_.HostVector();
h_lower.resize(kRows);
h_upper.resize(kRows);
for (size_t i = 0; i < kRows; ++i) {
h_lower[i] = 1;
h_upper[i] = 10;
}
if (objective.find("rank:") != std::string::npos) {
auto h_label = dmat->Info().labels.HostView();
std::size_t k = 0;
for (auto &v : h_label) {
v = k % 2 == 0;
++k;
}
}
TEST_F(FederatedLearnerTest, Model) {
std::shared_ptr<DMatrix> Xy_{RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)};
std::unique_ptr<Learner> learner{Learner::Create({Xy_})};
learner->SetParam("tree_method", "approx");
learner->SetParam("objective", "binary:logistic");
learner->UpdateOneIter(0, Xy_);
Json model{Object{}};
learner->SaveModel(&model);
auto model = MakeModel(objective, dmat);
auto score = GetBaseScore(model);
RunWithFederatedCommunicator(kWorldSize, server_address_, &VerifyModel, kRows, kCols,
std::cref(model));
RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyObjective, kRows, kCols,
score, model, objective);
}
};
TEST_P(FederatedLearnerTest, Objective) {
std::string objective = GetParam();
this->Run(objective);
}
INSTANTIATE_TEST_SUITE_P(FederatedLearnerObjective, FederatedLearnerTest,
::testing::ValuesIn(MakeObjNamesForTest()),
[](const ::testing::TestParamInfo<FederatedLearnerTest::ParamType> &info) {
return ObjTestNameGenerator(info);
});
} // namespace xgboost

View File

@ -73,7 +73,7 @@ class FederatedServerTest : public BaseFederatedTest {
TEST_F(FederatedServerTest, Allgather) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(&FederatedServerTest::VerifyAllgather, rank, server_address_);
threads.emplace_back(&FederatedServerTest::VerifyAllgather, rank, server_->Address());
}
for (auto& thread : threads) {
thread.join();
@ -83,7 +83,7 @@ TEST_F(FederatedServerTest, Allgather) {
TEST_F(FederatedServerTest, Allreduce) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(&FederatedServerTest::VerifyAllreduce, rank, server_address_);
threads.emplace_back(&FederatedServerTest::VerifyAllreduce, rank, server_->Address());
}
for (auto& thread : threads) {
thread.join();
@ -93,7 +93,7 @@ TEST_F(FederatedServerTest, Allreduce) {
TEST_F(FederatedServerTest, Broadcast) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(&FederatedServerTest::VerifyBroadcast, rank, server_address_);
threads.emplace_back(&FederatedServerTest::VerifyBroadcast, rank, server_->Address());
}
for (auto& thread : threads) {
thread.join();
@ -103,7 +103,7 @@ TEST_F(FederatedServerTest, Broadcast) {
TEST_F(FederatedServerTest, Mixture) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(&FederatedServerTest::VerifyMixture, rank, server_address_);
threads.emplace_back(&FederatedServerTest::VerifyMixture, rank, server_->Address());
}
for (auto& thread : threads) {
thread.join();

View File

@ -1,22 +1,49 @@
/*!
* Copyright 2017-2023 by XGBoost contributors
/**
* Copyright (c) 2017-2023, XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/learner.h>
#include <xgboost/objective.h> // ObjFunction
#include <xgboost/version_config.h>
#include <xgboost/learner.h> // for Learner
#include <xgboost/logging.h> // for LogCheck_NE, CHECK_NE, LogCheck_EQ
#include <xgboost/objective.h> // for ObjFunction
#include <xgboost/version_config.h> // for XGBOOST_VER_MAJOR, XGBOOST_VER_MINOR
#include <string> // std::stof, std::string
#include <thread>
#include <vector>
#include <algorithm> // for equal, transform
#include <cinttypes> // for int32_t, int64_t, uint32_t
#include <cstddef> // for size_t
#include <iosfwd> // for ofstream
#include <iterator> // for back_insert_iterator, back_inserter
#include <limits> // for numeric_limits
#include <map> // for map
#include <memory> // for unique_ptr, shared_ptr, __shared_ptr_...
#include <random> // for uniform_real_distribution
#include <string> // for allocator, basic_string, string, oper...
#include <thread> // for thread
#include <type_traits> // for is_integral
#include <utility> // for pair
#include <vector> // for vector
#include "../../src/common/api_entry.h" // XGBAPIThreadLocalEntry
#include "../../src/common/io.h"
#include "../../src/common/linalg_op.h"
#include "../../src/common/random.h"
#include "filesystem.h" // dmlc::TemporaryDirectory
#include "helpers.h"
#include "xgboost/json.h"
#include "../../src/collective/communicator-inl.h" // for GetRank, GetWorldSize
#include "../../src/common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../../src/common/io.h" // for LoadSequentialFile
#include "../../src/common/linalg_op.h" // for ElementWiseTransformHost, begin, end
#include "../../src/common/random.h" // for GlobalRandom
#include "../../src/common/transform_iterator.h" // for IndexTransformIter
#include "dmlc/io.h" // for Stream
#include "dmlc/omp.h" // for omp_get_max_threads
#include "dmlc/registry.h" // for Registry
#include "filesystem.h" // for TemporaryDirectory
#include "helpers.h" // for GetBaseScore, RandomDataGenerator
#include "objective_helpers.h" // for MakeObjNamesForTest, ObjTestNameGenerator
#include "xgboost/base.h" // for bst_float, Args, bst_feature_t, bst_int
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for DMatrix, MetaInfo, DataType
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/json.h" // for Json, Object, get, String, IsA, opera...
#include "xgboost/linalg.h" // for Tensor, TensorView
#include "xgboost/logging.h" // for ConsoleLogger
#include "xgboost/predictor.h" // for PredictionCacheEntry
#include "xgboost/span.h" // for Span, operator!=, SpanIterator
#include "xgboost/string_view.h" // for StringView
namespace xgboost {
TEST(Learner, Basic) {
@ -608,31 +635,90 @@ TEST_F(InitBaseScore, InitWithPredict) { this->TestInitWithPredt(); }
TEST_F(InitBaseScore, UpdateProcess) { this->TestUpdateProcess(); }
void TestColumnSplitBaseScore(std::shared_ptr<DMatrix> Xy_, float expected_base_score) {
class TestColumnSplit : public ::testing::TestWithParam<std::string> {
static auto MakeFmat(std::string const& obj) {
auto constexpr kRows = 10, kCols = 10;
auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
auto& h_upper = p_fmat->Info().labels_upper_bound_.HostVector();
auto& h_lower = p_fmat->Info().labels_lower_bound_.HostVector();
h_lower.resize(kRows);
h_upper.resize(kRows);
for (size_t i = 0; i < kRows; ++i) {
h_lower[i] = 1;
h_upper[i] = 10;
}
if (obj.find("rank:") != std::string::npos) {
auto h_label = p_fmat->Info().labels.HostView();
std::size_t k = 0;
for (auto& v : h_label) {
v = k % 2 == 0;
++k;
}
}
return p_fmat;
};
void TestBaseScore(std::string objective, float expected_base_score, Json expected_model) {
auto const world_size = collective::GetWorldSize();
auto const rank = collective::GetRank();
std::shared_ptr<DMatrix> sliced{Xy_->SliceCol(world_size, rank)};
auto p_fmat = MakeFmat(objective);
std::shared_ptr<DMatrix> sliced{p_fmat->SliceCol(world_size, rank)};
std::unique_ptr<Learner> learner{Learner::Create({sliced})};
learner->SetParam("tree_method", "approx");
learner->SetParam("objective", "binary:logistic");
learner->SetParam("objective", objective);
if (objective.find("quantile") != std::string::npos) {
learner->SetParam("quantile_alpha", "0.5");
}
if (objective.find("multi") != std::string::npos) {
learner->SetParam("num_class", "3");
}
learner->UpdateOneIter(0, sliced);
Json config{Object{}};
learner->SaveConfig(&config);
auto base_score = GetBaseScore(config);
ASSERT_EQ(base_score, expected_base_score);
Json model{Object{}};
learner->SaveModel(&model);
ASSERT_EQ(model, expected_model);
}
TEST_F(InitBaseScore, ColumnSplit) {
std::unique_ptr<Learner> learner{Learner::Create({Xy_})};
public:
void Run(std::string objective) {
auto p_fmat = MakeFmat(objective);
std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
learner->SetParam("tree_method", "approx");
learner->SetParam("objective", "binary:logistic");
learner->UpdateOneIter(0, Xy_);
learner->SetParam("objective", objective);
if (objective.find("quantile") != std::string::npos) {
learner->SetParam("quantile_alpha", "0.5");
}
if (objective.find("multi") != std::string::npos) {
learner->SetParam("num_class", "3");
}
learner->UpdateOneIter(0, p_fmat);
Json config{Object{}};
learner->SaveConfig(&config);
auto base_score = GetBaseScore(config);
ASSERT_NE(base_score, ObjFunction::DefaultBaseScore());
Json model{Object{}};
learner->SaveModel(&model);
auto constexpr kWorldSize{3};
RunWithInMemoryCommunicator(kWorldSize, &TestColumnSplitBaseScore, Xy_, base_score);
auto call = [this, &objective](auto&... args) { TestBaseScore(objective, args...); };
auto score = GetBaseScore(config);
RunWithInMemoryCommunicator(kWorldSize, call, score, model);
}
};
TEST_P(TestColumnSplit, Objective) {
std::string objective = GetParam();
this->Run(objective);
}
INSTANTIATE_TEST_SUITE_P(ColumnSplitObjective, TestColumnSplit,
::testing::ValuesIn(MakeObjNamesForTest()),
[](const ::testing::TestParamInfo<TestColumnSplit::ParamType>& info) {
return ObjTestNameGenerator(info);
});
} // namespace xgboost

View File

@ -64,7 +64,7 @@ class TestModels:
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)
# this is prediction
preds = bst.predict(dtest, ntree_limit=num_round)
preds = bst.predict(dtest, iteration_range=(0, num_round))
labels = dtest.get_label()
err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
@ -83,7 +83,7 @@ class TestModels:
bst2 = xgb.Booster(params=param, model_file=model_path)
dtest2 = xgb.DMatrix(dtest_path)
preds2 = bst2.predict(dtest2, ntree_limit=num_round)
preds2 = bst2.predict(dtest2, iteration_range=(0, num_round))
# assert they are the same
assert np.sum(np.abs(preds2 - preds)) == 0
@ -96,7 +96,7 @@ class TestModels:
# check whether custom evaluation metrics work
bst = xgb.train(param, dtrain, num_round, watchlist,
feval=my_logloss)
preds3 = bst.predict(dtest, ntree_limit=num_round)
preds3 = bst.predict(dtest, iteration_range=(0, num_round))
assert all(preds3 == preds)
# check whether sample_type and normalize_type work
@ -110,7 +110,7 @@ class TestModels:
param['sample_type'] = p[0]
param['normalize_type'] = p[1]
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest, ntree_limit=num_round)
preds = bst.predict(dtest, iteration_range=(0, num_round))
err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.1
@ -472,8 +472,8 @@ class TestModels:
X, y = load_iris(return_X_y=True)
cls = xgb.XGBClassifier(n_estimators=2)
cls.fit(X, y, early_stopping_rounds=1, eval_set=[(X, y)])
assert cls.get_booster().best_ntree_limit == 2
assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit
assert cls.get_booster().best_iteration == cls.n_estimators - 1
assert cls.best_iteration == cls.get_booster().best_iteration
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, "cls.json")
@ -481,8 +481,8 @@ class TestModels:
cls = xgb.XGBClassifier(n_estimators=2)
cls.load_model(path)
assert cls.get_booster().best_ntree_limit == 2
assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit
assert cls.get_booster().best_iteration == cls.n_estimators - 1
assert cls.best_iteration == cls.get_booster().best_iteration
def run_slice(
self,
@ -664,7 +664,7 @@ class TestModels:
y = rng.randn(rows)
feature_names = ["test_feature_" + str(i) for i in range(cols)]
X_pd = pd.DataFrame(X, columns=feature_names)
X_pd.iloc[:, 3] = X_pd.iloc[:, 3].astype(np.int32)
X_pd[f"test_feature_{3}"] = X_pd.iloc[:, 3].astype(np.int32)
Xy = xgb.DMatrix(X_pd, y)
assert Xy.feature_types[3] == "int"

View File

@ -102,7 +102,6 @@ eval[test] = {data_path}
booster.feature_names = None
booster.feature_types = None
booster.set_attr(best_iteration=None)
booster.set_attr(best_ntree_limit=None)
booster.save_model(model_out_py)
py_predt = booster.predict(data)

View File

@ -1,4 +1,4 @@
'''Tests for running inplace prediction.'''
"""Tests for running inplace prediction."""
from concurrent.futures import ThreadPoolExecutor
import numpy as np
@ -17,7 +17,7 @@ def run_threaded_predict(X, rows, predict_func):
per_thread = 20
with ThreadPoolExecutor(max_workers=10) as e:
for i in range(0, rows, int(rows / per_thread)):
if hasattr(X, 'iloc'):
if hasattr(X, "iloc"):
predictor = X.iloc[i : i + per_thread, :]
else:
predictor = X[i : i + per_thread, ...]
@ -61,13 +61,16 @@ def run_predict_leaf(predictor):
validate_leaf_output(leaf, num_parallel_tree)
ntree_limit = 2
n_iters = 2
sliced = booster.predict(
m, pred_leaf=True, ntree_limit=num_parallel_tree * ntree_limit, strict_shape=True
m,
pred_leaf=True,
iteration_range=(0, n_iters),
strict_shape=True,
)
first = sliced[0, ...]
assert np.prod(first.shape) == classes * num_parallel_tree * ntree_limit
assert np.prod(first.shape) == classes * num_parallel_tree * n_iters
# When there's only 1 tree, the output is a 1 dim vector
booster = xgb.train({"tree_method": "hist"}, num_boost_round=1, dtrain=m)
@ -77,11 +80,12 @@ def run_predict_leaf(predictor):
def test_predict_leaf():
run_predict_leaf('cpu_predictor')
run_predict_leaf("cpu_predictor")
def test_predict_shape():
from sklearn.datasets import fetch_california_housing
X, y = fetch_california_housing(return_X_y=True)
reg = xgb.XGBRegressor(n_estimators=1)
reg.fit(X, y)
@ -119,7 +123,8 @@ def test_predict_shape():
class TestInplacePredict:
'''Tests for running inplace prediction'''
"""Tests for running inplace prediction"""
@classmethod
def setup_class(cls):
cls.rows = 1000
@ -139,7 +144,7 @@ class TestInplacePredict:
cls.test = xgb.DMatrix(cls.X[:10, ...], missing=cls.missing)
cls.num_boost_round = 10
cls.booster = xgb.train({'tree_method': 'hist'}, dtrain, num_boost_round=10)
cls.booster = xgb.train({"tree_method": "hist"}, dtrain, num_boost_round=10)
def test_predict(self):
booster = self.booster
@ -162,28 +167,22 @@ class TestInplacePredict:
predt_from_array = booster.inplace_predict(
X[:10, ...], iteration_range=(0, 4), missing=self.missing
)
predt_from_dmatrix = booster.predict(test, ntree_limit=4)
predt_from_dmatrix = booster.predict(test, iteration_range=(0, 4))
np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)
with pytest.raises(ValueError):
booster.predict(test, ntree_limit=booster.best_ntree_limit + 1)
with pytest.raises(ValueError):
booster.predict(test, iteration_range=(0, booster.best_iteration + 2))
default = booster.predict(test)
range_full = booster.predict(test, iteration_range=(0, self.num_boost_round))
ntree_full = booster.predict(test, ntree_limit=self.num_boost_round)
np.testing.assert_allclose(range_full, default)
np.testing.assert_allclose(ntree_full, default)
range_full = booster.predict(
test, iteration_range=(0, booster.best_iteration + 1)
)
ntree_full = booster.predict(test, ntree_limit=booster.best_ntree_limit)
np.testing.assert_allclose(range_full, default)
np.testing.assert_allclose(ntree_full, default)
def predict_dense(x):
inplace_predt = booster.inplace_predict(x)
@ -251,6 +250,7 @@ class TestInplacePredict:
@pytest.mark.skipif(**tm.no_pandas())
def test_pd_dtypes(self) -> None:
from pandas.api.types import is_bool_dtype
for orig, x in pd_dtypes():
dtypes = orig.dtypes if isinstance(orig, pd.DataFrame) else [orig.dtypes]
if isinstance(orig, pd.DataFrame) and is_bool_dtype(dtypes[0]):

View File

@ -60,7 +60,7 @@ def test_ranking_with_weighted_data():
assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
for i in range(1, 11):
pred = bst.predict(dtrain, ntree_limit=i)
pred = bst.predict(dtrain, iteration_range=(0, i))
# is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
is_sorted = []
for k in range(0, 20, 5):

View File

@ -95,44 +95,39 @@ class TestTrainingContinuation:
res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
assert res1 == res2
gbdt_04 = xgb.train(xgb_params_02, dtrain_2class,
num_boost_round=3)
assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration +
1) * self.num_parallel_tree
gbdt_04 = xgb.train(xgb_params_02, dtrain_2class, num_boost_round=3)
res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
res2 = mean_squared_error(y_2class,
res2 = mean_squared_error(
y_2class,
gbdt_04.predict(
dtrain_2class,
ntree_limit=gbdt_04.best_ntree_limit))
dtrain_2class, iteration_range=(0, gbdt_04.best_iteration + 1)
)
)
assert res1 == res2
gbdt_04 = xgb.train(xgb_params_02, dtrain_2class,
num_boost_round=7, xgb_model=gbdt_04)
assert gbdt_04.best_ntree_limit == (
gbdt_04.best_iteration + 1) * self.num_parallel_tree
gbdt_04 = xgb.train(
xgb_params_02, dtrain_2class, num_boost_round=7, xgb_model=gbdt_04
)
res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
res2 = mean_squared_error(y_2class,
res2 = mean_squared_error(
y_2class,
gbdt_04.predict(
dtrain_2class,
ntree_limit=gbdt_04.best_ntree_limit))
dtrain_2class, iteration_range=(0, gbdt_04.best_iteration + 1)
)
)
assert res1 == res2
gbdt_05 = xgb.train(xgb_params_03, dtrain_5class,
num_boost_round=7)
assert gbdt_05.best_ntree_limit == (
gbdt_05.best_iteration + 1) * self.num_parallel_tree
gbdt_05 = xgb.train(xgb_params_03,
dtrain_5class,
num_boost_round=3,
xgb_model=gbdt_05)
assert gbdt_05.best_ntree_limit == (
gbdt_05.best_iteration + 1) * self.num_parallel_tree
res1 = gbdt_05.predict(dtrain_5class)
res2 = gbdt_05.predict(dtrain_5class,
ntree_limit=gbdt_05.best_ntree_limit)
res2 = gbdt_05.predict(
dtrain_5class, iteration_range=(0, gbdt_05.best_iteration + 1)
)
np.testing.assert_almost_equal(res1, res2)
@pytest.mark.skipif(**tm.no_sklearn())

View File

@ -77,6 +77,9 @@ class TestPandas:
np.testing.assert_array_equal(result, exp)
dm = xgb.DMatrix(dummies)
assert dm.feature_names == ['B', 'A_X', 'A_Y', 'A_Z']
if int(pd.__version__[0]) >= 2:
assert dm.feature_types == ['int', 'i', 'i', 'i']
else:
assert dm.feature_types == ['int', 'int', 'int', 'int']
assert dm.num_row() == 3
assert dm.num_col() == 4
@ -298,14 +301,14 @@ class TestPandas:
@pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
def test_nullable_type(self, DMatrixT) -> None:
from pandas.api.types import is_categorical
from pandas.api.types import is_categorical_dtype
for orig, df in pd_dtypes():
if hasattr(df.dtypes, "__iter__"):
enable_categorical = any(is_categorical for dtype in df.dtypes)
enable_categorical = any(is_categorical_dtype for dtype in df.dtypes)
else:
# series
enable_categorical = is_categorical(df.dtype)
enable_categorical = is_categorical_dtype(df.dtype)
f0_orig = orig[orig.columns[0]] if isinstance(orig, pd.DataFrame) else orig
f0 = df[df.columns[0]] if isinstance(df, pd.DataFrame) else df

View File

@ -13,9 +13,9 @@ except Exception:
pytestmark = pytest.mark.skipif(shap is None, reason="Requires shap package")
# Check integration is not broken from xgboost side
# Changes in binary format may cause problems
def test_with_shap():
# xgboost removed ntree_limit in 2.0, which breaks the SHAP package.
@pytest.mark.xfail
def test_with_shap() -> None:
from sklearn.datasets import fetch_california_housing
X, y = fetch_california_housing(return_X_y=True)

View File

@ -63,9 +63,15 @@ def test_multiclass_classification(objective):
assert xgb_model.get_booster().num_boosted_rounds() == 100
preds = xgb_model.predict(X[test_index])
# test other params in XGBClassifier().fit
preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
preds2 = xgb_model.predict(
X[test_index], output_margin=True, iteration_range=(0, 1)
)
preds3 = xgb_model.predict(
X[test_index], output_margin=True, iteration_range=None
)
preds4 = xgb_model.predict(
X[test_index], output_margin=False, iteration_range=(0, 1)
)
labels = y[test_index]
check_pred(preds, labels, output_margin=False)
@ -86,25 +92,21 @@ def test_multiclass_classification(objective):
assert proba.shape[1] == cls.n_classes_
def test_best_ntree_limit():
def test_best_iteration():
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
def train(booster, forest):
def train(booster: str, forest: Optional[int]) -> None:
rounds = 4
cls = xgb.XGBClassifier(
n_estimators=rounds, num_parallel_tree=forest, booster=booster
).fit(
X, y, eval_set=[(X, y)], early_stopping_rounds=3
)
assert cls.best_iteration == rounds - 1
if forest:
assert cls.best_ntree_limit == rounds * forest
else:
assert cls.best_ntree_limit == 0
# best_ntree_limit is used by default, assert that under gblinear it's
# best_iteration is used by default, assert that under gblinear it's
# automatically ignored due to being 0.
cls.predict(X)
@ -430,12 +432,15 @@ def test_regression():
preds = xgb_model.predict(X[test_index])
# test other params in XGBRegressor().fit
preds2 = xgb_model.predict(X[test_index], output_margin=True,
ntree_limit=3)
preds3 = xgb_model.predict(X[test_index], output_margin=True,
ntree_limit=0)
preds4 = xgb_model.predict(X[test_index], output_margin=False,
ntree_limit=3)
preds2 = xgb_model.predict(
X[test_index], output_margin=True, iteration_range=(0, 3)
)
preds3 = xgb_model.predict(
X[test_index], output_margin=True, iteration_range=None
)
preds4 = xgb_model.predict(
X[test_index], output_margin=False, iteration_range=(0, 3)
)
labels = y[test_index]
assert mean_squared_error(preds, labels) < 25

View File

@ -169,7 +169,7 @@ def reg_with_weight(
)
RegData = namedtuple("RegData", ("reg_df_train", "reg_df_test"))
RegData = namedtuple("RegData", ("reg_df_train", "reg_df_test", "reg_params"))
@pytest.fixture
@ -181,6 +181,13 @@ def reg_data(spark: SparkSession) -> Generator[RegData, None, None]:
predt0 = reg1.predict(X)
pred_contrib0: np.ndarray = pred_contribs(reg1, X, None, False)
reg_params = {
"max_depth": 5,
"n_estimators": 10,
"iteration_range": [0, 5],
"max_bin": 9,
}
# convert np array to pyspark dataframe
reg_df_train_data = [
(Vectors.dense(X[0, :]), int(y[0])),
@ -188,26 +195,34 @@ def reg_data(spark: SparkSession) -> Generator[RegData, None, None]:
]
reg_df_train = spark.createDataFrame(reg_df_train_data, ["features", "label"])
reg2 = xgb.XGBRegressor(max_depth=5, n_estimators=10)
reg2.fit(X, y)
predt2 = reg2.predict(X, iteration_range=[0, 5])
# array([0.22185266, 0.77814734], dtype=float32)
reg_df_test = spark.createDataFrame(
[
(
Vectors.dense(X[0, :]),
float(predt0[0]),
pred_contrib0[0, :].tolist(),
float(predt2[0]),
),
(
Vectors.sparse(3, {1: 1.0, 2: 5.5}),
float(predt0[1]),
pred_contrib0[1, :].tolist(),
float(predt2[1]),
),
],
[
"features",
"expected_prediction",
"expected_pred_contribs",
"expected_prediction_with_params",
],
)
yield RegData(reg_df_train, reg_df_test)
yield RegData(reg_df_train, reg_df_test, reg_params)
MultiClfData = namedtuple("MultiClfData", ("multi_clf_df_train", "multi_clf_df_test"))
@ -740,6 +755,76 @@ class TestPySparkLocal:
model = classifier.fit(clf_data.cls_df_train)
model.transform(clf_data.cls_df_test).collect()
def test_regressor_model_save_load(self, reg_data: RegData) -> None:
with tempfile.TemporaryDirectory() as tmpdir:
path = "file:" + tmpdir
regressor = SparkXGBRegressor(**reg_data.reg_params)
model = regressor.fit(reg_data.reg_df_train)
model.save(path)
loaded_model = SparkXGBRegressorModel.load(path)
assert model.uid == loaded_model.uid
for k, v in reg_data.reg_params.items():
assert loaded_model.getOrDefault(k) == v
pred_result = loaded_model.transform(reg_data.reg_df_test).collect()
for row in pred_result:
assert np.isclose(
row.prediction, row.expected_prediction_with_params, atol=1e-3
)
with pytest.raises(AssertionError, match="Expected class name"):
SparkXGBClassifierModel.load(path)
assert_model_compatible(model, tmpdir)
def test_regressor_with_params(self, reg_data: RegData) -> None:
regressor = SparkXGBRegressor(**reg_data.reg_params)
all_params = dict(
**(regressor._gen_xgb_params_dict()),
**(regressor._gen_fit_params_dict()),
**(regressor._gen_predict_params_dict()),
)
check_sub_dict_match(
reg_data.reg_params, all_params, excluding_keys=_non_booster_params
)
model = regressor.fit(reg_data.reg_df_train)
all_params = dict(
**(model._gen_xgb_params_dict()),
**(model._gen_fit_params_dict()),
**(model._gen_predict_params_dict()),
)
check_sub_dict_match(
reg_data.reg_params, all_params, excluding_keys=_non_booster_params
)
pred_result = model.transform(reg_data.reg_df_test).collect()
for row in pred_result:
assert np.isclose(
row.prediction, row.expected_prediction_with_params, atol=1e-3
)
def test_regressor_model_pipeline_save_load(self, reg_data: RegData) -> None:
with tempfile.TemporaryDirectory() as tmpdir:
path = "file:" + tmpdir
regressor = SparkXGBRegressor()
pipeline = Pipeline(stages=[regressor])
pipeline = pipeline.copy(
extra=get_params_map(reg_data.reg_params, regressor)
)
model = pipeline.fit(reg_data.reg_df_train)
model.save(path)
loaded_model = PipelineModel.load(path)
for k, v in reg_data.reg_params.items():
assert loaded_model.stages[0].getOrDefault(k) == v
pred_result = loaded_model.transform(reg_data.reg_df_test).collect()
for row in pred_result:
assert np.isclose(
row.prediction, row.expected_prediction_with_params, atol=1e-3
)
assert_model_compatible(model.stages[0], tmpdir)
class XgboostLocalTest(SparkTestCase):
def setUp(self):
@ -918,12 +1003,6 @@ class XgboostLocalTest(SparkTestCase):
def get_local_tmp_dir(self):
return self.tempdir + str(uuid.uuid4())
def assert_model_compatible(self, model: XGBModel, model_path: str):
bst = xgb.Booster()
path = glob.glob(f"{model_path}/**/model/part-00000", recursive=True)[0]
bst.load_model(path)
self.assertEqual(model.get_booster().save_raw("json"), bst.save_raw("json"))
def test_convert_to_sklearn_model_reg(self) -> None:
regressor = SparkXGBRegressor(
n_estimators=200, missing=2.0, max_depth=3, sketch_eps=0.5
@ -1007,80 +1086,6 @@ class XgboostLocalTest(SparkTestCase):
== "float64"
)
def test_regressor_with_params(self):
regressor = SparkXGBRegressor(**self.reg_params)
all_params = dict(
**(regressor._gen_xgb_params_dict()),
**(regressor._gen_fit_params_dict()),
**(regressor._gen_predict_params_dict()),
)
check_sub_dict_match(
self.reg_params, all_params, excluding_keys=_non_booster_params
)
model = regressor.fit(self.reg_df_train)
all_params = dict(
**(model._gen_xgb_params_dict()),
**(model._gen_fit_params_dict()),
**(model._gen_predict_params_dict()),
)
check_sub_dict_match(
self.reg_params, all_params, excluding_keys=_non_booster_params
)
pred_result = model.transform(self.reg_df_test).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction, row.expected_prediction_with_params, atol=1e-3
)
)
def test_regressor_model_save_load(self):
tmp_dir = self.get_local_tmp_dir()
path = "file:" + tmp_dir
regressor = SparkXGBRegressor(**self.reg_params)
model = regressor.fit(self.reg_df_train)
model.save(path)
loaded_model = SparkXGBRegressorModel.load(path)
self.assertEqual(model.uid, loaded_model.uid)
for k, v in self.reg_params.items():
self.assertEqual(loaded_model.getOrDefault(k), v)
pred_result = loaded_model.transform(self.reg_df_test).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction, row.expected_prediction_with_params, atol=1e-3
)
)
with self.assertRaisesRegex(AssertionError, "Expected class name"):
SparkXGBClassifierModel.load(path)
self.assert_model_compatible(model, tmp_dir)
def test_regressor_model_pipeline_save_load(self):
tmp_dir = self.get_local_tmp_dir()
path = "file:" + tmp_dir
regressor = SparkXGBRegressor()
pipeline = Pipeline(stages=[regressor])
pipeline = pipeline.copy(extra=get_params_map(self.reg_params, regressor))
model = pipeline.fit(self.reg_df_train)
model.save(path)
loaded_model = PipelineModel.load(path)
for k, v in self.reg_params.items():
self.assertEqual(loaded_model.stages[0].getOrDefault(k), v)
pred_result = loaded_model.transform(self.reg_df_test).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction, row.expected_prediction_with_params, atol=1e-3
)
)
self.assert_model_compatible(model.stages[0], tmp_dir)
def test_callbacks(self):
from xgboost.callback import LearningRateScheduler

View File

@ -1,16 +1,24 @@
import json
import logging
import os
import random
import tempfile
import uuid
from collections import namedtuple
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
from xgboost.callback import LearningRateScheduler
pytestmark = pytest.mark.skipif(**tm.no_spark())
from typing import Generator
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
from xgboost.spark.utils import _get_max_num_concurrent_tasks
@ -18,43 +26,70 @@ from xgboost.spark.utils import _get_max_num_concurrent_tasks
from .utils import SparkLocalClusterTestCase
class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
def setUp(self):
random.seed(2020)
@pytest.fixture
def spark() -> Generator[SparkSession, None, None]:
config = {
"spark.master": "local-cluster[2, 2, 1024]",
"spark.python.worker.reuse": "false",
"spark.driver.host": "127.0.0.1",
"spark.task.maxFailures": "1",
"spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
"spark.sql.pyspark.jvmStacktrace.enabled": "true",
"spark.cores.max": "4",
"spark.task.cpus": "1",
"spark.executor.cores": "2",
}
self.n_workers = _get_max_num_concurrent_tasks(self.session.sparkContext)
# The following code use xgboost python library to train xgb model and predict.
#
# >>> import numpy as np
# >>> import xgboost
# >>> X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
# >>> y = np.array([0, 1])
# >>> reg1 = xgboost.XGBRegressor()
# >>> reg1.fit(X, y)
# >>> reg1.predict(X)
# array([8.8363886e-04, 9.9911636e-01], dtype=float32)
# >>> def custom_lr(boosting_round, num_boost_round):
# ... return 1.0 / (boosting_round + 1)
# ...
# >>> reg1.fit(X, y, callbacks=[xgboost.callback.reset_learning_rate(custom_lr)])
# >>> reg1.predict(X)
builder = SparkSession.builder.appName("XGBoost PySpark Python API Tests")
for k, v in config.items():
builder.config(k, v)
logging.getLogger("pyspark").setLevel(logging.INFO)
sess = builder.getOrCreate()
yield sess
sess.stop()
sess.sparkContext.stop()
RegData = namedtuple("RegData", ("reg_df_train", "reg_df_test", "reg_params"))
@pytest.fixture
def reg_data(spark: SparkSession) -> Generator[RegData, None, None]:
reg_params = {"max_depth": 5, "n_estimators": 10, "iteration_range": (0, 5)}
X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
y = np.array([0, 1])
def custom_lr(boosting_round):
return 1.0 / (boosting_round + 1)
reg1 = xgb.XGBRegressor(callbacks=[LearningRateScheduler(custom_lr)])
reg1.fit(X, y)
predt1 = reg1.predict(X)
# array([0.02406833, 0.97593164], dtype=float32)
# >>> reg2 = xgboost.XGBRegressor(max_depth=5, n_estimators=10)
# >>> reg2.fit(X, y)
# >>> reg2.predict(X, ntree_limit=5)
reg2 = xgb.XGBRegressor(max_depth=5, n_estimators=10)
reg2.fit(X, y)
predt2 = reg2.predict(X, iteration_range=(0, 5))
# array([0.22185263, 0.77814734], dtype=float32)
self.reg_params = {"max_depth": 5, "n_estimators": 10, "ntree_limit": 5}
self.reg_df_train = self.session.createDataFrame(
reg_df_train = spark.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
],
["features", "label"],
)
self.reg_df_test = self.session.createDataFrame(
reg_df_test = spark.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0.0, 0.2219, 0.02406),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1.0, 0.7781, 0.9759),
(Vectors.dense(1.0, 2.0, 3.0), 0.0, float(predt2[0]), float(predt1[0])),
(
Vectors.sparse(3, {1: 1.0, 2: 5.5}),
1.0,
float(predt2[1]),
float(predt1[1]),
),
],
[
"features",
@ -63,6 +98,47 @@ class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
"expected_prediction_with_callbacks",
],
)
yield RegData(reg_df_train, reg_df_test, reg_params)
class TestPySparkLocalCluster:
def test_regressor_basic_with_params(self, reg_data: RegData) -> None:
regressor = SparkXGBRegressor(**reg_data.reg_params)
model = regressor.fit(reg_data.reg_df_train)
pred_result = model.transform(reg_data.reg_df_test).collect()
for row in pred_result:
assert np.isclose(
row.prediction, row.expected_prediction_with_params, atol=1e-3
)
def test_callbacks(self, reg_data: RegData) -> None:
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, str(uuid.uuid4()))
def custom_lr(boosting_round):
return 1.0 / (boosting_round + 1)
cb = [LearningRateScheduler(custom_lr)]
regressor = SparkXGBRegressor(callbacks=cb)
# Test the save/load of the estimator instead of the model, since
# the callbacks param only exists in the estimator but not in the model
regressor.save(path)
regressor = SparkXGBRegressor.load(path)
model = regressor.fit(reg_data.reg_df_train)
pred_result = model.transform(reg_data.reg_df_test).collect()
for row in pred_result:
assert np.isclose(
row.prediction, row.expected_prediction_with_callbacks, atol=1e-3
)
class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
def setUp(self):
random.seed(2020)
self.n_workers = _get_max_num_concurrent_tasks(self.session.sparkContext)
# Distributed section
# Binary classification
@ -218,42 +294,6 @@ class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
self.reg_best_score_eval = 5.239e-05
self.reg_best_score_weight_and_eval = 4.850e-05
def test_regressor_basic_with_params(self):
regressor = SparkXGBRegressor(**self.reg_params)
model = regressor.fit(self.reg_df_train)
pred_result = model.transform(self.reg_df_test).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction, row.expected_prediction_with_params, atol=1e-3
)
)
def test_callbacks(self):
from xgboost.callback import LearningRateScheduler
path = os.path.join(self.tempdir, str(uuid.uuid4()))
def custom_learning_rate(boosting_round):
return 1.0 / (boosting_round + 1)
cb = [LearningRateScheduler(custom_learning_rate)]
regressor = SparkXGBRegressor(callbacks=cb)
# Test the save/load of the estimator instead of the model, since
# the callbacks param only exists in the estimator but not in the model
regressor.save(path)
regressor = SparkXGBRegressor.load(path)
model = regressor.fit(self.reg_df_train)
pred_result = model.transform(self.reg_df_test).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction, row.expected_prediction_with_callbacks, atol=1e-3
)
)
def test_classifier_distributed_basic(self):
classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=100)
model = classifier.fit(self.cls_df_train_distributed)
@ -409,7 +449,6 @@ class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
pred_result = model.transform(
self.cls_df_test_distributed_lower_estimators
).collect()
print(pred_result)
for row in pred_result:
self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
self.assertTrue(