merge latest changes

This commit is contained in:
Hui Liu
2024-01-24 13:30:08 -08:00
83 changed files with 1408 additions and 1273 deletions

View File

@@ -4,15 +4,13 @@ facilities.
# Directories
* ci_build: Test facilities for Jenkins CI and GitHub Actions.
* cli: Basic test for command line executable `xgboost`. Most of the other command line
specific tests are in Python test `test_cli.py`
specific tests are in Python test `test_cli.py`.
* cpp: Tests for C++ core, using Google test framework.
* python: Tests for Python package, demonstrations and CLI. To set up the
dependencies for tests, see the conda files in `ci_build`.
* python-gpu: Similar to python tests, but for GPU.
* travis: CI facilities for Travis.
* distributed: Test for distributed system.
* benchmark: Legacy benchmark code. There are a number of benchmark projects for
XGBoost with much better configurations.
* test_distributed: Test for distributed systems including spark and dask.
# Others
* pytest.ini: Describes the `pytest` marker for python tests, some markers are generated

View File

@@ -1,69 +0,0 @@
#pylint: skip-file
import argparse
import xgboost as xgb
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import time
import ast
rng = np.random.RandomState(1994)
def run_benchmark(args):
    """Time training of the linear booster (``gblinear``).

    ``dtrain.dm`` / ``dtest.dm`` in the working directory are reused when
    their column and row counts match the request. Otherwise a synthetic
    classification dataset is generated, converted to ``DMatrix`` objects
    and saved under those names for the next run.

    Parameters
    ----------
    args : argparse.Namespace
        Reads ``rows``, ``columns``, ``test_size``, ``sparsity``, ``params``,
        ``updater`` and ``iterations``.
    """
    try:
        dtest = xgb.DMatrix('dtest.dm')
        dtrain = xgb.DMatrix('dtrain.dm')

        if not (dtest.num_col() == args.columns
                and dtrain.num_col() == args.columns):
            raise ValueError("Wrong cols")
        # NOTE(review): the expected counts are float products, so this only
        # matches when they are whole numbers - confirm that is intended.
        if not (dtest.num_row() == args.rows * args.test_size
                and dtrain.num_row() == args.rows * (1 - args.test_size)):
            raise ValueError("Wrong rows")
    except Exception:
        # A missing, unreadable or mismatching cache falls through to
        # regeneration.  ``Exception`` (rather than a bare ``except``) lets
        # Ctrl-C and ``SystemExit`` abort instead of starting a long rebuild.
        print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
        print("{}/{} test/train split".format(args.test_size, 1.0 - args.test_size))
        tmp = time.time()
        X, y = make_classification(args.rows, n_features=args.columns, n_redundant=0,
                                   n_informative=args.columns, n_repeated=0,
                                   random_state=7)
        if 0.0 < args.sparsity < 1.0:
            # A sparsity <= 0 can never blank a cell, so the pass is skipped
            # entirely; otherwise draw one uniform number per cell in a
            # single vectorised call instead of a Python loop.
            missing = rng.uniform(0, 1, size=X.shape) < args.sparsity
            X = np.where(missing, np.nan, X)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=args.test_size, random_state=7)
        print("Generate Time: %s seconds" % (str(time.time() - tmp)))

        tmp = time.time()
        print("DMatrix Start")
        dtrain = xgb.DMatrix(X_train, y_train)
        dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
        print("DMatrix Time: %s seconds" % (str(time.time() - tmp)))

        # Cache both matrices so the next run can skip generation.
        dtest.save_binary('dtest.dm')
        dtrain.save_binary('dtrain.dm')

    param = {'objective': 'binary:logistic', 'booster': 'gblinear'}
    if args.params != '':
        # literal_eval only accepts Python literals, never arbitrary code.
        param.update(ast.literal_eval(args.params))

    # The command-line updater always overrides one given through --params.
    param['updater'] = args.updater
    print("Training with '%s'" % param['updater'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations, evals=[(dtrain, "train")],
              early_stopping_rounds=args.columns)
    print("Train Time: %s seconds" % (str(time.time() - tmp)))
def _parse_bool(text):
    """Turn a command-line string into a bool.

    ``type=bool`` would call ``bool(text)``, which is True for every
    non-empty string - including "False" and "0".  Here the usual negative
    spellings (and the empty string) map to False; anything else stays True,
    as it was before.
    """
    return text.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n', 'off')


# Command-line interface of the linear-booster benchmark.  The script has no
# ``__main__`` guard: parsing and the benchmark run happen at module level.
parser = argparse.ArgumentParser()
parser.add_argument('--updater', default='coord_descent')
parser.add_argument('--sparsity', type=float, default=0.0)
# NOTE: --lambda, --tol, --alpha and --standardise are parsed but are not
# read by run_benchmark in this file.
parser.add_argument('--lambda', type=float, default=1.0)
parser.add_argument('--tol', type=float, default=1e-5)
parser.add_argument('--alpha', type=float, default=1.0)
parser.add_argument('--rows', type=int, default=1000000)
parser.add_argument('--iterations', type=int, default=10000)
parser.add_argument('--columns', type=int, default=50)
parser.add_argument('--test_size', type=float, default=0.25)
parser.add_argument('--standardise', type=_parse_bool, default=False)
parser.add_argument('--params', default='',
                    help='Provide additional parameters as a Python dict string, '
                         'e.g. --params \"{\'max_depth\':2}\"')
args = parser.parse_args()

run_benchmark(args)

View File

@@ -1,86 +0,0 @@
"""Run benchmark on the tree booster."""
import argparse
import ast
import time
import numpy as np
import xgboost as xgb
RNG = np.random.RandomState(1994)
def run_benchmark(args):
    """Time training of the tree booster.

    ``dtrain.dm`` / ``dtest.dm`` in the working directory are reused when
    their shapes match the request.  Otherwise a random dataset is generated,
    split into train/test rows, converted to ``DMatrix`` objects and saved
    under those names for the next run.

    Parameters
    ----------
    args : argparse.Namespace
        Reads ``rows``, ``columns``, ``test_size``, ``sparsity``, ``params``,
        ``tree_method`` and ``iterations``.
    """
    # Row counts are truncated once and shared by cache validation and data
    # generation.  Validating against the raw float products would reject a
    # perfectly good cache whenever the split is not a whole number of rows.
    train_rows = int(args.rows * (1.0 - args.test_size))
    test_rows = int(args.rows * args.test_size)

    try:
        dtest = xgb.DMatrix('dtest.dm')
        dtrain = xgb.DMatrix('dtrain.dm')

        if not (dtest.num_col() == args.columns
                and dtrain.num_col() == args.columns):
            raise ValueError("Wrong cols")
        if not (dtest.num_row() == test_rows
                and dtrain.num_row() == train_rows):
            raise ValueError("Wrong rows")
    except Exception:
        # A missing, unreadable or mismatching cache falls through to
        # regeneration.  ``Exception`` (rather than a bare ``except``) lets
        # Ctrl-C and ``SystemExit`` abort instead of starting a long rebuild.
        print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
        print("{}/{} test/train split".format(args.test_size, 1.0 - args.test_size))
        tmp = time.time()
        X = RNG.rand(args.rows, args.columns)
        y = RNG.randint(0, 2, args.rows)
        if 0.0 < args.sparsity < 1.0:
            # One uniform draw per cell in a single vectorised call instead
            # of a Python-level loop over rows * columns values.
            missing = RNG.uniform(0, 1, size=X.shape) < args.sparsity
            X = np.where(missing, np.nan, X)

        # Leading rows train, trailing rows test.
        X_train = X[:train_rows, :]
        X_test = X[-test_rows:, :]
        y_train = y[:train_rows]
        y_test = y[-test_rows:]
        print("Generate Time: %s seconds" % (str(time.time() - tmp)))
        del X, y

        tmp = time.time()
        print("DMatrix Start")
        dtrain = xgb.DMatrix(X_train, y_train, nthread=-1)
        dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
        print("DMatrix Time: %s seconds" % (str(time.time() - tmp)))
        del X_train, y_train, X_test, y_test

        # Cache both matrices so the next run can skip generation.
        dtest.save_binary('dtest.dm')
        dtrain.save_binary('dtrain.dm')

    param = {'objective': 'binary:logistic'}
    if args.params != '':
        # literal_eval only accepts Python literals, never arbitrary code.
        param.update(ast.literal_eval(args.params))

    # The command-line tree method always overrides one given via --params.
    param['tree_method'] = args.tree_method
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
    print("Train Time: %s seconds" % (str(time.time() - tmp)))
def main():
    """Command-line entry point of the tree-booster benchmark.

    Declares the supported options, parses the process arguments and passes
    the resulting namespace to ``run_benchmark``.
    """
    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        ('--tree_method', {'default': 'gpu_hist'}),
        ('--sparsity', {'type': float, 'default': 0.0}),
        ('--rows', {'type': int, 'default': 1000000}),
        ('--columns', {'type': int, 'default': 50}),
        ('--iterations', {'type': int, 'default': 500}),
        ('--test_size', {'type': float, 'default': 0.25}),
        ('--params', {
            'default': '',
            'help': 'Provide additional parameters as a Python dict string, e.g. --params '
                    '\"{\'max_depth\':2}\"',
        }),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    run_benchmark(cli.parse_args())


if __name__ == '__main__':
    main()

View File

@@ -1,87 +0,0 @@
"""Generate synthetic data in LIBSVM format."""
import argparse
import io
import time
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
RNG = np.random.RandomState(2019)
def generate_data(args):
    """Create a synthetic classification set and dump it as LIBSVM files.

    Reads ``rows``, ``columns``, ``sparsity`` and ``test_size`` from `args`,
    prints the configuration and the duration of every stage, and writes
    ``train.libsvm`` and ``test.libsvm`` into the working directory.
    """
    print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
    print("Sparsity {}".format(args.sparsity))
    print("{}/{} train/test split".format(1.0 - args.test_size, args.test_size))

    started = time.time()
    # Column budget: 7/10 informative, 1/10 redundant, 1/10 repeated
    # (integer division).
    informative = args.columns * 7 // 10
    redundant = args.columns // 10
    repeated = args.columns // 10
    print("n_informative: {}, n_redundant: {}, n_repeated: {}".format(informative, redundant,
                                                                      repeated))
    features, labels = make_classification(
        n_samples=args.rows,
        n_features=args.columns,
        n_informative=informative,
        n_redundant=redundant,
        n_repeated=repeated,
        shuffle=False,
        random_state=RNG,
    )
    print("Generate Time: {} seconds".format(time.time() - started))

    started = time.time()
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=args.test_size, random_state=RNG, shuffle=False)
    print("Train/Test Split Time: {} seconds".format(time.time() - started))

    # Train file first, then test file, each timed on its own.
    outputs = (
        ('train.libsvm', x_train, y_train, "Write Train Time: {} seconds"),
        ('test.libsvm', x_test, y_test, "Write Test Time: {} seconds"),
    )
    for path, rows, targets, report in outputs:
        started = time.time()
        write_file(path, rows, targets, args.sparsity)
        print(report.format(time.time() - started))
def write_file(filename, x_data, y_data, sparsity):
    """Write one LIBSVM line per (row, label) pair to `filename`.

    The file is opened in text write mode.  Rows and labels are paired with
    ``zip``, so output stops at the shorter of the two sequences.
    """
    with open(filename, 'w') as out:
        for row, label in zip(x_data, y_data):
            write_line(out, row, label, sparsity)
def write_line(f, x, y, sparsity):
    """Format one sample as a LIBSVM line and write it to `f`.

    The line starts with ``str(y)``, followed by one ``index:value`` entry per
    retained feature and a trailing newline.  When `sparsity` lies strictly
    between 0 and 1, a feature is retained only if a fresh uniform draw
    exceeds `sparsity`; for any other value every feature is written.
    """
    subsample = 0.0 < sparsity < 1.0
    with io.StringIO() as buf:
        buf.write(str(y))
        for idx, value in enumerate(x):
            # Short-circuit: the generator is only consulted when sub-sampling.
            if not subsample or RNG.uniform(0, 1) > sparsity:
                write_feature(buf, idx, value)
        buf.write('\n')
        # Hand the finished line to the target in a single write.
        f.write(buf.getvalue())
def write_feature(line, index, feature):
    """Append `` index:feature`` (with a leading space) to the buffer `line`."""
    line.writelines((' ', str(index), ':', str(feature)))
def main():
    """Command-line entry point of the LIBSVM data generator.

    Declares the supported options, parses the process arguments and passes
    the resulting namespace to ``generate_data``.
    """
    # (flag, value type, default), registered in this order.
    option_table = (
        ('--rows', int, 1000000),
        ('--columns', int, 50),
        ('--sparsity', float, 0.0),
        ('--test_size', float, 0.01),
    )

    cli = argparse.ArgumentParser()
    for flag, value_type, fallback in option_table:
        cli.add_argument(flag, type=value_type, default=fallback)

    generate_data(cli.parse_args())


if __name__ == '__main__':
    main()

View File

@@ -8,13 +8,18 @@ echo "--- Build XGBoost JVM packages scala 2.12"
tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION}
echo "--- Stash XGBoost4J JARs (Scala 2.12)"
buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar"
echo "--- Build XGBoost JVM packages scala 2.13"
tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION} "" "" "true"
echo "--- Stash XGBoost4J JARs"
echo "--- Stash XGBoost4J JARs (Scala 2.13)"
buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"

View File

@@ -1,21 +0,0 @@
# Buildkite step: build the XGBoost R package with CUDA on Windows and, on
# release branches, upload the resulting tarball to S3.

# Treat PowerShell errors as terminating so the step stops at the first one.
$ErrorActionPreference = "Stop"

# Dot-source the shared Buildkite settings into this scope.
# NOTE(review): $is_release_branch (used below) is presumably defined there - confirm.
. tests/buildkite/conftest.ps1

Write-Host "--- Build XGBoost R package with CUDA"
# Print the CUDA compiler version into the build log.
nvcc --version
# NOTE(review): $arch_flag is assigned here but not referenced again in this script.
$arch_flag = "-DGPU_COMPUTE_VER=75"
# The build itself is done by the bash script, given the commit being built.
bash tests/ci_build/build_r_pkg_with_cuda_win64.sh $Env:BUILDKITE_COMMIT
# Exit codes of native commands are checked by hand after each call.
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }

# Upload every matching tarball to the nightly-builds bucket under the branch
# name, publicly readable; only done when $is_release_branch equals 1.
if ( $is_release_branch -eq 1 ) {
Write-Host "--- Upload R tarball"
Get-ChildItem . -Filter xgboost_r_gpu_win64_*.tar.gz |
Foreach-Object {
& aws s3 cp $_ s3://xgboost-nightly-builds/$Env:BUILDKITE_BRANCH/ `
--acl public-read --no-progress
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
}
}

View File

@@ -13,11 +13,6 @@ steps:
key: build-win64-gpu
agents:
queue: windows-cpu
- label: ":windows: Build XGBoost R package for Windows with CUDA"
command: "tests/buildkite/build-rpkg-win64-gpu.ps1"
key: build-rpkg-win64-gpu
agents:
queue: windows-cpu
- wait

View File

@@ -24,12 +24,13 @@ if [ "x$gpu_arch" != "x" ]; then
export GPU_ARCH_FLAG=$gpu_arch
fi
mvn_profile_string=""
if [ "x$use_scala213" != "x" ]; then
export mvn_profile_string="-Pdefault,scala-2.13"
cd ..
python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
cd jvm-packages
fi
mvn --no-transfer-progress package $mvn_profile_string -Dspark.version=${spark_version} $gpu_options
mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options
set +x
set +e

View File

@@ -1,36 +0,0 @@
#!/bin/bash
# Build the XGBoost R package with CUDA support on Windows and pack it as
# xgboost_r_gpu_win64_<commit hash>.tar.gz in the current directory.
#
# Usage: <this script> <commit hash>

# Abort on the first failing command and echo every command as it runs.
set -e
set -x
# Exactly one argument (the commit hash) is required.
if [ "$#" -ne 1 ]
then
echo "Build the R package tarball with CUDA code. Usage: $0 [commit hash]"
exit 1
fi
commit_hash="$1"
# Clear all positional args
# (presumably so that the script sourced on the next line does not see them)
set --
# NOTE(review): presumably activates the default conda environment - confirm.
source activate
# Pack the R package sources; the result is expected in xgboost/, which is
# moved aside so the native library can be built separately.
python tests/ci_build/test_r_package.py --task=pack
mv xgboost/ xgboost_rpack/
# Configure and build with CMake (Visual Studio 2022, x64, CUDA and the R
# library enabled); the R and Rtools locations are hard-coded.
mkdir build
cd build
cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DR_LIB=ON -DLIBR_HOME="c:\\Program Files\\R\\R-4.3.2" -DCMAKE_PREFIX_PATH="C:\\rtools43\\x86_64-w64-mingw32.static.posix\\bin"
cmake --build . --config Release --parallel
cd ..
# This super wacky hack is found in cmake/RPackageInstall.cmake.in and
# cmake/RPackageInstallTargetSetup.cmake. This hack lets us bypass the normal build process of R
# and have R use xgboost.dll that we've already built.
# Concretely: drop the configure script and the sources, put the prebuilt DLL
# into src/, and leave Makefiles whose only target ("all:") has no recipe.
rm -v xgboost_rpack/configure
rm -rfv xgboost_rpack/src
mkdir -p xgboost_rpack/src
cp -v lib/xgboost.dll xgboost_rpack/src/
echo 'all:' > xgboost_rpack/src/Makefile
echo 'all:' > xgboost_rpack/src/Makefile.win
# Restore the package directory name, then archive and compress it (gzip -9)
# with the tar/gzip binaries from Rtools.
mv xgboost_rpack/ xgboost/
/c/Rtools43/usr/bin/tar -cvf xgboost_r_gpu_win64_${commit_hash}.tar xgboost/
/c/Rtools43/usr/bin/gzip -9c xgboost_r_gpu_win64_${commit_hash}.tar > xgboost_r_gpu_win64_${commit_hash}.tar.gz

View File

@@ -27,7 +27,10 @@ rm -rf ../build/
# Deploy to S3 bucket xgboost-maven-repo
mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests
# Deploy scala 2.13 to S3 bucket xgboost-maven-repo
mvn --no-transfer-progress package deploy -P release-to-s3,default,scala-2.13 -Dspark.version=${spark_version} -DskipTests
cd ..
python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
cd jvm-packages/
mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests
set +x

View File

@@ -18,14 +18,17 @@ class LintersPaths:
"python-package/",
# tests
"tests/python/test_config.py",
"tests/python/test_callback.py",
"tests/python/test_data_iterator.py",
"tests/python/test_dmatrix.py",
"tests/python/test_dt.py",
"tests/python/test_demos.py",
"tests/python/test_eval_metrics.py",
"tests/python/test_multi_target.py",
"tests/python/test_predict.py",
"tests/python/test_quantile_dmatrix.py",
"tests/python/test_tree_regularization.py",
"tests/python/test_training_continuation.py",
"tests/python/test_shap.py",
"tests/python/test_model_io.py",
"tests/python/test_with_pandas.py",
@@ -39,12 +42,15 @@ class LintersPaths:
"demo/dask/",
"demo/rmm_plugin",
"demo/json-model/json_parser.py",
"demo/guide-python/continuation.py",
"demo/guide-python/cat_in_the_dat.py",
"demo/guide-python/callbacks.py",
"demo/guide-python/categorical.py",
"demo/guide-python/cat_pipeline.py",
"demo/guide-python/feature_weights.py",
"demo/guide-python/sklearn_parallel.py",
"demo/guide-python/sklearn_examples.py",
"demo/guide-python/sklearn_evals_result.py",
"demo/guide-python/spark_estimator_examples.py",
"demo/guide-python/external_memory.py",
"demo/guide-python/individual_trees.py",
@@ -86,6 +92,7 @@ class LintersPaths:
"tests/python/test_multi_target.py",
"tests/python-gpu/test_gpu_data_iterator.py",
"tests/python-gpu/load_pickle.py",
"tests/python-gpu/test_gpu_training_continuation.py",
"tests/python/test_model_io.py",
"tests/test_distributed/test_with_spark/test_data.py",
"tests/test_distributed/test_gpu_with_spark/test_data.py",
@@ -93,6 +100,7 @@ class LintersPaths:
# demo
"demo/json-model/json_parser.py",
"demo/guide-python/external_memory.py",
"demo/guide-python/continuation.py",
"demo/guide-python/callbacks.py",
"demo/guide-python/cat_in_the_dat.py",
"demo/guide-python/categorical.py",

View File

@@ -20,10 +20,11 @@ if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
cd $jvm_packages_dir
fi
# including maven profiles for different scala versions: 2.12 is the default at the moment.
for _maven_profile_string in "" "-Pdefault,scala-2.13"; do
scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout)
scala_binary_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.binary.version -q -DforceStdout)
for scala_binary_version in "2.12" "2.13"; do
cd ..
python dev/change_scala_version.py --scala-version ${scala_binary_version}
cd jvm-packages
scala_version=$(mvn help:evaluate -Dexpression=scala.version -q -DforceStdout)
# Install XGBoost4J JAR into local Maven repository
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar

View File

@@ -253,6 +253,5 @@ void TestColumnSplit(bst_target_t n_targets) {
TEST(QuantileHist, ColumnSplit) { TestColumnSplit(1); }
TEST(QuantileHist, ColumnSplitMultiTarget) { TestColumnSplit(3); }
TEST(QuantileHist, DISABLED_ColumnSplitMultiTarget) { TestColumnSplit(3); }
} // namespace xgboost::tree

View File

@@ -1,18 +1,21 @@
/**
* Copyright 2020-2023 by XGBoost Contributors
* Copyright 2020-2024, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/context.h> // for Context
#include <xgboost/task.h> // for ObjInfo
#include <xgboost/tree_model.h>
#include <xgboost/tree_updater.h>
#include <xgboost/context.h> // for Context
#include <xgboost/task.h> // for ObjInfo
#include <xgboost/tree_model.h> // for RegTree
#include <xgboost/tree_updater.h> // for TreeUpdater
#include <memory> // for unique_ptr
#include <memory> // for unique_ptr
#include "../../../src/tree/param.h" // for TrainParam
#include "../helpers.h"
namespace xgboost {
/**
* @brief Test the tree statistic (like sum Hessian) is correct.
*/
class UpdaterTreeStatTest : public ::testing::Test {
protected:
std::shared_ptr<DMatrix> p_dmat_;
@@ -28,13 +31,12 @@ class UpdaterTreeStatTest : public ::testing::Test {
gpairs_.Data()->Copy(g);
}
void RunTest(std::string updater) {
void RunTest(Context const* ctx, std::string updater) {
tree::TrainParam param;
ObjInfo task{ObjInfo::kRegression};
param.Init(Args{});
Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal()));
auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, ctx, &task)};
up->Configure(Args{});
RegTree tree{1u, kCols};
std::vector<HostDeviceVector<bst_node_t>> position(1);
@@ -51,77 +53,136 @@ class UpdaterTreeStatTest : public ::testing::Test {
};
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
TEST_F(UpdaterTreeStatTest, GpuHist) { this->RunTest("grow_gpu_hist"); }
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
TEST_F(UpdaterTreeStatTest, GpuHist) {
auto ctx = MakeCUDACtx(0);
this->RunTest(&ctx, "grow_gpu_hist");
}
TEST_F(UpdaterTreeStatTest, Hist) { this->RunTest("grow_quantile_histmaker"); }
TEST_F(UpdaterTreeStatTest, GpuApprox) {
auto ctx = MakeCUDACtx(0);
this->RunTest(&ctx, "grow_gpu_approx");
}
#endif // defined(XGBOOST_USE_CUDA)
TEST_F(UpdaterTreeStatTest, Exact) { this->RunTest("grow_colmaker"); }
TEST_F(UpdaterTreeStatTest, Hist) {
Context ctx;
this->RunTest(&ctx, "grow_quantile_histmaker");
}
TEST_F(UpdaterTreeStatTest, Approx) { this->RunTest("grow_histmaker"); }
TEST_F(UpdaterTreeStatTest, Exact) {
Context ctx;
this->RunTest(&ctx, "grow_colmaker");
}
class UpdaterEtaTest : public ::testing::Test {
TEST_F(UpdaterTreeStatTest, Approx) {
Context ctx;
this->RunTest(&ctx, "grow_histmaker");
}
/**
* @brief Test changing learning rate doesn't change internal splits.
*/
class TestSplitWithEta : public ::testing::Test {
protected:
std::shared_ptr<DMatrix> p_dmat_;
linalg::Matrix<GradientPair> gpairs_;
size_t constexpr static kRows = 10;
size_t constexpr static kCols = 10;
size_t constexpr static kClasses = 10;
void Run(Context const* ctx, bst_target_t n_targets, std::string name) {
auto Xy = RandomDataGenerator{512, 64, 0.2}.Targets(n_targets).GenerateDMatrix(true);
void SetUp() override {
p_dmat_ = RandomDataGenerator(kRows, kCols, .5f).GenerateDMatrix(true, false, kClasses);
auto g = GenerateRandomGradients(kRows);
gpairs_.Reshape(kRows, 1);
gpairs_.Data()->Copy(g);
}
auto gen_tree = [&](float eta) {
auto tree =
std::make_unique<RegTree>(n_targets, static_cast<bst_feature_t>(Xy->Info().num_col_));
std::vector<RegTree*> trees{tree.get()};
ObjInfo task{ObjInfo::kRegression};
std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create(name, ctx, &task)};
updater->Configure({});
void RunTest(std::string updater) {
ObjInfo task{ObjInfo::kClassification};
auto grad = GenerateRandomGradients(ctx, Xy->Info().num_row_, n_targets);
CHECK_EQ(grad.Shape(1), n_targets);
tree::TrainParam param;
param.Init(Args{{"learning_rate", std::to_string(eta)}});
HostDeviceVector<bst_node_t> position;
Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal()));
float eta = 0.4;
auto up_0 = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
up_0->Configure(Args{});
tree::TrainParam param0;
param0.Init(Args{{"eta", std::to_string(eta)}});
auto up_1 = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
up_1->Configure(Args{{"eta", "1.0"}});
tree::TrainParam param1;
param1.Init(Args{{"eta", "1.0"}});
for (size_t iter = 0; iter < 4; ++iter) {
RegTree tree_0{1u, kCols};
{
std::vector<HostDeviceVector<bst_node_t>> position(1);
up_0->Update(&param0, &gpairs_, p_dmat_.get(), position, {&tree_0});
updater->Update(&param, &grad, Xy.get(), common::Span{&position, 1}, trees);
CHECK_EQ(tree->NumTargets(), n_targets);
if (n_targets > 1) {
CHECK(tree->IsMultiTarget());
}
return tree;
};
RegTree tree_1{1u, kCols};
{
std::vector<HostDeviceVector<bst_node_t>> position(1);
up_1->Update(&param1, &gpairs_, p_dmat_.get(), position, {&tree_1});
}
tree_0.WalkTree([&](bst_node_t nidx) {
if (tree_0[nidx].IsLeaf()) {
EXPECT_NEAR(tree_1[nidx].LeafValue() * eta, tree_0[nidx].LeafValue(), kRtEps);
auto eta_ratio = 8.0f;
auto p_tree0 = gen_tree(0.1f);
auto p_tree1 = gen_tree(0.1f * eta_ratio);
// Just to make sure we are not testing a stump.
CHECK_GE(p_tree0->NumExtraNodes(), 32);
bst_node_t n_nodes{0};
p_tree0->WalkTree([&](bst_node_t nidx) {
if (p_tree0->IsLeaf(nidx)) {
CHECK(p_tree1->IsLeaf(nidx));
if (p_tree0->IsMultiTarget()) {
CHECK(p_tree1->IsMultiTarget());
auto leaf_0 = p_tree0->GetMultiTargetTree()->LeafValue(nidx);
auto leaf_1 = p_tree1->GetMultiTargetTree()->LeafValue(nidx);
CHECK_EQ(leaf_0.Size(), leaf_1.Size());
for (std::size_t i = 0; i < leaf_0.Size(); ++i) {
CHECK_EQ(leaf_0(i) * eta_ratio, leaf_1(i));
}
CHECK(std::isnan(p_tree0->SplitCond(nidx)));
CHECK(std::isnan(p_tree1->SplitCond(nidx)));
} else {
// NON-mt tree reuses split cond for leaf value.
auto leaf_0 = p_tree0->SplitCond(nidx);
auto leaf_1 = p_tree1->SplitCond(nidx);
CHECK_EQ(leaf_0 * eta_ratio, leaf_1);
}
return true;
});
}
} else {
CHECK(!p_tree1->IsLeaf(nidx));
CHECK_EQ(p_tree0->SplitCond(nidx), p_tree1->SplitCond(nidx));
}
n_nodes++;
return true;
});
ASSERT_EQ(n_nodes, p_tree0->NumExtraNodes() + 1);
}
};
TEST_F(UpdaterEtaTest, Hist) { this->RunTest("grow_quantile_histmaker"); }
TEST_F(TestSplitWithEta, HistMulti) {
Context ctx;
bst_target_t n_targets{3};
this->Run(&ctx, n_targets, "grow_quantile_histmaker");
}
TEST_F(UpdaterEtaTest, Exact) { this->RunTest("grow_colmaker"); }
TEST_F(TestSplitWithEta, Hist) {
Context ctx;
bst_target_t n_targets{1};
this->Run(&ctx, n_targets, "grow_quantile_histmaker");
}
TEST_F(UpdaterEtaTest, Approx) { this->RunTest("grow_histmaker"); }
TEST_F(TestSplitWithEta, Approx) {
Context ctx;
bst_target_t n_targets{1};
this->Run(&ctx, n_targets, "grow_histmaker");
}
TEST_F(TestSplitWithEta, Exact) {
Context ctx;
bst_target_t n_targets{1};
this->Run(&ctx, n_targets, "grow_colmaker");
}
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
TEST_F(UpdaterEtaTest, GpuHist) { this->RunTest("grow_gpu_hist"); }
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
TEST_F(TestSplitWithEta, GpuHist) {
auto ctx = MakeCUDACtx(0);
bst_target_t n_targets{1};
this->Run(&ctx, n_targets, "grow_gpu_hist");
}
TEST_F(TestSplitWithEta, GpuApprox) {
auto ctx = MakeCUDACtx(0);
bst_target_t n_targets{1};
this->Run(&ctx, n_targets, "grow_gpu_approx");
}
#endif // defined(XGBOOST_USE_CUDA)
class TestMinSplitLoss : public ::testing::Test {
std::shared_ptr<DMatrix> dmat_;

View File

@@ -1,54 +1,12 @@
import json
import numpy as np
import pytest
import xgboost as xgb
from xgboost.testing.continuation import run_training_continuation_model_output
rng = np.random.RandomState(1994)
class TestGPUTrainingContinuation:
def test_training_continuation(self):
kRows = 64
kCols = 32
X = np.random.randn(kRows, kCols)
y = np.random.randn(kRows)
dtrain = xgb.DMatrix(X, y)
params = {
"tree_method": "gpu_hist",
"max_depth": "2",
"gamma": "0.1",
"alpha": "0.01",
}
bst_0 = xgb.train(params, dtrain, num_boost_round=64)
dump_0 = bst_0.get_dump(dump_format="json")
bst_1 = xgb.train(params, dtrain, num_boost_round=32)
bst_1 = xgb.train(params, dtrain, num_boost_round=32, xgb_model=bst_1)
dump_1 = bst_1.get_dump(dump_format="json")
def recursive_compare(obj_0, obj_1):
if isinstance(obj_0, float):
assert np.isclose(obj_0, obj_1, atol=1e-6)
elif isinstance(obj_0, str):
assert obj_0 == obj_1
elif isinstance(obj_0, int):
assert obj_0 == obj_1
elif isinstance(obj_0, dict):
keys_0 = list(obj_0.keys())
keys_1 = list(obj_1.keys())
values_0 = list(obj_0.values())
values_1 = list(obj_1.values())
for i in range(len(obj_0.items())):
assert keys_0[i] == keys_1[i]
if list(obj_0.keys())[i] != "missing":
recursive_compare(values_0[i], values_1[i])
else:
for i in range(len(obj_0)):
recursive_compare(obj_0[i], obj_1[i])
assert len(dump_0) == len(dump_1)
for i in range(len(dump_0)):
obj_0 = json.loads(dump_0[i])
obj_1 = json.loads(dump_1[i])
recursive_compare(obj_0, obj_1)
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
def test_model_output(self, tree_method: str) -> None:
run_training_continuation_model_output("cuda", tree_method)

View File

@@ -16,13 +16,14 @@ class TestCallbacks:
@classmethod
def setup_class(cls):
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
cls.X = X
cls.y = y
split = int(X.shape[0]*0.8)
cls.X_train = X[: split, ...]
cls.y_train = y[: split, ...]
split = int(X.shape[0] * 0.8)
cls.X_train = X[:split, ...]
cls.y_train = y[:split, ...]
cls.X_valid = X[split:, ...]
cls.y_valid = y[split:, ...]
@@ -31,31 +32,32 @@ class TestCallbacks:
D_train: xgb.DMatrix,
D_valid: xgb.DMatrix,
rounds: int,
verbose_eval: Union[bool, int]
verbose_eval: Union[bool, int],
):
def check_output(output: str) -> None:
if int(verbose_eval) == 1:
# Should print each iteration info
assert len(output.split('\n')) == rounds
assert len(output.split("\n")) == rounds
elif int(verbose_eval) > rounds:
# Should print first and latest iteration info
assert len(output.split('\n')) == 2
assert len(output.split("\n")) == 2
else:
# Should print info by each period additionaly to first and latest
# iteration
num_periods = rounds // int(verbose_eval)
# Extra information is required for latest iteration
is_extra_info_required = num_periods * int(verbose_eval) < (rounds - 1)
assert len(output.split('\n')) == (
assert len(output.split("\n")) == (
1 + num_periods + int(is_extra_info_required)
)
evals_result: xgb.callback.TrainingCallback.EvalsLog = {}
params = {'objective': 'binary:logistic', 'eval_metric': 'error'}
params = {"objective": "binary:logistic", "eval_metric": "error"}
with tm.captured_output() as (out, err):
xgb.train(
params, D_train,
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
params,
D_train,
evals=[(D_train, "Train"), (D_valid, "Valid")],
num_boost_round=rounds,
evals_result=evals_result,
verbose_eval=verbose_eval,
@@ -73,14 +75,16 @@ class TestCallbacks:
D_valid = xgb.DMatrix(self.X_valid, self.y_valid)
evals_result = {}
rounds = 10
xgb.train({'objective': 'binary:logistic',
'eval_metric': 'error'}, D_train,
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
num_boost_round=rounds,
evals_result=evals_result,
verbose_eval=True)
assert len(evals_result['Train']['error']) == rounds
assert len(evals_result['Valid']['error']) == rounds
xgb.train(
{"objective": "binary:logistic", "eval_metric": "error"},
D_train,
evals=[(D_train, "Train"), (D_valid, "Valid")],
num_boost_round=rounds,
evals_result=evals_result,
verbose_eval=True,
)
assert len(evals_result["Train"]["error"]) == rounds
assert len(evals_result["Valid"]["error"]) == rounds
self.run_evaluation_monitor(D_train, D_valid, rounds, True)
self.run_evaluation_monitor(D_train, D_valid, rounds, 2)
@@ -93,72 +97,83 @@ class TestCallbacks:
evals_result = {}
rounds = 30
early_stopping_rounds = 5
booster = xgb.train({'objective': 'binary:logistic',
'eval_metric': 'error'}, D_train,
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
num_boost_round=rounds,
evals_result=evals_result,
verbose_eval=True,
early_stopping_rounds=early_stopping_rounds)
dump = booster.get_dump(dump_format='json')
booster = xgb.train(
{"objective": "binary:logistic", "eval_metric": "error"},
D_train,
evals=[(D_train, "Train"), (D_valid, "Valid")],
num_boost_round=rounds,
evals_result=evals_result,
verbose_eval=True,
early_stopping_rounds=early_stopping_rounds,
)
dump = booster.get_dump(dump_format="json")
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
def test_early_stopping_custom_eval(self):
D_train = xgb.DMatrix(self.X_train, self.y_train)
D_valid = xgb.DMatrix(self.X_valid, self.y_valid)
early_stopping_rounds = 5
booster = xgb.train({'objective': 'binary:logistic',
'eval_metric': 'error',
'tree_method': 'hist'}, D_train,
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
feval=tm.eval_error_metric,
num_boost_round=1000,
early_stopping_rounds=early_stopping_rounds,
verbose_eval=False)
dump = booster.get_dump(dump_format='json')
booster = xgb.train(
{
"objective": "binary:logistic",
"eval_metric": "error",
"tree_method": "hist",
},
D_train,
evals=[(D_train, "Train"), (D_valid, "Valid")],
feval=tm.eval_error_metric,
num_boost_round=1000,
early_stopping_rounds=early_stopping_rounds,
verbose_eval=False,
)
dump = booster.get_dump(dump_format="json")
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
def test_early_stopping_customize(self):
D_train = xgb.DMatrix(self.X_train, self.y_train)
D_valid = xgb.DMatrix(self.X_valid, self.y_valid)
early_stopping_rounds = 5
early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
metric_name='CustomErr',
data_name='Train')
early_stop = xgb.callback.EarlyStopping(
rounds=early_stopping_rounds, metric_name="CustomErr", data_name="Train"
)
# Specify which dataset and which metric should be used for early stopping.
booster = xgb.train(
{'objective': 'binary:logistic',
'eval_metric': ['error', 'rmse'],
'tree_method': 'hist'}, D_train,
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
{
"objective": "binary:logistic",
"eval_metric": ["error", "rmse"],
"tree_method": "hist",
},
D_train,
evals=[(D_train, "Train"), (D_valid, "Valid")],
feval=tm.eval_error_metric,
num_boost_round=1000,
callbacks=[early_stop],
verbose_eval=False)
dump = booster.get_dump(dump_format='json')
verbose_eval=False,
)
dump = booster.get_dump(dump_format="json")
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
assert len(early_stop.stopping_history['Train']['CustomErr']) == len(dump)
assert len(early_stop.stopping_history["Train"]["CustomErr"]) == len(dump)
rounds = 100
early_stop = xgb.callback.EarlyStopping(
rounds=early_stopping_rounds,
metric_name='CustomErr',
data_name='Train',
metric_name="CustomErr",
data_name="Train",
min_delta=100,
save_best=True,
)
booster = xgb.train(
{
'objective': 'binary:logistic',
'eval_metric': ['error', 'rmse'],
'tree_method': 'hist'
"objective": "binary:logistic",
"eval_metric": ["error", "rmse"],
"tree_method": "hist",
},
D_train,
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
evals=[(D_train, "Train"), (D_valid, "Valid")],
feval=tm.eval_error_metric,
num_boost_round=rounds,
callbacks=[early_stop],
verbose_eval=False
verbose_eval=False,
)
# No iteration can be made with min_delta == 100
assert booster.best_iteration == 0
@@ -166,18 +181,20 @@ class TestCallbacks:
def test_early_stopping_skl(self):
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
early_stopping_rounds = 5
cls = xgb.XGBClassifier(
early_stopping_rounds=early_stopping_rounds, eval_metric='error'
early_stopping_rounds=early_stopping_rounds, eval_metric="error"
)
cls.fit(X, y, eval_set=[(X, y)])
booster = cls.get_booster()
dump = booster.get_dump(dump_format='json')
dump = booster.get_dump(dump_format="json")
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
def test_early_stopping_custom_eval_skl(self):
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
early_stopping_rounds = 5
early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds)
@@ -186,11 +203,12 @@ class TestCallbacks:
)
cls.fit(X, y, eval_set=[(X, y)])
booster = cls.get_booster()
dump = booster.get_dump(dump_format='json')
dump = booster.get_dump(dump_format="json")
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
def test_early_stopping_save_best_model(self):
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
n_estimators = 100
early_stopping_rounds = 5
@@ -200,11 +218,11 @@ class TestCallbacks:
cls = xgb.XGBClassifier(
n_estimators=n_estimators,
eval_metric=tm.eval_error_metric_skl,
callbacks=[early_stop]
callbacks=[early_stop],
)
cls.fit(X, y, eval_set=[(X, y)])
booster = cls.get_booster()
dump = booster.get_dump(dump_format='json')
dump = booster.get_dump(dump_format="json")
assert len(dump) == booster.best_iteration + 1
early_stop = xgb.callback.EarlyStopping(
@@ -220,8 +238,9 @@ class TestCallbacks:
cls.fit(X, y, eval_set=[(X, y)])
# No error
early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
save_best=False)
early_stop = xgb.callback.EarlyStopping(
rounds=early_stopping_rounds, save_best=False
)
xgb.XGBClassifier(
booster="gblinear",
n_estimators=10,
@@ -231,14 +250,17 @@ class TestCallbacks:
def test_early_stopping_continuation(self):
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
cls = xgb.XGBClassifier(eval_metric=tm.eval_error_metric_skl)
early_stopping_rounds = 5
early_stop = xgb.callback.EarlyStopping(
rounds=early_stopping_rounds, save_best=True
)
with pytest.warns(UserWarning):
cls.fit(X, y, eval_set=[(X, y)], callbacks=[early_stop])
cls = xgb.XGBClassifier(
eval_metric=tm.eval_error_metric_skl, callbacks=[early_stop]
)
cls.fit(X, y, eval_set=[(X, y)])
booster = cls.get_booster()
assert booster.num_boosted_rounds() == booster.best_iteration + 1
@@ -256,21 +278,10 @@ class TestCallbacks:
)
cls.fit(X, y, eval_set=[(X, y)])
booster = cls.get_booster()
assert booster.num_boosted_rounds() == \
booster.best_iteration + early_stopping_rounds + 1
def test_deprecated(self):
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
early_stopping_rounds = 5
early_stop = xgb.callback.EarlyStopping(
rounds=early_stopping_rounds, save_best=True
)
clf = xgb.XGBClassifier(
eval_metric=tm.eval_error_metric_skl, callbacks=[early_stop]
)
with pytest.raises(ValueError, match=r".*set_params.*"):
clf.fit(X, y, eval_set=[(X, y)], callbacks=[early_stop])
assert (
booster.num_boosted_rounds()
== booster.best_iteration + early_stopping_rounds + 1
)
def run_eta_decay(self, tree_method):
"""Test learning rate scheduler, used by both CPU and GPU tests."""
@@ -343,7 +354,7 @@ class TestCallbacks:
callbacks=[scheduler([0, 0, 0, 0])],
evals_result=evals_result,
)
eval_errors_2 = list(map(float, evals_result['eval']['error']))
eval_errors_2 = list(map(float, evals_result["eval"]["error"]))
assert isinstance(bst, xgb.core.Booster)
# validation error should not decrease, if eta/learning_rate = 0
assert eval_errors_2[0] == eval_errors_2[-1]
@@ -361,7 +372,7 @@ class TestCallbacks:
callbacks=[scheduler(eta_decay)],
evals_result=evals_result,
)
eval_errors_3 = list(map(float, evals_result['eval']['error']))
eval_errors_3 = list(map(float, evals_result["eval"]["error"]))
assert isinstance(bst, xgb.core.Booster)

View File

@@ -15,23 +15,23 @@ class TestEarlyStopping:
from sklearn.model_selection import train_test_split
digits = load_digits(n_class=2)
X = digits['data']
y = digits['target']
X = digits["data"]
y = digits["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf1 = xgb.XGBClassifier(learning_rate=0.1)
clf1.fit(X_train, y_train, early_stopping_rounds=5, eval_metric="auc",
eval_set=[(X_test, y_test)])
clf2 = xgb.XGBClassifier(learning_rate=0.1)
clf2.fit(X_train, y_train, early_stopping_rounds=4, eval_metric="auc",
eval_set=[(X_test, y_test)])
clf1 = xgb.XGBClassifier(
learning_rate=0.1, early_stopping_rounds=5, eval_metric="auc"
)
clf1.fit(X_train, y_train, eval_set=[(X_test, y_test)])
clf2 = xgb.XGBClassifier(
learning_rate=0.1, early_stopping_rounds=4, eval_metric="auc"
)
clf2.fit(X_train, y_train, eval_set=[(X_test, y_test)])
# should be the same
assert clf1.best_score == clf2.best_score
assert clf1.best_score != 1
# check overfit
clf3 = xgb.XGBClassifier(
learning_rate=0.1,
eval_metric="auc",
early_stopping_rounds=10
learning_rate=0.1, eval_metric="auc", early_stopping_rounds=10
)
clf3.fit(X_train, y_train, eval_set=[(X_test, y_test)])
base_score = get_basescore(clf3)
@@ -39,9 +39,9 @@ class TestEarlyStopping:
clf3 = xgb.XGBClassifier(
learning_rate=0.1,
base_score=.5,
base_score=0.5,
eval_metric="auc",
early_stopping_rounds=10
early_stopping_rounds=10,
)
clf3.fit(X_train, y_train, eval_set=[(X_test, y_test)])

View File

@@ -9,37 +9,41 @@ rng = np.random.RandomState(1337)
class TestEvalMetrics:
# Parameter sets that differ only in how ``eval_metric`` is spelled:
# a plain string, a one-element list, and two two-element lists whose
# metric order is swapped.
xgb_params_01 = {"nthread": 1, "eval_metric": "error"}
xgb_params_02 = {"nthread": 1, "eval_metric": ["error"]}
xgb_params_03 = {"nthread": 1, "eval_metric": ["rmse", "error"]}
xgb_params_04 = {"nthread": 1, "eval_metric": ["error", "rmse"]}
def evalerror_01(self, preds, dtrain):
    """Custom metric returning the error rate as a single ``(name, value)`` pair.

    A prediction counts as positive when it is greater than ``0.0``; the value
    is the fraction of labels from ``dtrain.get_label()`` that disagree with
    that thresholded prediction.
    """
    labels = dtrain.get_label()
    return "error", float(sum(labels != (preds > 0.0))) / len(labels)
def evalerror_02(self, preds, dtrain):
    """Custom metric returning the error rate as a one-element list of pairs.

    The value is computed the same way as in ``evalerror_01`` (predictions
    thresholded at ``0.0`` and compared with the labels); only the return
    shape differs: ``[(name, value)]`` instead of ``(name, value)``.
    """
    labels = dtrain.get_label()
    return [("error", float(sum(labels != (preds > 0.0))) / len(labels))]
@pytest.mark.skipif(**tm.no_sklearn())
def evalerror_03(self, preds, dtrain):
    """Custom metric returning two ``(name, value)`` pairs: "rmse", then "error".

    The "error" value is the fraction of labels that disagree with the
    predictions thresholded at ``0.0``.
    """
    from sklearn.metrics import mean_squared_error

    labels = dtrain.get_label()
    # NOTE(review): the pair is labelled "rmse" but the value comes straight
    # from ``mean_squared_error`` with default arguments - confirm the
    # labelling is intentional.
    return [
        ("rmse", mean_squared_error(labels, preds)),
        ("error", float(sum(labels != (preds > 0.0))) / len(labels)),
    ]
@pytest.mark.skipif(**tm.no_sklearn())
def evalerror_04(self, preds, dtrain):
    """Custom metric returning two ``(name, value)`` pairs: "error", then "rmse".

    Same values as ``evalerror_03`` with the order of the two pairs swapped.
    """
    from sklearn.metrics import mean_squared_error

    labels = dtrain.get_label()
    # NOTE(review): the pair is labelled "rmse" but the value comes straight
    # from ``mean_squared_error`` with default arguments - confirm the
    # labelling is intentional.
    return [
        ("error", float(sum(labels != (preds > 0.0))) / len(labels)),
        ("rmse", mean_squared_error(labels, preds)),
    ]
@pytest.mark.skipif(**tm.no_sklearn())
def test_eval_metrics(self):
@@ -50,15 +54,15 @@ class TestEvalMetrics:
from sklearn.datasets import load_digits
digits = load_digits(n_class=2)
X = digits['data']
y = digits['target']
X = digits["data"]
y = digits["target"]
Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2, random_state=0)
dtrain = xgb.DMatrix(Xt, label=yt)
dvalid = xgb.DMatrix(Xv, label=yv)
watchlist = [(dtrain, 'train'), (dvalid, 'val')]
watchlist = [(dtrain, "train"), (dvalid, "val")]
gbdt_01 = xgb.train(self.xgb_params_01, dtrain, num_boost_round=10)
gbdt_02 = xgb.train(self.xgb_params_02, dtrain, num_boost_round=10)
@@ -66,26 +70,54 @@ class TestEvalMetrics:
assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
gbdt_01 = xgb.train(self.xgb_params_01, dtrain, 10, watchlist,
early_stopping_rounds=2)
gbdt_02 = xgb.train(self.xgb_params_02, dtrain, 10, watchlist,
early_stopping_rounds=2)
gbdt_03 = xgb.train(self.xgb_params_03, dtrain, 10, watchlist,
early_stopping_rounds=2)
gbdt_04 = xgb.train(self.xgb_params_04, dtrain, 10, watchlist,
early_stopping_rounds=2)
gbdt_01 = xgb.train(
self.xgb_params_01, dtrain, 10, watchlist, early_stopping_rounds=2
)
gbdt_02 = xgb.train(
self.xgb_params_02, dtrain, 10, watchlist, early_stopping_rounds=2
)
gbdt_03 = xgb.train(
self.xgb_params_03, dtrain, 10, watchlist, early_stopping_rounds=2
)
gbdt_04 = xgb.train(
self.xgb_params_04, dtrain, 10, watchlist, early_stopping_rounds=2
)
assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
assert gbdt_03.predict(dvalid)[0] != gbdt_04.predict(dvalid)[0]
gbdt_01 = xgb.train(self.xgb_params_01, dtrain, 10, watchlist,
early_stopping_rounds=2, feval=self.evalerror_01)
gbdt_02 = xgb.train(self.xgb_params_02, dtrain, 10, watchlist,
early_stopping_rounds=2, feval=self.evalerror_02)
gbdt_03 = xgb.train(self.xgb_params_03, dtrain, 10, watchlist,
early_stopping_rounds=2, feval=self.evalerror_03)
gbdt_04 = xgb.train(self.xgb_params_04, dtrain, 10, watchlist,
early_stopping_rounds=2, feval=self.evalerror_04)
gbdt_01 = xgb.train(
self.xgb_params_01,
dtrain,
10,
watchlist,
early_stopping_rounds=2,
feval=self.evalerror_01,
)
gbdt_02 = xgb.train(
self.xgb_params_02,
dtrain,
10,
watchlist,
early_stopping_rounds=2,
feval=self.evalerror_02,
)
gbdt_03 = xgb.train(
self.xgb_params_03,
dtrain,
10,
watchlist,
early_stopping_rounds=2,
feval=self.evalerror_03,
)
gbdt_04 = xgb.train(
self.xgb_params_04,
dtrain,
10,
watchlist,
early_stopping_rounds=2,
feval=self.evalerror_04,
)
assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
assert gbdt_03.predict(dvalid)[0] != gbdt_04.predict(dvalid)[0]
@@ -93,6 +125,7 @@ class TestEvalMetrics:
@pytest.mark.skipif(**tm.no_sklearn())
def test_gamma_deviance(self):
from sklearn.metrics import mean_gamma_deviance
rng = np.random.RandomState(1994)
n_samples = 100
n_features = 30
@@ -101,8 +134,13 @@ class TestEvalMetrics:
y = rng.randn(n_samples)
y = y - y.min() * 100
reg = xgb.XGBRegressor(tree_method="hist", objective="reg:gamma", n_estimators=10)
reg.fit(X, y, eval_metric="gamma-deviance")
reg = xgb.XGBRegressor(
tree_method="hist",
objective="reg:gamma",
n_estimators=10,
eval_metric="gamma-deviance",
)
reg.fit(X, y)
booster = reg.get_booster()
score = reg.predict(X)
@@ -113,16 +151,26 @@ class TestEvalMetrics:
@pytest.mark.skipif(**tm.no_sklearn())
def test_gamma_lik(self) -> None:
import scipy.stats as stats
rng = np.random.default_rng(1994)
n_samples = 32
n_features = 10
X = rng.normal(0, 1, size=n_samples * n_features).reshape((n_samples, n_features))
X = rng.normal(0, 1, size=n_samples * n_features).reshape(
(n_samples, n_features)
)
alpha, loc, beta = 5.0, 11.1, 22
y = stats.gamma.rvs(alpha, loc=loc, scale=beta, size=n_samples, random_state=rng)
reg = xgb.XGBRegressor(tree_method="hist", objective="reg:gamma", n_estimators=64)
reg.fit(X, y, eval_metric="gamma-nloglik", eval_set=[(X, y)])
y = stats.gamma.rvs(
alpha, loc=loc, scale=beta, size=n_samples, random_state=rng
)
reg = xgb.XGBRegressor(
tree_method="hist",
objective="reg:gamma",
n_estimators=64,
eval_metric="gamma-nloglik",
)
reg.fit(X, y, eval_set=[(X, y)])
score = reg.predict(X)
@@ -134,7 +182,7 @@ class TestEvalMetrics:
# XGBoost uses the canonical link function of gamma in evaluation function.
# so \theta = - (1.0 / y)
# dispersion is hardcoded as 1.0, so shape (a in scipy parameter) is also 1.0
beta = - (1.0 / (- (1.0 / y))) # == y
beta = -(1.0 / (-(1.0 / y))) # == y
nloglik_stats = -stats.gamma.logpdf(score, a=1.0, scale=beta)
np.testing.assert_allclose(nloglik, np.mean(nloglik_stats), rtol=1e-3)
@@ -153,7 +201,7 @@ class TestEvalMetrics:
n_features,
n_informative=n_features,
n_redundant=0,
random_state=rng
random_state=rng,
)
Xy = xgb.DMatrix(X, y)
booster = xgb.train(
@@ -197,7 +245,7 @@ class TestEvalMetrics:
n_informative=n_features,
n_redundant=0,
n_classes=n_classes,
random_state=rng
random_state=rng,
)
if weighted:
weights = rng.randn(n_samples)
@@ -242,20 +290,25 @@ class TestEvalMetrics:
def run_pr_auc_binary(self, tree_method):
    """Compare XGBoost's binary ``aucpr`` metric with scikit-learn's PR AUC.

    Parameters
    ----------
    tree_method :
        Forwarded to ``XGBClassifier(tree_method=...)``.
    """
    from sklearn.datasets import make_classification
    from sklearn.metrics import auc, precision_recall_curve

    X, y = make_classification(128, 4, n_classes=2, random_state=1994)
    # The metric is configured on the estimator, not passed to ``fit``.
    clf = xgb.XGBClassifier(
        tree_method=tree_method, n_estimators=1, eval_metric="aucpr"
    )
    clf.fit(X, y, eval_set=[(X, y)])
    evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]

    y_score = clf.predict_proba(X)[:, 1]  # get the positive column
    precision, recall, _ = precision_recall_curve(y, y_score)
    prauc = auc(recall, precision)
    # Interpolation results are slightly different from sklearn, but overall should
    # be similar.
    np.testing.assert_allclose(prauc, evals_result, rtol=1e-2)

    # With ten trees the last recorded value is expected to be close to 0.99.
    clf = xgb.XGBClassifier(
        tree_method=tree_method, n_estimators=10, eval_metric="aucpr"
    )
    clf.fit(X, y, eval_set=[(X, y)])
    evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
    np.testing.assert_allclose(0.99, evals_result, rtol=1e-2)
@@ -264,16 +317,21 @@ class TestEvalMetrics:
def run_pr_auc_multi(self, tree_method):
    """Check that the multi-class ``aucpr`` metric approaches 1.0 with more trees.

    Parameters
    ----------
    tree_method :
        Forwarded to ``XGBClassifier(tree_method=...)``.
    """
    from sklearn.datasets import make_classification

    X, y = make_classification(
        64, 16, n_informative=8, n_classes=3, random_state=1994
    )
    clf = xgb.XGBClassifier(
        tree_method=tree_method, n_estimators=1, eval_metric="aucpr"
    )
    clf.fit(X, y, eval_set=[(X, y)])
    # The single-tree value is read but overwritten below before any assertion.
    evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
    # No available implementation for comparison, just check that XGBoost converges
    # to 1.0
    clf = xgb.XGBClassifier(
        tree_method=tree_method, n_estimators=10, eval_metric="aucpr"
    )
    clf.fit(X, y, eval_set=[(X, y)])
    evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
    np.testing.assert_allclose(1.0, evals_result, rtol=1e-2)
@@ -282,9 +340,13 @@ class TestEvalMetrics:
def run_pr_auc_ltr(self, tree_method):
from sklearn.datasets import make_classification
X, y = make_classification(128, 4, n_classes=2, random_state=1994)
ltr = xgb.XGBRanker(
tree_method=tree_method, n_estimators=16, objective="rank:pairwise"
tree_method=tree_method,
n_estimators=16,
objective="rank:pairwise",
eval_metric="aucpr",
)
groups = np.array([32, 32, 64])
ltr.fit(
@@ -293,7 +355,6 @@ class TestEvalMetrics:
group=groups,
eval_set=[(X, y)],
eval_group=[groups],
eval_metric="aucpr",
)
results = ltr.evals_result()["validation_0"]["aucpr"]
assert results[-1] >= 0.99

View File

@@ -6,6 +6,7 @@ import pytest
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.continuation import run_training_continuation_model_output
rng = np.random.RandomState(1337)
@@ -15,54 +16,51 @@ class TestTrainingContinuation:
def generate_parameters(self):
    """Build the three booster parameter dictionaries for the continuation tests.

    Returns
    -------
    list of dict
        Three dictionaries, all with ``nthread`` set to 1. The second adds
        ``num_parallel_tree`` read from ``self.num_parallel_tree``; the third
        additionally sets ``num_class`` to 5.
    """
    xgb_params_01_binary = {
        "nthread": 1,
    }

    xgb_params_02_binary = {
        "nthread": 1,
        "num_parallel_tree": self.num_parallel_tree,
    }

    xgb_params_03_binary = {
        "nthread": 1,
        "num_class": 5,
        "num_parallel_tree": self.num_parallel_tree,
    }

    return [xgb_params_01_binary, xgb_params_02_binary, xgb_params_03_binary]
def run_training_continuation(self, xgb_params_01, xgb_params_02,
xgb_params_03):
def run_training_continuation(self, xgb_params_01, xgb_params_02, xgb_params_03):
from sklearn.datasets import load_digits
from sklearn.metrics import mean_squared_error
digits_2class = load_digits(n_class=2)
digits_5class = load_digits(n_class=5)
X_2class = digits_2class['data']
y_2class = digits_2class['target']
X_2class = digits_2class["data"]
y_2class = digits_2class["target"]
X_5class = digits_5class['data']
y_5class = digits_5class['target']
X_5class = digits_5class["data"]
y_5class = digits_5class["target"]
dtrain_2class = xgb.DMatrix(X_2class, label=y_2class)
dtrain_5class = xgb.DMatrix(X_5class, label=y_5class)
gbdt_01 = xgb.train(xgb_params_01, dtrain_2class,
num_boost_round=10)
gbdt_01 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=10)
ntrees_01 = len(gbdt_01.get_dump())
assert ntrees_01 == 10
gbdt_02 = xgb.train(xgb_params_01, dtrain_2class,
num_boost_round=0)
gbdt_02.save_model('xgb_tc.json')
gbdt_02 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=0)
gbdt_02.save_model("xgb_tc.json")
gbdt_02a = xgb.train(xgb_params_01, dtrain_2class,
num_boost_round=10, xgb_model=gbdt_02)
gbdt_02b = xgb.train(xgb_params_01, dtrain_2class,
num_boost_round=10, xgb_model="xgb_tc.json")
gbdt_02a = xgb.train(
xgb_params_01, dtrain_2class, num_boost_round=10, xgb_model=gbdt_02
)
gbdt_02b = xgb.train(
xgb_params_01, dtrain_2class, num_boost_round=10, xgb_model="xgb_tc.json"
)
ntrees_02a = len(gbdt_02a.get_dump())
ntrees_02b = len(gbdt_02b.get_dump())
assert ntrees_02a == 10
@@ -76,20 +74,21 @@ class TestTrainingContinuation:
res2 = mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class))
assert res1 == res2
gbdt_03 = xgb.train(xgb_params_01, dtrain_2class,
num_boost_round=3)
gbdt_03.save_model('xgb_tc.json')
gbdt_03 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=3)
gbdt_03.save_model("xgb_tc.json")
gbdt_03a = xgb.train(xgb_params_01, dtrain_2class,
num_boost_round=7, xgb_model=gbdt_03)
gbdt_03b = xgb.train(xgb_params_01, dtrain_2class,
num_boost_round=7, xgb_model="xgb_tc.json")
gbdt_03a = xgb.train(
xgb_params_01, dtrain_2class, num_boost_round=7, xgb_model=gbdt_03
)
gbdt_03b = xgb.train(
xgb_params_01, dtrain_2class, num_boost_round=7, xgb_model="xgb_tc.json"
)
ntrees_03a = len(gbdt_03a.get_dump())
ntrees_03b = len(gbdt_03b.get_dump())
assert ntrees_03a == 10
assert ntrees_03b == 10
os.remove('xgb_tc.json')
os.remove("xgb_tc.json")
res1 = mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class))
res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
@@ -113,16 +112,14 @@ class TestTrainingContinuation:
y_2class,
gbdt_04.predict(
dtrain_2class, iteration_range=(0, gbdt_04.num_boosted_rounds())
)
),
)
assert res1 == res2
gbdt_05 = xgb.train(xgb_params_03, dtrain_5class,
num_boost_round=7)
gbdt_05 = xgb.train(xgb_params_03,
dtrain_5class,
num_boost_round=3,
xgb_model=gbdt_05)
gbdt_05 = xgb.train(xgb_params_03, dtrain_5class, num_boost_round=7)
gbdt_05 = xgb.train(
xgb_params_03, dtrain_5class, num_boost_round=3, xgb_model=gbdt_05
)
res1 = gbdt_05.predict(dtrain_5class)
res2 = gbdt_05.predict(
@@ -149,8 +146,8 @@ class TestTrainingContinuation:
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
clf = xgb.XGBClassifier(n_estimators=2)
clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
clf = xgb.XGBClassifier(n_estimators=2, eval_metric="logloss")
clf.fit(X, y, eval_set=[(X, y)])
assert tm.non_increasing(clf.evals_result()["validation_0"]["logloss"])
with tempfile.TemporaryDirectory() as tmpdir:
@@ -160,5 +157,10 @@ class TestTrainingContinuation:
clf = xgb.XGBClassifier(n_estimators=2)
# change metric to error
clf.fit(X, y, eval_set=[(X, y)], eval_metric="error")
clf.set_params(eval_metric="error")
clf.fit(X, y, eval_set=[(X, y)], xgb_model=loaded)
assert tm.non_increasing(clf.evals_result()["validation_0"]["error"])
@pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
def test_model_output(self, tree_method: str) -> None:
    """Run the imported continuation model-output check on CPU.

    Parametrized over the "hist", "approx" and "exact" tree methods; the
    actual assertions live in ``run_training_continuation_model_output``.
    """
    run_training_continuation_model_output("cpu", tree_method)

View File

@@ -30,8 +30,8 @@ def test_binary_classification():
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier):
for train_index, test_index in kf.split(X, y):
clf = cls(random_state=42)
xgb_model = clf.fit(X[train_index], y[train_index], eval_metric=['auc', 'logloss'])
clf = cls(random_state=42, eval_metric=['auc', 'logloss'])
xgb_model = clf.fit(X[train_index], y[train_index])
preds = xgb_model.predict(X[test_index])
labels = y[test_index]
err = sum(1 for i in range(len(preds))
@@ -101,10 +101,11 @@ def test_best_iteration():
def train(booster: str, forest: Optional[int]) -> None:
rounds = 4
cls = xgb.XGBClassifier(
n_estimators=rounds, num_parallel_tree=forest, booster=booster
).fit(
X, y, eval_set=[(X, y)], early_stopping_rounds=3
)
n_estimators=rounds,
num_parallel_tree=forest,
booster=booster,
early_stopping_rounds=3,
).fit(X, y, eval_set=[(X, y)])
assert cls.best_iteration == rounds - 1
# best_iteration is used by default, assert that under gblinear it's
@@ -112,9 +113,9 @@ def test_best_iteration():
cls.predict(X)
num_parallel_tree = 4
train('gbtree', num_parallel_tree)
train('dart', num_parallel_tree)
train('gblinear', None)
train("gbtree", num_parallel_tree)
train("dart", num_parallel_tree)
train("gblinear", None)
def test_ranking():
@@ -258,6 +259,7 @@ def test_stacking_classification():
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
clf.fit(X_train, y_train).score(X_test, y_test)
@pytest.mark.skipif(**tm.no_pandas())
def test_feature_importances_weight():
from sklearn.datasets import load_digits
@@ -474,7 +476,8 @@ def run_housing_rf_regression(tree_method):
rfreg = xgb.XGBRFRegressor()
with pytest.raises(NotImplementedError):
rfreg.fit(X, y, early_stopping_rounds=10)
rfreg.set_params(early_stopping_rounds=10)
rfreg.fit(X, y)
def test_rf_regression():
@@ -574,7 +577,7 @@ def test_classification_with_custom_objective():
return logregobj(y, p)
cls.set_params(objective=wrapped)
cls.predict(X) # no throw
cls.predict(X) # no throw
cls.fit(X, y)
assert is_called[0]
@@ -844,51 +847,65 @@ def run_validation_weights(model):
y_train, y_test = y[:1600], y[1600:]
# instantiate model
param_dist = {'objective': 'binary:logistic', 'n_estimators': 2,
'random_state': 123}
param_dist = {
"objective": "binary:logistic",
"n_estimators": 2,
"random_state": 123,
}
clf = model(**param_dist)
# train it using instance weights only in the training set
weights_train = np.random.choice([1, 2], len(X_train))
clf.fit(X_train, y_train,
sample_weight=weights_train,
eval_set=[(X_test, y_test)],
eval_metric='logloss',
verbose=False)
clf.set_params(eval_metric="logloss")
clf.fit(
X_train,
y_train,
sample_weight=weights_train,
eval_set=[(X_test, y_test)],
verbose=False,
)
# evaluate logloss metric on test set *without* using weights
evals_result_without_weights = clf.evals_result()
logloss_without_weights = evals_result_without_weights[
"validation_0"]["logloss"]
logloss_without_weights = evals_result_without_weights["validation_0"]["logloss"]
# now use weights for the test set
np.random.seed(0)
weights_test = np.random.choice([1, 2], len(X_test))
clf.fit(X_train, y_train,
sample_weight=weights_train,
eval_set=[(X_test, y_test)],
sample_weight_eval_set=[weights_test],
eval_metric='logloss',
verbose=False)
clf.set_params(eval_metric="logloss")
clf.fit(
X_train,
y_train,
sample_weight=weights_train,
eval_set=[(X_test, y_test)],
sample_weight_eval_set=[weights_test],
verbose=False,
)
evals_result_with_weights = clf.evals_result()
logloss_with_weights = evals_result_with_weights["validation_0"]["logloss"]
# check that the logloss in the test set is actually different when using
# weights than when not using them
assert all((logloss_with_weights[i] != logloss_without_weights[i]
for i in [0, 1]))
assert all((logloss_with_weights[i] != logloss_without_weights[i] for i in [0, 1]))
with pytest.raises(ValueError):
# length of eval set and sample weight doesn't match.
clf.fit(X_train, y_train, sample_weight=weights_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
sample_weight_eval_set=[weights_train])
clf.fit(
X_train,
y_train,
sample_weight=weights_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
sample_weight_eval_set=[weights_train],
)
with pytest.raises(ValueError):
cls = xgb.XGBClassifier()
cls.fit(X_train, y_train, sample_weight=weights_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
sample_weight_eval_set=[weights_train])
cls.fit(
X_train,
y_train,
sample_weight=weights_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
sample_weight_eval_set=[weights_train],
)
def test_validation_weights():
@@ -960,8 +977,7 @@ def test_XGBClassifier_resume():
# file name of stored xgb model
model1.save_model(model1_path)
model2 = xgb.XGBClassifier(
learning_rate=0.3, random_state=0, n_estimators=8)
model2 = xgb.XGBClassifier(learning_rate=0.3, random_state=0, n_estimators=8)
model2.fit(X, Y, xgb_model=model1_path)
pred2 = model2.predict(X)
@@ -972,8 +988,7 @@ def test_XGBClassifier_resume():
# file name of 'Booster' instance Xgb model
model1.get_booster().save_model(model1_booster_path)
model2 = xgb.XGBClassifier(
learning_rate=0.3, random_state=0, n_estimators=8)
model2 = xgb.XGBClassifier(learning_rate=0.3, random_state=0, n_estimators=8)
model2.fit(X, Y, xgb_model=model1_booster_path)
pred2 = model2.predict(X)
@@ -1279,12 +1294,16 @@ def test_estimator_reg(estimator, check):
):
estimator.fit(X, y)
return
if os.environ["PYTEST_CURRENT_TEST"].find("check_estimators_overwrite_params") != -1:
if (
os.environ["PYTEST_CURRENT_TEST"].find("check_estimators_overwrite_params")
!= -1
):
# A hack to pass the scikit-learn parameter mutation tests. XGBoost regressor
# returns actual internal default values for parameters in `get_params`, but those
# are set as `None` in sklearn interface to avoid duplication. So we fit a dummy
# model and obtain the default parameters here for the mutation tests.
# returns actual internal default values for parameters in `get_params`, but
# those are set as `None` in sklearn interface to avoid duplication. So we fit
# a dummy model and obtain the default parameters here for the mutation tests.
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=2, n_features=1)
estimator.set_params(**xgb.XGBRegressor().fit(X, y).get_params())
@@ -1325,6 +1344,7 @@ def test_categorical():
def test_evaluation_metric():
from sklearn.datasets import load_diabetes, load_digits
from sklearn.metrics import mean_absolute_error
X, y = load_diabetes(return_X_y=True)
n_estimators = 16
@@ -1341,17 +1361,6 @@ def test_evaluation_metric():
for line in lines:
assert line.find("mean_absolute_error") != -1
def metric(predt: np.ndarray, Xy: xgb.DMatrix):
y = Xy.get_label()
return "m", np.abs(predt - y).sum()
with pytest.warns(UserWarning):
reg = xgb.XGBRegressor(
tree_method="hist",
n_estimators=1,
)
reg.fit(X, y, eval_set=[(X, y)], eval_metric=metric)
def merror(y_true: np.ndarray, predt: np.ndarray):
n_samples = y_true.shape[0]
assert n_samples == predt.size

View File

@@ -363,12 +363,12 @@ class TestDistributedGPU:
device="cuda",
eval_metric="error",
n_estimators=100,
early_stopping_rounds=early_stopping_rounds,
)
cls.client = local_cuda_client
cls.fit(
X,
y,
early_stopping_rounds=early_stopping_rounds,
eval_set=[(valid_X, valid_y)],
)
booster = cls.get_booster()

View File

@@ -937,8 +937,10 @@ def run_empty_dmatrix_auc(client: "Client", device: str, n_workers: int) -> None
valid_X = dd.from_array(valid_X_, chunksize=n_samples)
valid_y = dd.from_array(valid_y_, chunksize=n_samples)
cls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2)
cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)])
cls = xgb.dask.DaskXGBClassifier(
device=device, n_estimators=2, eval_metric=["auc", "aucpr"]
)
cls.fit(X, y, eval_set=[(valid_X, valid_y)])
# multiclass
X_, y_ = make_classification(
@@ -966,8 +968,10 @@ def run_empty_dmatrix_auc(client: "Client", device: str, n_workers: int) -> None
valid_X = dd.from_array(valid_X_, chunksize=n_samples)
valid_y = dd.from_array(valid_y_, chunksize=n_samples)
cls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2)
cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)])
cls = xgb.dask.DaskXGBClassifier(
device=device, n_estimators=2, eval_metric=["auc", "aucpr"]
)
cls.fit(X, y, eval_set=[(valid_X, valid_y)])
def test_empty_dmatrix_auc() -> None:
@@ -994,11 +998,11 @@ def run_auc(client: "Client", device: str) -> None:
valid_X = dd.from_array(valid_X_, chunksize=10)
valid_y = dd.from_array(valid_y_, chunksize=10)
cls = xgb.XGBClassifier(device=device, n_estimators=2)
cls.fit(X_, y_, eval_metric="auc", eval_set=[(valid_X_, valid_y_)])
cls = xgb.XGBClassifier(device=device, n_estimators=2, eval_metric="auc")
cls.fit(X_, y_, eval_set=[(valid_X_, valid_y_)])
dcls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2)
dcls.fit(X, y, eval_metric="auc", eval_set=[(valid_X, valid_y)])
dcls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2, eval_metric="auc")
dcls.fit(X, y, eval_set=[(valid_X, valid_y)])
approx = dcls.evals_result()["validation_0"]["auc"]
exact = cls.evals_result()["validation_0"]["auc"]
@@ -1267,16 +1271,16 @@ def test_dask_ranking(client: "Client") -> None:
qid_valid = qid_valid.astype(np.uint32)
qid_test = qid_test.astype(np.uint32)
rank = xgb.dask.DaskXGBRanker(n_estimators=2500)
rank = xgb.dask.DaskXGBRanker(
n_estimators=2500, eval_metric=["ndcg"], early_stopping_rounds=10
)
rank.fit(
x_train,
y_train,
qid=qid_train,
eval_set=[(x_test, y_test), (x_train, y_train)],
eval_qid=[qid_test, qid_train],
eval_metric=["ndcg"],
verbose=True,
early_stopping_rounds=10,
)
assert rank.n_features_in_ == 46
assert rank.best_score > 0.98
@@ -2150,13 +2154,15 @@ class TestDaskCallbacks:
valid_X, valid_y = load_breast_cancer(return_X_y=True)
valid_X, valid_y = da.from_array(valid_X), da.from_array(valid_y)
cls = xgb.dask.DaskXGBClassifier(
objective="binary:logistic", tree_method="hist", n_estimators=1000
objective="binary:logistic",
tree_method="hist",
n_estimators=1000,
early_stopping_rounds=early_stopping_rounds,
)
cls.client = client
cls.fit(
X,
y,
early_stopping_rounds=early_stopping_rounds,
eval_set=[(valid_X, valid_y)],
)
booster = cls.get_booster()
@@ -2165,15 +2171,17 @@ class TestDaskCallbacks:
# Specify the metric
cls = xgb.dask.DaskXGBClassifier(
objective="binary:logistic", tree_method="hist", n_estimators=1000
objective="binary:logistic",
tree_method="hist",
n_estimators=1000,
early_stopping_rounds=early_stopping_rounds,
eval_metric="error",
)
cls.client = client
cls.fit(
X,
y,
early_stopping_rounds=early_stopping_rounds,
eval_set=[(valid_X, valid_y)],
eval_metric="error",
)
assert tm.non_increasing(cls.evals_result()["validation_0"]["error"])
booster = cls.get_booster()
@@ -2215,12 +2223,12 @@ class TestDaskCallbacks:
tree_method="hist",
n_estimators=1000,
eval_metric=tm.eval_error_metric_skl,
early_stopping_rounds=early_stopping_rounds,
)
cls.client = client
cls.fit(
X,
y,
early_stopping_rounds=early_stopping_rounds,
eval_set=[(valid_X, valid_y)],
)
booster = cls.get_booster()
@@ -2234,21 +2242,22 @@ class TestDaskCallbacks:
X, y = load_breast_cancer(return_X_y=True)
X, y = da.from_array(X), da.from_array(y)
cls = xgb.dask.DaskXGBClassifier(
objective="binary:logistic", tree_method="hist", n_estimators=10
)
cls.client = client
with tempfile.TemporaryDirectory() as tmpdir:
cls.fit(
X,
y,
cls = xgb.dask.DaskXGBClassifier(
objective="binary:logistic",
tree_method="hist",
n_estimators=10,
callbacks=[
xgb.callback.TrainingCheckPoint(
directory=Path(tmpdir), interval=1, name="model"
)
],
)
cls.client = client
cls.fit(
X,
y,
)
for i in range(1, 10):
assert os.path.exists(
os.path.join(

View File

@@ -311,24 +311,20 @@ def clf_with_weight(
y_val = np.array([0, 1])
w_train = np.array([1.0, 2.0])
w_val = np.array([1.0, 2.0])
cls2 = XGBClassifier()
cls2 = XGBClassifier(eval_metric="logloss", early_stopping_rounds=1)
cls2.fit(
X_train,
y_train,
eval_set=[(X_val, y_val)],
early_stopping_rounds=1,
eval_metric="logloss",
)
cls3 = XGBClassifier()
cls3 = XGBClassifier(eval_metric="logloss", early_stopping_rounds=1)
cls3.fit(
X_train,
y_train,
sample_weight=w_train,
eval_set=[(X_val, y_val)],
sample_weight_eval_set=[w_val],
early_stopping_rounds=1,
eval_metric="logloss",
)
cls_df_train_with_eval_weight = spark.createDataFrame(