merge latest changes
This commit is contained in:
@@ -4,15 +4,13 @@ facilities.
|
||||
# Directories
|
||||
* ci_build: Test facilities for Jenkins CI and GitHub Actions.
|
||||
* cli: Basic test for command line executable `xgboost`. Most of the other command line
|
||||
specific tests are in Python test `test_cli.py`
|
||||
specific tests are in Python test `test_cli.py`.
|
||||
* cpp: Tests for C++ core, using Google test framework.
|
||||
* python: Tests for Python package, demonstrations and CLI. For how to set up the
|
||||
dependencies for tests, see conda files in `ci_build`.
|
||||
* python-gpu: Similar to python tests, but for GPU.
|
||||
* travis: CI facilities for Travis.
|
||||
* distributed: Test for distributed system.
|
||||
* benchmark: Legacy benchmark code. There are a number of benchmark projects for
|
||||
XGBoost with much better configurations.
|
||||
* test_distributed: Test for distributed systems including spark and dask.
|
||||
|
||||
# Others
|
||||
* pytest.ini: Describes the `pytest` marker for python tests, some markers are generated
|
||||
|
||||
@@ -1,69 +0,0 @@
|
||||
#pylint: skip-file
|
||||
import argparse
|
||||
import xgboost as xgb
|
||||
import numpy as np
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.model_selection import train_test_split
|
||||
import time
|
||||
import ast
|
||||
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
|
||||
def run_benchmark(args):
    """Benchmark training of the linear booster.

    Cached ``dtest.dm`` / ``dtrain.dm`` files in the working directory are
    reused when their shape matches ``args``.  Otherwise a synthetic
    classification problem is generated, split into train/test parts and
    saved under those names so that later runs can skip the generation.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command line options.  The attributes read here are ``rows``,
        ``columns``, ``test_size``, ``sparsity``, ``params``, ``updater`` and
        ``iterations``.
    """
    try:
        dtest = xgb.DMatrix('dtest.dm')
        dtrain = xgb.DMatrix('dtrain.dm')

        if not (dtest.num_col() == args.columns
                and dtrain.num_col() == args.columns):
            raise ValueError("Wrong cols")
        if not (dtest.num_row() == args.rows * args.test_size
                and dtrain.num_row() == args.rows * (1 - args.test_size)):
            raise ValueError("Wrong rows")
    except Exception:  # pylint: disable=broad-except
        # Any failure to load or validate the cache means "regenerate".
        # ``Exception`` instead of a bare ``except`` lets KeyboardInterrupt
        # and SystemExit propagate while the cache is being read.
        print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
        print("{}/{} test/train split".format(args.test_size, 1.0 - args.test_size))
        tmp = time.time()
        X, y = make_classification(n_samples=args.rows, n_features=args.columns,
                                   n_redundant=0, n_informative=args.columns,
                                   n_repeated=0, random_state=7)
        if args.sparsity < 1.0:
            # Draw one uniform number per cell in a single call instead of
            # one Python-level call per cell; cells whose draw falls below
            # the requested sparsity become missing values.
            missing = rng.uniform(0, 1, size=X.shape) < args.sparsity
            X = np.where(missing, np.nan, X)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=args.test_size, random_state=7)
        print("Generate Time: %s seconds" % (str(time.time() - tmp)))
        tmp = time.time()
        print("DMatrix Start")
        dtrain = xgb.DMatrix(X_train, y_train)
        dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
        print("DMatrix Time: %s seconds" % (str(time.time() - tmp)))

        # Persist both matrices so the next run can take the cached path.
        dtest.save_binary('dtest.dm')
        dtrain.save_binary('dtrain.dm')

    param = {'objective': 'binary:logistic', 'booster': 'gblinear'}
    if args.params != '':
        # ``literal_eval`` only accepts Python literals, so the user supplied
        # dict string cannot execute arbitrary code.
        param.update(ast.literal_eval(args.params))

    # The command line updater always wins over one given through --params.
    param['updater'] = args.updater
    print("Training with '%s'" % param['updater'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations, evals=[(dtrain, "train")],
              early_stopping_rounds=args.columns)
    print("Train Time: %s seconds" % (str(time.time() - tmp)))
|
||||
|
||||
def _str_to_bool(value):
    """Convert a command line string to ``bool``.

    ``type=bool`` would treat every non-empty string, including ``"False"``,
    as ``True``; this converter interprets the usual spellings instead.

    Raises
    ------
    argparse.ArgumentTypeError
        If ``value`` is not a recognised boolean spelling.
    """
    lowered = value.strip().lower()
    if lowered in ('1', 'true', 't', 'yes', 'y'):
        return True
    if lowered in ('', '0', 'false', 'f', 'no', 'n'):
        return False
    raise argparse.ArgumentTypeError("Expected a boolean value, got %r" % (value,))


# Command line definition for the linear booster benchmark.  The option names
# and defaults are the script's public interface and are kept unchanged.
parser = argparse.ArgumentParser()
parser.add_argument('--updater', default='coord_descent')
parser.add_argument('--sparsity', type=float, default=0.0)
parser.add_argument('--lambda', type=float, default=1.0)
parser.add_argument('--tol', type=float, default=1e-5)
parser.add_argument('--alpha', type=float, default=1.0)
parser.add_argument('--rows', type=int, default=1000000)
parser.add_argument('--iterations', type=int, default=10000)
parser.add_argument('--columns', type=int, default=50)
parser.add_argument('--test_size', type=float, default=0.25)
parser.add_argument('--standardise', type=_str_to_bool, default=False)
parser.add_argument('--params', default='',
                    help="Provide additional parameters as a Python dict string, "
                         "e.g. --params \"{'max_depth':2}\"")
args = parser.parse_args()

run_benchmark(args)
|
||||
@@ -1,86 +0,0 @@
|
||||
"""Run benchmark on the tree booster."""
|
||||
|
||||
import argparse
|
||||
import ast
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
|
||||
RNG = np.random.RandomState(1994)
|
||||
|
||||
|
||||
def run_benchmark(args):
    """Runs the benchmark.

    Cached ``dtest.dm`` / ``dtrain.dm`` files in the working directory are
    reused when their shape matches ``args``.  Otherwise random data is
    generated, split into train/test parts and saved under those names so
    that later runs can skip the generation.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command line options.  The attributes read here are ``rows``,
        ``columns``, ``test_size``, ``sparsity``, ``params``, ``tree_method``
        and ``iterations``.
    """
    # Row counts are truncated to integers when the data is generated, so the
    # cache check below must compare against the same truncated values.
    train_rows = int(args.rows * (1.0 - args.test_size))
    test_rows = int(args.rows * args.test_size)

    try:
        dtest = xgb.DMatrix('dtest.dm')
        dtrain = xgb.DMatrix('dtrain.dm')

        if not (dtest.num_col() == args.columns
                and dtrain.num_col() == args.columns):
            raise ValueError("Wrong cols")
        if not (dtest.num_row() == test_rows
                and dtrain.num_row() == train_rows):
            raise ValueError("Wrong rows")
    except Exception:  # pylint: disable=broad-except
        # Any failure to load or validate the cache means "regenerate".
        # ``Exception`` instead of a bare ``except`` lets KeyboardInterrupt
        # and SystemExit propagate while the cache is being read.
        print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
        print("{}/{} test/train split".format(args.test_size, 1.0 - args.test_size))
        tmp = time.time()
        X = RNG.rand(args.rows, args.columns)
        y = RNG.randint(0, 2, args.rows)
        if 0.0 < args.sparsity < 1.0:
            # Draw one uniform number per cell in a single call instead of
            # one Python-level call per cell; cells whose draw falls below
            # the requested sparsity become missing values.
            missing = RNG.uniform(0, 1, size=X.shape) < args.sparsity
            X = np.where(missing, np.nan, X)

        X_train = X[:train_rows, :]
        X_test = X[-test_rows:, :]
        y_train = y[:train_rows]
        y_test = y[-test_rows:]
        print("Generate Time: %s seconds" % (str(time.time() - tmp)))
        del X, y

        tmp = time.time()
        print("DMatrix Start")
        dtrain = xgb.DMatrix(X_train, y_train, nthread=-1)
        dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
        print("DMatrix Time: %s seconds" % (str(time.time() - tmp)))
        del X_train, y_train, X_test, y_test

        # Persist both matrices so the next run can take the cached path.
        dtest.save_binary('dtest.dm')
        dtrain.save_binary('dtrain.dm')

    param = {'objective': 'binary:logistic'}
    if args.params != '':
        # ``literal_eval`` only accepts Python literals, so the user supplied
        # dict string cannot execute arbitrary code.
        param.update(ast.literal_eval(args.params))

    # The command line tree method always wins over one given through --params.
    param['tree_method'] = args.tree_method
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
    print("Train Time: %s seconds" % (str(time.time() - tmp)))
|
||||
|
||||
|
||||
def main():
    """Script entry point.

    Registers the command line options, parses them and passes the resulting
    namespace to ``run_benchmark``.
    """
    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_table = (
        ('--tree_method', {'default': 'gpu_hist'}),
        ('--sparsity', {'type': float, 'default': 0.0}),
        ('--rows', {'type': int, 'default': 1000000}),
        ('--columns', {'type': int, 'default': 50}),
        ('--iterations', {'type': int, 'default': 500}),
        ('--test_size', {'type': float, 'default': 0.25}),
        ('--params', {
            'default': '',
            'help': 'Provide additional parameters as a Python dict string, e.g. --params '
                    '\"{\'max_depth\':2}\"',
        }),
    )

    cli = argparse.ArgumentParser()
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)

    run_benchmark(cli.parse_args())


if __name__ == '__main__':
    main()
|
||||
@@ -1,87 +0,0 @@
|
||||
"""Generate synthetic data in LIBSVM format."""
|
||||
|
||||
import argparse
|
||||
import io
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
RNG = np.random.RandomState(2019)
|
||||
|
||||
|
||||
def generate_data(args):
    """Create a synthetic classification dataset and save it as LIBSVM files.

    The samples come from ``make_classification``, are split into train and
    test parts without shuffling, and are written to ``train.libsvm`` and
    ``test.libsvm`` in the current directory.  The elapsed time of each stage
    is printed.

    Parameters
    ----------
    args : argparse.Namespace
        Reads ``rows``, ``columns``, ``sparsity`` and ``test_size``.
    """
    print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
    print("Sparsity {}".format(args.sparsity))
    print("{}/{} train/test split".format(1.0 - args.test_size, args.test_size))

    started = time.time()
    # Share of each feature kind, computed with integer division.
    feature_mix = {
        'n_informative': args.columns * 7 // 10,
        'n_redundant': args.columns // 10,
        'n_repeated': args.columns // 10,
    }
    print("n_informative: {}, n_redundant: {}, n_repeated: {}".format(
        feature_mix['n_informative'], feature_mix['n_redundant'], feature_mix['n_repeated']))
    features, labels = make_classification(n_samples=args.rows, n_features=args.columns,
                                           shuffle=False, random_state=RNG, **feature_mix)
    print("Generate Time: {} seconds".format(time.time() - started))

    started = time.time()
    split = train_test_split(features, labels, test_size=args.test_size,
                             random_state=RNG, shuffle=False)
    train_features, test_features, train_labels, test_labels = split
    print("Train/Test Split Time: {} seconds".format(time.time() - started))

    started = time.time()
    write_file('train.libsvm', train_features, train_labels, args.sparsity)
    print("Write Train Time: {} seconds".format(time.time() - started))

    started = time.time()
    write_file('test.libsvm', test_features, test_labels, args.sparsity)
    print("Write Test Time: {} seconds".format(time.time() - started))
|
||||
|
||||
|
||||
def write_file(filename, x_data, y_data, sparsity):
    """Write every (sample, label) pair to ``filename``, one line per sample.

    ``x_data`` and ``y_data`` are walked in lockstep and each pair is passed
    to ``write_line`` together with ``sparsity``.
    """
    with open(filename, 'w') as sink:
        for sample, label in zip(x_data, y_data):
            write_line(sink, sample, label, sparsity)
|
||||
|
||||
|
||||
def write_line(f, x, y, sparsity):
    """Write one sample to ``f`` as ``<label> <index>:<value> ...``.

    When ``sparsity`` lies strictly between 0 and 1, each feature is kept
    only if a fresh uniform draw from ``RNG`` exceeds ``sparsity``; for any
    other value every feature is written and ``RNG`` is not consulted.
    """
    drop_at_random = 0.0 < sparsity < 1.0
    # Build the line in memory so the file receives it in a single write.
    with io.StringIO() as buffer:
        buffer.write(str(y))
        for position, value in enumerate(x):
            # Short-circuit keeps the RNG untouched unless dropping is enabled.
            if not drop_at_random or RNG.uniform(0, 1) > sparsity:
                write_feature(buffer, position, value)
        buffer.write('\n')
        f.write(buffer.getvalue())
|
||||
|
||||
|
||||
def write_feature(line, index, feature):
    """Append a single `` <index>:<feature>`` entry to the ``line`` buffer."""
    # Leading space separates the entry from whatever was written before it.
    entry = ''.join((' ', str(index), ':', str(feature)))
    line.write(entry)
|
||||
|
||||
|
||||
def main():
    """Script entry point.

    Registers the command line options, parses them and passes the resulting
    namespace to ``generate_data``.
    """
    # (flag, value type, default), in the order shown by --help.
    option_table = (
        ('--rows', int, 1000000),
        ('--columns', int, 50),
        ('--sparsity', float, 0.0),
        ('--test_size', float, 0.01),
    )

    cli = argparse.ArgumentParser()
    for flag, value_type, default in option_table:
        cli.add_argument(flag, type=value_type, default=default)

    generate_data(cli.parse_args())


if __name__ == '__main__':
    main()
|
||||
@@ -8,13 +8,18 @@ echo "--- Build XGBoost JVM packages scala 2.12"
|
||||
tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \
|
||||
${SPARK_VERSION}
|
||||
|
||||
echo "--- Stash XGBoost4J JARs (Scala 2.12)"
|
||||
buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
|
||||
buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
|
||||
buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"
|
||||
buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar"
|
||||
|
||||
echo "--- Build XGBoost JVM packages scala 2.13"
|
||||
|
||||
tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \
|
||||
${SPARK_VERSION} "" "" "true"
|
||||
|
||||
echo "--- Stash XGBoost4J JARs"
|
||||
echo "--- Stash XGBoost4J JARs (Scala 2.13)"
|
||||
buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
|
||||
buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
|
||||
buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
# Build the XGBoost R package with CUDA on Windows and, on release branches,
# upload the resulting tarball(s) to the nightly-builds S3 bucket.
$ErrorActionPreference = "Stop"

# Dot-source the shared CI settings.
# NOTE(review): $is_release_branch is presumably defined there - confirm.
. tests/buildkite/conftest.ps1

Write-Host "--- Build XGBoost R package with CUDA"

nvcc --version
$arch_flag = "-DGPU_COMPUTE_VER=75"

bash tests/ci_build/build_r_pkg_with_cuda_win64.sh $Env:BUILDKITE_COMMIT
# Native commands do not throw on failure, so the exit code is checked by hand.
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }

if ( $is_release_branch -eq 1 ) {
  Write-Host "--- Upload R tarball"
  foreach ($tarball in (Get-ChildItem . -Filter xgboost_r_gpu_win64_*.tar.gz)) {
    & aws s3 cp $tarball s3://xgboost-nightly-builds/$Env:BUILDKITE_BRANCH/ `
      --acl public-read --no-progress
    if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
  }
}
|
||||
@@ -13,11 +13,6 @@ steps:
|
||||
key: build-win64-gpu
|
||||
agents:
|
||||
queue: windows-cpu
|
||||
- label: ":windows: Build XGBoost R package for Windows with CUDA"
|
||||
command: "tests/buildkite/build-rpkg-win64-gpu.ps1"
|
||||
key: build-rpkg-win64-gpu
|
||||
agents:
|
||||
queue: windows-cpu
|
||||
|
||||
- wait
|
||||
|
||||
|
||||
@@ -24,12 +24,13 @@ if [ "x$gpu_arch" != "x" ]; then
|
||||
export GPU_ARCH_FLAG=$gpu_arch
|
||||
fi
|
||||
|
||||
mvn_profile_string=""
|
||||
if [ "x$use_scala213" != "x" ]; then
|
||||
export mvn_profile_string="-Pdefault,scala-2.13"
|
||||
cd ..
|
||||
python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
|
||||
cd jvm-packages
|
||||
fi
|
||||
|
||||
mvn --no-transfer-progress package $mvn_profile_string -Dspark.version=${spark_version} $gpu_options
|
||||
mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options
|
||||
|
||||
set +x
|
||||
set +e
|
||||
|
||||
@@ -1,36 +0,0 @@
|
||||
#!/bin/bash
# Build the XGBoost R package tarball with CUDA code on 64-bit Windows.
#
# Takes exactly one argument, the commit hash, which is embedded in the name
# of the produced archive: xgboost_r_gpu_win64_<commit hash>.tar.gz
set -e
set -x

if [ "$#" -ne 1 ]
then
  # Usage errors go to stderr so they are not mixed into regular build output.
  echo "Build the R package tarball with CUDA code. Usage: $0 [commit hash]" >&2
  exit 1
fi

commit_hash="$1"
# Clear all positional args
set --

source activate
python tests/ci_build/test_r_package.py --task=pack
mv xgboost/ xgboost_rpack/

mkdir build
cd build
cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DR_LIB=ON -DLIBR_HOME="c:\\Program Files\\R\\R-4.3.2" -DCMAKE_PREFIX_PATH="C:\\rtools43\\x86_64-w64-mingw32.static.posix\\bin"
cmake --build . --config Release --parallel
cd ..

# This super wacky hack is found in cmake/RPackageInstall.cmake.in and
# cmake/RPackageInstallTargetSetup.cmake. This hack lets us bypass the normal build process of R
# and have R use xgboost.dll that we've already built.
rm -v xgboost_rpack/configure
rm -rfv xgboost_rpack/src
mkdir -p xgboost_rpack/src
cp -v lib/xgboost.dll xgboost_rpack/src/
echo 'all:' > xgboost_rpack/src/Makefile
echo 'all:' > xgboost_rpack/src/Makefile.win
mv xgboost_rpack/ xgboost/

# Quote the archive name so an unexpected character in the argument cannot
# split it into several words or trigger globbing.
tarball="xgboost_r_gpu_win64_${commit_hash}.tar"
/c/Rtools43/usr/bin/tar -cvf "${tarball}" xgboost/
/c/Rtools43/usr/bin/gzip -9c "${tarball}" > "${tarball}.gz"
|
||||
@@ -27,7 +27,10 @@ rm -rf ../build/
|
||||
# Deploy to S3 bucket xgboost-maven-repo
|
||||
mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests
|
||||
# Deploy scala 2.13 to S3 bucket xgboost-maven-repo
|
||||
mvn --no-transfer-progress package deploy -P release-to-s3,default,scala-2.13 -Dspark.version=${spark_version} -DskipTests
|
||||
cd ..
|
||||
python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
|
||||
cd jvm-packages/
|
||||
mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests
|
||||
|
||||
|
||||
set +x
|
||||
|
||||
@@ -18,14 +18,17 @@ class LintersPaths:
|
||||
"python-package/",
|
||||
# tests
|
||||
"tests/python/test_config.py",
|
||||
"tests/python/test_callback.py",
|
||||
"tests/python/test_data_iterator.py",
|
||||
"tests/python/test_dmatrix.py",
|
||||
"tests/python/test_dt.py",
|
||||
"tests/python/test_demos.py",
|
||||
"tests/python/test_eval_metrics.py",
|
||||
"tests/python/test_multi_target.py",
|
||||
"tests/python/test_predict.py",
|
||||
"tests/python/test_quantile_dmatrix.py",
|
||||
"tests/python/test_tree_regularization.py",
|
||||
"tests/python/test_training_continuation.py",
|
||||
"tests/python/test_shap.py",
|
||||
"tests/python/test_model_io.py",
|
||||
"tests/python/test_with_pandas.py",
|
||||
@@ -39,12 +42,15 @@ class LintersPaths:
|
||||
"demo/dask/",
|
||||
"demo/rmm_plugin",
|
||||
"demo/json-model/json_parser.py",
|
||||
"demo/guide-python/continuation.py",
|
||||
"demo/guide-python/cat_in_the_dat.py",
|
||||
"demo/guide-python/callbacks.py",
|
||||
"demo/guide-python/categorical.py",
|
||||
"demo/guide-python/cat_pipeline.py",
|
||||
"demo/guide-python/feature_weights.py",
|
||||
"demo/guide-python/sklearn_parallel.py",
|
||||
"demo/guide-python/sklearn_examples.py",
|
||||
"demo/guide-python/sklearn_evals_result.py",
|
||||
"demo/guide-python/spark_estimator_examples.py",
|
||||
"demo/guide-python/external_memory.py",
|
||||
"demo/guide-python/individual_trees.py",
|
||||
@@ -86,6 +92,7 @@ class LintersPaths:
|
||||
"tests/python/test_multi_target.py",
|
||||
"tests/python-gpu/test_gpu_data_iterator.py",
|
||||
"tests/python-gpu/load_pickle.py",
|
||||
"tests/python-gpu/test_gpu_training_continuation.py",
|
||||
"tests/python/test_model_io.py",
|
||||
"tests/test_distributed/test_with_spark/test_data.py",
|
||||
"tests/test_distributed/test_gpu_with_spark/test_data.py",
|
||||
@@ -93,6 +100,7 @@ class LintersPaths:
|
||||
# demo
|
||||
"demo/json-model/json_parser.py",
|
||||
"demo/guide-python/external_memory.py",
|
||||
"demo/guide-python/continuation.py",
|
||||
"demo/guide-python/callbacks.py",
|
||||
"demo/guide-python/cat_in_the_dat.py",
|
||||
"demo/guide-python/categorical.py",
|
||||
|
||||
@@ -20,10 +20,11 @@ if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
|
||||
cd $jvm_packages_dir
|
||||
fi
|
||||
|
||||
# including maven profiles for different scala versions: 2.12 is the default at the moment.
|
||||
for _maven_profile_string in "" "-Pdefault,scala-2.13"; do
|
||||
scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout)
|
||||
scala_binary_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.binary.version -q -DforceStdout)
|
||||
for scala_binary_version in "2.12" "2.13"; do
|
||||
cd ..
|
||||
python dev/change_scala_version.py --scala-version ${scala_binary_version}
|
||||
cd jvm-packages
|
||||
scala_version=$(mvn help:evaluate -Dexpression=scala.version -q -DforceStdout)
|
||||
|
||||
# Install XGBoost4J JAR into local Maven repository
|
||||
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
|
||||
|
||||
@@ -253,6 +253,5 @@ void TestColumnSplit(bst_target_t n_targets) {
|
||||
|
||||
TEST(QuantileHist, ColumnSplit) { TestColumnSplit(1); }
|
||||
|
||||
TEST(QuantileHist, ColumnSplitMultiTarget) { TestColumnSplit(3); }
|
||||
|
||||
TEST(QuantileHist, DISABLED_ColumnSplitMultiTarget) { TestColumnSplit(3); }
|
||||
} // namespace xgboost::tree
|
||||
|
||||
@@ -1,18 +1,21 @@
|
||||
/**
|
||||
* Copyright 2020-2023 by XGBoost Contributors
|
||||
* Copyright 2020-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/context.h> // for Context
|
||||
#include <xgboost/task.h> // for ObjInfo
|
||||
#include <xgboost/tree_model.h>
|
||||
#include <xgboost/tree_updater.h>
|
||||
#include <xgboost/context.h> // for Context
|
||||
#include <xgboost/task.h> // for ObjInfo
|
||||
#include <xgboost/tree_model.h> // for RegTree
|
||||
#include <xgboost/tree_updater.h> // for TreeUpdater
|
||||
|
||||
#include <memory> // for unique_ptr
|
||||
#include <memory> // for unique_ptr
|
||||
|
||||
#include "../../../src/tree/param.h" // for TrainParam
|
||||
#include "../helpers.h"
|
||||
|
||||
namespace xgboost {
|
||||
/**
|
||||
* @brief Test the tree statistic (like sum Hessian) is correct.
|
||||
*/
|
||||
class UpdaterTreeStatTest : public ::testing::Test {
|
||||
protected:
|
||||
std::shared_ptr<DMatrix> p_dmat_;
|
||||
@@ -28,13 +31,12 @@ class UpdaterTreeStatTest : public ::testing::Test {
|
||||
gpairs_.Data()->Copy(g);
|
||||
}
|
||||
|
||||
void RunTest(std::string updater) {
|
||||
void RunTest(Context const* ctx, std::string updater) {
|
||||
tree::TrainParam param;
|
||||
ObjInfo task{ObjInfo::kRegression};
|
||||
param.Init(Args{});
|
||||
|
||||
Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal()));
|
||||
auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
|
||||
auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, ctx, &task)};
|
||||
up->Configure(Args{});
|
||||
RegTree tree{1u, kCols};
|
||||
std::vector<HostDeviceVector<bst_node_t>> position(1);
|
||||
@@ -51,77 +53,136 @@ class UpdaterTreeStatTest : public ::testing::Test {
|
||||
};
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
TEST_F(UpdaterTreeStatTest, GpuHist) { this->RunTest("grow_gpu_hist"); }
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
TEST_F(UpdaterTreeStatTest, GpuHist) {
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
this->RunTest(&ctx, "grow_gpu_hist");
|
||||
}
|
||||
|
||||
TEST_F(UpdaterTreeStatTest, Hist) { this->RunTest("grow_quantile_histmaker"); }
|
||||
TEST_F(UpdaterTreeStatTest, GpuApprox) {
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
this->RunTest(&ctx, "grow_gpu_approx");
|
||||
}
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
|
||||
TEST_F(UpdaterTreeStatTest, Exact) { this->RunTest("grow_colmaker"); }
|
||||
TEST_F(UpdaterTreeStatTest, Hist) {
|
||||
Context ctx;
|
||||
this->RunTest(&ctx, "grow_quantile_histmaker");
|
||||
}
|
||||
|
||||
TEST_F(UpdaterTreeStatTest, Approx) { this->RunTest("grow_histmaker"); }
|
||||
TEST_F(UpdaterTreeStatTest, Exact) {
|
||||
Context ctx;
|
||||
this->RunTest(&ctx, "grow_colmaker");
|
||||
}
|
||||
|
||||
class UpdaterEtaTest : public ::testing::Test {
|
||||
TEST_F(UpdaterTreeStatTest, Approx) {
|
||||
Context ctx;
|
||||
this->RunTest(&ctx, "grow_histmaker");
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Test changing learning rate doesn't change internal splits.
|
||||
*/
|
||||
class TestSplitWithEta : public ::testing::Test {
|
||||
protected:
|
||||
std::shared_ptr<DMatrix> p_dmat_;
|
||||
linalg::Matrix<GradientPair> gpairs_;
|
||||
size_t constexpr static kRows = 10;
|
||||
size_t constexpr static kCols = 10;
|
||||
size_t constexpr static kClasses = 10;
|
||||
void Run(Context const* ctx, bst_target_t n_targets, std::string name) {
|
||||
auto Xy = RandomDataGenerator{512, 64, 0.2}.Targets(n_targets).GenerateDMatrix(true);
|
||||
|
||||
void SetUp() override {
|
||||
p_dmat_ = RandomDataGenerator(kRows, kCols, .5f).GenerateDMatrix(true, false, kClasses);
|
||||
auto g = GenerateRandomGradients(kRows);
|
||||
gpairs_.Reshape(kRows, 1);
|
||||
gpairs_.Data()->Copy(g);
|
||||
}
|
||||
auto gen_tree = [&](float eta) {
|
||||
auto tree =
|
||||
std::make_unique<RegTree>(n_targets, static_cast<bst_feature_t>(Xy->Info().num_col_));
|
||||
std::vector<RegTree*> trees{tree.get()};
|
||||
ObjInfo task{ObjInfo::kRegression};
|
||||
std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create(name, ctx, &task)};
|
||||
updater->Configure({});
|
||||
|
||||
void RunTest(std::string updater) {
|
||||
ObjInfo task{ObjInfo::kClassification};
|
||||
auto grad = GenerateRandomGradients(ctx, Xy->Info().num_row_, n_targets);
|
||||
CHECK_EQ(grad.Shape(1), n_targets);
|
||||
tree::TrainParam param;
|
||||
param.Init(Args{{"learning_rate", std::to_string(eta)}});
|
||||
HostDeviceVector<bst_node_t> position;
|
||||
|
||||
Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal()));
|
||||
|
||||
float eta = 0.4;
|
||||
auto up_0 = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
|
||||
up_0->Configure(Args{});
|
||||
tree::TrainParam param0;
|
||||
param0.Init(Args{{"eta", std::to_string(eta)}});
|
||||
|
||||
auto up_1 = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
|
||||
up_1->Configure(Args{{"eta", "1.0"}});
|
||||
tree::TrainParam param1;
|
||||
param1.Init(Args{{"eta", "1.0"}});
|
||||
|
||||
for (size_t iter = 0; iter < 4; ++iter) {
|
||||
RegTree tree_0{1u, kCols};
|
||||
{
|
||||
std::vector<HostDeviceVector<bst_node_t>> position(1);
|
||||
up_0->Update(¶m0, &gpairs_, p_dmat_.get(), position, {&tree_0});
|
||||
updater->Update(¶m, &grad, Xy.get(), common::Span{&position, 1}, trees);
|
||||
CHECK_EQ(tree->NumTargets(), n_targets);
|
||||
if (n_targets > 1) {
|
||||
CHECK(tree->IsMultiTarget());
|
||||
}
|
||||
return tree;
|
||||
};
|
||||
|
||||
RegTree tree_1{1u, kCols};
|
||||
{
|
||||
std::vector<HostDeviceVector<bst_node_t>> position(1);
|
||||
up_1->Update(¶m1, &gpairs_, p_dmat_.get(), position, {&tree_1});
|
||||
}
|
||||
tree_0.WalkTree([&](bst_node_t nidx) {
|
||||
if (tree_0[nidx].IsLeaf()) {
|
||||
EXPECT_NEAR(tree_1[nidx].LeafValue() * eta, tree_0[nidx].LeafValue(), kRtEps);
|
||||
auto eta_ratio = 8.0f;
|
||||
auto p_tree0 = gen_tree(0.1f);
|
||||
auto p_tree1 = gen_tree(0.1f * eta_ratio);
|
||||
// Just to make sure we are not testing a stump.
|
||||
CHECK_GE(p_tree0->NumExtraNodes(), 32);
|
||||
|
||||
bst_node_t n_nodes{0};
|
||||
p_tree0->WalkTree([&](bst_node_t nidx) {
|
||||
if (p_tree0->IsLeaf(nidx)) {
|
||||
CHECK(p_tree1->IsLeaf(nidx));
|
||||
if (p_tree0->IsMultiTarget()) {
|
||||
CHECK(p_tree1->IsMultiTarget());
|
||||
auto leaf_0 = p_tree0->GetMultiTargetTree()->LeafValue(nidx);
|
||||
auto leaf_1 = p_tree1->GetMultiTargetTree()->LeafValue(nidx);
|
||||
CHECK_EQ(leaf_0.Size(), leaf_1.Size());
|
||||
for (std::size_t i = 0; i < leaf_0.Size(); ++i) {
|
||||
CHECK_EQ(leaf_0(i) * eta_ratio, leaf_1(i));
|
||||
}
|
||||
CHECK(std::isnan(p_tree0->SplitCond(nidx)));
|
||||
CHECK(std::isnan(p_tree1->SplitCond(nidx)));
|
||||
} else {
|
||||
// NON-mt tree reuses split cond for leaf value.
|
||||
auto leaf_0 = p_tree0->SplitCond(nidx);
|
||||
auto leaf_1 = p_tree1->SplitCond(nidx);
|
||||
CHECK_EQ(leaf_0 * eta_ratio, leaf_1);
|
||||
}
|
||||
return true;
|
||||
});
|
||||
}
|
||||
} else {
|
||||
CHECK(!p_tree1->IsLeaf(nidx));
|
||||
CHECK_EQ(p_tree0->SplitCond(nidx), p_tree1->SplitCond(nidx));
|
||||
}
|
||||
n_nodes++;
|
||||
return true;
|
||||
});
|
||||
ASSERT_EQ(n_nodes, p_tree0->NumExtraNodes() + 1);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(UpdaterEtaTest, Hist) { this->RunTest("grow_quantile_histmaker"); }
|
||||
TEST_F(TestSplitWithEta, HistMulti) {
|
||||
Context ctx;
|
||||
bst_target_t n_targets{3};
|
||||
this->Run(&ctx, n_targets, "grow_quantile_histmaker");
|
||||
}
|
||||
|
||||
TEST_F(UpdaterEtaTest, Exact) { this->RunTest("grow_colmaker"); }
|
||||
TEST_F(TestSplitWithEta, Hist) {
|
||||
Context ctx;
|
||||
bst_target_t n_targets{1};
|
||||
this->Run(&ctx, n_targets, "grow_quantile_histmaker");
|
||||
}
|
||||
|
||||
TEST_F(UpdaterEtaTest, Approx) { this->RunTest("grow_histmaker"); }
|
||||
TEST_F(TestSplitWithEta, Approx) {
|
||||
Context ctx;
|
||||
bst_target_t n_targets{1};
|
||||
this->Run(&ctx, n_targets, "grow_histmaker");
|
||||
}
|
||||
|
||||
TEST_F(TestSplitWithEta, Exact) {
|
||||
Context ctx;
|
||||
bst_target_t n_targets{1};
|
||||
this->Run(&ctx, n_targets, "grow_colmaker");
|
||||
}
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
TEST_F(UpdaterEtaTest, GpuHist) { this->RunTest("grow_gpu_hist"); }
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
TEST_F(TestSplitWithEta, GpuHist) {
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
bst_target_t n_targets{1};
|
||||
this->Run(&ctx, n_targets, "grow_gpu_hist");
|
||||
}
|
||||
|
||||
TEST_F(TestSplitWithEta, GpuApprox) {
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
bst_target_t n_targets{1};
|
||||
this->Run(&ctx, n_targets, "grow_gpu_approx");
|
||||
}
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
|
||||
class TestMinSplitLoss : public ::testing::Test {
|
||||
std::shared_ptr<DMatrix> dmat_;
|
||||
|
||||
@@ -1,54 +1,12 @@
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost.testing.continuation import run_training_continuation_model_output
|
||||
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
|
||||
class TestGPUTrainingContinuation:
|
||||
def test_training_continuation(self):
|
||||
kRows = 64
|
||||
kCols = 32
|
||||
X = np.random.randn(kRows, kCols)
|
||||
y = np.random.randn(kRows)
|
||||
dtrain = xgb.DMatrix(X, y)
|
||||
params = {
|
||||
"tree_method": "gpu_hist",
|
||||
"max_depth": "2",
|
||||
"gamma": "0.1",
|
||||
"alpha": "0.01",
|
||||
}
|
||||
bst_0 = xgb.train(params, dtrain, num_boost_round=64)
|
||||
dump_0 = bst_0.get_dump(dump_format="json")
|
||||
|
||||
bst_1 = xgb.train(params, dtrain, num_boost_round=32)
|
||||
bst_1 = xgb.train(params, dtrain, num_boost_round=32, xgb_model=bst_1)
|
||||
dump_1 = bst_1.get_dump(dump_format="json")
|
||||
|
||||
def recursive_compare(obj_0, obj_1):
|
||||
if isinstance(obj_0, float):
|
||||
assert np.isclose(obj_0, obj_1, atol=1e-6)
|
||||
elif isinstance(obj_0, str):
|
||||
assert obj_0 == obj_1
|
||||
elif isinstance(obj_0, int):
|
||||
assert obj_0 == obj_1
|
||||
elif isinstance(obj_0, dict):
|
||||
keys_0 = list(obj_0.keys())
|
||||
keys_1 = list(obj_1.keys())
|
||||
values_0 = list(obj_0.values())
|
||||
values_1 = list(obj_1.values())
|
||||
for i in range(len(obj_0.items())):
|
||||
assert keys_0[i] == keys_1[i]
|
||||
if list(obj_0.keys())[i] != "missing":
|
||||
recursive_compare(values_0[i], values_1[i])
|
||||
else:
|
||||
for i in range(len(obj_0)):
|
||||
recursive_compare(obj_0[i], obj_1[i])
|
||||
|
||||
assert len(dump_0) == len(dump_1)
|
||||
for i in range(len(dump_0)):
|
||||
obj_0 = json.loads(dump_0[i])
|
||||
obj_1 = json.loads(dump_1[i])
|
||||
recursive_compare(obj_0, obj_1)
|
||||
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
|
||||
def test_model_output(self, tree_method: str) -> None:
|
||||
run_training_continuation_model_output("cuda", tree_method)
|
||||
|
||||
@@ -16,13 +16,14 @@ class TestCallbacks:
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
cls.X = X
|
||||
cls.y = y
|
||||
|
||||
split = int(X.shape[0]*0.8)
|
||||
cls.X_train = X[: split, ...]
|
||||
cls.y_train = y[: split, ...]
|
||||
split = int(X.shape[0] * 0.8)
|
||||
cls.X_train = X[:split, ...]
|
||||
cls.y_train = y[:split, ...]
|
||||
cls.X_valid = X[split:, ...]
|
||||
cls.y_valid = y[split:, ...]
|
||||
|
||||
@@ -31,31 +32,32 @@ class TestCallbacks:
|
||||
D_train: xgb.DMatrix,
|
||||
D_valid: xgb.DMatrix,
|
||||
rounds: int,
|
||||
verbose_eval: Union[bool, int]
|
||||
verbose_eval: Union[bool, int],
|
||||
):
|
||||
def check_output(output: str) -> None:
|
||||
if int(verbose_eval) == 1:
|
||||
# Should print each iteration info
|
||||
assert len(output.split('\n')) == rounds
|
||||
assert len(output.split("\n")) == rounds
|
||||
elif int(verbose_eval) > rounds:
|
||||
# Should print first and latest iteration info
|
||||
assert len(output.split('\n')) == 2
|
||||
assert len(output.split("\n")) == 2
|
||||
else:
|
||||
# Should print info by each period additionaly to first and latest
|
||||
# iteration
|
||||
num_periods = rounds // int(verbose_eval)
|
||||
# Extra information is required for latest iteration
|
||||
is_extra_info_required = num_periods * int(verbose_eval) < (rounds - 1)
|
||||
assert len(output.split('\n')) == (
|
||||
assert len(output.split("\n")) == (
|
||||
1 + num_periods + int(is_extra_info_required)
|
||||
)
|
||||
|
||||
evals_result: xgb.callback.TrainingCallback.EvalsLog = {}
|
||||
params = {'objective': 'binary:logistic', 'eval_metric': 'error'}
|
||||
params = {"objective": "binary:logistic", "eval_metric": "error"}
|
||||
with tm.captured_output() as (out, err):
|
||||
xgb.train(
|
||||
params, D_train,
|
||||
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
|
||||
params,
|
||||
D_train,
|
||||
evals=[(D_train, "Train"), (D_valid, "Valid")],
|
||||
num_boost_round=rounds,
|
||||
evals_result=evals_result,
|
||||
verbose_eval=verbose_eval,
|
||||
@@ -73,14 +75,16 @@ class TestCallbacks:
|
||||
D_valid = xgb.DMatrix(self.X_valid, self.y_valid)
|
||||
evals_result = {}
|
||||
rounds = 10
|
||||
xgb.train({'objective': 'binary:logistic',
|
||||
'eval_metric': 'error'}, D_train,
|
||||
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
|
||||
num_boost_round=rounds,
|
||||
evals_result=evals_result,
|
||||
verbose_eval=True)
|
||||
assert len(evals_result['Train']['error']) == rounds
|
||||
assert len(evals_result['Valid']['error']) == rounds
|
||||
xgb.train(
|
||||
{"objective": "binary:logistic", "eval_metric": "error"},
|
||||
D_train,
|
||||
evals=[(D_train, "Train"), (D_valid, "Valid")],
|
||||
num_boost_round=rounds,
|
||||
evals_result=evals_result,
|
||||
verbose_eval=True,
|
||||
)
|
||||
assert len(evals_result["Train"]["error"]) == rounds
|
||||
assert len(evals_result["Valid"]["error"]) == rounds
|
||||
|
||||
self.run_evaluation_monitor(D_train, D_valid, rounds, True)
|
||||
self.run_evaluation_monitor(D_train, D_valid, rounds, 2)
|
||||
@@ -93,72 +97,83 @@ class TestCallbacks:
|
||||
evals_result = {}
|
||||
rounds = 30
|
||||
early_stopping_rounds = 5
|
||||
booster = xgb.train({'objective': 'binary:logistic',
|
||||
'eval_metric': 'error'}, D_train,
|
||||
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
|
||||
num_boost_round=rounds,
|
||||
evals_result=evals_result,
|
||||
verbose_eval=True,
|
||||
early_stopping_rounds=early_stopping_rounds)
|
||||
dump = booster.get_dump(dump_format='json')
|
||||
booster = xgb.train(
|
||||
{"objective": "binary:logistic", "eval_metric": "error"},
|
||||
D_train,
|
||||
evals=[(D_train, "Train"), (D_valid, "Valid")],
|
||||
num_boost_round=rounds,
|
||||
evals_result=evals_result,
|
||||
verbose_eval=True,
|
||||
early_stopping_rounds=early_stopping_rounds,
|
||||
)
|
||||
dump = booster.get_dump(dump_format="json")
|
||||
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
|
||||
|
||||
def test_early_stopping_custom_eval(self):
|
||||
D_train = xgb.DMatrix(self.X_train, self.y_train)
|
||||
D_valid = xgb.DMatrix(self.X_valid, self.y_valid)
|
||||
early_stopping_rounds = 5
|
||||
booster = xgb.train({'objective': 'binary:logistic',
|
||||
'eval_metric': 'error',
|
||||
'tree_method': 'hist'}, D_train,
|
||||
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
|
||||
feval=tm.eval_error_metric,
|
||||
num_boost_round=1000,
|
||||
early_stopping_rounds=early_stopping_rounds,
|
||||
verbose_eval=False)
|
||||
dump = booster.get_dump(dump_format='json')
|
||||
booster = xgb.train(
|
||||
{
|
||||
"objective": "binary:logistic",
|
||||
"eval_metric": "error",
|
||||
"tree_method": "hist",
|
||||
},
|
||||
D_train,
|
||||
evals=[(D_train, "Train"), (D_valid, "Valid")],
|
||||
feval=tm.eval_error_metric,
|
||||
num_boost_round=1000,
|
||||
early_stopping_rounds=early_stopping_rounds,
|
||||
verbose_eval=False,
|
||||
)
|
||||
dump = booster.get_dump(dump_format="json")
|
||||
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
|
||||
|
||||
def test_early_stopping_customize(self):
|
||||
D_train = xgb.DMatrix(self.X_train, self.y_train)
|
||||
D_valid = xgb.DMatrix(self.X_valid, self.y_valid)
|
||||
early_stopping_rounds = 5
|
||||
early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
|
||||
metric_name='CustomErr',
|
||||
data_name='Train')
|
||||
early_stop = xgb.callback.EarlyStopping(
|
||||
rounds=early_stopping_rounds, metric_name="CustomErr", data_name="Train"
|
||||
)
|
||||
# Specify which dataset and which metric should be used for early stopping.
|
||||
booster = xgb.train(
|
||||
{'objective': 'binary:logistic',
|
||||
'eval_metric': ['error', 'rmse'],
|
||||
'tree_method': 'hist'}, D_train,
|
||||
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
|
||||
{
|
||||
"objective": "binary:logistic",
|
||||
"eval_metric": ["error", "rmse"],
|
||||
"tree_method": "hist",
|
||||
},
|
||||
D_train,
|
||||
evals=[(D_train, "Train"), (D_valid, "Valid")],
|
||||
feval=tm.eval_error_metric,
|
||||
num_boost_round=1000,
|
||||
callbacks=[early_stop],
|
||||
verbose_eval=False)
|
||||
dump = booster.get_dump(dump_format='json')
|
||||
verbose_eval=False,
|
||||
)
|
||||
dump = booster.get_dump(dump_format="json")
|
||||
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
|
||||
assert len(early_stop.stopping_history['Train']['CustomErr']) == len(dump)
|
||||
assert len(early_stop.stopping_history["Train"]["CustomErr"]) == len(dump)
|
||||
|
||||
rounds = 100
|
||||
early_stop = xgb.callback.EarlyStopping(
|
||||
rounds=early_stopping_rounds,
|
||||
metric_name='CustomErr',
|
||||
data_name='Train',
|
||||
metric_name="CustomErr",
|
||||
data_name="Train",
|
||||
min_delta=100,
|
||||
save_best=True,
|
||||
)
|
||||
booster = xgb.train(
|
||||
{
|
||||
'objective': 'binary:logistic',
|
||||
'eval_metric': ['error', 'rmse'],
|
||||
'tree_method': 'hist'
|
||||
"objective": "binary:logistic",
|
||||
"eval_metric": ["error", "rmse"],
|
||||
"tree_method": "hist",
|
||||
},
|
||||
D_train,
|
||||
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
|
||||
evals=[(D_train, "Train"), (D_valid, "Valid")],
|
||||
feval=tm.eval_error_metric,
|
||||
num_boost_round=rounds,
|
||||
callbacks=[early_stop],
|
||||
verbose_eval=False
|
||||
verbose_eval=False,
|
||||
)
|
||||
# No iteration can be made with min_delta == 100
|
||||
assert booster.best_iteration == 0
|
||||
@@ -166,18 +181,20 @@ class TestCallbacks:
|
||||
|
||||
def test_early_stopping_skl(self):
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
early_stopping_rounds = 5
|
||||
cls = xgb.XGBClassifier(
|
||||
early_stopping_rounds=early_stopping_rounds, eval_metric='error'
|
||||
early_stopping_rounds=early_stopping_rounds, eval_metric="error"
|
||||
)
|
||||
cls.fit(X, y, eval_set=[(X, y)])
|
||||
booster = cls.get_booster()
|
||||
dump = booster.get_dump(dump_format='json')
|
||||
dump = booster.get_dump(dump_format="json")
|
||||
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
|
||||
|
||||
def test_early_stopping_custom_eval_skl(self):
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
early_stopping_rounds = 5
|
||||
early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds)
|
||||
@@ -186,11 +203,12 @@ class TestCallbacks:
|
||||
)
|
||||
cls.fit(X, y, eval_set=[(X, y)])
|
||||
booster = cls.get_booster()
|
||||
dump = booster.get_dump(dump_format='json')
|
||||
dump = booster.get_dump(dump_format="json")
|
||||
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
|
||||
|
||||
def test_early_stopping_save_best_model(self):
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
n_estimators = 100
|
||||
early_stopping_rounds = 5
|
||||
@@ -200,11 +218,11 @@ class TestCallbacks:
|
||||
cls = xgb.XGBClassifier(
|
||||
n_estimators=n_estimators,
|
||||
eval_metric=tm.eval_error_metric_skl,
|
||||
callbacks=[early_stop]
|
||||
callbacks=[early_stop],
|
||||
)
|
||||
cls.fit(X, y, eval_set=[(X, y)])
|
||||
booster = cls.get_booster()
|
||||
dump = booster.get_dump(dump_format='json')
|
||||
dump = booster.get_dump(dump_format="json")
|
||||
assert len(dump) == booster.best_iteration + 1
|
||||
|
||||
early_stop = xgb.callback.EarlyStopping(
|
||||
@@ -220,8 +238,9 @@ class TestCallbacks:
|
||||
cls.fit(X, y, eval_set=[(X, y)])
|
||||
|
||||
# No error
|
||||
early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
|
||||
save_best=False)
|
||||
early_stop = xgb.callback.EarlyStopping(
|
||||
rounds=early_stopping_rounds, save_best=False
|
||||
)
|
||||
xgb.XGBClassifier(
|
||||
booster="gblinear",
|
||||
n_estimators=10,
|
||||
@@ -231,14 +250,17 @@ class TestCallbacks:
|
||||
|
||||
def test_early_stopping_continuation(self):
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
cls = xgb.XGBClassifier(eval_metric=tm.eval_error_metric_skl)
|
||||
|
||||
early_stopping_rounds = 5
|
||||
early_stop = xgb.callback.EarlyStopping(
|
||||
rounds=early_stopping_rounds, save_best=True
|
||||
)
|
||||
with pytest.warns(UserWarning):
|
||||
cls.fit(X, y, eval_set=[(X, y)], callbacks=[early_stop])
|
||||
cls = xgb.XGBClassifier(
|
||||
eval_metric=tm.eval_error_metric_skl, callbacks=[early_stop]
|
||||
)
|
||||
cls.fit(X, y, eval_set=[(X, y)])
|
||||
|
||||
booster = cls.get_booster()
|
||||
assert booster.num_boosted_rounds() == booster.best_iteration + 1
|
||||
@@ -256,21 +278,10 @@ class TestCallbacks:
|
||||
)
|
||||
cls.fit(X, y, eval_set=[(X, y)])
|
||||
booster = cls.get_booster()
|
||||
assert booster.num_boosted_rounds() == \
|
||||
booster.best_iteration + early_stopping_rounds + 1
|
||||
|
||||
def test_deprecated(self):
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
early_stopping_rounds = 5
|
||||
early_stop = xgb.callback.EarlyStopping(
|
||||
rounds=early_stopping_rounds, save_best=True
|
||||
)
|
||||
clf = xgb.XGBClassifier(
|
||||
eval_metric=tm.eval_error_metric_skl, callbacks=[early_stop]
|
||||
)
|
||||
with pytest.raises(ValueError, match=r".*set_params.*"):
|
||||
clf.fit(X, y, eval_set=[(X, y)], callbacks=[early_stop])
|
||||
assert (
|
||||
booster.num_boosted_rounds()
|
||||
== booster.best_iteration + early_stopping_rounds + 1
|
||||
)
|
||||
|
||||
def run_eta_decay(self, tree_method):
|
||||
"""Test learning rate scheduler, used by both CPU and GPU tests."""
|
||||
@@ -343,7 +354,7 @@ class TestCallbacks:
|
||||
callbacks=[scheduler([0, 0, 0, 0])],
|
||||
evals_result=evals_result,
|
||||
)
|
||||
eval_errors_2 = list(map(float, evals_result['eval']['error']))
|
||||
eval_errors_2 = list(map(float, evals_result["eval"]["error"]))
|
||||
assert isinstance(bst, xgb.core.Booster)
|
||||
# validation error should not decrease, if eta/learning_rate = 0
|
||||
assert eval_errors_2[0] == eval_errors_2[-1]
|
||||
@@ -361,7 +372,7 @@ class TestCallbacks:
|
||||
callbacks=[scheduler(eta_decay)],
|
||||
evals_result=evals_result,
|
||||
)
|
||||
eval_errors_3 = list(map(float, evals_result['eval']['error']))
|
||||
eval_errors_3 = list(map(float, evals_result["eval"]["error"]))
|
||||
|
||||
assert isinstance(bst, xgb.core.Booster)
|
||||
|
||||
|
||||
@@ -15,23 +15,23 @@ class TestEarlyStopping:
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
digits = load_digits(n_class=2)
|
||||
X = digits['data']
|
||||
y = digits['target']
|
||||
X = digits["data"]
|
||||
y = digits["target"]
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
|
||||
clf1 = xgb.XGBClassifier(learning_rate=0.1)
|
||||
clf1.fit(X_train, y_train, early_stopping_rounds=5, eval_metric="auc",
|
||||
eval_set=[(X_test, y_test)])
|
||||
clf2 = xgb.XGBClassifier(learning_rate=0.1)
|
||||
clf2.fit(X_train, y_train, early_stopping_rounds=4, eval_metric="auc",
|
||||
eval_set=[(X_test, y_test)])
|
||||
clf1 = xgb.XGBClassifier(
|
||||
learning_rate=0.1, early_stopping_rounds=5, eval_metric="auc"
|
||||
)
|
||||
clf1.fit(X_train, y_train, eval_set=[(X_test, y_test)])
|
||||
clf2 = xgb.XGBClassifier(
|
||||
learning_rate=0.1, early_stopping_rounds=4, eval_metric="auc"
|
||||
)
|
||||
clf2.fit(X_train, y_train, eval_set=[(X_test, y_test)])
|
||||
# should be the same
|
||||
assert clf1.best_score == clf2.best_score
|
||||
assert clf1.best_score != 1
|
||||
# check overfit
|
||||
clf3 = xgb.XGBClassifier(
|
||||
learning_rate=0.1,
|
||||
eval_metric="auc",
|
||||
early_stopping_rounds=10
|
||||
learning_rate=0.1, eval_metric="auc", early_stopping_rounds=10
|
||||
)
|
||||
clf3.fit(X_train, y_train, eval_set=[(X_test, y_test)])
|
||||
base_score = get_basescore(clf3)
|
||||
@@ -39,9 +39,9 @@ class TestEarlyStopping:
|
||||
|
||||
clf3 = xgb.XGBClassifier(
|
||||
learning_rate=0.1,
|
||||
base_score=.5,
|
||||
base_score=0.5,
|
||||
eval_metric="auc",
|
||||
early_stopping_rounds=10
|
||||
early_stopping_rounds=10,
|
||||
)
|
||||
clf3.fit(X_train, y_train, eval_set=[(X_test, y_test)])
|
||||
|
||||
|
||||
@@ -9,37 +9,41 @@ rng = np.random.RandomState(1337)
|
||||
|
||||
|
||||
class TestEvalMetrics:
|
||||
xgb_params_01 = {'nthread': 1, 'eval_metric': 'error'}
|
||||
xgb_params_01 = {"nthread": 1, "eval_metric": "error"}
|
||||
|
||||
xgb_params_02 = {'nthread': 1, 'eval_metric': ['error']}
|
||||
xgb_params_02 = {"nthread": 1, "eval_metric": ["error"]}
|
||||
|
||||
xgb_params_03 = {'nthread': 1, 'eval_metric': ['rmse', 'error']}
|
||||
xgb_params_03 = {"nthread": 1, "eval_metric": ["rmse", "error"]}
|
||||
|
||||
xgb_params_04 = {'nthread': 1, 'eval_metric': ['error', 'rmse']}
|
||||
xgb_params_04 = {"nthread": 1, "eval_metric": ["error", "rmse"]}
|
||||
|
||||
def evalerror_01(self, preds, dtrain):
|
||||
labels = dtrain.get_label()
|
||||
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
|
||||
return "error", float(sum(labels != (preds > 0.0))) / len(labels)
|
||||
|
||||
def evalerror_02(self, preds, dtrain):
|
||||
labels = dtrain.get_label()
|
||||
return [('error', float(sum(labels != (preds > 0.0))) / len(labels))]
|
||||
return [("error", float(sum(labels != (preds > 0.0))) / len(labels))]
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def evalerror_03(self, preds, dtrain):
|
||||
from sklearn.metrics import mean_squared_error
|
||||
|
||||
labels = dtrain.get_label()
|
||||
return [('rmse', mean_squared_error(labels, preds)),
|
||||
('error', float(sum(labels != (preds > 0.0))) / len(labels))]
|
||||
return [
|
||||
("rmse", mean_squared_error(labels, preds)),
|
||||
("error", float(sum(labels != (preds > 0.0))) / len(labels)),
|
||||
]
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def evalerror_04(self, preds, dtrain):
|
||||
from sklearn.metrics import mean_squared_error
|
||||
|
||||
labels = dtrain.get_label()
|
||||
return [('error', float(sum(labels != (preds > 0.0))) / len(labels)),
|
||||
('rmse', mean_squared_error(labels, preds))]
|
||||
return [
|
||||
("error", float(sum(labels != (preds > 0.0))) / len(labels)),
|
||||
("rmse", mean_squared_error(labels, preds)),
|
||||
]
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_eval_metrics(self):
|
||||
@@ -50,15 +54,15 @@ class TestEvalMetrics:
|
||||
from sklearn.datasets import load_digits
|
||||
|
||||
digits = load_digits(n_class=2)
|
||||
X = digits['data']
|
||||
y = digits['target']
|
||||
X = digits["data"]
|
||||
y = digits["target"]
|
||||
|
||||
Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2, random_state=0)
|
||||
|
||||
dtrain = xgb.DMatrix(Xt, label=yt)
|
||||
dvalid = xgb.DMatrix(Xv, label=yv)
|
||||
|
||||
watchlist = [(dtrain, 'train'), (dvalid, 'val')]
|
||||
watchlist = [(dtrain, "train"), (dvalid, "val")]
|
||||
|
||||
gbdt_01 = xgb.train(self.xgb_params_01, dtrain, num_boost_round=10)
|
||||
gbdt_02 = xgb.train(self.xgb_params_02, dtrain, num_boost_round=10)
|
||||
@@ -66,26 +70,54 @@ class TestEvalMetrics:
|
||||
assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
|
||||
assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
|
||||
|
||||
gbdt_01 = xgb.train(self.xgb_params_01, dtrain, 10, watchlist,
|
||||
early_stopping_rounds=2)
|
||||
gbdt_02 = xgb.train(self.xgb_params_02, dtrain, 10, watchlist,
|
||||
early_stopping_rounds=2)
|
||||
gbdt_03 = xgb.train(self.xgb_params_03, dtrain, 10, watchlist,
|
||||
early_stopping_rounds=2)
|
||||
gbdt_04 = xgb.train(self.xgb_params_04, dtrain, 10, watchlist,
|
||||
early_stopping_rounds=2)
|
||||
gbdt_01 = xgb.train(
|
||||
self.xgb_params_01, dtrain, 10, watchlist, early_stopping_rounds=2
|
||||
)
|
||||
gbdt_02 = xgb.train(
|
||||
self.xgb_params_02, dtrain, 10, watchlist, early_stopping_rounds=2
|
||||
)
|
||||
gbdt_03 = xgb.train(
|
||||
self.xgb_params_03, dtrain, 10, watchlist, early_stopping_rounds=2
|
||||
)
|
||||
gbdt_04 = xgb.train(
|
||||
self.xgb_params_04, dtrain, 10, watchlist, early_stopping_rounds=2
|
||||
)
|
||||
assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
|
||||
assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
|
||||
assert gbdt_03.predict(dvalid)[0] != gbdt_04.predict(dvalid)[0]
|
||||
|
||||
gbdt_01 = xgb.train(self.xgb_params_01, dtrain, 10, watchlist,
|
||||
early_stopping_rounds=2, feval=self.evalerror_01)
|
||||
gbdt_02 = xgb.train(self.xgb_params_02, dtrain, 10, watchlist,
|
||||
early_stopping_rounds=2, feval=self.evalerror_02)
|
||||
gbdt_03 = xgb.train(self.xgb_params_03, dtrain, 10, watchlist,
|
||||
early_stopping_rounds=2, feval=self.evalerror_03)
|
||||
gbdt_04 = xgb.train(self.xgb_params_04, dtrain, 10, watchlist,
|
||||
early_stopping_rounds=2, feval=self.evalerror_04)
|
||||
gbdt_01 = xgb.train(
|
||||
self.xgb_params_01,
|
||||
dtrain,
|
||||
10,
|
||||
watchlist,
|
||||
early_stopping_rounds=2,
|
||||
feval=self.evalerror_01,
|
||||
)
|
||||
gbdt_02 = xgb.train(
|
||||
self.xgb_params_02,
|
||||
dtrain,
|
||||
10,
|
||||
watchlist,
|
||||
early_stopping_rounds=2,
|
||||
feval=self.evalerror_02,
|
||||
)
|
||||
gbdt_03 = xgb.train(
|
||||
self.xgb_params_03,
|
||||
dtrain,
|
||||
10,
|
||||
watchlist,
|
||||
early_stopping_rounds=2,
|
||||
feval=self.evalerror_03,
|
||||
)
|
||||
gbdt_04 = xgb.train(
|
||||
self.xgb_params_04,
|
||||
dtrain,
|
||||
10,
|
||||
watchlist,
|
||||
early_stopping_rounds=2,
|
||||
feval=self.evalerror_04,
|
||||
)
|
||||
assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
|
||||
assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
|
||||
assert gbdt_03.predict(dvalid)[0] != gbdt_04.predict(dvalid)[0]
|
||||
@@ -93,6 +125,7 @@ class TestEvalMetrics:
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_gamma_deviance(self):
|
||||
from sklearn.metrics import mean_gamma_deviance
|
||||
|
||||
rng = np.random.RandomState(1994)
|
||||
n_samples = 100
|
||||
n_features = 30
|
||||
@@ -101,8 +134,13 @@ class TestEvalMetrics:
|
||||
y = rng.randn(n_samples)
|
||||
y = y - y.min() * 100
|
||||
|
||||
reg = xgb.XGBRegressor(tree_method="hist", objective="reg:gamma", n_estimators=10)
|
||||
reg.fit(X, y, eval_metric="gamma-deviance")
|
||||
reg = xgb.XGBRegressor(
|
||||
tree_method="hist",
|
||||
objective="reg:gamma",
|
||||
n_estimators=10,
|
||||
eval_metric="gamma-deviance",
|
||||
)
|
||||
reg.fit(X, y)
|
||||
|
||||
booster = reg.get_booster()
|
||||
score = reg.predict(X)
|
||||
@@ -113,16 +151,26 @@ class TestEvalMetrics:
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_gamma_lik(self) -> None:
|
||||
import scipy.stats as stats
|
||||
|
||||
rng = np.random.default_rng(1994)
|
||||
n_samples = 32
|
||||
n_features = 10
|
||||
|
||||
X = rng.normal(0, 1, size=n_samples * n_features).reshape((n_samples, n_features))
|
||||
X = rng.normal(0, 1, size=n_samples * n_features).reshape(
|
||||
(n_samples, n_features)
|
||||
)
|
||||
|
||||
alpha, loc, beta = 5.0, 11.1, 22
|
||||
y = stats.gamma.rvs(alpha, loc=loc, scale=beta, size=n_samples, random_state=rng)
|
||||
reg = xgb.XGBRegressor(tree_method="hist", objective="reg:gamma", n_estimators=64)
|
||||
reg.fit(X, y, eval_metric="gamma-nloglik", eval_set=[(X, y)])
|
||||
y = stats.gamma.rvs(
|
||||
alpha, loc=loc, scale=beta, size=n_samples, random_state=rng
|
||||
)
|
||||
reg = xgb.XGBRegressor(
|
||||
tree_method="hist",
|
||||
objective="reg:gamma",
|
||||
n_estimators=64,
|
||||
eval_metric="gamma-nloglik",
|
||||
)
|
||||
reg.fit(X, y, eval_set=[(X, y)])
|
||||
|
||||
score = reg.predict(X)
|
||||
|
||||
@@ -134,7 +182,7 @@ class TestEvalMetrics:
|
||||
# XGBoost uses the canonical link function of gamma in evaluation function.
|
||||
# so \theta = - (1.0 / y)
|
||||
# dispersion is hardcoded as 1.0, so shape (a in scipy parameter) is also 1.0
|
||||
beta = - (1.0 / (- (1.0 / y))) # == y
|
||||
beta = -(1.0 / (-(1.0 / y))) # == y
|
||||
nloglik_stats = -stats.gamma.logpdf(score, a=1.0, scale=beta)
|
||||
|
||||
np.testing.assert_allclose(nloglik, np.mean(nloglik_stats), rtol=1e-3)
|
||||
@@ -153,7 +201,7 @@ class TestEvalMetrics:
|
||||
n_features,
|
||||
n_informative=n_features,
|
||||
n_redundant=0,
|
||||
random_state=rng
|
||||
random_state=rng,
|
||||
)
|
||||
Xy = xgb.DMatrix(X, y)
|
||||
booster = xgb.train(
|
||||
@@ -197,7 +245,7 @@ class TestEvalMetrics:
|
||||
n_informative=n_features,
|
||||
n_redundant=0,
|
||||
n_classes=n_classes,
|
||||
random_state=rng
|
||||
random_state=rng,
|
||||
)
|
||||
if weighted:
|
||||
weights = rng.randn(n_samples)
|
||||
@@ -242,20 +290,25 @@ class TestEvalMetrics:
|
||||
def run_pr_auc_binary(self, tree_method):
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.metrics import auc, precision_recall_curve
|
||||
|
||||
X, y = make_classification(128, 4, n_classes=2, random_state=1994)
|
||||
clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=1)
|
||||
clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])
|
||||
clf = xgb.XGBClassifier(
|
||||
tree_method=tree_method, n_estimators=1, eval_metric="aucpr"
|
||||
)
|
||||
clf.fit(X, y, eval_set=[(X, y)])
|
||||
evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
|
||||
|
||||
y_score = clf.predict_proba(X)[:, 1] # get the positive column
|
||||
precision, recall, _ = precision_recall_curve(y, y_score)
|
||||
prauc = auc(recall, precision)
|
||||
# Interpolation results are slightly different from sklearn, but overall should be
|
||||
# similar.
|
||||
# Interpolation results are slightly different from sklearn, but overall should
|
||||
# be similar.
|
||||
np.testing.assert_allclose(prauc, evals_result, rtol=1e-2)
|
||||
|
||||
clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=10)
|
||||
clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])
|
||||
clf = xgb.XGBClassifier(
|
||||
tree_method=tree_method, n_estimators=10, eval_metric="aucpr"
|
||||
)
|
||||
clf.fit(X, y, eval_set=[(X, y)])
|
||||
evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
|
||||
np.testing.assert_allclose(0.99, evals_result, rtol=1e-2)
|
||||
|
||||
@@ -264,16 +317,21 @@ class TestEvalMetrics:
|
||||
|
||||
def run_pr_auc_multi(self, tree_method):
|
||||
from sklearn.datasets import make_classification
|
||||
|
||||
X, y = make_classification(
|
||||
64, 16, n_informative=8, n_classes=3, random_state=1994
|
||||
)
|
||||
clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=1)
|
||||
clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])
|
||||
clf = xgb.XGBClassifier(
|
||||
tree_method=tree_method, n_estimators=1, eval_metric="aucpr"
|
||||
)
|
||||
clf.fit(X, y, eval_set=[(X, y)])
|
||||
evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
|
||||
# No available implementation for comparison, just check that XGBoost converges to
|
||||
# 1.0
|
||||
clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=10)
|
||||
clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])
|
||||
# No available implementation for comparison, just check that XGBoost converges
|
||||
# to 1.0
|
||||
clf = xgb.XGBClassifier(
|
||||
tree_method=tree_method, n_estimators=10, eval_metric="aucpr"
|
||||
)
|
||||
clf.fit(X, y, eval_set=[(X, y)])
|
||||
evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
|
||||
np.testing.assert_allclose(1.0, evals_result, rtol=1e-2)
|
||||
|
||||
@@ -282,9 +340,13 @@ class TestEvalMetrics:
|
||||
|
||||
def run_pr_auc_ltr(self, tree_method):
|
||||
from sklearn.datasets import make_classification
|
||||
|
||||
X, y = make_classification(128, 4, n_classes=2, random_state=1994)
|
||||
ltr = xgb.XGBRanker(
|
||||
tree_method=tree_method, n_estimators=16, objective="rank:pairwise"
|
||||
tree_method=tree_method,
|
||||
n_estimators=16,
|
||||
objective="rank:pairwise",
|
||||
eval_metric="aucpr",
|
||||
)
|
||||
groups = np.array([32, 32, 64])
|
||||
ltr.fit(
|
||||
@@ -293,7 +355,6 @@ class TestEvalMetrics:
|
||||
group=groups,
|
||||
eval_set=[(X, y)],
|
||||
eval_group=[groups],
|
||||
eval_metric="aucpr",
|
||||
)
|
||||
results = ltr.evals_result()["validation_0"]["aucpr"]
|
||||
assert results[-1] >= 0.99
|
||||
|
||||
@@ -6,6 +6,7 @@ import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
from xgboost.testing.continuation import run_training_continuation_model_output
|
||||
|
||||
rng = np.random.RandomState(1337)
|
||||
|
||||
@@ -15,54 +16,51 @@ class TestTrainingContinuation:
|
||||
|
||||
def generate_parameters(self):
|
||||
xgb_params_01_binary = {
|
||||
'nthread': 1,
|
||||
"nthread": 1,
|
||||
}
|
||||
|
||||
xgb_params_02_binary = {
|
||||
'nthread': 1,
|
||||
'num_parallel_tree': self.num_parallel_tree
|
||||
"nthread": 1,
|
||||
"num_parallel_tree": self.num_parallel_tree,
|
||||
}
|
||||
|
||||
xgb_params_03_binary = {
|
||||
'nthread': 1,
|
||||
'num_class': 5,
|
||||
'num_parallel_tree': self.num_parallel_tree
|
||||
"nthread": 1,
|
||||
"num_class": 5,
|
||||
"num_parallel_tree": self.num_parallel_tree,
|
||||
}
|
||||
|
||||
return [
|
||||
xgb_params_01_binary, xgb_params_02_binary, xgb_params_03_binary
|
||||
]
|
||||
return [xgb_params_01_binary, xgb_params_02_binary, xgb_params_03_binary]
|
||||
|
||||
def run_training_continuation(self, xgb_params_01, xgb_params_02,
|
||||
xgb_params_03):
|
||||
def run_training_continuation(self, xgb_params_01, xgb_params_02, xgb_params_03):
|
||||
from sklearn.datasets import load_digits
|
||||
from sklearn.metrics import mean_squared_error
|
||||
|
||||
digits_2class = load_digits(n_class=2)
|
||||
digits_5class = load_digits(n_class=5)
|
||||
|
||||
X_2class = digits_2class['data']
|
||||
y_2class = digits_2class['target']
|
||||
X_2class = digits_2class["data"]
|
||||
y_2class = digits_2class["target"]
|
||||
|
||||
X_5class = digits_5class['data']
|
||||
y_5class = digits_5class['target']
|
||||
X_5class = digits_5class["data"]
|
||||
y_5class = digits_5class["target"]
|
||||
|
||||
dtrain_2class = xgb.DMatrix(X_2class, label=y_2class)
|
||||
dtrain_5class = xgb.DMatrix(X_5class, label=y_5class)
|
||||
|
||||
gbdt_01 = xgb.train(xgb_params_01, dtrain_2class,
|
||||
num_boost_round=10)
|
||||
gbdt_01 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=10)
|
||||
ntrees_01 = len(gbdt_01.get_dump())
|
||||
assert ntrees_01 == 10
|
||||
|
||||
gbdt_02 = xgb.train(xgb_params_01, dtrain_2class,
|
||||
num_boost_round=0)
|
||||
gbdt_02.save_model('xgb_tc.json')
|
||||
gbdt_02 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=0)
|
||||
gbdt_02.save_model("xgb_tc.json")
|
||||
|
||||
gbdt_02a = xgb.train(xgb_params_01, dtrain_2class,
|
||||
num_boost_round=10, xgb_model=gbdt_02)
|
||||
gbdt_02b = xgb.train(xgb_params_01, dtrain_2class,
|
||||
num_boost_round=10, xgb_model="xgb_tc.json")
|
||||
gbdt_02a = xgb.train(
|
||||
xgb_params_01, dtrain_2class, num_boost_round=10, xgb_model=gbdt_02
|
||||
)
|
||||
gbdt_02b = xgb.train(
|
||||
xgb_params_01, dtrain_2class, num_boost_round=10, xgb_model="xgb_tc.json"
|
||||
)
|
||||
ntrees_02a = len(gbdt_02a.get_dump())
|
||||
ntrees_02b = len(gbdt_02b.get_dump())
|
||||
assert ntrees_02a == 10
|
||||
@@ -76,20 +74,21 @@ class TestTrainingContinuation:
|
||||
res2 = mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class))
|
||||
assert res1 == res2
|
||||
|
||||
gbdt_03 = xgb.train(xgb_params_01, dtrain_2class,
|
||||
num_boost_round=3)
|
||||
gbdt_03.save_model('xgb_tc.json')
|
||||
gbdt_03 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=3)
|
||||
gbdt_03.save_model("xgb_tc.json")
|
||||
|
||||
gbdt_03a = xgb.train(xgb_params_01, dtrain_2class,
|
||||
num_boost_round=7, xgb_model=gbdt_03)
|
||||
gbdt_03b = xgb.train(xgb_params_01, dtrain_2class,
|
||||
num_boost_round=7, xgb_model="xgb_tc.json")
|
||||
gbdt_03a = xgb.train(
|
||||
xgb_params_01, dtrain_2class, num_boost_round=7, xgb_model=gbdt_03
|
||||
)
|
||||
gbdt_03b = xgb.train(
|
||||
xgb_params_01, dtrain_2class, num_boost_round=7, xgb_model="xgb_tc.json"
|
||||
)
|
||||
ntrees_03a = len(gbdt_03a.get_dump())
|
||||
ntrees_03b = len(gbdt_03b.get_dump())
|
||||
assert ntrees_03a == 10
|
||||
assert ntrees_03b == 10
|
||||
|
||||
os.remove('xgb_tc.json')
|
||||
os.remove("xgb_tc.json")
|
||||
|
||||
res1 = mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class))
|
||||
res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
|
||||
@@ -113,16 +112,14 @@ class TestTrainingContinuation:
|
||||
y_2class,
|
||||
gbdt_04.predict(
|
||||
dtrain_2class, iteration_range=(0, gbdt_04.num_boosted_rounds())
|
||||
)
|
||||
),
|
||||
)
|
||||
assert res1 == res2
|
||||
|
||||
gbdt_05 = xgb.train(xgb_params_03, dtrain_5class,
|
||||
num_boost_round=7)
|
||||
gbdt_05 = xgb.train(xgb_params_03,
|
||||
dtrain_5class,
|
||||
num_boost_round=3,
|
||||
xgb_model=gbdt_05)
|
||||
gbdt_05 = xgb.train(xgb_params_03, dtrain_5class, num_boost_round=7)
|
||||
gbdt_05 = xgb.train(
|
||||
xgb_params_03, dtrain_5class, num_boost_round=3, xgb_model=gbdt_05
|
||||
)
|
||||
|
||||
res1 = gbdt_05.predict(dtrain_5class)
|
||||
res2 = gbdt_05.predict(
|
||||
@@ -149,8 +146,8 @@ class TestTrainingContinuation:
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
clf = xgb.XGBClassifier(n_estimators=2)
|
||||
clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
|
||||
clf = xgb.XGBClassifier(n_estimators=2, eval_metric="logloss")
|
||||
clf.fit(X, y, eval_set=[(X, y)])
|
||||
assert tm.non_increasing(clf.evals_result()["validation_0"]["logloss"])
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -160,5 +157,10 @@ class TestTrainingContinuation:
|
||||
|
||||
clf = xgb.XGBClassifier(n_estimators=2)
|
||||
# change metric to error
|
||||
clf.fit(X, y, eval_set=[(X, y)], eval_metric="error")
|
||||
clf.set_params(eval_metric="error")
|
||||
clf.fit(X, y, eval_set=[(X, y)], xgb_model=loaded)
|
||||
assert tm.non_increasing(clf.evals_result()["validation_0"]["error"])
|
||||
|
||||
@pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
|
||||
def test_model_output(self, tree_method: str) -> None:
|
||||
run_training_continuation_model_output("cpu", tree_method)
|
||||
|
||||
@@ -30,8 +30,8 @@ def test_binary_classification():
|
||||
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
|
||||
for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier):
|
||||
for train_index, test_index in kf.split(X, y):
|
||||
clf = cls(random_state=42)
|
||||
xgb_model = clf.fit(X[train_index], y[train_index], eval_metric=['auc', 'logloss'])
|
||||
clf = cls(random_state=42, eval_metric=['auc', 'logloss'])
|
||||
xgb_model = clf.fit(X[train_index], y[train_index])
|
||||
preds = xgb_model.predict(X[test_index])
|
||||
labels = y[test_index]
|
||||
err = sum(1 for i in range(len(preds))
|
||||
@@ -101,10 +101,11 @@ def test_best_iteration():
|
||||
def train(booster: str, forest: Optional[int]) -> None:
|
||||
rounds = 4
|
||||
cls = xgb.XGBClassifier(
|
||||
n_estimators=rounds, num_parallel_tree=forest, booster=booster
|
||||
).fit(
|
||||
X, y, eval_set=[(X, y)], early_stopping_rounds=3
|
||||
)
|
||||
n_estimators=rounds,
|
||||
num_parallel_tree=forest,
|
||||
booster=booster,
|
||||
early_stopping_rounds=3,
|
||||
).fit(X, y, eval_set=[(X, y)])
|
||||
assert cls.best_iteration == rounds - 1
|
||||
|
||||
# best_iteration is used by default, assert that under gblinear it's
|
||||
@@ -112,9 +113,9 @@ def test_best_iteration():
|
||||
cls.predict(X)
|
||||
|
||||
num_parallel_tree = 4
|
||||
train('gbtree', num_parallel_tree)
|
||||
train('dart', num_parallel_tree)
|
||||
train('gblinear', None)
|
||||
train("gbtree", num_parallel_tree)
|
||||
train("dart", num_parallel_tree)
|
||||
train("gblinear", None)
|
||||
|
||||
|
||||
def test_ranking():
|
||||
@@ -258,6 +259,7 @@ def test_stacking_classification():
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
|
||||
clf.fit(X_train, y_train).score(X_test, y_test)
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_pandas())
|
||||
def test_feature_importances_weight():
|
||||
from sklearn.datasets import load_digits
|
||||
@@ -474,7 +476,8 @@ def run_housing_rf_regression(tree_method):
|
||||
|
||||
rfreg = xgb.XGBRFRegressor()
|
||||
with pytest.raises(NotImplementedError):
|
||||
rfreg.fit(X, y, early_stopping_rounds=10)
|
||||
rfreg.set_params(early_stopping_rounds=10)
|
||||
rfreg.fit(X, y)
|
||||
|
||||
|
||||
def test_rf_regression():
|
||||
@@ -574,7 +577,7 @@ def test_classification_with_custom_objective():
|
||||
return logregobj(y, p)
|
||||
|
||||
cls.set_params(objective=wrapped)
|
||||
cls.predict(X) # no throw
|
||||
cls.predict(X) # no throw
|
||||
cls.fit(X, y)
|
||||
|
||||
assert is_called[0]
|
||||
@@ -844,51 +847,65 @@ def run_validation_weights(model):
|
||||
y_train, y_test = y[:1600], y[1600:]
|
||||
|
||||
# instantiate model
|
||||
param_dist = {'objective': 'binary:logistic', 'n_estimators': 2,
|
||||
'random_state': 123}
|
||||
param_dist = {
|
||||
"objective": "binary:logistic",
|
||||
"n_estimators": 2,
|
||||
"random_state": 123,
|
||||
}
|
||||
clf = model(**param_dist)
|
||||
|
||||
# train it using instance weights only in the training set
|
||||
weights_train = np.random.choice([1, 2], len(X_train))
|
||||
clf.fit(X_train, y_train,
|
||||
sample_weight=weights_train,
|
||||
eval_set=[(X_test, y_test)],
|
||||
eval_metric='logloss',
|
||||
verbose=False)
|
||||
|
||||
clf.set_params(eval_metric="logloss")
|
||||
clf.fit(
|
||||
X_train,
|
||||
y_train,
|
||||
sample_weight=weights_train,
|
||||
eval_set=[(X_test, y_test)],
|
||||
verbose=False,
|
||||
)
|
||||
# evaluate logloss metric on test set *without* using weights
|
||||
evals_result_without_weights = clf.evals_result()
|
||||
logloss_without_weights = evals_result_without_weights[
|
||||
"validation_0"]["logloss"]
|
||||
logloss_without_weights = evals_result_without_weights["validation_0"]["logloss"]
|
||||
|
||||
# now use weights for the test set
|
||||
np.random.seed(0)
|
||||
weights_test = np.random.choice([1, 2], len(X_test))
|
||||
clf.fit(X_train, y_train,
|
||||
sample_weight=weights_train,
|
||||
eval_set=[(X_test, y_test)],
|
||||
sample_weight_eval_set=[weights_test],
|
||||
eval_metric='logloss',
|
||||
verbose=False)
|
||||
clf.set_params(eval_metric="logloss")
|
||||
clf.fit(
|
||||
X_train,
|
||||
y_train,
|
||||
sample_weight=weights_train,
|
||||
eval_set=[(X_test, y_test)],
|
||||
sample_weight_eval_set=[weights_test],
|
||||
verbose=False,
|
||||
)
|
||||
evals_result_with_weights = clf.evals_result()
|
||||
logloss_with_weights = evals_result_with_weights["validation_0"]["logloss"]
|
||||
|
||||
# check that the logloss in the test set is actually different when using
|
||||
# weights than when not using them
|
||||
assert all((logloss_with_weights[i] != logloss_without_weights[i]
|
||||
for i in [0, 1]))
|
||||
assert all((logloss_with_weights[i] != logloss_without_weights[i] for i in [0, 1]))
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
# length of eval set and sample weight doesn't match.
|
||||
clf.fit(X_train, y_train, sample_weight=weights_train,
|
||||
eval_set=[(X_train, y_train), (X_test, y_test)],
|
||||
sample_weight_eval_set=[weights_train])
|
||||
clf.fit(
|
||||
X_train,
|
||||
y_train,
|
||||
sample_weight=weights_train,
|
||||
eval_set=[(X_train, y_train), (X_test, y_test)],
|
||||
sample_weight_eval_set=[weights_train],
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
cls = xgb.XGBClassifier()
|
||||
cls.fit(X_train, y_train, sample_weight=weights_train,
|
||||
eval_set=[(X_train, y_train), (X_test, y_test)],
|
||||
sample_weight_eval_set=[weights_train])
|
||||
cls.fit(
|
||||
X_train,
|
||||
y_train,
|
||||
sample_weight=weights_train,
|
||||
eval_set=[(X_train, y_train), (X_test, y_test)],
|
||||
sample_weight_eval_set=[weights_train],
|
||||
)
|
||||
|
||||
|
||||
def test_validation_weights():
|
||||
@@ -960,8 +977,7 @@ def test_XGBClassifier_resume():
|
||||
|
||||
# file name of stored xgb model
|
||||
model1.save_model(model1_path)
|
||||
model2 = xgb.XGBClassifier(
|
||||
learning_rate=0.3, random_state=0, n_estimators=8)
|
||||
model2 = xgb.XGBClassifier(learning_rate=0.3, random_state=0, n_estimators=8)
|
||||
model2.fit(X, Y, xgb_model=model1_path)
|
||||
|
||||
pred2 = model2.predict(X)
|
||||
@@ -972,8 +988,7 @@ def test_XGBClassifier_resume():
|
||||
|
||||
# file name of 'Booster' instance Xgb model
|
||||
model1.get_booster().save_model(model1_booster_path)
|
||||
model2 = xgb.XGBClassifier(
|
||||
learning_rate=0.3, random_state=0, n_estimators=8)
|
||||
model2 = xgb.XGBClassifier(learning_rate=0.3, random_state=0, n_estimators=8)
|
||||
model2.fit(X, Y, xgb_model=model1_booster_path)
|
||||
|
||||
pred2 = model2.predict(X)
|
||||
@@ -1279,12 +1294,16 @@ def test_estimator_reg(estimator, check):
|
||||
):
|
||||
estimator.fit(X, y)
|
||||
return
|
||||
if os.environ["PYTEST_CURRENT_TEST"].find("check_estimators_overwrite_params") != -1:
|
||||
if (
|
||||
os.environ["PYTEST_CURRENT_TEST"].find("check_estimators_overwrite_params")
|
||||
!= -1
|
||||
):
|
||||
# A hack to pass the scikit-learn parameter mutation tests. XGBoost regressor
|
||||
# returns actual internal default values for parameters in `get_params`, but those
|
||||
# are set as `None` in sklearn interface to avoid duplication. So we fit a dummy
|
||||
# model and obtain the default parameters here for the mutation tests.
|
||||
# returns actual internal default values for parameters in `get_params`, but
|
||||
# those are set as `None` in sklearn interface to avoid duplication. So we fit
|
||||
# a dummy model and obtain the default parameters here for the mutation tests.
|
||||
from sklearn.datasets import make_regression
|
||||
|
||||
X, y = make_regression(n_samples=2, n_features=1)
|
||||
estimator.set_params(**xgb.XGBRegressor().fit(X, y).get_params())
|
||||
|
||||
@@ -1325,6 +1344,7 @@ def test_categorical():
|
||||
def test_evaluation_metric():
|
||||
from sklearn.datasets import load_diabetes, load_digits
|
||||
from sklearn.metrics import mean_absolute_error
|
||||
|
||||
X, y = load_diabetes(return_X_y=True)
|
||||
n_estimators = 16
|
||||
|
||||
@@ -1341,17 +1361,6 @@ def test_evaluation_metric():
|
||||
for line in lines:
|
||||
assert line.find("mean_absolute_error") != -1
|
||||
|
||||
def metric(predt: np.ndarray, Xy: xgb.DMatrix):
|
||||
y = Xy.get_label()
|
||||
return "m", np.abs(predt - y).sum()
|
||||
|
||||
with pytest.warns(UserWarning):
|
||||
reg = xgb.XGBRegressor(
|
||||
tree_method="hist",
|
||||
n_estimators=1,
|
||||
)
|
||||
reg.fit(X, y, eval_set=[(X, y)], eval_metric=metric)
|
||||
|
||||
def merror(y_true: np.ndarray, predt: np.ndarray):
|
||||
n_samples = y_true.shape[0]
|
||||
assert n_samples == predt.size
|
||||
|
||||
@@ -363,12 +363,12 @@ class TestDistributedGPU:
|
||||
device="cuda",
|
||||
eval_metric="error",
|
||||
n_estimators=100,
|
||||
early_stopping_rounds=early_stopping_rounds,
|
||||
)
|
||||
cls.client = local_cuda_client
|
||||
cls.fit(
|
||||
X,
|
||||
y,
|
||||
early_stopping_rounds=early_stopping_rounds,
|
||||
eval_set=[(valid_X, valid_y)],
|
||||
)
|
||||
booster = cls.get_booster()
|
||||
|
||||
@@ -937,8 +937,10 @@ def run_empty_dmatrix_auc(client: "Client", device: str, n_workers: int) -> None
|
||||
valid_X = dd.from_array(valid_X_, chunksize=n_samples)
|
||||
valid_y = dd.from_array(valid_y_, chunksize=n_samples)
|
||||
|
||||
cls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2)
|
||||
cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)])
|
||||
cls = xgb.dask.DaskXGBClassifier(
|
||||
device=device, n_estimators=2, eval_metric=["auc", "aucpr"]
|
||||
)
|
||||
cls.fit(X, y, eval_set=[(valid_X, valid_y)])
|
||||
|
||||
# multiclass
|
||||
X_, y_ = make_classification(
|
||||
@@ -966,8 +968,10 @@ def run_empty_dmatrix_auc(client: "Client", device: str, n_workers: int) -> None
|
||||
valid_X = dd.from_array(valid_X_, chunksize=n_samples)
|
||||
valid_y = dd.from_array(valid_y_, chunksize=n_samples)
|
||||
|
||||
cls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2)
|
||||
cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)])
|
||||
cls = xgb.dask.DaskXGBClassifier(
|
||||
device=device, n_estimators=2, eval_metric=["auc", "aucpr"]
|
||||
)
|
||||
cls.fit(X, y, eval_set=[(valid_X, valid_y)])
|
||||
|
||||
|
||||
def test_empty_dmatrix_auc() -> None:
|
||||
@@ -994,11 +998,11 @@ def run_auc(client: "Client", device: str) -> None:
|
||||
valid_X = dd.from_array(valid_X_, chunksize=10)
|
||||
valid_y = dd.from_array(valid_y_, chunksize=10)
|
||||
|
||||
cls = xgb.XGBClassifier(device=device, n_estimators=2)
|
||||
cls.fit(X_, y_, eval_metric="auc", eval_set=[(valid_X_, valid_y_)])
|
||||
cls = xgb.XGBClassifier(device=device, n_estimators=2, eval_metric="auc")
|
||||
cls.fit(X_, y_, eval_set=[(valid_X_, valid_y_)])
|
||||
|
||||
dcls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2)
|
||||
dcls.fit(X, y, eval_metric="auc", eval_set=[(valid_X, valid_y)])
|
||||
dcls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2, eval_metric="auc")
|
||||
dcls.fit(X, y, eval_set=[(valid_X, valid_y)])
|
||||
|
||||
approx = dcls.evals_result()["validation_0"]["auc"]
|
||||
exact = cls.evals_result()["validation_0"]["auc"]
|
||||
@@ -1267,16 +1271,16 @@ def test_dask_ranking(client: "Client") -> None:
|
||||
qid_valid = qid_valid.astype(np.uint32)
|
||||
qid_test = qid_test.astype(np.uint32)
|
||||
|
||||
rank = xgb.dask.DaskXGBRanker(n_estimators=2500)
|
||||
rank = xgb.dask.DaskXGBRanker(
|
||||
n_estimators=2500, eval_metric=["ndcg"], early_stopping_rounds=10
|
||||
)
|
||||
rank.fit(
|
||||
x_train,
|
||||
y_train,
|
||||
qid=qid_train,
|
||||
eval_set=[(x_test, y_test), (x_train, y_train)],
|
||||
eval_qid=[qid_test, qid_train],
|
||||
eval_metric=["ndcg"],
|
||||
verbose=True,
|
||||
early_stopping_rounds=10,
|
||||
)
|
||||
assert rank.n_features_in_ == 46
|
||||
assert rank.best_score > 0.98
|
||||
@@ -2150,13 +2154,15 @@ class TestDaskCallbacks:
|
||||
valid_X, valid_y = load_breast_cancer(return_X_y=True)
|
||||
valid_X, valid_y = da.from_array(valid_X), da.from_array(valid_y)
|
||||
cls = xgb.dask.DaskXGBClassifier(
|
||||
objective="binary:logistic", tree_method="hist", n_estimators=1000
|
||||
objective="binary:logistic",
|
||||
tree_method="hist",
|
||||
n_estimators=1000,
|
||||
early_stopping_rounds=early_stopping_rounds,
|
||||
)
|
||||
cls.client = client
|
||||
cls.fit(
|
||||
X,
|
||||
y,
|
||||
early_stopping_rounds=early_stopping_rounds,
|
||||
eval_set=[(valid_X, valid_y)],
|
||||
)
|
||||
booster = cls.get_booster()
|
||||
@@ -2165,15 +2171,17 @@ class TestDaskCallbacks:
|
||||
|
||||
# Specify the metric
|
||||
cls = xgb.dask.DaskXGBClassifier(
|
||||
objective="binary:logistic", tree_method="hist", n_estimators=1000
|
||||
objective="binary:logistic",
|
||||
tree_method="hist",
|
||||
n_estimators=1000,
|
||||
early_stopping_rounds=early_stopping_rounds,
|
||||
eval_metric="error",
|
||||
)
|
||||
cls.client = client
|
||||
cls.fit(
|
||||
X,
|
||||
y,
|
||||
early_stopping_rounds=early_stopping_rounds,
|
||||
eval_set=[(valid_X, valid_y)],
|
||||
eval_metric="error",
|
||||
)
|
||||
assert tm.non_increasing(cls.evals_result()["validation_0"]["error"])
|
||||
booster = cls.get_booster()
|
||||
@@ -2215,12 +2223,12 @@ class TestDaskCallbacks:
|
||||
tree_method="hist",
|
||||
n_estimators=1000,
|
||||
eval_metric=tm.eval_error_metric_skl,
|
||||
early_stopping_rounds=early_stopping_rounds,
|
||||
)
|
||||
cls.client = client
|
||||
cls.fit(
|
||||
X,
|
||||
y,
|
||||
early_stopping_rounds=early_stopping_rounds,
|
||||
eval_set=[(valid_X, valid_y)],
|
||||
)
|
||||
booster = cls.get_booster()
|
||||
@@ -2234,21 +2242,22 @@ class TestDaskCallbacks:
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
X, y = da.from_array(X), da.from_array(y)
|
||||
|
||||
cls = xgb.dask.DaskXGBClassifier(
|
||||
objective="binary:logistic", tree_method="hist", n_estimators=10
|
||||
)
|
||||
cls.client = client
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
cls.fit(
|
||||
X,
|
||||
y,
|
||||
cls = xgb.dask.DaskXGBClassifier(
|
||||
objective="binary:logistic",
|
||||
tree_method="hist",
|
||||
n_estimators=10,
|
||||
callbacks=[
|
||||
xgb.callback.TrainingCheckPoint(
|
||||
directory=Path(tmpdir), interval=1, name="model"
|
||||
)
|
||||
],
|
||||
)
|
||||
cls.client = client
|
||||
cls.fit(
|
||||
X,
|
||||
y,
|
||||
)
|
||||
for i in range(1, 10):
|
||||
assert os.path.exists(
|
||||
os.path.join(
|
||||
|
||||
@@ -311,24 +311,20 @@ def clf_with_weight(
|
||||
y_val = np.array([0, 1])
|
||||
w_train = np.array([1.0, 2.0])
|
||||
w_val = np.array([1.0, 2.0])
|
||||
cls2 = XGBClassifier()
|
||||
cls2 = XGBClassifier(eval_metric="logloss", early_stopping_rounds=1)
|
||||
cls2.fit(
|
||||
X_train,
|
||||
y_train,
|
||||
eval_set=[(X_val, y_val)],
|
||||
early_stopping_rounds=1,
|
||||
eval_metric="logloss",
|
||||
)
|
||||
|
||||
cls3 = XGBClassifier()
|
||||
cls3 = XGBClassifier(eval_metric="logloss", early_stopping_rounds=1)
|
||||
cls3.fit(
|
||||
X_train,
|
||||
y_train,
|
||||
sample_weight=w_train,
|
||||
eval_set=[(X_val, y_val)],
|
||||
sample_weight_eval_set=[w_val],
|
||||
early_stopping_rounds=1,
|
||||
eval_metric="logloss",
|
||||
)
|
||||
|
||||
cls_df_train_with_eval_weight = spark.createDataFrame(
|
||||
|
||||
Reference in New Issue
Block a user