Move Python testing utilities into xgboost module. (#8379)

- Add typehints.
- Fixes for pylint.

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Jiaming Yuan
2022-10-26 16:56:11 +08:00
committed by GitHub
parent 7e53189e7c
commit cf70864fa3
66 changed files with 652 additions and 595 deletions

View File

@@ -1,7 +1,9 @@
import xgboost
import numpy as np
import os
import numpy as np
import xgboost
kRounds = 2
kRows = 1000
kCols = 4

View File

@@ -1,12 +1,13 @@
# -*- coding: utf-8 -*-
import numpy as np
import os
import xgboost as xgb
import pytest
import json
from pathlib import Path
import os
import tempfile
import testing as tm
from pathlib import Path
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
dpath = 'demo/data/'
rng = np.random.RandomState(1994)

View File

@@ -1,13 +1,15 @@
import numpy as np
import xgboost as xgb
import os
import json
import testing as tm
import pytest
import locale
import os
import tempfile
dpath = os.path.join(tm.PROJECT_ROOT, 'demo/data/')
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
dpath = tm.data_dir(__file__)
rng = np.random.RandomState(1994)
@@ -36,8 +38,8 @@ class TestModels:
param = {'verbosity': 0, 'objective': 'binary:logistic',
'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1,
'nthread': 1}
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)
@@ -49,8 +51,8 @@ class TestModels:
assert err < 0.2
def test_dart(self):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
param = {'max_depth': 5, 'objective': 'binary:logistic',
'eval_metric': 'logloss', 'booster': 'dart', 'verbosity': 1}
# specify validations set to watch performance
@@ -116,7 +118,7 @@ class TestModels:
def test_boost_from_prediction(self):
# Re-construct dtrain here to avoid modification
margined = xgb.DMatrix(dpath + 'agaricus.txt.train')
margined = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
bst = xgb.train({'tree_method': 'hist'}, margined, 1)
predt_0 = bst.predict(margined, output_margin=True)
margined.set_base_margin(predt_0)
@@ -124,13 +126,13 @@ class TestModels:
predt_1 = bst.predict(margined)
assert np.any(np.abs(predt_1 - predt_0) > 1e-6)
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
bst = xgb.train({'tree_method': 'hist'}, dtrain, 2)
predt_2 = bst.predict(dtrain)
assert np.all(np.abs(predt_2 - predt_1) < 1e-6)
def test_boost_from_existing_model(self):
X = xgb.DMatrix(dpath + 'agaricus.txt.train')
X = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4)
assert booster.num_boosted_rounds() == 4
booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4,
@@ -150,8 +152,8 @@ class TestModels:
'objective': 'reg:logistic',
"tree_method": tree_method
}
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 10
@@ -197,8 +199,8 @@ class TestModels:
self.run_custom_objective()
def test_multi_eval_metric(self):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
param = {'max_depth': 2, 'eta': 0.2, 'verbosity': 1,
'objective': 'binary:logistic'}
@@ -220,7 +222,7 @@ class TestModels:
param['scale_pos_weight'] = ratio
return (dtrain, dtest, param)
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'auc'}, seed=0, fpreproc=fpreproc)
@@ -228,7 +230,7 @@ class TestModels:
param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
'objective': 'binary:logistic'}
num_round = 2
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed=0, show_stdv=False)
@@ -346,7 +348,7 @@ class TestModels:
os.remove(model_path)
try:
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
xgb.train({'objective': 'foo'}, dtrain, num_boost_round=1)
except ValueError as e:
e_str = str(e)

View File

@@ -1,9 +1,12 @@
from typing import Union
import xgboost as xgb
import pytest
import os
import testing as tm
import tempfile
from contextlib import nullcontext
from typing import Union
import pytest
import xgboost as xgb
from xgboost import testing as tm
# We use the dataset for tests.
pytestmark = pytest.mark.skipif(**tm.no_sklearn())
@@ -271,13 +274,14 @@ class TestCallbacks:
"""Test learning rate scheduler, used by both CPU and GPU tests."""
scheduler = xgb.callback.LearningRateScheduler
dpath = os.path.join(tm.PROJECT_ROOT, 'demo/data/')
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
dpath = tm.data_dir(__file__)
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 4
warning_check = tm.noop_context()
warning_check = nullcontext()
# learning_rates as a list
# init eta with 0 to check whether learning_rates work

View File

@@ -1,11 +1,13 @@
import os
import tempfile
import platform
import xgboost
import subprocess
import numpy
import json
import testing as tm
import os
import platform
import subprocess
import tempfile
import numpy
import xgboost
from xgboost import testing as tm
class TestCLI:
@@ -29,7 +31,7 @@ data = {data_path}
eval[test] = {data_path}
'''
PROJECT_ROOT = tm.PROJECT_ROOT
PROJECT_ROOT = tm.project_root(__file__)
def get_exe(self):
if platform.system() == 'Windows':

View File

@@ -1,14 +1,16 @@
from typing import Dict, List
import numpy as np
import pytest
from hypothesis import given, settings, strategies
from scipy.sparse import csr_matrix
from testing import IteratorForTest, make_batches, non_increasing
from xgboost.data import SingleBatchInternalIter as SingleBatch
from xgboost.testing import IteratorForTest, make_batches, non_increasing
import xgboost as xgb
from xgboost import testing
from xgboost import testing as tm
pytestmark = testing.timeout(30)
pytestmark = tm.timeout(30)
def test_single_batch(tree_method: str = "approx") -> None:
@@ -83,7 +85,7 @@ def run_data_iterator(
if tree_method == "gpu_hist":
parameters["sampling_method"] = "gradient_based"
results_from_it: xgb.callback.EvaluationMonitor.EvalsLog = {}
results_from_it: Dict[str, Dict[str, List[float]]] = {}
from_it = xgb.train(
parameters,
Xy,
@@ -106,7 +108,7 @@ def run_data_iterator(
assert Xy.num_row() == n_samples_per_batch * n_batches
assert Xy.num_col() == n_features
results_from_arrays: xgb.callback.EvaluationMonitor.EvalsLog = {}
results_from_arrays: Dict[str, Dict[str, List[float]]] = {}
from_arrays = xgb.train(
parameters,
Xy,

View File

@@ -3,14 +3,12 @@ import subprocess
import sys
import pytest
import testing as tm
from xgboost import testing
from xgboost import testing as tm
pytestmark = testing.timeout(30)
pytestmark = tm.timeout(30)
ROOT_DIR = tm.PROJECT_ROOT
DEMO_DIR = os.path.join(ROOT_DIR, 'demo')
DEMO_DIR = tm.demo_dir(__file__)
PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, 'guide-python')
CLI_DEMO_DIR = os.path.join(DEMO_DIR, 'CLI')
@@ -156,7 +154,7 @@ def test_cli_regression_demo():
cmd = ['python', script, 'machine.txt', '1']
subprocess.check_call(cmd, cwd=reg_dir)
exe = os.path.join(tm.PROJECT_ROOT, 'xgboost')
exe = os.path.join(DEMO_DIR, os.path.pardir, 'xgboost')
conf = os.path.join(reg_dir, 'machine.conf')
subprocess.check_call([exe, conf], cwd=reg_dir)

View File

@@ -4,11 +4,11 @@ import tempfile
import numpy as np
import pytest
import scipy.sparse
import testing as tm
from hypothesis import given, settings, strategies
from scipy.sparse import csr_matrix, rand
import xgboost as xgb
from xgboost import testing as tm
rng = np.random.RandomState(1)

View File

@@ -1,9 +1,8 @@
# -*- coding: utf-8 -*-
import pytest
import numpy as np
import pytest
import testing as tm
import xgboost as xgb
from xgboost import testing as tm
try:
import datatable as dt

View File

@@ -1,8 +1,9 @@
import xgboost as xgb
import testing as tm
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
rng = np.random.RandomState(1994)

View File

@@ -1,8 +1,9 @@
import xgboost as xgb
import testing as tm
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
rng = np.random.RandomState(1337)
@@ -254,8 +255,8 @@ class TestEvalMetrics:
self.run_roc_auc_multi("hist", n_samples, weighted)
def run_pr_auc_binary(self, tree_method):
from sklearn.metrics import precision_recall_curve, auc
from sklearn.datasets import make_classification
from sklearn.metrics import auc, precision_recall_curve
X, y = make_classification(128, 4, n_classes=2, random_state=1994)
clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=1)
clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])

View File

@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-
import numpy as np
import xgboost
import testing as tm
import pytest
import xgboost
from xgboost import testing as tm
dpath = 'demo/data/'
rng = np.random.RandomState(1994)

View File

@@ -1,10 +1,9 @@
import testing as tm
from hypothesis import given, note, settings, strategies
import xgboost as xgb
from xgboost import testing
from xgboost import testing as tm
pytestmark = testing.timeout(10)
pytestmark = tm.timeout(10)
parameter_strategy = strategies.fixed_dictionaries({

View File

@@ -1,12 +1,14 @@
import xgboost
import os
import generate_models as gm
import testing as tm
import json
import zipfile
import pytest
import copy
import json
import os
import urllib.request
import zipfile
import generate_models as gm
import pytest
import xgboost
from xgboost import testing as tm
def run_model_param_check(config):

View File

@@ -1,8 +1,9 @@
import numpy as np
import xgboost as xgb
import testing as tm
import pytest
import xgboost as xgb
from xgboost import testing as tm
dpath = 'demo/data/'

View File

@@ -4,12 +4,11 @@ import tempfile
import numpy as np
import pytest
import testing as tm
import xgboost as xgb
from xgboost import testing
from xgboost import testing as tm
pytestmark = testing.timeout(10)
pytestmark = tm.timeout(10)
class TestOMP:
@@ -86,7 +85,7 @@ class TestOMP:
def test_with_omp_thread_limit(self):
args = [
"python", os.path.join(
tm.PROJECT_ROOT, "tests", "python", "with_omp_limit.py"
os.path.dirname(tm.normpath(__file__)), "with_omp_limit.py"
)
]
results = []

View File

@@ -1,8 +1,8 @@
import xgboost as xgb
import numpy as np
import pytest
import testing as tm
import xgboost as xgb
from xgboost import testing as tm
pytestmark = pytest.mark.skipif(**tm.no_pandas())

View File

@@ -1,9 +1,10 @@
import pickle
import numpy as np
import xgboost as xgb
import os
import json
import os
import pickle
import numpy as np
import xgboost as xgb
kRows = 100
kCols = 10

View File

@@ -1,15 +1,16 @@
import json
import numpy as np
import xgboost as xgb
import testing as tm
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
try:
import matplotlib
matplotlib.use('Agg')
from matplotlib.axes import Axes
from graphviz import Source
from matplotlib.axes import Axes
except ImportError:
pass

View File

@@ -1,12 +1,13 @@
'''Tests for running inplace prediction.'''
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from scipy import sparse
import pytest
import pandas as pd
import testing as tm
import numpy as np
import pandas as pd
import pytest
from scipy import sparse
import xgboost as xgb
from xgboost import testing as tm
def run_threaded_predict(X, rows, predict_func):

View File

@@ -4,7 +4,7 @@ import numpy as np
import pytest
from hypothesis import given, settings, strategies
from scipy import sparse
from testing import (
from xgboost.testing import (
IteratorForTest,
make_batches,
make_batches_sparse,

View File

@@ -1,13 +1,15 @@
import numpy as np
from scipy.sparse import csr_matrix
import testing as tm
import xgboost
import os
import itertools
import os
import shutil
import urllib.request
import zipfile
import numpy as np
from scipy.sparse import csr_matrix
import xgboost
from xgboost import testing as tm
def test_ranking_with_unweighted_data():
Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17])

View File

@@ -1,11 +1,12 @@
# -*- coding: utf-8 -*-
import numpy as np
import xgboost as xgb
import itertools
import re
import numpy as np
import scipy
import scipy.special
import xgboost as xgb
dpath = 'demo/data/'
rng = np.random.RandomState(1994)

View File

@@ -4,7 +4,8 @@ from typing import List
import numpy as np
import pandas as pd
import pytest
import testing as tm
from xgboost import testing as tm
if tm.no_spark()["condition"]:
pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)

View File

@@ -6,10 +6,9 @@ import uuid
import numpy as np
import pytest
import testing as tm
import xgboost as xgb
from xgboost import testing
from xgboost import testing as tm
if tm.no_spark()["condition"]:
pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
@@ -38,7 +37,7 @@ from .utils import SparkTestCase
logging.getLogger("py4j").setLevel(logging.INFO)
pytestmark = testing.timeout(60)
pytestmark = tm.timeout(60)
class XgboostLocalTest(SparkTestCase):

View File

@@ -6,7 +6,8 @@ import uuid
import numpy as np
import pytest
import testing as tm
from xgboost import testing as tm
if tm.no_spark()["condition"]:
pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)

View File

@@ -6,9 +6,10 @@ import tempfile
import unittest
import pytest
import testing as tm
from six import StringIO
from xgboost import testing as tm
if tm.no_spark()["condition"]:
pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
if sys.platform.startswith("win") or sys.platform.startswith("darwin"):

View File

@@ -1,11 +1,13 @@
import testing as tm
import pytest
import numpy as np
import xgboost as xgb
import json
import os
dpath = os.path.join(tm.PROJECT_ROOT, 'demo', 'data')
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
dpath = tm.data_dir(__file__)
def test_aft_survival_toy_data():

View File

@@ -3,10 +3,10 @@ import sys
import numpy as np
import pytest
import testing as tm
import xgboost as xgb
from xgboost import RabitTracker, testing
from xgboost import RabitTracker
from xgboost import testing as tm
if sys.platform.startswith("win"):
pytest.skip("Skipping dask tests on Windows", allow_module_level=True)
@@ -61,7 +61,7 @@ def test_rabit_ops():
run_rabit_ops(client, n_workers)
@pytest.mark.skipif(**testing.skip_ipv6())
@pytest.mark.skipif(**tm.no_ipv6())
@pytest.mark.skipif(**tm.no_dask())
def test_rabit_ops_ipv6():
import dask

View File

@@ -1,10 +1,11 @@
import xgboost as xgb
import testing as tm
import numpy as np
import pytest
import os
import tempfile
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
rng = np.random.RandomState(1337)

View File

@@ -1,8 +1,8 @@
import numpy as np
import xgboost as xgb
from numpy.testing import assert_approx_equal
import xgboost as xgb
train_data = xgb.DMatrix(np.array([[1]]), label=np.array([1]))

View File

@@ -1,11 +1,13 @@
import json
from string import ascii_lowercase
from typing import Dict, Any
import testing as tm
import pytest
import xgboost as xgb
from typing import Any, Dict
import numpy as np
from hypothesis import given, strategies, settings, note
import pytest
from hypothesis import given, note, settings, strategies
import xgboost as xgb
from xgboost import testing as tm
exact_parameter_strategy = strategies.fixed_dictionaries({
'nthread': strategies.integers(1, 4),

View File

@@ -1,14 +1,16 @@
import unittest
import pytest
import numpy as np
import testing as tm
import xgboost as xgb
import os
import unittest
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
try:
import pandas as pd
import pyarrow as pa
import pyarrow.csv as pc
import pandas as pd
except ImportError:
pass
@@ -73,7 +75,7 @@ class TestArrowTable(unittest.TestCase):
np.testing.assert_allclose(preds1, preds2)
def test_arrow_survival(self):
data = os.path.join(tm.PROJECT_ROOT, "demo", "data", "veterans_lung_cancer.csv")
data = os.path.join(tm.data_dir(__file__), "veterans_lung_cancer.csv")
table = pc.read_csv(data)
y_lower_bound = table["Survival_label_lower_bound"]
y_upper_bound = table["Survival_label_upper_bound"]

View File

@@ -20,7 +20,6 @@ import numpy as np
import pytest
import scipy
import sklearn
import testing as tm
from hypothesis import HealthCheck, given, note, settings
from sklearn.datasets import make_classification, make_regression
from test_predict import verify_leaf_output
@@ -29,7 +28,7 @@ from test_with_sklearn import run_data_initialization, run_feature_weights
from xgboost.data import _is_cudf_df
import xgboost as xgb
from xgboost import testing
from xgboost import testing as tm
if sys.platform.startswith("win"):
pytest.skip("Skipping dask tests on Windows", allow_module_level=True)
@@ -45,7 +44,7 @@ from xgboost.dask import DaskDMatrix
dask.config.set({"distributed.scheduler.allowed-failures": False})
pytestmark = testing.timeout(30)
pytestmark = tm.timeout(30)
if hasattr(HealthCheck, 'function_scoped_fixture'):
suppress = [HealthCheck.function_scoped_fixture]
@@ -1116,8 +1115,9 @@ def test_predict_with_meta(client: "Client") -> None:
def run_aft_survival(client: "Client", dmatrix_t: Type) -> None:
df = dd.read_csv(os.path.join(tm.PROJECT_ROOT, 'demo', 'data',
'veterans_lung_cancer.csv'))
df = dd.read_csv(
os.path.join(tm.data_dir(__file__), "veterans_lung_cancer.csv")
)
y_lower_bound = df['Survival_label_lower_bound']
y_upper_bound = df['Survival_label_upper_bound']
X = df.drop(['Survival_label_lower_bound',

View File

@@ -1,10 +1,10 @@
# -*- coding: utf-8 -*-
import numpy as np
import xgboost as xgb
import testing as tm
import pytest
from test_dmatrix import set_base_margin_info
import xgboost as xgb
from xgboost import testing as tm
try:
import modin.pandas as md
except ImportError:

View File

@@ -1,11 +1,13 @@
import os
import tempfile
import numpy as np
import xgboost as xgb
import testing as tm
import pytest
from test_dmatrix import set_base_margin_info
import xgboost as xgb
from xgboost import testing as tm
try:
import pandas as pd
except ImportError:

View File

@@ -1,7 +1,8 @@
import numpy as np
import xgboost as xgb
import pytest
import xgboost as xgb
try:
import shap
except ImportError:

View File

@@ -8,14 +8,13 @@ from typing import Callable, Optional
import numpy as np
import pytest
import testing as tm
from sklearn.utils.estimator_checks import parametrize_with_checks
import xgboost as xgb
from xgboost import testing
from xgboost import testing as tm
rng = np.random.RandomState(1994)
pytestmark = [pytest.mark.skipif(**tm.no_sklearn()), testing.timeout(30)]
pytestmark = [pytest.mark.skipif(**tm.no_sklearn()), tm.timeout(30)]
def test_binary_classification():
@@ -155,11 +154,10 @@ def test_ranking():
def test_stacking_regression():
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split
X, y = load_diabetes(return_X_y=True)
estimators = [
@@ -177,13 +175,13 @@ def test_stacking_regression():
def test_stacking_classification():
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
X, y = load_iris(return_X_y=True)
estimators = [
@@ -354,8 +352,8 @@ def test_num_parallel_tree():
def test_regression():
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
X, y = fetch_california_housing(return_X_y=True)
@@ -383,8 +381,8 @@ def test_regression():
def run_housing_rf_regression(tree_method):
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
X, y = fetch_california_housing(return_X_y=True)
@@ -407,8 +405,8 @@ def test_rf_regression():
def test_parameter_tuning():
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import GridSearchCV
X, y = fetch_california_housing(return_X_y=True)
xgb_model = xgb.XGBRegressor(learning_rate=0.1)
@@ -421,8 +419,8 @@ def test_parameter_tuning():
def test_regression_with_custom_objective():
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
def objective_ls(y_true, y_pred):
@@ -539,8 +537,8 @@ def test_sklearn_plotting():
import matplotlib
matplotlib.use('Agg')
from matplotlib.axes import Axes
from graphviz import Source
from matplotlib.axes import Axes
ax = xgb.plot_importance(classifier)
assert isinstance(ax, Axes)
@@ -666,8 +664,8 @@ def test_kwargs_error():
def test_kwargs_grid_search():
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
params = {'tree_method': 'hist'}
clf = xgb.XGBClassifier(n_estimators=1, learning_rate=1.0, **params)
@@ -841,9 +839,7 @@ def test_save_load_model():
def test_RFECV():
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris
from sklearn.feature_selection import RFECV
# Regression
@@ -1046,8 +1042,9 @@ def run_feature_weights(X, y, fw, tree_method, model=xgb.XGBRegressor):
with open(model_path) as fd:
model = json.load(fd)
parser_path = os.path.join(tm.PROJECT_ROOT, 'demo', 'json-model',
'json_parser.py')
parser_path = os.path.join(
tm.demo_dir(__file__), "json-model", "json_parser.py"
)
spec = importlib.util.spec_from_file_location("JsonParser",
parser_path)
foo = importlib.util.module_from_spec(spec)
@@ -1162,8 +1159,8 @@ def run_boost_from_prediction_multi_clasas(
@pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
def test_boost_from_prediction(tree_method):
from sklearn.datasets import load_breast_cancer, load_iris, make_regression
import pandas as pd
from sklearn.datasets import load_breast_cancer, load_iris, make_regression
X, y = load_breast_cancer(return_X_y=True)

View File

@@ -1,806 +0,0 @@
from concurrent.futures import ThreadPoolExecutor
import os
import multiprocessing
from typing import Tuple, Union, List, Sequence, Callable
import urllib
import zipfile
import sys
from typing import Optional, Dict, Any
from contextlib import contextmanager
from io import StringIO
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
import pytest
import gc
import xgboost as xgb
from xgboost.core import ArrayLike
import numpy as np
from scipy import sparse
import platform
hypothesis = pytest.importorskip('hypothesis')
sklearn = pytest.importorskip('sklearn')
from hypothesis import strategies
from hypothesis.extra.numpy import arrays
from joblib import Memory
from sklearn import datasets
try:
import cupy as cp
except ImportError:
cp = None
memory = Memory('./cachedir', verbose=0)
def no_ubjson():
    """Report whether the optional ``ubjson`` package is missing.

    Returns
    -------
    dict
        ``{"condition": bool, "reason": str}`` -- the keyword shape accepted by
        ``pytest.mark.skipif(**...)``.  ``condition`` is True when importing
        ``ubjson`` raises ImportError.
    """
    reason = "ubjson is not installed."
    # Keep the ``try`` body limited to the import that can actually fail.
    try:
        import ubjson  # noqa
    except ImportError:
        return {"condition": True, "reason": reason}
    return {"condition": False, "reason": reason}
def no_sklearn():
    """Return skip arguments that are active when scikit-learn is unavailable.

    The availability flag is ``SKLEARN_INSTALLED`` imported from
    ``xgboost.compat``; the result has the ``condition``/``reason`` keys
    consumed by ``pytest.mark.skipif(**...)``.
    """
    missing = not SKLEARN_INSTALLED
    return dict(condition=missing, reason='Scikit-Learn is not installed')
def no_dask():
    """Report whether the optional ``dask`` package is missing.

    Returns
    -------
    dict
        ``{"condition": bool, "reason": str}`` for use as
        ``pytest.mark.skipif(**no_dask())``.  ``condition`` is True when no
        importable ``dask`` module can be located.
    """
    # Function-scope import, the same approach ``no_dt`` uses.
    import importlib.util

    # The previous implementation imported ``pkg_resources`` inside a ``try``
    # whose ``except`` clause referenced ``pkg_resources.DistributionNotFound``;
    # when ``pkg_resources`` itself was missing, evaluating that clause failed
    # with an unbound name instead of producing a skip marker.  Probing the
    # import system directly has no such dependency.
    # NOTE(review): this checks that ``dask`` is importable rather than that a
    # ``dask`` distribution is registered -- confirm this is acceptable.
    spec = importlib.util.find_spec("dask")
    return {"condition": spec is None, "reason": "Dask is not installed"}
def no_spark():
    """Return skip arguments that are active when ``pyspark`` cannot be imported.

    The result is a dict with ``condition`` (True when the import raises
    ImportError) and ``reason`` keys.
    """
    try:
        import pyspark  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {"condition": missing, "reason": "Spark is not installed"}
def no_pandas():
    """Return skip arguments that are active when pandas is unavailable.

    The availability flag is ``PANDAS_INSTALLED`` imported from
    ``xgboost.compat``; the result has ``condition`` and ``reason`` keys.
    """
    missing = not PANDAS_INSTALLED
    return dict(condition=missing, reason='Pandas is not installed.')
def no_arrow():
    """Return skip arguments that are active when ``pyarrow`` cannot be imported.

    The result is a dict with ``condition`` (True when the import raises
    ImportError) and ``reason`` keys.
    """
    try:
        import pyarrow  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {"condition": missing, "reason": "pyarrow is not installed"}
def no_modin():
    """Return skip arguments that are active when ``modin.pandas`` cannot be imported.

    The result is a dict with ``condition`` (True when the import raises
    ImportError) and ``reason`` keys.
    """
    try:
        import modin.pandas as _  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'Modin is not installed.'}
def no_dt():
    """Return skip arguments that are active when ``datatable`` cannot be located.

    Uses ``importlib.util.find_spec`` so the package is looked up without
    being imported.  The result has ``condition`` and ``reason`` keys.
    """
    import importlib.util

    missing = importlib.util.find_spec('datatable') is None
    return {'condition': missing, 'reason': 'Datatable is not installed.'}
def no_matplotlib():
    """Return skip arguments that are active when ``matplotlib.pyplot`` cannot be imported.

    The result is a dict with ``condition`` (True when the import raises
    ImportError) and ``reason`` keys.
    """
    try:
        import matplotlib.pyplot as _  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'Matplotlib is not installed.'}
def no_dask_cuda():
    """Return skip arguments that are active when ``dask_cuda`` cannot be imported.

    The result is a dict with ``condition`` (True when the import raises
    ImportError) and ``reason`` keys.
    """
    try:
        import dask_cuda as _  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'dask_cuda is not installed.'}
def no_cudf():
    """Return skip arguments that are active when ``cudf`` cannot be imported.

    The result is a dict with ``condition`` (True when the import raises
    ImportError) and ``reason`` keys.
    """
    try:
        import cudf  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'CUDF is not installed'}
def no_cupy():
    """Return skip arguments that are active when ``cupy`` cannot be imported.

    The result is a dict with ``condition`` (True when the import raises
    ImportError) and ``reason`` keys.
    """
    try:
        import cupy as _  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'cupy is not installed.'}
def no_dask_cudf():
    """Return skip arguments that are active when ``dask_cudf`` cannot be imported.

    The result is a dict with ``condition`` (True when the import raises
    ImportError) and ``reason`` keys.
    """
    try:
        import dask_cudf as _  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'dask_cudf is not installed.'}
def no_json_schema():
    """Return skip arguments that are active when ``jsonschema`` cannot be imported.

    The result is a dict with ``condition`` (True when the import raises
    ImportError) and ``reason`` keys.
    """
    try:
        import jsonschema  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'jsonschema is not installed'}
def no_graphviz():
    """Return skip arguments that are active when ``graphviz`` cannot be imported.

    The result is a dict with ``condition`` (True when the import raises
    ImportError) and ``reason`` keys.
    """
    try:
        import graphviz  # noqa
    except ImportError:
        missing = True
    else:
        missing = False
    return {'condition': missing, 'reason': 'graphviz is not installed'}
def no_multiple(*args):
    """Combine several ``condition``/``reason`` dicts into one.

    The arguments are scanned in order; the first one whose ``condition`` is
    truthy determines the result (its ``condition`` value and its ``reason``).
    When none is truthy the reason is the empty string, and ``condition`` is
    the last argument's (falsy) value, or ``False`` when called without
    arguments.
    """
    combined = {'condition': False, 'reason': ''}
    for marker in args:
        # Every earlier condition was falsy, so the running "or" is simply the
        # current marker's value.
        combined['condition'] = marker['condition']
        if marker['condition']:
            combined['reason'] = marker['reason']
            return combined
    return combined
def skip_s390x():
    """Return skip arguments that are active when the machine type is ``s390x``.

    The machine type is read from ``platform.machine()``.
    """
    return {
        "condition": platform.machine() == "s390x",
        "reason": "Known to fail on s390x",
    }
class IteratorForTest(xgb.core.DataIter):
    """``DataIter`` implementation that replays pre-built batches.

    Parameters
    ----------
    X, y :
        Sequences of per-batch features and labels; they must have the same
        length.
    w :
        Optional sequence of per-batch weights.
    cache :
        Forwarded unchanged to ``xgb.core.DataIter.__init__``.
    """

    def __init__(
        self,
        X: Sequence,
        y: Sequence,
        w: Optional[Sequence],
        cache: Optional[str] = "./"
    ) -> None:
        assert len(X) == len(y)
        self.X = X
        self.y = y
        self.w = w
        # Index of the batch handed out by the next call to ``next``.
        self.it = 0
        super().__init__(cache)

    def next(self, input_data: Callable) -> int:
        """Feed the current batch to ``input_data``; return 0 when exhausted, else 1."""
        idx = self.it
        if idx == len(self.X):
            return 0

        # Passing the batch positionally is expected to be rejected with a
        # TypeError mentioning "keyword args".
        with pytest.raises(TypeError, match="keyword args"):
            input_data(self.X[idx], self.y[idx], None)

        # Use copy to make sure the iterator doesn't hold a reference to the data.
        # The copies are created inline so that nothing in this frame keeps
        # them alive past the call.
        input_data(
            data=self.X[idx].copy(),
            label=self.y[idx].copy(),
            weight=self.w[idx].copy() if self.w else None,
        )
        gc.collect()  # clear up the copy, see if XGBoost access freed memory.
        self.it = idx + 1
        return 1

    def reset(self) -> None:
        """Rewind to the first batch."""
        self.it = 0

    def as_arrays(
        self,
    ) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, ArrayLike]:
        """Concatenate all batches into single ``X``, ``y`` and ``w`` objects.

        CSR batches are stacked into one CSR matrix; anything else is joined
        with ``np.concatenate`` along axis 0.  ``w`` is ``None`` when no
        weights were supplied.
        """
        if isinstance(self.X[0], sparse.csr_matrix):
            features = sparse.vstack(self.X, format="csr")
        else:
            features = np.concatenate(self.X, axis=0)
        labels = np.concatenate(self.y, axis=0)
        weights = np.concatenate(self.w, axis=0) if self.w else None
        return features, labels, weights
def make_batches(
    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
    """Generate ``n_batches`` batches of random features, labels and weights.

    A ``RandomState`` seeded with 1994 is used (from ``cupy.random`` when
    ``use_cupy`` is true, otherwise from ``numpy.random``).  For each batch
    the features, labels and weights are drawn in that order.

    Returns
    -------
    Three lists of length ``n_batches``: feature matrices of shape
    ``(n_samples_per_batch, n_features)``, label vectors and weight vectors
    (uniform in [0, 1)) of length ``n_samples_per_batch``.
    """
    if use_cupy:
        import cupy

        generator = cupy.random.RandomState(1994)
    else:
        generator = np.random.RandomState(1994)

    features: list = []
    labels: list = []
    weights: list = []
    for _ in range(n_batches):
        # Draw order (X, y, w) determines the generated values; keep it.
        features.append(generator.randn(n_samples_per_batch, n_features))
        labels.append(generator.randn(n_samples_per_batch))
        weights.append(generator.uniform(low=0, high=1, size=n_samples_per_batch))
    return features, labels, weights
def make_batches_sparse(
    n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]:
    """Generate ``n_batches`` batches with random CSR feature matrices.

    All values come from one ``np.random.RandomState(1994)``.  Each feature
    matrix is a float32 CSR matrix of shape
    ``(n_samples_per_batch, n_features)`` with density ``1.0 - sparsity``;
    labels are standard normal and weights are uniform in [0, 1).

    Returns
    -------
    Three lists of length ``n_batches``: features, labels and weights.
    """
    generator = np.random.RandomState(1994)
    features: list = []
    labels: list = []
    weights: list = []
    for _ in range(n_batches):
        # Draw order (X, y, w) determines the generated values; keep it.
        features.append(
            sparse.random(
                n_samples_per_batch,
                n_features,
                density=1.0 - sparsity,
                format="csr",
                dtype=np.float32,
                random_state=generator,
            )
        )
        labels.append(generator.randn(n_samples_per_batch))
        weights.append(generator.uniform(low=0, high=1, size=n_samples_per_batch))
    return features, labels, weights
# Contains a dataset in numpy format as well as the relevant objective and metric
class TestDataset:
    """A dataset in numpy format together with its objective and metric.

    Parameters
    ----------
    name :
        Label returned by ``repr()``.
    get_dataset :
        Zero-argument callable returning an ``(X, y)`` pair; it is invoked
        once in the constructor.
    objective, metric :
        Values written into the ``objective`` and ``eval_metric`` training
        parameters by ``set_params``.
    """

    def __init__(
        self, name: str, get_dataset: Callable, objective: str, metric: str
    ) -> None:
        self.name = name
        self.objective = objective
        self.metric = metric
        self.X, self.y = get_dataset()
        # Optional sample weights and base margin; unset by default.
        self.w: Optional[np.ndarray] = None
        self.margin: Optional[np.ndarray] = None

    def set_params(self, params_in: Dict[str, Any]) -> Dict[str, Any]:
        """Write objective/metric into ``params_in`` (mutated in place) and return it."""
        params_in.update(objective=self.objective, eval_metric=self.metric)
        if self.objective == "multi:softmax":
            # The class count is derived from the largest label value.
            params_in["num_class"] = int(np.max(self.y) + 1)
        return params_in

    def get_dmat(self) -> xgb.DMatrix:
        """Build an ``xgb.DMatrix`` from the stored arrays, weights and margin."""
        return xgb.DMatrix(
            self.X, self.y, self.w, base_margin=self.margin, enable_categorical=True
        )

    def get_device_dmat(self) -> xgb.DeviceQuantileDMatrix:
        """Build an ``xgb.DeviceQuantileDMatrix`` from cupy copies of the data."""
        if self.w is None:
            weights = None
        else:
            weights = cp.array(self.w)
        features = cp.array(self.X, dtype=np.float32)
        labels = cp.array(self.y, dtype=np.float32)
        return xgb.DeviceQuantileDMatrix(
            features, labels, weights, base_margin=self.margin
        )

    def get_external_dmat(self) -> xgb.DMatrix:
        """Build an ``xgb.DMatrix`` from ten row slices fed via ``IteratorForTest``."""
        n_samples = self.X.shape[0]
        n_batches = 10
        per_batch = n_samples // n_batches + 1

        predictor = []
        response = []
        weight = []
        for batch_idx in range(n_batches):
            beg = batch_idx * per_batch
            end = min(beg + per_batch, n_samples)
            # Every batch must contain at least one row.
            assert end != beg
            predictor.append(self.X[beg:end, ...])
            response.append(self.y[beg:end])
            if self.w is not None:
                weight.append(self.w[beg:end])

        it = IteratorForTest(predictor, response, weight if weight else None)
        return xgb.DMatrix(it)

    def __repr__(self) -> str:
        return self.name
@memory.cache
def get_california_housing():
    """Return ``(data, target)`` from ``datasets.fetch_california_housing()``.

    The call is wrapped by ``memory.cache``.
    """
    bunch = datasets.fetch_california_housing()
    features, target = bunch.data, bunch.target
    return features, target
@memory.cache
def get_digits():
    """Return ``(data, target)`` from ``datasets.load_digits()``.

    The call is wrapped by ``memory.cache``.
    """
    bunch = datasets.load_digits()
    features, target = bunch.data, bunch.target
    return features, target
@memory.cache
def get_cancer():
    """Return ``(data, target)`` from ``datasets.load_breast_cancer()``.

    The call is wrapped by ``memory.cache``.
    """
    bunch = datasets.load_breast_cancer()
    features, target = bunch.data, bunch.target
    return features, target
@memory.cache
def get_sparse():
    """Return a regression dataset in which entries are randomly replaced by NaN.

    ``datasets.make_regression`` produces 2000 samples; each entry of ``X`` is
    then set to NaN where a Bernoulli(0.75) draw equals one.  Both steps share
    one ``np.random.RandomState(199)``, so the result is reproducible.
    """
    generator = np.random.RandomState(199)
    n_samples = 2000
    sparsity = 0.75
    X, y = datasets.make_regression(n_samples, random_state=generator)
    # One 0/1 draw per entry; a non-zero draw marks the entry as missing.
    missing_mask = generator.binomial(1, sparsity, X.shape).astype(bool)
    X[missing_mask] = np.nan
    return X, y
@memory.cache
def get_ames_housing():
    """Fetch a 20-column subset of the Ames housing data from OpenML.

    Number of samples: 1460
    Number of features: 20
    Number of categorical features: 10
    Number of numerical features: 10

    The selected categorical columns are converted to the pandas ``category``
    dtype before the frame is returned together with the target.
    """
    from sklearn.datasets import fetch_openml

    cat_cols = [
        "BldgType",  # 5 cats, no nan
        "GarageFinish",  # 3 cats, nan
        "LotConfig",  # 5 cats, no nan
        "Functional",  # 7 cats, no nan
        "MasVnrType",  # 4 cats, nan
        "HouseStyle",  # 8 cats, no nan
        "FireplaceQu",  # 5 cats, nan
        "ExterCond",  # 5 cats, no nan
        "ExterQual",  # 4 cats, no nan
        "PoolQC",  # 3 cats, nan
    ]
    num_cols = [
        "3SsnPorch",
        "Fireplaces",
        "BsmtHalfBath",
        "HalfBath",
        "GarageCars",
        "TotRmsAbvGrd",
        "BsmtFinSF1",
        "BsmtFinSF2",
        "GrLivArea",
        "ScreenPorch",
    ]

    X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)

    # Categorical columns first, numerical columns after.
    X = X[cat_cols + num_cols]
    X[cat_cols] = X[cat_cols].astype("category")
    return X, y
@memory.cache
def get_mq2008(dpath):
    """Download (if needed), extract and load fold 1 of the MQ2008 ranking data.

    Parameters
    ----------
    dpath :
        Directory in which the archive is stored and extracted.  It may be
        given with or without a trailing path separator.

    Returns
    -------
    tuple
        ``(x_train, y_train, qid_train, x_test, y_test, qid_test,
        x_valid, y_valid, qid_valid)``
    """
    from sklearn.datasets import load_svmlight_files

    src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
    # Build every path with os.path.join: plain string concatenation only
    # produced valid data-file paths when ``dpath`` ended with a separator.
    target = os.path.join(dpath, 'MQ2008.zip')
    if not os.path.exists(target):
        urllib.request.urlretrieve(url=src, filename=target)

    with zipfile.ZipFile(target, 'r') as f:
        f.extractall(path=dpath)

    fold_dir = os.path.join(dpath, "MQ2008", "Fold1")
    (x_train, y_train, qid_train, x_test, y_test, qid_test,
     x_valid, y_valid, qid_valid) = load_svmlight_files(
         (os.path.join(fold_dir, "train.txt"),
          os.path.join(fold_dir, "test.txt"),
          os.path.join(fold_dir, "vali.txt")),
         query_id=True, zero_based=False)

    return (x_train, y_train, qid_train, x_test, y_test, qid_test,
            x_valid, y_valid, qid_valid)
@memory.cache
def make_categorical(
    n_samples: int, n_features: int, n_categories: int, onehot: bool, sparsity=0.0,
):
    """Generate a random categorical dataset from a fixed seed.

    Parameters
    ----------
    n_samples :
        Number of rows.
    n_features :
        Number of categorical feature columns.
    n_categories :
        Number of categories per feature; codes are ``0 .. n_categories - 1``.
    onehot :
        Return the features one-hot encoded with ``pd.get_dummies``.
    sparsity :
        When positive, ``int(n_samples * sparsity)`` row indices are sampled
        (with replacement) per feature and set to NaN.

    Returns
    -------
    df, label :
        The feature frame and the label, which is the sum of one extra random
        column and all feature codes, plus one.
    """
    import pandas as pd

    rng = np.random.RandomState(1994)

    pd_dict = {}
    for i in range(n_features + 1):
        c = rng.randint(low=0, high=n_categories, size=n_samples)
        pd_dict[str(i)] = pd.Series(c, dtype=np.int64)

    df = pd.DataFrame(pd_dict)
    label = df.iloc[:, 0]
    df = df.iloc[:, 1:]
    for i in range(0, n_features):
        label += df.iloc[:, i]
    label += 1

    df = df.astype("category")
    categories = np.arange(0, n_categories)
    for col in df.columns:
        df[col] = df[col].cat.set_categories(categories)

    if sparsity > 0.0:
        for i in range(n_features):
            # NOTE(review): ``high`` is exclusive, so the last row is never
            # selected here; kept as-is so the generated data does not change.
            index = rng.randint(low=0, high=n_samples-1, size=int(n_samples * sparsity))
            # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works everywhere.
            df.iloc[index, i] = np.nan
            # Positional access must go through ``.iloc``: ``dtypes`` is
            # indexed by the (string) column labels.
            assert n_categories == np.unique(df.dtypes.iloc[i].categories).size

    if onehot:
        return pd.get_dummies(df), label
    return df, label
def _cat_sampled_from():
    """Create a strategy that yields randomly shaped categorical ``TestDataset``s.

    The shape parameters are drawn first and then mapped to a dataset built by
    ``make_categorical`` (without one-hot encoding).
    """

    @strategies.composite
    def _make_cat(draw):
        # Draw order: samples, features, categories, sparsity.
        n_samples = draw(strategies.integers(2, 512))
        n_features = draw(strategies.integers(1, 4))
        n_cats = draw(strategies.integers(1, 128))
        sparsity_strategy = strategies.floats(
            min_value=0,
            max_value=1,
            allow_nan=False,
            allow_infinity=False,
            allow_subnormal=False,
        )
        sparsity = draw(sparsity_strategy)
        return n_samples, n_features, n_cats, sparsity

    def _build(args):
        n_samples, n_features, n_cats, sparsity = args
        name = f"{n_samples}x{n_features}-{n_cats}-{sparsity}"

        def load():
            return make_categorical(n_samples, n_features, n_cats, False, sparsity)

        return TestDataset(name, load, "reg:squarederror", "rmse")

    return _make_cat().map(_build)


categorical_dataset_strategy = _cat_sampled_from()
@memory.cache
def make_sparse_regression(
    n_samples: int, n_features: int, sparsity: float, as_dense: bool
) -> Tuple[Union[sparse.csr_matrix], np.ndarray]:
    """Generate a random sparse regression dataset.

    Parameters
    ----------
    n_samples:
        Number of rows.
    n_features:
        Number of columns.
    sparsity:
        Fraction of entries that are absent; the density is ``1 - sparsity``.
    as_dense:
        Return the matrix as np.ndarray with missing values filled by NaN

    Notes
    -----
    The columns are split across worker threads, each seeded from its own
    index, so the generated values depend on the number of CPUs reported by
    the machine.  On numpy versions without ``default_rng`` a single-threaded
    fallback is used, which always returns a CSR matrix.
    """
    density = 1.0 - sparsity

    if not hasattr(np.random, "default_rng"):
        # old version of numpy on s390x
        legacy_rng = np.random.RandomState(1994)
        features = sparse.random(
            m=n_samples,
            n=n_features,
            density=density,
            random_state=legacy_rng,
            format="csr",
        )
        target = legacy_rng.normal(loc=0.0, scale=1.0, size=n_samples)
        return features, target

    # Use multi-thread to speed up the generation, convenient if you use this
    # function for benchmarking.
    n_threads = min(multiprocessing.cpu_count(), n_features)

    def _generate_slice(t_id: int) -> Tuple[sparse.csc_matrix, np.ndarray]:
        """Generate the columns owned by worker ``t_id`` and their target share."""
        rng = np.random.default_rng(1994 * t_id)
        thread_size = n_features // n_threads
        if t_id == n_threads - 1:
            # The last worker also takes the remainder columns.
            n_features_tloc = n_features - t_id * thread_size
        else:
            n_features_tloc = thread_size

        block = sparse.random(
            m=n_samples,
            n=n_features_tloc,
            density=density,
            random_state=rng,
        ).tocsc()
        partial = np.zeros((n_samples, 1))

        for col in range(block.shape[1]):
            # Columns without stored entries contribute nothing.
            if block.indptr[col + 1] != block.indptr[col]:
                partial += block[:, col].toarray() * rng.random((n_samples, 1)) * 0.2

        return block, partial

    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        futures = [executor.submit(_generate_slice, t_id) for t_id in range(n_threads)]

    results = [future.result() for future in futures]
    X_results = [block for block, _ in results]
    y_results = [partial for _, partial in results]
    assert len(y_results) == n_threads

    csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
    # Stack the per-thread contributions and add them up per sample.
    stacked = np.asarray(y_results)
    stacked = stacked.reshape((stacked.shape[0], stacked.shape[1])).T
    y = np.sum(stacked, axis=1)

    assert csr.shape[0] == n_samples
    assert csr.shape[1] == n_features
    assert y.shape[0] == n_samples

    if not as_dense:
        return csr, y

    arr = csr.toarray()
    assert arr.shape[0] == n_samples
    assert arr.shape[1] == n_features
    arr[arr == 0] = np.nan
    return arr, y
# Strategy sampling from sparse regression datasets of 1e5 rows and 8 columns,
# varying the sparsity and whether the data is CSR or dense with NaN.
sparse_datasets_strategy = strategies.sampled_from(
    [
        TestDataset(
            name,
            # Bind the loop values as defaults so each loader keeps its own.
            lambda sparsity=sparsity, as_dense=as_dense: make_sparse_regression(
                int(1e5), 8, sparsity, as_dense
            ),
            "reg:squarederror",
            "rmse",
        )
        for name, sparsity, as_dense in (
            ("1e5x8-0.95-csr", 0.95, False),
            ("1e5x8-0.5-csr", 0.5, False),
            ("1e5x8-0.5-dense", 0.5, True),
            ("1e5x8-0.05-csr", 0.05, False),
            ("1e5x8-0.05-dense", 0.05, True),
        )
    ]
)
# Strategy sampling from the example datasets, without weights or margins.
# Each entry is (name, loader, objective, metric).
_unweighted_datasets_strategy = strategies.sampled_from(
    [
        TestDataset(*spec)
        for spec in (
            ("calif_housing", get_california_housing, "reg:squarederror", "rmse"),
            ("calif_housing-l1", get_california_housing, "reg:absoluteerror", "mae"),
            ("digits", get_digits, "multi:softmax", "mlogloss"),
            ("cancer", get_cancer, "binary:logistic", "logloss"),
            (
                "mtreg",
                lambda: datasets.make_regression(n_samples=128, n_targets=3),
                "reg:squarederror",
                "rmse",
            ),
            ("sparse", get_sparse, "reg:squarederror", "rmse"),
            ("sparse-l1", get_sparse, "reg:absoluteerror", "mae"),
            (
                "empty",
                lambda: (np.empty((0, 100)), np.empty(0)),
                "reg:squarederror",
                "rmse",
            ),
        )
    ]
)
@strategies.composite
def _dataset_weight_margin(draw):
    """Draw an example dataset and optionally attach random weights and margin.

    Weights are drawn from ``[0.1, 2.0]`` and margins from ``[0.5, 1.0]``.  The
    margin has one column per class for ``multi:softmax`` and one per target
    for the ``mtreg`` dataset.
    """
    data: TestDataset = draw(_unweighted_datasets_strategy)

    # The datasets being sampled are module-level objects reused by every
    # draw.  Clear what an earlier draw may have attached so that the result
    # depends only on the values drawn for this example.
    data.w = None
    data.margin = None

    if draw(strategies.booleans()):
        data.w = draw(
            arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))
        )
    if draw(strategies.booleans()):
        num_class = 1
        if data.objective == "multi:softmax":
            num_class = int(np.max(data.y) + 1)
        elif data.name == "mtreg":
            num_class = data.y.shape[1]

        data.margin = draw(
            arrays(
                np.float64,
                (data.y.shape[0] * num_class),
                elements=strategies.floats(0.5, 1.0),
            )
        )
        if num_class != 1:
            data.margin = data.margin.reshape(data.y.shape[0], num_class)

    return data


# A strategy for drawing from a set of example datasets
# May add random weights to the dataset
dataset_strategy = _dataset_weight_margin()
def non_increasing(L, tolerance=1e-4):
    """Return True when no element exceeds its predecessor by ``tolerance`` or more."""
    for previous, current in zip(L, L[1:]):
        if not (current - previous) < tolerance:
            return False
    return True
def eval_error_metric(predt, dtrain: xgb.DMatrix):
    """Evaluation metric for xgb.train

    Thresholds the predictions at 0.5 and sums the per-sample error against
    the labels stored in ``dtrain``.  Returns ``("CustomErr", value)``.
    """
    label = dtrain.get_label()
    errors = np.zeros(predt.shape)
    positive = predt > 0.5
    if predt.size == 0:
        return "CustomErr", 0

    negative = predt <= 0.5
    # Predicted positive: the error is 1 - label; predicted negative: the label.
    errors[positive] = 1 - label[positive]
    errors[negative] = label[negative]
    return 'CustomErr', np.sum(errors)
def eval_error_metric_skl(y_true: np.ndarray, y_score: np.ndarray) -> float:
    """Evaluation metric that looks like metrics provided by sklearn.

    Thresholds ``y_score`` at 0.5 and returns the summed per-sample error.
    """
    errors = np.zeros(y_score.shape)
    positive = y_score > 0.5
    negative = y_score <= 0.5
    # Predicted positive: the error is 1 - label; predicted negative: the label.
    errors[positive] = 1 - y_true[positive]
    errors[negative] = y_true[negative]
    return np.sum(errors)
def root_mean_square(y_true: np.ndarray, y_score: np.ndarray) -> float:
    """Return the root mean squared error between ``y_score`` and ``y_true``."""
    residual = y_score - y_true
    mean_squared = np.dot(residual, residual) / y_score.size
    return np.sqrt(mean_squared)
def softmax(x):
    """Return the softmax of ``x``, normalised over all of its elements.

    The input is shifted by its maximum before exponentiation.  This leaves
    the mathematical result unchanged but keeps ``np.exp`` from overflowing
    for large inputs, which would otherwise turn the output into NaN.
    """
    # ``np.max`` raises on empty input, so an empty ``x`` is left unshifted.
    shifted = x - np.max(x) if np.size(x) else x
    e = np.exp(shifted)
    return e / np.sum(e)
def softprob_obj(classes):
    """Create a custom multi-class softprob objective for ``classes`` classes.

    The returned callable takes ``(labels, predt)``, where ``predt`` has one
    row per sample and one column per class, and returns ``(grad, hess)``,
    each reshaped to ``(rows * classes, 1)``.
    """

    def objective(labels, predt):
        rows = labels.shape[0]
        grad = np.zeros((rows, classes), dtype=float)
        hess = np.zeros((rows, classes), dtype=float)
        eps = 1e-6  # lower bound for the hessian
        for r in range(predt.shape[0]):
            target = labels[r]
            # The label must be a valid class index.  The previous check
            # joined the two bounds with ``or`` and could never fail.
            assert 0 <= target < classes, f"label {target} is out of range"
            p = softmax(predt[r, :])
            for c in range(predt.shape[1]):
                g = p[c] - 1.0 if c == target else p[c]
                h = max((2.0 * p[c] * (1.0 - p[c])).item(), eps)
                grad[r, c] = g
                hess[r, c] = h

        grad = grad.reshape((rows * classes, 1))
        hess = hess.reshape((rows * classes, 1))
        return grad, hess

    return objective
class DirectoryExcursion:
    """Change directory. Change back and optionally cleaning up the directory when exit.

    Parameters
    ----------
    path :
        Directory to change into.  A relative path is resolved against the
        working directory in effect when the context is entered.
    cleanup :
        When true, files that appear under ``path`` while the context is
        active are removed on exit.  Files that already existed are kept.
    """

    def __init__(self, path: os.PathLike, cleanup=False):
        self.path = path
        self.curdir = os.path.normpath(os.path.abspath(os.path.curdir))
        self.cleanup = cleanup
        self.files = set()
        self._root = None

    @staticmethod
    def _list_files(root):
        """Return the set of paths of all files found below ``root``."""
        return {
            os.path.join(dirpath, name)
            for dirpath, _, names in os.walk(root)
            for name in names
        }

    def __enter__(self):
        # Resolve the target before changing directory.  Walking a relative
        # path after ``chdir`` looks in the wrong place, which left the
        # snapshot empty and caused pre-existing files to be deleted on exit.
        self._root = os.path.abspath(self.path)
        if self.cleanup:
            self.files = self._list_files(self._root)
        os.chdir(self._root)
        return self

    def __exit__(self, *args):
        os.chdir(self.curdir)
        if self.cleanup:
            for created in self._list_files(self._root) - self.files:
                os.remove(created)
@contextmanager
def captured_output():
    """Reassign stdout temporarily in order to test printed statements
    Taken from:
    https://stackoverflow.com/questions/4219717/how-to-assert-output-with-nosetest-unittest-in-python

    Also works for pytest.

    Yields the ``(stdout, stderr)`` replacement buffers; the original streams
    are restored on exit, even if the body raises.
    """
    saved_streams = sys.stdout, sys.stderr
    buffers = StringIO(), StringIO()
    try:
        sys.stdout, sys.stderr = buffers
        yield sys.stdout, sys.stderr
    finally:
        sys.stdout, sys.stderr = saved_streams
try:
# Python 3.7+
from contextlib import nullcontext as noop_context
except ImportError:
# Python 3.6
from contextlib import suppress as noop_context
# Absolute, normalised path of the directory containing this module.
CURDIR = os.path.normpath(
    os.path.abspath(os.path.dirname(__file__))
)
# Two directory levels above ``CURDIR``.
PROJECT_ROOT = os.path.normpath(
    os.path.join(CURDIR, *([os.path.pardir] * 2))
)

View File

@@ -1,7 +1,9 @@
import xgboost as xgb
import sys
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
import sys
import xgboost as xgb
def run_omp(output_path: str):