Move Python testing utilities into xgboost module. (#8379)
- Add typehints. - Fixes for pylint. Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -1,7 +1,9 @@
|
||||
import xgboost
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
|
||||
import xgboost
|
||||
|
||||
kRounds = 2
|
||||
kRows = 1000
|
||||
kCols = 4
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import numpy as np
|
||||
import os
|
||||
import xgboost as xgb
|
||||
import pytest
|
||||
import json
|
||||
from pathlib import Path
|
||||
import os
|
||||
import tempfile
|
||||
import testing as tm
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
dpath = 'demo/data/'
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import os
|
||||
import json
|
||||
import testing as tm
|
||||
import pytest
|
||||
import locale
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
dpath = os.path.join(tm.PROJECT_ROOT, 'demo/data/')
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
dpath = tm.data_dir(__file__)
|
||||
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
@@ -36,8 +38,8 @@ class TestModels:
|
||||
param = {'verbosity': 0, 'objective': 'binary:logistic',
|
||||
'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1,
|
||||
'nthread': 1}
|
||||
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
|
||||
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
|
||||
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
num_round = 4
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||
@@ -49,8 +51,8 @@ class TestModels:
|
||||
assert err < 0.2
|
||||
|
||||
def test_dart(self):
|
||||
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
|
||||
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
|
||||
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
|
||||
param = {'max_depth': 5, 'objective': 'binary:logistic',
|
||||
'eval_metric': 'logloss', 'booster': 'dart', 'verbosity': 1}
|
||||
# specify validations set to watch performance
|
||||
@@ -116,7 +118,7 @@ class TestModels:
|
||||
|
||||
def test_boost_from_prediction(self):
|
||||
# Re-construct dtrain here to avoid modification
|
||||
margined = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
margined = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
|
||||
bst = xgb.train({'tree_method': 'hist'}, margined, 1)
|
||||
predt_0 = bst.predict(margined, output_margin=True)
|
||||
margined.set_base_margin(predt_0)
|
||||
@@ -124,13 +126,13 @@ class TestModels:
|
||||
predt_1 = bst.predict(margined)
|
||||
|
||||
assert np.any(np.abs(predt_1 - predt_0) > 1e-6)
|
||||
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
|
||||
bst = xgb.train({'tree_method': 'hist'}, dtrain, 2)
|
||||
predt_2 = bst.predict(dtrain)
|
||||
assert np.all(np.abs(predt_2 - predt_1) < 1e-6)
|
||||
|
||||
def test_boost_from_existing_model(self):
|
||||
X = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
X = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
|
||||
booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4)
|
||||
assert booster.num_boosted_rounds() == 4
|
||||
booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4,
|
||||
@@ -150,8 +152,8 @@ class TestModels:
|
||||
'objective': 'reg:logistic',
|
||||
"tree_method": tree_method
|
||||
}
|
||||
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
|
||||
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
|
||||
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
num_round = 10
|
||||
|
||||
@@ -197,8 +199,8 @@ class TestModels:
|
||||
self.run_custom_objective()
|
||||
|
||||
def test_multi_eval_metric(self):
|
||||
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
|
||||
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
|
||||
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
param = {'max_depth': 2, 'eta': 0.2, 'verbosity': 1,
|
||||
'objective': 'binary:logistic'}
|
||||
@@ -220,7 +222,7 @@ class TestModels:
|
||||
param['scale_pos_weight'] = ratio
|
||||
return (dtrain, dtest, param)
|
||||
|
||||
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
|
||||
xgb.cv(param, dtrain, num_round, nfold=5,
|
||||
metrics={'auc'}, seed=0, fpreproc=fpreproc)
|
||||
|
||||
@@ -228,7 +230,7 @@ class TestModels:
|
||||
param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
|
||||
'objective': 'binary:logistic'}
|
||||
num_round = 2
|
||||
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
|
||||
xgb.cv(param, dtrain, num_round, nfold=5,
|
||||
metrics={'error'}, seed=0, show_stdv=False)
|
||||
|
||||
@@ -346,7 +348,7 @@ class TestModels:
|
||||
os.remove(model_path)
|
||||
|
||||
try:
|
||||
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
|
||||
xgb.train({'objective': 'foo'}, dtrain, num_boost_round=1)
|
||||
except ValueError as e:
|
||||
e_str = str(e)
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
from typing import Union
|
||||
import xgboost as xgb
|
||||
import pytest
|
||||
import os
|
||||
import testing as tm
|
||||
import tempfile
|
||||
from contextlib import nullcontext
|
||||
from typing import Union
|
||||
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
# We use the dataset for tests.
|
||||
pytestmark = pytest.mark.skipif(**tm.no_sklearn())
|
||||
@@ -271,13 +274,14 @@ class TestCallbacks:
|
||||
"""Test learning rate scheduler, used by both CPU and GPU tests."""
|
||||
scheduler = xgb.callback.LearningRateScheduler
|
||||
|
||||
dpath = os.path.join(tm.PROJECT_ROOT, 'demo/data/')
|
||||
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
|
||||
dpath = tm.data_dir(__file__)
|
||||
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
|
||||
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
|
||||
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
num_round = 4
|
||||
|
||||
warning_check = tm.noop_context()
|
||||
warning_check = nullcontext()
|
||||
|
||||
# learning_rates as a list
|
||||
# init eta with 0 to check whether learning_rates work
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
import os
|
||||
import tempfile
|
||||
import platform
|
||||
import xgboost
|
||||
import subprocess
|
||||
import numpy
|
||||
import json
|
||||
import testing as tm
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
import numpy
|
||||
|
||||
import xgboost
|
||||
from xgboost import testing as tm
|
||||
|
||||
|
||||
class TestCLI:
|
||||
@@ -29,7 +31,7 @@ data = {data_path}
|
||||
eval[test] = {data_path}
|
||||
'''
|
||||
|
||||
PROJECT_ROOT = tm.PROJECT_ROOT
|
||||
PROJECT_ROOT = tm.project_root(__file__)
|
||||
|
||||
def get_exe(self):
|
||||
if platform.system() == 'Windows':
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from hypothesis import given, settings, strategies
|
||||
from scipy.sparse import csr_matrix
|
||||
from testing import IteratorForTest, make_batches, non_increasing
|
||||
from xgboost.data import SingleBatchInternalIter as SingleBatch
|
||||
from xgboost.testing import IteratorForTest, make_batches, non_increasing
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing
|
||||
from xgboost import testing as tm
|
||||
|
||||
pytestmark = testing.timeout(30)
|
||||
pytestmark = tm.timeout(30)
|
||||
|
||||
|
||||
def test_single_batch(tree_method: str = "approx") -> None:
|
||||
@@ -83,7 +85,7 @@ def run_data_iterator(
|
||||
if tree_method == "gpu_hist":
|
||||
parameters["sampling_method"] = "gradient_based"
|
||||
|
||||
results_from_it: xgb.callback.EvaluationMonitor.EvalsLog = {}
|
||||
results_from_it: Dict[str, Dict[str, List[float]]] = {}
|
||||
from_it = xgb.train(
|
||||
parameters,
|
||||
Xy,
|
||||
@@ -106,7 +108,7 @@ def run_data_iterator(
|
||||
assert Xy.num_row() == n_samples_per_batch * n_batches
|
||||
assert Xy.num_col() == n_features
|
||||
|
||||
results_from_arrays: xgb.callback.EvaluationMonitor.EvalsLog = {}
|
||||
results_from_arrays: Dict[str, Dict[str, List[float]]] = {}
|
||||
from_arrays = xgb.train(
|
||||
parameters,
|
||||
Xy,
|
||||
|
||||
@@ -3,14 +3,12 @@ import subprocess
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
import testing as tm
|
||||
|
||||
from xgboost import testing
|
||||
from xgboost import testing as tm
|
||||
|
||||
pytestmark = testing.timeout(30)
|
||||
pytestmark = tm.timeout(30)
|
||||
|
||||
ROOT_DIR = tm.PROJECT_ROOT
|
||||
DEMO_DIR = os.path.join(ROOT_DIR, 'demo')
|
||||
DEMO_DIR = tm.demo_dir(__file__)
|
||||
PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, 'guide-python')
|
||||
CLI_DEMO_DIR = os.path.join(DEMO_DIR, 'CLI')
|
||||
|
||||
@@ -156,7 +154,7 @@ def test_cli_regression_demo():
|
||||
cmd = ['python', script, 'machine.txt', '1']
|
||||
subprocess.check_call(cmd, cwd=reg_dir)
|
||||
|
||||
exe = os.path.join(tm.PROJECT_ROOT, 'xgboost')
|
||||
exe = os.path.join(DEMO_DIR, os.path.pardir, 'xgboost')
|
||||
conf = os.path.join(reg_dir, 'machine.conf')
|
||||
subprocess.check_call([exe, conf], cwd=reg_dir)
|
||||
|
||||
|
||||
@@ -4,11 +4,11 @@ import tempfile
|
||||
import numpy as np
|
||||
import pytest
|
||||
import scipy.sparse
|
||||
import testing as tm
|
||||
from hypothesis import given, settings, strategies
|
||||
from scipy.sparse import csr_matrix, rand
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
rng = np.random.RandomState(1)
|
||||
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import testing as tm
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
try:
|
||||
import datatable as dt
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import xgboost as xgb
|
||||
import testing as tm
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import xgboost as xgb
|
||||
import testing as tm
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
rng = np.random.RandomState(1337)
|
||||
|
||||
|
||||
@@ -254,8 +255,8 @@ class TestEvalMetrics:
|
||||
self.run_roc_auc_multi("hist", n_samples, weighted)
|
||||
|
||||
def run_pr_auc_binary(self, tree_method):
|
||||
from sklearn.metrics import precision_recall_curve, auc
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.metrics import auc, precision_recall_curve
|
||||
X, y = make_classification(128, 4, n_classes=2, random_state=1994)
|
||||
clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=1)
|
||||
clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import numpy as np
|
||||
import xgboost
|
||||
import testing as tm
|
||||
import pytest
|
||||
|
||||
import xgboost
|
||||
from xgboost import testing as tm
|
||||
|
||||
dpath = 'demo/data/'
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
import testing as tm
|
||||
from hypothesis import given, note, settings, strategies
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing
|
||||
from xgboost import testing as tm
|
||||
|
||||
pytestmark = testing.timeout(10)
|
||||
pytestmark = tm.timeout(10)
|
||||
|
||||
|
||||
parameter_strategy = strategies.fixed_dictionaries({
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
import xgboost
|
||||
import os
|
||||
import generate_models as gm
|
||||
import testing as tm
|
||||
import json
|
||||
import zipfile
|
||||
import pytest
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
import urllib.request
|
||||
import zipfile
|
||||
|
||||
import generate_models as gm
|
||||
import pytest
|
||||
|
||||
import xgboost
|
||||
from xgboost import testing as tm
|
||||
|
||||
|
||||
def run_model_param_check(config):
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import testing as tm
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
dpath = 'demo/data/'
|
||||
|
||||
|
||||
|
||||
@@ -4,12 +4,11 @@ import tempfile
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import testing as tm
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing
|
||||
from xgboost import testing as tm
|
||||
|
||||
pytestmark = testing.timeout(10)
|
||||
pytestmark = tm.timeout(10)
|
||||
|
||||
|
||||
class TestOMP:
|
||||
@@ -86,7 +85,7 @@ class TestOMP:
|
||||
def test_with_omp_thread_limit(self):
|
||||
args = [
|
||||
"python", os.path.join(
|
||||
tm.PROJECT_ROOT, "tests", "python", "with_omp_limit.py"
|
||||
os.path.dirname(tm.normpath(__file__)), "with_omp_limit.py"
|
||||
)
|
||||
]
|
||||
results = []
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import xgboost as xgb
|
||||
import numpy as np
|
||||
import pytest
|
||||
import testing as tm
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
pytestmark = pytest.mark.skipif(**tm.no_pandas())
|
||||
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
import pickle
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import os
|
||||
import json
|
||||
import os
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
|
||||
import xgboost as xgb
|
||||
|
||||
kRows = 100
|
||||
kCols = 10
|
||||
|
||||
@@ -1,15 +1,16 @@
|
||||
import json
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import testing as tm
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
try:
|
||||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
from matplotlib.axes import Axes
|
||||
from graphviz import Source
|
||||
from matplotlib.axes import Axes
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
'''Tests for running inplace prediction.'''
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
import pytest
|
||||
import pandas as pd
|
||||
|
||||
import testing as tm
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from scipy import sparse
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
|
||||
def run_threaded_predict(X, rows, predict_func):
|
||||
|
||||
@@ -4,7 +4,7 @@ import numpy as np
|
||||
import pytest
|
||||
from hypothesis import given, settings, strategies
|
||||
from scipy import sparse
|
||||
from testing import (
|
||||
from xgboost.testing import (
|
||||
IteratorForTest,
|
||||
make_batches,
|
||||
make_batches_sparse,
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
import numpy as np
|
||||
from scipy.sparse import csr_matrix
|
||||
import testing as tm
|
||||
import xgboost
|
||||
import os
|
||||
import itertools
|
||||
import os
|
||||
import shutil
|
||||
import urllib.request
|
||||
import zipfile
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import csr_matrix
|
||||
|
||||
import xgboost
|
||||
from xgboost import testing as tm
|
||||
|
||||
|
||||
def test_ranking_with_unweighted_data():
|
||||
Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17])
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import itertools
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import scipy
|
||||
import scipy.special
|
||||
|
||||
import xgboost as xgb
|
||||
|
||||
dpath = 'demo/data/'
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
|
||||
@@ -4,7 +4,8 @@ from typing import List
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
import testing as tm
|
||||
|
||||
from xgboost import testing as tm
|
||||
|
||||
if tm.no_spark()["condition"]:
|
||||
pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
|
||||
|
||||
@@ -6,10 +6,9 @@ import uuid
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import testing as tm
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing
|
||||
from xgboost import testing as tm
|
||||
|
||||
if tm.no_spark()["condition"]:
|
||||
pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
|
||||
@@ -38,7 +37,7 @@ from .utils import SparkTestCase
|
||||
|
||||
logging.getLogger("py4j").setLevel(logging.INFO)
|
||||
|
||||
pytestmark = testing.timeout(60)
|
||||
pytestmark = tm.timeout(60)
|
||||
|
||||
|
||||
class XgboostLocalTest(SparkTestCase):
|
||||
|
||||
@@ -6,7 +6,8 @@ import uuid
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import testing as tm
|
||||
|
||||
from xgboost import testing as tm
|
||||
|
||||
if tm.no_spark()["condition"]:
|
||||
pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
|
||||
|
||||
@@ -6,9 +6,10 @@ import tempfile
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
import testing as tm
|
||||
from six import StringIO
|
||||
|
||||
from xgboost import testing as tm
|
||||
|
||||
if tm.no_spark()["condition"]:
|
||||
pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
|
||||
if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
import testing as tm
|
||||
import pytest
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import json
|
||||
import os
|
||||
|
||||
dpath = os.path.join(tm.PROJECT_ROOT, 'demo', 'data')
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
dpath = tm.data_dir(__file__)
|
||||
|
||||
|
||||
def test_aft_survival_toy_data():
|
||||
|
||||
@@ -3,10 +3,10 @@ import sys
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import testing as tm
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import RabitTracker, testing
|
||||
from xgboost import RabitTracker
|
||||
from xgboost import testing as tm
|
||||
|
||||
if sys.platform.startswith("win"):
|
||||
pytest.skip("Skipping dask tests on Windows", allow_module_level=True)
|
||||
@@ -61,7 +61,7 @@ def test_rabit_ops():
|
||||
run_rabit_ops(client, n_workers)
|
||||
|
||||
|
||||
@pytest.mark.skipif(**testing.skip_ipv6())
|
||||
@pytest.mark.skipif(**tm.no_ipv6())
|
||||
@pytest.mark.skipif(**tm.no_dask())
|
||||
def test_rabit_ops_ipv6():
|
||||
import dask
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import xgboost as xgb
|
||||
import testing as tm
|
||||
import numpy as np
|
||||
import pytest
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
rng = np.random.RandomState(1337)
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
|
||||
from numpy.testing import assert_approx_equal
|
||||
|
||||
import xgboost as xgb
|
||||
|
||||
train_data = xgb.DMatrix(np.array([[1]]), label=np.array([1]))
|
||||
|
||||
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
import json
|
||||
from string import ascii_lowercase
|
||||
from typing import Dict, Any
|
||||
import testing as tm
|
||||
import pytest
|
||||
import xgboost as xgb
|
||||
from typing import Any, Dict
|
||||
|
||||
import numpy as np
|
||||
from hypothesis import given, strategies, settings, note
|
||||
import pytest
|
||||
from hypothesis import given, note, settings, strategies
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
exact_parameter_strategy = strategies.fixed_dictionaries({
|
||||
'nthread': strategies.integers(1, 4),
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
import unittest
|
||||
import pytest
|
||||
import numpy as np
|
||||
import testing as tm
|
||||
import xgboost as xgb
|
||||
import os
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
import pyarrow.csv as pc
|
||||
import pandas as pd
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
@@ -73,7 +75,7 @@ class TestArrowTable(unittest.TestCase):
|
||||
np.testing.assert_allclose(preds1, preds2)
|
||||
|
||||
def test_arrow_survival(self):
|
||||
data = os.path.join(tm.PROJECT_ROOT, "demo", "data", "veterans_lung_cancer.csv")
|
||||
data = os.path.join(tm.data_dir(__file__), "veterans_lung_cancer.csv")
|
||||
table = pc.read_csv(data)
|
||||
y_lower_bound = table["Survival_label_lower_bound"]
|
||||
y_upper_bound = table["Survival_label_upper_bound"]
|
||||
|
||||
@@ -20,7 +20,6 @@ import numpy as np
|
||||
import pytest
|
||||
import scipy
|
||||
import sklearn
|
||||
import testing as tm
|
||||
from hypothesis import HealthCheck, given, note, settings
|
||||
from sklearn.datasets import make_classification, make_regression
|
||||
from test_predict import verify_leaf_output
|
||||
@@ -29,7 +28,7 @@ from test_with_sklearn import run_data_initialization, run_feature_weights
|
||||
from xgboost.data import _is_cudf_df
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing
|
||||
from xgboost import testing as tm
|
||||
|
||||
if sys.platform.startswith("win"):
|
||||
pytest.skip("Skipping dask tests on Windows", allow_module_level=True)
|
||||
@@ -45,7 +44,7 @@ from xgboost.dask import DaskDMatrix
|
||||
|
||||
dask.config.set({"distributed.scheduler.allowed-failures": False})
|
||||
|
||||
pytestmark = testing.timeout(30)
|
||||
pytestmark = tm.timeout(30)
|
||||
|
||||
if hasattr(HealthCheck, 'function_scoped_fixture'):
|
||||
suppress = [HealthCheck.function_scoped_fixture]
|
||||
@@ -1116,8 +1115,9 @@ def test_predict_with_meta(client: "Client") -> None:
|
||||
|
||||
|
||||
def run_aft_survival(client: "Client", dmatrix_t: Type) -> None:
|
||||
df = dd.read_csv(os.path.join(tm.PROJECT_ROOT, 'demo', 'data',
|
||||
'veterans_lung_cancer.csv'))
|
||||
df = dd.read_csv(
|
||||
os.path.join(tm.data_dir(__file__), "veterans_lung_cancer.csv")
|
||||
)
|
||||
y_lower_bound = df['Survival_label_lower_bound']
|
||||
y_upper_bound = df['Survival_label_upper_bound']
|
||||
X = df.drop(['Survival_label_lower_bound',
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import testing as tm
|
||||
import pytest
|
||||
from test_dmatrix import set_base_margin_info
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
try:
|
||||
import modin.pandas as md
|
||||
except ImportError:
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import testing as tm
|
||||
import pytest
|
||||
from test_dmatrix import set_base_margin_info
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
except ImportError:
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
|
||||
try:
|
||||
import shap
|
||||
except ImportError:
|
||||
|
||||
@@ -8,14 +8,13 @@ from typing import Callable, Optional
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import testing as tm
|
||||
from sklearn.utils.estimator_checks import parametrize_with_checks
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing
|
||||
from xgboost import testing as tm
|
||||
|
||||
rng = np.random.RandomState(1994)
|
||||
pytestmark = [pytest.mark.skipif(**tm.no_sklearn()), testing.timeout(30)]
|
||||
pytestmark = [pytest.mark.skipif(**tm.no_sklearn()), tm.timeout(30)]
|
||||
|
||||
|
||||
def test_binary_classification():
|
||||
@@ -155,11 +154,10 @@ def test_ranking():
|
||||
|
||||
|
||||
def test_stacking_regression():
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.datasets import load_diabetes
|
||||
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
|
||||
from sklearn.linear_model import RidgeCV
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.ensemble import StackingRegressor
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
X, y = load_diabetes(return_X_y=True)
|
||||
estimators = [
|
||||
@@ -177,13 +175,13 @@ def test_stacking_regression():
|
||||
|
||||
|
||||
def test_stacking_classification():
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.ensemble import StackingClassifier
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
X, y = load_iris(return_X_y=True)
|
||||
estimators = [
|
||||
@@ -354,8 +352,8 @@ def test_num_parallel_tree():
|
||||
|
||||
|
||||
def test_regression():
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.datasets import fetch_california_housing
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.model_selection import KFold
|
||||
|
||||
X, y = fetch_california_housing(return_X_y=True)
|
||||
@@ -383,8 +381,8 @@ def test_regression():
|
||||
|
||||
|
||||
def run_housing_rf_regression(tree_method):
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.datasets import fetch_california_housing
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.model_selection import KFold
|
||||
|
||||
X, y = fetch_california_housing(return_X_y=True)
|
||||
@@ -407,8 +405,8 @@ def test_rf_regression():
|
||||
|
||||
|
||||
def test_parameter_tuning():
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.datasets import fetch_california_housing
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
|
||||
X, y = fetch_california_housing(return_X_y=True)
|
||||
xgb_model = xgb.XGBRegressor(learning_rate=0.1)
|
||||
@@ -421,8 +419,8 @@ def test_parameter_tuning():
|
||||
|
||||
|
||||
def test_regression_with_custom_objective():
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.datasets import fetch_california_housing
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.model_selection import KFold
|
||||
|
||||
def objective_ls(y_true, y_pred):
|
||||
@@ -539,8 +537,8 @@ def test_sklearn_plotting():
|
||||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
|
||||
from matplotlib.axes import Axes
|
||||
from graphviz import Source
|
||||
from matplotlib.axes import Axes
|
||||
|
||||
ax = xgb.plot_importance(classifier)
|
||||
assert isinstance(ax, Axes)
|
||||
@@ -666,8 +664,8 @@ def test_kwargs_error():
|
||||
|
||||
|
||||
def test_kwargs_grid_search():
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn import datasets
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
|
||||
params = {'tree_method': 'hist'}
|
||||
clf = xgb.XGBClassifier(n_estimators=1, learning_rate=1.0, **params)
|
||||
@@ -841,9 +839,7 @@ def test_save_load_model():
|
||||
|
||||
|
||||
def test_RFECV():
|
||||
from sklearn.datasets import load_diabetes
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris
|
||||
from sklearn.feature_selection import RFECV
|
||||
|
||||
# Regression
|
||||
@@ -1046,8 +1042,9 @@ def run_feature_weights(X, y, fw, tree_method, model=xgb.XGBRegressor):
|
||||
with open(model_path) as fd:
|
||||
model = json.load(fd)
|
||||
|
||||
parser_path = os.path.join(tm.PROJECT_ROOT, 'demo', 'json-model',
|
||||
'json_parser.py')
|
||||
parser_path = os.path.join(
|
||||
tm.demo_dir(__file__), "json-model", "json_parser.py"
|
||||
)
|
||||
spec = importlib.util.spec_from_file_location("JsonParser",
|
||||
parser_path)
|
||||
foo = importlib.util.module_from_spec(spec)
|
||||
@@ -1162,8 +1159,8 @@ def run_boost_from_prediction_multi_clasas(
|
||||
|
||||
@pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
|
||||
def test_boost_from_prediction(tree_method):
|
||||
from sklearn.datasets import load_breast_cancer, load_iris, make_regression
|
||||
import pandas as pd
|
||||
from sklearn.datasets import load_breast_cancer, load_iris, make_regression
|
||||
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
|
||||
|
||||
@@ -1,806 +0,0 @@
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import os
|
||||
import multiprocessing
|
||||
from typing import Tuple, Union, List, Sequence, Callable
|
||||
import urllib
|
||||
import zipfile
|
||||
import sys
|
||||
from typing import Optional, Dict, Any
|
||||
from contextlib import contextmanager
|
||||
from io import StringIO
|
||||
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
|
||||
import pytest
|
||||
import gc
|
||||
import xgboost as xgb
|
||||
from xgboost.core import ArrayLike
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
import platform
|
||||
|
||||
hypothesis = pytest.importorskip('hypothesis')
|
||||
sklearn = pytest.importorskip('sklearn')
|
||||
from hypothesis import strategies
|
||||
from hypothesis.extra.numpy import arrays
|
||||
from joblib import Memory
|
||||
from sklearn import datasets
|
||||
|
||||
try:
|
||||
import cupy as cp
|
||||
except ImportError:
|
||||
cp = None
|
||||
|
||||
memory = Memory('./cachedir', verbose=0)
|
||||
|
||||
|
||||
def no_ubjson():
|
||||
reason = "ubjson is not intsalled."
|
||||
try:
|
||||
import ubjson # noqa
|
||||
return {"condition": False, "reason": reason}
|
||||
except ImportError:
|
||||
return {"condition": True, "reason": reason}
|
||||
|
||||
|
||||
def no_sklearn():
|
||||
return {'condition': not SKLEARN_INSTALLED,
|
||||
'reason': 'Scikit-Learn is not installed'}
|
||||
|
||||
|
||||
def no_dask():
|
||||
try:
|
||||
import pkg_resources
|
||||
|
||||
pkg_resources.get_distribution("dask")
|
||||
DASK_INSTALLED = True
|
||||
except pkg_resources.DistributionNotFound:
|
||||
DASK_INSTALLED = False
|
||||
return {"condition": not DASK_INSTALLED, "reason": "Dask is not installed"}
|
||||
|
||||
|
||||
def no_spark():
|
||||
try:
|
||||
import pyspark # noqa
|
||||
SPARK_INSTALLED = True
|
||||
except ImportError:
|
||||
SPARK_INSTALLED = False
|
||||
return {"condition": not SPARK_INSTALLED, "reason": "Spark is not installed"}
|
||||
|
||||
|
||||
def no_pandas():
|
||||
return {'condition': not PANDAS_INSTALLED,
|
||||
'reason': 'Pandas is not installed.'}
|
||||
|
||||
|
||||
def no_arrow():
|
||||
reason = "pyarrow is not installed"
|
||||
try:
|
||||
import pyarrow # noqa
|
||||
return {"condition": False, "reason": reason}
|
||||
except ImportError:
|
||||
return {"condition": True, "reason": reason}
|
||||
|
||||
|
||||
def no_modin():
|
||||
reason = 'Modin is not installed.'
|
||||
try:
|
||||
import modin.pandas as _ # noqa
|
||||
return {'condition': False, 'reason': reason}
|
||||
except ImportError:
|
||||
return {'condition': True, 'reason': reason}
|
||||
|
||||
|
||||
def no_dt():
|
||||
import importlib.util
|
||||
spec = importlib.util.find_spec('datatable')
|
||||
return {'condition': spec is None,
|
||||
'reason': 'Datatable is not installed.'}
|
||||
|
||||
|
||||
def no_matplotlib():
|
||||
reason = 'Matplotlib is not installed.'
|
||||
try:
|
||||
import matplotlib.pyplot as _ # noqa
|
||||
return {'condition': False,
|
||||
'reason': reason}
|
||||
except ImportError:
|
||||
return {'condition': True,
|
||||
'reason': reason}
|
||||
|
||||
|
||||
def no_dask_cuda():
|
||||
reason = 'dask_cuda is not installed.'
|
||||
try:
|
||||
import dask_cuda as _ # noqa
|
||||
return {'condition': False, 'reason': reason}
|
||||
except ImportError:
|
||||
return {'condition': True, 'reason': reason}
|
||||
|
||||
|
||||
def no_cudf():
|
||||
try:
|
||||
import cudf # noqa
|
||||
CUDF_INSTALLED = True
|
||||
except ImportError:
|
||||
CUDF_INSTALLED = False
|
||||
|
||||
return {'condition': not CUDF_INSTALLED,
|
||||
'reason': 'CUDF is not installed'}
|
||||
|
||||
|
||||
def no_cupy():
|
||||
reason = 'cupy is not installed.'
|
||||
try:
|
||||
import cupy as _ # noqa
|
||||
return {'condition': False, 'reason': reason}
|
||||
except ImportError:
|
||||
return {'condition': True, 'reason': reason}
|
||||
|
||||
|
||||
def no_dask_cudf():
|
||||
reason = 'dask_cudf is not installed.'
|
||||
try:
|
||||
import dask_cudf as _ # noqa
|
||||
return {'condition': False, 'reason': reason}
|
||||
except ImportError:
|
||||
return {'condition': True, 'reason': reason}
|
||||
|
||||
|
||||
def no_json_schema():
|
||||
reason = 'jsonschema is not installed'
|
||||
try:
|
||||
import jsonschema # noqa
|
||||
return {'condition': False, 'reason': reason}
|
||||
except ImportError:
|
||||
return {'condition': True, 'reason': reason}
|
||||
|
||||
|
||||
def no_graphviz():
|
||||
reason = 'graphviz is not installed'
|
||||
try:
|
||||
import graphviz # noqa
|
||||
return {'condition': False, 'reason': reason}
|
||||
except ImportError:
|
||||
return {'condition': True, 'reason': reason}
|
||||
|
||||
|
||||
def no_multiple(*args):
|
||||
condition = False
|
||||
reason = ''
|
||||
for arg in args:
|
||||
condition = (condition or arg['condition'])
|
||||
if arg['condition']:
|
||||
reason = arg['reason']
|
||||
break
|
||||
return {'condition': condition, 'reason': reason}
|
||||
|
||||
|
||||
def skip_s390x():
|
||||
condition = platform.machine() == "s390x"
|
||||
reason = "Known to fail on s390x"
|
||||
return {"condition": condition, "reason": reason}
|
||||
|
||||
|
||||
class IteratorForTest(xgb.core.DataIter):
|
||||
def __init__(
|
||||
self,
|
||||
X: Sequence,
|
||||
y: Sequence,
|
||||
w: Optional[Sequence],
|
||||
cache: Optional[str] = "./"
|
||||
) -> None:
|
||||
assert len(X) == len(y)
|
||||
self.X = X
|
||||
self.y = y
|
||||
self.w = w
|
||||
self.it = 0
|
||||
super().__init__(cache)
|
||||
|
||||
def next(self, input_data: Callable) -> int:
|
||||
if self.it == len(self.X):
|
||||
return 0
|
||||
|
||||
with pytest.raises(TypeError, match="keyword args"):
|
||||
input_data(self.X[self.it], self.y[self.it], None)
|
||||
|
||||
# Use copy to make sure the iterator doesn't hold a reference to the data.
|
||||
input_data(
|
||||
data=self.X[self.it].copy(),
|
||||
label=self.y[self.it].copy(),
|
||||
weight=self.w[self.it].copy() if self.w else None,
|
||||
)
|
||||
gc.collect() # clear up the copy, see if XGBoost access freed memory.
|
||||
self.it += 1
|
||||
return 1
|
||||
|
||||
def reset(self) -> None:
|
||||
self.it = 0
|
||||
|
||||
def as_arrays(
|
||||
self,
|
||||
) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, ArrayLike]:
|
||||
if isinstance(self.X[0], sparse.csr_matrix):
|
||||
X = sparse.vstack(self.X, format="csr")
|
||||
else:
|
||||
X = np.concatenate(self.X, axis=0)
|
||||
y = np.concatenate(self.y, axis=0)
|
||||
if self.w:
|
||||
w = np.concatenate(self.w, axis=0)
|
||||
else:
|
||||
w = None
|
||||
return X, y, w
|
||||
|
||||
|
||||
def make_batches(
|
||||
n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
|
||||
) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
|
||||
X = []
|
||||
y = []
|
||||
w = []
|
||||
if use_cupy:
|
||||
import cupy
|
||||
|
||||
rng = cupy.random.RandomState(1994)
|
||||
else:
|
||||
rng = np.random.RandomState(1994)
|
||||
for i in range(n_batches):
|
||||
_X = rng.randn(n_samples_per_batch, n_features)
|
||||
_y = rng.randn(n_samples_per_batch)
|
||||
_w = rng.uniform(low=0, high=1, size=n_samples_per_batch)
|
||||
X.append(_X)
|
||||
y.append(_y)
|
||||
w.append(_w)
|
||||
return X, y, w
|
||||
|
||||
|
||||
def make_batches_sparse(
|
||||
n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
|
||||
) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]:
|
||||
X = []
|
||||
y = []
|
||||
w = []
|
||||
rng = np.random.RandomState(1994)
|
||||
for i in range(n_batches):
|
||||
_X = sparse.random(
|
||||
n_samples_per_batch,
|
||||
n_features,
|
||||
1.0 - sparsity,
|
||||
format="csr",
|
||||
dtype=np.float32,
|
||||
random_state=rng,
|
||||
)
|
||||
_y = rng.randn(n_samples_per_batch)
|
||||
_w = rng.uniform(low=0, high=1, size=n_samples_per_batch)
|
||||
X.append(_X)
|
||||
y.append(_y)
|
||||
w.append(_w)
|
||||
return X, y, w
|
||||
|
||||
|
||||
# Contains a dataset in numpy format as well as the relevant objective and metric
|
||||
class TestDataset:
|
||||
def __init__(
|
||||
self, name: str, get_dataset: Callable, objective: str, metric: str
|
||||
) -> None:
|
||||
self.name = name
|
||||
self.objective = objective
|
||||
self.metric = metric
|
||||
self.X, self.y = get_dataset()
|
||||
self.w: Optional[np.ndarray] = None
|
||||
self.margin: Optional[np.ndarray] = None
|
||||
|
||||
def set_params(self, params_in: Dict[str, Any]) -> Dict[str, Any]:
|
||||
params_in['objective'] = self.objective
|
||||
params_in['eval_metric'] = self.metric
|
||||
if self.objective == "multi:softmax":
|
||||
params_in["num_class"] = int(np.max(self.y) + 1)
|
||||
return params_in
|
||||
|
||||
def get_dmat(self) -> xgb.DMatrix:
|
||||
return xgb.DMatrix(
|
||||
self.X, self.y, self.w, base_margin=self.margin, enable_categorical=True
|
||||
)
|
||||
|
||||
def get_device_dmat(self) -> xgb.DeviceQuantileDMatrix:
|
||||
w = None if self.w is None else cp.array(self.w)
|
||||
X = cp.array(self.X, dtype=np.float32)
|
||||
y = cp.array(self.y, dtype=np.float32)
|
||||
return xgb.DeviceQuantileDMatrix(X, y, w, base_margin=self.margin)
|
||||
|
||||
def get_external_dmat(self) -> xgb.DMatrix:
|
||||
n_samples = self.X.shape[0]
|
||||
n_batches = 10
|
||||
per_batch = n_samples // n_batches + 1
|
||||
|
||||
predictor = []
|
||||
response = []
|
||||
weight = []
|
||||
for i in range(n_batches):
|
||||
beg = i * per_batch
|
||||
end = min((i + 1) * per_batch, n_samples)
|
||||
assert end != beg
|
||||
X = self.X[beg: end, ...]
|
||||
y = self.y[beg: end]
|
||||
w = self.w[beg: end] if self.w is not None else None
|
||||
predictor.append(X)
|
||||
response.append(y)
|
||||
if w is not None:
|
||||
weight.append(w)
|
||||
|
||||
it = IteratorForTest(predictor, response, weight if weight else None)
|
||||
return xgb.DMatrix(it)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return self.name
|
||||
|
||||
|
||||
@memory.cache
|
||||
def get_california_housing():
|
||||
data = datasets.fetch_california_housing()
|
||||
return data.data, data.target
|
||||
|
||||
|
||||
@memory.cache
|
||||
def get_digits():
|
||||
data = datasets.load_digits()
|
||||
return data.data, data.target
|
||||
|
||||
|
||||
@memory.cache
|
||||
def get_cancer():
|
||||
data = datasets.load_breast_cancer()
|
||||
return data.data, data.target
|
||||
|
||||
|
||||
@memory.cache
|
||||
def get_sparse():
|
||||
rng = np.random.RandomState(199)
|
||||
n = 2000
|
||||
sparsity = 0.75
|
||||
X, y = datasets.make_regression(n, random_state=rng)
|
||||
flag = rng.binomial(1, sparsity, X.shape)
|
||||
for i in range(X.shape[0]):
|
||||
for j in range(X.shape[1]):
|
||||
if flag[i, j]:
|
||||
X[i, j] = np.nan
|
||||
return X, y
|
||||
|
||||
|
||||
@memory.cache
|
||||
def get_ames_housing():
|
||||
"""
|
||||
Number of samples: 1460
|
||||
Number of features: 20
|
||||
Number of categorical features: 10
|
||||
Number of numerical features: 10
|
||||
"""
|
||||
from sklearn.datasets import fetch_openml
|
||||
X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
|
||||
|
||||
categorical_columns_subset: list[str] = [
|
||||
"BldgType", # 5 cats, no nan
|
||||
"GarageFinish", # 3 cats, nan
|
||||
"LotConfig", # 5 cats, no nan
|
||||
"Functional", # 7 cats, no nan
|
||||
"MasVnrType", # 4 cats, nan
|
||||
"HouseStyle", # 8 cats, no nan
|
||||
"FireplaceQu", # 5 cats, nan
|
||||
"ExterCond", # 5 cats, no nan
|
||||
"ExterQual", # 4 cats, no nan
|
||||
"PoolQC", # 3 cats, nan
|
||||
]
|
||||
|
||||
numerical_columns_subset: list[str] = [
|
||||
"3SsnPorch",
|
||||
"Fireplaces",
|
||||
"BsmtHalfBath",
|
||||
"HalfBath",
|
||||
"GarageCars",
|
||||
"TotRmsAbvGrd",
|
||||
"BsmtFinSF1",
|
||||
"BsmtFinSF2",
|
||||
"GrLivArea",
|
||||
"ScreenPorch",
|
||||
]
|
||||
|
||||
X = X[categorical_columns_subset + numerical_columns_subset]
|
||||
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
|
||||
return X, y
|
||||
|
||||
|
||||
@memory.cache
|
||||
def get_mq2008(dpath):
|
||||
from sklearn.datasets import load_svmlight_files
|
||||
|
||||
src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
|
||||
target = dpath + '/MQ2008.zip'
|
||||
if not os.path.exists(target):
|
||||
urllib.request.urlretrieve(url=src, filename=target)
|
||||
|
||||
with zipfile.ZipFile(target, 'r') as f:
|
||||
f.extractall(path=dpath)
|
||||
|
||||
(x_train, y_train, qid_train, x_test, y_test, qid_test,
|
||||
x_valid, y_valid, qid_valid) = load_svmlight_files(
|
||||
(dpath + "MQ2008/Fold1/train.txt",
|
||||
dpath + "MQ2008/Fold1/test.txt",
|
||||
dpath + "MQ2008/Fold1/vali.txt"),
|
||||
query_id=True, zero_based=False)
|
||||
|
||||
return (x_train, y_train, qid_train, x_test, y_test, qid_test,
|
||||
x_valid, y_valid, qid_valid)
|
||||
|
||||
|
||||
@memory.cache
|
||||
def make_categorical(
|
||||
n_samples: int, n_features: int, n_categories: int, onehot: bool, sparsity=0.0,
|
||||
):
|
||||
import pandas as pd
|
||||
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
pd_dict = {}
|
||||
for i in range(n_features + 1):
|
||||
c = rng.randint(low=0, high=n_categories, size=n_samples)
|
||||
pd_dict[str(i)] = pd.Series(c, dtype=np.int64)
|
||||
|
||||
df = pd.DataFrame(pd_dict)
|
||||
label = df.iloc[:, 0]
|
||||
df = df.iloc[:, 1:]
|
||||
for i in range(0, n_features):
|
||||
label += df.iloc[:, i]
|
||||
label += 1
|
||||
|
||||
df = df.astype("category")
|
||||
categories = np.arange(0, n_categories)
|
||||
for col in df.columns:
|
||||
df[col] = df[col].cat.set_categories(categories)
|
||||
|
||||
if sparsity > 0.0:
|
||||
for i in range(n_features):
|
||||
index = rng.randint(low=0, high=n_samples-1, size=int(n_samples * sparsity))
|
||||
df.iloc[index, i] = np.NaN
|
||||
assert n_categories == np.unique(df.dtypes[i].categories).size
|
||||
|
||||
if onehot:
|
||||
return pd.get_dummies(df), label
|
||||
return df, label
|
||||
|
||||
|
||||
def _cat_sampled_from():
|
||||
@strategies.composite
|
||||
def _make_cat(draw):
|
||||
n_samples = draw(strategies.integers(2, 512))
|
||||
n_features = draw(strategies.integers(1, 4))
|
||||
n_cats = draw(strategies.integers(1, 128))
|
||||
sparsity = draw(
|
||||
strategies.floats(
|
||||
min_value=0,
|
||||
max_value=1,
|
||||
allow_nan=False,
|
||||
allow_infinity=False,
|
||||
allow_subnormal=False,
|
||||
)
|
||||
)
|
||||
return n_samples, n_features, n_cats, sparsity
|
||||
|
||||
def _build(args):
|
||||
n_samples = args[0]
|
||||
n_features = args[1]
|
||||
n_cats = args[2]
|
||||
sparsity = args[3]
|
||||
return TestDataset(
|
||||
f"{n_samples}x{n_features}-{n_cats}-{sparsity}",
|
||||
lambda: make_categorical(n_samples, n_features, n_cats, False, sparsity),
|
||||
"reg:squarederror",
|
||||
"rmse",
|
||||
)
|
||||
|
||||
return _make_cat().map(_build)
|
||||
|
||||
|
||||
categorical_dataset_strategy = _cat_sampled_from()
|
||||
|
||||
|
||||
@memory.cache
|
||||
def make_sparse_regression(
|
||||
n_samples: int, n_features: int, sparsity: float, as_dense: bool
|
||||
) -> Tuple[Union[sparse.csr_matrix], np.ndarray]:
|
||||
"""Make sparse matrix.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
as_dense:
|
||||
|
||||
Return the matrix as np.ndarray with missing values filled by NaN
|
||||
|
||||
"""
|
||||
if not hasattr(np.random, "default_rng"):
|
||||
# old version of numpy on s390x
|
||||
rng = np.random.RandomState(1994)
|
||||
X = sparse.random(
|
||||
m=n_samples,
|
||||
n=n_features,
|
||||
density=1.0 - sparsity,
|
||||
random_state=rng,
|
||||
format="csr",
|
||||
)
|
||||
y = rng.normal(loc=0.0, scale=1.0, size=n_samples)
|
||||
return X, y
|
||||
|
||||
# Use multi-thread to speed up the generation, convenient if you use this function
|
||||
# for benchmarking.
|
||||
n_threads = multiprocessing.cpu_count()
|
||||
n_threads = min(n_threads, n_features)
|
||||
|
||||
def random_csc(t_id: int) -> sparse.csc_matrix:
|
||||
rng = np.random.default_rng(1994 * t_id)
|
||||
thread_size = n_features // n_threads
|
||||
if t_id == n_threads - 1:
|
||||
n_features_tloc = n_features - t_id * thread_size
|
||||
else:
|
||||
n_features_tloc = thread_size
|
||||
|
||||
X = sparse.random(
|
||||
m=n_samples,
|
||||
n=n_features_tloc,
|
||||
density=1.0 - sparsity,
|
||||
random_state=rng,
|
||||
).tocsc()
|
||||
y = np.zeros((n_samples, 1))
|
||||
|
||||
for i in range(X.shape[1]):
|
||||
size = X.indptr[i + 1] - X.indptr[i]
|
||||
if size != 0:
|
||||
y += X[:, i].toarray() * rng.random((n_samples, 1)) * 0.2
|
||||
|
||||
return X, y
|
||||
|
||||
futures = []
|
||||
with ThreadPoolExecutor(max_workers=n_threads) as executor:
|
||||
for i in range(n_threads):
|
||||
futures.append(executor.submit(random_csc, i))
|
||||
|
||||
X_results = []
|
||||
y_results = []
|
||||
for f in futures:
|
||||
X, y = f.result()
|
||||
X_results.append(X)
|
||||
y_results.append(y)
|
||||
|
||||
assert len(y_results) == n_threads
|
||||
|
||||
csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
|
||||
y = np.asarray(y_results)
|
||||
y = y.reshape((y.shape[0], y.shape[1])).T
|
||||
y = np.sum(y, axis=1)
|
||||
|
||||
assert csr.shape[0] == n_samples
|
||||
assert csr.shape[1] == n_features
|
||||
assert y.shape[0] == n_samples
|
||||
|
||||
if as_dense:
|
||||
arr = csr.toarray()
|
||||
assert arr.shape[0] == n_samples
|
||||
assert arr.shape[1] == n_features
|
||||
arr[arr == 0] = np.nan
|
||||
return arr, y
|
||||
|
||||
return csr, y
|
||||
|
||||
|
||||
sparse_datasets_strategy = strategies.sampled_from(
|
||||
[
|
||||
TestDataset(
|
||||
"1e5x8-0.95-csr",
|
||||
lambda: make_sparse_regression(int(1e5), 8, 0.95, False),
|
||||
"reg:squarederror",
|
||||
"rmse",
|
||||
),
|
||||
TestDataset(
|
||||
"1e5x8-0.5-csr",
|
||||
lambda: make_sparse_regression(int(1e5), 8, 0.5, False),
|
||||
"reg:squarederror",
|
||||
"rmse",
|
||||
),
|
||||
TestDataset(
|
||||
"1e5x8-0.5-dense",
|
||||
lambda: make_sparse_regression(int(1e5), 8, 0.5, True),
|
||||
"reg:squarederror",
|
||||
"rmse",
|
||||
),
|
||||
TestDataset(
|
||||
"1e5x8-0.05-csr",
|
||||
lambda: make_sparse_regression(int(1e5), 8, 0.05, False),
|
||||
"reg:squarederror",
|
||||
"rmse",
|
||||
),
|
||||
TestDataset(
|
||||
"1e5x8-0.05-dense",
|
||||
lambda: make_sparse_regression(int(1e5), 8, 0.05, True),
|
||||
"reg:squarederror",
|
||||
"rmse",
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
_unweighted_datasets_strategy = strategies.sampled_from(
|
||||
[
|
||||
TestDataset(
|
||||
"calif_housing", get_california_housing, "reg:squarederror", "rmse"
|
||||
),
|
||||
TestDataset(
|
||||
"calif_housing-l1", get_california_housing, "reg:absoluteerror", "mae"
|
||||
),
|
||||
TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
|
||||
TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
|
||||
TestDataset(
|
||||
"mtreg",
|
||||
lambda: datasets.make_regression(n_samples=128, n_targets=3),
|
||||
"reg:squarederror",
|
||||
"rmse",
|
||||
),
|
||||
TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"),
|
||||
TestDataset("sparse-l1", get_sparse, "reg:absoluteerror", "mae"),
|
||||
TestDataset(
|
||||
"empty",
|
||||
lambda: (np.empty((0, 100)), np.empty(0)),
|
||||
"reg:squarederror",
|
||||
"rmse",
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@strategies.composite
|
||||
def _dataset_weight_margin(draw):
|
||||
data: TestDataset = draw(_unweighted_datasets_strategy)
|
||||
if draw(strategies.booleans()):
|
||||
data.w = draw(
|
||||
arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))
|
||||
)
|
||||
if draw(strategies.booleans()):
|
||||
num_class = 1
|
||||
if data.objective == "multi:softmax":
|
||||
num_class = int(np.max(data.y) + 1)
|
||||
elif data.name == "mtreg":
|
||||
num_class = data.y.shape[1]
|
||||
|
||||
data.margin = draw(
|
||||
arrays(
|
||||
np.float64,
|
||||
(data.y.shape[0] * num_class),
|
||||
elements=strategies.floats(0.5, 1.0),
|
||||
)
|
||||
)
|
||||
if num_class != 1:
|
||||
data.margin = data.margin.reshape(data.y.shape[0], num_class)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
# A strategy for drawing from a set of example datasets
|
||||
# May add random weights to the dataset
|
||||
dataset_strategy = _dataset_weight_margin()
|
||||
|
||||
|
||||
def non_increasing(L, tolerance=1e-4):
|
||||
return all((y - x) < tolerance for x, y in zip(L, L[1:]))
|
||||
|
||||
|
||||
def eval_error_metric(predt, dtrain: xgb.DMatrix):
|
||||
"""Evaluation metric for xgb.train"""
|
||||
label = dtrain.get_label()
|
||||
r = np.zeros(predt.shape)
|
||||
gt = predt > 0.5
|
||||
if predt.size == 0:
|
||||
return "CustomErr", 0
|
||||
r[gt] = 1 - label[gt]
|
||||
le = predt <= 0.5
|
||||
r[le] = label[le]
|
||||
return 'CustomErr', np.sum(r)
|
||||
|
||||
|
||||
def eval_error_metric_skl(y_true: np.ndarray, y_score: np.ndarray) -> float:
|
||||
"""Evaluation metric that looks like metrics provided by sklearn."""
|
||||
r = np.zeros(y_score.shape)
|
||||
gt = y_score > 0.5
|
||||
r[gt] = 1 - y_true[gt]
|
||||
le = y_score <= 0.5
|
||||
r[le] = y_true[le]
|
||||
return np.sum(r)
|
||||
|
||||
|
||||
def root_mean_square(y_true: np.ndarray, y_score: np.ndarray) -> float:
|
||||
err = y_score - y_true
|
||||
rmse = np.sqrt(np.dot(err, err) / y_score.size)
|
||||
return rmse
|
||||
|
||||
|
||||
def softmax(x):
|
||||
e = np.exp(x)
|
||||
return e / np.sum(e)
|
||||
|
||||
|
||||
def softprob_obj(classes):
|
||||
def objective(labels, predt):
|
||||
rows = labels.shape[0]
|
||||
grad = np.zeros((rows, classes), dtype=float)
|
||||
hess = np.zeros((rows, classes), dtype=float)
|
||||
eps = 1e-6
|
||||
for r in range(predt.shape[0]):
|
||||
target = labels[r]
|
||||
p = softmax(predt[r, :])
|
||||
for c in range(predt.shape[1]):
|
||||
assert target >= 0 or target <= classes
|
||||
g = p[c] - 1.0 if c == target else p[c]
|
||||
h = max((2.0 * p[c] * (1.0 - p[c])).item(), eps)
|
||||
grad[r, c] = g
|
||||
hess[r, c] = h
|
||||
|
||||
grad = grad.reshape((rows * classes, 1))
|
||||
hess = hess.reshape((rows * classes, 1))
|
||||
return grad, hess
|
||||
|
||||
return objective
|
||||
|
||||
|
||||
class DirectoryExcursion:
|
||||
def __init__(self, path: os.PathLike, cleanup=False):
|
||||
'''Change directory. Change back and optionally cleaning up the directory when exit.
|
||||
|
||||
'''
|
||||
self.path = path
|
||||
self.curdir = os.path.normpath(os.path.abspath(os.path.curdir))
|
||||
self.cleanup = cleanup
|
||||
self.files = {}
|
||||
|
||||
def __enter__(self):
|
||||
os.chdir(self.path)
|
||||
if self.cleanup:
|
||||
self.files = {
|
||||
os.path.join(root, f)
|
||||
for root, subdir, files in os.walk(self.path) for f in files
|
||||
}
|
||||
|
||||
def __exit__(self, *args):
|
||||
os.chdir(self.curdir)
|
||||
if self.cleanup:
|
||||
files = {
|
||||
os.path.join(root, f)
|
||||
for root, subdir, files in os.walk(self.path) for f in files
|
||||
}
|
||||
diff = files.difference(self.files)
|
||||
for f in diff:
|
||||
os.remove(f)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def captured_output():
|
||||
"""Reassign stdout temporarily in order to test printed statements
|
||||
Taken from:
|
||||
https://stackoverflow.com/questions/4219717/how-to-assert-output-with-nosetest-unittest-in-python
|
||||
|
||||
Also works for pytest.
|
||||
|
||||
"""
|
||||
new_out, new_err = StringIO(), StringIO()
|
||||
old_out, old_err = sys.stdout, sys.stderr
|
||||
try:
|
||||
sys.stdout, sys.stderr = new_out, new_err
|
||||
yield sys.stdout, sys.stderr
|
||||
finally:
|
||||
sys.stdout, sys.stderr = old_out, old_err
|
||||
|
||||
|
||||
try:
|
||||
# Python 3.7+
|
||||
from contextlib import nullcontext as noop_context
|
||||
except ImportError:
|
||||
# Python 3.6
|
||||
from contextlib import suppress as noop_context
|
||||
|
||||
|
||||
CURDIR = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
|
||||
PROJECT_ROOT = os.path.normpath(
|
||||
os.path.join(CURDIR, os.path.pardir, os.path.pardir))
|
||||
@@ -1,7 +1,9 @@
|
||||
import xgboost as xgb
|
||||
import sys
|
||||
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.metrics import roc_auc_score
|
||||
import sys
|
||||
|
||||
import xgboost as xgb
|
||||
|
||||
|
||||
def run_omp(output_path: str):
|
||||
|
||||
Reference in New Issue
Block a user