Save model in ubj as the default. (#9947)

This commit is contained in:
Jiaming Yuan 2024-01-05 17:53:36 +08:00 committed by GitHub
parent c03a4d5088
commit 38dd91f491
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
23 changed files with 598 additions and 550 deletions

View File

@ -30,9 +30,6 @@ import org.apache.spark.ml.param.Params
import org.apache.spark.ml.util.DefaultParamsReader.Metadata
abstract class XGBoostWriter extends MLWriter {
/**
 * Resolves the format used when the model is written.
 *
 * @return the value of the "format" writer option when one has been set,
 *         otherwise `JBooster.DEFAULT_FORMAT`
 */
def getModelFormat(): String =
  optionMap.getOrElse("format", JBooster.DEFAULT_FORMAT)

View File

@ -1,5 +1,5 @@
/*
Copyright (c) 2014-2022 by Contributors
Copyright (c) 2014-2024 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -432,6 +432,7 @@ class XGBoostClassifierSuite extends AnyFunSuite with PerTest with TmpFolderPerS
val xgb = new XGBoostClassifier(paramMap)
val model = xgb.fit(trainingDF)
// test json
val modelPath = new File(tempDir.toFile, "xgbc").getPath
model.write.option("format", "json").save(modelPath)
val nativeJsonModelPath = new File(tempDir.toFile, "nativeModel.json").getPath
@ -439,21 +440,21 @@ class XGBoostClassifierSuite extends AnyFunSuite with PerTest with TmpFolderPerS
assert(compareTwoFiles(new File(modelPath, "data/XGBoostClassificationModel").getPath,
nativeJsonModelPath))
// test default "deprecated"
// test ubj
val modelUbjPath = new File(tempDir.toFile, "xgbcUbj").getPath
model.write.save(modelUbjPath)
val nativeDeprecatedModelPath = new File(tempDir.toFile, "nativeModel").getPath
model.nativeBooster.saveModel(nativeDeprecatedModelPath)
val nativeUbjModelPath = new File(tempDir.toFile, "nativeModel.ubj").getPath
model.nativeBooster.saveModel(nativeUbjModelPath)
assert(compareTwoFiles(new File(modelUbjPath, "data/XGBoostClassificationModel").getPath,
nativeDeprecatedModelPath))
nativeUbjModelPath))
// the json file should be different from the ubj file
val modelJsonPath = new File(tempDir.toFile, "xgbcJson").getPath
model.write.option("format", "json").save(modelJsonPath)
val nativeUbjModelPath = new File(tempDir.toFile, "nativeModel1.ubj").getPath
model.nativeBooster.saveModel(nativeUbjModelPath)
val nativeUbjModelPath1 = new File(tempDir.toFile, "nativeModel1.ubj").getPath
model.nativeBooster.saveModel(nativeUbjModelPath1)
assert(!compareTwoFiles(new File(modelJsonPath, "data/XGBoostClassificationModel").getPath,
nativeUbjModelPath))
nativeUbjModelPath1))
}
test("native json model file should store feature_name and feature_type") {

View File

@ -1,5 +1,5 @@
/*
Copyright (c) 2014-2022 by Contributors
Copyright (c) 2014-2024 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -333,21 +333,24 @@ class XGBoostRegressorSuite extends AnyFunSuite with PerTest with TmpFolderPerSu
assert(compareTwoFiles(new File(modelPath, "data/XGBoostRegressionModel").getPath,
nativeJsonModelPath))
// test default "deprecated"
// test default "ubj"
val modelUbjPath = new File(tempDir.toFile, "xgbcUbj").getPath
model.write.save(modelUbjPath)
val nativeDeprecatedModelPath = new File(tempDir.toFile, "nativeModel").getPath
model.nativeBooster.saveModel(nativeDeprecatedModelPath)
assert(compareTwoFiles(new File(modelUbjPath, "data/XGBoostRegressionModel").getPath,
nativeDeprecatedModelPath))
// the json file should be different from the ubj file
val modelJsonPath = new File(tempDir.toFile, "xgbcJson").getPath
model.write.option("format", "json").save(modelJsonPath)
val nativeUbjModelPath = new File(tempDir.toFile, "nativeModel1.ubj").getPath
val nativeUbjModelPath = new File(tempDir.toFile, "nativeModel.ubj").getPath
model.nativeBooster.saveModel(nativeUbjModelPath)
assert(!compareTwoFiles(new File(modelJsonPath, "data/XGBoostRegressionModel").getPath,
nativeUbjModelPath))
}
assert(compareTwoFiles(new File(modelUbjPath, "data/XGBoostRegressionModel").getPath,
nativeUbjModelPath))
// test the deprecated format
val modelDeprecatedPath = new File(tempDir.toFile, "modelDeprecated").getPath
model.write.option("format", "deprecated").save(modelDeprecatedPath)
val nativeDeprecatedModelPath = new File(tempDir.toFile, "nativeModel.deprecated").getPath
model.nativeBooster.saveModel(nativeDeprecatedModelPath)
assert(compareTwoFiles(new File(modelDeprecatedPath, "data/XGBoostRegressionModel").getPath,
nativeDeprecatedModelPath))
}
}

View File

@ -34,7 +34,7 @@ import org.apache.commons.logging.LogFactory;
* Booster for xgboost, this is a model API that support interactive build of a XGBoost Model
*/
public class Booster implements Serializable, KryoSerializable {
public static final String DEFAULT_FORMAT = "deprecated";
public static final String DEFAULT_FORMAT = "ubj";
private static final Log logger = LogFactory.getLog(Booster.class);
// handle to the booster.
private long handle = 0;
@ -788,8 +788,7 @@ public class Booster implements Serializable, KryoSerializable {
}
/**
* Save model into raw byte array. Currently it's using the deprecated format as
* default, which will be changed into `ubj` in future releases.
* Save model into raw byte array in the UBJSON ("ubj") format.
*
* @return the saved byte array
* @throws XGBoostError native error

View File

@ -337,8 +337,7 @@ class Booster private[xgboost4j](private[xgboost4j] var booster: JBooster)
}
/**
* Save model into a raw byte array. Currently it's using the deprecated format as
* default, which will be changed into `ubj` in future releases.
* Save model into a raw byte array in the UBJSON ("ubj") format.
*/
@throws(classOf[XGBoostError])
def toByteArray: Array[Byte] = {

View File

@ -2613,7 +2613,7 @@ class Booster:
else:
raise TypeError("fname must be a string or os PathLike")
def save_raw(self, raw_format: str = "deprecated") -> bytearray:
def save_raw(self, raw_format: str = "ubj") -> bytearray:
"""Save the model to a in memory buffer representation instead of file.
Parameters

View File

@ -630,7 +630,7 @@ sparse_datasets_strategy = strategies.sampled_from(
def make_datasets_with_margin(
unweighted_strategy: strategies.SearchStrategy,
) -> Callable:
) -> Callable[[], strategies.SearchStrategy[TestDataset]]:
"""Factory function for creating strategies that generates datasets with weight and
base margin.
@ -668,8 +668,7 @@ def make_datasets_with_margin(
# A strategy for drawing from a set of example datasets. May add random weights to the
# dataset
@memory.cache
def make_dataset_strategy() -> Callable:
def make_dataset_strategy() -> strategies.SearchStrategy[TestDataset]:
_unweighted_datasets_strategy = strategies.sampled_from(
[
TestDataset(

View File

@ -1313,10 +1313,8 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) {
namespace {
void WarnOldModel() {
if (XGBOOST_VER_MAJOR >= 2) {
LOG(WARNING) << "Saving into deprecated binary model format, please consider using `json` or "
"`ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.";
}
LOG(WARNING) << "Saving into deprecated binary model format, please consider using `json` or "
"`ubj`. Model format is default to UBJSON in XGBoost 2.1 if not specified.";
}
} // anonymous namespace
@ -1339,14 +1337,14 @@ XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, const char *fname) {
save_json(std::ios::out);
} else if (common::FileExtension(fname) == "ubj") {
save_json(std::ios::binary);
} else if (XGBOOST_VER_MAJOR == 2 && XGBOOST_VER_MINOR >= 2) {
LOG(WARNING) << "Saving model to JSON as default. You can use file extension `json`, `ubj` or "
"`deprecated` to choose between formats.";
save_json(std::ios::out);
} else {
} else if (common::FileExtension(fname) == "deprecated") {
WarnOldModel();
auto *bst = static_cast<Learner *>(handle);
bst->SaveModel(fo.get());
} else {
LOG(WARNING) << "Saving model in the UBJSON format as default. You can use file extension:"
" `json`, `ubj` or `deprecated` to choose between formats.";
save_json(std::ios::binary);
}
API_END();
}

View File

@ -27,6 +27,7 @@ class LintersPaths:
"tests/python/test_quantile_dmatrix.py",
"tests/python/test_tree_regularization.py",
"tests/python/test_shap.py",
"tests/python/test_model_io.py",
"tests/python/test_with_pandas.py",
"tests/python-gpu/",
"tests/python-sycl/",
@ -83,6 +84,7 @@ class LintersPaths:
"tests/python/test_multi_target.py",
"tests/python-gpu/test_gpu_data_iterator.py",
"tests/python-gpu/load_pickle.py",
"tests/python/test_model_io.py",
"tests/test_distributed/test_with_spark/test_data.py",
"tests/test_distributed/test_gpu_with_spark/test_data.py",
"tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py",

View File

@ -10,46 +10,48 @@ import pytest
import xgboost as xgb
from xgboost import testing as tm
dpath = 'demo/data/'
dpath = "demo/data/"
rng = np.random.RandomState(1994)
class TestBasic:
def test_compat(self):
from xgboost.compat import lazy_isinstance
a = np.array([1, 2, 3])
assert lazy_isinstance(a, 'numpy', 'ndarray')
assert not lazy_isinstance(a, 'numpy', 'dataframe')
assert lazy_isinstance(a, "numpy", "ndarray")
assert not lazy_isinstance(a, "numpy", "dataframe")
def test_basic(self):
dtrain, dtest = tm.load_agaricus(__file__)
param = {'max_depth': 2, 'eta': 1,
'objective': 'binary:logistic'}
param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
# specify validations set to watch performance
watchlist = [(dtrain, 'train')]
watchlist = [(dtrain, "train")]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist, verbose_eval=True)
bst = xgb.train(param, dtrain, num_round, evals=watchlist, verbose_eval=True)
preds = bst.predict(dtrain)
labels = dtrain.get_label()
err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
err = sum(
1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]
) / float(len(preds))
# error must be smaller than 10%
assert err < 0.1
preds = bst.predict(dtest)
labels = dtest.get_label()
err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
err = sum(
1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]
) / float(len(preds))
# error must be smaller than 10%
assert err < 0.1
with tempfile.TemporaryDirectory() as tmpdir:
dtest_path = os.path.join(tmpdir, 'dtest.dmatrix')
dtest_path = os.path.join(tmpdir, "dtest.dmatrix")
# save dmatrix into binary buffer
dtest.save_binary(dtest_path)
# save model
model_path = os.path.join(tmpdir, 'model.booster')
model_path = os.path.join(tmpdir, "model.ubj")
bst.save_model(model_path)
# load model and data in
bst2 = xgb.Booster(model_file=model_path)
@ -59,17 +61,21 @@ class TestBasic:
assert np.sum(np.abs(preds2 - preds)) == 0
def test_metric_config(self):
# Make sure that the metric configuration happens in booster so the
# string `['error', 'auc']` doesn't get passed down to core.
# Make sure that the metric configuration happens in booster so the string
# `['error', 'auc']` doesn't get passed down to core.
dtrain, dtest = tm.load_agaricus(__file__)
param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
'objective': 'binary:logistic', 'eval_metric': ['error', 'auc']}
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
param = {
"max_depth": 2,
"eta": 1,
"objective": "binary:logistic",
"eval_metric": ["error", "auc"],
}
watchlist = [(dtest, "eval"), (dtrain, "train")]
num_round = 2
booster = xgb.train(param, dtrain, num_round, watchlist)
booster = xgb.train(param, dtrain, num_round, evals=watchlist)
predt_0 = booster.predict(dtrain)
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, 'model.json')
path = os.path.join(tmpdir, "model.json")
booster.save_model(path)
booster = xgb.Booster(params=param, model_file=path)
@ -78,22 +84,23 @@ class TestBasic:
def test_multiclass(self):
dtrain, dtest = tm.load_agaricus(__file__)
param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'num_class': 2}
param = {"max_depth": 2, "eta": 1, "num_class": 2}
# specify validations set to watch performance
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
watchlist = [(dtest, "eval"), (dtrain, "train")]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)
bst = xgb.train(param, dtrain, num_round, evals=watchlist)
# this is prediction
preds = bst.predict(dtest)
labels = dtest.get_label()
err = sum(1 for i in range(len(preds))
if preds[i] != labels[i]) / float(len(preds))
err = sum(1 for i in range(len(preds)) if preds[i] != labels[i]) / float(
len(preds)
)
# error must be smaller than 10%
assert err < 0.1
with tempfile.TemporaryDirectory() as tmpdir:
dtest_path = os.path.join(tmpdir, 'dtest.buffer')
model_path = os.path.join(tmpdir, 'xgb.model')
dtest_path = os.path.join(tmpdir, "dtest.buffer")
model_path = os.path.join(tmpdir, "model.ubj")
# save dmatrix into binary buffer
dtest.save_binary(dtest_path)
# save model
@ -108,33 +115,39 @@ class TestBasic:
def test_dump(self):
data = np.random.randn(100, 2)
target = np.array([0, 1] * 50)
features = ['Feature1', 'Feature2']
features = ["Feature1", "Feature2"]
dm = xgb.DMatrix(data, label=target, feature_names=features)
params = {'objective': 'binary:logistic',
'eval_metric': 'logloss',
'eta': 0.3,
'max_depth': 1}
params = {
"objective": "binary:logistic",
"eval_metric": "logloss",
"eta": 0.3,
"max_depth": 1,
}
bst = xgb.train(params, dm, num_boost_round=1)
# number of feature importances should == number of features
dump1 = bst.get_dump()
assert len(dump1) == 1, 'Expected only 1 tree to be dumped.'
len(dump1[0].splitlines()) == 3, 'Expected 1 root and 2 leaves - 3 lines in dump.'
assert len(dump1) == 1, "Expected only 1 tree to be dumped."
len(
dump1[0].splitlines()
) == 3, "Expected 1 root and 2 leaves - 3 lines in dump."
dump2 = bst.get_dump(with_stats=True)
assert dump2[0].count('\n') == 3, 'Expected 1 root and 2 leaves - 3 lines in dump.'
msg = 'Expected more info when with_stats=True is given.'
assert dump2[0].find('\n') > dump1[0].find('\n'), msg
assert (
dump2[0].count("\n") == 3
), "Expected 1 root and 2 leaves - 3 lines in dump."
msg = "Expected more info when with_stats=True is given."
assert dump2[0].find("\n") > dump1[0].find("\n"), msg
dump3 = bst.get_dump(dump_format="json")
dump3j = json.loads(dump3[0])
assert dump3j['nodeid'] == 0, 'Expected the root node on top.'
assert dump3j["nodeid"] == 0, "Expected the root node on top."
dump4 = bst.get_dump(dump_format="json", with_stats=True)
dump4j = json.loads(dump4[0])
assert 'gain' in dump4j, "Expected 'gain' to be dumped in JSON."
assert "gain" in dump4j, "Expected 'gain' to be dumped in JSON."
with pytest.raises(ValueError):
bst.get_dump(fmap="foo")
@ -163,12 +176,14 @@ class TestBasic:
def test_load_file_invalid(self):
with pytest.raises(xgb.core.XGBoostError):
xgb.Booster(model_file='incorrect_path')
xgb.Booster(model_file="incorrect_path")
with pytest.raises(xgb.core.XGBoostError):
xgb.Booster(model_file=u'不正なパス')
xgb.Booster(model_file="不正なパス")
@pytest.mark.parametrize("path", ["모델.ubj", "がうる・ぐら.json"], ids=["path-0", "path-1"])
@pytest.mark.parametrize(
"path", ["모델.ubj", "がうる・ぐら.json"], ids=["path-0", "path-1"]
)
def test_unicode_path(self, tmpdir, path):
model_path = pathlib.Path(tmpdir) / path
dtrain, _ = tm.load_agaricus(__file__)
@ -180,12 +195,11 @@ class TestBasic:
assert bst.get_dump(dump_format="text") == bst2.get_dump(dump_format="text")
def test_dmatrix_numpy_init_omp(self):
rows = [1000, 11326, 15000]
cols = 50
for row in rows:
X = np.random.randn(row, cols)
y = np.random.randn(row).astype('f')
y = np.random.randn(row).astype("f")
dm = xgb.DMatrix(X, y, nthread=0)
np.testing.assert_array_equal(dm.get_label(), y)
assert dm.num_row() == row
@ -198,8 +212,7 @@ class TestBasic:
def test_cv(self):
dm, _ = tm.load_agaricus(__file__)
params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
'objective': 'binary:logistic'}
params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
# return np.ndarray
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False)
@ -208,19 +221,18 @@ class TestBasic:
def test_cv_no_shuffle(self):
dm, _ = tm.load_agaricus(__file__)
params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
'objective': 'binary:logistic'}
params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
# return np.ndarray
cv = xgb.cv(params, dm, num_boost_round=10, shuffle=False, nfold=10,
as_pandas=False)
cv = xgb.cv(
params, dm, num_boost_round=10, shuffle=False, nfold=10, as_pandas=False
)
assert isinstance(cv, dict)
assert len(cv) == (4)
def test_cv_explicit_fold_indices(self):
dm, _ = tm.load_agaricus(__file__)
params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective':
'binary:logistic'}
params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
folds = [
# Train Test
([1, 3], [5, 8]),
@ -228,15 +240,13 @@ class TestBasic:
]
# return np.ndarray
cv = xgb.cv(params, dm, num_boost_round=10, folds=folds,
as_pandas=False)
cv = xgb.cv(params, dm, num_boost_round=10, folds=folds, as_pandas=False)
assert isinstance(cv, dict)
assert len(cv) == (4)
@pytest.mark.skipif(**tm.skip_s390x())
def test_cv_explicit_fold_indices_labels(self):
params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective':
'reg:squarederror'}
params = {"max_depth": 2, "eta": 1, "objective": "reg:squarederror"}
N = 100
F = 3
dm = xgb.DMatrix(data=np.random.randn(N, F), label=np.arange(N))
@ -252,9 +262,10 @@ class TestBasic:
super().__init__()
def after_iteration(
self, model,
self,
model,
epoch: int,
evals_log: xgb.callback.TrainingCallback.EvalsLog
evals_log: xgb.callback.TrainingCallback.EvalsLog,
):
print([fold.dtest.get_label() for fold in model.cvfolds])
@ -263,12 +274,18 @@ class TestBasic:
# Run cross validation and capture standard out to test callback result
with tm.captured_output() as (out, err):
xgb.cv(
params, dm, num_boost_round=1, folds=folds, callbacks=[cb],
as_pandas=False
params,
dm,
num_boost_round=1,
folds=folds,
callbacks=[cb],
as_pandas=False,
)
output = out.getvalue().strip()
solution = ('[array([5., 8.], dtype=float32), array([23., 43., 11.],' +
' dtype=float32)]')
solution = (
"[array([5., 8.], dtype=float32), array([23., 43., 11.],"
+ " dtype=float32)]"
)
assert output == solution
@ -285,7 +302,7 @@ class TestBasicPathLike:
"""Saving to a binary file using pathlib from a DMatrix."""
data = np.random.randn(100, 2)
target = np.array([0, 1] * 50)
features = ['Feature1', 'Feature2']
features = ["Feature1", "Feature2"]
dm = xgb.DMatrix(data, label=target, feature_names=features)
@ -299,42 +316,3 @@ class TestBasicPathLike:
"""An invalid model_file path should raise XGBoostError."""
with pytest.raises(xgb.core.XGBoostError):
xgb.Booster(model_file=Path("invalidpath"))
def test_Booster_save_and_load(self):
    """Saving and loading model files from paths.

    A one-tree booster is saved through a ``pathlib.Path`` and then loaded
    back twice: once via the ``Booster`` constructor and once via
    ``load_model``. Both loaded boosters must produce the expected dump.
    """
    data = np.random.randn(100, 2)
    target = np.array([0, 1] * 50)
    features = ['Feature1', 'Feature2']

    dm = xgb.DMatrix(data, label=target, feature_names=features)
    params = {'objective': 'binary:logistic',
              'eval_metric': 'logloss',
              'eta': 0.3,
              'max_depth': 1}
    bst = xgb.train(params, dm, num_boost_round=1)

    def dump_assertions(dump):
        """Assertions for the expected dump from Booster"""
        assert len(dump) == 1, 'Exepcted only 1 tree to be dumped.'
        assert len(dump[0].splitlines()) == 3, 'Expected 1 root and 2 leaves - 3 lines.'

    # Write into a temporary directory so that the file is removed even when
    # an assertion fails, and the working directory is never polluted.
    with tempfile.TemporaryDirectory() as tmpdir:
        save_path = Path(tmpdir) / "saveload.model"

        # save, assert exists
        bst.save_model(save_path)
        assert save_path.exists()

        # load the model again using Path
        bst2 = xgb.Booster(model_file=save_path)
        dump_assertions(bst2.get_dump())

        # load again using load_model
        bst3 = xgb.Booster()
        bst3.load_model(save_path)
        dump_assertions(bst3.get_dump())

View File

@ -15,33 +15,9 @@ dpath = tm.data_dir(__file__)
rng = np.random.RandomState(1994)
def json_model(model_path: str, parameters: dict) -> dict:
    """Train a small booster, save it to ``model_path`` and return the parsed file.

    Parameters
    ----------
    model_path :
        Destination of the saved model. A path ending in ``ubj`` is read back
        with ``ubjson``; any other path is read back as JSON text.
    parameters :
        Training parameters. The dictionary is not modified.

    Returns
    -------
    The saved model parsed into a dictionary.
    """
    datasets = pytest.importorskip("sklearn.datasets")

    X, y = datasets.make_classification(64, n_features=8, n_classes=3, n_informative=6)

    # Work on a copy: ``num_class`` is an implementation detail of this helper
    # and must not leak into the dictionary owned by the caller.
    parameters = dict(parameters)
    if parameters.get("objective", None) == "multi:softmax":
        parameters["num_class"] = 3

    dm1 = xgb.DMatrix(X, y)

    bst = xgb.train(parameters, dm1)
    bst.save_model(model_path)

    if model_path.endswith("ubj"):
        import ubjson

        with open(model_path, "rb") as ubjfd:
            model = ubjson.load(ubjfd)
    else:
        with open(model_path, "r") as fd:
            model = json.load(fd)

    return model
class TestModels:
def test_glm(self):
param = {'verbosity': 0, 'objective': 'binary:logistic',
param = {'objective': 'binary:logistic',
'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1,
'nthread': 1}
dtrain, dtest = tm.load_agaricus(__file__)
@ -73,7 +49,7 @@ class TestModels:
with tempfile.TemporaryDirectory() as tmpdir:
dtest_path = os.path.join(tmpdir, 'dtest.dmatrix')
model_path = os.path.join(tmpdir, 'xgboost.model.dart')
model_path = os.path.join(tmpdir, "xgboost.model.dart.ubj")
# save dmatrix into binary buffer
dtest.save_binary(dtest_path)
model_path = model_path
@ -101,7 +77,6 @@ class TestModels:
# check whether sample_type and normalize_type work
num_round = 50
param['verbosity'] = 0
param['learning_rate'] = 0.1
param['rate_drop'] = 0.1
preds_list = []
@ -214,8 +189,7 @@ class TestModels:
assert set(evals_result['eval'].keys()) == {'auc', 'error', 'logloss'}
def test_fpreproc(self):
param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
'objective': 'binary:logistic'}
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
num_round = 2
def fpreproc(dtrain, dtest, param):
@ -229,8 +203,7 @@ class TestModels:
metrics={'auc'}, seed=0, fpreproc=fpreproc)
def test_show_stdv(self):
param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
'objective': 'binary:logistic'}
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
num_round = 2
dtrain, _ = tm.load_agaricus(__file__)
xgb.cv(param, dtrain, num_round, nfold=5,
@ -273,142 +246,6 @@ class TestModels:
bst = xgb.train([], dm2)
bst.predict(dm2) # success
def test_model_binary_io(self):
    """Round-trip a booster through a binary model file and a raw buffer.

    The ``scale_pos_weight`` parameter must survive the file round trip, and
    saving a raw buffer, loading it and saving again must be lossless.
    """
    parameters = {'tree_method': 'hist', 'booster': 'gbtree',
                  'scale_pos_weight': '0.5'}
    X = np.random.random((10, 3))
    y = np.random.random((10,))
    dtrain = xgb.DMatrix(X, y)
    bst = xgb.train(parameters, dtrain, num_boost_round=2)

    # Use a temporary directory so the file is cleaned up even if loading
    # fails. The `deprecated` extension explicitly requests the legacy binary
    # format; other unknown extensions are not guaranteed to select it.
    with tempfile.TemporaryDirectory() as tmpdir:
        model_path = os.path.join(tmpdir, 'test_model_binary_io.deprecated')
        bst.save_model(model_path)
        bst = xgb.Booster(model_file=model_path)

    config = json.loads(bst.save_config())
    assert float(config['learner']['objective'][
        'reg_loss_param']['scale_pos_weight']) == 0.5

    buf = bst.save_raw()
    from_raw = xgb.Booster()
    from_raw.load_model(buf)

    buf_from_raw = from_raw.save_raw()
    assert buf == buf_from_raw
def run_model_json_io(self, parameters: dict, ext: str) -> None:
    """Exercise JSON/UBJSON model IO for one set of training parameters.

    Parameters
    ----------
    parameters :
        Training parameters forwarded to ``json_model``.
    ext :
        File extension of the model file, ``"json"`` or ``"ubj"``. The test is
        skipped for ``"ubj"`` when ``tm.no_ubjson()`` reports its condition.
    """
    if ext == "ubj" and tm.no_ubjson()["condition"]:
        pytest.skip(tm.no_ubjson()["reason"])

    loc = locale.getpreferredencoding(False)

    # Keep the model file in a temporary directory so that it is removed even
    # when one of the checks below fails.
    with tempfile.TemporaryDirectory() as tmpdir:
        model_path = os.path.join(tmpdir, 'test_model_json_io.' + ext)
        j_model = json_model(model_path, parameters)
        assert isinstance(j_model['learner'], dict)

        bst = xgb.Booster(model_file=model_path)

        # Saving a loaded model again must still produce a parseable file.
        bst.save_model(fname=model_path)
        if ext == "ubj":
            import ubjson

            with open(model_path, "rb") as ubjfd:
                j_model = ubjson.load(ubjfd)
        else:
            with open(model_path, 'r') as fd:
                j_model = json.load(fd)
        assert isinstance(j_model['learner'], dict)

    # Model IO must leave the preferred encoding untouched.
    assert locale.getpreferredencoding(False) == loc

    json_raw = bst.save_raw(raw_format="json")
    from_jraw = xgb.Booster()
    from_jraw.load_model(json_raw)

    ubj_raw = bst.save_raw(raw_format="ubj")
    from_ubjraw = xgb.Booster()
    from_ubjraw.load_model(ubj_raw)

    if parameters.get("multi_strategy", None) != "multi_output_tree":
        # old binary model is not supported.
        old_from_json = from_jraw.save_raw(raw_format="deprecated")
        old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")

        assert old_from_json == old_from_ubj

    # Pretty-printed JSON with trailing newlines must be loadable as well.
    raw_json = bst.save_raw(raw_format="json")
    pretty = json.dumps(json.loads(raw_json), indent=2) + "\n\n"
    bst.load_model(bytearray(pretty, encoding="ascii"))

    # NOTE(review): this repeats the comparison above on the same two
    # boosters; kept as in the original.
    if parameters.get("multi_strategy", None) != "multi_output_tree":
        # old binary model is not supported.
        old_from_json = from_jraw.save_raw(raw_format="deprecated")
        old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")

        assert old_from_json == old_from_ubj

    rng = np.random.default_rng()
    X = rng.random(size=from_jraw.num_features() * 10).reshape(
        (10, from_jraw.num_features())
    )
    predt_from_jraw = from_jraw.predict(xgb.DMatrix(X))
    predt_from_bst = bst.predict(xgb.DMatrix(X))
    np.testing.assert_allclose(predt_from_jraw, predt_from_bst)
@pytest.mark.parametrize("ext", ["json", "ubj"])
def test_model_json_io(self, ext: str) -> None:
    """Run the JSON/UBJSON IO checks for every supported booster setup."""
    configurations = [
        {"booster": "gbtree", "tree_method": "hist"},
        {
            "booster": "gbtree",
            "tree_method": "hist",
            "multi_strategy": "multi_output_tree",
            "objective": "multi:softmax",
        },
        {"booster": "gblinear"},
        {"booster": "dart", "tree_method": "hist"},
    ]
    for parameters in configurations:
        self.run_model_json_io(parameters, ext)
@pytest.mark.skipif(**tm.no_json_schema())
def test_json_io_schema(self):
    """Validate saved JSON models against ``doc/model.schema``.

    Also checks that the objectives listed in the error raised for an unknown
    objective are exactly the objectives named by the schema.
    """
    import jsonschema

    path = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    doc = os.path.join(path, 'doc', 'model.schema')
    with open(doc, 'r') as fd:
        schema = json.load(fd)

    # The model files live in a temporary directory so that they are removed
    # even when validation fails.
    with tempfile.TemporaryDirectory() as tmpdir:
        model_path = os.path.join(tmpdir, 'test_json_schema.json')
        for booster in ('gbtree', 'dart'):
            parameters = {'tree_method': 'hist', 'booster': booster}
            jsonschema.validate(instance=json_model(model_path, parameters),
                                schema=schema)

    dtrain, _ = tm.load_agaricus(__file__)
    # Training with an unknown objective has to fail; `pytest.raises` makes the
    # test fail instead of silently skipping the comparison below.
    with pytest.raises(ValueError) as excinfo:
        xgb.train({'objective': 'foo'}, dtrain, num_boost_round=1)

    # Cut the candidate list out of the error message.
    e_str = str(excinfo.value)
    beg = e_str.find('Objective candidate')
    end = e_str.find('Stack trace')
    e_str = e_str[beg: end]
    e_str = e_str.strip()
    splited = e_str.splitlines()
    objectives = {s.split(': ')[1] for s in splited}

    j_objectives = schema['properties']['learner']['properties'][
        'objective']['oneOf']
    objectives_from_schema = {
        j_obj['properties']['name']['const'] for j_obj in j_objectives
    }

    assert objectives == objectives_from_schema
@pytest.mark.skipif(**tm.no_json_schema())
def test_json_dump_schema(self):
import jsonschema
@ -470,29 +307,6 @@ class TestModels:
for d in text_dump:
assert d.find(r"feature \"2\"") != -1
def test_categorical_model_io(self):
    """Categorical models must round-trip through JSON and UBJSON only.

    Saving in the legacy binary format is expected to raise ``ValueError``
    with a message that mentions JSON/UBJSON.
    """
    X, y = tm.make_categorical(256, 16, 71, False)
    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    booster = xgb.train({"tree_method": "approx"}, Xy, num_boost_round=16)
    predt_0 = booster.predict(Xy)

    with tempfile.TemporaryDirectory() as tempdir:
        # The `deprecated` extension explicitly requests the legacy binary
        # format; an arbitrary extension is not guaranteed to select it.
        path = os.path.join(tempdir, "model.deprecated")
        with pytest.raises(ValueError, match=r".*JSON/UBJSON.*"):
            booster.save_model(path)

        # Each round trip reloads the previously loaded booster, so the
        # formats are chained exactly as in a sequence of save/load calls.
        for fname in ("model.json", "model.ubj"):
            path = os.path.join(tempdir, fname)
            booster.save_model(path)
            booster = xgb.Booster(model_file=path)
            predt_1 = booster.predict(Xy)
            np.testing.assert_allclose(predt_0, predt_1)
@pytest.mark.skipif(**tm.no_sklearn())
def test_attributes(self):
from sklearn.datasets import load_iris

View File

@ -278,14 +278,18 @@ class TestCallbacks:
dtrain, dtest = tm.load_agaricus(__file__)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
watchlist = [(dtest, "eval"), (dtrain, "train")]
num_round = 4
# learning_rates as a list
# init eta with 0 to check whether learning_rates work
param = {'max_depth': 2, 'eta': 0, 'verbosity': 0,
'objective': 'binary:logistic', 'eval_metric': 'error',
'tree_method': tree_method}
param = {
"max_depth": 2,
"eta": 0,
"objective": "binary:logistic",
"eval_metric": "error",
"tree_method": tree_method,
}
evals_result = {}
bst = xgb.train(
param,
@ -295,15 +299,19 @@ class TestCallbacks:
callbacks=[scheduler([0.8, 0.7, 0.6, 0.5])],
evals_result=evals_result,
)
eval_errors_0 = list(map(float, evals_result['eval']['error']))
eval_errors_0 = list(map(float, evals_result["eval"]["error"]))
assert isinstance(bst, xgb.core.Booster)
# validation error should decrease, if eta > 0
assert eval_errors_0[0] > eval_errors_0[-1]
# init learning_rate with 0 to check whether learning_rates work
param = {'max_depth': 2, 'learning_rate': 0, 'verbosity': 0,
'objective': 'binary:logistic', 'eval_metric': 'error',
'tree_method': tree_method}
param = {
"max_depth": 2,
"learning_rate": 0,
"objective": "binary:logistic",
"eval_metric": "error",
"tree_method": tree_method,
}
evals_result = {}
bst = xgb.train(
@ -314,15 +322,17 @@ class TestCallbacks:
callbacks=[scheduler([0.8, 0.7, 0.6, 0.5])],
evals_result=evals_result,
)
eval_errors_1 = list(map(float, evals_result['eval']['error']))
eval_errors_1 = list(map(float, evals_result["eval"]["error"]))
assert isinstance(bst, xgb.core.Booster)
# validation error should decrease, if learning_rate > 0
assert eval_errors_1[0] > eval_errors_1[-1]
# check if learning_rates override default value of eta/learning_rate
param = {
'max_depth': 2, 'verbosity': 0, 'objective': 'binary:logistic',
'eval_metric': 'error', 'tree_method': tree_method
"max_depth": 2,
"objective": "binary:logistic",
"eval_metric": "error",
"tree_method": tree_method,
}
evals_result = {}
bst = xgb.train(

View File

@ -12,6 +12,7 @@ def test_global_config_verbosity(verbosity_level):
return xgb.get_config()["verbosity"]
old_verbosity = get_current_verbosity()
assert old_verbosity == 1
with xgb.config_context(verbosity=verbosity_level):
new_verbosity = get_current_verbosity()
assert new_verbosity == verbosity_level
@ -30,7 +31,10 @@ def test_global_config_use_rmm(use_rmm):
assert old_use_rmm_flag == get_current_use_rmm_flag()
def test_nested_config():
def test_nested_config() -> None:
verbosity = xgb.get_config()["verbosity"]
assert verbosity == 1
with xgb.config_context(verbosity=3):
assert xgb.get_config()["verbosity"] == 3
with xgb.config_context(verbosity=2):
@ -45,13 +49,15 @@ def test_nested_config():
with xgb.config_context(verbosity=None):
assert xgb.get_config()["verbosity"] == 3 # None has no effect
verbosity = xgb.get_config()["verbosity"]
xgb.set_config(verbosity=2)
assert xgb.get_config()["verbosity"] == 2
with xgb.config_context(verbosity=3):
assert xgb.get_config()["verbosity"] == 3
xgb.set_config(verbosity=verbosity) # reset
verbosity = xgb.get_config()["verbosity"]
assert verbosity == 1
def test_thread_safty():
n_threads = multiprocessing.cpu_count()

View File

@ -1,6 +1,7 @@
import csv
import os
import tempfile
import warnings
import numpy as np
import pytest
@ -24,20 +25,18 @@ class TestDMatrix:
with pytest.warns(UserWarning):
data._warn_unused_missing("uri", 4)
with pytest.warns(None) as record:
with warnings.catch_warnings():
warnings.simplefilter("error")
data._warn_unused_missing("uri", None)
data._warn_unused_missing("uri", np.nan)
assert len(record) == 0
with pytest.warns(None) as record:
with warnings.catch_warnings():
warnings.simplefilter("error")
x = rng.randn(10, 10)
y = rng.randn(10)
xgb.DMatrix(x, y, missing=4)
assert len(record) == 0
def test_dmatrix_numpy_init(self):
data = np.random.randn(5, 5)
dm = xgb.DMatrix(data)
@ -264,7 +263,7 @@ class TestDMatrix:
dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
watchlist = [(dtrain, "train")]
param = {"max_depth": 3, "objective": "binary:logistic", "verbosity": 0}
param = {"max_depth": 3, "objective": "binary:logistic"}
bst = xgb.train(param, dtrain, 5, watchlist)
bst.predict(dtrain)
@ -302,7 +301,7 @@ class TestDMatrix:
dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
watchlist = [(dtrain, "train")]
param = {"max_depth": 3, "objective": "binary:logistic", "verbosity": 0}
param = {"max_depth": 3, "objective": "binary:logistic"}
bst = xgb.train(param, dtrain, 5, watchlist)
bst.predict(dtrain)
@ -475,17 +474,19 @@ class TestDMatrixColumnSplit:
def test_uri(self):
def verify_uri():
rank = xgb.collective.get_rank()
data = np.random.rand(5, 5)
filename = f"test_data_{rank}.csv"
with open(filename, mode="w", newline="") as file:
writer = csv.writer(file)
for row in data:
writer.writerow(row)
dtrain = xgb.DMatrix(
f"{filename}?format=csv", data_split_mode=DataSplitMode.COL
)
assert dtrain.num_row() == 5
assert dtrain.num_col() == 5 * xgb.collective.get_world_size()
with tempfile.TemporaryDirectory() as tmpdir:
filename = os.path.join(tmpdir, f"test_data_{rank}.csv")
data = np.random.rand(5, 5)
with open(filename, mode="w", newline="") as file:
writer = csv.writer(file)
for row in data:
writer.writerow(row)
dtrain = xgb.DMatrix(
f"{filename}?format=csv", data_split_mode=DataSplitMode.COL
)
assert dtrain.num_row() == 5
assert dtrain.num_col() == 5 * xgb.collective.get_world_size()
tm.run_with_rabit(world_size=3, test_fn=verify_uri)

View File

@ -67,8 +67,10 @@ class TestEarlyStopping:
X = digits['data']
y = digits['target']
dm = xgb.DMatrix(X, label=y)
params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
'objective': 'binary:logistic', 'eval_metric': 'error'}
params = {
'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic',
'eval_metric': 'error'
}
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
early_stopping_rounds=10)

View File

@ -9,29 +9,13 @@ rng = np.random.RandomState(1337)
class TestEvalMetrics:
xgb_params_01 = {
'verbosity': 0,
'nthread': 1,
'eval_metric': 'error'
}
xgb_params_01 = {'nthread': 1, 'eval_metric': 'error'}
xgb_params_02 = {
'verbosity': 0,
'nthread': 1,
'eval_metric': ['error']
}
xgb_params_02 = {'nthread': 1, 'eval_metric': ['error']}
xgb_params_03 = {
'verbosity': 0,
'nthread': 1,
'eval_metric': ['rmse', 'error']
}
xgb_params_03 = {'nthread': 1, 'eval_metric': ['rmse', 'error']}
xgb_params_04 = {
'verbosity': 0,
'nthread': 1,
'eval_metric': ['error', 'rmse']
}
xgb_params_04 = {'nthread': 1, 'eval_metric': ['error', 'rmse']}
def evalerror_01(self, preds, dtrain):
labels = dtrain.get_label()

View File

@ -22,8 +22,14 @@ coord_strategy = strategies.fixed_dictionaries({
def train_result(param, dmat, num_rounds):
result = {}
xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
evals_result=result)
xgb.train(
param,
dmat,
num_rounds,
evals=[(dmat, "train")],
verbose_eval=False,
evals_result=result,
)
return result

View File

@ -0,0 +1,406 @@
import json
import locale
import os
import pickle
import tempfile
from pathlib import Path
from typing import List
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
def json_model(model_path: str, parameters: dict) -> dict:
    """Train a small booster, save it to ``model_path`` and return the parsed file.

    Parameters
    ----------
    model_path :
        Destination file. A path ending in ``ubj`` is read back with ``ubjson``;
        any other path is read back as JSON text.
    parameters :
        Training parameters forwarded to ``xgb.train``. The mapping is copied
        before use, so the caller's dict is never modified.

    Returns
    -------
    The saved model decoded into plain Python objects.
    """
    datasets = pytest.importorskip("sklearn.datasets")

    X, y = datasets.make_classification(64, n_features=8, n_classes=3, n_informative=6)

    # Work on a private copy: injecting ``num_class`` must not leak back into the
    # dict the caller keeps using after this helper returns.
    params = dict(parameters)
    if params.get("objective", None) == "multi:softmax":
        params["num_class"] = 3

    dm1 = xgb.DMatrix(X, y)
    bst = xgb.train(params, dm1)
    bst.save_model(model_path)

    if model_path.endswith("ubj"):
        # Imported lazily so the JSON path works without the optional dependency.
        import ubjson

        with open(model_path, "rb") as ubjfd:
            model = ubjson.load(ubjfd)
    else:
        with open(model_path, "r") as fd:
            model = json.load(fd)

    return model
class TestBoosterIO:
    """Round-trip tests for saving and loading boosters in the supported formats."""

    def run_model_json_io(self, parameters: dict, ext: str) -> None:
        """Check file and in-memory JSON/UBJSON round trips for one configuration.

        Parameters
        ----------
        parameters :
            Training parameters handed to ``json_model``.
        ext :
            File extension of the on-disk model, ``"json"`` or ``"ubj"``.
        """
        config = xgb.config.get_config()
        assert config["verbosity"] == 1

        if ext == "ubj" and tm.no_ubjson()["condition"]:
            pytest.skip(tm.no_ubjson()["reason"])

        loc = locale.getpreferredencoding(False)
        # Keep the model file in a private directory: a failing assertion can no
        # longer leave it behind in the working directory, and concurrently
        # running tests cannot clash on the file name.
        with tempfile.TemporaryDirectory() as tmpdir:
            model_path = os.path.join(tmpdir, "test_model_json_io." + ext)
            j_model = json_model(model_path, parameters)
            assert isinstance(j_model["learner"], dict)

            bst = xgb.Booster(model_file=model_path)

            # Save the freshly loaded booster again and make sure it still parses.
            bst.save_model(fname=model_path)
            if ext == "ubj":
                import ubjson

                with open(model_path, "rb") as ubjfd:
                    j_model = ubjson.load(ubjfd)
            else:
                with open(model_path, "r") as fd:
                    j_model = json.load(fd)

            assert isinstance(j_model["learner"], dict)

        # Model IO must not have changed the preferred encoding.
        assert locale.getpreferredencoding(False) == loc

        json_raw = bst.save_raw(raw_format="json")
        from_jraw = xgb.Booster()
        from_jraw.load_model(json_raw)

        ubj_raw = bst.save_raw(raw_format="ubj")
        from_ubjraw = xgb.Booster()
        from_ubjraw.load_model(ubj_raw)

        if parameters.get("multi_strategy", None) != "multi_output_tree":
            # Old binary model is not supported for vector leaf.
            with pytest.warns(Warning, match="Model format is default to UBJSON"):
                old_from_json = from_jraw.save_raw(raw_format="deprecated")
                old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")

            assert old_from_json == old_from_ubj

        # Loading must tolerate pretty-printed JSON with trailing newlines.
        raw_json = bst.save_raw(raw_format="json")
        pretty = json.dumps(json.loads(raw_json), indent=2) + "\n\n"
        bst.load_model(bytearray(pretty, encoding="ascii"))

        if parameters.get("multi_strategy", None) != "multi_output_tree":
            # old binary model is not supported.
            with pytest.warns(Warning, match="Model format is default to UBJSON"):
                old_from_json = from_jraw.save_raw(raw_format="deprecated")
                old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")

            assert old_from_json == old_from_ubj

        rng = np.random.default_rng()
        X = rng.random(size=from_jraw.num_features() * 10).reshape(
            (10, from_jraw.num_features())
        )
        predt_from_jraw = from_jraw.predict(xgb.DMatrix(X))
        predt_from_bst = bst.predict(xgb.DMatrix(X))
        np.testing.assert_allclose(predt_from_jraw, predt_from_bst)

    @pytest.mark.parametrize("ext", ["json", "ubj"])
    def test_model_json_io(self, ext: str) -> None:
        """Run the round-trip check for gbtree, vector-leaf, gblinear and dart."""
        parameters = {"booster": "gbtree", "tree_method": "hist"}
        self.run_model_json_io(parameters, ext)
        parameters = {
            "booster": "gbtree",
            "tree_method": "hist",
            "multi_strategy": "multi_output_tree",
            "objective": "multi:softmax",
        }
        self.run_model_json_io(parameters, ext)
        parameters = {"booster": "gblinear"}
        self.run_model_json_io(parameters, ext)
        parameters = {"booster": "dart", "tree_method": "hist"}
        self.run_model_json_io(parameters, ext)

    def test_categorical_model_io(self) -> None:
        """A categorical model is rejected by the old format and survives JSON/UBJSON."""
        X, y = tm.make_categorical(256, 16, 71, False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": "approx"}, Xy, num_boost_round=16)
        predt_0 = booster.predict(Xy)

        with tempfile.TemporaryDirectory() as tempdir:
            # The save is expected to both warn and fail with a ValueError.
            path = os.path.join(tempdir, "model.deprecated")
            with pytest.raises(ValueError, match=r".*JSON/UBJSON.*"):
                with pytest.warns(Warning, match="Model format is default to UBJSON"):
                    booster.save_model(path)

            path = os.path.join(tempdir, "model.json")
            booster.save_model(path)
            booster = xgb.Booster(model_file=path)
            predt_1 = booster.predict(Xy)
            np.testing.assert_allclose(predt_0, predt_1)

            path = os.path.join(tempdir, "model.ubj")
            booster.save_model(path)
            booster = xgb.Booster(model_file=path)
            predt_1 = booster.predict(Xy)
            np.testing.assert_allclose(predt_0, predt_1)

    @pytest.mark.skipif(**tm.no_json_schema())
    def test_json_io_schema(self) -> None:
        """Validate saved JSON models, and the objective list, against the schema."""
        import jsonschema

        path = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        )
        doc = os.path.join(path, "doc", "model.schema")
        with open(doc, "r") as fd:
            schema = json.load(fd)

        # The temporary directory replaces a file in the working directory that
        # was only removed when validation succeeded.
        with tempfile.TemporaryDirectory() as tmpdir:
            model_path = os.path.join(tmpdir, "test_json_schema.json")
            for booster_name in ("gbtree", "dart"):
                parameters = {"tree_method": "hist", "booster": booster_name}
                jsonschema.validate(
                    instance=json_model(model_path, parameters), schema=schema
                )

        dtrain, _ = tm.load_agaricus(__file__)
        # ``pytest.raises`` fails the test when no error is raised; a plain
        # try/except would skip every assertion below and pass silently.
        with pytest.raises(ValueError) as excinfo:
            xgb.train({"objective": "foo"}, dtrain, num_boost_round=1)

        e_str = str(excinfo.value)
        beg = e_str.find("Objective candidate")
        end = e_str.find("Stack trace")
        if end == -1:
            # No stack trace in the message: keep the tail instead of letting a
            # -1 slice bound drop the final character.
            end = len(e_str)
        e_str = e_str[beg:end]
        e_str = e_str.strip()
        splited = e_str.splitlines()
        objectives = [s.split(": ")[1] for s in splited]
        j_objectives = schema["properties"]["learner"]["properties"]["objective"][
            "oneOf"
        ]
        objectives_from_schema = set()
        for j_obj in j_objectives:
            objectives_from_schema.add(j_obj["properties"]["name"]["const"])
        assert set(objectives) == objectives_from_schema

    def test_model_binary_io(self) -> None:
        """A parameter survives a save/load through a ``.deprecated`` model file."""
        parameters = {
            "tree_method": "hist",
            "booster": "gbtree",
            "scale_pos_weight": "0.5",
        }
        X = np.random.random((10, 3))
        y = np.random.random((10,))
        dtrain = xgb.DMatrix(X, y)
        bst = xgb.train(parameters, dtrain, num_boost_round=2)

        # The model file lives in a temporary directory so a failed load cannot
        # leave it behind in the working directory.
        with tempfile.TemporaryDirectory() as tmpdir:
            model_path = os.path.join(tmpdir, "test_model_binary_io.deprecated")
            with pytest.warns(Warning, match="Model format is default to UBJSON"):
                bst.save_model(model_path)
            bst = xgb.Booster(model_file=model_path)

        config = json.loads(bst.save_config())
        assert (
            float(config["learner"]["objective"]["reg_loss_param"]["scale_pos_weight"])
            == 0.5
        )

        # A raw buffer must be reproduced exactly after a load/save cycle.
        buf = bst.save_raw()
        from_raw = xgb.Booster()
        from_raw.load_model(buf)
        buf_from_raw = from_raw.save_raw()
        assert buf == buf_from_raw

    def test_with_pathlib(self) -> None:
        """Saving and loading model files from paths."""
        rng = np.random.default_rng(1994)

        data = rng.normal(size=(100, 2))
        target = np.array([0, 1] * 50)
        features = ["Feature1", "Feature2"]

        dm = xgb.DMatrix(data, label=target, feature_names=features)
        params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "eta": 0.3,
            "max_depth": 1,
        }

        bst = xgb.train(params, dm, num_boost_round=1)

        def dump_assertions(dump: List[str]) -> None:
            """Assertions for the expected dump from Booster"""
            assert len(dump) == 1, "Expected only 1 tree to be dumped."
            assert (
                len(dump[0].splitlines()) == 3
            ), "Expected 1 root and 2 leaves - 3 lines."

        # The directory is removed on exit even when an assertion fails, so no
        # explicit unlink of the model file is needed.
        with tempfile.TemporaryDirectory() as tmpdir:
            save_path = Path(tmpdir) / "model.ubj"

            # save, assert exists
            bst.save_model(save_path)
            assert save_path.exists()

            # load the model again using Path
            bst2 = xgb.Booster(model_file=save_path)
            dump2 = bst2.get_dump()
            dump_assertions(dump2)

            # load again using load_model
            bst3 = xgb.Booster()
            bst3.load_model(save_path)
            dump3 = bst3.get_dump()
            dump_assertions(dump3)
def save_load_model(model_path: str) -> None:
    """Exercise scikit-learn wrapper save/load through the file at ``model_path``.

    For each of two shuffled folds of the binary digits data a classifier is
    fitted, saved, reloaded and checked against the native booster. A gblinear
    classifier with early stopping is then pushed through a save/load cycle and
    a pickle cycle.
    """
    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold

    rng = np.random.RandomState(1994)

    digits = load_digits(n_class=2)
    X, y = digits["data"], digits["target"]
    folds = KFold(n_splits=2, shuffle=True, random_state=rng)

    for train_idx, test_idx in folds.split(X, y):
        fitted = xgb.XGBClassifier().fit(X[train_idx], y[train_idx])
        fitted.save_model(model_path)

        restored = xgb.XGBClassifier()
        restored.load_model(model_path)

        # Class metadata and the underlying booster come back with the model.
        assert isinstance(restored.classes_, np.ndarray)
        np.testing.assert_equal(restored.classes_, np.array([0, 1]))
        assert isinstance(restored._Booster, xgb.Booster)

        X_test = X[test_idx]
        preds = restored.predict(X_test)
        labels = y[test_idx]
        n_wrong = sum(
            1 for pred, label in zip(preds, labels) if int(pred > 0.5) != label
        )
        err = n_wrong / float(len(preds))
        assert err < 0.1
        assert restored.get_booster().attr("scikit_learn") is None

        # test native booster
        margin_sklearn = restored.predict(X_test, output_margin=True)
        native = xgb.Booster(model_file=model_path)
        margin_native = native.predict(xgb.DMatrix(X_test), output_margin=True)
        assert np.allclose(margin_sklearn, margin_native)

        # Loading a classifier model into the generic XGBModel must be rejected.
        with pytest.raises(TypeError):
            generic = xgb.XGBModel()
            generic.load_model(model_path)

    clf = xgb.XGBClassifier(booster="gblinear", early_stopping_rounds=1)
    clf.fit(X, y, eval_set=[(X, y)])
    expected_iteration = clf.best_iteration
    expected_score = clf.best_score
    baseline = clf.predict(X)

    # Save/load in place: booster type, predictions and early-stopping state
    # must all survive.
    clf.save_model(model_path)
    clf.load_model(model_path)
    assert clf.booster == "gblinear"
    np.testing.assert_allclose(baseline, clf.predict(X))
    assert clf.best_iteration == expected_iteration
    assert clf.best_score == expected_score

    # The same guarantees must hold across a pickle round trip.
    clf = pickle.loads(pickle.dumps(clf))
    np.testing.assert_allclose(baseline, clf.predict(X))
    assert clf.best_iteration == expected_iteration
    assert clf.best_score == expected_score
# Skip condition is whatever tm.no_sklearn() supplies.
@pytest.mark.skipif(**tm.no_sklearn())
def test_sklearn_model() -> None:
# Save/load checks for the scikit-learn wrappers, in three sections, each with
# its own temporary directory:
#   1. "digits.deprecated": save_load_model is expected to emit a warning
#      matching "Model format is default to UBJSON".
#   2. "digits.model.json": save_load_model with no warning expectation.
#   3. "digits.model.ubj": a natively trained booster is loaded into the
#      wrappers, then a multi-class classifier is saved and reloaded.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
with tempfile.TemporaryDirectory() as tempdir:
model_path = os.path.join(tempdir, "digits.deprecated")
with pytest.warns(Warning, match="Model format is default to UBJSON"):
save_load_model(model_path)
with tempfile.TemporaryDirectory() as tempdir:
model_path = os.path.join(tempdir, "digits.model.json")
save_load_model(model_path)
with tempfile.TemporaryDirectory() as tempdir:
model_path = os.path.join(tempdir, "digits.model.ubj")
digits = load_digits(n_class=2)
y = digits["target"]
X = digits["data"]
# Train with the native API (4 rounds) and keep its predictions as reference.
booster = xgb.train(
{"tree_method": "hist", "objective": "binary:logistic"},
dtrain=xgb.DMatrix(X, y),
num_boost_round=4,
)
predt_0 = booster.predict(xgb.DMatrix(X))
booster.save_model(model_path)
# Load the natively saved model into a classifier: one probability row per
# sample, two columns, and column 1 must match the native predictions.
cls = xgb.XGBClassifier()
cls.load_model(model_path)
proba = cls.predict_proba(X)
assert proba.shape[0] == X.shape[0]
assert proba.shape[1] == 2 # binary
predt_1 = cls.predict_proba(X)[:, 1]
assert np.allclose(predt_0, predt_1)
# The generic XGBModel must reproduce the native predictions from predict().
cls = xgb.XGBModel()
cls.load_model(model_path)
predt_1 = cls.predict(X)
assert np.allclose(predt_0, predt_1)
# mclass
# NOTE(review): this part reuses model_path, so it presumably still runs inside
# the third temporary directory - confirm against the indented source.
X, y = load_digits(n_class=10, return_X_y=True)
# small test_size to force early stop
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.01, random_state=1
)
clf = xgb.XGBClassifier(
n_estimators=64, tree_method="hist", early_stopping_rounds=2
)
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
# Remember the score before saving so it can be compared after reloading.
score = clf.best_score
clf.save_model(model_path)
# A fresh classifier must recover classes, objective and early-stopping state
# from the file alone.
clf = xgb.XGBClassifier()
clf.load_model(model_path)
assert clf.classes_.size == 10
assert clf.objective == "multi:softprob"
np.testing.assert_equal(clf.classes_, np.arange(10))
assert clf.n_classes_ == 10
# NOTE(review): 27 is a hard-coded expectation; presumably tied to the fixed
# split (random_state=1) and the training defaults - verify if defaults change.
assert clf.best_iteration == 27
assert clf.best_score == score
@pytest.mark.skipif(**tm.no_sklearn())
def test_with_sklearn_obj_metric() -> None:
    """Callable objective/metric survive pickling but not a model-file round trip."""
    from sklearn.metrics import mean_squared_error

    X, y = tm.datasets.make_regression()
    reg = xgb.XGBRegressor(objective=tm.ls_obj, eval_metric=mean_squared_error)
    reg.fit(X, y)

    # Pickling keeps the Python callables attached to the estimator.
    reg_1 = pickle.loads(pickle.dumps(reg))
    assert callable(reg_1.objective)
    assert callable(reg_1.eval_metric)

    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "model.json")
        reg.save_model(path)

        reg_2 = xgb.XGBRegressor()
        reg_2.load_model(path)

    # After loading from the model file neither attribute is callable any more.
    assert not callable(reg_2.objective)
    assert not callable(reg_2.eval_metric)
    assert reg_2.eval_metric is None

View File

@ -1,13 +1,10 @@
import json
import os
import pickle
import tempfile
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
kRows = 100
kCols = 10
@ -64,27 +61,3 @@ class TestPickling:
params = {"nthread": 8, "tree_method": "exact", "subsample": 0.5}
config = self.run_model_pickling(params)
check(config)
@pytest.mark.skipif(**tm.no_sklearn())
def test_with_sklearn_obj_metric(self) -> None:
from sklearn.metrics import mean_squared_error
X, y = tm.datasets.make_regression()
reg = xgb.XGBRegressor(objective=tm.ls_obj, eval_metric=mean_squared_error)
reg.fit(X, y)
pkl = pickle.dumps(reg)
reg_1 = pickle.loads(pkl)
assert callable(reg_1.objective)
assert callable(reg_1.eval_metric)
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, "model.json")
reg.save_model(path)
reg_2 = xgb.XGBRegressor()
reg_2.load_model(path)
assert not callable(reg_2.objective)
assert not callable(reg_2.eval_metric)
assert reg_2.eval_metric is None

View File

@ -49,7 +49,7 @@ class TestSHAP:
def fn(max_depth: int, num_rounds: int) -> None:
# train
params = {"max_depth": max_depth, "eta": 1, "verbosity": 0}
params = {"max_depth": max_depth, "eta": 1}
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)
# predict

View File

@ -117,7 +117,6 @@ class TestTreeMethod:
ag_param = {'max_depth': 2,
'tree_method': 'hist',
'eta': 1,
'verbosity': 0,
'objective': 'binary:logistic',
'eval_metric': 'auc'}
hist_res = {}
@ -340,7 +339,8 @@ class TestTreeMethod:
assert get_score(config_0) == get_score(config_1)
raw_booster = booster_1.save_raw(raw_format="deprecated")
with pytest.warns(Warning, match="Model format is default to UBJSON"):
raw_booster = booster_1.save_raw(raw_format="deprecated")
booster_2 = xgb.Booster(model_file=raw_booster)
config_2 = json.loads(booster_2.save_config())
assert get_score(config_1) == get_score(config_2)

View File

@ -341,7 +341,6 @@ class TestPandas:
params = {
"max_depth": 2,
"eta": 1,
"verbosity": 0,
"objective": "binary:logistic",
"eval_metric": "error",
}
@ -372,7 +371,6 @@ class TestPandas:
params = {
"max_depth": 2,
"eta": 1,
"verbosity": 0,
"objective": "binary:logistic",
"eval_metric": "auc",
}
@ -383,7 +381,6 @@ class TestPandas:
params = {
"max_depth": 2,
"eta": 1,
"verbosity": 0,
"objective": "binary:logistic",
"eval_metric": ["auc"],
}
@ -394,7 +391,6 @@ class TestPandas:
params = {
"max_depth": 2,
"eta": 1,
"verbosity": 0,
"objective": "binary:logistic",
"eval_metric": ["auc"],
}
@ -413,7 +409,6 @@ class TestPandas:
params = {
"max_depth": 2,
"eta": 1,
"verbosity": 0,
"objective": "binary:logistic",
}
cv = xgb.cv(
@ -424,7 +419,6 @@ class TestPandas:
params = {
"max_depth": 2,
"eta": 1,
"verbosity": 0,
"objective": "binary:logistic",
}
cv = xgb.cv(
@ -435,7 +429,6 @@ class TestPandas:
params = {
"max_depth": 2,
"eta": 1,
"verbosity": 0,
"objective": "binary:logistic",
"eval_metric": ["auc"],
}

View File

@ -678,7 +678,6 @@ def test_split_value_histograms():
params = {
"max_depth": 6,
"eta": 0.01,
"verbosity": 0,
"objective": "binary:logistic",
"base_score": 0.5,
}
@ -897,128 +896,6 @@ def test_validation_weights():
run_validation_weights(xgb.XGBClassifier)
def save_load_model(model_path):
from sklearn.datasets import load_digits
from sklearn.model_selection import KFold
digits = load_digits(n_class=2)
y = digits['target']
X = digits['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X, y):
xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
xgb_model.save_model(model_path)
xgb_model = xgb.XGBClassifier()
xgb_model.load_model(model_path)
assert isinstance(xgb_model.classes_, np.ndarray)
np.testing.assert_equal(xgb_model.classes_, np.array([0, 1]))
assert isinstance(xgb_model._Booster, xgb.Booster)
preds = xgb_model.predict(X[test_index])
labels = y[test_index]
err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.1
assert xgb_model.get_booster().attr('scikit_learn') is None
# test native booster
preds = xgb_model.predict(X[test_index], output_margin=True)
booster = xgb.Booster(model_file=model_path)
predt_1 = booster.predict(xgb.DMatrix(X[test_index]),
output_margin=True)
assert np.allclose(preds, predt_1)
with pytest.raises(TypeError):
xgb_model = xgb.XGBModel()
xgb_model.load_model(model_path)
clf = xgb.XGBClassifier(booster="gblinear", early_stopping_rounds=1)
clf.fit(X, y, eval_set=[(X, y)])
best_iteration = clf.best_iteration
best_score = clf.best_score
predt_0 = clf.predict(X)
clf.save_model(model_path)
clf.load_model(model_path)
assert clf.booster == "gblinear"
predt_1 = clf.predict(X)
np.testing.assert_allclose(predt_0, predt_1)
assert clf.best_iteration == best_iteration
assert clf.best_score == best_score
clfpkl = pickle.dumps(clf)
clf = pickle.loads(clfpkl)
predt_2 = clf.predict(X)
np.testing.assert_allclose(predt_0, predt_2)
assert clf.best_iteration == best_iteration
assert clf.best_score == best_score
def test_save_load_model():
with tempfile.TemporaryDirectory() as tempdir:
model_path = os.path.join(tempdir, "digits.model")
save_load_model(model_path)
with tempfile.TemporaryDirectory() as tempdir:
model_path = os.path.join(tempdir, "digits.model.json")
save_load_model(model_path)
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
with tempfile.TemporaryDirectory() as tempdir:
model_path = os.path.join(tempdir, "digits.model.ubj")
digits = load_digits(n_class=2)
y = digits["target"]
X = digits["data"]
booster = xgb.train(
{"tree_method": "hist", "objective": "binary:logistic"},
dtrain=xgb.DMatrix(X, y),
num_boost_round=4,
)
predt_0 = booster.predict(xgb.DMatrix(X))
booster.save_model(model_path)
cls = xgb.XGBClassifier()
cls.load_model(model_path)
proba = cls.predict_proba(X)
assert proba.shape[0] == X.shape[0]
assert proba.shape[1] == 2 # binary
predt_1 = cls.predict_proba(X)[:, 1]
assert np.allclose(predt_0, predt_1)
cls = xgb.XGBModel()
cls.load_model(model_path)
predt_1 = cls.predict(X)
assert np.allclose(predt_0, predt_1)
# mclass
X, y = load_digits(n_class=10, return_X_y=True)
# small test_size to force early stop
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.01, random_state=1
)
clf = xgb.XGBClassifier(
n_estimators=64, tree_method="hist", early_stopping_rounds=2
)
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
score = clf.best_score
clf.save_model(model_path)
clf = xgb.XGBClassifier()
clf.load_model(model_path)
assert clf.classes_.size == 10
assert clf.objective == "multi:softprob"
np.testing.assert_equal(clf.classes_, np.arange(10))
assert clf.n_classes_ == 10
assert clf.best_iteration == 27
assert clf.best_score == score
def test_RFECV():
from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris
from sklearn.feature_selection import RFECV