[Breaking] Require format to be specified in input URI. (#9077)

Previously, we used `libsvm` as the default when the format was not specified. However, the dmlc
data parser is not particularly robust against errors, and the most common type of error
is an undefined format.

Along with this change, we recommend that users use other data loaders instead. We will
continue to maintain the parsers, as they are currently used for many internal tests,
including federated learning.
This commit is contained in:
Jiaming Yuan
2023-04-28 19:45:15 +08:00
committed by GitHub
parent e922004329
commit 1f9a57d17b
58 changed files with 327 additions and 268 deletions

View File

@@ -21,8 +21,7 @@ class TestBasic:
assert not lazy_isinstance(a, 'numpy', 'dataframe')
def test_basic(self):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
dtrain, dtest = tm.load_agaricus(__file__)
param = {'max_depth': 2, 'eta': 1,
'objective': 'binary:logistic'}
# specify validations set to watch performance
@@ -61,8 +60,7 @@ class TestBasic:
def test_metric_config(self):
# Make sure that the metric configuration happens in booster so the
# string `['error', 'auc']` doesn't get passed down to core.
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
dtrain, dtest = tm.load_agaricus(__file__)
param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
'objective': 'binary:logistic', 'eval_metric': ['error', 'auc']}
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -78,8 +76,7 @@ class TestBasic:
np.testing.assert_allclose(predt_0, predt_1)
def test_multiclass(self):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
dtrain, dtest = tm.load_agaricus(__file__)
param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'num_class': 2}
# specify validations set to watch performance
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -188,7 +185,7 @@ class TestBasic:
assert dm.num_col() == cols
def test_cv(self):
dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
dm, _ = tm.load_agaricus(__file__)
params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
'objective': 'binary:logistic'}
@@ -198,7 +195,7 @@ class TestBasic:
assert len(cv) == (4)
def test_cv_no_shuffle(self):
dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
dm, _ = tm.load_agaricus(__file__)
params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
'objective': 'binary:logistic'}
@@ -209,7 +206,7 @@ class TestBasic:
assert len(cv) == (4)
def test_cv_explicit_fold_indices(self):
dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
dm, _ = tm.load_agaricus(__file__)
params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective':
'binary:logistic'}
folds = [
@@ -268,8 +265,7 @@ class TestBasicPathLike:
def test_DMatrix_init_from_path(self):
"""Initialization from the data path."""
dpath = Path('demo/data')
dtrain = xgb.DMatrix(dpath / 'agaricus.txt.train')
dtrain, _ = tm.load_agaricus(__file__)
assert dtrain.num_row() == 6513
assert dtrain.num_col() == 127

View File

@@ -42,8 +42,7 @@ class TestModels:
param = {'verbosity': 0, 'objective': 'binary:logistic',
'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1,
'nthread': 1}
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
dtrain, dtest = tm.load_agaricus(__file__)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)
@@ -55,8 +54,7 @@ class TestModels:
assert err < 0.2
def test_dart(self):
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
dtrain, dtest = tm.load_agaricus(__file__)
param = {'max_depth': 5, 'objective': 'binary:logistic',
'eval_metric': 'logloss', 'booster': 'dart', 'verbosity': 1}
# specify validations set to watch performance
@@ -122,7 +120,7 @@ class TestModels:
def test_boost_from_prediction(self):
# Re-construct dtrain here to avoid modification
margined = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
margined, _ = tm.load_agaricus(__file__)
bst = xgb.train({'tree_method': 'hist'}, margined, 1)
predt_0 = bst.predict(margined, output_margin=True)
margined.set_base_margin(predt_0)
@@ -130,13 +128,13 @@ class TestModels:
predt_1 = bst.predict(margined)
assert np.any(np.abs(predt_1 - predt_0) > 1e-6)
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtrain, _ = tm.load_agaricus(__file__)
bst = xgb.train({'tree_method': 'hist'}, dtrain, 2)
predt_2 = bst.predict(dtrain)
assert np.all(np.abs(predt_2 - predt_1) < 1e-6)
def test_boost_from_existing_model(self):
X = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
X, _ = tm.load_agaricus(__file__)
booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4)
assert booster.num_boosted_rounds() == 4
booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4,
@@ -156,8 +154,7 @@ class TestModels:
'objective': 'reg:logistic',
"tree_method": tree_method
}
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
dtrain, dtest = tm.load_agaricus(__file__)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 10
@@ -203,8 +200,7 @@ class TestModels:
self.run_custom_objective()
def test_multi_eval_metric(self):
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
dtrain, dtest = tm.load_agaricus(__file__)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
param = {'max_depth': 2, 'eta': 0.2, 'verbosity': 1,
'objective': 'binary:logistic'}
@@ -226,7 +222,7 @@ class TestModels:
param['scale_pos_weight'] = ratio
return (dtrain, dtest, param)
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtrain, _ = tm.load_agaricus(__file__)
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'auc'}, seed=0, fpreproc=fpreproc)
@@ -234,7 +230,7 @@ class TestModels:
param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
'objective': 'binary:logistic'}
num_round = 2
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtrain, _ = tm.load_agaricus(__file__)
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed=0, show_stdv=False)
@@ -392,7 +388,7 @@ class TestModels:
os.remove(model_path)
try:
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtrain, _ = tm.load_agaricus(__file__)
xgb.train({'objective': 'foo'}, dtrain, num_boost_round=1)
except ValueError as e:
e_str = str(e)

View File

@@ -275,9 +275,7 @@ class TestCallbacks:
"""Test learning rate scheduler, used by both CPU and GPU tests."""
scheduler = xgb.callback.LearningRateScheduler
dpath = tm.data_dir(__file__)
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
dtrain, dtest = tm.load_agaricus(__file__)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 4
@@ -361,9 +359,7 @@ class TestCallbacks:
num_round = 4
scheduler = xgb.callback.LearningRateScheduler
dpath = tm.data_dir(__file__)
dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
dtrain, dtest = tm.load_agaricus(__file__)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
param = {

View File

@@ -283,7 +283,7 @@ class TestDMatrix:
assert m0.feature_types == m1.feature_types
def test_get_info(self):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtrain, _ = tm.load_agaricus(__file__)
dtrain.get_float_info('label')
dtrain.get_float_info('weight')
dtrain.get_float_info('base_margin')
@@ -432,7 +432,9 @@ class TestDMatrix:
def test_uri_categorical(self):
path = os.path.join(dpath, 'agaricus.txt.train')
feature_types = ["q"] * 5 + ["c"] + ["q"] * 120
Xy = xgb.DMatrix(path + "?indexing_mode=1", feature_types=feature_types)
Xy = xgb.DMatrix(
path + "?indexing_mode=1&format=libsvm", feature_types=feature_types
)
np.testing.assert_equal(np.array(Xy.feature_types), np.array(feature_types))
def test_base_margin(self):

View File

@@ -88,8 +88,12 @@ class TestInteractionConstraints:
def training_accuracy(self, tree_method):
"""Test accuracy, reused by GPU tests."""
from sklearn.metrics import accuracy_score
dtrain = xgboost.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
dtest = xgboost.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
dtrain = xgboost.DMatrix(
dpath + "agaricus.txt.train?indexing_mode=1&format=libsvm"
)
dtest = xgboost.DMatrix(
dpath + "agaricus.txt.test?indexing_mode=1&format=libsvm"
)
params = {
'eta': 1,
'max_depth': 6,

View File

@@ -134,8 +134,8 @@ class TestMonotoneConstraints:
@pytest.mark.skipif(**tm.no_sklearn())
def test_training_accuracy(self):
from sklearn.metrics import accuracy_score
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
dtrain = xgb.DMatrix(dpath + "agaricus.txt.train?indexing_mode=1&format=libsvm")
dtest = xgb.DMatrix(dpath + "agaricus.txt.test?indexing_mode=1&format=libsvm")
params = {'eta': 1, 'max_depth': 6, 'objective': 'binary:logistic',
'tree_method': 'hist', 'monotone_constraints': '(1, 0)'}
num_boost_round = 5

View File

@@ -13,9 +13,7 @@ pytestmark = tm.timeout(10)
class TestOMP:
def test_omp(self):
dpath = 'demo/data/'
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
dtrain, dtest = tm.load_agaricus(__file__)
param = {'booster': 'gbtree',
'objective': 'binary:logistic',

View File

@@ -13,7 +13,7 @@ rng = np.random.RandomState(1994)
class TestTreesToDataFrame:
def build_model(self, max_depth, num_round):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtrain, _ = tm.load_agaricus(__file__)
param = {'max_depth': max_depth, 'objective': 'binary:logistic',
'verbosity': 1}
num_round = num_round

View File

@@ -17,12 +17,10 @@ except ImportError:
pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_matplotlib(),
tm.no_graphviz()))
dpath = 'demo/data/agaricus.txt.train'
class TestPlotting:
def test_plotting(self):
m = xgb.DMatrix(dpath)
m, _ = tm.load_agaricus(__file__)
booster = xgb.train({'max_depth': 2, 'eta': 1,
'objective': 'binary:logistic'}, m,
num_boost_round=2)

View File

@@ -46,8 +46,8 @@ class TestSHAP:
fscores = bst.get_fscore()
assert scores1 == fscores
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?format=libsvm')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?format=libsvm')
def fn(max_depth, num_rounds):
# train

View File

@@ -154,9 +154,7 @@ class TestTreeMethod:
def test_hist_categorical(self):
# hist must be same as exact on all-categorial data
dpath = 'demo/data/'
ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
ag_dtrain, ag_dtest = tm.load_agaricus(__file__)
ag_param = {'max_depth': 2,
'tree_method': 'hist',
'eta': 1,

View File

@@ -222,7 +222,7 @@ class TestPandas:
set_base_margin_info(pd.DataFrame, xgb.DMatrix, "hist")
def test_cv_as_pandas(self):
dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
dm, _ = tm.load_agaricus(__file__)
params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
'objective': 'binary:logistic', 'eval_metric': 'error'}