[breaking] Save booster feature info in JSON, remove feature name generation. (#6605)

* Save feature info in booster in JSON model.
* [breaking] Remove automatic feature name generation in `DMatrix`.

This PR enables reliable feature validation in the Python package.
This commit is contained in:
Jiaming Yuan
2021-02-25 18:54:16 +08:00
committed by GitHub
parent b6167cd2ff
commit 9da2287ab8
12 changed files with 363 additions and 36 deletions

View File

@@ -217,8 +217,8 @@ class TestModels:
X = np.random.random((10, 3))
y = np.random.randint(2, size=(10,))
dm1 = xgb.DMatrix(X, y)
dm2 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
dm1 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
dm2 = xgb.DMatrix(X, y)
bst = xgb.train([], dm1)
bst.predict(dm1) # success
@@ -228,9 +228,6 @@ class TestModels:
bst = xgb.train([], dm2)
bst.predict(dm2) # success
with pytest.raises(ValueError):
bst.predict(dm1)
bst.predict(dm2) # success
def test_model_binary_io(self):
model_path = 'test_model_binary_io.bin'
@@ -458,3 +455,31 @@ class TestModels:
merged = predt_0 + predt_1 - 0.5
single = booster[1:7].predict(dtrain, output_margin=True)
np.testing.assert_allclose(merged, single, atol=1e-6)
@pytest.mark.skipif(**tm.no_pandas())
def test_feature_info(self):
    """Check that a booster keeps feature names/types from its training data.

    A one-round booster is trained on a DMatrix built from a pandas
    DataFrame with one integer column.  The booster's ``feature_names`` and
    ``feature_types`` must match the DMatrix both right after training and
    after a save/load round trip through a JSON model file.
    """
    import os

    import pandas as pd

    rows = 100
    cols = 10
    X = rng.randn(rows, cols)
    y = rng.randn(rows)
    feature_names = ["test_feature_" + str(i) for i in range(cols)]
    X_pd = pd.DataFrame(X, columns=feature_names)
    # Replace the column by label instead of assigning through ``iloc``:
    # recent pandas tries to set ``iloc[:, i] = ...`` in place and may keep
    # the float dtype.  ``np.int`` was removed from NumPy; the builtin
    # ``int`` is its documented equivalent.
    X_pd[feature_names[3]] = X_pd.iloc[:, 3].astype(int)

    Xy = xgb.DMatrix(X_pd, y)
    assert Xy.feature_types[3] == "int"

    booster = xgb.train({}, dtrain=Xy, num_boost_round=1)
    assert booster.feature_names == Xy.feature_names
    assert booster.feature_names == feature_names
    assert booster.feature_types == Xy.feature_types

    with tempfile.TemporaryDirectory() as tmpdir:
        # Join with a separator so the model file is created inside the
        # temporary directory and removed together with it.
        path = os.path.join(tmpdir, "model.json")
        booster.save_model(path)
        booster = xgb.Booster()
        booster.load_model(path)

        assert booster.feature_names == Xy.feature_names
        assert booster.feature_types == Xy.feature_types

View File

@@ -95,6 +95,11 @@ eval[test] = {data_path}
}
data = xgboost.DMatrix(data_path)
booster = xgboost.train(parameters, data, num_boost_round=10)
# CLI model doesn't contain feature info.
booster.feature_names = None
booster.feature_types = None
booster.save_model(model_out_py)
py_predt = booster.predict(data)

View File

@@ -180,7 +180,7 @@ class TestDMatrix:
# reset
dm.feature_names = None
assert dm.feature_names == ['f0', 'f1', 'f2', 'f3', 'f4']
assert dm.feature_names is None
assert dm.feature_types is None
def test_feature_names(self):