[breaking] Save booster feature info in JSON, remove feature name generation. (#6605)

* Save feature info in the booster in the JSON model. * [breaking] Remove automatic feature name generation in `DMatrix`. This PR enables reliable feature validation in the Python package.
2021-02-25 18:54:16 +08:00
parent b6167cd2ff
commit 9da2287ab8
12 changed files with 363 additions and 36 deletions
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -217,8 +217,8 @@ class TestModels:
        X = np.random.random((10, 3))
        y = np.random.randint(2, size=(10,))

-        dm1 = xgb.DMatrix(X, y)
-        dm2 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
+        dm1 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
+        dm2 = xgb.DMatrix(X, y)

        bst = xgb.train([], dm1)
        bst.predict(dm1)  # success
@@ -228,9 +228,6 @@ class TestModels:

        bst = xgb.train([], dm2)
        bst.predict(dm2)  # success
-        with pytest.raises(ValueError):
-            bst.predict(dm1)
-        bst.predict(dm2)  # success

    def test_model_binary_io(self):
        model_path = 'test_model_binary_io.bin'
@@ -458,3 +455,31 @@ class TestModels:
        merged = predt_0 + predt_1 - 0.5
        single = booster[1:7].predict(dtrain, output_margin=True)
        np.testing.assert_allclose(merged, single, atol=1e-6)
+
+    @pytest.mark.skipif(**tm.no_pandas())
+    def test_feature_info(self):
+        """Feature names/types must survive training and a JSON save/load round trip."""
+        import os
+        import pandas as pd
+        rows, cols = 100, 10
+        X = rng.randn(rows, cols)
+        y = rng.randn(rows)
+        feature_names = ["test_feature_" + str(i) for i in range(cols)]
+        X_pd = pd.DataFrame(X, columns=feature_names)
+        # Assign by label (pandas 2 `.iloc` keeps dtype); `np.int` alias is gone.
+        X_pd[feature_names[3]] = X_pd.iloc[:, 3].astype(int)
+
+        Xy = xgb.DMatrix(X_pd, y)
+        assert Xy.feature_types[3] == "int"
+        booster = xgb.train({}, dtrain=Xy, num_boost_round=1)
+        assert booster.feature_names == Xy.feature_names == feature_names
+        assert booster.feature_types == Xy.feature_types
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Join with a separator so the file lands inside tmpdir and is cleaned up.
+            path = os.path.join(tmpdir, "model.json")
+            booster.save_model(path)
+            booster = xgb.Booster()
+            booster.load_model(path)
+            assert booster.feature_names == Xy.feature_names
+            assert booster.feature_types == Xy.feature_types
--- a/tests/python/test_cli.py
+++ b/tests/python/test_cli.py
@@ -95,6 +95,11 @@ eval[test] = {data_path}
            }
            data = xgboost.DMatrix(data_path)
            booster = xgboost.train(parameters, data, num_boost_round=10)
+
+            # CLI model doesn't contain feature info.
+            booster.feature_names = None
+            booster.feature_types = None
+
            booster.save_model(model_out_py)
            py_predt = booster.predict(data)

--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -180,7 +180,7 @@ class TestDMatrix:

        # reset
        dm.feature_names = None
-        assert dm.feature_names == ['f0', 'f1', 'f2', 'f3', 'f4']
+        assert dm.feature_names is None
        assert dm.feature_types is None

    def test_feature_names(self):