[breaking] Save booster feature info in JSON, remove feature name generation. (#6605)
* Save feature info in the booster's JSON model. * [breaking] Remove automatic feature name generation in `DMatrix`. This PR enables reliable feature validation in the Python package.
This commit is contained in:
@@ -360,4 +360,60 @@ TEST(Learner, ConstantSeed) {
|
||||
CHECK_EQ(v_0, v_2);
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that feature names and feature types assigned to a Learner can be
// read back unchanged, and that both survive a SaveModel/LoadModel round trip
// through a JSON document.
TEST(Learner, FeatureInfo) {
  size_t constexpr kCols = 10;
  auto m = RandomDataGenerator{10, kCols, 0}.GenerateDMatrix(true);

  // One name per column: "f0", "f1", ..., "f9".
  std::vector<std::string> names(kCols);
  for (size_t i = 0; i < kCols; ++i) {
    names[i] = ("f" + std::to_string(i));
  }

  // Start with "q" for every column, then override a few entries so that the
  // type list is not uniform and a positional mix-up would be detected.
  std::vector<std::string> types(kCols, "q");
  types[8] = "f";
  types[0] = "int";
  types[3] = "i";
  types[7] = "i";

  std::vector<std::string> out_names;
  std::vector<std::string> out_types;

  Json model{Object()};
  {
    // Phase 1: set the feature info, read it back, then serialise the learner.
    std::unique_ptr<Learner> learner{Learner::Create({m})};
    learner->Configure();

    learner->SetFeatureNames(names);
    learner->GetFeatureNames(&out_names);

    learner->SetFeatureTypes(types);
    learner->GetFeatureTypes(&out_types);

    // Whole-container comparison: also fails when the output is empty or has
    // a different length, which an iterator-range std::equal would not catch.
    ASSERT_EQ(out_names, names);
    ASSERT_EQ(out_types, types);

    learner->SaveModel(&model);
  }

  // Drop the values obtained in phase 1 so the checks below can only pass if
  // the freshly loaded learner actually reports the feature info itself.
  out_names.clear();
  out_types.clear();

  {
    // Phase 2: a new learner must recover the feature info from the model.
    std::unique_ptr<Learner> learner{Learner::Create({m})};
    learner->LoadModel(model);

    learner->GetFeatureNames(&out_names);
    learner->GetFeatureTypes(&out_types);

    ASSERT_EQ(out_names, names);
    ASSERT_EQ(out_types, types);
  }
}
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -217,8 +217,8 @@ class TestModels:
|
||||
X = np.random.random((10, 3))
|
||||
y = np.random.randint(2, size=(10,))
|
||||
|
||||
dm1 = xgb.DMatrix(X, y)
|
||||
dm2 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
|
||||
dm1 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
|
||||
dm2 = xgb.DMatrix(X, y)
|
||||
|
||||
bst = xgb.train([], dm1)
|
||||
bst.predict(dm1) # success
|
||||
@@ -228,9 +228,6 @@ class TestModels:
|
||||
|
||||
bst = xgb.train([], dm2)
|
||||
bst.predict(dm2) # success
|
||||
with pytest.raises(ValueError):
|
||||
bst.predict(dm1)
|
||||
bst.predict(dm2) # success
|
||||
|
||||
def test_model_binary_io(self):
|
||||
model_path = 'test_model_binary_io.bin'
|
||||
@@ -458,3 +455,31 @@ class TestModels:
|
||||
merged = predt_0 + predt_1 - 0.5
|
||||
single = booster[1:7].predict(dtrain, output_margin=True)
|
||||
np.testing.assert_allclose(merged, single, atol=1e-6)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_pandas())
def test_feature_info(self):
    """Feature names/types must propagate from DMatrix to Booster and
    survive a save/load round trip through a JSON model file.
    """
    import os
    import pandas as pd

    rows = 100
    cols = 10
    X = rng.randn(rows, cols)
    y = rng.randn(rows)
    feature_names = ["test_feature_" + str(i) for i in range(cols)]
    X_pd = pd.DataFrame(X, columns=feature_names)
    # Turn one column into integers so the feature types are not uniform.
    # Assign by column label: this replaces the column (and its dtype),
    # whereas an `.iloc[:, 3] = ...` assignment may write into the existing
    # float column in place.  The builtin `int` is used because the
    # `np.int` alias is deprecated.
    int_column = feature_names[3]
    X_pd[int_column] = X_pd[int_column].astype(int)

    Xy = xgb.DMatrix(X_pd, y)
    assert Xy.feature_types[3] == "int"
    booster = xgb.train({}, dtrain=Xy, num_boost_round=1)

    # The trained booster carries the feature info of its training matrix.
    assert booster.feature_names == Xy.feature_names
    assert booster.feature_names == feature_names
    assert booster.feature_types == Xy.feature_types

    with tempfile.TemporaryDirectory() as tmpdir:
        # Join with a separator so the file lives inside the temporary
        # directory and is removed together with it.
        path = os.path.join(tmpdir, "model.json")
        booster.save_model(path)
        booster = xgb.Booster()
        booster.load_model(path)

        # A fresh Booster restores the feature info from the saved model.
        assert booster.feature_names == Xy.feature_names
        assert booster.feature_names == feature_names
        assert booster.feature_types == Xy.feature_types
|
||||
|
||||
@@ -95,6 +95,11 @@ eval[test] = {data_path}
|
||||
}
|
||||
data = xgboost.DMatrix(data_path)
|
||||
booster = xgboost.train(parameters, data, num_boost_round=10)
|
||||
|
||||
# CLI model doesn't contain feature info.
|
||||
booster.feature_names = None
|
||||
booster.feature_types = None
|
||||
|
||||
booster.save_model(model_out_py)
|
||||
py_predt = booster.predict(data)
|
||||
|
||||
|
||||
@@ -180,7 +180,7 @@ class TestDMatrix:
|
||||
|
||||
# reset
|
||||
dm.feature_names = None
|
||||
assert dm.feature_names == ['f0', 'f1', 'f2', 'f3', 'f4']
|
||||
assert dm.feature_names is None
|
||||
assert dm.feature_types is None
|
||||
|
||||
def test_feature_names(self):
|
||||
|
||||
Reference in New Issue
Block a user