From 067b704e58c5a4d97fd64c1871e79e9ace29efe9 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 6 Jan 2023 01:17:49 +0800
Subject: [PATCH] [backport] Fix inference with categorical feature. (#8591)
 (#8602) (#8638)

* Fix inference with categorical feature. (#8591)

* Fix windows build on buildkite. (#8602)

* workaround.
---
 doc/tutorials/categorical.rst        | 10 ++---
 src/common/categorical.h             | 17 ++++----
 src/common/partition_builder.h       |  4 +-
 src/predictor/predict_fn.h           |  4 +-
 src/tree/updater_gpu_hist.cu         |  5 +--
 tests/cpp/common/test_categorical.cc | 64 +++++++++++++++++++++++++---
 tests/python/test_with_sklearn.py    |  6 +--
 7 files changed, 79 insertions(+), 31 deletions(-)

diff --git a/doc/tutorials/categorical.rst b/doc/tutorials/categorical.rst
index 1c090801f..97877f23c 100644
--- a/doc/tutorials/categorical.rst
+++ b/doc/tutorials/categorical.rst
@@ -138,11 +138,11 @@ Miscellaneous
 
 By default, XGBoost assumes input categories are integers starting from 0 till the number
 of categories :math:`[0, n\_categories)`. However, user might provide inputs with invalid
-values due to mistakes or missing values. It can be negative value, integer values that
-can not be accurately represented by 32-bit floating point, or values that are larger than
-actual number of unique categories. During training this is validated but for prediction
-it's treated as the same as missing value for performance reasons. Lastly, missing values
-are treated as the same as numerical features (using the learned split direction).
+values due to mistakes or missing values in the training dataset. These can be negative
+values, integer values that cannot be accurately represented by 32-bit floating point, or
+values larger than the actual number of unique categories. During training this is
+validated, but for prediction such values are treated the same as a not-chosen category
+for performance reasons.
 
 
 **********
diff --git a/src/common/categorical.h b/src/common/categorical.h
index ead5f570c..452aaa8c1 100644
--- a/src/common/categorical.h
+++ b/src/common/categorical.h
@@ -48,20 +48,21 @@ inline XGBOOST_DEVICE bool InvalidCat(float cat) {
   return cat < 0 || cat >= kMaxCat;
 }
 
-/* \brief Whether should it traverse to left branch of a tree.
+/**
+ * \brief Whether it should traverse to the left branch of a tree.
  *
- * For one hot split, go to left if it's NOT the matching category.
+ * Go left if it is NOT the matching category, which matches one-hot encoding.
  */
-template <bool validate = true>
-inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat, bool dft_left) {
+inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat) {
   KCatBitField const s_cats(cats);
-  // FIXME: Size() is not accurate since it represents the size of bit set instead of
-  // actual number of categories.
-  if (XGBOOST_EXPECT(validate && (InvalidCat(cat) || cat >= s_cats.Size()), false)) {
-    return dft_left;
+  if (XGBOOST_EXPECT(InvalidCat(cat), false)) {
+    return true;
   }
 
   auto pos = KCatBitField::ToBitPos(cat);
+  // If the input category is larger than the size of the bit field, the category was
+  // not chosen during training; otherwise the bit field would have been large enough
+  // to hold it.
+  if (pos.int_pos >= cats.size()) {
+    return true;
+  }
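To make the new control flow concrete, here is a minimal Python sketch of the patched Decision() routine. The helper name, the BITS_PER_WORD constant, and the list-of-words representation are illustrative only; they mirror KCatBitField but are not part of the XGBoost API, and the kMaxCat upper bound from InvalidCat() is omitted for brevity.

import math

BITS_PER_WORD = 32  # KCatBitField packs category flags into 32-bit words


def decision(cat_bits, cat):
    """Return True to traverse left, i.e. `cat` is NOT a chosen category."""
    # Invalid categories (negative or NaN) now always go left instead of
    # following the per-node default (missing-value) direction.
    if math.isnan(cat) or cat < 0.0:
        return True
    pos = int(cat)  # like ToBitPos: rounds toward zero
    word, offset = divmod(pos, BITS_PER_WORD)
    # A category beyond the stored bit field was never chosen for this split.
    if word >= len(cat_bits):
        return True
    # Go left only when the bit for this category is not set.
    return not (cat_bits[word] >> offset) & 1


assert decision([0b10], 1.0) is False  # category 1 was chosen: go right
assert decision([0b10], 64.0) is True  # past the bit field: go left

Routing invalid and unseen categories to a fixed branch, rather than to the per-node default direction, is what keeps training-time partitioning and inference consistent.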
diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h
index 34864ee90..568e611b7 100644
--- a/src/common/partition_builder.h
+++ b/src/common/partition_builder.h
@@ -144,7 +144,7 @@ class PartitionBuilder {
         auto gidx = gidx_calc(ridx);
         bool go_left = default_left;
         if (gidx > -1) {
-          go_left = Decision(node_cats, cut_values[gidx], default_left);
+          go_left = Decision(node_cats, cut_values[gidx]);
         }
         return go_left;
       } else {
@@ -157,7 +157,7 @@ class PartitionBuilder {
         bool go_left = default_left;
         if (gidx > -1) {
           if (is_cat) {
-            go_left = Decision(node_cats, cut_values[gidx], default_left);
+            go_left = Decision(node_cats, cut_values[gidx]);
           } else {
             go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
           }
diff --git a/src/predictor/predict_fn.h b/src/predictor/predict_fn.h
index 7ce474023..5d0c175fc 100644
--- a/src/predictor/predict_fn.h
+++ b/src/predictor/predict_fn.h
@@ -18,9 +18,7 @@ inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bs
   if (has_categorical && common::IsCat(cats.split_type, nid)) {
     auto node_categories =
         cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
-    return common::Decision(node_categories, fvalue, node.DefaultLeft())
-               ? node.LeftChild()
-               : node.RightChild();
+    return common::Decision(node_categories, fvalue) ? node.LeftChild() : node.RightChild();
   } else {
     return node.LeftChild() + !(fvalue < node.SplitCond());
   }
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index b3b3004a2..b90a7ce09 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -403,8 +403,7 @@ struct GPUHistMakerDevice {
           go_left = data.split_node.DefaultLeft();
         } else {
           if (data.split_type == FeatureType::kCategorical) {
-            go_left = common::Decision(data.node_cats.Bits(), cut_value,
-                                       data.split_node.DefaultLeft());
+            go_left = common::Decision(data.node_cats.Bits(), cut_value);
           } else {
             go_left = cut_value <= data.split_node.SplitCond();
           }
@@ -481,7 +480,7 @@
         if (common::IsCat(d_feature_types, position)) {
           auto node_cats = categories.subspan(categories_segments[position].beg,
                                               categories_segments[position].size);
-          go_left = common::Decision(node_cats, element, node.DefaultLeft());
+          go_left = common::Decision(node_cats, element);
         } else {
           go_left = element <= node.SplitCond();
         }
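The predictor hunks above all make the same substitution: the default-direction argument is dropped from Decision(), while missing values still honor the learned default, which is decided before Decision() is ever reached. Here is a hedged Python sketch of the GetNextNode() routing; the Node record and the plain set standing in for the category bit field are invented for illustration.

from dataclasses import dataclass
from typing import Optional, Set


@dataclass
class Node:
    left: int                     # index of the left child
    right: int                    # index of the right child
    default_left: bool            # learned direction for missing values
    split_cond: float = 0.0       # threshold for numerical splits
    categories: Optional[Set[int]] = None  # chosen categories, if categorical


def next_node(node: Node, fvalue: float, is_missing: bool) -> int:
    # Missing values follow the learned default direction, unchanged by this patch.
    if is_missing:
        return node.left if node.default_left else node.right
    if node.categories is not None:
        # Categorical split: go left exactly when the category is NOT chosen, so
        # unseen categories are routed deterministically (the validity checks from
        # Decision() are omitted here for brevity).
        return node.left if int(fvalue) not in node.categories else node.right
    # Numerical split.
    return node.left if fvalue < node.split_cond else node.right


assert next_node(Node(left=1, right=2, default_left=True, categories={13}), 13.0, False) == 2
assert next_node(Node(left=1, right=2, default_left=False, categories={13}), 14.0, False) == 1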
diff --git a/tests/cpp/common/test_categorical.cc b/tests/cpp/common/test_categorical.cc
index cc8eb0f7e..4e6e696ec 100644
--- a/tests/cpp/common/test_categorical.cc
+++ b/tests/cpp/common/test_categorical.cc
@@ -1,11 +1,14 @@
 /*!
- * Copyright 2021 by XGBoost Contributors
+ * Copyright 2021-2022 by XGBoost Contributors
  */
 #include <gtest/gtest.h>
+#include <xgboost/host_device_vector.h>
+#include <xgboost/learner.h>
 
 #include <limits>
 
 #include "../../../src/common/categorical.h"
+#include "../helpers.h"
 
 namespace xgboost {
 namespace common {
@@ -15,29 +18,76 @@ TEST(Categorical, Decision) {
   ASSERT_TRUE(common::InvalidCat(a));
   std::vector<uint32_t> cats(256, 0);
-  ASSERT_TRUE(Decision(cats, a, true));
+  ASSERT_TRUE(Decision(cats, a));
 
   // larger than size
   a = 256;
-  ASSERT_TRUE(Decision(cats, a, true));
+  ASSERT_TRUE(Decision(cats, a));
   // negative
   a = -1;
-  ASSERT_TRUE(Decision(cats, a, true));
+  ASSERT_TRUE(Decision(cats, a));
 
   CatBitField bits{cats};
   bits.Set(0);
   a = -0.5;
-  ASSERT_TRUE(Decision(cats, a, true));
+  ASSERT_TRUE(Decision(cats, a));
 
   // round toward 0
   a = 0.5;
-  ASSERT_FALSE(Decision(cats, a, true));
+  ASSERT_FALSE(Decision(cats, a));
 
   // valid
   a = 13;
   bits.Set(a);
-  ASSERT_FALSE(Decision(bits.Bits(), a, true));
+  ASSERT_FALSE(Decision(bits.Bits(), a));
+}
+
+/**
+ * Test running inference with an input category greater than the ones stored in the tree.
+ */
+TEST(Categorical, MinimalSet) {
+  std::size_t constexpr kRows = 256, kCols = 1, kCat = 3;
+  std::vector<FeatureType> types{FeatureType::kCategorical};
+  auto Xy =
+      RandomDataGenerator{kRows, kCols, 0.0}.Type(types).MaxCategory(kCat).GenerateDMatrix(true);
+
+  std::unique_ptr<Learner> learner{Learner::Create({Xy})};
+  learner->SetParam("max_depth", "1");
+  learner->SetParam("tree_method", "hist");
+  learner->Configure();
+  learner->UpdateOneIter(0, Xy);
+
+  Json model{Object{}};
+  learner->SaveModel(&model);
+  auto tree = model["learner"]["gradient_booster"]["model"]["trees"][0];
+  ASSERT_GE(get<I32Array const>(tree["categories"]).size(), 1);
+  auto v = get<I32Array const>(tree["categories"])[0];
+
+  HostDeviceVector<float> predt;
+  {
+    std::vector<float> data{static_cast<float>(kCat),
+                            static_cast<float>(kCat + 1), 32.0f, 33.0f, 34.0f};
+    auto test = GetDMatrixFromData(data, data.size(), kCols);
+    learner->Predict(test, false, &predt, 0, 0, false, /*pred_leaf=*/true);
+    ASSERT_EQ(predt.Size(), data.size());
+    auto const& h_predt = predt.ConstHostSpan();
+    for (auto v : h_predt) {
+      ASSERT_EQ(v, 1);  // left child of root node
+    }
+  }
+
+  {
+    std::unique_ptr<Learner> learner{Learner::Create({Xy})};
+    learner->LoadModel(model);
+    std::vector<float> data = {static_cast<float>(v)};
+    auto test = GetDMatrixFromData(data, data.size(), kCols);
+    learner->Predict(test, false, &predt, 0, 0, false, /*pred_leaf=*/true);
+    auto const& h_predt = predt.ConstHostSpan();
+    for (auto v : h_predt) {
+      ASSERT_EQ(v, 2);  // right child of root node
+    }
+  }
 }
 }  // namespace common
 }  // namespace xgboost
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index adbbcd02f..9f1c64625 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -1029,9 +1029,9 @@ def test_pandas_input():
 
     clf_isotonic = CalibratedClassifierCV(model, cv="prefit", method="isotonic")
     clf_isotonic.fit(train, target)
-    assert isinstance(
-        clf_isotonic.calibrated_classifiers_[0].estimator, xgb.XGBClassifier
-    )
+    clf = clf_isotonic.calibrated_classifiers_[0]
+    est = clf.estimator if hasattr(clf, "estimator") else clf.base_estimator
+    assert isinstance(est, xgb.XGBClassifier)
     np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))
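For an end-to-end check at the Python level, the following is a rough counterpart of the new MinimalSet C++ test written against the public interface. It assumes a build with categorical support (XGBoost 1.6 or later) plus pandas, and since exact leaf ids depend on the fitted tree, it only asserts that categories unseen during training are all routed to the same leaf.

import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.default_rng(0)
# A single categorical feature with three categories seen during training.
X = pd.DataFrame({"c": pd.Categorical(rng.integers(0, 3, size=256))})
y = rng.normal(size=256)

dtrain = xgb.DMatrix(X, label=y, enable_categorical=True)
booster = xgb.train({"tree_method": "hist", "max_depth": 1}, dtrain, num_boost_round=1)

# Categories never seen during training should all take the "not chosen"
# branch and therefore land in the same leaf.
X_new = pd.DataFrame({"c": pd.Categorical([3, 4, 32], categories=list(range(33)))})
leaves = booster.predict(xgb.DMatrix(X_new, enable_categorical=True), pred_leaf=True)
assert (leaves == leaves[0]).all()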