Rework the MAP metric. (#8931)

- The new implementation is stricter: only binary labels are accepted. The previous implementation converted values greater than 1 to 1.
- Deterministic GPU implementation (no atomic add).
- Fix top-k handling.
- Precise definition of MAP (there are other variants of how to handle top-k).
- Refactor GPU ranking tests.
2023-03-22 17:45:20 +08:00
parent b240f055d3
commit 5891f752c8
18 changed files with 458 additions and 323 deletions
--- a/tests/cpp/common/test_ranking_utils.cc
+++ b/tests/cpp/common/test_ranking_utils.cc
@@ -177,4 +177,36 @@ TEST(NDCGCache, InitFromCPU) {
  Context ctx;
  TestNDCGCache(&ctx);
 }
+
+void TestMAPCache(Context const* ctx) {
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  LambdaRankParam param;
+  param.UpdateAllowUnknown(Args{});
+
+  std::vector<float> h_data(32);
+
+  common::Iota(ctx, h_data.begin(), h_data.end(), 0.0f);
+  info.labels.Reshape(h_data.size());
+  info.num_row_ = h_data.size();
+  info.labels.Data()->HostVector() = std::move(h_data);
+
+  auto fail = [&]() { std::make_shared<MAPCache>(ctx, info, param); };
+  // binary label
+  ASSERT_THROW(fail(), dmlc::Error);
+
+  h_data = std::vector<float>(32, 0.0f);
+  h_data[1] = 1.0f;
+  info.labels.Data()->HostVector() = h_data;
+  auto p_cache = std::make_shared<MAPCache>(ctx, info, param);
+
+  ASSERT_EQ(p_cache->Acc(ctx).size(), info.num_row_);
+  ASSERT_EQ(p_cache->NumRelevant(ctx).size(), info.num_row_);
+}
+
+TEST(MAPCache, InitFromCPU) {
+  Context ctx;
+  ctx.Init(Args{});
+  TestMAPCache(&ctx);
+}
 }  // namespace xgboost::ltr
--- a/tests/cpp/common/test_ranking_utils.cu
+++ b/tests/cpp/common/test_ranking_utils.cu
@@ -95,4 +95,10 @@ TEST(NDCGCache, InitFromGPU) {
  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
  TestNDCGCache(&ctx);
 }
+
+TEST(MAPCache, InitFromGPU) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  TestMAPCache(&ctx);
+}
 }  // namespace xgboost::ltr
--- a/tests/cpp/common/test_ranking_utils.h
+++ b/tests/cpp/common/test_ranking_utils.h
@@ -6,4 +6,6 @@

 namespace xgboost::ltr {
 void TestNDCGCache(Context const* ctx);
+
+void TestMAPCache(Context const* ctx);
 }  // namespace xgboost::ltr
--- a/tests/cpp/metric/test_rank_metric.cc
+++ b/tests/cpp/metric/test_rank_metric.cc
@@ -141,7 +141,7 @@ TEST(Metric, DeclareUnifiedTest(MAP)) {
  // Rank metric with group info
  EXPECT_NEAR(GetMetricEval(metric,
                            {0.1f, 0.9f, 0.2f, 0.8f, 0.4f, 1.7f},
-                            {2, 7, 1, 0, 5, 0},  // Labels
+                            {1, 1, 1, 0, 1, 0},  // Labels
                            {},  // Weights
                            {0, 2, 5, 6}),  // Group info
              0.8611f, 0.001f);
--- a/tests/python-gpu/test_gpu_ranking.py
+++ b/tests/python-gpu/test_gpu_ranking.py
@@ -1,194 +1,130 @@
-import itertools
 import os
-import shutil
-import urllib.request
-import zipfile
+from typing import Dict

 import numpy as np
+import pytest

 import xgboost
 from xgboost import testing as tm

-pytestmark = tm.timeout(10)
+pytestmark = tm.timeout(30)


-class TestRanking:
-    @classmethod
-    def setup_class(cls):
-        """
-        Download and setup the test fixtures
-        """
-        from sklearn.datasets import load_svmlight_files
+def comp_training_with_rank_objective(
+    dtrain: xgboost.DMatrix,
+    dtest: xgboost.DMatrix,
+    rank_objective: str,
+    metric_name: str,
+    tolerance: float = 1e-02,
+) -> None:
+    """Internal method that trains the dataset using the rank objective on GPU and CPU,
+    evaluates the metric and determines if the delta between the metric is within the
+    tolerance level.

-        # download the test data
-        cls.dpath = os.path.join(tm.demo_dir(__file__), "rank/")
-        src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
-        target = os.path.join(cls.dpath, "MQ2008.zip")
+    """
+    # specify validations set to watch performance
+    watchlist = [(dtest, "eval"), (dtrain, "train")]

-        if os.path.exists(cls.dpath) and os.path.exists(target):
-            print("Skipping dataset download...")
-        else:
-            urllib.request.urlretrieve(url=src, filename=target)
-            with zipfile.ZipFile(target, 'r') as f:
-                f.extractall(path=cls.dpath)
+    params = {
+        "booster": "gbtree",
+        "tree_method": "gpu_hist",
+        "gpu_id": 0,
+        "predictor": "gpu_predictor",
+    }

-        (x_train, y_train, qid_train, x_test, y_test, qid_test,
-         x_valid, y_valid, qid_valid) = load_svmlight_files(
-            (cls.dpath + "MQ2008/Fold1/train.txt",
-             cls.dpath + "MQ2008/Fold1/test.txt",
-             cls.dpath + "MQ2008/Fold1/vali.txt"),
-            query_id=True, zero_based=False)
-        # instantiate the matrices
-        cls.dtrain = xgboost.DMatrix(x_train, y_train)
-        cls.dvalid = xgboost.DMatrix(x_valid, y_valid)
-        cls.dtest = xgboost.DMatrix(x_test, y_test)
-        # set the group counts from the query IDs
-        cls.dtrain.set_group([len(list(items))
-                              for _key, items in itertools.groupby(qid_train)])
-        cls.dtest.set_group([len(list(items))
-                             for _key, items in itertools.groupby(qid_test)])
-        cls.dvalid.set_group([len(list(items))
-                              for _key, items in itertools.groupby(qid_valid)])
-        # save the query IDs for testing
-        cls.qid_train = qid_train
-        cls.qid_test = qid_test
-        cls.qid_valid = qid_valid
+    num_trees = 100
+    check_metric_improvement_rounds = 10

-        def setup_weighted(x, y, groups):
-            # Setup weighted data
-            data = xgboost.DMatrix(x, y)
-            groups_segment = [len(list(items))
-                              for _key, items in itertools.groupby(groups)]
-            data.set_group(groups_segment)
-            n_groups = len(groups_segment)
-            weights = np.ones((n_groups,))
-            data.set_weight(weights)
-            return data
+    evals_result: Dict[str, Dict] = {}
+    params["objective"] = rank_objective
+    params["eval_metric"] = metric_name
+    bst = xgboost.train(
+        params,
+        dtrain,
+        num_boost_round=num_trees,
+        early_stopping_rounds=check_metric_improvement_rounds,
+        evals=watchlist,
+        evals_result=evals_result,
+    )
+    gpu_scores = evals_result["train"][metric_name][-1]

-        cls.dtrain_w = setup_weighted(x_train, y_train, qid_train)
-        cls.dtest_w = setup_weighted(x_test, y_test, qid_test)
-        cls.dvalid_w = setup_weighted(x_valid, y_valid, qid_valid)
+    evals_result = {}

-        # model training parameters
-        cls.params = {'booster': 'gbtree',
-                      'tree_method': 'gpu_hist',
-                      'gpu_id': 0,
-                      'predictor': 'gpu_predictor'}
-        cls.cpu_params = {'booster': 'gbtree',
-                          'tree_method': 'hist',
-                          'gpu_id': -1,
-                          'predictor': 'cpu_predictor'}
+    cpu_params = {
+        "booster": "gbtree",
+        "tree_method": "hist",
+        "gpu_id": -1,
+        "predictor": "cpu_predictor",
+    }
+    cpu_params["objective"] = rank_objective
+    cpu_params["eval_metric"] = metric_name
+    bstc = xgboost.train(
+        cpu_params,
+        dtrain,
+        num_boost_round=num_trees,
+        early_stopping_rounds=check_metric_improvement_rounds,
+        evals=watchlist,
+        evals_result=evals_result,
+    )
+    cpu_scores = evals_result["train"][metric_name][-1]

-    @classmethod
-    def teardown_class(cls):
-        """
-        Cleanup test artifacts from download and unpacking
-        :return:
-        """
-        os.remove(os.path.join(cls.dpath, "MQ2008.zip"))
-        shutil.rmtree(os.path.join(cls.dpath, "MQ2008"))
+    info = (rank_objective, metric_name)
+    assert np.allclose(gpu_scores, cpu_scores, tolerance, tolerance), info
+    assert np.allclose(bst.best_score, bstc.best_score, tolerance, tolerance), info

-    @classmethod
-    def __test_training_with_rank_objective(cls, rank_objective, metric_name, tolerance=1e-02):
-        """
-        Internal method that trains the dataset using the rank objective on GPU and CPU, evaluates
-        the metric and determines if the delta between the metric is within the tolerance level
-        :return:
-        """
-        # specify validations set to watch performance
-        watchlist = [(cls.dtest, 'eval'), (cls.dtrain, 'train')]
+    evals_result_weighted: Dict[str, Dict] = {}
+    dtest.set_weight(np.ones((dtest.get_group().size,)))
+    dtrain.set_weight(np.ones((dtrain.get_group().size,)))
+    watchlist = [(dtest, "eval"), (dtrain, "train")]
+    bst_w = xgboost.train(
+        params,
+        dtrain,
+        num_boost_round=num_trees,
+        early_stopping_rounds=check_metric_improvement_rounds,
+        evals=watchlist,
+        evals_result=evals_result_weighted,
+    )
+    weighted_metric = evals_result_weighted["train"][metric_name][-1]

-        num_trees = 100
-        check_metric_improvement_rounds = 10
+    tolerance = 1e-5
+    assert np.allclose(bst_w.best_score, bst.best_score, tolerance, tolerance)
+    assert np.allclose(weighted_metric, gpu_scores, tolerance, tolerance)

-        evals_result = {}
-        cls.params['objective'] = rank_objective
-        cls.params['eval_metric'] = metric_name
-        bst = xgboost.train(
-            cls.params, cls.dtrain, num_boost_round=num_trees,
-            early_stopping_rounds=check_metric_improvement_rounds,
-            evals=watchlist, evals_result=evals_result)
-        gpu_map_metric = evals_result['train'][metric_name][-1]

-        evals_result = {}
-        cls.cpu_params['objective'] = rank_objective
-        cls.cpu_params['eval_metric'] = metric_name
-        bstc = xgboost.train(
-            cls.cpu_params, cls.dtrain, num_boost_round=num_trees,
-            early_stopping_rounds=check_metric_improvement_rounds,
-            evals=watchlist, evals_result=evals_result)
-        cpu_map_metric = evals_result['train'][metric_name][-1]
+@pytest.mark.parametrize(
+    "objective,metric",
+    [
+        ("rank:pairwise", "auc"),
+        ("rank:pairwise", "ndcg"),
+        ("rank:pairwise", "map"),
+        ("rank:ndcg", "auc"),
+        ("rank:ndcg", "ndcg"),
+        ("rank:ndcg", "map"),
+        ("rank:map", "auc"),
+        ("rank:map", "ndcg"),
+        ("rank:map", "map"),
+    ],
+)
+def test_with_mq2008(objective, metric) -> None:
+    (
+        x_train,
+        y_train,
+        qid_train,
+        x_test,
+        y_test,
+        qid_test,
+        x_valid,
+        y_valid,
+        qid_valid,
+    ) = tm.get_mq2008(os.path.join(os.path.join(tm.demo_dir(__file__), "rank")))

-        assert np.allclose(gpu_map_metric, cpu_map_metric, tolerance,
-                           tolerance)
-        assert np.allclose(bst.best_score, bstc.best_score, tolerance,
-                           tolerance)
+    if metric.find("map") != -1 or objective.find("map") != -1:
+        y_train[y_train <= 1] = 0.0
+        y_train[y_train > 1] = 1.0
+        y_test[y_test <= 1] = 0.0
+        y_test[y_test > 1] = 1.0

-        evals_result_weighted = {}
-        watchlist = [(cls.dtest_w, 'eval'), (cls.dtrain_w, 'train')]
-        bst_w = xgboost.train(
-            cls.params, cls.dtrain_w, num_boost_round=num_trees,
-            early_stopping_rounds=check_metric_improvement_rounds,
-            evals=watchlist, evals_result=evals_result_weighted)
-        weighted_metric = evals_result_weighted['train'][metric_name][-1]
-        # GPU Ranking is not deterministic due to `AtomicAddGpair`,
-        # remove tolerance once the issue is resolved.
-        # https://github.com/dmlc/xgboost/issues/5561
-        assert np.allclose(bst_w.best_score, bst.best_score,
-                           tolerance, tolerance)
-        assert np.allclose(weighted_metric, gpu_map_metric,
-                           tolerance, tolerance)
+    dtrain = xgboost.DMatrix(x_train, y_train, qid=qid_train)
+    dtest = xgboost.DMatrix(x_test, y_test, qid=qid_test)

-    def test_training_rank_pairwise_map_metric(self):
-        """
-        Train an XGBoost ranking model with pairwise objective function and compare map metric
-        """
-        self.__test_training_with_rank_objective('rank:pairwise', 'map')
-
-    def test_training_rank_pairwise_auc_metric(self):
-        """
-        Train an XGBoost ranking model with pairwise objective function and compare auc metric
-        """
-        self.__test_training_with_rank_objective('rank:pairwise', 'auc')
-
-    def test_training_rank_pairwise_ndcg_metric(self):
-        """
-        Train an XGBoost ranking model with pairwise objective function and compare ndcg metric
-        """
-        self.__test_training_with_rank_objective('rank:pairwise', 'ndcg')
-
-    def test_training_rank_ndcg_map(self):
-        """
-        Train an XGBoost ranking model with ndcg objective function and compare map metric
-        """
-        self.__test_training_with_rank_objective('rank:ndcg', 'map')
-
-    def test_training_rank_ndcg_auc(self):
-        """
-        Train an XGBoost ranking model with ndcg objective function and compare auc metric
-        """
-        self.__test_training_with_rank_objective('rank:ndcg', 'auc')
-
-    def test_training_rank_ndcg_ndcg(self):
-        """
-        Train an XGBoost ranking model with ndcg objective function and compare ndcg metric
-        """
-        self.__test_training_with_rank_objective('rank:ndcg', 'ndcg')
-
-    def test_training_rank_map_map(self):
-        """
-        Train an XGBoost ranking model with map objective function and compare map metric
-        """
-        self.__test_training_with_rank_objective('rank:map', 'map')
-
-    def test_training_rank_map_auc(self):
-        """
-        Train an XGBoost ranking model with map objective function and compare auc metric
-        """
-        self.__test_training_with_rank_objective('rank:map', 'auc')
-
-    def test_training_rank_map_ndcg(self):
-        """
-        Train an XGBoost ranking model with map objective function and compare ndcg metric
-        """
-        self.__test_training_with_rank_objective('rank:map', 'ndcg')
+    comp_training_with_rank_objective(dtrain, dtest, objective, metric)
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -128,12 +128,23 @@ def test_ranking():

    x_test = np.random.rand(100, 10)

-    params = {'tree_method': 'exact', 'objective': 'rank:pairwise',
-              'learning_rate': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1,
-              'max_depth': 6, 'n_estimators': 4}
+    params = {
+        "tree_method": "exact",
+        "learning_rate": 0.1,
+        "gamma": 1.0,
+        "min_child_weight": 0.1,
+        "max_depth": 6,
+        "eval_metric": "ndcg",
+        "n_estimators": 4,
+    }
    model = xgb.sklearn.XGBRanker(**params)
-    model.fit(x_train, y_train, group=train_group,
-              eval_set=[(x_valid, y_valid)], eval_group=[valid_group])
+    model.fit(
+        x_train,
+        y_train,
+        group=train_group,
+        eval_set=[(x_valid, y_valid)],
+        eval_group=[valid_group],
+    )
    assert model.evals_result()

    pred = model.predict(x_test)
@@ -145,11 +156,18 @@ def test_ranking():
    assert train_data.get_label().shape[0] == x_train.shape[0]
    valid_data.set_group(valid_group)

-    params_orig = {'tree_method': 'exact', 'objective': 'rank:pairwise',
-                   'eta': 0.1, 'gamma': 1.0,
-                   'min_child_weight': 0.1, 'max_depth': 6}
-    xgb_model_orig = xgb.train(params_orig, train_data, num_boost_round=4,
-                               evals=[(valid_data, 'validation')])
+    params_orig = {
+        "tree_method": "exact",
+        "objective": "rank:pairwise",
+        "eta": 0.1,
+        "gamma": 1.0,
+        "min_child_weight": 0.1,
+        "max_depth": 6,
+        "eval_metric": "ndcg",
+    }
+    xgb_model_orig = xgb.train(
+        params_orig, train_data, num_boost_round=4, evals=[(valid_data, "validation")]
+    )
    pred_orig = xgb_model_orig.predict(test_data)

    np.testing.assert_almost_equal(pred, pred_orig)
@@ -165,7 +183,11 @@ def test_ranking_metric() -> None:
    # sklearn compares the number of mis-classified docs, while the one in xgboost
    # compares the number of mis-classified pairs.
    ltr = xgb.XGBRanker(
-        eval_metric=roc_auc_score, n_estimators=10, tree_method="hist", max_depth=2
+        eval_metric=roc_auc_score,
+        n_estimators=10,
+        tree_method="hist",
+        max_depth=2,
+        objective="rank:pairwise",
    )
    ltr.fit(
        X,