- ndcg ltr implementation on gpu (#5004)

* - ndcg ltr implementation on gpu - this is a follow-up to the pairwise ltr implementation
2019-11-12 14:21:04 -08:00
parent f4e7b707c9
commit 2abe69d774
5 changed files with 780 additions and 202 deletions
--- a/tests/cpp/objective/test_ranking_obj.cc
+++ b/tests/cpp/objective/test_ranking_obj.cc
@@ -76,4 +76,33 @@ TEST(Objective, DeclareUnifiedTest(PairwiseRankingGPairSameLabels)) {
  ASSERT_NO_THROW(obj->DefaultEvalMetric());
 }

+TEST(Objective, DeclareUnifiedTest(NDCGRankingGPair)) {  // exercises the "rank:ndcg" objective
+  std::vector<std::pair<std::string, std::string>> args;  // stays empty: no explicit settings
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
+
+  std::unique_ptr<xgboost::ObjFunction> obj {
+    xgboost::ObjFunction::Create("rank:ndcg", &lparam)
+  };
+  obj->Configure(args);
+  CheckConfigReload(obj, "rank:ndcg");
+
+  // Group weights {2, 0}: group 1 expectations roughly double vs. the call below, group 2's are 0
+  CheckRankingObjFunction(obj,  // NOTE(review): argument roles below are inferred - confirm
+                          {0, 0.1f, 0, 0.1f},  // presumably predictions
+                          {0,   1, 0, 1},  // presumably labels
+                          {2.0f, 0.0f},  // weights, one entry per query group
+                          {0, 2, 4},  // presumably group offsets: two groups of two items
+                          {0.7f, -0.7f, 0.0f, 0.0f},  // presumably expected gradients
+                          {0.74f, 0.74f, 0.0f, 0.0f});  // presumably expected hessians
+
+  CheckRankingObjFunction(obj,  // same inputs, but both groups carry a weight of 1
+                          {0, 0.1f, 0, 0.1f},
+                          {0,   1, 0, 1},
+                          {1.0f, 1.0f},
+                          {0, 2, 4},
+                          {0.35f, -0.35f,  0.35f, -0.35f},  // both groups now expect equal values
+                          {0.368f, 0.368f, 0.368f, 0.368f});
+  ASSERT_NO_THROW(obj->DefaultEvalMetric());
+}
+
 }  // namespace xgboost
--- a/tests/cpp/objective/test_ranking_obj_gpu.cu
+++ b/tests/cpp/objective/test_ranking_obj_gpu.cu
@@ -1 +1,159 @@
 #include "test_ranking_obj.cc"
+
+#include "../../../src/objective/rank_obj.cu"
+
+namespace xgboost {
+
+// Sorts `hlabels` segment by segment on the device with `Comparator` and verifies the outcome.
+//
+// \param group_indices           segment boundaries; the checks below require the last entry to
+//                                equal the number of items and `size() - 1` to equal the number
+//                                of groups reported by the sorter
+// \param hlabels                 unsorted host labels that get copied to the device
+// \param expected_sorted_hlabels labels expected once every segment has been sorted
+// \param expected_orig_pos       for every sorted slot, the expected index of that item in
+//                                `hlabels`
+// \return the sorter that performed the sort, so that callers can keep using its state
+template <typename T = uint32_t, typename Comparator = thrust::greater<T>>
+std::unique_ptr<xgboost::obj::SegmentSorter<T>>
+RankSegmentSorterTestImpl(const std::vector<uint32_t> &group_indices,
+                          const std::vector<T> &hlabels,
+                          const std::vector<T> &expected_sorted_hlabels,
+                          const std::vector<uint32_t> &expected_orig_pos
+                          ) {
+  std::unique_ptr<xgboost::obj::SegmentSorter<T>> seg_sorter_ptr(
+    new xgboost::obj::SegmentSorter<T>);
+  xgboost::obj::SegmentSorter<T> &seg_sorter(*seg_sorter_ptr);
+
+  // Create a bunch of unsorted labels on the device and sort it via the segment sorter
+  dh::device_vector<T> dlabels(hlabels);
+  seg_sorter.SortItems(dlabels.data().get(), dlabels.size(), group_indices, Comparator());
+
+  EXPECT_EQ(seg_sorter.NumItems(), group_indices.back());
+  EXPECT_EQ(seg_sorter.NumGroups(), group_indices.size() - 1);
+
+  // Check the labels: stage the sorter's items in a device vector, then copy them to the host
+  dh::device_vector<T> sorted_dlabels(seg_sorter.NumItems());
+  sorted_dlabels.assign(thrust::device_ptr<const T>(seg_sorter.Items()),
+                        thrust::device_ptr<const T>(seg_sorter.Items())
+                        + seg_sorter.NumItems());
+  thrust::host_vector<T> sorted_hlabels(sorted_dlabels);
+  EXPECT_EQ(expected_sorted_hlabels, sorted_hlabels);
+
+  // Check the indices: same staging as for the labels. The copy that is compared against the
+  // host-side expectation is a host vector, mirroring the label check above.
+  dh::device_vector<uint32_t> dorig_pos(seg_sorter.NumItems());
+  dorig_pos.assign(thrust::device_ptr<const uint32_t>(seg_sorter.OriginalPositions()),
+                   thrust::device_ptr<const uint32_t>(seg_sorter.OriginalPositions())
+                   + seg_sorter.NumItems());
+  thrust::host_vector<uint32_t> horig_pos(dorig_pos);
+  EXPECT_EQ(expected_orig_pos, horig_pos);
+
+  return seg_sorter_ptr;
+}
+
+TEST(Objective, RankSegmentSorterTest) {
+  // Eight segments (sizes 2, 2, 3, 3, 4, 4, 4, 4), each sorted with the default comparator
+  const std::vector<uint32_t> group_offsets{0, 2, 4, 7, 10, 14, 18, 22, 26};
+  const std::vector<uint32_t> unsorted_labels{1, 1,
+                                              1, 2,
+                                              3, 2, 1,
+                                              1, 2, 1,
+                                              1, 3, 4, 2,
+                                              1, 2, 1, 1,
+                                              1, 2, 2, 3,
+                                              3, 3, 1, 2};
+  const std::vector<uint32_t> sorted_labels{1, 1,
+                                            2, 1,
+                                            3, 2, 1,
+                                            2, 1, 1,
+                                            4, 3, 2, 1,
+                                            2, 1, 1, 1,
+                                            3, 2, 2, 1,
+                                            3, 3, 2, 1};
+  // Index into `unsorted_labels` of the item that ends up in each sorted slot
+  const std::vector<uint32_t> original_positions{0, 1,
+                                                 3, 2,
+                                                 4, 5, 6,
+                                                 8, 7, 9,
+                                                 12, 11, 13, 10,
+                                                 15, 14, 16, 17,
+                                                 21, 19, 20, 18,
+                                                 22, 23, 25, 24};
+
+  RankSegmentSorterTestImpl(group_offsets, unsorted_labels, sorted_labels, original_positions);
+}
+
+TEST(Objective, RankSegmentSorterSingleGroupTest) {
+  // A single segment spanning all seven items
+  const std::vector<uint32_t> group_offsets{0, 7};
+  const std::vector<uint32_t> unsorted_labels{6, 1, 4, 3, 0, 5, 2};
+  const std::vector<uint32_t> sorted_labels{6, 5, 4, 3, 2, 1, 0};
+  // Index into `unsorted_labels` of the item that ends up in each sorted slot
+  const std::vector<uint32_t> original_positions{0, 5, 2, 3, 6, 1, 4};
+
+  RankSegmentSorterTestImpl(group_offsets, unsorted_labels, sorted_labels, original_positions);
+}
+
+TEST(Objective, RankSegmentSorterAscendingTest) {
+  // Two segments (sizes 4 and 3) sorted with thrust::less instead of the default comparator
+  const std::vector<uint32_t> group_offsets{0, 4, 7};
+  const std::vector<uint32_t> unsorted_labels{3, 1, 4, 2,
+                                              6, 5, 7};
+  const std::vector<uint32_t> sorted_labels{1, 2, 3, 4,
+                                            5, 6, 7};
+  // Index into `unsorted_labels` of the item that ends up in each sorted slot
+  const std::vector<uint32_t> original_positions{1, 3, 0, 2,
+                                                 5, 4, 6};
+
+  RankSegmentSorterTestImpl<uint32_t, thrust::less<uint32_t>>(
+    group_offsets, unsorted_labels, sorted_labels, original_positions);
+}
+
+// Signature of the counting helpers under test, invoked below as (items, item count, value)
+using CountFunctor = uint32_t (*)(const int *, uint32_t, int);
+// Checks that `find_val` is present in `sorted_items` and that `f` reports `exp_val` for it.
+//
+// \param sorted_items items handed to `f`; the callers in this file pass descending data
+// \param f            counting function under test
+// \param find_val     value whose neighbouring items are counted
+// \param exp_val      count that `f` is expected to return
+void RankItemCountImpl(const std::vector<int> &sorted_items, CountFunctor f,
+                       int find_val, uint32_t exp_val) {
+  EXPECT_NE(std::find(sorted_items.begin(), sorted_items.end(), find_val), sorted_items.end());
+  // data() stays well defined for an empty vector, unlike &sorted_items[0]; the cast spells out
+  // the size_t -> uint32_t conversion required by CountFunctor
+  EXPECT_EQ(f(sorted_items.data(), static_cast<uint32_t>(sorted_items.size()), find_val),
+            exp_val);
+}
+
+TEST(Objective, RankItemCountOnLeft) {
+  // Descending input shared by every case
+  const std::vector<int> descending_items{10, 10, 6, 4, 4, 4, 4, 1, 1, 1, 1, 1, 0};
+  // {value, number of items expected to its left}
+  const std::vector<std::pair<int, uint32_t>> value_and_count{
+    {10, 0}, {6, 2}, {4, 3}, {1, 7}, {0, 12}};
+
+  for (const auto &entry : value_and_count) {
+    RankItemCountImpl(descending_items, &xgboost::obj::CountNumItemsToTheLeftOf,
+                      entry.first, entry.second);
+  }
+}
+
+TEST(Objective, RankItemCountOnRight) {
+  // Descending input shared by every case
+  const std::vector<int> descending_items{10, 10, 6, 4, 4, 4, 4, 1, 1, 1, 1, 1, 0};
+  // {value, number of items expected to its right}
+  const std::vector<std::pair<int, uint32_t>> value_and_count{
+    {10, 11}, {6, 10}, {4, 6}, {1, 1}, {0, 0}};
+
+  for (const auto &entry : value_and_count) {
+    RankItemCountImpl(descending_items, &xgboost::obj::CountNumItemsToTheRightOf,
+                      entry.first, entry.second);
+  }
+}
+
+TEST(Objective, NDCGLambdaWeightComputerTest) {
+  auto segment_label_sorter = RankSegmentSorterTestImpl<float>(  // sorts and checks the labels
+    {0, 4, 7, 12},                  // Groups
+    {3.1f, 1.2f, 2.3f, 4.4f,        // Labels
+     7.8f, 5.01f, 6.96f,
+     10.3f, 8.7f, 11.4f, 9.45f, 11.4f},
+    {4.4f, 3.1f, 2.3f, 1.2f,        // Expected sorted labels
+     7.8f, 6.96f, 5.01f,
+     11.4f, 11.4f, 10.3f, 9.45f, 8.7f},
+    {3, 0, 2, 1,                    // Expected original positions
+     4, 6, 5,
+     9, 11, 7, 10, 8});
+
+  // Create segmented predictions, one per label above and laid out in the same three groups
+  std::vector<bst_float> hpreds{-9.78f, 24.367f, 0.908f, -11.47f,
+                                -1.03f, -2.79f, -3.1f,
+                                104.22f, 103.1f, -101.7f, 100.5f, 45.1f};
+  dh::device_vector<bst_float> dpreds(hpreds);  // device copy handed to the computer below
+  xgboost::obj::NDCGLambdaWeightComputer ndcg_lw_computer(dpreds.data().get(),
+                                                          dpreds.size(),
+                                                          *segment_label_sorter);
+
+  // For every prediction, the position it would move to if the predictions of its group were
+  // sorted in descending order (positions are counted from the start of the whole array)
+  auto dsorted_pred_pos = ndcg_lw_computer.GetSortedPredPos();
+  thrust::host_vector<uint32_t> hsorted_pred_pos(dsorted_pred_pos);  // host copy for comparison
+  std::vector<uint32_t> expected_sorted_pred_pos{2, 0, 1, 3,
+                                                 4, 5, 6,
+                                                 7, 8, 11, 9, 10};
+  EXPECT_EQ(expected_sorted_pred_pos, hsorted_pred_pos);
+}
+
+}  // namespace xgboost
--- a/tests/python-gpu/test_gpu_ranking.py
+++ b/tests/python-gpu/test_gpu_ranking.py
@@ -0,0 +1,143 @@
+import numpy as np
+from scipy.sparse import csr_matrix
+import xgboost
+import os
+import math
+import unittest
+import itertools
+import shutil
+import urllib.request
+import zipfile
+
+class TestRanking(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        """
+        Download and setup the test fixtures
+        """
+        from sklearn.datasets import load_svmlight_files
+        # download the test data
+        cls.dpath = 'demo/rank/'
+        src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
+        target = cls.dpath + '/MQ2008.zip'
+
+        if os.path.exists(cls.dpath) and os.path.exists(target):
+            print ("Skipping dataset download...")
+        else:
+            urllib.request.urlretrieve(url=src, filename=target)
+            with zipfile.ZipFile(target, 'r') as f:
+                f.extractall(path=cls.dpath)
+
+        (x_train, y_train, qid_train, x_test, y_test, qid_test,
+         x_valid, y_valid, qid_valid) = load_svmlight_files(
+            (cls.dpath + "MQ2008/Fold1/train.txt",
+             cls.dpath + "MQ2008/Fold1/test.txt",
+             cls.dpath + "MQ2008/Fold1/vali.txt"),
+            query_id=True, zero_based=False)
+        # instantiate the matrices
+        cls.dtrain = xgboost.DMatrix(x_train, y_train)
+        cls.dvalid = xgboost.DMatrix(x_valid, y_valid)
+        cls.dtest = xgboost.DMatrix(x_test, y_test)
+        # set the group counts from the query IDs
+        cls.dtrain.set_group([len(list(items))
+                              for _key, items in itertools.groupby(qid_train)])
+        cls.dtest.set_group([len(list(items))
+                             for _key, items in itertools.groupby(qid_test)])
+        cls.dvalid.set_group([len(list(items))
+                              for _key, items in itertools.groupby(qid_valid)])
+        # save the query IDs for testing
+        cls.qid_train = qid_train
+        cls.qid_test = qid_test
+        cls.qid_valid = qid_valid
+
+        # model training parameters
+        cls.params = {'booster': 'gbtree',
+                      'tree_method': 'gpu_hist',
+                      'gpu_id': 0,
+                      'predictor': 'gpu_predictor'
+                     }
+        cls.cpu_params = {'booster': 'gbtree',
+                          'tree_method': 'hist',
+                          'gpu_id': -1,
+                          'predictor': 'cpu_predictor'
+                         }
+
+    @classmethod
+    def tearDownClass(cls):
+        """
+        Cleanup test artifacts from download and unpacking
+        :return:
+        """
+        os.remove(cls.dpath + "MQ2008.zip")
+        shutil.rmtree(cls.dpath + "MQ2008")
+
+    @classmethod
+    def __test_training_with_rank_objective(cls, rank_objective, metric_name, tolerance=1e-02):
+        """
+        Internal method that trains the dataset using the rank objective on GPU and CPU, evaluates
+        the metric and determines if the delta between the metric is within the tolerance level
+        :return:
+        """
+        # specify validations set to watch performance
+        watchlist = [(cls.dtest, 'eval'), (cls.dtrain, 'train')]
+
+        num_trees=2500
+        check_metric_improvement_rounds=10
+
+        evals_result = {}
+        cls.params['objective'] = rank_objective
+        cls.params['eval_metric'] = metric_name
+        bst = xgboost.train(cls.params, cls.dtrain, num_boost_round=num_trees,
+                            early_stopping_rounds=check_metric_improvement_rounds,
+                            evals=watchlist, evals_result=evals_result)
+        gpu_map_metric = evals_result['train'][metric_name][-1]
+
+        evals_result = {}
+        cls.cpu_params['objective'] = rank_objective
+        cls.cpu_params['eval_metric'] = metric_name
+        bstc = xgboost.train(cls.cpu_params, cls.dtrain, num_boost_round=num_trees,
+                             early_stopping_rounds=check_metric_improvement_rounds,
+                             evals=watchlist, evals_result=evals_result)
+        cpu_map_metric = evals_result['train'][metric_name][-1]
+
+        print("{0} gpu {1} metric {2}".format(rank_objective, metric_name, gpu_map_metric))
+        print("{0} cpu {1} metric {2}".format(rank_objective, metric_name, cpu_map_metric))
+        print("gpu best score {0} cpu best score {1}".format(bst.best_score, bstc.best_score))
+        assert np.allclose(gpu_map_metric, cpu_map_metric, tolerance, tolerance)
+        assert np.allclose(bst.best_score, bstc.best_score, tolerance, tolerance)
+
+    def test_training_rank_pairwise_map_metric(self):
+        """
+        Train an XGBoost ranking model with pairwise objective function and compare map metric
+        """
+        self.__test_training_with_rank_objective('rank:pairwise', 'map')
+
+    def test_training_rank_pairwise_auc_metric(self):
+        """
+        Train an XGBoost ranking model with pairwise objective function and compare auc metric
+        """
+        self.__test_training_with_rank_objective('rank:pairwise', 'auc')
+
+    def test_training_rank_pairwise_ndcg_metric(self):
+        """
+        Train an XGBoost ranking model with pairwise objective function and compare ndcg metric
+        """
+        self.__test_training_with_rank_objective('rank:pairwise', 'ndcg')
+
+    def test_training_rank_ndcg_map(self):
+        """
+        Train an XGBoost ranking model with ndcg objective function and compare map metric
+        """
+        self.__test_training_with_rank_objective('rank:ndcg', 'map')
+
+    def test_training_rank_ndcg_auc(self):
+        """
+        Train an XGBoost ranking model with ndcg objective function and compare auc metric
+        """
+        self.__test_training_with_rank_objective('rank:ndcg', 'auc')
+
+    def test_training_rank_ndcg_ndcg(self):
+        """
+        Train an XGBoost ranking model with ndcg objective function and compare ndcg metric
+        """
+        self.__test_training_with_rank_objective('rank:ndcg', 'ndcg')