Group aware GPU sketching. (#5551)

* Group aware GPU weighted sketching. * Distribute group weights to each data point. * Relax the test. * Validate input meta info. * Fix metainfo copy ctor.
2020-04-20 17:18:52 +08:00
parent 397d8f0ee7
commit 29a4cfe400
9 changed files with 296 additions and 124 deletions
--- a/tests/python-gpu/test_gpu_ranking.py
+++ b/tests/python-gpu/test_gpu_ranking.py
@@ -1,14 +1,13 @@
 import numpy as np
-from scipy.sparse import csr_matrix
 import xgboost
 import os
-import math
 import unittest
 import itertools
 import shutil
 import urllib.request
 import zipfile

+
 class TestRanking(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
@@ -22,7 +21,7 @@ class TestRanking(unittest.TestCase):
        target = cls.dpath + '/MQ2008.zip'

        if os.path.exists(cls.dpath) and os.path.exists(target):
-            print ("Skipping dataset download...")
+            print("Skipping dataset download...")
        else:
            urllib.request.urlretrieve(url=src, filename=target)
            with zipfile.ZipFile(target, 'r') as f:
@@ -50,17 +49,30 @@ class TestRanking(unittest.TestCase):
        cls.qid_test = qid_test
        cls.qid_valid = qid_valid

+        def setup_weighted(x, y, groups):
+            # Setup weighted data
+            data = xgboost.DMatrix(x, y)
+            groups_segment = [len(list(items))
+                              for _key, items in itertools.groupby(groups)]
+            data.set_group(groups_segment)
+            n_groups = len(groups_segment)
+            weights = np.ones((n_groups,))
+            data.set_weight(weights)
+            return data
+
+        cls.dtrain_w = setup_weighted(x_train, y_train, qid_train)
+        cls.dtest_w = setup_weighted(x_test, y_test, qid_test)
+        cls.dvalid_w = setup_weighted(x_valid, y_valid, qid_valid)
+
        # model training parameters
        cls.params = {'booster': 'gbtree',
                      'tree_method': 'gpu_hist',
                      'gpu_id': 0,
-                      'predictor': 'gpu_predictor'
-                     }
+                      'predictor': 'gpu_predictor'}
        cls.cpu_params = {'booster': 'gbtree',
                          'tree_method': 'hist',
                          'gpu_id': -1,
-                          'predictor': 'cpu_predictor'
-                         }
+                          'predictor': 'cpu_predictor'}

    @classmethod
    def tearDownClass(cls):
@@ -81,30 +93,46 @@ class TestRanking(unittest.TestCase):
        # specify validations set to watch performance
        watchlist = [(cls.dtest, 'eval'), (cls.dtrain, 'train')]

-        num_trees=2500
-        check_metric_improvement_rounds=10
+        num_trees = 2500
+        check_metric_improvement_rounds = 10

        evals_result = {}
        cls.params['objective'] = rank_objective
        cls.params['eval_metric'] = metric_name
-        bst = xgboost.train(cls.params, cls.dtrain, num_boost_round=num_trees,
-                            early_stopping_rounds=check_metric_improvement_rounds,
-                            evals=watchlist, evals_result=evals_result)
+        bst = xgboost.train(
+            cls.params, cls.dtrain, num_boost_round=num_trees,
+            early_stopping_rounds=check_metric_improvement_rounds,
+            evals=watchlist, evals_result=evals_result)
        gpu_map_metric = evals_result['train'][metric_name][-1]

        evals_result = {}
        cls.cpu_params['objective'] = rank_objective
        cls.cpu_params['eval_metric'] = metric_name
-        bstc = xgboost.train(cls.cpu_params, cls.dtrain, num_boost_round=num_trees,
-                             early_stopping_rounds=check_metric_improvement_rounds,
-                             evals=watchlist, evals_result=evals_result)
+        bstc = xgboost.train(
+            cls.cpu_params, cls.dtrain, num_boost_round=num_trees,
+            early_stopping_rounds=check_metric_improvement_rounds,
+            evals=watchlist, evals_result=evals_result)
        cpu_map_metric = evals_result['train'][metric_name][-1]

-        print("{0} gpu {1} metric {2}".format(rank_objective, metric_name, gpu_map_metric))
-        print("{0} cpu {1} metric {2}".format(rank_objective, metric_name, cpu_map_metric))
-        print("gpu best score {0} cpu best score {1}".format(bst.best_score, bstc.best_score))
-        assert np.allclose(gpu_map_metric, cpu_map_metric, tolerance, tolerance)
-        assert np.allclose(bst.best_score, bstc.best_score, tolerance, tolerance)
+        assert np.allclose(gpu_map_metric, cpu_map_metric, tolerance,
+                           tolerance)
+        assert np.allclose(bst.best_score, bstc.best_score, tolerance,
+                           tolerance)
+
+        evals_result_weighted = {}
+        watchlist = [(cls.dtest_w, 'eval'), (cls.dtrain_w, 'train')]
+        bst_w = xgboost.train(
+            cls.params, cls.dtrain_w, num_boost_round=num_trees,
+            early_stopping_rounds=check_metric_improvement_rounds,
+            evals=watchlist, evals_result=evals_result_weighted)
+        weighted_metric = evals_result_weighted['train'][metric_name][-1]
+        # GPU Ranking is not deterministic due to `AtomicAddGpair`,
+        # remove tolerance once the issue is resolved.
+        # https://github.com/dmlc/xgboost/issues/5561
+        assert np.allclose(bst_w.best_score, bst.best_score,
+                           tolerance, tolerance)
+        assert np.allclose(weighted_metric, gpu_map_metric,
+                           tolerance, tolerance)

    def test_training_rank_pairwise_map_metric(self):
        """