* Initial support for distributed LTR (learning to rank) using dask. * Support `qid` in libxgboost. * Refactor `predict`, `n_features_in_`, and `best_[score/iteration/ntree_limit]` to avoid duplicated code. * Define `DaskXGBRanker`. The dask ranker doesn't support group structure; instead, it uses query IDs and converts them to group pointers internally.
170 lines
6.5 KiB
Python
170 lines
6.5 KiB
Python
import numpy as np
|
|
from scipy.sparse import csr_matrix
|
|
import testing as tm
|
|
import xgboost
|
|
import os
|
|
import itertools
|
|
import shutil
|
|
import urllib.request
|
|
import zipfile
|
|
|
|
|
|
def test_ranking_with_unweighted_data():
    """Train a pairwise ranker on unweighted data and check metric progress.

    A 20x4 sparse matrix is split into four query groups of five rows each.
    After ten boosting rounds, the 'auc' and 'aucpr' values recorded on the
    training set must each be non-decreasing from round to round.
    """
    row_idx = np.array([1, 2, 6, 8, 11, 14, 16, 17])
    col_idx = np.array([0, 0, 1, 1, 2, 2, 3, 3])
    features = csr_matrix((np.ones(shape=8), (row_idx, col_idx)), shape=(20, 4))
    labels = np.array([0.0, 1.0, 1.0, 0.0, 0.0,
                       0.0, 1.0, 0.0, 1.0, 0.0,
                       0.0, 1.0, 0.0, 0.0, 1.0,
                       0.0, 1.0, 1.0, 0.0, 0.0])

    dtrain = xgboost.DMatrix(features, label=labels)
    # Four consecutive query groups, five rows each.
    dtrain.set_group(np.array([5, 5, 5, 5], dtype=np.uint))

    params = {'eta': 1, 'tree_method': 'exact',
              'objective': 'rank:pairwise', 'eval_metric': ['auc', 'aucpr'],
              'max_depth': 1}
    history = {}
    xgboost.train(params, dtrain, 10, evals=[(dtrain, 'train')],
                  evals_result=history)

    # Each recorded metric must never get worse between successive rounds.
    for metric in ('auc', 'aucpr'):
        scores = history['train'][metric]
        assert all(prev <= cur for prev, cur in zip(scores, scores[1:]))
|
|
|
|
|
|
def test_ranking_with_weighted_data():
    """Train a pairwise ranker with per-group weights and check its behavior.

    Uses the same 20x4 sparse matrix and four five-row query groups as the
    unweighted test, but assigns the groups weights 1, 2, 3 and 4.  Checks
    that the recorded 'auc' and 'aucpr' training metrics are non-decreasing,
    and that for every tree limit from 1 to 10 a correctly sorted group is
    never followed by an incorrectly sorted one.
    """
    row_idx = np.array([1, 2, 6, 8, 11, 14, 16, 17])
    col_idx = np.array([0, 0, 1, 1, 2, 2, 3, 3])
    features = csr_matrix((np.ones(shape=8), (row_idx, col_idx)), shape=(20, 4))
    labels = np.array([0.0, 1.0, 1.0, 0.0, 0.0,
                       0.0, 1.0, 0.0, 1.0, 0.0,
                       0.0, 1.0, 0.0, 0.0, 1.0,
                       0.0, 1.0, 1.0, 0.0, 0.0])
    # One weight per query group, not per row.
    group_weights = np.array([1.0, 2.0, 3.0, 4.0])

    dtrain = xgboost.DMatrix(features, label=labels, weight=group_weights)
    dtrain.set_group(np.array([5, 5, 5, 5], dtype=np.uint))

    params = {'eta': 1, 'tree_method': 'exact',
              'objective': 'rank:pairwise', 'eval_metric': ['auc', 'aucpr'],
              'max_depth': 1}
    history = {}
    booster = xgboost.train(params, dtrain, 10, evals=[(dtrain, 'train')],
                            evals_result=history)

    # Each recorded metric must never get worse between successive rounds.
    for metric in ('auc', 'aucpr'):
        scores = history['train'][metric]
        assert all(prev <= cur for prev, cur in zip(scores, scores[1:]))

    for n_trees in range(1, 11):
        pred = booster.predict(dtrain, ntree_limit=n_trees)
        # Labels of each group, reordered by descending predicted score.
        ranked_groups = [labels[np.argsort(-pred[start:start + 5]) + start]
                         for start in range(0, 20, 5)]
        # group_sorted[g]: is the g-th group correctly sorted by the
        # ranking predictor?
        group_sorted = [all(hi >= lo for hi, lo in zip(ranked, ranked[1:]))
                        for ranked in ranked_groups]
        # Since we give weights 1, 2, 3, 4 to the four query groups,
        # the ranking predictor will first try to correctly sort the last
        # query group before correctly sorting other groups.
        assert all(p <= q for p, q in zip(group_sorted, group_sorted[1:]))
|
|
|
|
|
|
class TestRanking:
    """Ranking tests that share DMatrix fixtures built from ``tm.get_mq2008``."""

    @staticmethod
    def _group_sizes(qid):
        """Return the length of each run of consecutive equal query IDs."""
        return [len(list(items)) for _key, items in itertools.groupby(qid)]

    @classmethod
    def setup_class(cls):
        """
        Download and setup the test fixtures
        """
        cls.dpath = 'demo/rank/'
        (x_train, y_train, qid_train, x_test, y_test, qid_test,
         x_valid, y_valid, qid_valid) = tm.get_mq2008(cls.dpath)

        # instantiate the matrices
        cls.dtrain = xgboost.DMatrix(x_train, y_train)
        cls.dvalid = xgboost.DMatrix(x_valid, y_valid)
        cls.dtest = xgboost.DMatrix(x_test, y_test)
        # set the group counts from the query IDs
        cls.dtrain.set_group(cls._group_sizes(qid_train))
        cls.dtest.set_group(cls._group_sizes(qid_test))
        cls.dvalid.set_group(cls._group_sizes(qid_valid))
        # save the query IDs for testing
        cls.qid_train = qid_train
        cls.qid_test = qid_test
        cls.qid_valid = qid_valid

        # model training parameters
        cls.params = {'objective': 'rank:pairwise',
                      'booster': 'gbtree',
                      'eval_metric': ['ndcg']
                      }

    @classmethod
    def teardown_class(cls):
        """
        Cleanup test artifacts from download and unpacking

        Removes ``MQ2008.zip`` and the ``MQ2008`` directory under ``cls.dpath``.
        :return:
        """
        os.remove(cls.dpath + "MQ2008.zip")
        shutil.rmtree(cls.dpath + "MQ2008")

    def test_training(self):
        """
        Train an XGBoost ranking model
        """
        # specify validations set to watch performance
        watchlist = [(self.dtest, 'eval'), (self.dtrain, 'train')]
        bst = xgboost.train(self.params, self.dtrain, num_boost_round=2500,
                            early_stopping_rounds=10, evals=watchlist)
        assert bst.best_score > 0.98

    def test_cv(self):
        """
        Test cross-validation with a group specified
        """
        cv = xgboost.cv(self.params, self.dtrain, num_boost_round=2500,
                        early_stopping_rounds=10, nfold=10, as_pandas=False)
        assert isinstance(cv, dict)
        # The condition and message must not be wrapped in one pair of
        # parentheses: that would assert a non-empty tuple, which is always
        # true, and the key check would never run.
        assert set(cv.keys()) == {'test-ndcg-mean', 'train-ndcg-mean',
                                  'test-ndcg-std', 'train-ndcg-std'}, \
            'CV results dict key mismatch.'

    def test_cv_no_shuffle(self):
        """
        Test cross-validation with a group specified and shuffling disabled
        """
        cv = xgboost.cv(self.params, self.dtrain, num_boost_round=2500,
                        early_stopping_rounds=10, shuffle=False, nfold=10,
                        as_pandas=False)
        assert isinstance(cv, dict)
        assert len(cv) == 4

    def test_get_group(self):
        """
        Retrieve the group number from the dmatrix
        """
        # test the new getter
        self.dtrain.get_uint_info('group_ptr')

        for d, qid in [(self.dtrain, self.qid_train),
                       (self.dvalid, self.qid_valid),
                       (self.dtest, self.qid_test)]:
            # size of each group
            group_sizes = np.array(self._group_sizes(qid))
            # indexes of group boundaries
            group_limits = d.get_uint_info('group_ptr')
            assert len(group_limits) == len(group_sizes)+1
            assert np.array_equal(np.diff(group_limits), group_sizes)
            # reading the info a second time must give the same result
            assert np.array_equal(
                group_sizes, np.diff(d.get_uint_info('group_ptr')))
            assert np.array_equal(group_limits, d.get_uint_info('group_ptr'))
|