Set ndcg to default for LTR. (#8822)

- Add document.
- Add tests.
- Use `ndcg` with `topk` as default.
This commit is contained in:
Jiaming Yuan
2023-06-09 23:31:33 +08:00
committed by GitHub
parent e4dd6051a0
commit 1fcc26a6f8
18 changed files with 842 additions and 19 deletions

View File

@@ -36,6 +36,7 @@ class LintersPaths:
"demo/guide-python/individual_trees.py",
"demo/guide-python/quantile_regression.py",
"demo/guide-python/multioutput_regression.py",
"demo/guide-python/learning_to_rank.py",
# CI
"tests/ci_build/lint_python.py",
"tests/ci_build/test_r_package.py",
@@ -76,6 +77,7 @@ class LintersPaths:
"demo/guide-python/individual_trees.py",
"demo/guide-python/quantile_regression.py",
"demo/guide-python/multioutput_regression.py",
"demo/guide-python/learning_to_rank.py",
# CI
"tests/ci_build/lint_python.py",
"tests/ci_build/test_r_package.py",

View File

@@ -299,7 +299,9 @@ class TestEvalMetrics:
def run_pr_auc_ltr(self, tree_method):
from sklearn.datasets import make_classification
X, y = make_classification(128, 4, n_classes=2, random_state=1994)
ltr = xgb.XGBRanker(tree_method=tree_method, n_estimators=16)
ltr = xgb.XGBRanker(
tree_method=tree_method, n_estimators=16, objective="rank:pairwise"
)
groups = np.array([32, 32, 64])
ltr.fit(
X,

View File

@@ -1,12 +1,57 @@
import itertools
import json
import os
import shutil
from typing import Optional
import numpy as np
import pytest
from hypothesis import given, note, settings
from scipy.sparse import csr_matrix
import xgboost
from xgboost import testing as tm
from xgboost.testing.data import RelDataCV, simulate_clicks, sort_ltr_samples
from xgboost.testing.params import lambdarank_parameter_strategy
def test_ndcg_custom_gain():
    """Built-in exponential NDCG gain must match training on pre-computed gains.

    Fits one ranker with ``ndcg_exp_gain=True`` on raw labels and another with
    ``ndcg_exp_gain=False`` on labels already transformed by ``2^y - 1``; both
    the evaluation history and the serialized models must be identical.
    """

    def exp_gain(labels: np.ndarray) -> np.ndarray:
        # The standard "exponential" NDCG gain: 2^rel - 1.
        return np.exp2(labels.astype(np.float64)) - 1.0

    X, y, qid, weight = tm.make_ltr(
        n_samples=1024, n_features=4, n_query_groups=3, max_rel=3
    )

    def fit_ranker(labels: np.ndarray, use_exp_gain: bool) -> xgboost.XGBRanker:
        # Train a small ranker with the given labels and gain setting.
        ranker = xgboost.XGBRanker(
            tree_method="hist", ndcg_exp_gain=use_exp_gain, n_estimators=10
        )
        ranker.fit(
            X,
            labels,
            qid=qid,
            sample_weight=weight,
            eval_set=[(X, labels)],
            eval_qid=(qid,),
            sample_weight_eval_set=(weight,),
            verbose=True,
        )
        return ranker

    byxgb = fit_ranker(y, use_exp_gain=True)
    bynp = fit_ranker(exp_gain(y), use_exp_gain=False)

    byxgb_json = json.loads(byxgb.get_booster().save_raw(raw_format="json"))
    bynp_json = json.loads(bynp.get_booster().save_raw(raw_format="json"))
    # The only configured difference between the two models is ndcg_exp_gain;
    # erase it so the remaining model content can be compared directly.
    byxgb_json["learner"]["objective"]["lambdarank_param"]["ndcg_exp_gain"] = "0"

    assert byxgb.evals_result() == bynp.evals_result()
    assert byxgb_json == bynp_json
def test_ranking_with_unweighted_data():
@@ -73,8 +118,77 @@ def test_ranking_with_weighted_data():
assert all(p <= q for p, q in zip(is_sorted, is_sorted[1:]))
class TestRanking:
def test_error_msg() -> None:
    """A weight vector of the wrong length must raise a descriptive error."""
    data, labels, query_ids, _ = tm.make_ltr(10, 2, 2, 2)
    model = xgboost.XGBRanker()
    # LTR sample weights are per query group, not per sample; passing the
    # per-sample `labels` array as weights has the wrong length.
    with pytest.raises(ValueError, match=r"equal to the number of query groups"):
        model.fit(data, labels, qid=query_ids, sample_weight=labels)
@given(lambdarank_parameter_strategy)
@settings(deadline=None, print_blob=True)
def test_lambdarank_parameters(params):
    """Training should not degrade the metric for any lambdarank parameter combo."""
    # rank:map is defined for binary relevance only; other objectives get
    # graded relevance up to 4.
    max_rel = 1 if params["objective"] == "rank:map" else 4

    X, y, qid, weight = tm.make_ltr(4096, 3, 13, max_rel)
    ranker = xgboost.XGBRanker(tree_method="hist", n_estimators=64, **params)
    ranker.fit(X, y, qid=qid, sample_weight=weight, eval_set=[(X, y)], eval_qid=[qid])

    for history in ranker.evals_result()["validation_0"].values():
        note(history)
        # The final training-set score must be no worse than the initial one.
        assert history[-1] >= history[0]

    assert ranker.n_features_in_ == 3
@pytest.mark.skipif(**tm.no_pandas())
@pytest.mark.skipif(**tm.no_sklearn())
def test_unbiased() -> None:
    """Unbiased lambdarank learns normalized position-bias terms from clicks."""
    import pandas as pd
    from sklearn.model_selection import train_test_split

    X, y, q, w = tm.make_ltr(8192, 2, n_query_groups=6, max_rel=4)
    X, Xe, y, ye, q, qe = train_test_split(X, y, q, test_size=0.2, random_state=3)
    X = csr_matrix(X)
    Xe = csr_matrix(Xe)

    data = RelDataCV((X, y, q), (Xe, ye, qe), max_rel=4)
    train, _ = simulate_clicks(data)
    # Click-based training expects samples ordered by query and position.
    x, c, y, q = sort_ltr_samples(
        train.X, train.y, train.qid, train.click, train.pos
    )

    df: Optional[pd.DataFrame] = None

    class Position(xgboost.callback.TrainingCallback):
        """Capture the estimated position-bias terms once training finishes."""

        def after_training(self, model: xgboost.Booster) -> xgboost.Booster:
            nonlocal df
            config = json.loads(model.save_config())
            ti_plus = np.array(config["learner"]["objective"]["ti+"])
            tj_minus = np.array(config["learner"]["objective"]["tj-"])
            df = pd.DataFrame({"ti+": ti_plus, "tj-": tj_minus})
            # The TrainingCallback contract is to return the (possibly
            # modified) model, not a bool; the annotation reflects that now.
            return model

    ltr = xgboost.XGBRanker(
        n_estimators=8,
        tree_method="hist",
        lambdarank_unbiased=True,
        lambdarank_num_pair_per_sample=12,
        lambdarank_pair_method="topk",
        objective="rank:ndcg",
        callbacks=[Position()],
        boost_from_average=0,
    )
    ltr.fit(x, c, qid=q, eval_set=[(x, c)], eval_qid=[q])

    assert df is not None
    # Bias estimates are normalized so the top position has weight 1.0.
    np.testing.assert_allclose(df["ti+"].iloc[0], 1.0)
    np.testing.assert_allclose(df["tj-"].iloc[0], 1.0)
    # Later positions should be estimated as less likely to be examined.
    assert df["ti+"].iloc[-1] < df["ti+"].iloc[0]
class TestRanking:
@classmethod
def setup_class(cls):
"""

View File

@@ -130,11 +130,11 @@ def test_ranking():
params = {
"tree_method": "exact",
"objective": "rank:pairwise",
"learning_rate": 0.1,
"gamma": 1.0,
"min_child_weight": 0.1,
"max_depth": 6,
"eval_metric": "ndcg",
"n_estimators": 4,
}
model = xgb.sklearn.XGBRanker(**params)
@@ -163,7 +163,6 @@ def test_ranking():
"gamma": 1.0,
"min_child_weight": 0.1,
"max_depth": 6,
"eval_metric": "ndcg",
}
xgb_model_orig = xgb.train(
params_orig, train_data, num_boost_round=4, evals=[(valid_data, "validation")]