Support sklearn cross validation for ranker. (#8859)

* Support sklearn cross validation for ranker. - Add a convention for X to include a special `qid` column. sklearn utilities consider only `X`, `y` and `sample_weight` for supervised learning algorithms, but we need an additional qid array for ranking. It's important to be able to support the cross validation function in sklearn since all other tuning functions like grid search are based on cross validation.
2023-03-07 00:22:08 +08:00
parent cad7401783
commit 7eba285a1e
8 changed files with 232 additions and 43 deletions
--- a/tests/python-gpu/test_gpu_with_sklearn.py
+++ b/tests/python-gpu/test_gpu_with_sklearn.py
@@ -8,6 +8,7 @@ import pytest

 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.ranking import run_ranking_qid_df

 sys.path.append("tests/python")
 import test_with_sklearn as twskl  # noqa
@@ -153,3 +154,10 @@ def test_classififer():
    y *= 10
    with pytest.raises(ValueError, match=r"Invalid classes.*"):
        clf.fit(X, y)
+
+
+@pytest.mark.skipif(**tm.no_pandas())
+def test_ranking_qid_df():
+    import cudf
+
+    run_ranking_qid_df(cudf, "gpu_hist")
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -11,6 +11,7 @@ from sklearn.utils.estimator_checks import parametrize_with_checks

 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.ranking import run_ranking_qid_df
 from xgboost.testing.shared import get_feature_weights, validate_data_initialization
 from xgboost.testing.updater import get_basescore

@@ -180,6 +181,13 @@ def test_ranking_metric() -> None:
    assert results["validation_0"]["roc_auc_score"][-1] > 0.6


+@pytest.mark.skipif(**tm.no_pandas())
+def test_ranking_qid_df():
+    import pandas as pd
+
+    run_ranking_qid_df(pd, "hist")
+
+
 def test_stacking_regression():
    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import RandomForestRegressor, StackingRegressor