merge changes Jun092023

This commit is contained in:
amdsc21
2023-06-09 22:41:33 +02:00
63 changed files with 1299 additions and 243 deletions

View File

@@ -31,7 +31,7 @@ ArrayLike = Any
PathLike = Union[str, os.PathLike]
CupyT = ArrayLike # maybe need a stub for cupy arrays
NumpyOrCupy = Any
NumpyDType = Union[str, Type[np.number]]
NumpyDType = Union[str, Type[np.number]] # pylint: disable=invalid-name
PandasDType = Any # real type is pandas.core.dtypes.base.ExtensionDtype
FloatCompatible = Union[float, np.float32, np.float64]

View File

@@ -1796,7 +1796,11 @@ def _get_qid(
@xgboost_model_doc(
"""Implementation of the Scikit-Learn API for XGBoost Ranking.""",
"""Implementation of the Scikit-Learn API for XGBoost Ranking.
See :doc:`Learning to Rank </tutorials/learning_to_rank>` for an introduction.
""",
["estimators", "model"],
end_note="""
.. note::
@@ -1845,7 +1849,7 @@ def _get_qid(
class XGBRanker(XGBModel, XGBRankerMixIn):
# pylint: disable=missing-docstring,too-many-arguments,invalid-name
@_deprecate_positional_args
def __init__(self, *, objective: str = "rank:pairwise", **kwargs: Any):
def __init__(self, *, objective: str = "rank:ndcg", **kwargs: Any):
super().__init__(objective=objective, **kwargs)
if callable(self.objective):
raise ValueError("custom objective function not supported by XGBRanker")
@@ -2029,7 +2033,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
self._Booster = train(
params,
train_dmatrix,
self.get_num_boosting_rounds(),
num_boost_round=self.get_num_boosting_rounds(),
early_stopping_rounds=early_stopping_rounds,
evals=evals,
evals_result=evals_result,

View File

@@ -1,6 +1,8 @@
"""Tests for dask shared by different test modules."""
import numpy as np
import pandas as pd
from dask import array as da
from dask import dataframe as dd
from distributed import Client
import xgboost as xgb
@@ -52,3 +54,22 @@ def check_init_estimation(tree_method: str, client: Client) -> None:
"""Test init estimation."""
check_init_estimation_reg(tree_method, client)
check_init_estimation_clf(tree_method, client)
def check_uneven_nan(client: Client, tree_method: str, n_workers: int) -> None:
    """Issue #9271, not every worker has missing value.

    Trains a dask classifier on a frame whose few missing values all sit in the
    leading rows, then splits the frame into one partition per worker.

    Parameters
    ----------
    client :
        Dask distributed client used for training.
    tree_method :
        Tree method forwarded to the classifier.
    n_workers :
        Number of workers to wait for; also used as the number of partitions.
        Must be at least 2.
    """
    assert n_workers >= 2

    n_samples = 10000
    with client.as_current():
        clf = xgb.dask.DaskXGBClassifier(tree_method=tree_method)

        # Write the missing values into a float array *before* building the frame.
        # The chained form ``X["a"][...] = nan`` modifies a temporary under pandas
        # copy-on-write (leaving the frame without any NaN), and ``np.NaN`` no
        # longer exists in NumPy 2.0.
        col_a = np.arange(n_samples, dtype=np.float64)
        col_a[:3000:1000] = np.nan  # rows 0, 1000 and 2000
        X = pd.DataFrame({"a": col_a, "b": range(n_samples, 0, -1)})
        y = pd.Series([*[0] * 5000, *[1] * 5000])

        client.wait_for_workers(n_workers=n_workers)

        clf.fit(
            dd.from_pandas(X, npartitions=n_workers),
            dd.from_pandas(y, npartitions=n_workers),
        )

View File

@@ -1,11 +1,14 @@
# pylint: disable=invalid-name
"""Utilities for data generation."""
import os
import zipfile
from typing import Any, Generator, List, Tuple, Union
from dataclasses import dataclass
from typing import Any, Generator, List, NamedTuple, Optional, Tuple, Union
from urllib import request
import numpy as np
import pytest
from numpy import typing as npt
from numpy.random import Generator as RNG
from scipy import sparse
@@ -340,3 +343,263 @@ def get_mq2008(
y_valid,
qid_valid,
)
# One learning-to-rank data partition, unpacked elsewhere in this file as
# ``(X, y, qid)``: sparse feature matrix, labels, and query ids.
RelData = Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]]
@dataclass
class ClickFold:
    """A structure containing information about generated user-click data."""

    # Sparse feature matrix of the fold.
    X: sparse.csr_matrix
    # Labels (relevance degrees) of the fold.
    y: npt.NDArray[np.int32]
    # Query id of every row.
    qid: npt.NDArray[np.int32]
    # Initial ranking score of every row, used to derive the positions.
    score: npt.NDArray[np.float32]
    # Simulated click indicator (0 or 1) of every row.
    click: npt.NDArray[np.int32]
    # Position assigned to every row within its query.
    pos: npt.NDArray[np.int64]
class RelDataCV(NamedTuple):
    """Simple data struct for holding a train-test split of a learning to rank dataset."""

    # Training partition as an ``(X, y, qid)`` tuple.
    train: RelData
    # Test partition as an ``(X, y, qid)`` tuple.
    test: RelData
    # Largest relevance degree of the labels; 1 means binary relevance.
    max_rel: int

    def is_binary(self) -> bool:
        """Whether the label consists of binary relevance degree."""
        # Binary relevance is identified solely by the recorded maximum degree.
        return self.max_rel == 1
class PBM:  # pylint: disable=too-few-public-methods
    """Simulate click data with position bias model. There are other models available in
    `ULTRA <https://github.com/ULTR-Community/ULTRA.git>`_ like the cascading model.

    References
    ----------
    Unbiased LambdaMART: An Unbiased Pairwise Learning-to-Rank Algorithm

    """

    def __init__(self, eta: float) -> None:
        """Set up the click and examination probability tables.

        Parameters
        ----------
        eta :
            Exponent applied to the per-position examination probabilities.
        """
        # click probability for each relevance degree. (from 0 to 4)
        self.click_prob = np.array([0.1, 0.16, 0.28, 0.52, 1.0])
        exam_prob = np.array(
            [0.68, 0.61, 0.48, 0.34, 0.28, 0.20, 0.11, 0.10, 0.08, 0.06]
        )
        # Observation probability, encoding positional bias for each position
        self.exam_prob = np.power(exam_prob, eta)

    def sample_clicks_for_query(
        self, labels: npt.NDArray[np.int32], position: npt.NDArray[np.int64]
    ) -> npt.NDArray[np.int32]:
        """Sample clicks for one query based on input relevance degree and position.

        Parameters
        ----------

        labels :
            relevance_degree of every document in the query. Values below 0 are
            treated as 0, values beyond the last known degree as the last degree.
            The input array is not modified.
        position :
            Position of every document, same size as ``labels``. Positions past the
            end of the examination table use the table's last entry.

        Returns
        -------
        An int32 array with 1 for a clicked document and 0 otherwise.

        """
        labels = np.array(labels, copy=True)
        # minimum
        labels[labels < 0] = 0
        # maximum, index -1 selects the last (highest) relevance degree.
        labels[labels >= len(self.click_prob)] = -1
        click_prob = self.click_prob[labels]

        assert position.size == labels.size
        ranks = np.array(position, copy=True)
        # maximum, index -1 selects the last position of the table.
        ranks[ranks >= self.exam_prob.size] = -1
        exam_prob = self.exam_prob[ranks]

        # The generator is re-created with a fixed seed on every call, so the
        # result only depends on the inputs.
        rng = np.random.default_rng(1994)
        prob = rng.random(size=labels.shape[0], dtype=np.float32)

        # A document is clicked when it is both examined and found relevant.
        clicks: npt.NDArray[np.int32] = np.zeros(labels.shape, dtype=np.int32)
        clicks[prob < exam_prob * click_prob] = 1
        return clicks
def rlencode(x: npt.NDArray[np.int32]) -> Tuple[npt.NDArray, npt.NDArray, npt.NDArray]:
    """Run length encoding using numpy, modified from:
    https://gist.github.com/nvictus/66627b580c13068589957d6ab0919e66

    Parameters
    ----------
    x :
        One dimensional array to encode.

    Returns
    -------
    indptr :
        Start offset of every run followed by the total size, ``n_runs + 1`` items.
    lengths :
        Length of every run.
    values :
        Value of every run.

    For an empty input, ``indptr`` is ``[0]`` and the other two arrays are empty.

    """
    x = np.asarray(x)
    n = x.size
    if n == 0:
        # No runs at all; indexing ``x[starts]`` below would fail.
        return np.zeros(1, dtype=np.intp), np.zeros(0, dtype=np.intp), x[:0]

    if x.dtype.kind in "fc":
        # Floating point runs are compared with a tolerance, NaNs are equal.
        changed = ~np.isclose(x[1:], x[:-1], equal_nan=True)
    else:
        # Integer ids must be compared exactly: the relative tolerance of
        # ``np.isclose`` treats e.g. 200000 and 200001 as the same value.
        changed = x[1:] != x[:-1]

    starts = np.r_[0, np.flatnonzero(changed) + 1]
    lengths = np.diff(np.r_[starts, n])
    values = x[starts]
    indptr = np.append(starts, np.array([n]))

    return indptr, lengths, values
def init_rank_score(
    X: sparse.csr_matrix,
    y: npt.NDArray[np.int32],
    qid: npt.NDArray[np.int32],
    sample_rate: float = 0.1,
) -> npt.NDArray[np.float32]:
    """Produce initial ranking scores with an XGBoost ranker.

    XGBoost is used instead of SVMRank for simplicity. The ranker is trained on a
    random subset of the rows; the default ``sample_rate`` of 0.1 keeps that
    subset small so the helper can be used with small datasets.

    Parameters
    ----------
    X, y, qid :
        Features, labels and query ids of the full dataset.
    sample_rate :
        Fraction of rows used to train the ranker.

    Returns
    -------
    Predicted scores for every row of ``X``, in the original row order.

    """
    n_total = X.shape[0]
    n_picked = int(n_total * sample_rate)

    # Pick the training rows with a fixed seed.
    rng = np.random.default_rng(1994)
    rows = np.arange(0, n_total, dtype=np.uint64)
    rng.shuffle(rows)
    rows = rows[:n_picked]

    # XGBoost requires the training rows to be ordered by query id.
    rows = rows[np.argsort(qid[rows])]

    ranker = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
    ranker.fit(X[rows], y[rows], qid=qid[rows])

    # Score the complete, unsorted input.
    return ranker.predict(X)
def simulate_one_fold(
    fold: Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]],
    scores_fold: npt.NDArray[np.float32],
) -> ClickFold:
    """Simulate clicks for one fold.

    Parameters
    ----------
    fold :
        ``(X, y, qid)`` of the fold; ``qid`` must be int32.
    scores_fold :
        Initial ranking score of every row in the fold.

    Returns
    -------
    The fold's inputs together with the sampled clicks and assigned positions.

    """
    features, labels, query_ids = fold
    assert query_ids.dtype == np.int32

    unique_queries = np.unique(query_ids)

    n_docs = labels.size
    position = np.empty((n_docs,), dtype=np.int64)
    clicks = np.empty((n_docs,), dtype=np.int32)
    click_model = PBM(eta=1.0)

    # Rows are selected with a mask instead of being grouped by query id, which
    # preserves the original data partition by the dataset authors.
    for query in unique_queries:
        mask = query == query_ids
        mask = mask.reshape(mask.shape[0])

        # Initial rank list: argsort of the scores, reversed to decreasing order.
        # NOTE(review): this stores the sorting indices, not the inverse
        # permutation (the rank of each row) - confirm this is intended.
        ranked = np.argsort(scores_fold[mask])[::-1]
        position[mask] = ranked

        # Labels of the query drive the click probability.
        clicks[mask] = click_model.sample_clicks_for_query(labels[mask], ranked)

    assert features.shape[0] == query_ids.shape[0], (features.shape, query_ids.shape)
    assert features.shape[0] == clicks.shape[0], (features.shape, clicks.shape)

    return ClickFold(features, labels, query_ids, scores_fold, clicks, position)
# pylint: disable=too-many-locals
def simulate_clicks(cv_data: RelDataCV) -> Tuple[ClickFold, Optional[ClickFold]]:
    """Simulate click data using position biased model (PBM).

    The train and test partitions are stacked to obtain initial ranking scores
    from a single model, then clicks are simulated for each partition separately.

    Parameters
    ----------
    cv_data :
        Train-test split of a learning to rank dataset.

    Returns
    -------
    Click data for the train partition and for the test partition.

    """
    X, y, qid = list(zip(cv_data.train, cv_data.test))

    # ptr to train-test split
    indptr = np.cumsum(np.array([0] + [v.shape[0] for v in X]))
    assert len(indptr) == 2 + 1  # train, test

    X_full = sparse.vstack(X)
    y_full = np.concatenate(y)
    qid_full = np.concatenate(qid)

    # Obtain initial relevance score for click simulation
    scores_full = init_rank_score(X_full, y_full, qid_full)
    # partition it back to (train, test) tuple
    scores = [scores_full[beg:end] for beg, end in zip(indptr[:-1], indptr[1:])]

    folds = [
        simulate_one_fold((X[i], y[i], qid[i]), scores[i]) for i in range(len(scores))
    ]
    # Each fold must carry exactly the scores it was simulated with.
    for fold, fold_scores in zip(folds, scores):
        assert (fold.score == fold_scores).all()

    # There are always exactly two partitions (asserted above).
    train, test = folds
    return train, test
def sort_ltr_samples(
    X: sparse.csr_matrix,
    y: npt.NDArray[np.int32],
    qid: npt.NDArray[np.int32],
    clicks: npt.NDArray[np.int32],
    pos: npt.NDArray[np.int64],
) -> Tuple[
    sparse.csr_matrix,
    npt.NDArray[np.int32],
    npt.NDArray[np.int32],
    npt.NDArray[np.int32],
]:
    """Sort data based on query index and position.

    Rows are ordered by query id first and by position within each query second.
    All outputs are new objects, the inputs are left untouched.

    Parameters
    ----------
    X, y, qid :
        Features, labels and query ids.
    clicks :
        Click indicator of every row.
    pos :
        Position of every row within its query. Inside each query the smallest
        position must be 0 and the largest at least ``n_rows_in_query - 1``.

    Returns
    -------
    The sorted ``(X, clicks, y, qid)``.

    """
    # A single permutation keeps every array aligned; the last key of ``lexsort``
    # is the primary one.
    order = np.lexsort((pos, qid))
    X = X[order]
    y = y[order]
    qid = qid[order]
    clicks = clicks[order]
    pos = pos[order]

    if qid.size > 0:
        # Query boundaries in the sorted data.
        starts = np.r_[0, np.flatnonzero(qid[1:] != qid[:-1]) + 1]
        ends = np.r_[starts[1:], qid.size]
        # Positions are ascending inside a query, so the extremes sit at its ends.
        lowest = pos[starts]
        highest = pos[ends - 1]
        bad_min = lowest != 0
        assert not bad_min.any(), (qid[starts][bad_min], lowest[bad_min])
        bad_max = highest < ends - starts - 1
        assert not bad_max.any(), (
            highest[bad_max],
            (ends - starts)[bad_max],
            qid[starts][bad_max],
        )

    data = X, clicks, y, qid
    return data

View File

@@ -67,3 +67,17 @@ cat_parameter_strategy = strategies.fixed_dictionaries(
"max_cat_threshold": strategies.integers(1, 128),
}
)
# Hypothesis strategy drawing a dictionary of LambdaRank-related parameters
# together with one of the ranking objectives.
lambdarank_parameter_strategy = strategies.fixed_dictionaries(
    {
        "lambdarank_unbiased": strategies.sampled_from([True, False]),
        "lambdarank_pair_method": strategies.sampled_from(["topk", "mean"]),
        "lambdarank_num_pair_per_sample": strategies.integers(1, 8),
        "lambdarank_bias_norm": strategies.floats(0.5, 2.0),
        "objective": strategies.sampled_from(
            ["rank:ndcg", "rank:map", "rank:pairwise"]
        ),
    }
).filter(
    # Reject draws that enable the unbiased option together with the "mean" pair
    # method; presumably that combination is not supported - confirm.
    lambda x: not (x["lambdarank_unbiased"] and x["lambdarank_pair_method"] == "mean")
)