Make AUCPR work with multiple query groups (#4436)
* Make AUCPR work with multiple query groups
* Check AUCPR <= 1.0 in distributed setting
This commit is contained in:
parent
2be85fc62a
commit
9252b686ae
@ -101,11 +101,11 @@ struct EvalAuc : public Metric {
|
|||||||
CHECK_EQ(gptr.back(), info.labels_.Size())
|
CHECK_EQ(gptr.back(), info.labels_.Size())
|
||||||
<< "EvalAuc: group structure must match number of prediction";
|
<< "EvalAuc: group structure must match number of prediction";
|
||||||
const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
|
const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
|
||||||
// sum statistics
|
// sum of all AUC's across all query groups
|
||||||
bst_float sum_auc = 0.0f;
|
double sum_auc = 0.0;
|
||||||
int auc_error = 0;
|
int auc_error = 0;
|
||||||
// each thread takes a local rec
|
// each thread takes a local rec
|
||||||
std::vector< std::pair<bst_float, unsigned> > rec;
|
std::vector<std::pair<bst_float, unsigned>> rec;
|
||||||
const auto& labels = info.labels_.HostVector();
|
const auto& labels = info.labels_.HostVector();
|
||||||
const std::vector<bst_float>& h_preds = preds.HostVector();
|
const std::vector<bst_float>& h_preds = preds.HostVector();
|
||||||
for (bst_omp_uint k = 0; k < ngroup; ++k) {
|
for (bst_omp_uint k = 0; k < ngroup; ++k) {
|
||||||
@ -130,7 +130,7 @@ struct EvalAuc : public Metric {
|
|||||||
buf_pos += ctr * wt;
|
buf_pos += ctr * wt;
|
||||||
buf_neg += (1.0f - ctr) * wt;
|
buf_neg += (1.0f - ctr) * wt;
|
||||||
}
|
}
|
||||||
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
|
sum_pospair += buf_neg * (sum_npos + buf_pos * 0.5);
|
||||||
sum_npos += buf_pos;
|
sum_npos += buf_pos;
|
||||||
sum_nneg += buf_neg;
|
sum_nneg += buf_neg;
|
||||||
// check weird conditions
|
// check weird conditions
|
||||||
@ -139,15 +139,15 @@ struct EvalAuc : public Metric {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// this is the AUC
|
// this is the AUC
|
||||||
sum_auc += sum_pospair / (sum_npos*sum_nneg);
|
sum_auc += sum_pospair / (sum_npos * sum_nneg);
|
||||||
}
|
}
|
||||||
CHECK(!auc_error)
|
CHECK(!auc_error)
|
||||||
<< "AUC: the dataset only contains pos or neg samples";
|
<< "AUC: the dataset only contains pos or neg samples";
|
||||||
|
/* Report average AUC across all groups */
|
||||||
if (distributed) {
|
if (distributed) {
|
||||||
bst_float dat[2];
|
bst_float dat[2];
|
||||||
dat[0] = static_cast<bst_float>(sum_auc);
|
dat[0] = static_cast<bst_float>(sum_auc);
|
||||||
dat[1] = static_cast<bst_float>(ngroup);
|
dat[1] = static_cast<bst_float>(ngroup);
|
||||||
// approximately estimate auc using mean
|
|
||||||
rabit::Allreduce<rabit::op::Sum>(dat, 2);
|
rabit::Allreduce<rabit::op::Sum>(dat, 2);
|
||||||
return dat[0] / dat[1];
|
return dat[0] / dat[1];
|
||||||
} else {
|
} else {
|
||||||
@ -383,9 +383,9 @@ struct EvalAucPR : public Metric {
|
|||||||
CHECK_EQ(gptr.back(), info.labels_.Size())
|
CHECK_EQ(gptr.back(), info.labels_.Size())
|
||||||
<< "EvalAucPR: group structure must match number of prediction";
|
<< "EvalAucPR: group structure must match number of prediction";
|
||||||
const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
|
const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
|
||||||
// sum statistics
|
// sum of all AUC's across all query groups
|
||||||
double auc = 0.0;
|
double sum_auc = 0.0;
|
||||||
int auc_error = 0, auc_gt_one = 0;
|
int auc_error = 0;
|
||||||
// each thread takes a local rec
|
// each thread takes a local rec
|
||||||
std::vector<std::pair<bst_float, unsigned>> rec;
|
std::vector<std::pair<bst_float, unsigned>> rec;
|
||||||
const auto& h_labels = info.labels_.HostVector();
|
const auto& h_labels = info.labels_.HostVector();
|
||||||
@ -420,14 +420,11 @@ struct EvalAucPR : public Metric {
|
|||||||
b = (prevfp - h * prevtp) / total_pos;
|
b = (prevfp - h * prevtp) / total_pos;
|
||||||
}
|
}
|
||||||
if (0.0 != b) {
|
if (0.0 != b) {
|
||||||
auc += (tp / total_pos - prevtp / total_pos -
|
sum_auc += (tp / total_pos - prevtp / total_pos -
|
||||||
b / a * (std::log(a * tp / total_pos + b) -
|
b / a * (std::log(a * tp / total_pos + b) -
|
||||||
std::log(a * prevtp / total_pos + b))) / a;
|
std::log(a * prevtp / total_pos + b))) / a;
|
||||||
} else {
|
} else {
|
||||||
auc += (tp / total_pos - prevtp / total_pos) / a;
|
sum_auc += (tp / total_pos - prevtp / total_pos) / a;
|
||||||
}
|
|
||||||
if (auc > 1.0) {
|
|
||||||
auc_gt_one = 1;
|
|
||||||
}
|
}
|
||||||
prevtp = tp;
|
prevtp = tp;
|
||||||
prevfp = fp;
|
prevfp = fp;
|
||||||
@ -439,16 +436,17 @@ struct EvalAucPR : public Metric {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
CHECK(!auc_error) << "AUC-PR: the dataset only contains pos or neg samples";
|
CHECK(!auc_error) << "AUC-PR: the dataset only contains pos or neg samples";
|
||||||
CHECK(!auc_gt_one) << "AUC-PR: AUC > 1.0";
|
/* Report average AUC across all groups */
|
||||||
if (distributed) {
|
if (distributed) {
|
||||||
bst_float dat[2];
|
bst_float dat[2];
|
||||||
dat[0] = static_cast<bst_float>(auc);
|
dat[0] = static_cast<bst_float>(sum_auc);
|
||||||
dat[1] = static_cast<bst_float>(ngroup);
|
dat[1] = static_cast<bst_float>(ngroup);
|
||||||
// approximately estimate auc using mean
|
|
||||||
rabit::Allreduce<rabit::op::Sum>(dat, 2);
|
rabit::Allreduce<rabit::op::Sum>(dat, 2);
|
||||||
|
CHECK_LE(dat[0], dat[1]) << "AUC-PR: AUC > 1.0";
|
||||||
return dat[0] / dat[1];
|
return dat[0] / dat[1];
|
||||||
} else {
|
} else {
|
||||||
return static_cast<bst_float>(auc) / ngroup;
|
CHECK_LE(sum_auc, static_cast<double>(ngroup)) << "AUC-PR: AUC > 1.0";
|
||||||
|
return static_cast<bst_float>(sum_auc) / ngroup;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const char *Name() const override { return "aucpr"; }
|
const char *Name() const override { return "aucpr"; }
|
||||||
|
|||||||
27
tests/python/test_ranking.py
Normal file
27
tests/python/test_ranking.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
import numpy as np
|
||||||
|
from scipy.sparse import csr_matrix
|
||||||
|
import xgboost
|
||||||
|
|
||||||
|
def test_ranking_with_unweighted_data():
    """Check that 'auc' and 'aucpr' never decrease on grouped ranking data.

    Builds a 20x4 sparse matrix with eight non-zero entries, splits the 20
    rows into four query groups of five, trains a depth-1 'rank:pairwise'
    model for 10 rounds, and asserts that each recorded training metric is
    non-decreasing from one round to the next.
    """
    row_idx = np.array([1, 2, 6, 8, 11, 14, 16, 17])
    col_idx = np.array([0, 0, 1, 1, 2, 2, 3, 3])
    features = csr_matrix((np.ones(shape=8), (row_idx, col_idx)),
                          shape=(20, 4))
    labels = np.array([0.0, 1.0, 1.0, 0.0, 0.0,
                       0.0, 1.0, 0.0, 1.0, 0.0,
                       0.0, 1.0, 0.0, 0.0, 1.0,
                       0.0, 1.0, 1.0, 0.0, 0.0])

    # Four consecutive query groups, five rows each.
    group_sizes = np.array([5, 5, 5, 5], dtype=np.uint)
    dtrain = xgboost.DMatrix(features, label=labels)
    dtrain.set_group(group_sizes)

    params = {'eta': 1, 'tree_method': 'exact',
              'objective': 'rank:pairwise', 'eval_metric': ['auc', 'aucpr'],
              'max_depth': 1}
    n_rounds = 10
    evals_result = {}
    xgboost.train(params, dtrain, n_rounds, evals=[(dtrain, 'train')],
                  evals_result=evals_result)

    # Each metric's per-round history must be monotonically non-decreasing.
    for metric in ('auc', 'aucpr'):
        history = evals_result['train'][metric]
        assert all(prev <= nxt for prev, nxt in zip(history, history[1:]))
|
||||||
Loading…
x
Reference in New Issue
Block a user