In AUC and AUCPR metrics, detect whether weights are per-instance or per-group (#4216)
* In AUC and AUCPR metrics, detect whether weights are per-instance or per-group
* Fix C++ style check
* Add a test for weighted AUC
This commit is contained in:
parent
9252b686ae
commit
8d1098a983
@ -14,6 +14,59 @@
|
|||||||
#include "../common/host_device_vector.h"
|
#include "../common/host_device_vector.h"
|
||||||
#include "../common/math.h"
|
#include "../common/math.h"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Adapter to access instance weights.
|
||||||
|
*
|
||||||
|
* - For ranking task, weights are per-group
|
||||||
|
* - For binary classification task, weights are per-instance
|
||||||
|
*
|
||||||
|
* WeightPolicy::GetWeightOfInstance() :
|
||||||
|
* get weight associated with an individual instance, using index into
|
||||||
|
* `info.weights`
|
||||||
|
* WeightPolicy::GetWeightOfSortedRecord() :
|
||||||
|
* get weight associated with an individual instance, using index into
|
||||||
|
* sorted records `rec` (in ascending order of predicted labels). `rec` is
|
||||||
|
* of type PredIndPairContainer
|
||||||
|
*/
|
||||||
|
|
||||||
|
using PredIndPairContainer
|
||||||
|
= std::vector<std::pair<xgboost::bst_float, unsigned>>;
|
||||||
|
|
||||||
|
class PerInstanceWeightPolicy {
|
||||||
|
public:
|
||||||
|
inline static xgboost::bst_float
|
||||||
|
GetWeightOfInstance(const xgboost::MetaInfo& info,
|
||||||
|
unsigned instance_id, unsigned group_id) {
|
||||||
|
return info.GetWeight(instance_id);
|
||||||
|
}
|
||||||
|
inline static xgboost::bst_float
|
||||||
|
GetWeightOfSortedRecord(const xgboost::MetaInfo& info,
|
||||||
|
const PredIndPairContainer& rec,
|
||||||
|
unsigned record_id, unsigned group_id) {
|
||||||
|
return info.GetWeight(rec[record_id].second);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class PerGroupWeightPolicy {
|
||||||
|
public:
|
||||||
|
inline static xgboost::bst_float
|
||||||
|
GetWeightOfInstance(const xgboost::MetaInfo& info,
|
||||||
|
unsigned instance_id, unsigned group_id) {
|
||||||
|
return info.GetWeight(group_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline static xgboost::bst_float
|
||||||
|
GetWeightOfSortedRecord(const xgboost::MetaInfo& info,
|
||||||
|
const PredIndPairContainer& rec,
|
||||||
|
unsigned record_id, unsigned group_id) {
|
||||||
|
return info.GetWeight(group_id);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // anonymous namespace
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace metric {
|
namespace metric {
|
||||||
// tag this file, used by force static link later.
|
// tag this file, used by force static link later.
|
||||||
@ -88,16 +141,18 @@ struct EvalAMS : public Metric {
|
|||||||
|
|
||||||
/*! \brief Area Under Curve, for both classification and rank */
|
/*! \brief Area Under Curve, for both classification and rank */
|
||||||
struct EvalAuc : public Metric {
|
struct EvalAuc : public Metric {
|
||||||
|
private:
|
||||||
|
template <typename WeightPolicy>
|
||||||
bst_float Eval(const HostDeviceVector<bst_float> &preds,
|
bst_float Eval(const HostDeviceVector<bst_float> &preds,
|
||||||
const MetaInfo &info,
|
const MetaInfo &info,
|
||||||
bool distributed) override {
|
bool distributed) {
|
||||||
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
|
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
|
||||||
CHECK_EQ(preds.Size(), info.labels_.Size())
|
CHECK_EQ(preds.Size(), info.labels_.Size())
|
||||||
<< "label size predict size not match";
|
<< "label size predict size not match";
|
||||||
std::vector<unsigned> tgptr(2, 0);
|
std::vector<unsigned> tgptr(2, 0);
|
||||||
tgptr[1] = static_cast<unsigned>(info.labels_.Size());
|
tgptr[1] = static_cast<unsigned>(info.labels_.Size());
|
||||||
|
|
||||||
const std::vector<unsigned> &gptr = info.group_ptr_.size() == 0 ? tgptr : info.group_ptr_;
|
const std::vector<unsigned> &gptr = info.group_ptr_.empty() ? tgptr : info.group_ptr_;
|
||||||
CHECK_EQ(gptr.back(), info.labels_.Size())
|
CHECK_EQ(gptr.back(), info.labels_.Size())
|
||||||
<< "EvalAuc: group structure must match number of prediction";
|
<< "EvalAuc: group structure must match number of prediction";
|
||||||
const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
|
const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
|
||||||
@ -108,9 +163,9 @@ struct EvalAuc : public Metric {
|
|||||||
std::vector<std::pair<bst_float, unsigned>> rec;
|
std::vector<std::pair<bst_float, unsigned>> rec;
|
||||||
const auto& labels = info.labels_.HostVector();
|
const auto& labels = info.labels_.HostVector();
|
||||||
const std::vector<bst_float>& h_preds = preds.HostVector();
|
const std::vector<bst_float>& h_preds = preds.HostVector();
|
||||||
for (bst_omp_uint k = 0; k < ngroup; ++k) {
|
for (bst_omp_uint group_id = 0; group_id < ngroup; ++group_id) {
|
||||||
rec.clear();
|
rec.clear();
|
||||||
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
|
for (unsigned j = gptr[group_id]; j < gptr[group_id + 1]; ++j) {
|
||||||
rec.emplace_back(h_preds[j], j);
|
rec.emplace_back(h_preds[j], j);
|
||||||
}
|
}
|
||||||
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
|
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
|
||||||
@ -118,7 +173,8 @@ struct EvalAuc : public Metric {
|
|||||||
double sum_pospair = 0.0;
|
double sum_pospair = 0.0;
|
||||||
double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
|
double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
|
||||||
for (size_t j = 0; j < rec.size(); ++j) {
|
for (size_t j = 0; j < rec.size(); ++j) {
|
||||||
const bst_float wt = info.GetWeight(rec[j].second);
|
const bst_float wt
|
||||||
|
= WeightPolicy::GetWeightOfSortedRecord(info, rec, j, group_id);
|
||||||
const bst_float ctr = labels[rec[j].second];
|
const bst_float ctr = labels[rec[j].second];
|
||||||
// keep bucketing predictions in same bucket
|
// keep bucketing predictions in same bucket
|
||||||
if (j != 0 && rec[j].first != rec[j - 1].first) {
|
if (j != 0 && rec[j].first != rec[j - 1].first) {
|
||||||
@ -154,6 +210,21 @@ struct EvalAuc : public Metric {
|
|||||||
return static_cast<bst_float>(sum_auc) / ngroup;
|
return static_cast<bst_float>(sum_auc) / ngroup;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
bst_float Eval(const HostDeviceVector<bst_float> &preds,
|
||||||
|
const MetaInfo &info,
|
||||||
|
bool distributed) override {
|
||||||
|
// For ranking task, weights are per-group
|
||||||
|
// For binary classification task, weights are per-instance
|
||||||
|
const bool is_ranking_task =
|
||||||
|
!info.group_ptr_.empty() && info.weights_.Size() != info.num_row_;
|
||||||
|
if (is_ranking_task) {
|
||||||
|
return Eval<PerGroupWeightPolicy>(preds, info, distributed);
|
||||||
|
} else {
|
||||||
|
return Eval<PerInstanceWeightPolicy>(preds, info, distributed);
|
||||||
|
}
|
||||||
|
}
|
||||||
const char* Name() const override {
|
const char* Name() const override {
|
||||||
return "auc";
|
return "auc";
|
||||||
}
|
}
|
||||||
@ -370,9 +441,11 @@ struct EvalAucPR : public Metric {
|
|||||||
// implementation of AUC-PR for weighted data
|
// implementation of AUC-PR for weighted data
|
||||||
// translated from PRROC R Package
|
// translated from PRROC R Package
|
||||||
// see https://doi.org/10.1371/journal.pone.0092209
|
// see https://doi.org/10.1371/journal.pone.0092209
|
||||||
|
private:
|
||||||
bst_float Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info,
|
template <typename WeightPolicy>
|
||||||
bool distributed) override {
|
bst_float Eval(const HostDeviceVector<bst_float> &preds,
|
||||||
|
const MetaInfo &info,
|
||||||
|
bool distributed) {
|
||||||
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
|
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
|
||||||
CHECK_EQ(preds.Size(), info.labels_.Size())
|
CHECK_EQ(preds.Size(), info.labels_.Size())
|
||||||
<< "label size predict size not match";
|
<< "label size predict size not match";
|
||||||
@ -391,13 +464,15 @@ struct EvalAucPR : public Metric {
|
|||||||
const auto& h_labels = info.labels_.HostVector();
|
const auto& h_labels = info.labels_.HostVector();
|
||||||
const std::vector<bst_float>& h_preds = preds.HostVector();
|
const std::vector<bst_float>& h_preds = preds.HostVector();
|
||||||
|
|
||||||
for (bst_omp_uint k = 0; k < ngroup; ++k) {
|
for (bst_omp_uint group_id = 0; group_id < ngroup; ++group_id) {
|
||||||
double total_pos = 0.0;
|
double total_pos = 0.0;
|
||||||
double total_neg = 0.0;
|
double total_neg = 0.0;
|
||||||
rec.clear();
|
rec.clear();
|
||||||
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
|
for (unsigned j = gptr[group_id]; j < gptr[group_id + 1]; ++j) {
|
||||||
total_pos += info.GetWeight(j) * h_labels[j];
|
const bst_float wt
|
||||||
total_neg += info.GetWeight(j) * (1.0f - h_labels[j]);
|
= WeightPolicy::GetWeightOfInstance(info, j, group_id);
|
||||||
|
total_pos += wt * h_labels[j];
|
||||||
|
total_neg += wt * (1.0f - h_labels[j]);
|
||||||
rec.emplace_back(h_preds[j], j);
|
rec.emplace_back(h_preds[j], j);
|
||||||
}
|
}
|
||||||
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
|
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
|
||||||
@ -408,8 +483,10 @@ struct EvalAucPR : public Metric {
|
|||||||
// calculate AUC
|
// calculate AUC
|
||||||
double tp = 0.0, prevtp = 0.0, fp = 0.0, prevfp = 0.0, h = 0.0, a = 0.0, b = 0.0;
|
double tp = 0.0, prevtp = 0.0, fp = 0.0, prevfp = 0.0, h = 0.0, a = 0.0, b = 0.0;
|
||||||
for (size_t j = 0; j < rec.size(); ++j) {
|
for (size_t j = 0; j < rec.size(); ++j) {
|
||||||
tp += info.GetWeight(rec[j].second) * h_labels[rec[j].second];
|
const bst_float wt
|
||||||
fp += info.GetWeight(rec[j].second) * (1.0f - h_labels[rec[j].second]);
|
= WeightPolicy::GetWeightOfSortedRecord(info, rec, j, group_id);
|
||||||
|
tp += wt * h_labels[rec[j].second];
|
||||||
|
fp += wt * (1.0f - h_labels[rec[j].second]);
|
||||||
if ((j < rec.size() - 1 && rec[j].first != rec[j + 1].first) || j == rec.size() - 1) {
|
if ((j < rec.size() - 1 && rec[j].first != rec[j + 1].first) || j == rec.size() - 1) {
|
||||||
if (tp == prevtp) {
|
if (tp == prevtp) {
|
||||||
a = 1.0;
|
a = 1.0;
|
||||||
@ -449,6 +526,21 @@ struct EvalAucPR : public Metric {
|
|||||||
return static_cast<bst_float>(sum_auc) / ngroup;
|
return static_cast<bst_float>(sum_auc) / ngroup;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
bst_float Eval(const HostDeviceVector<bst_float> &preds,
|
||||||
|
const MetaInfo &info,
|
||||||
|
bool distributed) override {
|
||||||
|
// For ranking task, weights are per-group
|
||||||
|
// For binary classification task, weights are per-instance
|
||||||
|
const bool is_ranking_task =
|
||||||
|
!info.group_ptr_.empty() && info.weights_.Size() != info.num_row_;
|
||||||
|
if (is_ranking_task) {
|
||||||
|
return Eval<PerGroupWeightPolicy>(preds, info, distributed);
|
||||||
|
} else {
|
||||||
|
return Eval<PerInstanceWeightPolicy>(preds, info, distributed);
|
||||||
|
}
|
||||||
|
}
|
||||||
const char *Name() const override { return "aucpr"; }
|
const char *Name() const override { return "aucpr"; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -25,3 +25,41 @@ def test_ranking_with_unweighted_data():
|
|||||||
assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
|
assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
|
||||||
auc_rec = evals_result['train']['aucpr']
|
auc_rec = evals_result['train']['aucpr']
|
||||||
assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
|
assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
|
||||||
|
|
||||||
|
def test_ranking_with_weighted_data():
    """Train a pairwise ranker on four weighted query groups and check its metrics.

    The data set has 20 rows split into four groups of five, with one weight
    per group (1, 2, 3, 4). The test asserts that the recorded training
    'auc' and 'aucpr' histories never decrease, and that, for every number of
    trees from 1 to 10, the per-group "correctly sorted" flags are
    non-decreasing from the first group to the last.
    """
    row_idx = np.array([1, 2, 6, 8, 11, 14, 16, 17])
    col_idx = np.array([0, 0, 1, 1, 2, 2, 3, 3])
    X = csr_matrix((np.ones(shape=8), (row_idx, col_idx)), shape=(20, 4))
    y = np.array([0.0, 1.0, 1.0, 0.0, 0.0,
                  0.0, 1.0, 0.0, 1.0, 0.0,
                  0.0, 1.0, 0.0, 0.0, 1.0,
                  0.0, 1.0, 1.0, 0.0, 0.0])
    # One weight per query group, not per row.
    weights = np.array([1.0, 2.0, 3.0, 4.0])

    group = np.array([5, 5, 5, 5], dtype=np.uint)
    dtrain = xgboost.DMatrix(X, label=y, weight=weights)
    dtrain.set_group(group)

    params = {'eta': 1, 'tree_method': 'exact',
              'objective': 'rank:pairwise', 'eval_metric': ['auc', 'aucpr'],
              'max_depth': 1}
    evals_result = {}
    bst = xgboost.train(params, dtrain, 10, evals=[(dtrain, 'train')],
                        evals_result=evals_result)

    # Both recorded metric histories must be non-decreasing across rounds.
    for metric_name in ('auc', 'aucpr'):
        history = evals_result['train'][metric_name]
        assert all(p <= q for p, q in zip(history, history[1:]))

    for n_trees in range(1, 11):
        pred = bst.predict(dtrain, ntree_limit=n_trees)
        # is_sorted[g]: does the predictor rank group g's labels in
        # non-increasing order?
        is_sorted = []
        for start in range(0, 20, 5):
            order = np.argsort(-pred[start:start + 5])
            ranked_labels = y[order + start]
            is_sorted.append(
                all(a >= b for a, b in zip(ranked_labels, ranked_labels[1:])))
        # With weights 1, 2, 3, 4 on the four query groups, the ranker is
        # expected to get the heavier (later) groups right before the lighter
        # ones, so the flags may only go from False to True along the groups.
        assert all(p <= q for p, q in zip(is_sorted, is_sorted[1:]))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user