[dask] Add DaskXGBRanker (#6576)

* Initial support for distributed learning to rank (LTR) using Dask.

* Support `qid` in libxgboost.
* Refactor `predict` and `n_features_in_`, `best_[score/iteration/ntree_limit]`
  to avoid duplicated code.
* Define `DaskXGBRanker`.

The Dask ranker doesn't support group structure; instead, it uses query IDs
(`qid`) and converts them to group pointers internally.
This commit is contained in:
Jiaming Yuan
2021-01-08 18:35:09 +08:00
committed by GitHub
parent 96d3d32265
commit 80065d571e
18 changed files with 755 additions and 351 deletions

View File

@@ -374,13 +374,32 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, base_margin.begin()));
} else if (!std::strcmp(key, "group")) {
group_ptr_.resize(num + 1);
group_ptr_.clear(); group_ptr_.resize(num + 1, 0);
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, group_ptr_.begin() + 1));
group_ptr_[0] = 0;
for (size_t i = 1; i < group_ptr_.size(); ++i) {
group_ptr_[i] = group_ptr_[i - 1] + group_ptr_[i];
}
} else if (!std::strcmp(key, "qid")) {
std::vector<uint32_t> query_ids(num, 0);
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, query_ids.begin()));
bool non_dec = true;
for (size_t i = 1; i < query_ids.size(); ++i) {
if (query_ids[i] < query_ids[i-1]) {
non_dec = false;
break;
}
}
CHECK(non_dec) << "`qid` must be sorted in non-decreasing order along with data.";
group_ptr_.clear(); group_ptr_.push_back(0);
for (size_t i = 1; i < query_ids.size(); ++i) {
if (query_ids[i] != query_ids[i-1]) {
group_ptr_.push_back(i);
}
}
group_ptr_.push_back(query_ids.size());
} else if (!std::strcmp(key, "label_lower_bound")) {
auto& labels = labels_lower_bound_.HostVector();
labels.resize(num);