[dask] Add DaskXGBRanker (#6576)

* Initial support for distributed LTR using dask.

* Support `qid` in libxgboost.
* Refactor `predict` and `n_features_in_`, `best_[score/iteration/ntree_limit]`
  to avoid duplicated code.
* Define `DaskXGBRanker`.

The dask ranker doesn't support group structure, instead it uses query id and
convert to group ptr internally.
This commit is contained in:
Jiaming Yuan
2021-01-08 18:35:09 +08:00
committed by GitHub
parent 96d3d32265
commit 80065d571e
18 changed files with 755 additions and 351 deletions

View File

@@ -63,7 +63,7 @@ Json GenerateSparseColumn(std::string const& typestr, size_t kRows,
template <typename T>
Json Generate2dArrayInterface(int rows, int cols, std::string typestr,
thrust::device_vector<T>* p_data) {
thrust::device_vector<T> *p_data) {
auto& data = *p_data;
thrust::sequence(data.begin(), data.end());

View File

@@ -202,6 +202,24 @@ TEST(MetaInfo, LoadQid) {
}
}
TEST(MetaInfo, CPUQid) {
xgboost::MetaInfo info;
info.num_row_ = 100;
std::vector<uint32_t> qid(info.num_row_, 0);
for (size_t i = 0; i < qid.size(); ++i) {
qid[i] = i;
}
info.SetInfo("qid", qid.data(), xgboost::DataType::kUInt32, info.num_row_);
ASSERT_EQ(info.group_ptr_.size(), info.num_row_ + 1);
ASSERT_EQ(info.group_ptr_.front(), 0);
ASSERT_EQ(info.group_ptr_.back(), info.num_row_);
for (size_t i = 0; i < info.num_row_ + 1; ++i) {
ASSERT_EQ(info.group_ptr_[i], i);
}
}
TEST(MetaInfo, Validate) {
xgboost::MetaInfo info;
info.num_row_ = 10;

View File

@@ -4,6 +4,7 @@
#include <xgboost/data.h>
#include <xgboost/json.h>
#include <thrust/device_vector.h>
#include "test_array_interface.h"
#include "../../../src/common/device_helpers.cuh"
namespace xgboost {
@@ -105,6 +106,28 @@ TEST(MetaInfo, Group) {
EXPECT_ANY_THROW(info.SetInfo("group", float_str.c_str()));
}
TEST(MetaInfo, GPUQid) {
xgboost::MetaInfo info;
info.num_row_ = 100;
thrust::device_vector<uint32_t> qid(info.num_row_, 0);
for (size_t i = 0; i < qid.size(); ++i) {
qid[i] = i;
}
auto column = Generate2dArrayInterface(info.num_row_, 1, "<u4", &qid);
Json array{std::vector<Json>{column}};
std::string array_str;
Json::Dump(array, &array_str);
info.SetInfo("qid", array_str.c_str());
ASSERT_EQ(info.group_ptr_.size(), info.num_row_ + 1);
ASSERT_EQ(info.group_ptr_.front(), 0);
ASSERT_EQ(info.group_ptr_.back(), info.num_row_);
for (size_t i = 0; i < info.num_row_ + 1; ++i) {
ASSERT_EQ(info.group_ptr_[i], i);
}
}
TEST(MetaInfo, DeviceExtend) {
dh::safe_cuda(cudaSetDevice(0));
size_t const kRows = 100;