[dask] Add DaskXGBRanker (#6576)
* Initial support for distributed LTR using dask.
* Support `qid` in libxgboost.
* Refactor `predict`, `n_features_in_`, and `best_[score/iteration/ntree_limit]` to avoid duplicated code.
* Define `DaskXGBRanker`. The dask ranker doesn't support the group structure; instead it accepts query IDs and converts them to group pointers internally (see the usage sketch below).
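For reference, the new ranker can be used roughly as follows. This is a minimal sketch rather than code from the PR: the synthetic data, chunk sizes, and hyperparameters are illustrative, while `xgb.dask.DaskXGBRanker`, `fit(..., qid=..., eval_qid=...)`, and the sorted-`qid` requirement come from the tests added below.

# Minimal DaskXGBRanker sketch (illustrative values; assumes a local cluster).
import numpy as np
import dask.array as da
from dask.distributed import Client, LocalCluster
import xgboost as xgb


def run() -> None:
    rows, cols = 1000, 10
    rng = np.random.RandomState(1994)
    X = da.from_array(rng.randn(rows, cols), chunks=(250, cols))
    y = da.from_array(rng.randint(5, size=rows), chunks=250)
    # Sorted query ids replace the explicit `group` array; libxgboost turns
    # them into a group pointer (`group_ptr`) internally.
    qid = da.from_array(
        np.sort(rng.randint(0, 20, size=rows, dtype=np.uint32)), chunks=250)

    rank = xgb.dask.DaskXGBRanker(n_estimators=10)
    rank.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid],
             eval_metric=["ndcg"])
    print(rank.predict(X).compute()[:5])


if __name__ == "__main__":
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster:
        with Client(cluster):
            run()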
@@ -63,7 +63,7 @@ Json GenerateSparseColumn(std::string const& typestr, size_t kRows,
 template <typename T>
 Json Generate2dArrayInterface(int rows, int cols, std::string typestr,
-                              thrust::device_vector<T>* p_data) {
+                              thrust::device_vector<T> *p_data) {
   auto& data = *p_data;
   thrust::sequence(data.begin(), data.end());
@@ -202,6 +202,24 @@ TEST(MetaInfo, LoadQid) {
   }
 }

+TEST(MetaInfo, CPUQid) {
+  xgboost::MetaInfo info;
+  info.num_row_ = 100;
+  std::vector<uint32_t> qid(info.num_row_, 0);
+  for (size_t i = 0; i < qid.size(); ++i) {
+    qid[i] = i;
+  }
+
+  info.SetInfo("qid", qid.data(), xgboost::DataType::kUInt32, info.num_row_);
+  ASSERT_EQ(info.group_ptr_.size(), info.num_row_ + 1);
+  ASSERT_EQ(info.group_ptr_.front(), 0);
+  ASSERT_EQ(info.group_ptr_.back(), info.num_row_);
+
+  for (size_t i = 0; i < info.num_row_ + 1; ++i) {
+    ASSERT_EQ(info.group_ptr_[i], i);
+  }
+}
+
 TEST(MetaInfo, Validate) {
   xgboost::MetaInfo info;
   info.num_row_ = 10;
@@ -4,6 +4,7 @@
 #include <xgboost/data.h>
 #include <xgboost/json.h>
 #include <thrust/device_vector.h>
+#include "test_array_interface.h"
 #include "../../../src/common/device_helpers.cuh"

 namespace xgboost {
@@ -105,6 +106,28 @@ TEST(MetaInfo, Group) {
   EXPECT_ANY_THROW(info.SetInfo("group", float_str.c_str()));
 }

+TEST(MetaInfo, GPUQid) {
+  xgboost::MetaInfo info;
+  info.num_row_ = 100;
+  thrust::device_vector<uint32_t> qid(info.num_row_, 0);
+  for (size_t i = 0; i < qid.size(); ++i) {
+    qid[i] = i;
+  }
+  auto column = Generate2dArrayInterface(info.num_row_, 1, "<u4", &qid);
+  Json array{std::vector<Json>{column}};
+  std::string array_str;
+  Json::Dump(array, &array_str);
+  info.SetInfo("qid", array_str.c_str());
+  ASSERT_EQ(info.group_ptr_.size(), info.num_row_ + 1);
+  ASSERT_EQ(info.group_ptr_.front(), 0);
+  ASSERT_EQ(info.group_ptr_.back(), info.num_row_);
+
+  for (size_t i = 0; i < info.num_row_ + 1; ++i) {
+    ASSERT_EQ(info.group_ptr_[i], i);
+  }
+}
+
+
 TEST(MetaInfo, DeviceExtend) {
   dh::safe_cuda(cudaSetDevice(0));
   size_t const kRows = 100;
@@ -171,6 +171,22 @@ Arrow specification.'''
         with pytest.raises(xgb.core.XGBoostError):
            m.slice(rindex=[0, 1, 2])

+    @pytest.mark.skipif(**tm.no_cupy())
+    def test_qid(self):
+        import cupy as cp
+        rng = cp.random.RandomState(1994)
+        rows = 100
+        cols = 10
+        X, y = rng.randn(rows, cols), rng.randn(rows)
+        qid = rng.randint(low=0, high=10, size=rows, dtype=np.uint32)
+        qid = cp.sort(qid)
+
+        Xy = xgb.DMatrix(X, y)
+        Xy.set_info(qid=qid)
+        group_ptr = Xy.get_uint_info('group_ptr')
+        assert group_ptr[0] == 0
+        assert group_ptr[-1] == rows
+
     @pytest.mark.skipif(**tm.no_cupy())
     @pytest.mark.mgpu
     def test_specified_device(self):
@@ -239,6 +239,19 @@ class TestDMatrix:
         dtrain.get_float_info('base_margin')
         dtrain.get_uint_info('group_ptr')

+    def test_qid(self):
+        rows = 100
+        cols = 10
+        X, y = rng.randn(rows, cols), rng.randn(rows)
+        qid = rng.randint(low=0, high=10, size=rows, dtype=np.uint32)
+        qid = np.sort(qid)
+
+        Xy = xgb.DMatrix(X, y)
+        Xy.set_info(qid=qid)
+        group_ptr = Xy.get_uint_info('group_ptr')
+        assert group_ptr[0] == 0
+        assert group_ptr[-1] == rows
+
     def test_feature_weights(self):
         kRows = 10
         kCols = 50
@@ -1,5 +1,6 @@
 import numpy as np
 from scipy.sparse import csr_matrix
+import testing as tm
 import xgboost
 import os
 import itertools
@@ -79,22 +80,10 @@ class TestRanking:
         """
         Download and setup the test fixtures
         """
-        from sklearn.datasets import load_svmlight_files
-        # download the test data
         cls.dpath = 'demo/rank/'
-        src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
-        target = cls.dpath + '/MQ2008.zip'
-        urllib.request.urlretrieve(url=src, filename=target)
-
-        with zipfile.ZipFile(target, 'r') as f:
-            f.extractall(path=cls.dpath)
-
         (x_train, y_train, qid_train, x_test, y_test, qid_test,
-         x_valid, y_valid, qid_valid) = load_svmlight_files(
-             (cls.dpath + "MQ2008/Fold1/train.txt",
-              cls.dpath + "MQ2008/Fold1/test.txt",
-              cls.dpath + "MQ2008/Fold1/vali.txt"),
-             query_id=True, zero_based=False)
+         x_valid, y_valid, qid_valid) = tm.get_mq2008(cls.dpath)

         # instantiate the matrices
         cls.dtrain = xgboost.DMatrix(x_train, y_train)
         cls.dvalid = xgboost.DMatrix(x_valid, y_valid)
@@ -5,6 +5,7 @@ import pytest
 import xgboost as xgb
 import sys
 import numpy as np
+import scipy
 import json
 from typing import List, Tuple, Dict, Optional, Type, Any
 import asyncio
@@ -670,12 +671,56 @@ def run_aft_survival(client: "Client", dmatrix_t: Type) -> None:
     assert nloglik_rec['extreme'][-1] > 4.9


-def test_aft_survival() -> None:
+def test_dask_aft_survival() -> None:
     with LocalCluster(n_workers=kWorkers) as cluster:
         with Client(cluster) as client:
             run_aft_survival(client, DaskDMatrix)


+def test_dask_ranking(client: "Client") -> None:
+    dpath = "demo/rank/"
+    mq2008 = tm.get_mq2008(dpath)
+    data = []
+    for d in mq2008:
+        if isinstance(d, scipy.sparse.csr_matrix):
+            d[d == 0] = np.inf
+            d = d.toarray()
+            d[d == 0] = np.nan
+            d[np.isinf(d)] = 0
+            data.append(da.from_array(d))
+        else:
+            data.append(da.from_array(d))
+
+    (
+        x_train,
+        y_train,
+        qid_train,
+        x_test,
+        y_test,
+        qid_test,
+        x_valid,
+        y_valid,
+        qid_valid,
+    ) = data
+    qid_train = qid_train.astype(np.uint32)
+    qid_valid = qid_valid.astype(np.uint32)
+    qid_test = qid_test.astype(np.uint32)
+
+    rank = xgb.dask.DaskXGBRanker(n_estimators=2500)
+    rank.fit(
+        x_train,
+        y_train,
+        qid=qid_train,
+        eval_set=[(x_test, y_test), (x_train, y_train)],
+        eval_qid=[qid_test, qid_train],
+        eval_metric=["ndcg"],
+        verbose=True,
+        early_stopping_rounds=10,
+    )
+    assert rank.n_features_in_ == 46
+    assert rank.best_score > 0.98
+
+
 class TestWithDask:
     def test_global_config(self, client: "Client") -> None:
         X, y, _ = generate_array()
@@ -981,7 +1026,7 @@ class TestWithDask:
     def test_shap(self, client: "Client") -> None:
         from sklearn.datasets import load_boston, load_digits
         X, y = load_boston(return_X_y=True)
-        params = {'objective': 'reg:squarederror'}
+        params: Dict[str, Any] = {'objective': 'reg:squarederror'}
         self.run_shap(X, y, params, client)

         X, y = load_digits(return_X_y=True)
@@ -125,9 +125,11 @@ def test_ranking():
     x_train = np.random.rand(1000, 10)
     y_train = np.random.randint(5, size=1000)
     train_group = np.repeat(50, 20)
+
     x_valid = np.random.rand(200, 10)
     y_valid = np.random.randint(5, size=200)
     valid_group = np.repeat(50, 4)
+
     x_test = np.random.rand(100, 10)

     params = {'tree_method': 'exact', 'objective': 'rank:pairwise',
@@ -136,6 +138,7 @@ def test_ranking():
     model = xgb.sklearn.XGBRanker(**params)
     model.fit(x_train, y_train, group=train_group,
               eval_set=[(x_valid, y_valid)], eval_group=[valid_group])
+
     pred = model.predict(x_test)

     train_data = xgb.DMatrix(x_train, y_train)
@@ -1,5 +1,7 @@
 # coding: utf-8
 import os
+import urllib
+import zipfile
 import sys
 from contextlib import contextmanager
 from io import StringIO
@@ -209,6 +211,29 @@ def get_sparse():
     return X, y


+@memory.cache
+def get_mq2008(dpath):
+    from sklearn.datasets import load_svmlight_files
+
+    src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
+    target = dpath + '/MQ2008.zip'
+    if not os.path.exists(target):
+        urllib.request.urlretrieve(url=src, filename=target)
+
+    with zipfile.ZipFile(target, 'r') as f:
+        f.extractall(path=dpath)
+
+    (x_train, y_train, qid_train, x_test, y_test, qid_test,
+     x_valid, y_valid, qid_valid) = load_svmlight_files(
+         (dpath + "MQ2008/Fold1/train.txt",
+          dpath + "MQ2008/Fold1/test.txt",
+          dpath + "MQ2008/Fold1/vali.txt"),
+         query_id=True, zero_based=False)
+
+    return (x_train, y_train, qid_train, x_test, y_test, qid_test,
+            x_valid, y_valid, qid_valid)
+
+
 _unweighted_datasets_strategy = strategies.sampled_from(
     [TestDataset('boston', get_boston, 'reg:squarederror', 'rmse'),
      TestDataset('digits', get_digits, 'multi:softmax', 'mlogloss'),