[breaking] Add prediction function for DMatrix and use inplace predict for dask. (#6668)
* Add a new API function for predicting on `DMatrix`. This function aligns with the rest of the `XGBoosterPredictFrom*` functions in the semantics of its function arguments. * Purge `ntree_limit` from libxgboost; use iteration instead. * [dask] Use `inplace_predict` by default for dask sklearn models. * [dask] Run prediction shape inference on the worker instead of on the client. The breaking change is in the Python sklearn `apply` function: it is now consistent with the other prediction functions, where `best_iteration` is used by default.
This commit is contained in:
@@ -34,6 +34,7 @@ dependencies:
|
||||
- llvmlite
|
||||
- pip:
|
||||
- shap
|
||||
- ipython # required by shap at import time.
|
||||
- guzzle_sphinx_theme
|
||||
- datatable
|
||||
- modin[all]
|
||||
|
||||
@@ -51,6 +51,53 @@ TEST(GBTree, SelectTreeMethod) {
|
||||
#endif // XGBOOST_USE_CUDA
|
||||
}
|
||||
|
||||
TEST(GBTree, PredictionCache) {
|
||||
size_t constexpr kRows = 100, kCols = 10;
|
||||
GenericParameter generic_param;
|
||||
generic_param.UpdateAllowUnknown(Args{});
|
||||
LearnerModelParam mparam;
|
||||
mparam.base_score = 0.5;
|
||||
mparam.num_feature = kCols;
|
||||
mparam.num_output_group = 1;
|
||||
|
||||
std::unique_ptr<GradientBooster> p_gbm {
|
||||
GradientBooster::Create("gbtree", &generic_param, &mparam)};
|
||||
auto& gbtree = dynamic_cast<gbm::GBTree&> (*p_gbm);
|
||||
|
||||
gbtree.Configure({{"tree_method", "hist"}});
|
||||
auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
|
||||
auto gpair = GenerateRandomGradients(kRows);
|
||||
PredictionCacheEntry out_predictions;
|
||||
gbtree.DoBoost(p_m.get(), &gpair, &out_predictions);
|
||||
|
||||
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 0, 0);
|
||||
ASSERT_EQ(1, out_predictions.version);
|
||||
std::vector<float> first_iter = out_predictions.predictions.HostVector();
|
||||
// Add 1 more boosted round
|
||||
gbtree.DoBoost(p_m.get(), &gpair, &out_predictions);
|
||||
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 0, 0);
|
||||
ASSERT_EQ(2, out_predictions.version);
|
||||
// Update the cache for all rounds
|
||||
out_predictions.version = 0;
|
||||
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 0, 0);
|
||||
ASSERT_EQ(2, out_predictions.version);
|
||||
|
||||
gbtree.DoBoost(p_m.get(), &gpair, &out_predictions);
|
||||
// drop the cache.
|
||||
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 1, 2);
|
||||
ASSERT_EQ(0, out_predictions.version);
|
||||
// half open set [1, 3)
|
||||
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 1, 3);
|
||||
ASSERT_EQ(0, out_predictions.version);
|
||||
// iteration end
|
||||
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 0, 2);
|
||||
ASSERT_EQ(2, out_predictions.version);
|
||||
// restart the cache when end iteration is smaller than cache version
|
||||
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 0, 1);
|
||||
ASSERT_EQ(1, out_predictions.version);
|
||||
ASSERT_EQ(out_predictions.predictions.HostVector(), first_iter);
|
||||
}
|
||||
|
||||
TEST(GBTree, WrongUpdater) {
|
||||
size_t constexpr kRows = 17;
|
||||
size_t constexpr kCols = 15;
|
||||
|
||||
@@ -32,7 +32,7 @@ TEST(CpuPredictor, Basic) {
|
||||
// Test predict batch
|
||||
PredictionCacheEntry out_predictions;
|
||||
cpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
|
||||
ASSERT_EQ(model.trees.size(), out_predictions.version);
|
||||
|
||||
std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
|
||||
for (size_t i = 0; i < out_predictions.predictions.Size(); i++) {
|
||||
ASSERT_EQ(out_predictions_h[i], 1.5);
|
||||
@@ -215,7 +215,7 @@ TEST(CpuPredictor, UpdatePredictionCache) {
|
||||
|
||||
PredictionCacheEntry out_predictions;
|
||||
// perform fair prediction on the same input data, should be equal to cached result
|
||||
gbm->PredictBatch(dmat.get(), &out_predictions, false, 0);
|
||||
gbm->PredictBatch(dmat.get(), &out_predictions, false, 0, 0);
|
||||
|
||||
std::vector<float> &out_predictions_h = out_predictions.predictions.HostVector();
|
||||
std::vector<float> &predtion_cache_from_train = predtion_cache.predictions.HostVector();
|
||||
|
||||
@@ -45,7 +45,6 @@ TEST(GPUPredictor, Basic) {
|
||||
PredictionCacheEntry cpu_out_predictions;
|
||||
|
||||
gpu_predictor->PredictBatch(dmat.get(), &gpu_out_predictions, model, 0);
|
||||
ASSERT_EQ(model.trees.size(), gpu_out_predictions.version);
|
||||
cpu_predictor->PredictBatch(dmat.get(), &cpu_out_predictions, model, 0);
|
||||
|
||||
std::vector<float>& gpu_out_predictions_h = gpu_out_predictions.predictions.HostVector();
|
||||
|
||||
@@ -64,10 +64,10 @@ void TestTrainingPrediction(size_t rows, size_t bins,
|
||||
}
|
||||
|
||||
HostDeviceVector<float> from_full;
|
||||
learner->Predict(p_full, false, &from_full);
|
||||
learner->Predict(p_full, false, &from_full, 0, 0);
|
||||
|
||||
HostDeviceVector<float> from_hist;
|
||||
learner->Predict(p_hist, false, &from_hist);
|
||||
learner->Predict(p_hist, false, &from_hist, 0, 0);
|
||||
|
||||
for (size_t i = 0; i < rows; ++i) {
|
||||
EXPECT_NEAR(from_hist.ConstHostVector()[i],
|
||||
@@ -157,20 +157,20 @@ void TestPredictionWithLesserFeatures(std::string predictor_name) {
|
||||
learner->SaveConfig(&config);
|
||||
ASSERT_EQ(get<String>(config["learner"]["gradient_booster"]["gbtree_train_param"]["predictor"]), predictor_name);
|
||||
|
||||
learner->Predict(m_test, false, &prediction);
|
||||
learner->Predict(m_test, false, &prediction, 0, 0);
|
||||
ASSERT_EQ(prediction.Size(), kRows);
|
||||
|
||||
auto m_invalid = RandomDataGenerator(kRows, kTrainCols + 1, 0.5).GenerateDMatrix(false);
|
||||
ASSERT_THROW({learner->Predict(m_invalid, false, &prediction);}, dmlc::Error);
|
||||
ASSERT_THROW({learner->Predict(m_invalid, false, &prediction, 0, 0);}, dmlc::Error);
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
HostDeviceVector<float> from_cpu;
|
||||
learner->SetParam("predictor", "cpu_predictor");
|
||||
learner->Predict(m_test, false, &from_cpu);
|
||||
learner->Predict(m_test, false, &from_cpu, 0, 0);
|
||||
|
||||
HostDeviceVector<float> from_cuda;
|
||||
learner->SetParam("predictor", "gpu_predictor");
|
||||
learner->Predict(m_test, false, &from_cuda);
|
||||
learner->Predict(m_test, false, &from_cuda, 0, 0);
|
||||
|
||||
auto const& h_cpu = from_cpu.ConstHostVector();
|
||||
auto const& h_gpu = from_cuda.ConstHostVector();
|
||||
|
||||
@@ -221,9 +221,10 @@ TEST(Learner, MultiThreadedPredict) {
|
||||
auto &entry = learner->GetThreadLocal().prediction_entry;
|
||||
HostDeviceVector<float> predictions;
|
||||
for (size_t iter = 0; iter < kIters; ++iter) {
|
||||
learner->Predict(p_data, false, &entry.predictions);
|
||||
learner->Predict(p_data, false, &predictions, 0, true); // leaf
|
||||
learner->Predict(p_data, false, &predictions, 0, false, true); // contribs
|
||||
learner->Predict(p_data, false, &entry.predictions, 0, 0);
|
||||
|
||||
learner->Predict(p_data, false, &predictions, 0, 0, false, true); // leaf
|
||||
learner->Predict(p_data, false, &predictions, 0, 0, false, false, true); // contribs
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -112,17 +112,24 @@ def _test_cupy_metainfo(DMatrixT):
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_cupy_training_with_sklearn():
|
||||
import cupy as cp
|
||||
|
||||
np.random.seed(1)
|
||||
cp.random.seed(1)
|
||||
X = cp.random.randn(50, 10, dtype='float32')
|
||||
y = (cp.random.randn(50, dtype='float32') > 0).astype('int8')
|
||||
X = cp.random.randn(50, 10, dtype="float32")
|
||||
y = (cp.random.randn(50, dtype="float32") > 0).astype("int8")
|
||||
weights = np.random.random(50) + 1
|
||||
cupy_weights = cp.array(weights)
|
||||
base_margin = np.random.random(50)
|
||||
cupy_base_margin = cp.array(base_margin)
|
||||
|
||||
clf = xgb.XGBClassifier(gpu_id=0, tree_method='gpu_hist', use_label_encoder=False)
|
||||
clf.fit(X, y, sample_weight=cupy_weights, base_margin=cupy_base_margin, eval_set=[(X, y)])
|
||||
clf = xgb.XGBClassifier(gpu_id=0, tree_method="gpu_hist", use_label_encoder=False)
|
||||
clf.fit(
|
||||
X,
|
||||
y,
|
||||
sample_weight=cupy_weights,
|
||||
base_margin=cupy_base_margin,
|
||||
eval_set=[(X, y)],
|
||||
)
|
||||
pred = clf.predict(X)
|
||||
assert np.array_equal(np.unique(pred), np.array([0, 1]))
|
||||
|
||||
|
||||
@@ -16,13 +16,15 @@ if sys.platform.startswith("win"):
|
||||
pytest.skip("Skipping dask tests on Windows", allow_module_level=True)
|
||||
|
||||
sys.path.append("tests/python")
|
||||
from test_with_dask import run_empty_dmatrix_reg # noqa
|
||||
from test_with_dask import run_empty_dmatrix_cls # noqa
|
||||
from test_with_dask import _get_client_workers # noqa
|
||||
from test_with_dask import generate_array # noqa
|
||||
from test_with_dask import kCols as random_cols # noqa
|
||||
from test_with_dask import suppress # noqa
|
||||
import testing as tm # noqa
|
||||
from test_with_dask import run_empty_dmatrix_reg # noqa
|
||||
from test_with_dask import run_boost_from_prediction # noqa
|
||||
from test_with_dask import run_dask_classifier # noqa
|
||||
from test_with_dask import run_empty_dmatrix_cls # noqa
|
||||
from test_with_dask import _get_client_workers # noqa
|
||||
from test_with_dask import generate_array # noqa
|
||||
from test_with_dask import kCols as random_cols # noqa
|
||||
from test_with_dask import suppress # noqa
|
||||
import testing as tm # noqa
|
||||
|
||||
|
||||
try:
|
||||
@@ -132,9 +134,9 @@ def run_gpu_hist(
|
||||
num_rounds: int,
|
||||
dataset: tm.TestDataset,
|
||||
DMatrixT: Type,
|
||||
client: Client
|
||||
client: Client,
|
||||
) -> None:
|
||||
params['tree_method'] = 'gpu_hist'
|
||||
params["tree_method"] = "gpu_hist"
|
||||
params = dataset.set_params(params)
|
||||
# It doesn't make sense to distribute a completely
|
||||
# empty dataset.
|
||||
@@ -143,26 +145,40 @@ def run_gpu_hist(
|
||||
|
||||
chunk = 128
|
||||
X = to_cp(dataset.X, DMatrixT)
|
||||
X = da.from_array(X,
|
||||
chunks=(chunk, dataset.X.shape[1]))
|
||||
X = da.from_array(X, chunks=(chunk, dataset.X.shape[1]))
|
||||
y = to_cp(dataset.y, DMatrixT)
|
||||
y = da.from_array(y, chunks=(chunk, ))
|
||||
y = da.from_array(y, chunks=(chunk,))
|
||||
if dataset.w is not None:
|
||||
w = to_cp(dataset.w, DMatrixT)
|
||||
w = da.from_array(w, chunks=(chunk, ))
|
||||
w = da.from_array(w, chunks=(chunk,))
|
||||
else:
|
||||
w = None
|
||||
|
||||
if DMatrixT is dxgb.DaskDeviceQuantileDMatrix:
|
||||
m = DMatrixT(client, data=X, label=y, weight=w,
|
||||
max_bin=params.get('max_bin', 256))
|
||||
m = DMatrixT(
|
||||
client, data=X, label=y, weight=w, max_bin=params.get("max_bin", 256)
|
||||
)
|
||||
else:
|
||||
m = DMatrixT(client, data=X, label=y, weight=w)
|
||||
history = dxgb.train(client, params=params, dtrain=m,
|
||||
num_boost_round=num_rounds,
|
||||
evals=[(m, 'train')])['history']
|
||||
history = dxgb.train(
|
||||
client,
|
||||
params=params,
|
||||
dtrain=m,
|
||||
num_boost_round=num_rounds,
|
||||
evals=[(m, "train")],
|
||||
)["history"]
|
||||
note(history)
|
||||
assert tm.non_increasing(history['train'][dataset.metric])
|
||||
assert tm.non_increasing(history["train"][dataset.metric])
|
||||
|
||||
|
||||
def test_boost_from_prediction(local_cuda_cluster: LocalCUDACluster) -> None:
|
||||
import cudf
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
with Client(local_cuda_cluster) as client:
|
||||
X_, y_ = load_breast_cancer(return_X_y=True)
|
||||
X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas)
|
||||
y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas)
|
||||
run_boost_from_prediction(X, y, "gpu_hist", client)
|
||||
|
||||
|
||||
class TestDistributedGPU:
|
||||
@@ -246,6 +262,20 @@ class TestDistributedGPU:
|
||||
dump = booster.get_dump(dump_format='json')
|
||||
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cudf())
|
||||
@pytest.mark.skipif(**tm.no_dask())
|
||||
@pytest.mark.skipif(**tm.no_dask_cuda())
|
||||
@pytest.mark.parametrize("model", ["boosting"])
|
||||
def test_dask_classifier(self, model, local_cuda_cluster: LocalCUDACluster) -> None:
|
||||
import dask_cudf
|
||||
with Client(local_cuda_cluster) as client:
|
||||
X_, y_, w_ = generate_array(with_weights=True)
|
||||
y_ = (y_ * 10).astype(np.int32)
|
||||
X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_))
|
||||
y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_))
|
||||
w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_))
|
||||
run_dask_classifier(X, y, w, model, client)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_dask())
|
||||
@pytest.mark.skipif(**tm.no_dask_cuda())
|
||||
@pytest.mark.mgpu
|
||||
|
||||
@@ -434,7 +434,13 @@ class TestModels:
|
||||
booster[...:end] = booster
|
||||
|
||||
sliced_0 = booster[1:3]
|
||||
np.testing.assert_allclose(
|
||||
booster.predict(dtrain, iteration_range=(1, 3)), sliced_0.predict(dtrain)
|
||||
)
|
||||
sliced_1 = booster[3:7]
|
||||
np.testing.assert_allclose(
|
||||
booster.predict(dtrain, iteration_range=(3, 7)), sliced_1.predict(dtrain)
|
||||
)
|
||||
|
||||
predt_0 = sliced_0.predict(dtrain, output_margin=True)
|
||||
predt_1 = sliced_1.predict(dtrain, output_margin=True)
|
||||
|
||||
@@ -47,30 +47,27 @@ def run_predict_leaf(predictor):
|
||||
empty_leaf = booster.predict(empty, pred_leaf=True)
|
||||
assert empty_leaf.shape[0] == 0
|
||||
|
||||
leaf = booster.predict(m, pred_leaf=True)
|
||||
leaf = booster.predict(m, pred_leaf=True, strict_shape=True)
|
||||
assert leaf.shape[0] == rows
|
||||
assert leaf.shape[1] == classes * num_parallel_tree * num_boost_round
|
||||
assert leaf.shape[1] == num_boost_round
|
||||
assert leaf.shape[2] == classes
|
||||
assert leaf.shape[3] == num_parallel_tree
|
||||
|
||||
for i in range(rows):
|
||||
row = leaf[i, ...]
|
||||
for j in range(num_boost_round):
|
||||
start = classes * num_parallel_tree * j
|
||||
end = classes * num_parallel_tree * (j + 1)
|
||||
layer = row[start: end]
|
||||
for c in range(classes):
|
||||
tree_group = layer[c * num_parallel_tree: (c + 1) * num_parallel_tree]
|
||||
for k in range(classes):
|
||||
tree_group = leaf[i, j, k, :]
|
||||
assert tree_group.shape[0] == num_parallel_tree
|
||||
# no subsampling so tree in same forest should output same
|
||||
# leaf.
|
||||
# No sampling, all trees within forest are the same
|
||||
assert np.all(tree_group == tree_group[0])
|
||||
|
||||
ntree_limit = 2
|
||||
sliced = booster.predict(
|
||||
m, pred_leaf=True, ntree_limit=num_parallel_tree * ntree_limit
|
||||
m, pred_leaf=True, ntree_limit=num_parallel_tree * ntree_limit, strict_shape=True
|
||||
)
|
||||
first = sliced[0, ...]
|
||||
|
||||
assert first.shape[0] == classes * num_parallel_tree * ntree_limit
|
||||
assert np.prod(first.shape) == classes * num_parallel_tree * ntree_limit
|
||||
return leaf
|
||||
|
||||
|
||||
@@ -78,6 +75,23 @@ def test_predict_leaf():
|
||||
run_predict_leaf('cpu_predictor')
|
||||
|
||||
|
||||
def test_predict_shape():
|
||||
from sklearn.datasets import load_boston
|
||||
X, y = load_boston(return_X_y=True)
|
||||
reg = xgb.XGBRegressor(n_estimators=1)
|
||||
reg.fit(X, y)
|
||||
predt = reg.get_booster().predict(xgb.DMatrix(X), strict_shape=True)
|
||||
assert len(predt.shape) == 2
|
||||
assert predt.shape[0] == X.shape[0]
|
||||
assert predt.shape[1] == 1
|
||||
|
||||
contrib = reg.get_booster().predict(
|
||||
xgb.DMatrix(X), pred_contribs=True, strict_shape=True
|
||||
)
|
||||
assert len(contrib.shape) == 3
|
||||
assert contrib.shape[1] == 1
|
||||
|
||||
|
||||
class TestInplacePredict:
|
||||
'''Tests for running inplace prediction'''
|
||||
@classmethod
|
||||
@@ -92,8 +106,7 @@ class TestInplacePredict:
|
||||
|
||||
dtrain = xgb.DMatrix(cls.X, cls.y)
|
||||
|
||||
cls.booster = xgb.train({'tree_method': 'hist'},
|
||||
dtrain, num_boost_round=10)
|
||||
cls.booster = xgb.train({'tree_method': 'hist'}, dtrain, num_boost_round=10)
|
||||
|
||||
cls.test = xgb.DMatrix(cls.X[:10, ...])
|
||||
|
||||
|
||||
@@ -159,12 +159,9 @@ def test_dask_predict_shape_infer(client: "Client") -> None:
|
||||
assert prediction.shape[1] == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
|
||||
def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
X_, y_ = load_breast_cancer(return_X_y=True)
|
||||
|
||||
X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
|
||||
def run_boost_from_prediction(
|
||||
X: xgb.dask._DaskCollection, y: xgb.dask._DaskCollection, tree_method: str, client: "Client"
|
||||
) -> None:
|
||||
model_0 = xgb.dask.DaskXGBClassifier(
|
||||
learning_rate=0.3, random_state=0, n_estimators=4,
|
||||
tree_method=tree_method)
|
||||
@@ -202,6 +199,30 @@ def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
|
||||
assert margined_res[i] < unmargined_res[i]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
|
||||
def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
X_, y_ = load_breast_cancer(return_X_y=True)
|
||||
X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
|
||||
run_boost_from_prediction(X, y, tree_method, client)
|
||||
|
||||
|
||||
def test_inplace_predict(client: "Client") -> None:
|
||||
from sklearn.datasets import load_boston
|
||||
X_, y_ = load_boston(return_X_y=True)
|
||||
X, y = dd.from_array(X_, chunksize=32), dd.from_array(y_, chunksize=32)
|
||||
reg = xgb.dask.DaskXGBRegressor(n_estimators=4).fit(X, y)
|
||||
booster = reg.get_booster()
|
||||
base_margin = y
|
||||
|
||||
inplace = xgb.dask.inplace_predict(
|
||||
client, booster, X, base_margin=base_margin
|
||||
).compute()
|
||||
Xy = xgb.dask.DaskDMatrix(client, X, base_margin=base_margin)
|
||||
copied = xgb.dask.predict(client, booster, Xy).compute()
|
||||
np.testing.assert_allclose(inplace, copied)
|
||||
|
||||
|
||||
def test_dask_missing_value_reg(client: "Client") -> None:
|
||||
X_0 = np.ones((20 // 2, kCols))
|
||||
X_1 = np.zeros((20 // 2, kCols))
|
||||
@@ -288,10 +309,13 @@ def test_dask_regressor(model: str, client: "Client") -> None:
|
||||
assert forest == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["boosting", "rf"])
|
||||
def test_dask_classifier(model: str, client: "Client") -> None:
|
||||
X, y, w = generate_array(with_weights=True)
|
||||
y = (y * 10).astype(np.int32)
|
||||
def run_dask_classifier(
|
||||
X: xgb.dask._DaskCollection,
|
||||
y: xgb.dask._DaskCollection,
|
||||
w: xgb.dask._DaskCollection,
|
||||
model: str,
|
||||
client: "Client",
|
||||
) -> None:
|
||||
if model == "boosting":
|
||||
classifier = xgb.dask.DaskXGBClassifier(
|
||||
verbosity=1, n_estimators=2, eval_metric="merror"
|
||||
@@ -306,14 +330,13 @@ def test_dask_classifier(model: str, client: "Client") -> None:
|
||||
|
||||
classifier.client = client
|
||||
classifier.fit(X, y, sample_weight=w, eval_set=[(X, y)])
|
||||
prediction = classifier.predict(X)
|
||||
prediction = classifier.predict(X).compute()
|
||||
|
||||
assert prediction.ndim == 1
|
||||
assert prediction.shape[0] == kRows
|
||||
|
||||
history = classifier.evals_result()
|
||||
|
||||
assert isinstance(prediction, da.Array)
|
||||
assert isinstance(history, dict)
|
||||
|
||||
assert list(history.keys())[0] == "validation_0"
|
||||
@@ -332,7 +355,7 @@ def test_dask_classifier(model: str, client: "Client") -> None:
|
||||
assert forest == 2
|
||||
|
||||
# Test .predict_proba()
|
||||
probas = classifier.predict_proba(X)
|
||||
probas = classifier.predict_proba(X).compute()
|
||||
assert classifier.n_classes_ == 10
|
||||
assert probas.ndim == 2
|
||||
assert probas.shape[0] == kRows
|
||||
@@ -341,18 +364,33 @@ def test_dask_classifier(model: str, client: "Client") -> None:
|
||||
cls_booster = classifier.get_booster()
|
||||
single_node_proba = cls_booster.inplace_predict(X.compute())
|
||||
|
||||
np.testing.assert_allclose(single_node_proba, probas.compute())
|
||||
# test shared by CPU and GPU
|
||||
if isinstance(single_node_proba, np.ndarray):
|
||||
np.testing.assert_allclose(single_node_proba, probas)
|
||||
else:
|
||||
import cupy
|
||||
cupy.testing.assert_allclose(single_node_proba, probas)
|
||||
|
||||
# Test with dataframe.
|
||||
X_d = dd.from_dask_array(X)
|
||||
y_d = dd.from_dask_array(y)
|
||||
classifier.fit(X_d, y_d)
|
||||
# Test with dataframe, not shared with GPU as cupy doesn't work well with da.unique.
|
||||
if isinstance(X, da.Array):
|
||||
X_d: dd.DataFrame = X.to_dask_dataframe()
|
||||
|
||||
assert classifier.n_classes_ == 10
|
||||
prediction = classifier.predict(X_d).compute()
|
||||
assert classifier.n_classes_ == 10
|
||||
prediction_df = classifier.predict(X_d).compute()
|
||||
|
||||
assert prediction.ndim == 1
|
||||
assert prediction.shape[0] == kRows
|
||||
assert prediction_df.ndim == 1
|
||||
assert prediction_df.shape[0] == kRows
|
||||
np.testing.assert_allclose(prediction_df, prediction)
|
||||
|
||||
probas = classifier.predict_proba(X).compute()
|
||||
np.testing.assert_allclose(single_node_proba, probas)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["boosting", "rf"])
|
||||
def test_dask_classifier(model: str, client: "Client") -> None:
|
||||
X, y, w = generate_array(with_weights=True)
|
||||
y = (y * 10).astype(np.int32)
|
||||
run_dask_classifier(X, y, w, model, client)
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
@@ -913,9 +951,9 @@ class TestWithDask:
|
||||
train = xgb.dask.DaskDMatrix(client, dX, dy)
|
||||
|
||||
dX = dd.from_array(X)
|
||||
dX = client.persist(dX, workers={dX: workers[1]})
|
||||
dX = client.persist(dX, workers=workers[1])
|
||||
dy = dd.from_array(y)
|
||||
dy = client.persist(dy, workers={dy: workers[1]})
|
||||
dy = client.persist(dy, workers=workers[1])
|
||||
valid = xgb.dask.DaskDMatrix(client, dX, dy)
|
||||
|
||||
merged = xgb.dask._get_workers_from_data(train, evals=[(valid, 'Valid')])
|
||||
@@ -1060,6 +1098,16 @@ class TestWithDask:
|
||||
assert_shape(shap.shape)
|
||||
assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin, 1e-5, 1e-5)
|
||||
|
||||
X = dd.from_dask_array(X).repartition(npartitions=32)
|
||||
y = dd.from_dask_array(y).repartition(npartitions=32)
|
||||
shap_df = xgb.dask.predict(
|
||||
client, booster, X, pred_contribs=True, validate_features=False
|
||||
).compute()
|
||||
assert_shape(shap_df.shape)
|
||||
assert np.allclose(
|
||||
np.sum(shap_df, axis=len(shap_df.shape) - 1), margin, 1e-5, 1e-5
|
||||
)
|
||||
|
||||
def run_shap_cls_sklearn(self, X: Any, y: Any, client: "Client") -> None:
|
||||
X, y = da.from_array(X, chunks=(32, -1)), da.from_array(y, chunks=32)
|
||||
cls = xgb.dask.DaskXGBClassifier(n_estimators=4)
|
||||
|
||||
Reference in New Issue
Block a user