[dask] Fix empty partition with pandas input. (#7644)

An empty partition is different from an empty dataset.  In the former case, each worker has
non-empty dask collections, but each collection might contain empty partitions.
This commit is contained in:
Jiaming Yuan 2022-02-14 19:35:51 +08:00 committed by GitHub
parent 1f020a6097
commit b52c4e13b0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 34 additions and 12 deletions

View File

@ -1062,6 +1062,9 @@ def _maybe_dataframe(
prediction, columns=columns, dtype=numpy.float32, index=index
)
else:
if prediction.size == 0:
return DataFrame({}, columns=columns, dtype=numpy.float32, index=index)
prediction = DataFrame(
prediction, columns=columns, dtype=numpy.float32, index=index
)

View File

@ -84,15 +84,12 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_fl
out_preds->Resize(n);
ValidateBaseMarginShape(info.base_margin_, info.num_row_, n_classes);
out_preds->Copy(*base_margin);
} else {
if (out_preds->Empty()) {
out_preds->Resize(n, model.learner_model_param->base_score);
} else {
out_preds->Resize(n);
// cannot rely on the Resize to fill as it might skip if the size is already correct.
out_preds->Fill(model.learner_model_param->base_score);
}
}
}
} // namespace xgboost
namespace xgboost {

View File

@ -7,8 +7,6 @@ import numpy as np
import asyncio
import xgboost
import subprocess
import tempfile
import json
from collections import OrderedDict
from inspect import signature
from hypothesis import given, strategies, settings, note
@ -350,10 +348,34 @@ class TestDistributedGPU:
y = ddf[["y"]]
dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
bst = xgb.dask.train(client, parameters, dtrain, evals=[(dtrain, "train")])
predt = dxgb.predict(client, bst, X).compute().values
predt = dxgb.predict(client, bst, X).compute().values
cupy.testing.assert_allclose(predt, predt_empty)
predt = dxgb.predict(client, bst, dtrain).compute()
cupy.testing.assert_allclose(predt, predt_empty)
predt = dxgb.inplace_predict(client, bst, X).compute().values
cupy.testing.assert_allclose(predt, predt_empty)
df = df.to_pandas()
empty = df.iloc[:0]
ddf = dd.concat(
[dd.from_pandas(empty, npartitions=1)]
+ [dd.from_pandas(df, npartitions=3)]
+ [dd.from_pandas(df, npartitions=3)]
)
X = ddf[ddf.columns.difference(["y"])]
y = ddf[["y"]]
predt_empty = cupy.asnumpy(predt_empty)
predt = dxgb.predict(client, bst_empty, X).compute().values
np.testing.assert_allclose(predt, predt_empty)
in_predt = dxgb.inplace_predict(client, bst_empty, X).compute().values
np.testing.assert_allclose(predt, in_predt)
def test_empty_dmatrix_auc(self, local_cuda_cluster: LocalCUDACluster) -> None:
with Client(local_cuda_cluster) as client:
n_workers = len(_get_client_workers(client))