[dask] Fix empty partition with pandas input. (#7644)

Empty partition is different from empty dataset. For the former case, each worker has non-empty dask collections, but each collection might contain empty partition.
2022-02-14 19:35:51 +08:00
parent 1f020a6097
commit b52c4e13b0
3 changed files with 34 additions and 12 deletions
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@@ -1062,6 +1062,9 @@ def _maybe_dataframe(
                prediction, columns=columns, dtype=numpy.float32, index=index
            )
        else:
+            if prediction.size == 0:
+                return DataFrame({}, columns=columns, dtype=numpy.float32, index=index)
+
            prediction = DataFrame(
                prediction, columns=columns, dtype=numpy.float32, index=index
            )
--- a/src/predictor/predictor.cc
+++ b/src/predictor/predictor.cc
@@ -84,14 +84,11 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_fl
    out_preds->Resize(n);
    ValidateBaseMarginShape(info.base_margin_, info.num_row_, n_classes);
    out_preds->Copy(*base_margin);
-  } else {
-    if (out_preds->Empty()) {
-      out_preds->Resize(n, model.learner_model_param->base_score);
  } else {
    out_preds->Resize(n);
+    // cannot rely on the Resize to fill as it might skip if the size is already correct.
    out_preds->Fill(model.learner_model_param->base_score);
  }
-  }
 }
 }  // namespace xgboost

--- a/tests/python-gpu/test_gpu_with_dask.py
+++ b/tests/python-gpu/test_gpu_with_dask.py
@@ -7,8 +7,6 @@ import numpy as np
 import asyncio
 import xgboost
 import subprocess
-import tempfile
-import json
 from collections import OrderedDict
 from inspect import signature
 from hypothesis import given, strategies, settings, note
@@ -321,9 +319,9 @@ class TestDistributedGPU:
            mult = 100
            df = cudf.DataFrame(
                {
-                    "a": [1,2,3,4,5.1] * mult,
-                    "b": [10,15,29.3,30,31] * mult,
-                    "y": [10,20,30,40.,50] * mult,
+                    "a": [1, 2, 3, 4, 5.1] * mult,
+                    "b": [10, 15, 29.3, 30, 31] * mult,
+                    "y": [10, 20, 30, 40., 50] * mult,
                }
            )
            parameters = {"tree_method": "gpu_hist", "debug_synchronize": True}
@@ -350,10 +348,34 @@ class TestDistributedGPU:
            y = ddf[["y"]]
            dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
            bst = xgb.dask.train(client, parameters, dtrain, evals=[(dtrain, "train")])
-            predt = dxgb.predict(client, bst, X).compute().values

+            predt = dxgb.predict(client, bst, X).compute().values
            cupy.testing.assert_allclose(predt, predt_empty)

+            predt = dxgb.predict(client, bst, dtrain).compute()
+            cupy.testing.assert_allclose(predt, predt_empty)
+
+            predt = dxgb.inplace_predict(client, bst, X).compute().values
+            cupy.testing.assert_allclose(predt, predt_empty)
+
+            df = df.to_pandas()
+            empty = df.iloc[:0]
+            ddf = dd.concat(
+                [dd.from_pandas(empty, npartitions=1)]
+                + [dd.from_pandas(df, npartitions=3)]
+                + [dd.from_pandas(df, npartitions=3)]
+            )
+            X = ddf[ddf.columns.difference(["y"])]
+            y = ddf[["y"]]
+
+            predt_empty = cupy.asnumpy(predt_empty)
+
+            predt = dxgb.predict(client, bst_empty, X).compute().values
+            np.testing.assert_allclose(predt, predt_empty)
+
+            in_predt = dxgb.inplace_predict(client, bst_empty, X).compute().values
+            np.testing.assert_allclose(predt, in_predt)
+
    def test_empty_dmatrix_auc(self, local_cuda_cluster: LocalCUDACluster) -> None:
        with Client(local_cuda_cluster) as client:
            n_workers = len(_get_client_workers(client))