Disable dense optimization in hist for distributed training. (#9272)

Jiaming Yuan authored 2023-06-10 02:31:34 +08:00, committed by GitHub
commit ea0deeca68
parent 8c1065f645
5 changed files with 44 additions and 10 deletions


@@ -1,6 +1,8 @@
"""Tests for dask shared by different test modules."""
import numpy as np
import pandas as pd
from dask import array as da
from dask import dataframe as dd
from distributed import Client
import xgboost as xgb
@@ -52,3 +54,22 @@ def check_init_estimation(tree_method: str, client: Client) -> None:
"""Test init estimation."""
check_init_estimation_reg(tree_method, client)
check_init_estimation_clf(tree_method, client)


def check_uneven_nan(client: Client, tree_method: str, n_workers: int) -> None:
    """Issue #9271: not every worker has missing values."""
    assert n_workers >= 2

    with client.as_current():
        clf = xgb.dask.DaskXGBClassifier(tree_method=tree_method)
        X = pd.DataFrame({"a": range(10000), "b": range(10000, 0, -1)})
        y = pd.Series([*[0] * 5000, *[1] * 5000])
X["a"][:3000:1000] = np.NaN
client.wait_for_workers(n_workers=n_workers)
clf.fit(
dd.from_pandas(X, npartitions=n_workers),
dd.from_pandas(y, npartitions=n_workers),
)
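
A minimal sketch of how a distributed test module might invoke this helper against a local Dask cluster. The test name, the parametrization over tree methods, the LocalCluster setup, and the xgboost.testing.dask import path are assumptions for illustration, not part of this diff:

# Hypothetical usage sketch; assumes the helper is importable from xgboost.testing.dask.
import pytest
from distributed import Client, LocalCluster

from xgboost.testing.dask import check_uneven_nan


@pytest.mark.parametrize("tree_method", ["hist", "approx"])
def test_uneven_nan(tree_method: str) -> None:
    n_workers = 2
    with LocalCluster(n_workers=n_workers) as cluster:
        with Client(cluster) as client:
            check_uneven_nan(client, tree_method, n_workers)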