[dask] prediction with categorical data. (#7708)

Jiaming Yuan 2022-03-10 00:21:48 +08:00 committed by GitHub
parent 68b6d6bbe2
commit a62a3d991d
4 changed files with 40 additions and 19 deletions


@@ -192,6 +192,22 @@ def _check_call(ret: int) -> None:
         raise XGBoostError(py_str(_LIB.XGBGetLastError()))
 
 
+def _has_categorical(booster: "Booster", data: Any) -> bool:
+    """Check whether the booster and input data for prediction contain categorical data.
+    """
+    from .data import _is_pandas_df, _is_cudf_df
+
+    if _is_pandas_df(data) or _is_cudf_df(data):
+        ft = booster.feature_types
+        if ft is None:
+            enable_categorical = False
+        else:
+            enable_categorical = any(f == "c" for f in ft)
+    else:
+        enable_categorical = False
+    return enable_categorical
+
+
 def build_info() -> dict:
     """Build information of XGBoost.  The returned value format is not stable. Also, please
     note that build time dependency is not the same as runtime dependency. For instance,
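The new helper only turns categorical handling on when the booster was trained with a categorical feature (feature type "c") and the prediction input is a pandas or cuDF DataFrame. The following stand-alone sketch of the same decision logic can be run outside XGBoost; the function name has_categorical and the plain isinstance check are illustrative stand-ins for the booster attribute and the _is_pandas_df/_is_cudf_df helpers used above.

from typing import Any, Optional, Sequence

import pandas as pd


def has_categorical(feature_types: Optional[Sequence[str]], data: Any) -> bool:
    """Sketch of the decision made by _has_categorical above."""
    if not isinstance(data, pd.DataFrame):  # stand-in for _is_pandas_df/_is_cudf_df
        return False
    if feature_types is None:
        return False
    # "c" marks a categorical feature in Booster.feature_types
    return any(f == "c" for f in feature_types)


df = pd.DataFrame({"x": pd.Categorical(["a", "b", "a"]), "y": [1.0, 2.0, 3.0]})
print(has_categorical(["c", "q"], df))     # True: booster has a categorical feature
print(has_categorical(None, df))           # False: no feature type info on the booster
print(has_categorical(["c", "q"], [[1]]))  # False: input is not a DataFrame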
@@ -2046,17 +2062,9 @@ class Booster:
                     f"got {data.shape[1]}"
                 )
 
-        from .data import _is_pandas_df, _transform_pandas_df
+        from .data import _is_pandas_df, _transform_pandas_df, _is_cudf_df
         from .data import _array_interface
 
-        if (
-            _is_pandas_df(data)
-            or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
-        ):
-            ft = self.feature_types
-            if ft is None:
-                enable_categorical = False
-            else:
-                enable_categorical = any(f == "c" for f in ft)
+        enable_categorical = _has_categorical(self, data)
         if _is_pandas_df(data):
             data, _, _ = _transform_pandas_df(data, enable_categorical)
@@ -2111,7 +2119,7 @@ class Booster:
                 )
             )
             return _prediction_output(shape, dims, preds, True)
-        if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
+        if _is_cudf_df(data):
             from .data import _cudf_array_interfaces, _transform_cudf_df
             data, cat_codes, _, _ = _transform_cudf_df(
                 data, None, None, enable_categorical
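With the two hunks above, in-place prediction on a DataFrame carrying pandas category dtypes picks up the booster's feature types automatically instead of relying on ad-hoc pandas/cuDF checks. A rough single-node sketch of how that is exercised from the Python API; the data shapes and parameter values are made up for illustration.

import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.default_rng(0)
X = pd.DataFrame(
    {
        "cat": pd.Categorical(rng.integers(0, 5, size=256)),
        "num": rng.normal(size=256),
    }
)
y = rng.normal(size=256)

# enable_categorical records feature type "c" on the resulting booster
reg = xgb.XGBRegressor(tree_method="hist", enable_categorical=True)
reg.fit(X, y)
booster = reg.get_booster()

# inplace_predict now routes through _has_categorical for the pandas/cuDF case
inplace = booster.inplace_predict(X)
via_dmatrix = booster.predict(xgb.DMatrix(X, enable_categorical=True))
np.testing.assert_allclose(inplace, via_dmatrix)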


@@ -56,7 +56,7 @@ from .compat import lazy_isinstance
 from .core import DMatrix, DeviceQuantileDMatrix, Booster, _expect, DataIter
 from .core import Objective, Metric
-from .core import _deprecate_positional_args
+from .core import _deprecate_positional_args, _has_categorical
 from .data import FeatNamesT
 from .training import train as worker_train
 from .tracker import RabitTracker, get_host_ip
@@ -1241,7 +1241,11 @@ async def _predict_async(
         booster: Booster, partition: Any, is_df: bool, columns: List[int], _: Any
     ) -> Any:
         with config.config_context(**global_config):
-            m = DMatrix(data=partition, missing=missing)
+            m = DMatrix(
+                data=partition,
+                missing=missing,
+                enable_categorical=_has_categorical(booster, partition)
+            )
             predt = booster.predict(
                 data=m,
                 output_margin=output_margin,
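On the dask side, _predict_async now builds each per-partition DMatrix with enable_categorical derived from the booster, so distributed prediction works on dask DataFrames containing category dtypes. A rough usage sketch against a LocalCluster follows; the dataset construction, partition counts, and hyperparameters are illustrative only.

import numpy as np
import pandas as pd
import dask.dataframe as dd
from distributed import Client, LocalCluster
from xgboost import dask as dxgb

with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
    pdf = pd.DataFrame(
        {
            "cat": pd.Categorical(np.random.randint(0, 4, size=1000)),
            "num": np.random.randn(1000),
        }
    )
    X = dd.from_pandas(pdf, npartitions=4)
    y = dd.from_pandas(pd.Series(np.random.randn(1000)), npartitions=4)

    reg = dxgb.DaskXGBRegressor(tree_method="hist", enable_categorical=True)
    reg.client = client
    reg.fit(X, y)

    # predict creates a DMatrix per partition; with this change each one is
    # constructed with enable_categorical=True whenever the booster expects it
    predt = dxgb.predict(client, reg.get_booster(), X).compute()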


@@ -466,12 +466,8 @@ def _from_dt_df(
     return handle, feature_names, feature_types
 
 
-def _is_cudf_df(data):
-    try:
-        import cudf
-    except ImportError:
-        return False
-    return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
+def _is_cudf_df(data) -> bool:
+    return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
 
 
 def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
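The rewritten _is_cudf_df relies on lazy_isinstance, which identifies the type by its module and class name, so cuDF never has to be imported on machines where it is not installed. A rough sketch of that idea (not the exact xgboost.compat implementation):

from typing import Any


def lazy_isinstance_sketch(instance: Any, module: str, name: str) -> bool:
    """Compare the class's module and name instead of importing the module."""
    cls = type(instance)
    return cls.__module__ == module and cls.__name__ == name


# A plain list is clearly not a cudf.core.dataframe.DataFrame, and the check
# runs even when cuDF is not importable at all.
print(lazy_isinstance_sketch([1, 2, 3], "cudf.core.dataframe", "DataFrame"))  # False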


@@ -288,10 +288,23 @@ def run_categorical(client: "Client", tree_method: str, X, X_onehot, y) -> None:
     reg.fit(X, y, eval_set=[(X, y)])
     assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
 
+    booster = reg.get_booster()
+    predt = xgb.dask.predict(client, booster, X).compute().values
+    inpredt = xgb.dask.inplace_predict(client, booster, X).compute().values
+
+    if hasattr(predt, "get"):
+        predt = predt.get()
+    if hasattr(inpredt, "get"):
+        inpredt = inpredt.get()
+
+    np.testing.assert_allclose(predt, inpredt)
+
 
 def test_categorical(client: "Client") -> None:
     X, y = make_categorical(client, 10000, 30, 13)
     X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
     run_categorical(client, "approx", X, X_onehot, y)
+    run_categorical(client, "hist", X, X_onehot, y)
 
 
 def test_dask_predict_shape_infer(client: "Client") -> None:
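The hasattr(..., "get") checks in the new test convert CuPy arrays (produced when the test runs on GPU) to NumPy before comparison; NumPy arrays have no .get method, so CPU results pass through unchanged. A minimal illustration of the same pattern that runs without a GPU; the helper name to_numpy is chosen here for illustration.

import numpy as np


def to_numpy(array):
    """Return a host copy, whether the input is a NumPy or a CuPy array."""
    # CuPy arrays expose .get() for device-to-host copies; NumPy arrays do not.
    return array.get() if hasattr(array, "get") else array


a = np.arange(4.0)
np.testing.assert_allclose(to_numpy(a), a)  # NumPy input is returned as-is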