From a62a3d991d8826719e13f6e7b502b9c0409bbaac Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 10 Mar 2022 00:21:48 +0800 Subject: [PATCH] [dask] prediction with categorical data. (#7708) --- python-package/xgboost/core.py | 30 +++++++++++++++++++----------- python-package/xgboost/dask.py | 8 ++++++-- python-package/xgboost/data.py | 8 ++------ tests/python/test_with_dask.py | 13 +++++++++++++ 4 files changed, 40 insertions(+), 19 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index cfe7058b0..aaae8b539 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -192,6 +192,22 @@ def _check_call(ret: int) -> None: raise XGBoostError(py_str(_LIB.XGBGetLastError())) +def _has_categorical(booster: "Booster", data: Any) -> bool: + """Check whether the booster and input data for prediction contain categorical data. + + """ + from .data import _is_pandas_df, _is_cudf_df + if _is_pandas_df(data) or _is_cudf_df(data): + ft = booster.feature_types + if ft is None: + enable_categorical = False + else: + enable_categorical = any(f == "c" for f in ft) + else: + enable_categorical = False + return enable_categorical + + def build_info() -> dict: """Build information of XGBoost. The returned value format is not stable. Also, please note that build time dependency is not the same as runtime dependency. For instance, @@ -2046,17 +2062,9 @@ class Booster: f"got {data.shape[1]}" ) - from .data import _is_pandas_df, _transform_pandas_df + from .data import _is_pandas_df, _transform_pandas_df, _is_cudf_df from .data import _array_interface - if ( - _is_pandas_df(data) - or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame") - ): - ft = self.feature_types - if ft is None: - enable_categorical = False - else: - enable_categorical = any(f == "c" for f in ft) + enable_categorical = _has_categorical(self, data) if _is_pandas_df(data): data, _, _ = _transform_pandas_df(data, enable_categorical) @@ -2111,7 +2119,7 @@ class Booster: ) ) return _prediction_output(shape, dims, preds, True) - if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"): + if _is_cudf_df(data): from .data import _cudf_array_interfaces, _transform_cudf_df data, cat_codes, _, _ = _transform_cudf_df( data, None, None, enable_categorical diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index b5f03c120..133d50160 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -56,7 +56,7 @@ from .compat import lazy_isinstance from .core import DMatrix, DeviceQuantileDMatrix, Booster, _expect, DataIter from .core import Objective, Metric -from .core import _deprecate_positional_args +from .core import _deprecate_positional_args, _has_categorical from .data import FeatNamesT from .training import train as worker_train from .tracker import RabitTracker, get_host_ip @@ -1241,7 +1241,11 @@ async def _predict_async( booster: Booster, partition: Any, is_df: bool, columns: List[int], _: Any ) -> Any: with config.config_context(**global_config): - m = DMatrix(data=partition, missing=missing) + m = DMatrix( + data=partition, + missing=missing, + enable_categorical=_has_categorical(booster, partition) + ) predt = booster.predict( data=m, output_margin=output_margin, diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 6fe2c56ee..adf1cff5c 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -466,12 +466,8 @@ def _from_dt_df( return handle, feature_names, feature_types -def _is_cudf_df(data): - try: - import cudf - except ImportError: - return False - return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame) +def _is_cudf_df(data) -> bool: + return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame") def _cudf_array_interfaces(data, cat_codes: list) -> bytes: diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index 55fd22e02..9a68f4453 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -288,10 +288,23 @@ def run_categorical(client: "Client", tree_method: str, X, X_onehot, y) -> None: reg.fit(X, y, eval_set=[(X, y)]) assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"]) + booster = reg.get_booster() + predt = xgb.dask.predict(client, booster, X).compute().values + inpredt = xgb.dask.inplace_predict(client, booster, X).compute().values + + if hasattr(predt, "get"): + predt = predt.get() + if hasattr(inpredt, "get"): + inpredt = inpredt.get() + + np.testing.assert_allclose(predt, inpredt) + + def test_categorical(client: "Client") -> None: X, y = make_categorical(client, 10000, 30, 13) X_onehot, _ = make_categorical(client, 10000, 30, 13, True) run_categorical(client, "approx", X, X_onehot, y) + run_categorical(client, "hist", X, X_onehot, y) def test_dask_predict_shape_infer(client: "Client") -> None: