[dask] prediction with categorical data. (#7708)

Jiaming Yuan 2022-03-10 00:21:48 +08:00 committed by GitHub
parent 68b6d6bbe2
commit a62a3d991d
4 changed files with 40 additions and 19 deletions


@@ -192,6 +192,22 @@ def _check_call(ret: int) -> None:
         raise XGBoostError(py_str(_LIB.XGBGetLastError()))
 
 
+def _has_categorical(booster: "Booster", data: Any) -> bool:
+    """Check whether the booster and input data for prediction contain categorical data.
+    """
+    from .data import _is_pandas_df, _is_cudf_df
+
+    if _is_pandas_df(data) or _is_cudf_df(data):
+        ft = booster.feature_types
+        if ft is None:
+            enable_categorical = False
+        else:
+            enable_categorical = any(f == "c" for f in ft)
+    else:
+        enable_categorical = False
+    return enable_categorical
+
+
 def build_info() -> dict:
     """Build information of XGBoost.  The returned value format is not stable. Also, please
     note that build time dependency is not the same as runtime dependency. For instance,
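The new helper only turns categorical handling on when the booster was trained with a categorical feature (feature type "c") and the prediction input is a pandas or cuDF DataFrame. The following stand-alone sketch of the same decision logic can be run outside XGBoost; the function name has_categorical and the plain isinstance check are illustrative stand-ins for the booster attribute and the _is_pandas_df/_is_cudf_df helpers used above.

from typing import Any, Optional, Sequence

import pandas as pd


def has_categorical(feature_types: Optional[Sequence[str]], data: Any) -> bool:
    """Sketch of the decision made by _has_categorical above."""
    if not isinstance(data, pd.DataFrame):  # stand-in for _is_pandas_df/_is_cudf_df
        return False
    if feature_types is None:
        return False
    # "c" marks a categorical feature in Booster.feature_types
    return any(f == "c" for f in feature_types)


df = pd.DataFrame({"x": pd.Categorical(["a", "b", "a"]), "y": [1.0, 2.0, 3.0]})
print(has_categorical(["c", "q"], df))     # True: booster has a categorical feature
print(has_categorical(None, df))           # False: no feature type info on the booster
print(has_categorical(["c", "q"], [[1]]))  # False: input is not a DataFrame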
@@ -2046,17 +2062,9 @@ class Booster:
                     f"got {data.shape[1]}"
                 )
 
-        from .data import _is_pandas_df, _transform_pandas_df
+        from .data import _is_pandas_df, _transform_pandas_df, _is_cudf_df
         from .data import _array_interface
 
-        if (
-            _is_pandas_df(data)
-            or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
-        ):
-            ft = self.feature_types
-            if ft is None:
-                enable_categorical = False
-            else:
-                enable_categorical = any(f == "c" for f in ft)
+        enable_categorical = _has_categorical(self, data)
         if _is_pandas_df(data):
             data, _, _ = _transform_pandas_df(data, enable_categorical)
@@ -2111,7 +2119,7 @@ class Booster:
                 )
             )
             return _prediction_output(shape, dims, preds, True)
-        if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
+        if _is_cudf_df(data):
             from .data import _cudf_array_interfaces, _transform_cudf_df
             data, cat_codes, _, _ = _transform_cudf_df(
                 data, None, None, enable_categorical
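With the two hunks above, in-place prediction on a DataFrame carrying pandas category dtypes picks up the booster's feature types automatically instead of relying on ad-hoc pandas/cuDF checks. A rough single-node sketch of how that is exercised from the Python API; the data shapes and parameter values are made up for illustration.

import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.default_rng(0)
X = pd.DataFrame(
    {
        "cat": pd.Categorical(rng.integers(0, 5, size=256)),
        "num": rng.normal(size=256),
    }
)
y = rng.normal(size=256)

# enable_categorical records feature type "c" on the resulting booster
reg = xgb.XGBRegressor(tree_method="hist", enable_categorical=True)
reg.fit(X, y)
booster = reg.get_booster()

# inplace_predict now routes through _has_categorical for the pandas/cuDF case
inplace = booster.inplace_predict(X)
via_dmatrix = booster.predict(xgb.DMatrix(X, enable_categorical=True))
np.testing.assert_allclose(inplace, via_dmatrix)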


@@ -56,7 +56,7 @@ from .compat import lazy_isinstance
 from .core import DMatrix, DeviceQuantileDMatrix, Booster, _expect, DataIter
 from .core import Objective, Metric
-from .core import _deprecate_positional_args
+from .core import _deprecate_positional_args, _has_categorical
 from .data import FeatNamesT
 from .training import train as worker_train
 from .tracker import RabitTracker, get_host_ip
@@ -1241,7 +1241,11 @@ async def _predict_async(
         booster: Booster, partition: Any, is_df: bool, columns: List[int], _: Any
     ) -> Any:
         with config.config_context(**global_config):
-            m = DMatrix(data=partition, missing=missing)
+            m = DMatrix(
+                data=partition,
+                missing=missing,
+                enable_categorical=_has_categorical(booster, partition)
+            )
             predt = booster.predict(
                 data=m,
                 output_margin=output_margin,
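On the dask side, _predict_async now builds each per-partition DMatrix with enable_categorical derived from the booster, so distributed prediction works on dask DataFrames containing category dtypes. A rough usage sketch against a LocalCluster follows; the dataset construction, partition counts, and hyperparameters are illustrative only.

import numpy as np
import pandas as pd
import dask.dataframe as dd
from distributed import Client, LocalCluster
from xgboost import dask as dxgb

with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
    pdf = pd.DataFrame(
        {
            "cat": pd.Categorical(np.random.randint(0, 4, size=1000)),
            "num": np.random.randn(1000),
        }
    )
    X = dd.from_pandas(pdf, npartitions=4)
    y = dd.from_pandas(pd.Series(np.random.randn(1000)), npartitions=4)

    reg = dxgb.DaskXGBRegressor(tree_method="hist", enable_categorical=True)
    reg.client = client
    reg.fit(X, y)

    # predict creates a DMatrix per partition; with this change each one is
    # constructed with enable_categorical=True whenever the booster expects it
    predt = dxgb.predict(client, reg.get_booster(), X).compute()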


@@ -466,12 +466,8 @@ def _from_dt_df(
     return handle, feature_names, feature_types
 
 
-def _is_cudf_df(data):
-    try:
-        import cudf
-    except ImportError:
-        return False
-    return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
+def _is_cudf_df(data) -> bool:
+    return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
 
 
 def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
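The rewritten _is_cudf_df relies on lazy_isinstance, which identifies the type by its module and class name, so cuDF never has to be imported on machines where it is not installed. A rough sketch of that idea (not the exact xgboost.compat implementation):

from typing import Any


def lazy_isinstance_sketch(instance: Any, module: str, name: str) -> bool:
    """Compare the class's module and name instead of importing the module."""
    cls = type(instance)
    return cls.__module__ == module and cls.__name__ == name


# A plain list is clearly not a cudf.core.dataframe.DataFrame, and the check
# runs even when cuDF is not importable at all.
print(lazy_isinstance_sketch([1, 2, 3], "cudf.core.dataframe", "DataFrame"))  # False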


@@ -288,10 +288,23 @@ def run_categorical(client: "Client", tree_method: str, X, X_onehot, y) -> None:
     reg.fit(X, y, eval_set=[(X, y)])
     assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
 
+    booster = reg.get_booster()
+    predt = xgb.dask.predict(client, booster, X).compute().values
+    inpredt = xgb.dask.inplace_predict(client, booster, X).compute().values
+
+    if hasattr(predt, "get"):
+        predt = predt.get()
+    if hasattr(inpredt, "get"):
+        inpredt = inpredt.get()
+
+    np.testing.assert_allclose(predt, inpredt)
+
 
 def test_categorical(client: "Client") -> None:
     X, y = make_categorical(client, 10000, 30, 13)
     X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
     run_categorical(client, "approx", X, X_onehot, y)
+    run_categorical(client, "hist", X, X_onehot, y)
 
 
 def test_dask_predict_shape_infer(client: "Client") -> None:
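The hasattr(..., "get") checks in the new test convert CuPy arrays (produced when the test runs on GPU) to NumPy before comparison; NumPy arrays have no .get method, so CPU results pass through unchanged. A minimal illustration of the same pattern that runs without a GPU; the helper name to_numpy is chosen here for illustration.

import numpy as np


def to_numpy(array):
    """Return a host copy, whether the input is a NumPy or a CuPy array."""
    # CuPy arrays expose .get() for device-to-host copies; NumPy arrays do not.
    return array.get() if hasattr(array, "get") else array


a = np.arange(4.0)
np.testing.assert_allclose(to_numpy(a), a)  # NumPy input is returned as-is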