[dask] prediction with categorical data. (#7708)
This commit is contained in:
parent
68b6d6bbe2
commit
a62a3d991d
@ -192,6 +192,22 @@ def _check_call(ret: int) -> None:
|
||||
raise XGBoostError(py_str(_LIB.XGBGetLastError()))
|
||||
|
||||
|
||||
def _has_categorical(booster: "Booster", data: Any) -> bool:
    """Tell whether prediction on ``data`` with ``booster`` involves categorical data.

    Parameters
    ----------
    booster :
        Model whose ``feature_types`` attribute is inspected.
    data :
        Prediction input.

    Returns
    -------
    ``True`` only when ``data`` is a pandas or cuDF DataFrame and at least one
    entry of ``booster.feature_types`` equals ``"c"``; ``False`` otherwise,
    including when ``booster.feature_types`` is ``None``.

    """
    from .data import _is_pandas_df, _is_cudf_df

    # Inputs other than pandas/cuDF dataframes are never reported as categorical.
    if not (_is_pandas_df(data) or _is_cudf_df(data)):
        return False

    feature_types = booster.feature_types
    if feature_types is None:
        return False

    return any(ftype == "c" for ftype in feature_types)
|
||||
|
||||
|
||||
def build_info() -> dict:
|
||||
"""Build information of XGBoost. The returned value format is not stable. Also, please
|
||||
note that build time dependency is not the same as runtime dependency. For instance,
|
||||
@ -2046,17 +2062,9 @@ class Booster:
|
||||
f"got {data.shape[1]}"
|
||||
)
|
||||
|
||||
from .data import _is_pandas_df, _transform_pandas_df
|
||||
from .data import _is_pandas_df, _transform_pandas_df, _is_cudf_df
|
||||
from .data import _array_interface
|
||||
if (
|
||||
_is_pandas_df(data)
|
||||
or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
|
||||
):
|
||||
ft = self.feature_types
|
||||
if ft is None:
|
||||
enable_categorical = False
|
||||
else:
|
||||
enable_categorical = any(f == "c" for f in ft)
|
||||
enable_categorical = _has_categorical(self, data)
|
||||
if _is_pandas_df(data):
|
||||
data, _, _ = _transform_pandas_df(data, enable_categorical)
|
||||
|
||||
@ -2111,7 +2119,7 @@ class Booster:
|
||||
)
|
||||
)
|
||||
return _prediction_output(shape, dims, preds, True)
|
||||
if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
|
||||
if _is_cudf_df(data):
|
||||
from .data import _cudf_array_interfaces, _transform_cudf_df
|
||||
data, cat_codes, _, _ = _transform_cudf_df(
|
||||
data, None, None, enable_categorical
|
||||
|
||||
@ -56,7 +56,7 @@ from .compat import lazy_isinstance
|
||||
|
||||
from .core import DMatrix, DeviceQuantileDMatrix, Booster, _expect, DataIter
|
||||
from .core import Objective, Metric
|
||||
from .core import _deprecate_positional_args
|
||||
from .core import _deprecate_positional_args, _has_categorical
|
||||
from .data import FeatNamesT
|
||||
from .training import train as worker_train
|
||||
from .tracker import RabitTracker, get_host_ip
|
||||
@ -1241,7 +1241,11 @@ async def _predict_async(
|
||||
booster: Booster, partition: Any, is_df: bool, columns: List[int], _: Any
|
||||
) -> Any:
|
||||
with config.config_context(**global_config):
|
||||
m = DMatrix(data=partition, missing=missing)
|
||||
m = DMatrix(
|
||||
data=partition,
|
||||
missing=missing,
|
||||
enable_categorical=_has_categorical(booster, partition)
|
||||
)
|
||||
predt = booster.predict(
|
||||
data=m,
|
||||
output_margin=output_margin,
|
||||
|
||||
@ -466,12 +466,8 @@ def _from_dt_df(
|
||||
return handle, feature_names, feature_types
|
||||
|
||||
|
||||
def _is_cudf_df(data):
|
||||
try:
|
||||
import cudf
|
||||
except ImportError:
|
||||
return False
|
||||
return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
|
||||
def _is_cudf_df(data) -> bool:
    """Return whether ``data`` is a cuDF DataFrame.

    The check is delegated to ``lazy_isinstance`` with the module path
    ``"cudf.core.dataframe"`` and the class name ``"DataFrame"``.
    """
    is_cudf_frame = lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
    return is_cudf_frame
|
||||
|
||||
|
||||
def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
|
||||
|
||||
@ -288,10 +288,23 @@ def run_categorical(client: "Client", tree_method: str, X, X_onehot, y) -> None:
|
||||
reg.fit(X, y, eval_set=[(X, y)])
|
||||
assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
|
||||
|
||||
booster = reg.get_booster()
|
||||
predt = xgb.dask.predict(client, booster, X).compute().values
|
||||
inpredt = xgb.dask.inplace_predict(client, booster, X).compute().values
|
||||
|
||||
if hasattr(predt, "get"):
|
||||
predt = predt.get()
|
||||
if hasattr(inpredt, "get"):
|
||||
inpredt = inpredt.get()
|
||||
|
||||
np.testing.assert_allclose(predt, inpredt)
|
||||
|
||||
|
||||
def test_categorical(client: "Client") -> None:
    """Run the categorical-data checks with the "approx" and "hist" tree methods.

    Both runs share the same categorical input and its one-hot counterpart,
    each produced by ``make_categorical``.
    """
    X, y = make_categorical(client, 10000, 30, 13)
    X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
    for tree_method in ("approx", "hist"):
        run_categorical(client, tree_method, X, X_onehot, y)
|
||||
|
||||
|
||||
def test_dask_predict_shape_infer(client: "Client") -> None:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user