[dask] prediction with categorical data. (#7708)

This commit is contained in:
Jiaming Yuan 2022-03-10 00:21:48 +08:00 committed by GitHub
parent 68b6d6bbe2
commit a62a3d991d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 40 additions and 19 deletions

View File

@ -192,6 +192,22 @@ def _check_call(ret: int) -> None:
raise XGBoostError(py_str(_LIB.XGBGetLastError()))
def _has_categorical(booster: "Booster", data: Any) -> bool:
    """Check whether the booster and input data for prediction contain categorical data.

    The result is ``True`` only when ``data`` is recognised by ``_is_pandas_df`` or
    ``_is_cudf_df`` and at least one entry of ``booster.feature_types`` equals
    ``"c"``.  Any other input, or a booster without feature types, yields ``False``.
    """
    from .data import _is_pandas_df, _is_cudf_df

    # Only dataframe inputs are considered for categorical handling here.
    if not (_is_pandas_df(data) or _is_cudf_df(data)):
        return False

    feature_types = booster.feature_types
    if feature_types is None:
        return False
    return any(ftype == "c" for ftype in feature_types)
def build_info() -> dict:
"""Build information of XGBoost. The returned value format is not stable. Also, please
note that build time dependency is not the same as runtime dependency. For instance,
@ -2046,17 +2062,9 @@ class Booster:
f"got {data.shape[1]}"
)
from .data import _is_pandas_df, _transform_pandas_df
from .data import _is_pandas_df, _transform_pandas_df, _is_cudf_df
from .data import _array_interface
if (
_is_pandas_df(data)
or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
):
ft = self.feature_types
if ft is None:
enable_categorical = False
else:
enable_categorical = any(f == "c" for f in ft)
enable_categorical = _has_categorical(self, data)
if _is_pandas_df(data):
data, _, _ = _transform_pandas_df(data, enable_categorical)
@ -2111,7 +2119,7 @@ class Booster:
)
)
return _prediction_output(shape, dims, preds, True)
if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
if _is_cudf_df(data):
from .data import _cudf_array_interfaces, _transform_cudf_df
data, cat_codes, _, _ = _transform_cudf_df(
data, None, None, enable_categorical

View File

@ -56,7 +56,7 @@ from .compat import lazy_isinstance
from .core import DMatrix, DeviceQuantileDMatrix, Booster, _expect, DataIter
from .core import Objective, Metric
from .core import _deprecate_positional_args
from .core import _deprecate_positional_args, _has_categorical
from .data import FeatNamesT
from .training import train as worker_train
from .tracker import RabitTracker, get_host_ip
@ -1241,7 +1241,11 @@ async def _predict_async(
booster: Booster, partition: Any, is_df: bool, columns: List[int], _: Any
) -> Any:
with config.config_context(**global_config):
m = DMatrix(data=partition, missing=missing)
m = DMatrix(
data=partition,
missing=missing,
enable_categorical=_has_categorical(booster, partition)
)
predt = booster.predict(
data=m,
output_margin=output_margin,

View File

@ -466,12 +466,8 @@ def _from_dt_df(
return handle, feature_names, feature_types
def _is_cudf_df(data):
try:
import cudf
except ImportError:
return False
return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
def _is_cudf_df(data) -> bool:
    """Return whether ``data`` is a ``DataFrame`` from ``cudf.core.dataframe``.

    The check is delegated to ``lazy_isinstance`` with the module and class
    names given as strings.
    """
    is_cudf_frame = lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
    return is_cudf_frame
def _cudf_array_interfaces(data, cat_codes: list) -> bytes:

View File

@ -288,10 +288,23 @@ def run_categorical(client: "Client", tree_method: str, X, X_onehot, y) -> None:
reg.fit(X, y, eval_set=[(X, y)])
assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
booster = reg.get_booster()
predt = xgb.dask.predict(client, booster, X).compute().values
inpredt = xgb.dask.inplace_predict(client, booster, X).compute().values
if hasattr(predt, "get"):
predt = predt.get()
if hasattr(inpredt, "get"):
inpredt = inpredt.get()
np.testing.assert_allclose(predt, inpredt)
def test_categorical(client: "Client") -> None:
    """Run the categorical checks with the ``approx`` and ``hist`` tree methods."""
    X, y = make_categorical(client, 10000, 30, 13)
    # Same generator arguments, with the extra trailing flag set to True.
    X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
    for tree_method in ("approx", "hist"):
        run_categorical(client, tree_method, X, X_onehot, y)
def test_dask_predict_shape_infer(client: "Client") -> None: