[dask] prediction with categorical data. (#7708)
This commit is contained in:
parent
68b6d6bbe2
commit
a62a3d991d
@ -192,6 +192,22 @@ def _check_call(ret: int) -> None:
|
|||||||
raise XGBoostError(py_str(_LIB.XGBGetLastError()))
|
raise XGBoostError(py_str(_LIB.XGBGetLastError()))
|
||||||
|
|
||||||
|
|
||||||
|
def _has_categorical(booster: "Booster", data: Any) -> bool:
    """Check whether the booster and input data for prediction contain categorical data.

    The result is True only when ``data`` is a pandas or cuDF dataframe and at
    least one entry of ``booster.feature_types`` equals ``"c"``.  It is False in
    every other case, including when the booster carries no feature types.

    """
    from .data import _is_pandas_df, _is_cudf_df

    # Only dataframe inputs are considered; anything else is never categorical here.
    if not (_is_pandas_df(data) or _is_cudf_df(data)):
        return False

    feature_types = booster.feature_types
    if feature_types is None:
        return False
    return any(ftype == "c" for ftype in feature_types)
|
||||||
|
|
||||||
|
|
||||||
def build_info() -> dict:
|
def build_info() -> dict:
|
||||||
"""Build information of XGBoost. The returned value format is not stable. Also, please
|
"""Build information of XGBoost. The returned value format is not stable. Also, please
|
||||||
note that build time dependency is not the same as runtime dependency. For instance,
|
note that build time dependency is not the same as runtime dependency. For instance,
|
||||||
@ -2046,17 +2062,9 @@ class Booster:
|
|||||||
f"got {data.shape[1]}"
|
f"got {data.shape[1]}"
|
||||||
)
|
)
|
||||||
|
|
||||||
from .data import _is_pandas_df, _transform_pandas_df
|
from .data import _is_pandas_df, _transform_pandas_df, _is_cudf_df
|
||||||
from .data import _array_interface
|
from .data import _array_interface
|
||||||
if (
|
enable_categorical = _has_categorical(self, data)
|
||||||
_is_pandas_df(data)
|
|
||||||
or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
|
|
||||||
):
|
|
||||||
ft = self.feature_types
|
|
||||||
if ft is None:
|
|
||||||
enable_categorical = False
|
|
||||||
else:
|
|
||||||
enable_categorical = any(f == "c" for f in ft)
|
|
||||||
if _is_pandas_df(data):
|
if _is_pandas_df(data):
|
||||||
data, _, _ = _transform_pandas_df(data, enable_categorical)
|
data, _, _ = _transform_pandas_df(data, enable_categorical)
|
||||||
|
|
||||||
@ -2111,7 +2119,7 @@ class Booster:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
return _prediction_output(shape, dims, preds, True)
|
return _prediction_output(shape, dims, preds, True)
|
||||||
if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
|
if _is_cudf_df(data):
|
||||||
from .data import _cudf_array_interfaces, _transform_cudf_df
|
from .data import _cudf_array_interfaces, _transform_cudf_df
|
||||||
data, cat_codes, _, _ = _transform_cudf_df(
|
data, cat_codes, _, _ = _transform_cudf_df(
|
||||||
data, None, None, enable_categorical
|
data, None, None, enable_categorical
|
||||||
|
|||||||
@ -56,7 +56,7 @@ from .compat import lazy_isinstance
|
|||||||
|
|
||||||
from .core import DMatrix, DeviceQuantileDMatrix, Booster, _expect, DataIter
|
from .core import DMatrix, DeviceQuantileDMatrix, Booster, _expect, DataIter
|
||||||
from .core import Objective, Metric
|
from .core import Objective, Metric
|
||||||
from .core import _deprecate_positional_args
|
from .core import _deprecate_positional_args, _has_categorical
|
||||||
from .data import FeatNamesT
|
from .data import FeatNamesT
|
||||||
from .training import train as worker_train
|
from .training import train as worker_train
|
||||||
from .tracker import RabitTracker, get_host_ip
|
from .tracker import RabitTracker, get_host_ip
|
||||||
@ -1241,7 +1241,11 @@ async def _predict_async(
|
|||||||
booster: Booster, partition: Any, is_df: bool, columns: List[int], _: Any
|
booster: Booster, partition: Any, is_df: bool, columns: List[int], _: Any
|
||||||
) -> Any:
|
) -> Any:
|
||||||
with config.config_context(**global_config):
|
with config.config_context(**global_config):
|
||||||
m = DMatrix(data=partition, missing=missing)
|
m = DMatrix(
|
||||||
|
data=partition,
|
||||||
|
missing=missing,
|
||||||
|
enable_categorical=_has_categorical(booster, partition)
|
||||||
|
)
|
||||||
predt = booster.predict(
|
predt = booster.predict(
|
||||||
data=m,
|
data=m,
|
||||||
output_margin=output_margin,
|
output_margin=output_margin,
|
||||||
|
|||||||
@ -466,12 +466,8 @@ def _from_dt_df(
|
|||||||
return handle, feature_names, feature_types
|
return handle, feature_names, feature_types
|
||||||
|
|
||||||
|
|
||||||
def _is_cudf_df(data):
|
def _is_cudf_df(data) -> bool:
|
||||||
try:
|
return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
|
||||||
import cudf
|
|
||||||
except ImportError:
|
|
||||||
return False
|
|
||||||
return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
|
|
||||||
|
|
||||||
|
|
||||||
def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
|
def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
|
||||||
|
|||||||
@ -288,10 +288,23 @@ def run_categorical(client: "Client", tree_method: str, X, X_onehot, y) -> None:
|
|||||||
reg.fit(X, y, eval_set=[(X, y)])
|
reg.fit(X, y, eval_set=[(X, y)])
|
||||||
assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
|
assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
|
||||||
|
|
||||||
|
booster = reg.get_booster()
|
||||||
|
predt = xgb.dask.predict(client, booster, X).compute().values
|
||||||
|
inpredt = xgb.dask.inplace_predict(client, booster, X).compute().values
|
||||||
|
|
||||||
|
if hasattr(predt, "get"):
|
||||||
|
predt = predt.get()
|
||||||
|
if hasattr(inpredt, "get"):
|
||||||
|
inpredt = inpredt.get()
|
||||||
|
|
||||||
|
np.testing.assert_allclose(predt, inpredt)
|
||||||
|
|
||||||
|
|
||||||
def test_categorical(client: "Client") -> None:
|
def test_categorical(client: "Client") -> None:
|
||||||
X, y = make_categorical(client, 10000, 30, 13)
|
X, y = make_categorical(client, 10000, 30, 13)
|
||||||
X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
|
X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
|
||||||
run_categorical(client, "approx", X, X_onehot, y)
|
run_categorical(client, "approx", X, X_onehot, y)
|
||||||
|
run_categorical(client, "hist", X, X_onehot, y)
|
||||||
|
|
||||||
|
|
||||||
def test_dask_predict_shape_infer(client: "Client") -> None:
|
def test_dask_predict_shape_infer(client: "Client") -> None:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user