Support Series and Python primitives in inplace_predict and QDM (#8547)

This commit is contained in:
Jiaming Yuan
2022-12-17 00:15:15 +08:00
committed by GitHub
parent a10e4cba4e
commit f6effa1734
5 changed files with 84 additions and 46 deletions

View File

@@ -2220,17 +2220,15 @@ class Booster:
preds = ctypes.POINTER(ctypes.c_float)()
# once caching is supported, we can pass id(data) as cache id.
args = {
"type": 0,
"training": False,
"iteration_begin": iteration_range[0],
"iteration_end": iteration_range[1],
"missing": missing,
"strict_shape": strict_shape,
"cache_id": 0,
}
if predict_type == "margin":
args["type"] = 1
args = make_jcargs(
type=1 if predict_type == "margin" else 0,
training=False,
iteration_begin=iteration_range[0],
iteration_end=iteration_range[1],
missing=missing,
strict_shape=strict_shape,
cache_id=0,
)
shape = ctypes.POINTER(c_bst_ulong)()
dims = c_bst_ulong()
@@ -2243,6 +2241,29 @@ class Booster:
proxy = None
p_handle = ctypes.c_void_p()
assert proxy is None or isinstance(proxy, _ProxyDMatrix)
from .data import (
_array_interface,
_is_cudf_df,
_is_cupy_array,
_is_list,
_is_pandas_df,
_is_pandas_series,
_is_tuple,
_transform_pandas_df,
)
enable_categorical = True
if _is_pandas_series(data):
import pandas as pd
data = pd.DataFrame(data)
if _is_pandas_df(data):
data, fns, _ = _transform_pandas_df(data, enable_categorical)
if validate_features:
self._validate_features(fns)
if _is_list(data) or _is_tuple(data):
data = np.array(data)
if validate_features:
if not hasattr(data, "shape"):
raise TypeError(
@@ -2254,20 +2275,6 @@ class Booster:
f"got {data.shape[1]}"
)
from .data import (
_array_interface,
_is_cudf_df,
_is_cupy_array,
_is_pandas_df,
_transform_pandas_df,
)
enable_categorical = True
if _is_pandas_df(data):
data, fns, _ = _transform_pandas_df(data, enable_categorical)
if validate_features:
self._validate_features(fns)
if isinstance(data, np.ndarray):
from .data import _ensure_np_dtype
@@ -2276,7 +2283,7 @@ class Booster:
_LIB.XGBoosterPredictFromDense(
self.handle,
_array_interface(data),
from_pystr_to_cstr(json.dumps(args)),
args,
p_handle,
ctypes.byref(shape),
ctypes.byref(dims),
@@ -2293,7 +2300,7 @@ class Booster:
_array_interface(csr.indices),
_array_interface(csr.data),
c_bst_ulong(csr.shape[1]),
from_pystr_to_cstr(json.dumps(args)),
args,
p_handle,
ctypes.byref(shape),
ctypes.byref(dims),
@@ -2310,7 +2317,7 @@ class Booster:
_LIB.XGBoosterPredictFromCudaArray(
self.handle,
interface_str,
from_pystr_to_cstr(json.dumps(args)),
args,
p_handle,
ctypes.byref(shape),
ctypes.byref(dims),
@@ -2331,7 +2338,7 @@ class Booster:
_LIB.XGBoosterPredictFromCudaColumnar(
self.handle,
interfaces_str,
from_pystr_to_cstr(json.dumps(args)),
args,
p_handle,
ctypes.byref(shape),
ctypes.byref(dims),

View File

@@ -958,12 +958,12 @@ def dispatch_data_backend(
return _from_list(data, missing, threads, feature_names, feature_types)
if _is_tuple(data):
return _from_tuple(data, missing, threads, feature_names, feature_types)
if _is_pandas_df(data):
return _from_pandas_df(data, enable_categorical, missing, threads,
feature_names, feature_types)
if _is_pandas_series(data):
return _from_pandas_series(
data, missing, threads, enable_categorical, feature_names, feature_types
import pandas as pd
data = pd.DataFrame(data)
if _is_pandas_df(data):
return _from_pandas_df(
data, enable_categorical, missing, threads, feature_names, feature_types
)
if _is_cudf_df(data) or _is_cudf_ser(data):
return _from_cudf_df(
@@ -1205,6 +1205,9 @@ def _proxy_transform(
return data, None, feature_names, feature_types
if _is_scipy_csr(data):
return data, None, feature_names, feature_types
if _is_pandas_series(data):
import pandas as pd
data = pd.DataFrame(data)
if _is_pandas_df(data):
arr, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types

View File

@@ -40,6 +40,7 @@ def np_dtypes(
for dtype in dtypes:
X = np.array(orig, dtype=dtype)
yield orig, X
yield orig.tolist(), X.tolist()
for dtype in dtypes:
X = np.array(orig, dtype=dtype)
@@ -101,6 +102,11 @@ def pd_dtypes() -> Generator:
{"f0": [1.0, 2.0, Null, 3.0], "f1": [3.0, 2.0, Null, 1.0]}, dtype=dtype
)
yield orig, df
ser_orig = orig["f0"]
ser = df["f0"]
assert isinstance(ser, pd.Series)
assert isinstance(ser_orig, pd.Series)
yield ser_orig, ser
# Categorical
orig = orig.astype("category")