Validate features for inplace predict. (#8359)
This commit is contained in:
parent
52977f0cdf
commit
c884b9e888
@ -406,10 +406,7 @@ def c_array(
|
||||
|
||||
|
||||
def _prediction_output(
|
||||
shape: CNumericPtr,
|
||||
dims: c_bst_ulong,
|
||||
predts: CFloatPtr,
|
||||
is_cuda: bool
|
||||
shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool
|
||||
) -> NumpyOrCupy:
|
||||
arr_shape = ctypes2numpy(shape, dims.value, np.uint64)
|
||||
length = int(np.prod(arr_shape))
|
||||
@ -1555,7 +1552,7 @@ class Booster:
|
||||
ctypes.byref(self.handle)))
|
||||
for d in cache:
|
||||
# Validate feature only after the feature names are saved into booster.
|
||||
self._validate_features(d)
|
||||
self._validate_dmatrix_features(d)
|
||||
|
||||
if isinstance(model_file, Booster):
|
||||
assert self.handle is not None
|
||||
@ -1914,7 +1911,7 @@ class Booster:
|
||||
"""
|
||||
if not isinstance(dtrain, DMatrix):
|
||||
raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
|
||||
self._validate_features(dtrain)
|
||||
self._validate_dmatrix_features(dtrain)
|
||||
|
||||
if fobj is None:
|
||||
_check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
|
||||
@ -1946,7 +1943,7 @@ class Booster:
|
||||
)
|
||||
if not isinstance(dtrain, DMatrix):
|
||||
raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
|
||||
self._validate_features(dtrain)
|
||||
self._validate_dmatrix_features(dtrain)
|
||||
|
||||
_check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle,
|
||||
c_array(ctypes.c_float, grad),
|
||||
@ -1982,7 +1979,7 @@ class Booster:
|
||||
raise TypeError(f"expected DMatrix, got {type(d[0]).__name__}")
|
||||
if not isinstance(d[1], str):
|
||||
raise TypeError(f"expected string, got {type(d[1]).__name__}")
|
||||
self._validate_features(d[0])
|
||||
self._validate_dmatrix_features(d[0])
|
||||
|
||||
dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals])
|
||||
evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
|
||||
@ -2033,7 +2030,7 @@ class Booster:
|
||||
result: str
|
||||
Evaluation result string.
|
||||
"""
|
||||
self._validate_features(data)
|
||||
self._validate_dmatrix_features(data)
|
||||
return self.eval_set([(data, name)], iteration)
|
||||
|
||||
# pylint: disable=too-many-function-args
|
||||
@ -2136,7 +2133,7 @@ class Booster:
|
||||
if not isinstance(data, DMatrix):
|
||||
raise TypeError('Expecting data to be a DMatrix object, got: ', type(data))
|
||||
if validate_features:
|
||||
self._validate_features(data)
|
||||
self._validate_dmatrix_features(data)
|
||||
iteration_range = _convert_ntree_limit(self, ntree_limit, iteration_range)
|
||||
args = {
|
||||
"type": 0,
|
||||
@ -2184,8 +2181,8 @@ class Booster:
|
||||
base_margin: Any = None,
|
||||
strict_shape: bool = False
|
||||
) -> NumpyOrCupy:
|
||||
"""Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction does not
|
||||
cache the prediction result.
|
||||
"""Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction
|
||||
does not cache the prediction result.
|
||||
|
||||
Calling only ``inplace_predict`` in multiple threads is safe and lock
|
||||
free. But the safety does not hold when used in conjunction with other
|
||||
@ -2273,18 +2270,22 @@ class Booster:
|
||||
)
|
||||
|
||||
from .data import (
|
||||
_is_pandas_df,
|
||||
_transform_pandas_df,
|
||||
_array_interface,
|
||||
_is_cudf_df,
|
||||
_is_cupy_array,
|
||||
_array_interface,
|
||||
_is_pandas_df,
|
||||
_transform_pandas_df,
|
||||
)
|
||||
|
||||
enable_categorical = _has_categorical(self, data)
|
||||
if _is_pandas_df(data):
|
||||
data, _, _ = _transform_pandas_df(data, enable_categorical)
|
||||
data, fns, _ = _transform_pandas_df(data, enable_categorical)
|
||||
if validate_features:
|
||||
self._validate_features(fns)
|
||||
|
||||
if isinstance(data, np.ndarray):
|
||||
from .data import _ensure_np_dtype
|
||||
|
||||
data, _ = _ensure_np_dtype(data, data.dtype)
|
||||
_check_call(
|
||||
_LIB.XGBoosterPredictFromDense(
|
||||
@ -2334,10 +2335,13 @@ class Booster:
|
||||
return _prediction_output(shape, dims, preds, True)
|
||||
if _is_cudf_df(data):
|
||||
from .data import _cudf_array_interfaces, _transform_cudf_df
|
||||
data, cat_codes, _, _ = _transform_cudf_df(
|
||||
|
||||
data, cat_codes, fns, _ = _transform_cudf_df(
|
||||
data, None, None, enable_categorical
|
||||
)
|
||||
interfaces_str = _cudf_array_interfaces(data, cat_codes)
|
||||
if validate_features:
|
||||
self._validate_features(fns)
|
||||
_check_call(
|
||||
_LIB.XGBoosterPredictFromCudaColumnar(
|
||||
self.handle,
|
||||
@ -2723,40 +2727,55 @@ class Booster:
|
||||
# pylint: disable=no-member
|
||||
return df.sort(['Tree', 'Node']).reset_index(drop=True)
|
||||
|
||||
def _validate_features(self, data: DMatrix) -> None:
|
||||
"""
|
||||
Validate Booster and data's feature_names are identical.
|
||||
Set feature_names and feature_types from DMatrix
|
||||
"""
|
||||
def _validate_dmatrix_features(self, data: DMatrix) -> None:
|
||||
if data.num_row() == 0:
|
||||
return
|
||||
|
||||
fn = data.feature_names
|
||||
ft = data.feature_types
|
||||
# Be consistent with versions before 1.7, "validate" actually modifies the
|
||||
# booster.
|
||||
if self.feature_names is None:
|
||||
self.feature_names = data.feature_names
|
||||
self.feature_types = data.feature_types
|
||||
if data.feature_names is None and self.feature_names is not None:
|
||||
raise ValueError(
|
||||
"training data did not have the following fields: " +
|
||||
", ".join(self.feature_names)
|
||||
)
|
||||
# Booster can't accept data with different feature names
|
||||
if self.feature_names != data.feature_names:
|
||||
dat_missing = set(cast(FeatureNames, self.feature_names)) - \
|
||||
set(cast(FeatureNames, data.feature_names))
|
||||
my_missing = set(cast(FeatureNames, data.feature_names)) - \
|
||||
set(cast(FeatureNames, self.feature_names))
|
||||
self.feature_names = fn
|
||||
if self.feature_types is None:
|
||||
self.feature_types = ft
|
||||
|
||||
msg = 'feature_names mismatch: {0} {1}'
|
||||
self._validate_features(fn)
|
||||
|
||||
def _validate_features(self, feature_names: Optional[FeatureNames]) -> None:
|
||||
if self.feature_names is None:
|
||||
return
|
||||
|
||||
if feature_names is None and self.feature_names is not None:
|
||||
raise ValueError(
|
||||
"training data did not have the following fields: "
|
||||
+ ", ".join(self.feature_names)
|
||||
)
|
||||
|
||||
if self.feature_names != feature_names:
|
||||
dat_missing = set(cast(FeatureNames, self.feature_names)) - set(
|
||||
cast(FeatureNames, feature_names)
|
||||
)
|
||||
my_missing = set(cast(FeatureNames, feature_names)) - set(
|
||||
cast(FeatureNames, self.feature_names)
|
||||
)
|
||||
|
||||
msg = "feature_names mismatch: {0} {1}"
|
||||
|
||||
if dat_missing:
|
||||
msg += ('\nexpected ' + ', '.join(
|
||||
str(s) for s in dat_missing) + ' in input data')
|
||||
msg += (
|
||||
"\nexpected "
|
||||
+ ", ".join(str(s) for s in dat_missing)
|
||||
+ " in input data"
|
||||
)
|
||||
|
||||
if my_missing:
|
||||
msg += ('\ntraining data did not have the following fields: ' +
|
||||
', '.join(str(s) for s in my_missing))
|
||||
msg += (
|
||||
"\ntraining data did not have the following fields: "
|
||||
+ ", ".join(str(s) for s in my_missing)
|
||||
)
|
||||
|
||||
raise ValueError(msg.format(self.feature_names, data.feature_names))
|
||||
raise ValueError(msg.format(self.feature_names, feature_names))
|
||||
|
||||
def get_split_value_histogram(
|
||||
self,
|
||||
|
||||
@ -2,6 +2,7 @@ import collections
|
||||
import importlib.util
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import tempfile
|
||||
from typing import Callable, Optional
|
||||
|
||||
@ -998,34 +999,40 @@ def test_deprecate_position_arg():
|
||||
def test_pandas_input():
|
||||
import pandas as pd
|
||||
from sklearn.calibration import CalibratedClassifierCV
|
||||
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
kRows = 100
|
||||
kCols = 6
|
||||
|
||||
X = rng.randint(low=0, high=2, size=kRows*kCols)
|
||||
X = rng.randint(low=0, high=2, size=kRows * kCols)
|
||||
X = X.reshape(kRows, kCols)
|
||||
|
||||
df = pd.DataFrame(X)
|
||||
feature_names = []
|
||||
for i in range(1, kCols):
|
||||
feature_names += ['k'+str(i)]
|
||||
feature_names += ["k" + str(i)]
|
||||
|
||||
df.columns = ['status'] + feature_names
|
||||
df.columns = ["status"] + feature_names
|
||||
|
||||
target = df['status']
|
||||
train = df.drop(columns=['status'])
|
||||
target = df["status"]
|
||||
train = df.drop(columns=["status"])
|
||||
model = xgb.XGBClassifier()
|
||||
model.fit(train, target)
|
||||
np.testing.assert_equal(model.feature_names_in_, np.array(feature_names))
|
||||
|
||||
clf_isotonic = CalibratedClassifierCV(model,
|
||||
cv='prefit', method='isotonic')
|
||||
columns = list(train.columns)
|
||||
random.shuffle(columns, lambda: 0.1)
|
||||
df_incorrect = df[columns]
|
||||
with pytest.raises(ValueError):
|
||||
model.predict(df_incorrect)
|
||||
|
||||
clf_isotonic = CalibratedClassifierCV(model, cv="prefit", method="isotonic")
|
||||
clf_isotonic.fit(train, target)
|
||||
assert isinstance(clf_isotonic.calibrated_classifiers_[0].base_estimator,
|
||||
xgb.XGBClassifier)
|
||||
np.testing.assert_allclose(np.array(clf_isotonic.classes_),
|
||||
np.array([0, 1]))
|
||||
assert isinstance(
|
||||
clf_isotonic.calibrated_classifiers_[0].base_estimator, xgb.XGBClassifier
|
||||
)
|
||||
np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))
|
||||
|
||||
|
||||
def run_feature_weights(X, y, fw, tree_method, model=xgb.XGBRegressor):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user