diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 5bd43736a..cd028ba0b 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -406,10 +406,7 @@ def c_array(
 
 
 def _prediction_output(
-    shape: CNumericPtr,
-    dims: c_bst_ulong,
-    predts: CFloatPtr,
-    is_cuda: bool
+    shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool
 ) -> NumpyOrCupy:
     arr_shape = ctypes2numpy(shape, dims.value, np.uint64)
     length = int(np.prod(arr_shape))
@@ -1555,7 +1552,7 @@ class Booster:
                                              ctypes.byref(self.handle)))
             for d in cache:
                 # Validate feature only after the feature names are saved into booster.
-                self._validate_features(d)
+                self._validate_dmatrix_features(d)
 
         if isinstance(model_file, Booster):
             assert self.handle is not None
@@ -1914,7 +1911,7 @@ class Booster:
         """
         if not isinstance(dtrain, DMatrix):
             raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
-        self._validate_features(dtrain)
+        self._validate_dmatrix_features(dtrain)
 
         if fobj is None:
             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
@@ -1946,7 +1943,7 @@ class Booster:
             )
         if not isinstance(dtrain, DMatrix):
             raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
-        self._validate_features(dtrain)
+        self._validate_dmatrix_features(dtrain)
 
         _check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle,
                                                c_array(ctypes.c_float, grad),
@@ -1982,7 +1979,7 @@ class Booster:
                 raise TypeError(f"expected DMatrix, got {type(d[0]).__name__}")
             if not isinstance(d[1], str):
                 raise TypeError(f"expected string, got {type(d[1]).__name__}")
-            self._validate_features(d[0])
+            self._validate_dmatrix_features(d[0])
 
         dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals])
         evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
@@ -2033,7 +2030,7 @@ class Booster:
         result: str
             Evaluation result string.
         """
-        self._validate_features(data)
+        self._validate_dmatrix_features(data)
         return self.eval_set([(data, name)], iteration)
 
     # pylint: disable=too-many-function-args
@@ -2136,7 +2133,7 @@ class Booster:
         if not isinstance(data, DMatrix):
             raise TypeError('Expecting data to be a DMatrix object, got: ', type(data))
         if validate_features:
-            self._validate_features(data)
+            self._validate_dmatrix_features(data)
         iteration_range = _convert_ntree_limit(self, ntree_limit, iteration_range)
         args = {
             "type": 0,
@@ -2184,8 +2181,8 @@ class Booster:
         base_margin: Any = None,
         strict_shape: bool = False
     ) -> NumpyOrCupy:
-        """Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction does not
-        cache the prediction result.
+        """Run prediction in-place. Unlike the :py:meth:`predict` method, inplace
+        prediction does not cache the prediction result.
 
         Calling only ``inplace_predict`` in multiple threads is safe and lock free.
         But the safety does not hold when used in conjunction with other
@@ -2273,18 +2270,22 @@ class Booster:
             )
 
         from .data import (
-            _is_pandas_df,
-            _transform_pandas_df,
+            _array_interface,
             _is_cudf_df,
             _is_cupy_array,
-            _array_interface,
+            _is_pandas_df,
+            _transform_pandas_df,
         )
 
+        enable_categorical = _has_categorical(self, data)
         if _is_pandas_df(data):
-            data, _, _ = _transform_pandas_df(data, enable_categorical)
+            data, fns, _ = _transform_pandas_df(data, enable_categorical)
+            if validate_features:
+                self._validate_features(fns)
 
         if isinstance(data, np.ndarray):
             from .data import _ensure_np_dtype
+
             data, _ = _ensure_np_dtype(data, data.dtype)
             _check_call(
                 _LIB.XGBoosterPredictFromDense(
@@ -2334,10 +2335,13 @@ class Booster:
             return _prediction_output(shape, dims, preds, True)
         if _is_cudf_df(data):
             from .data import _cudf_array_interfaces, _transform_cudf_df
-            data, cat_codes, _, _ = _transform_cudf_df(
+
+            data, cat_codes, fns, _ = _transform_cudf_df(
                 data, None, None, enable_categorical
             )
             interfaces_str = _cudf_array_interfaces(data, cat_codes)
+            if validate_features:
+                self._validate_features(fns)
             _check_call(
                 _LIB.XGBoosterPredictFromCudaColumnar(
                     self.handle,
@@ -2723,40 +2727,55 @@ class Booster:
         # pylint: disable=no-member
         return df.sort(['Tree', 'Node']).reset_index(drop=True)
 
-    def _validate_features(self, data: DMatrix) -> None:
-        """
-        Validate Booster and data's feature_names are identical.
-        Set feature_names and feature_types from DMatrix
-        """
+    def _validate_dmatrix_features(self, data: DMatrix) -> None:
         if data.num_row() == 0:
             return
+        fn = data.feature_names
+        ft = data.feature_types
+        # Be consistent with versions before 1.7: "validate" actually modifies the
+        # booster.
         if self.feature_names is None:
-            self.feature_names = data.feature_names
-            self.feature_types = data.feature_types
-        if data.feature_names is None and self.feature_names is not None:
-            raise ValueError(
-                "training data did not have the following fields: " +
-                ", ".join(self.feature_names)
-            )
-        # Booster can't accept data with different feature names
-        if self.feature_names != data.feature_names:
-            dat_missing = set(cast(FeatureNames, self.feature_names)) - \
-                set(cast(FeatureNames, data.feature_names))
-            my_missing = set(cast(FeatureNames, data.feature_names)) - \
-                set(cast(FeatureNames, self.feature_names))
+            self.feature_names = fn
+        if self.feature_types is None:
+            self.feature_types = ft
 
-            msg = 'feature_names mismatch: {0} {1}'
+        self._validate_features(fn)
+
+    def _validate_features(self, feature_names: Optional[FeatureNames]) -> None:
+        if self.feature_names is None:
+            return
+
+        if feature_names is None and self.feature_names is not None:
+            raise ValueError(
+                "training data did not have the following fields: "
+                + ", ".join(self.feature_names)
+            )
+
+        if self.feature_names != feature_names:
+            dat_missing = set(cast(FeatureNames, self.feature_names)) - set(
+                cast(FeatureNames, feature_names)
+            )
+            my_missing = set(cast(FeatureNames, feature_names)) - set(
+                cast(FeatureNames, self.feature_names)
+            )
+
+            msg = "feature_names mismatch: {0} {1}"
             if dat_missing:
-                msg += ('\nexpected ' + ', '.join(
-                    str(s) for s in dat_missing) + ' in input data')
+                msg += (
+                    "\nexpected "
+                    + ", ".join(str(s) for s in dat_missing)
+                    + " in input data"
+                )
             if my_missing:
-                msg += ('\ntraining data did not have the following fields: ' +
-                        ', '.join(str(s) for s in my_missing))
+                msg += (
+                    "\ntraining data did not have the following fields: "
+                    + ", ".join(str(s) for s in my_missing)
+                )
 
-            raise ValueError(msg.format(self.feature_names, data.feature_names))
+            raise ValueError(msg.format(self.feature_names, feature_names))
 
     def get_split_value_histogram(
         self,
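Reviewer note: the net effect of the core.py change is that inplace prediction now compares a DataFrame's column names against the booster's stored `feature_names` when `validate_features=True`, instead of skipping the check as before. Below is a minimal sketch of the behavior this patch should produce, not part of the diff; it assumes the scikit-learn wrapper routes pandas input through `inplace_predict` with `validate_features=True`, which is what the new test exercises:

```python
import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.randn(32, 3), columns=["a", "b", "c"])
y = rng.randint(0, 2, size=32)

clf = xgb.XGBClassifier(n_estimators=2).fit(X, y)

# Before this patch, inplace prediction silently accepted a DataFrame with
# reordered columns; with the patch, the column names are validated against
# the booster's stored feature_names and a mismatch raises ValueError.
try:
    clf.predict(X[["c", "a", "b"]])
except ValueError as err:
    print(err)  # e.g. feature_names mismatch: ['a', 'b', 'c'] ['c', 'a', 'b']
```

Note that reordered columns count as a mismatch even though both sets of names are identical: `_validate_features` compares the name lists with ordered equality, so `dat_missing` and `my_missing` are both empty and the bare "feature_names mismatch" message is raised.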
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index a5f5ddb6f..17114d2dd 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -2,6 +2,7 @@ import collections
 import importlib.util
 import json
 import os
+import random
 import tempfile
 from typing import Callable, Optional
 
@@ -998,34 +999,40 @@ def test_deprecate_position_arg():
 def test_pandas_input():
     import pandas as pd
     from sklearn.calibration import CalibratedClassifierCV
+
     rng = np.random.RandomState(1994)
 
     kRows = 100
     kCols = 6
 
-    X = rng.randint(low=0, high=2, size=kRows*kCols)
+    X = rng.randint(low=0, high=2, size=kRows * kCols)
     X = X.reshape(kRows, kCols)
     df = pd.DataFrame(X)
     feature_names = []
     for i in range(1, kCols):
-        feature_names += ['k'+str(i)]
+        feature_names += ["k" + str(i)]
 
-    df.columns = ['status'] + feature_names
+    df.columns = ["status"] + feature_names
 
-    target = df['status']
-    train = df.drop(columns=['status'])
+    target = df["status"]
+    train = df.drop(columns=["status"])
 
     model = xgb.XGBClassifier()
     model.fit(train, target)
     np.testing.assert_equal(model.feature_names_in_, np.array(feature_names))
 
-    clf_isotonic = CalibratedClassifierCV(model,
-                                          cv='prefit', method='isotonic')
+    columns = list(train.columns)
+    random.shuffle(columns, lambda: 0.1)
+    df_incorrect = df[columns]
+    with pytest.raises(ValueError):
+        model.predict(df_incorrect)
+
+    clf_isotonic = CalibratedClassifierCV(model, cv="prefit", method="isotonic")
     clf_isotonic.fit(train, target)
-    assert isinstance(clf_isotonic.calibrated_classifiers_[0].base_estimator,
-                      xgb.XGBClassifier)
-    np.testing.assert_allclose(np.array(clf_isotonic.classes_),
-                               np.array([0, 1]))
+    assert isinstance(
+        clf_isotonic.calibrated_classifiers_[0].base_estimator, xgb.XGBClassifier
+    )
+    np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))
 
 
 def run_feature_weights(X, y, fw, tree_method, model=xgb.XGBRegressor):
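Reviewer note on the test: the two-argument form `random.shuffle(columns, lambda: 0.1)` relies on the `random` parameter of `random.shuffle`, which is deprecated since Python 3.9 and removed in Python 3.11, so this line will break on newer interpreters. A seeded generator keeps the shuffle deterministic without the deprecated argument; a suggested alternative, not part of the diff, assuming `train` from the test above:

```python
import random

columns = list(train.columns)
# Deterministic shuffle via a seeded Random instance rather than the
# deprecated `random=` callback argument.
random.Random(1994).shuffle(columns)
# Guard against the (unlikely) case where the shuffle returns the
# original order, which would make the mismatch test vacuous.
assert columns != list(train.columns)
```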