Pass scikit-learn estimator checks for regressor. (#7130)
* Check data shape.
* Check labels.
This commit is contained in:
@@ -584,8 +584,6 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
||||
`gpu_predictor` and pandas input are required.
|
||||
|
||||
"""
|
||||
if isinstance(data, list):
|
||||
raise TypeError("Input data can not be a list.")
|
||||
if group is not None and qid is not None:
|
||||
raise ValueError("Either one of `group` or `qid` should be None.")
|
||||
|
||||
@@ -2005,6 +2003,10 @@ class Booster(object):
|
||||
p_handle = ctypes.c_void_p()
|
||||
assert proxy is None or isinstance(proxy, _ProxyDMatrix)
|
||||
if validate_features:
|
||||
if not hasattr(data, "shape"):
|
||||
raise TypeError(
|
||||
"`shape` attribute is required when `validate_features` is True."
|
||||
)
|
||||
if len(data.shape) != 1 and self.num_features() != data.shape[1]:
|
||||
raise ValueError(
|
||||
f"Feature shape mismatch, expected: {self.num_features()}, "
|
||||
|
||||
@@ -32,6 +32,11 @@ def _check_complex(data):
|
||||
raise ValueError('Complex data not supported')
|
||||
|
||||
|
||||
def _check_data_shape(data: Any) -> None:
|
||||
if hasattr(data, "shape") and len(data.shape) != 2:
|
||||
raise ValueError("Please reshape the input data into 2-dimensional matrix.")
|
||||
|
||||
|
||||
def _is_scipy_csr(data):
|
||||
try:
|
||||
import scipy
|
||||
@@ -524,16 +529,18 @@ def _is_list(data):
|
||||
return isinstance(data, list)
|
||||
|
||||
|
||||
def _from_list(data, missing, feature_names, feature_types):
|
||||
raise TypeError('List input data is not supported for data')
|
||||
def _from_list(data, missing, n_threads, feature_names, feature_types):
|
||||
array = np.array(data)
|
||||
_check_data_shape(data)
|
||||
return _from_numpy_array(array, missing, n_threads, feature_names, feature_types)
|
||||
|
||||
|
||||
def _is_tuple(data):
|
||||
return isinstance(data, tuple)
|
||||
|
||||
|
||||
def _from_tuple(data, missing, feature_names, feature_types):
|
||||
return _from_list(data, missing, feature_names, feature_types)
|
||||
def _from_tuple(data, missing, n_threads, feature_names, feature_types):
|
||||
return _from_list(data, missing, n_threads, feature_names, feature_types)
|
||||
|
||||
|
||||
def _is_iter(data):
|
||||
@@ -566,6 +573,8 @@ def dispatch_data_backend(data, missing, threads,
|
||||
feature_names, feature_types,
|
||||
enable_categorical=False):
|
||||
'''Dispatch data for DMatrix.'''
|
||||
if not _is_cudf_ser(data) and not _is_pandas_series(data):
|
||||
_check_data_shape(data)
|
||||
if _is_scipy_csr(data):
|
||||
return _from_scipy_csr(data, missing, threads, feature_names, feature_types)
|
||||
if _is_scipy_csc(data):
|
||||
@@ -578,9 +587,9 @@ def dispatch_data_backend(data, missing, threads,
|
||||
if _is_uri(data):
|
||||
return _from_uri(data, missing, feature_names, feature_types)
|
||||
if _is_list(data):
|
||||
return _from_list(data, missing, feature_names, feature_types)
|
||||
return _from_list(data, missing, threads, feature_names, feature_types)
|
||||
if _is_tuple(data):
|
||||
return _from_tuple(data, missing, feature_names, feature_types)
|
||||
return _from_tuple(data, missing, threads, feature_names, feature_types)
|
||||
if _is_pandas_df(data):
|
||||
return _from_pandas_df(data, enable_categorical, missing, threads,
|
||||
feature_names, feature_types)
|
||||
@@ -612,11 +621,12 @@ def dispatch_data_backend(data, missing, threads,
|
||||
return _from_pandas_series(data, missing, threads, feature_names,
|
||||
feature_types)
|
||||
if _has_array_protocol(data):
|
||||
pass
|
||||
array = np.asarray(data)
|
||||
return _from_numpy_array(array, missing, threads, feature_names, feature_types)
|
||||
|
||||
converted = _convert_unknown_data(data)
|
||||
if converted:
|
||||
return _from_scipy_csr(data, missing, threads, feature_names, feature_types)
|
||||
if converted is not None:
|
||||
return _from_scipy_csr(converted, missing, threads, feature_names, feature_types)
|
||||
|
||||
raise TypeError('Not supported type for data.' + str(type(data)))
|
||||
|
||||
@@ -630,11 +640,12 @@ def _to_data_type(dtype: str, name: str):
|
||||
return dtype_map[dtype]
|
||||
|
||||
|
||||
def _validate_meta_shape(data):
|
||||
if hasattr(data, 'shape'):
|
||||
assert len(data.shape) == 1 or (
|
||||
len(data.shape) == 2 and
|
||||
(data.shape[1] == 0 or data.shape[1] == 1))
|
||||
def _validate_meta_shape(data, name: str) -> None:
|
||||
if hasattr(data, "shape"):
|
||||
if len(data.shape) > 2 or (
|
||||
len(data.shape) == 2 and (data.shape[1] != 0 and data.shape[1] != 1)
|
||||
):
|
||||
raise ValueError(f"Invalid shape: {data.shape} for {name}")
|
||||
|
||||
|
||||
def _meta_from_numpy(data, field, dtype, handle):
|
||||
@@ -702,7 +713,7 @@ def _meta_from_dt(data, field, dtype, handle):
|
||||
def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
|
||||
'''Dispatch for meta info.'''
|
||||
handle = matrix.handle
|
||||
_validate_meta_shape(data)
|
||||
_validate_meta_shape(data, name)
|
||||
if data is None:
|
||||
return
|
||||
if _is_list(data):
|
||||
@@ -751,7 +762,9 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
|
||||
_meta_from_numpy(data, name, dtype, handle)
|
||||
return
|
||||
if _has_array_protocol(data):
|
||||
pass
|
||||
array = np.asarray(data)
|
||||
_meta_from_numpy(array, name, dtype, handle)
|
||||
return
|
||||
raise TypeError('Unsupported type for ' + name, str(type(data)))
|
||||
|
||||
|
||||
@@ -802,6 +815,8 @@ def _proxy_transform(data, feature_names, feature_types, enable_categorical):
|
||||
|
||||
def dispatch_proxy_set_data(proxy: _ProxyDMatrix, data: Any, allow_host: bool) -> None:
|
||||
"""Dispatch for DeviceQuantileDMatrix."""
|
||||
if not _is_cudf_ser(data) and not _is_pandas_series(data):
|
||||
_check_data_shape(data)
|
||||
if _is_cudf_df(data):
|
||||
proxy._set_data_from_cuda_columnar(data) # pylint: disable=W0212
|
||||
return
|
||||
|
||||
@@ -419,7 +419,6 @@ class XGBModel(XGBModelBase):
|
||||
self.base_score = base_score
|
||||
self.missing = missing
|
||||
self.num_parallel_tree = num_parallel_tree
|
||||
self.kwargs = kwargs
|
||||
self.random_state = random_state
|
||||
self.n_jobs = n_jobs
|
||||
self.monotone_constraints = monotone_constraints
|
||||
@@ -429,6 +428,8 @@ class XGBModel(XGBModelBase):
|
||||
self.validate_parameters = validate_parameters
|
||||
self.predictor = predictor
|
||||
self.enable_categorical = enable_categorical
|
||||
if kwargs:
|
||||
self.kwargs = kwargs
|
||||
|
||||
def _more_tags(self) -> Dict[str, bool]:
|
||||
'''Tags used for scikit-learn data validation.'''
|
||||
@@ -469,6 +470,8 @@ class XGBModel(XGBModelBase):
|
||||
if hasattr(self, key):
|
||||
setattr(self, key, value)
|
||||
else:
|
||||
if not hasattr(self, "kwargs"):
|
||||
self.kwargs = {}
|
||||
self.kwargs[key] = value
|
||||
|
||||
if hasattr(self, '_Booster'):
|
||||
@@ -491,7 +494,7 @@ class XGBModel(XGBModelBase):
|
||||
cp.__class__ = cp.__class__.__bases__[0]
|
||||
params.update(cp.__class__.get_params(cp, deep))
|
||||
# if kwargs is a dict, update params accordingly
|
||||
if isinstance(self.kwargs, dict):
|
||||
if hasattr(self, "kwargs") and isinstance(self.kwargs, dict):
|
||||
params.update(self.kwargs)
|
||||
if isinstance(params['random_state'], np.random.RandomState):
|
||||
params['random_state'] = params['random_state'].randint(
|
||||
@@ -745,7 +748,6 @@ class XGBModel(XGBModelBase):
|
||||
|
||||
"""
|
||||
evals_result: TrainingCallback.EvalsLog = {}
|
||||
|
||||
train_dmatrix, evals = _wrap_evaluation_matrices(
|
||||
missing=self.missing,
|
||||
X=X,
|
||||
@@ -1169,7 +1171,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
|
||||
):
|
||||
raise ValueError(label_encoding_check_error)
|
||||
else:
|
||||
self.classes_ = np.unique(y)
|
||||
self.classes_ = np.unique(np.asarray(y))
|
||||
self.n_classes_ = len(self.classes_)
|
||||
if not self.use_label_encoder and (
|
||||
not np.array_equal(self.classes_, np.arange(self.n_classes_))
|
||||
@@ -1206,11 +1208,6 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
|
||||
label_transform = lambda x: x
|
||||
|
||||
model, feval, params = self._configure_fit(xgb_model, eval_metric, params)
|
||||
if len(X.shape) != 2:
|
||||
# Simply raise an error here since there might be many
|
||||
# different ways of reshaping
|
||||
raise ValueError("Please reshape the input data X into 2-dimensional matrix.")
|
||||
|
||||
train_dmatrix, evals = _wrap_evaluation_matrices(
|
||||
missing=self.missing,
|
||||
X=X,
|
||||
|
||||
Reference in New Issue
Block a user