Add scikit-learn tests (#3674)

* Add scikit-learn tests

Goal is to pass scikit-learn's check_estimator() for XGBClassifier,
XGBRegressor, and XGBRanker. It is actually not possible to do so
entirely, since check_estimator() assumes that NaN is disallowed,
but XGBoost allows for NaN as missing values. However, it is always
a good idea to add some checks inspired by check_estimator().

* Fix lint

* Fix lint
This commit is contained in:
Philip Hyunsu Cho 2018-09-06 09:55:28 -07:00 committed by GitHub
parent 190d888695
commit d176a0fbc8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 34 additions and 4 deletions

View File

@ -9,7 +9,6 @@ import ctypes
import os import os
import re import re
import sys import sys
import numpy as np import numpy as np
import scipy.sparse import scipy.sparse
@ -374,11 +373,15 @@ class DMatrix(object):
if label is not None: if label is not None:
if isinstance(label, np.ndarray): if isinstance(label, np.ndarray):
self.set_label_npy2d(label) self.set_label_npy2d(label)
elif getattr(label, '__array__', None) is not None:
self.set_label_npy2d(label.__array__())
else: else:
self.set_label(label) self.set_label(label)
if weight is not None: if weight is not None:
if isinstance(weight, np.ndarray): if isinstance(weight, np.ndarray):
self.set_weight_npy2d(weight) self.set_weight_npy2d(weight)
elif getattr(weight, '__array__', None) is not None:
self.set_weight_npy2d(weight.__array__())
else: else:
self.set_weight(weight) self.set_weight(weight)
@ -428,7 +431,7 @@ class DMatrix(object):
and type if memory use is a concern. and type if memory use is a concern.
""" """
if len(mat.shape) != 2: if len(mat.shape) != 2:
raise ValueError('Input numpy.ndarray must be 2 dimensional') raise ValueError('Input numpy.ndarray must be 2 dimensional. Reshape your data.')
# flatten the array by rows and ensure it is float32. # flatten the array by rows and ensure it is float32.
# we try to avoid data copies if possible (reshape returns a view when possible # we try to avoid data copies if possible (reshape returns a view when possible
# and we explicitly tell np.array to try and avoid copying) # and we explicitly tell np.array to try and avoid copying)

View File

@ -1,10 +1,12 @@
# coding: utf-8 # coding: utf-8
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, E0012, R0912 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, E0012, R0912, C0302
"""Scikit-Learn Wrapper interface for XGBoost.""" """Scikit-Learn Wrapper interface for XGBoost."""
from __future__ import absolute_import from __future__ import absolute_import
import numpy as np import numpy as np
import warnings import warnings
from sklearn.exceptions import NotFittedError
from sklearn.exceptions import DataConversionWarning
from .core import Booster, DMatrix, XGBoostError from .core import Booster, DMatrix, XGBoostError
from .training import train from .training import train
@ -14,6 +16,16 @@ from .compat import (SKLEARN_INSTALLED, XGBModelBase,
XGBClassifierBase, XGBRegressorBase, XGBLabelEncoder) XGBClassifierBase, XGBRegressorBase, XGBLabelEncoder)
def _check_label_1d(label):
    """Emit a ``DataConversionWarning`` if ``label`` is a column vector.

    scikit-learn's ``check_estimator`` expects estimators to warn when
    ``y`` arrives shaped ``(n_samples, 1)`` instead of ``(n_samples,)``.

    Parameters
    ----------
    label : array-like
        Label array supplied by the caller (the ``y`` argument of ``fit``).

    Notes
    -----
    Only the shape is inspected, so no dtype is forced: casting to
    ``float32`` here would raise ``ValueError`` for string class labels,
    which ``XGBClassifier.fit`` accepts (encoding happens later).
    ``np.asarray`` also avoids ``np.array(..., copy=False)``, whose
    never-copy semantics are an error when a copy is required (NumPy 2.0).
    """
    label = np.asarray(label)
    if len(label.shape) == 2 and label.shape[1] == 1:
        warnings.warn('A column-vector y was passed when a 1d array was'
                      ' expected. Please change the shape of y to '
                      '(n_samples, ), for example using ravel().',
                      DataConversionWarning, stacklevel=2)
def _objective_decorator(func): def _objective_decorator(func):
"""Decorate an objective function """Decorate an objective function
@ -178,7 +190,7 @@ class XGBModel(XGBModelBase):
booster : a xgboost booster of underlying model booster : a xgboost booster of underlying model
""" """
if self._Booster is None: if self._Booster is None:
raise XGBoostError('need to call fit or load_model beforehand') raise NotFittedError('need to call fit or load_model beforehand')
return self._Booster return self._Booster
def get_params(self, deep=False): def get_params(self, deep=False):
@ -286,6 +298,7 @@ class XGBModel(XGBModelBase):
file name of stored xgb model or 'Booster' instance Xgb model to be file name of stored xgb model or 'Booster' instance Xgb model to be
loaded before training (allows training continuation). loaded before training (allows training continuation).
""" """
_check_label_1d(label=y)
if sample_weight is not None: if sample_weight is not None:
trainDmatrix = DMatrix(X, label=y, weight=sample_weight, trainDmatrix = DMatrix(X, label=y, weight=sample_weight,
missing=self.missing, nthread=self.n_jobs) missing=self.missing, nthread=self.n_jobs)
@ -536,6 +549,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
file name of stored xgb model or 'Booster' instance Xgb model to be file name of stored xgb model or 'Booster' instance Xgb model to be
loaded before training (allows training continuation). loaded before training (allows training continuation).
""" """
_check_label_1d(label=y)
evals_result = {} evals_result = {}
self.classes_ = np.unique(y) self.classes_ = np.unique(y)
self.n_classes_ = len(self.classes_) self.n_classes_ = len(self.classes_)
@ -912,6 +926,7 @@ class XGBRanker(XGBModel):
file name of stored xgb model or 'Booster' instance Xgb model to be file name of stored xgb model or 'Booster' instance Xgb model to be
loaded before training (allows training continuation). loaded before training (allows training continuation).
""" """
_check_label_1d(label=y)
# check if group information is provided # check if group information is provided
if group is None: if group is None:
raise ValueError("group is required for ranking task") raise ValueError("group is required for ranking task")

View File

@ -203,6 +203,18 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
DeprecationWarning) DeprecationWarning)
callbacks.append(callback.reset_learning_rate(learning_rates)) callbacks.append(callback.reset_learning_rate(learning_rates))
nrow = dtrain.num_row()
ncol = dtrain.num_col()
if nrow <= 0:
raise ValueError('{} row(s) (shape=({}, {})) while a minimum of 1 is required.'
.format(nrow, nrow, ncol))
if ncol <= 0:
raise ValueError('{} feature(s) (shape=({}, {})) while a minimum of 1 is required.'
.format(ncol, nrow, ncol))
label = dtrain.get_label()
if nrow != len(label):
raise ValueError('Label must have same length as the number of data rows')
return _train_internal(params, dtrain, return _train_internal(params, dtrain,
num_boost_round=num_boost_round, num_boost_round=num_boost_round,
evals=evals, evals=evals,