ENH: Allow settable missing value in sklearn api.

Author: Skipper Seabold, 2015-05-14 14:08:03 -05:00
Commit: 0a0a80ec72 (parent 3a7808dc7d)


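In sklearn-style usage, the new keyword lets a sentinel value stand in for
missing entries. A minimal sketch, assuming the wrapper is importable as
xgboost and using hypothetical data:

    import numpy as np
    from xgboost import XGBClassifier

    # -999.0 marks missing cells in this made-up feature matrix.
    X = np.array([[1.0, -999.0], [0.5, 2.0], [-999.0, 3.0], [0.2, 1.0]])
    y = np.array([0, 1, 0, 1])

    # Cells equal to `missing` are treated as absent when the DMatrix is built.
    clf = XGBClassifier(missing=-999.0).fit(X, y)
    print(clf.predict(X))
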
@@ -93,6 +93,7 @@ def ctypes2numpy(cptr, length, dtype):
         raise RuntimeError('memmove failed')
     return res
 
+
 def ctypes2buffer(cptr, length):
     if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)):
         raise RuntimeError('expected char pointer')

@@ -102,6 +103,7 @@ def ctypes2buffer(cptr, length):
         raise RuntimeError('memmove failed')
     return res
 
+
 def c_str(string):
     return ctypes.c_char_p(string.encode('utf-8'))

@@ -850,10 +852,13 @@ class XGBModel(XGBModelBase):
         The initial prediction score of all instances, global bias.
     seed : int
         Random number seed.
+    missing : float, optional
+        Value in the data which is to be treated as missing. If None,
+        defaults to np.nan.
     """
     def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="reg:linear",
                  nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
-                 base_score=0.5, seed=0):
+                 base_score=0.5, seed=0, missing=None):
         if not SKLEARN_INSTALLED:
             raise XGBError('sklearn needs to be installed in order to use this module')
         self.max_depth = max_depth
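
Per the docstring above, missing=None is just a stand-in for np.nan. A small
sketch of the intended equivalence, assuming the wrapper layout shown here:

    import numpy as np
    from xgboost import XGBModel

    # None is normalized to np.nan in __init__, so these behave identically.
    assert np.isnan(XGBModel().missing)
    assert np.isnan(XGBModel(missing=np.nan).missing)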

@@ -871,6 +876,7 @@ class XGBModel(XGBModelBase):
         self.base_score = base_score
         self.seed = seed
+        self.missing = missing if missing is not None else np.nan
 
         self._Booster = None
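
The assignment uses an explicit None test rather than the shorter
"missing or np.nan": the truthiness version would silently turn a legitimate
missing=0.0 into np.nan, because 0.0 is falsy. A sketch of the difference:

    import numpy as np

    missing = 0.0                                      # 0 is a common sentinel
    print(missing or np.nan)                           # nan -- 0.0 is falsy
    print(missing if missing is not None else np.nan)  # 0.0, as intended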

@@ -902,6 +908,12 @@ class XGBModel(XGBModelBase):
             raise XGBError('need to call fit beforehand')
         return self._Booster
 
+    def get_params(self, deep=False):
+        params = super(XGBModel, self).get_params(deep=deep)
+        if params['missing'] is np.nan:
+            params['missing'] = None  # sklearn doesn't handle nan. see #4725
+        return params
+
     def get_xgb_params(self):
         xgb_params = self.get_params()
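
The round-trip through None in get_params() exists because nan never compares
equal to itself, which trips scikit-learn's parameter checks (see
scikit-learn issue #4725). A sketch of the failure mode being avoided:

    import numpy as np

    print(np.nan == np.nan)   # False -- why sklearn can't round-trip nan params
    print(np.nan is np.nan)   # True  -- why the identity test above works, as
                              # self.missing holds the np.nan singleton itself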

@@ -912,12 +924,12 @@ class XGBModel(XGBModelBase):
         return xgb_params
 
     def fit(self, X, y):
-        trainDmatrix = DMatrix(X, label=y)
+        trainDmatrix = DMatrix(X, label=y, missing=self.missing)
         self._Booster = train(self.get_xgb_params(), trainDmatrix, self.n_estimators)
         return self
 
     def predict(self, X):
-        testDmatrix = DMatrix(X)
+        testDmatrix = DMatrix(X, missing=self.missing)
         return self.booster().predict(testDmatrix)
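
Threading self.missing through fit() and predict() is equivalent to building
the DMatrix by hand. A sketch, assuming the DMatrix class defined earlier in
this wrapper:

    import numpy as np
    from xgboost import DMatrix

    X = np.array([[1.0, -999.0], [0.5, 2.0]])
    y = np.array([0.0, 1.0])

    # Entries equal to -999.0 are stored as missing rather than as features.
    dtrain = DMatrix(X, label=y, missing=-999.0)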

@@ -928,11 +940,11 @@ class XGBClassifier(XGBModel, XGBClassifier):
     def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="binary:logistic",
                  nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
-                 base_score=0.5, seed=0):
+                 base_score=0.5, seed=0, missing=None):
         super(XGBClassifier, self).__init__(max_depth, learning_rate, n_estimators, silent, objective,
                                             nthread, gamma, min_child_weight, max_delta_step, subsample,
                                             colsample_bytree,
-                                            base_score, seed)
+                                            base_score, seed, missing)
 
     def fit(self, X, y, sample_weight=None):
         y_values = list(np.unique(y))

@@ -949,16 +961,18 @@ class XGBClassifier(XGBModel, XGBClassifier):
         training_labels = self._le.transform(y)
 
         if sample_weight is not None:
-            trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight)
+            trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
+                                   missing=self.missing)
         else:
-            trainDmatrix = DMatrix(X, label=training_labels)
+            trainDmatrix = DMatrix(X, label=training_labels,
+                                   missing=self.missing)
 
         self._Booster = train(xgb_options, trainDmatrix, self.n_estimators)
 
         return self
 
     def predict(self, X):
-        testDmatrix = DMatrix(X)
+        testDmatrix = DMatrix(X, missing=self.missing)
         class_probs = self.booster().predict(testDmatrix)
         if len(class_probs.shape) > 1:
             column_indexes = np.argmax(class_probs, axis=1)

@@ -968,7 +982,7 @@ class XGBClassifier(XGBModel, XGBClassifier):
         return self._le.inverse_transform(column_indexes)
 
     def predict_proba(self, X):
-        testDmatrix = DMatrix(X)
+        testDmatrix = DMatrix(X, missing=self.missing)
         class_probs = self.booster().predict(testDmatrix)
         if self.objective == "multi:softprob":
             return class_probs
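
Taken together, the classifier honors the sentinel end to end, including
predict_proba(), and get_params() reports a plain float unchanged. A
hypothetical round trip:

    import numpy as np
    from xgboost import XGBClassifier

    X = np.array([[1.0, -999.0], [0.5, 2.0], [-999.0, 3.0], [0.2, 1.0]])
    y = np.array([0, 1, 0, 1])

    clf = XGBClassifier(missing=-999.0).fit(X, y)
    print(clf.predict_proba(X))         # per-class probabilities
    print(clf.get_params()['missing'])  # -999.0 -- only nan maps back to None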