Fix some stuff

parent 136e902fb2
commit a1a427af37
@@ -11,7 +11,7 @@ import xgboost as xgb
 import numpy as np
 from sklearn.cross_validation import KFold
 from sklearn.grid_search import GridSearchCV
-from sklearn.metrics import confusion_matrix
+from sklearn.metrics import confusion_matrix, mean_squared_error
 from sklearn.datasets import load_iris, load_digits, load_boston
 
 rng = np.random.RandomState(31337)
@@ -39,4 +39,26 @@ for train_index, test_index in kf:
     actuals = y[test_index]
     print(confusion_matrix(actuals, predictions))
 
+print("Boston Housing: regression")
+boston = load_boston()
+y = boston['target']
+X = boston['data']
+kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
+for train_index, test_index in kf:
+    xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
+    predictions = xgb_model.predict(X[test_index])
+    actuals = y[test_index]
+    print(mean_squared_error(actuals, predictions))
+
+print("Parameter optimization")
+y = boston['target']
+X = boston['data']
+xgb_model = xgb.XGBRegressor()
+clf = GridSearchCV(xgb_model,
+                   {'max_depth': [2,4,6],
+                    'n_estimators': [50,100,200]}, verbose=1)
+clf.fit(X,y)
+print(clf.best_score_)
+print(clf.best_params_)
 
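
The block added above extends the demo: a 2-fold cross-validated XGBRegressor on Boston Housing scored with mean_squared_error, followed by a GridSearchCV sweep over max_depth and n_estimators. The sklearn.cross_validation and sklearn.grid_search paths are the pre-0.18 scikit-learn layout; against a later scikit-learn the same demo would target sklearn.model_selection, roughly as in this sketch (an orientation aid only, not part of the commit; load_boston itself was removed in scikit-learn 1.2):

    # Rough modern-API equivalent of the demo added above (scikit-learn
    # >= 0.18, < 1.2). Untested sketch, not part of this commit.
    import xgboost as xgb
    import numpy as np
    from sklearn.model_selection import KFold, GridSearchCV
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston

    rng = np.random.RandomState(31337)
    boston = load_boston()
    X, y = boston['data'], boston['target']

    # KFold now takes n_splits and is split on the data, not on y.shape[0]
    for train_index, test_index in KFold(n_splits=2, shuffle=True, random_state=rng).split(X):
        xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
        print(mean_squared_error(y[test_index], xgb_model.predict(X[test_index])))

    clf = GridSearchCV(xgb.XGBRegressor(),
                       {'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200]},
                       verbose=1)
    clf.fit(X, y)
    print(clf.best_score_, clf.best_params_)

The remaining hunks move from the demo script to the scikit-learn wrapper module itself.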
@@ -16,6 +16,7 @@ import scipy.sparse
 
 try:
     from sklearn.base import BaseEstimator
+    from sklearn.base import RegressorMixin, ClassifierMixin
     from sklearn.preprocessing import LabelEncoder
     SKLEARN_INSTALLED = True
 except ImportError:
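
The two mixins are imported inside the same try/except guard that sets SKLEARN_INSTALLED, so the module still imports cleanly when scikit-learn is absent. A minimal sketch of that guard pattern; the except body here is an assumed completion, since only the try side appears in this hunk:

    # Optional-dependency guard: record that scikit-learn is missing rather
    # than failing at import time. The fallback below is illustrative, not
    # this file's exact except branch.
    try:
        from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin
        from sklearn.preprocessing import LabelEncoder
        SKLEARN_INSTALLED = True
    except ImportError:
        SKLEARN_INSTALLED = False  # wrapper classes can check this and raise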
@@ -716,40 +717,32 @@ class XGBModel(BaseEstimator):
         self._Booster = train(self.get_xgb_params(), trainDmatrix, self.n_rounds)
         return self
 
-class XGBClassifier(XGBModel):
+    def predict(self, X):
+        testDmatrix = DMatrix(X)
+        return self._Booster.predict(testDmatrix)
+
+class XGBClassifier(XGBModel, ClassifierMixin):
     def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True):
         super().__init__(max_depth, learning_rate, n_estimators, silent, objective="binary:logistic")
 
     def fit(self, X, y, sample_weight=None):
         y_values = list(np.unique(y))
-        if len(y_values) == 2:
-            # Map the two classes in the y vector into {0,1}, and record the mapping so that
-            # the predict() method can return results in the original range
-            if not (-1 in y_values and 1 in y_values) or (0 in y_values and 1 in y_values) or (True in y_values and False in y_values):
-                raise ValueError("For a binary classifier, y must be in (0,1), or (-1,1), or (True,False).")
-            if -1 in y_values:
-                self._yspace = "svm_like"
-                training_labels = y.copy()
-                training_labels[training_labels == -1] = 0
-            elif False in y_values:
-                self._yspace = "boolean"
-                training_labels = np.array(y, dtype=int)
-            else:
-                self._yspace = "zero_one"
-                training_labels = y
-            xgb_options = self.get_xgb_params()
-        else:
+        if len(y_values) > 2:
             # Switch to using a multiclass objective in the underlying XGB instance
-            self._yspace = "multiclass"
             self.objective = "multi:softprob"
-            self._le = LabelEncoder().fit(y)
-            training_labels = self._le.transform(y)
             xgb_options = self.get_xgb_params()
             xgb_options['num_class'] = len(y_values)
+        else:
+            xgb_options = self.get_xgb_params()
+
+        self._le = LabelEncoder().fit(y)
+        training_labels = self._le.transform(y)
+
         if sample_weight is not None:
             trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight)
         else:
             trainDmatrix = DMatrix(X, label=training_labels)
 
         self._Booster = train(xgb_options, trainDmatrix, self.n_rounds)
+
         return self
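
The rewritten fit() drops the _yspace bookkeeping and the hand-rolled {0,1}/(-1,1)/(True,False) validation: every label space, binary or multiclass, now goes through a single LabelEncoder whose mapping is stored in self._le so predict() can invert it. A standalone round-trip sketch (not from the commit) of why that subsumes the deleted special cases:

    # LabelEncoder maps any label set to contiguous ints 0..K-1 and can
    # invert the mapping, covering the {0,1}, {-1,1}, boolean, and
    # arbitrary-multiclass cases the old code handled one by one.
    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    for y in (np.array([0, 1, 1, 0]),
              np.array([-1, 1, 1, -1]),
              np.array([True, False, True]),
              np.array(['cat', 'dog', 'bird'])):
        le = LabelEncoder().fit(y)
        encoded = le.transform(y)
        assert (le.inverse_transform(encoded) == y).all()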
@@ -757,22 +750,12 @@ class XGBClassifier(XGBModel):
     def predict(self, X):
         testDmatrix = DMatrix(X)
         class_probs = self._Booster.predict(testDmatrix)
-        if self._yspace == "multiclass":
+        if len(class_probs.shape) > 1:
             column_indexes = np.argmax(class_probs, axis=1)
-            fitted_values = self._le.inverse_transform(column_indexes)
         else:
-            if self._yspace == "svm_like":
-                base_value = -1
-                one_value = 1
-            elif self._yspace == "boolean":
-                base_value = False
-                one_value = True
-            else:
-                base_value = 0
-                one_value = 1
-            fitted_values = np.repeat(base_value, X.shape[0])
-            fitted_values[class_probs > 0.5] = one_value
-        return fitted_values
+            column_indexes = np.repeat(0, X.shape[0])
+            column_indexes[class_probs > 0.5] = 1
+        return self._le.inverse_transform(column_indexes)
 
     def predict_proba(self, X):
         testDmatrix = DMatrix(X)
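
With labels always LabelEncoder-encoded, predict() can dispatch on the shape of the booster output instead of on _yspace: multi:softprob yields an (n_samples, n_classes) probability matrix, while binary:logistic yields a 1-D vector of class-one probabilities. A standalone sketch of the dispatch on dummy arrays (labels_from_probs is an illustrative name, not in the wrapper):

    import numpy as np

    def labels_from_probs(class_probs, n_rows):
        # 2-D output: per-class probabilities (multi:softprob) -> argmax
        if len(class_probs.shape) > 1:
            return np.argmax(class_probs, axis=1)
        # 1-D output: P(class 1) from binary:logistic -> threshold at 0.5
        column_indexes = np.repeat(0, n_rows)
        column_indexes[class_probs > 0.5] = 1
        return column_indexes

    print(labels_from_probs(np.array([[0.1, 0.7, 0.2]]), 1))  # [1]
    print(labels_from_probs(np.array([0.2, 0.9]), 2))         # [0 1]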
@@ -784,6 +767,7 @@ class XGBClassifier(XGBModel):
         classzero_probs = 1.0 - classone_probs
         return np.vstack((classzero_probs,classone_probs)).transpose()
 
+class XGBRegressor(XGBModel, RegressorMixin):
+    pass
 
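
Mixing RegressorMixin into the otherwise-empty XGBRegressor is enough to pick up scikit-learn's stock score() method (R^2) on top of the fit() and predict() inherited from XGBModel. A hedged usage sketch against this revision of the wrapper:

    import xgboost as xgb
    from sklearn.datasets import load_boston

    boston = load_boston()
    reg = xgb.XGBRegressor().fit(boston['data'], boston['target'])
    # score() comes from RegressorMixin: R^2 of the predictions on (X, y)
    print(reg.score(boston['data'], boston['target']))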