Added SKLearn-like random forest Python API. (#4148)

* Added SKLearn-like random forest Python API.

- added XGBRFClassifier and XGBRFRegressor classes to SKL-like xgboost API
- also added n_gpus and gpu_id parameters to SKL classes
- added documentation describing how to use xgboost for random forests,
  as well as existing caveats
This commit was authored by:
Andy Adinets
2019-03-12 15:28:19 +01:00
committed by Jiaming Yuan
parent 6fb4c5efef
commit a36c3ed4f4
4 changed files with 240 additions and 55 deletions

View File

@@ -29,13 +29,14 @@ def test_binary_classification():
y = digits['target']
X = digits['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X, y):
xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
preds = xgb_model.predict(X[test_index])
labels = y[test_index]
err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.1
for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier):
for train_index, test_index in kf.split(X, y):
xgb_model = cls(random_state=42).fit(X[train_index], y[train_index])
preds = xgb_model.predict(X[test_index])
labels = y[test_index]
err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.1
def test_multiclass_classification():
@@ -83,8 +84,8 @@ def test_ranking():
valid_group = np.repeat(50, 4)
x_test = np.random.rand(100, 10)
params = {'objective': 'rank:pairwise', 'learning_rate': 0.1,
'gamma': 1.0, 'min_child_weight': 0.1,
params = {'tree_method': 'exact', 'objective': 'rank:pairwise',
'learning_rate': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1,
'max_depth': 6, 'n_estimators': 4}
model = xgb.sklearn.XGBRanker(**params)
model.fit(x_train, y_train, train_group,
@@ -97,7 +98,8 @@ def test_ranking():
train_data.set_group(train_group)
valid_data.set_group(valid_group)
params_orig = {'objective': 'rank:pairwise', 'eta': 0.1, 'gamma': 1.0,
params_orig = {'tree_method': 'exact', 'objective': 'rank:pairwise',
'eta': 0.1, 'gamma': 1.0,
'min_child_weight': 0.1, 'max_depth': 6}
xgb_model_orig = xgb.train(params_orig, train_data, num_boost_round=4,
evals=[(valid_data, 'validation')])
@@ -113,7 +115,7 @@ def test_feature_importances_weight():
y = digits['target']
X = digits['data']
xgb_model = xgb.XGBClassifier(
random_state=0, importance_type="weight").fit(X, y)
random_state=0, tree_method="exact", importance_type="weight").fit(X, y)
exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0.,
0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0.,
@@ -130,11 +132,11 @@ def test_feature_importances_weight():
y = pd.Series(digits['target'])
X = pd.DataFrame(digits['data'])
xgb_model = xgb.XGBClassifier(
random_state=0, importance_type="weight").fit(X, y)
random_state=0, tree_method="exact", importance_type="weight").fit(X, y)
np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
xgb_model = xgb.XGBClassifier(
random_state=0, importance_type="weight").fit(X, y)
random_state=0, tree_method="exact", importance_type="weight").fit(X, y)
np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
@@ -145,7 +147,7 @@ def test_feature_importances_gain():
y = digits['target']
X = digits['data']
xgb_model = xgb.XGBClassifier(
random_state=0, importance_type="gain").fit(X, y)
random_state=0, tree_method="exact", importance_type="gain").fit(X, y)
exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0.00326159, 0., 0., 0., 0., 0., 0., 0., 0.,
@@ -163,11 +165,11 @@ def test_feature_importances_gain():
y = pd.Series(digits['target'])
X = pd.DataFrame(digits['data'])
xgb_model = xgb.XGBClassifier(
random_state=0, importance_type="gain").fit(X, y)
random_state=0, tree_method="exact", importance_type="gain").fit(X, y)
np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
xgb_model = xgb.XGBClassifier(
random_state=0, importance_type="gain").fit(X, y)
random_state=0, tree_method="exact", importance_type="gain").fit(X, y)
np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
@@ -199,6 +201,23 @@ def test_boston_housing_regression():
assert mean_squared_error(preds4, labels) < 350
def test_boston_housing_rf_regression():
    """Random-forest regressor smoke test: XGBRFRegressor should reach a
    reasonable mean squared error on the Boston housing dataset under
    2-fold cross-validation."""
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    data = load_boston()
    features, target = data['data'], data['target']
    splitter = KFold(n_splits=2, shuffle=True, random_state=rng)
    for fit_idx, eval_idx in splitter.split(features, target):
        # Fixed seed so the forest (and hence the MSE bound) is reproducible.
        model = xgb.XGBRFRegressor(random_state=42)
        model.fit(features[fit_idx], target[fit_idx])
        predictions = model.predict(features[eval_idx])
        assert mean_squared_error(predictions, target[eval_idx]) < 35
def test_parameter_tuning():
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_boston