Added SKLearn-like random forest Python API. (#4148)

* Added SKLearn-like random forest Python API.

- added XGBRFClassifier and XGBRFRegressor classes to SKL-like xgboost API
- also added n_gpus and gpu_id parameters to SKL classes
- added documentation describing how to use xgboost for random forests,
  as well as existing caveats
This commit was authored by:
Andy Adinets
2019-03-12 15:28:19 +01:00
committed by Jiaming Yuan
parent 6fb4c5efef
commit a36c3ed4f4
4 changed files with 240 additions and 55 deletions

View File

@@ -29,13 +29,14 @@ def test_binary_classification():
y = digits['target']
X = digits['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X, y):
xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
preds = xgb_model.predict(X[test_index])
labels = y[test_index]
err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.1
for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier):
for train_index, test_index in kf.split(X, y):
xgb_model = cls(random_state=42).fit(X[train_index], y[train_index])
preds = xgb_model.predict(X[test_index])
labels = y[test_index]
err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.1
def test_multiclass_classification():
@@ -83,8 +84,8 @@ def test_ranking():
valid_group = np.repeat(50, 4)
x_test = np.random.rand(100, 10)
params = {'objective': 'rank:pairwise', 'learning_rate': 0.1,
'gamma': 1.0, 'min_child_weight': 0.1,
params = {'tree_method': 'exact', 'objective': 'rank:pairwise',
'learning_rate': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1,
'max_depth': 6, 'n_estimators': 4}
model = xgb.sklearn.XGBRanker(**params)
model.fit(x_train, y_train, train_group,
@@ -97,7 +98,8 @@ def test_ranking():
train_data.set_group(train_group)
valid_data.set_group(valid_group)
params_orig = {'objective': 'rank:pairwise', 'eta': 0.1, 'gamma': 1.0,
params_orig = {'tree_method': 'exact', 'objective': 'rank:pairwise',
'eta': 0.1, 'gamma': 1.0,
'min_child_weight': 0.1, 'max_depth': 6}
xgb_model_orig = xgb.train(params_orig, train_data, num_boost_round=4,
evals=[(valid_data, 'validation')])
@@ -113,7 +115,7 @@ def test_feature_importances_weight():
y = digits['target']
X = digits['data']
xgb_model = xgb.XGBClassifier(
random_state=0, importance_type="weight").fit(X, y)
random_state=0, tree_method="exact", importance_type="weight").fit(X, y)
exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0.,
0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0.,
@@ -130,11 +132,11 @@ def test_feature_importances_weight():
y = pd.Series(digits['target'])
X = pd.DataFrame(digits['data'])
xgb_model = xgb.XGBClassifier(
random_state=0, importance_type="weight").fit(X, y)
random_state=0, tree_method="exact", importance_type="weight").fit(X, y)
np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
xgb_model = xgb.XGBClassifier(
random_state=0, importance_type="weight").fit(X, y)
random_state=0, tree_method="exact", importance_type="weight").fit(X, y)
np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
@@ -145,7 +147,7 @@ def test_feature_importances_gain():
y = digits['target']
X = digits['data']
xgb_model = xgb.XGBClassifier(
random_state=0, importance_type="gain").fit(X, y)
random_state=0, tree_method="exact", importance_type="gain").fit(X, y)
exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0.00326159, 0., 0., 0., 0., 0., 0., 0., 0.,
@@ -163,11 +165,11 @@ def test_feature_importances_gain():
y = pd.Series(digits['target'])
X = pd.DataFrame(digits['data'])
xgb_model = xgb.XGBClassifier(
random_state=0, importance_type="gain").fit(X, y)
random_state=0, tree_method="exact", importance_type="gain").fit(X, y)
np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
xgb_model = xgb.XGBClassifier(
random_state=0, importance_type="gain").fit(X, y)
random_state=0, tree_method="exact", importance_type="gain").fit(X, y)
np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
@@ -199,6 +201,23 @@ def test_boston_housing_regression():
assert mean_squared_error(preds4, labels) < 350
def test_boston_housing_rf_regression():
    """Random-forest regressor smoke test: XGBRFRegressor should reach a
    reasonable mean squared error on the Boston housing dataset under
    2-fold cross-validation."""
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    data = load_boston()
    features, target = data['data'], data['target']
    splitter = KFold(n_splits=2, shuffle=True, random_state=rng)
    for fit_idx, eval_idx in splitter.split(features, target):
        # Fixed seed so the forest (and hence the MSE bound) is reproducible.
        model = xgb.XGBRFRegressor(random_state=42)
        model.fit(features[fit_idx], target[fit_idx])
        predictions = model.predict(features[eval_idx])
        assert mean_squared_error(predictions, target[eval_idx]) < 35
def test_parameter_tuning():
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_boston