xgboost/tests/python/test_linear.py

from __future__ import print_function

import numpy as np
import testing as tm
import unittest
import xgboost as xgb

try:
    from sklearn.linear_model import ElasticNet
    from sklearn.preprocessing import scale
    from regression_test_utilities import run_suite, parameter_combinations
except ImportError:
    None


def is_float(s):
    try:
        float(s)
        return 1
    except ValueError:
        return 0


def xgb_get_weights(bst):
    return np.array([float(s) for s in bst.get_dump()[0].split() if is_float(s)])


def assert_regression_result(results, tol):
    regression_results = [r for r in results if r["param"]["objective"] == "reg:linear"]
    for res in regression_results:
        X = scale(res["dataset"].X, with_mean=isinstance(res["dataset"].X, np.ndarray))
        y = res["dataset"].y
        reg_alpha = res["param"]["alpha"]
        reg_lambda = res["param"]["lambda"]
        pred = res["bst"].predict(xgb.DMatrix(X))
        weights = xgb_get_weights(res["bst"])[1:]
        enet = ElasticNet(alpha=reg_alpha + reg_lambda,
                          l1_ratio=reg_alpha / (reg_alpha + reg_lambda))
        enet.fit(X, y)
        enet_pred = enet.predict(X)
        assert np.isclose(weights, enet.coef_, rtol=tol, atol=tol).all(), (weights, enet.coef_)
        assert np.isclose(enet_pred, pred, rtol=tol, atol=tol).all(), (
            res["dataset"].name, enet_pred[:5], pred[:5])


# TODO: More robust classification tests
def assert_classification_result(results):
    classification_results = [r for r in results if r["param"]["objective"] != "reg:linear"]
    for res in classification_results:
        # Check accuracy  is reasonable
        assert res["eval"][-1] < 0.5, (res["dataset"].name, res["eval"][-1])


class TestLinear(unittest.TestCase):

    datasets = ["Boston", "Digits", "Cancer", "Sparse regression",
                "Boston External Memory"]

    def test_coordinate(self):
        tm._skip_if_no_sklearn()
        variable_param = {'booster': ['gblinear'], 'updater': ['coord_descent'], 'eta': [0.5],
                          'top_k': [10], 'tolerance': [1e-5], 'nthread': [2],
                          'alpha': [.005, .1], 'lambda': [.005],
                          'feature_selector': ['cyclic', 'shuffle', 'greedy', 'thrifty']
                          }
        for param in parameter_combinations(variable_param):
            results = run_suite(param, 200, self.datasets, scale_features=True)
            assert_regression_result(results, 1e-2)
            assert_classification_result(results)

    def test_shotgun(self):
        tm._skip_if_no_sklearn()
        variable_param = {'booster': ['gblinear'], 'updater': ['shotgun'], 'eta': [0.5],
                          'top_k': [10], 'tolerance': [1e-5], 'nthread': [2],
                          'alpha': [.005, .1], 'lambda': [.005],
                          'feature_selector': ['cyclic', 'shuffle']
                          }
        for param in parameter_combinations(variable_param):
            results = run_suite(param, 200, self.datasets, True)
            assert_regression_result(results, 1e-2)
            assert_classification_result(results)