# xgboost/tests/python/test_linear.py

from __future__ import print_function

import numpy as np
import testing as tm
import unittest
import pytest

import xgboost as xgb

try:
    from sklearn.linear_model import ElasticNet
    from sklearn.preprocessing import scale
    from regression_test_utilities import run_suite, parameter_combinations
except ImportError:
    None


def is_float(s):
    """Tell whether ``s`` can be converted with ``float()``.

    Parameters
    ----------
    s : object
        Usually a string token; any other object is accepted as well.

    Returns
    -------
    bool
        ``True`` if ``float(s)`` succeeds, ``False`` otherwise.  Booleans
        compare equal to the ``1``/``0`` that were returned previously, so
        truthiness-based callers are unaffected.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: unparsable string; TypeError: unsupported type such as
        # None or a list.  Either way the value is not a float.
        return False
    return True


def xgb_get_weights(bst):
    """Extract the numeric values from the first entry of a booster dump.

    The first element returned by ``bst.get_dump()`` is split on whitespace
    and every token accepted by ``is_float`` is converted to a float; all
    other tokens are ignored.

    Returns a NumPy array holding the values in the order they appear.
    """
    tokens = bst.get_dump()[0].split()
    numbers = [float(token) for token in tokens if is_float(token)]
    return np.array(numbers)


def assert_regression_result(results, tol):
    """Check gblinear regression runs against scikit-learn's ElasticNet.

    Only entries whose ``param["objective"]`` equals ``"reg:squarederror"``
    are examined.  For each of them the dataset features are passed through
    ``scale``, an ``ElasticNet`` is fitted using the entry's ``alpha`` and
    ``lambda`` parameters, and both the coefficients and the predictions of
    the booster must match the reference model.

    Parameters
    ----------
    results : iterable of dict
        Each entry provides the keys ``"param"``, ``"dataset"`` and ``"bst"``.
    tol : float
        Used as both the relative and the absolute tolerance of the
        comparisons.

    Raises
    ------
    AssertionError
        If coefficients or predictions differ by more than ``tol``.
    """
    squared_error_runs = [
        entry for entry in results
        if entry["param"]["objective"] == "reg:squarederror"
    ]
    for entry in squared_error_runs:
        dataset = entry["dataset"]
        # Mean-centring is requested only when X is a NumPy ndarray
        # (presumably because other inputs are sparse -- confirm).
        centre = isinstance(dataset.X, np.ndarray)
        X = scale(dataset.X, with_mean=centre)
        y = dataset.y

        l1_penalty = entry["param"]["alpha"]
        l2_penalty = entry["param"]["lambda"]

        booster = entry["bst"]
        booster_pred = booster.predict(xgb.DMatrix(X))
        # The first parsed value is skipped; presumably it is the bias term
        # -- confirm against the dump format.
        booster_coef = xgb_get_weights(booster)[1:]

        total_penalty = l1_penalty + l2_penalty
        reference = ElasticNet(alpha=total_penalty,
                               l1_ratio=l1_penalty / total_penalty)
        reference.fit(X, y)
        reference_pred = reference.predict(X)

        coef_match = np.isclose(booster_coef, reference.coef_,
                                rtol=tol, atol=tol)
        assert coef_match.all(), (booster_coef, reference.coef_)

        pred_match = np.isclose(reference_pred, booster_pred,
                                rtol=tol, atol=tol)
        assert pred_match.all(), (
            dataset.name, reference_pred[:5], booster_pred[:5])


# TODO: More robust classification tests
def assert_classification_result(results):
    """Sanity-check every non-regression run in ``results``.

    Entries whose ``param["objective"]`` is ``"reg:squarederror"`` are
    skipped.  For all others the last value of ``entry["eval"]`` must be
    strictly below 0.5 (presumably an error metric -- confirm with the
    suite runner).

    Raises AssertionError carrying ``(dataset name, last eval value)`` when
    the check fails.
    """
    non_regression_runs = [
        entry for entry in results
        if entry["param"]["objective"] != "reg:squarederror"
    ]
    for entry in non_regression_runs:
        # Check accuracy is reasonable
        final_metric = entry["eval"][-1]
        assert final_metric < 0.5, (entry["dataset"].name, final_metric)


class TestLinear(unittest.TestCase):
    """Run the regression-test suite for the ``gblinear`` booster updaters.

    Every test expands a parameter grid with ``parameter_combinations``,
    runs each combination through ``run_suite`` on ``datasets`` and then
    validates the regression and classification outcomes.
    """

    # Dataset names handed to ``run_suite``.
    datasets = [
        "Boston",
        "Digits",
        "Cancer",
        "Sparse regression",
        "Boston External Memory",
    ]

    @staticmethod
    def _check_results(results):
        """Apply the regression and classification checks to one suite run."""
        assert_regression_result(results, 1e-2)
        assert_classification_result(results)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_coordinate(self):
        """Exercise the ``coord_descent`` updater with all feature selectors."""
        grid = {
            'booster': ['gblinear'],
            'updater': ['coord_descent'],
            'eta': [0.5],
            'top_k': [10],
            'tolerance': [1e-5],
            'nthread': [2],
            'alpha': [.005, .1],
            'lambda': [.005],
            'feature_selector': ['cyclic', 'shuffle', 'greedy', 'thrifty'],
        }
        for combo in parameter_combinations(grid):
            suite_results = run_suite(combo, 150, self.datasets,
                                      scale_features=True)
            self._check_results(suite_results)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_shotgun(self):
        """Exercise the ``shotgun`` updater with cyclic and shuffle selectors."""
        grid = {
            'booster': ['gblinear'],
            'updater': ['shotgun'],
            'eta': [0.5],
            'top_k': [10],
            'tolerance': [1e-5],
            'nthread': [2],
            'alpha': [.005, .1],
            'lambda': [.005],
            'feature_selector': ['cyclic', 'shuffle'],
        }
        for combo in parameter_combinations(grid):
            suite_results = run_suite(combo, 200, self.datasets, True)
            self._check_results(suite_results)