xgboost/tests/python/test_interaction_constraints.py
Jiaming Yuan 97abcc7ee2
Extract interaction constraint from split evaluator. (#5034)
*  Extract interaction constraints from the split evaluator.

The main motivation is model IO: num_feature and interaction_constraints were duplicated inside the split evaluator. Also, an interaction constraint is by itself a feature selector, acting much like the column sampler, so it is wasteful to bury it deep in the evaluator chain (a sketch of the selection rule follows below). Lastly, removing one more duplicated parameter is a win.
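For illustration, here is a minimal Python sketch of that selection rule. The real logic lives in C++ inside XGBoost; `allowed_features` is a made-up name for this sketch, not an actual API:

```python
def allowed_features(constraints, path_features, n_features):
    # Sketch only: which features may be used for the next split, given
    # the features already used on the path from the root.
    allowed = set(range(n_features))
    for f in path_features:
        # Each new split must share a constraint group with every feature
        # already used on this path; an unlisted feature matches only itself.
        groups = [set(g) for g in constraints if f in g] or [{f}]
        allowed &= set().union(*groups)
    return allowed

# With '[[0, 1]]' as in the test below, once feature 0 is used on a path
# only {0, 1} remain eligible, so feature 2 can never interact with them.
print(allowed_features([[0, 1]], path_features=[0], n_features=3))  # {0, 1}
```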

*  Enable interaction constraints for the approx tree method.

Now that the implementation is split out of the evaluator class, it is also enabled for the approx method, as the snippet below shows.
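A minimal usage sketch (the data here is made up, but the parameters are the real ones):

```python
import numpy as np
import xgboost

X = np.random.randn(256, 3)
y = X[:, 0] * X[:, 1] + X[:, 2]
dtrain = xgboost.DMatrix(X, label=y)
# The same parameter now works for 'approx' as well as 'exact' and
# 'hist': features 0 and 1 may interact, feature 2 may not.
bst = xgboost.train(
    {'tree_method': 'approx', 'interaction_constraints': '[[0, 1]]'},
    dtrain, num_boost_round=10)
```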

*  Remove obsolete code in colmaker.

These code paths were never documented nor actually used in the real world, and there isn't a single test covering them.

*  Unify the types used for row and column indices.

As input datasets march toward billions of rows, incorrect use of int is subject to overflow, and signed integer overflow is undefined behaviour in C++. This PR starts the process of unifying the index types to unsigned integers. There are optimizations that can exploit this undefined behaviour, but after some testing I don't see them benefiting XGBoost. The toy example below illustrates the hazard.
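A rough illustration of the hazard in numpy (numpy wraps fixed-width integers, whereas C++ signed overflow is undefined behaviour, so treat this as an analogy only):

```python
import numpy as np

row = np.int32(2**31 - 1)             # INT32_MAX: the last row a signed
                                      # 32-bit index can address
print(row + np.int32(1))              # wraps to -2147483648 (RuntimeWarning)
print(np.uint32(row) + np.uint32(1))  # 2147483648: still a usable index
```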
2019-11-14 20:11:41 +08:00


# -*- coding: utf-8 -*-
import numpy as np
import xgboost
import unittest
import testing as tm
import pytest

dpath = 'demo/data/'
rng = np.random.RandomState(1994)


class TestInteractionConstraints(unittest.TestCase):
    def run_interaction_constraints(self, tree_method):
        x1 = np.random.normal(loc=1.0, scale=1.0, size=1000)
        x2 = np.random.normal(loc=1.0, scale=1.0, size=1000)
        x3 = np.random.choice([1, 2, 3], size=1000, replace=True)
        y = (x1 + x2 + x3 + x1 * x2 * x3
             + np.random.normal(loc=0.001, scale=1.0, size=1000)
             + 3 * np.sin(x1))
        X = np.column_stack((x1, x2, x3))
        dtrain = xgboost.DMatrix(X, label=y)

        params = {
            'max_depth': 3,
            'eta': 0.1,
            'nthread': 2,
            'interaction_constraints': '[[0, 1]]',
            'tree_method': tree_method
        }
        num_boost_round = 12

        # Fit a model that only allows interaction between x1 and x2
        bst = xgboost.train(
            params, dtrain, num_boost_round, evals=[(dtrain, 'train')])

        # Set all observations to the same x3 value, then increment it
        # by the same amount for everyone
        def f(x):
            tmat = xgboost.DMatrix(
                np.column_stack((x1, x2, np.repeat(x, 1000))))
            return bst.predict(tmat)

        preds = [f(x) for x in [1, 2, 3]]

        # Check that incrementing x3 has the same effect on all
        # observations: x3 is constrained to be independent of x1 and x2,
        # and all observations start from the same x3 value
        diff1 = preds[1] - preds[0]
        assert np.all(np.abs(diff1 - diff1[0]) < 1e-4)
        diff2 = preds[2] - preds[1]
        assert np.all(np.abs(diff2 - diff2[0]) < 1e-4)
    def test_exact_interaction_constraints(self):
        self.run_interaction_constraints(tree_method='exact')

    def test_hist_interaction_constraints(self):
        self.run_interaction_constraints(tree_method='hist')

    def test_approx_interaction_constraints(self):
        self.run_interaction_constraints(tree_method='approx')
    # Helper, not collected by pytest directly; the sklearn skip marker
    # lives on the test_* methods below, where pytest can see it.
    def training_accuracy(self, tree_method):
        from sklearn.metrics import accuracy_score
        dtrain = xgboost.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
        dtest = xgboost.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
        params = {
            'eta': 1,
            'max_depth': 6,
            'objective': 'binary:logistic',
            'tree_method': tree_method,
            'interaction_constraints': '[[1,2], [2,3,4]]'
        }
        num_boost_round = 5

        params['grow_policy'] = 'lossguide'
        bst = xgboost.train(params, dtrain, num_boost_round)
        # Predictions are deliberately inverted (< 0.5), so accuracy below
        # 0.1 here means the underlying model is highly accurate.
        pred_dtest = (bst.predict(dtest) < 0.5)
        assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1

        params['grow_policy'] = 'depthwise'
        bst = xgboost.train(params, dtrain, num_boost_round)
        pred_dtest = (bst.predict(dtest) < 0.5)
        assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_hist_training_accuracy(self):
        self.training_accuracy(tree_method='hist')

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_exact_training_accuracy(self):
        self.training_accuracy(tree_method='exact')

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_approx_training_accuracy(self):
        self.training_accuracy(tree_method='approx')