Previously, we used `libsvm` as the default when no format was specified. However, the dmlc data parser is not particularly robust against errors, and the most common type of error is an unspecified format. For that reason, we now recommend that users use other data loaders instead. We will continue to maintain the parsers, as they are currently used in many internal tests, including federated learning.
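If you still rely on text inputs, either name the format explicitly in the URI instead of depending on the old default, or parse the file with an external loader and pass the result to `DMatrix`. A minimal sketch of both options (the path mirrors the demo data used in the tests below; `load_svmlight_file` comes from scikit-learn):

    import xgboost as xgb
    from sklearn.datasets import load_svmlight_file

    # Option 1: keep the dmlc parser, but state the format explicitly in the URI.
    dtrain = xgb.DMatrix("demo/data/agaricus.txt.train?format=libsvm")

    # Option 2 (recommended): parse with an external loader and hand the
    # resulting sparse matrix to DMatrix directly.
    X, y = load_svmlight_file("demo/data/agaricus.txt.train")
    dtrain = xgb.DMatrix(X, label=y)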
import numpy as np
import pytest

import xgboost as xgb
from xgboost import testing as tm

dpath = 'demo/data/'

def is_increasing(y):
    """Return True if ``y`` is monotonically non-decreasing."""
    return np.count_nonzero(np.diff(y) < 0.0) == 0


def is_decreasing(y):
    """Return True if ``y`` is monotonically non-increasing."""
    return np.count_nonzero(np.diff(y) > 0.0) == 0

def is_correctly_constrained(learner, feature_names=None):
    """Sweep each feature over a grid while holding the other fixed, and check
    that predictions increase in feature 0 and decrease in feature 1."""
    n = 100
    variable_x = np.linspace(0, 1, n).reshape((n, 1))
    fixed_xs_values = np.linspace(0, 1, n)

    for i in range(n):
        fixed_x = fixed_xs_values[i] * np.ones((n, 1))
        monotonically_increasing_x = np.column_stack((variable_x, fixed_x))
        monotonically_increasing_dset = xgb.DMatrix(monotonically_increasing_x,
                                                    feature_names=feature_names)
        monotonically_increasing_y = learner.predict(
            monotonically_increasing_dset
        )

        monotonically_decreasing_x = np.column_stack((fixed_x, variable_x))
        monotonically_decreasing_dset = xgb.DMatrix(monotonically_decreasing_x,
                                                    feature_names=feature_names)
        monotonically_decreasing_y = learner.predict(
            monotonically_decreasing_dset
        )

        if not (
            is_increasing(monotonically_increasing_y) and
            is_decreasing(monotonically_decreasing_y)
        ):
            return False

    return True

# Synthetic training data: y is positively correlated with x1 and negatively
# with x2, but the sine/cosine terms make the true relationship locally
# non-monotone, so only a constrained model yields monotone predictions.
number_of_dpoints = 1000
x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints)
x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints)

x = np.column_stack((
    x1_positively_correlated_with_y, x2_negatively_correlated_with_y
))
zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints)
y = (
    5 * x1_positively_correlated_with_y +
    np.sin(10 * np.pi * x1_positively_correlated_with_y) -
    5 * x2_negatively_correlated_with_y -
    np.cos(10 * np.pi * x2_negatively_correlated_with_y) +
    zs
)
training_dset = xgb.DMatrix(x, label=y)

class TestMonotoneConstraints:
    def test_monotone_constraints_for_exact_tree_method(self):
        # first check monotonicity for the 'exact' tree method
        params_for_constrained_exact_method = {
            'tree_method': 'exact', 'verbosity': 1,
            'monotone_constraints': '(1, -1)'
        }
        constrained_exact_method = xgb.train(
            params_for_constrained_exact_method, training_dset
        )
        assert is_correctly_constrained(constrained_exact_method)
    @pytest.mark.parametrize(
        "tree_method,policy",
        [
            ("hist", "depthwise"),
            ("approx", "depthwise"),
            ("hist", "lossguide"),
            ("approx", "lossguide"),
        ],
    )
    def test_monotone_constraints(self, tree_method: str, policy: str) -> None:
        params_for_constrained = {
            "tree_method": tree_method,
            "grow_policy": policy,
            "monotone_constraints": "(1, -1)",
        }
        constrained = xgb.train(params_for_constrained, training_dset)
        assert is_correctly_constrained(constrained)

    def test_monotone_constraints_tuple(self) -> None:
        params_for_constrained = {"monotone_constraints": (1, -1)}
        constrained = xgb.train(params_for_constrained, training_dset)
        assert is_correctly_constrained(constrained)
    @pytest.mark.parametrize('format', [dict, list])
    def test_monotone_constraints_feature_names(self, format):
        # next check monotonicity when initializing monotone_constraints by feature names
        params = {
            'tree_method': 'hist',
            'grow_policy': 'lossguide',
            'monotone_constraints': {'feature_0': 1, 'feature_1': -1}
        }

        if format == list:
            params = list(params.items())

        # the training data carries no feature names, so named constraints must fail
        with pytest.raises(ValueError):
            xgb.train(params, training_dset)

        # feature names that do not cover the constraint keys must also fail
        feature_names = ['feature_0', 'feature_2']
        training_dset_w_feature_names = xgb.DMatrix(x, label=y, feature_names=feature_names)

        with pytest.raises(ValueError):
            xgb.train(params, training_dset_w_feature_names)

        feature_names = ['feature_0', 'feature_1']
        training_dset_w_feature_names = xgb.DMatrix(x, label=y, feature_names=feature_names)

        constrained_learner = xgb.train(
            params, training_dset_w_feature_names
        )

        assert is_correctly_constrained(constrained_learner, feature_names)
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_training_accuracy(self):
        from sklearn.metrics import accuracy_score
        dtrain = xgb.DMatrix(dpath + "agaricus.txt.train?indexing_mode=1&format=libsvm")
        dtest = xgb.DMatrix(dpath + "agaricus.txt.test?indexing_mode=1&format=libsvm")
        params = {'eta': 1, 'max_depth': 6, 'objective': 'binary:logistic',
                  'tree_method': 'hist', 'monotone_constraints': '(1, 0)'}
        num_boost_round = 5

        params['grow_policy'] = 'lossguide'
        bst = xgb.train(params, dtrain, num_boost_round)
        # predictions are inverted (< 0.5), so an accuracy below 0.1 here means
        # the model classifies more than 90% of the test set correctly
        pred_dtest = (bst.predict(dtest) < 0.5)
        assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1

        params['grow_policy'] = 'depthwise'
        bst = xgb.train(params, dtrain, num_boost_round)
        pred_dtest = (bst.predict(dtest) < 0.5)
        assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1