Feature weights (#5962)
This commit is contained in:
@@ -1,12 +1,10 @@
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import pytest
|
||||
import testing as tm
|
||||
|
||||
|
||||
CURRENT_DIR = os.path.dirname(__file__)
|
||||
ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR))
|
||||
ROOT_DIR = tm.PROJECT_ROOT
|
||||
DEMO_DIR = os.path.join(ROOT_DIR, 'demo')
|
||||
PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, 'guide-python')
|
||||
|
||||
@@ -19,21 +17,27 @@ def test_basic_walkthrough():
|
||||
os.remove('dump.raw.txt')
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_matplotlib())
|
||||
def test_custom_multiclass_objective():
|
||||
script = os.path.join(PYTHON_DEMO_DIR, 'custom_softmax.py')
|
||||
cmd = ['python', script, '--plot=0']
|
||||
subprocess.check_call(cmd)
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_matplotlib())
|
||||
def test_custom_rmsle_objective():
|
||||
major, minor = sys.version_info[:2]
|
||||
if minor < 6:
|
||||
pytest.skip('Skipping RMLSE test due to Python version being too low.')
|
||||
script = os.path.join(PYTHON_DEMO_DIR, 'custom_rmsle.py')
|
||||
cmd = ['python', script, '--plot=0']
|
||||
subprocess.check_call(cmd)
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_matplotlib())
|
||||
def test_feature_weights_demo():
|
||||
script = os.path.join(PYTHON_DEMO_DIR, 'feature_weights.py')
|
||||
cmd = ['python', script, '--plot=0']
|
||||
subprocess.check_call(cmd)
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_sklearn_demo():
|
||||
script = os.path.join(PYTHON_DEMO_DIR, 'sklearn_examples.py')
|
||||
|
||||
@@ -99,6 +99,11 @@ class TestDMatrix(unittest.TestCase):
|
||||
X = rng.randn(100, 100)
|
||||
y = rng.randint(low=0, high=3, size=100)
|
||||
d = xgb.DMatrix(X, y)
|
||||
np.testing.assert_equal(d.get_label(), y.astype(np.float32))
|
||||
|
||||
fw = rng.uniform(size=100).astype(np.float32)
|
||||
d.set_info(feature_weights=fw)
|
||||
|
||||
eval_res_0 = {}
|
||||
booster = xgb.train(
|
||||
{'num_class': 3, 'objective': 'multi:softprob'}, d,
|
||||
@@ -106,19 +111,23 @@ class TestDMatrix(unittest.TestCase):
|
||||
|
||||
predt = booster.predict(d)
|
||||
predt = predt.reshape(100 * 3, 1)
|
||||
|
||||
d.set_base_margin(predt)
|
||||
|
||||
ridxs = [1, 2, 3, 4, 5, 6]
|
||||
d = d.slice(ridxs)
|
||||
sliced_margin = d.get_float_info('base_margin')
|
||||
sliced = d.slice(ridxs)
|
||||
|
||||
sliced_margin = sliced.get_float_info('base_margin')
|
||||
assert sliced_margin.shape[0] == len(ridxs) * 3
|
||||
|
||||
eval_res_1 = {}
|
||||
xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, d,
|
||||
num_boost_round=2, evals=[(d, 'd')], evals_result=eval_res_1)
|
||||
xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, sliced,
|
||||
num_boost_round=2, evals=[(sliced, 'd')],
|
||||
evals_result=eval_res_1)
|
||||
|
||||
eval_res_0 = eval_res_0['d']['merror']
|
||||
eval_res_1 = eval_res_1['d']['merror']
|
||||
|
||||
for i in range(len(eval_res_0)):
|
||||
assert abs(eval_res_0[i] - eval_res_1[i]) < 0.02
|
||||
|
||||
@@ -196,13 +205,33 @@ class TestDMatrix(unittest.TestCase):
|
||||
dtrain.get_float_info('base_margin')
|
||||
dtrain.get_uint_info('group_ptr')
|
||||
|
||||
def test_feature_weights(self):
|
||||
kRows = 10
|
||||
kCols = 50
|
||||
rng = np.random.RandomState(1994)
|
||||
fw = rng.uniform(size=kCols)
|
||||
X = rng.randn(kRows, kCols)
|
||||
m = xgb.DMatrix(X)
|
||||
m.set_info(feature_weights=fw)
|
||||
np.testing.assert_allclose(fw, m.get_float_info('feature_weights'))
|
||||
# Handle empty
|
||||
m.set_info(feature_weights=np.empty((0, 0)))
|
||||
|
||||
assert m.get_float_info('feature_weights').shape[0] == 0
|
||||
|
||||
fw -= 1
|
||||
|
||||
def assign_weight():
|
||||
m.set_info(feature_weights=fw)
|
||||
self.assertRaises(ValueError, assign_weight)
|
||||
|
||||
def test_sparse_dmatrix_csr(self):
|
||||
nrow = 100
|
||||
ncol = 1000
|
||||
x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
|
||||
assert x.indices.max() < ncol - 1
|
||||
x.data[:] = 1
|
||||
dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
|
||||
dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
|
||||
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
|
||||
watchlist = [(dtrain, 'train')]
|
||||
param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}
|
||||
@@ -215,7 +244,7 @@ class TestDMatrix(unittest.TestCase):
|
||||
x = rand(nrow, ncol, density=0.0005, format='csc', random_state=rng)
|
||||
assert x.indices.max() < nrow - 1
|
||||
x.data[:] = 1
|
||||
dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
|
||||
dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
|
||||
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
|
||||
watchlist = [(dtrain, 'train')]
|
||||
param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import collections
|
||||
import importlib.util
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
from xgboost.sklearn import XGBoostLabelEncoder
|
||||
@@ -654,6 +656,7 @@ def test_validation_weights_xgbmodel():
|
||||
eval_set=[(X_train, y_train), (X_test, y_test)],
|
||||
sample_weight_eval_set=[weights_train])
|
||||
|
||||
|
||||
def test_validation_weights_xgbclassifier():
|
||||
from sklearn.datasets import make_hastie_10_2
|
||||
|
||||
@@ -920,6 +923,64 @@ def test_pandas_input():
|
||||
np.array([0, 1]))
|
||||
|
||||
|
||||
def run_feature_weights(increasing):
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
kRows = 512
|
||||
kCols = 64
|
||||
colsample_bynode = 0.5
|
||||
reg = xgb.XGBRegressor(tree_method='hist',
|
||||
colsample_bynode=colsample_bynode)
|
||||
X = rng.randn(kRows, kCols)
|
||||
y = rng.randn(kRows)
|
||||
fw = np.ones(shape=(kCols,))
|
||||
for i in range(kCols):
|
||||
if increasing:
|
||||
fw[i] *= float(i)
|
||||
else:
|
||||
fw[i] *= float(kCols - i)
|
||||
|
||||
reg.fit(X, y, feature_weights=fw)
|
||||
model_path = os.path.join(tmpdir, 'model.json')
|
||||
reg.save_model(model_path)
|
||||
with open(model_path) as fd:
|
||||
model = json.load(fd)
|
||||
|
||||
parser_path = os.path.join(tm.PROJECT_ROOT, 'demo', 'json-model',
|
||||
'json_parser.py')
|
||||
spec = importlib.util.spec_from_file_location("JsonParser",
|
||||
parser_path)
|
||||
foo = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(foo)
|
||||
model = foo.Model(model)
|
||||
splits = {}
|
||||
total_nodes = 0
|
||||
for tree in model.trees:
|
||||
n_nodes = len(tree.nodes)
|
||||
total_nodes += n_nodes
|
||||
for n in range(n_nodes):
|
||||
if tree.is_leaf(n):
|
||||
continue
|
||||
if splits.get(tree.split_index(n), None) is None:
|
||||
splits[tree.split_index(n)] = 1
|
||||
else:
|
||||
splits[tree.split_index(n)] += 1
|
||||
|
||||
od = collections.OrderedDict(sorted(splits.items()))
|
||||
tuples = [(k, v) for k, v in od.items()]
|
||||
k, v = list(zip(*tuples))
|
||||
w = np.polyfit(k, v, deg=1)
|
||||
return w
|
||||
|
||||
|
||||
def test_feature_weights():
|
||||
poly_increasing = run_feature_weights(True)
|
||||
poly_decreasing = run_feature_weights(False)
|
||||
# Approxmated test, this is dependent on the implementation of random
|
||||
# number generator in std library.
|
||||
assert poly_increasing[0] > 0.08
|
||||
assert poly_decreasing[0] < -0.08
|
||||
|
||||
|
||||
class TestBoostFromPrediction(unittest.TestCase):
|
||||
def run_boost_from_prediction(self, tree_method):
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
|
||||
Reference in New Issue
Block a user