Feature weights (#5962)

This commit is contained in:
Jiaming Yuan
2020-08-18 19:55:41 +08:00
committed by GitHub
parent a418278064
commit 4d99c58a5f
25 changed files with 509 additions and 104 deletions

View File

@@ -0,0 +1,13 @@
#include <gtest/gtest.h>
#include "../../../src/common/common.h"
namespace xgboost {
namespace common {
// ArgSort over the values {3, 2, 1} is expected to return the index
// permutation {2, 1, 0}.
TEST(ArgSort, Basic) {
  std::vector<bst_feature_t> expected{2, 1, 0};
  std::vector<float> values{3.0, 2.0, 1.0};
  auto sorted_idx = ArgSort<bst_feature_t>(values);
  ASSERT_EQ(sorted_idx, expected);
}
} // namespace common
} // namespace xgboost

View File

@@ -8,9 +8,10 @@ namespace common {
TEST(ColumnSampler, Test) {
int n = 128;
ColumnSampler cs;
std::vector<float> feature_weights;
// No node sampling
cs.Init(n, 1.0f, 0.5f, 0.5f);
cs.Init(n, feature_weights, 1.0f, 0.5f, 0.5f);
auto set0 = cs.GetFeatureSet(0);
ASSERT_EQ(set0->Size(), 32);
@@ -23,7 +24,7 @@ TEST(ColumnSampler, Test) {
ASSERT_EQ(set2->Size(), 32);
// Node sampling
cs.Init(n, 0.5f, 1.0f, 0.5f);
cs.Init(n, feature_weights, 0.5f, 1.0f, 0.5f);
auto set3 = cs.GetFeatureSet(0);
ASSERT_EQ(set3->Size(), 32);
@@ -33,19 +34,19 @@ TEST(ColumnSampler, Test) {
ASSERT_EQ(set4->Size(), 32);
// No level or node sampling, should be the same at different depth
cs.Init(n, 1.0f, 1.0f, 0.5f);
cs.Init(n, feature_weights, 1.0f, 1.0f, 0.5f);
ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(),
cs.GetFeatureSet(1)->HostVector());
cs.Init(n, 1.0f, 1.0f, 1.0f);
cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f);
auto set5 = cs.GetFeatureSet(0);
ASSERT_EQ(set5->Size(), n);
cs.Init(n, 1.0f, 1.0f, 1.0f);
cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f);
auto set6 = cs.GetFeatureSet(0);
ASSERT_EQ(set5->HostVector(), set6->HostVector());
// Should always be a minimum of one feature
cs.Init(n, 1e-16f, 1e-16f, 1e-16f);
cs.Init(n, feature_weights, 1e-16f, 1e-16f, 1e-16f);
ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1);
}
@@ -56,13 +57,13 @@ TEST(ColumnSampler, ThreadSynchronisation) {
size_t iterations = 10;
size_t levels = 5;
std::vector<bst_feature_t> reference_result;
bool success =
true; // Cannot use google test asserts in multithreaded region
std::vector<float> feature_weights;
bool success = true; // Cannot use google test asserts in multithreaded region
#pragma omp parallel num_threads(num_threads)
{
for (auto j = 0ull; j < iterations; j++) {
ColumnSampler cs(j);
cs.Init(n, 0.5f, 0.5f, 0.5f);
cs.Init(n, feature_weights, 0.5f, 0.5f, 0.5f);
for (auto level = 0ull; level < levels; level++) {
auto result = cs.GetFeatureSet(level)->ConstHostVector();
#pragma omp single
@@ -76,5 +77,54 @@ TEST(ColumnSampler, ThreadSynchronisation) {
}
ASSERT_TRUE(success);
}
// Checks that ColumnSampler takes per-feature weights into account.
//
// Part 1: two features, exactly one of which has a non-zero weight; the single
// sampled feature must be that one.
// Part 2: random weights for kCols features; across 1024 calls to
// GetFeatureSet(0), each returning half of the columns, the normalised
// selection frequency is compared against the normalised weights.
TEST(ColumnSampler, WeightedSampling) {
  auto test_basic = [](int first) {
    // first == 0 gives weights {1, 0}; first == 1 gives weights {0, 1}.
    std::vector<float> feature_weights(2);
    feature_weights[0] = std::abs(first - 1.0f);
    feature_weights[1] = first - 0.0f;
    ColumnSampler cs{0};
    cs.Init(2, feature_weights, 1.0, 1.0, 0.5);
    auto feature_sets = cs.GetFeatureSet(0);
    auto const &h_feat_set = feature_sets->HostVector();
    // Only one of the two features is kept, and it must be the weighted one.
    ASSERT_EQ(h_feat_set.size(), 1);
    ASSERT_EQ(h_feat_set[0], first - 0);
  };
  test_basic(0);
  test_basic(1);

  size_t constexpr kCols = 64;
  std::vector<float> feature_weights(kCols);
  SimpleLCG rng;
  SimpleRealUniformDistribution<float> dist(.0f, 12.0f);
  std::generate(feature_weights.begin(), feature_weights.end(), [&]() { return dist(&rng); });
  ColumnSampler cs{0};
  cs.Init(kCols, feature_weights, 0.5f, 1.0f, 1.0f);

  // Count how often each feature is selected.
  std::vector<float> freq(kCols, 0);
  for (size_t i = 0; i < 1024; ++i) {
    auto fset = cs.GetFeatureSet(0);
    ASSERT_EQ(kCols * 0.5, fset->Size());
    auto const& h_fset = fset->HostVector();
    for (auto f : h_fset) {
      freq[f] += 1.0f;
    }
  }

  // Normalise both the observed frequencies and the weights so they sum to 1
  // and can be compared element-wise.
  auto norm = std::accumulate(freq.cbegin(), freq.cend(), .0f);
  for (auto& f : freq) {
    f /= norm;
  }
  norm = std::accumulate(feature_weights.cbegin(), feature_weights.cend(), .0f);
  for (auto& f : feature_weights) {
    f /= norm;
  }

  for (size_t i = 0; i < feature_weights.size(); ++i) {
    EXPECT_NEAR(freq[i], feature_weights[i], 1e-2);
  }
}
} // namespace common
} // namespace xgboost

View File

@@ -204,12 +204,11 @@ TEST(GpuHist, EvaluateRootSplit) {
ASSERT_EQ(maker.hist.Data().size(), hist.size());
thrust::copy(hist.begin(), hist.end(),
maker.hist.Data().begin());
std::vector<float> feature_weights;
maker.column_sampler.Init(kNCols,
param.colsample_bynode,
param.colsample_bylevel,
param.colsample_bytree,
false);
maker.column_sampler.Init(kNCols, feature_weights, param.colsample_bynode,
param.colsample_bylevel, param.colsample_bytree,
false);
RegTree tree;
MetaInfo info;

View File

@@ -16,6 +16,20 @@ class TestDeviceQuantileDMatrix(unittest.TestCase):
match='is not supported for DeviceQuantileDMatrix'):
xgb.DeviceQuantileDMatrix(data, np.ones(5, dtype=np.float64))
@pytest.mark.skipif(**tm.no_cupy())
def test_dmatrix_feature_weights(self):
    """Feature weights set from a cupy array match the float32-cast input."""
    import cupy as cp
    rng = cp.random.RandomState(1994)
    # Draw the data before the weights so the seeded sequence is unchanged.
    data = rng.randn(5, 5)
    weights = rng.uniform(size=5)

    dmat = xgb.DMatrix(data)
    dmat.set_info(feature_weights=weights)

    stored = cp.array(dmat.get_float_info('feature_weights'))
    expected = weights.astype(np.float32)
    cp.testing.assert_array_equal(stored, expected)
@pytest.mark.skipif(**tm.no_cupy())
def test_dmatrix_cupy_init(self):
import cupy as cp

View File

@@ -1,12 +1,10 @@
import os
import subprocess
import sys
import pytest
import testing as tm
CURRENT_DIR = os.path.dirname(__file__)
ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR))
ROOT_DIR = tm.PROJECT_ROOT
DEMO_DIR = os.path.join(ROOT_DIR, 'demo')
PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, 'guide-python')
@@ -19,21 +17,27 @@ def test_basic_walkthrough():
os.remove('dump.raw.txt')
@pytest.mark.skipif(**tm.no_matplotlib())
def test_custom_multiclass_objective():
    """Run the ``custom_softmax.py`` demo script, invoked with ``--plot=0``."""
    demo_path = os.path.join(PYTHON_DEMO_DIR, 'custom_softmax.py')
    subprocess.check_call(['python', demo_path, '--plot=0'])
@pytest.mark.skipif(**tm.no_matplotlib())
def test_custom_rmsle_objective():
    """Run the ``custom_rmsle.py`` demo script; skipped on Python < 3.6."""
    # Compare (major, minor) as a tuple: looking at ``minor`` alone would
    # misjudge any interpreter whose major version is not 3.
    if sys.version_info[:2] < (3, 6):
        pytest.skip('Skipping RMLSE test due to Python version being too low.')
    script = os.path.join(PYTHON_DEMO_DIR, 'custom_rmsle.py')
    cmd = ['python', script, '--plot=0']
    subprocess.check_call(cmd)
@pytest.mark.skipif(**tm.no_matplotlib())
def test_feature_weights_demo():
    """Run the ``feature_weights.py`` demo script, invoked with ``--plot=0``."""
    demo_path = os.path.join(PYTHON_DEMO_DIR, 'feature_weights.py')
    subprocess.check_call(['python', demo_path, '--plot=0'])
@pytest.mark.skipif(**tm.no_sklearn())
def test_sklearn_demo():
script = os.path.join(PYTHON_DEMO_DIR, 'sklearn_examples.py')

View File

@@ -99,6 +99,11 @@ class TestDMatrix(unittest.TestCase):
X = rng.randn(100, 100)
y = rng.randint(low=0, high=3, size=100)
d = xgb.DMatrix(X, y)
np.testing.assert_equal(d.get_label(), y.astype(np.float32))
fw = rng.uniform(size=100).astype(np.float32)
d.set_info(feature_weights=fw)
eval_res_0 = {}
booster = xgb.train(
{'num_class': 3, 'objective': 'multi:softprob'}, d,
@@ -106,19 +111,23 @@ class TestDMatrix(unittest.TestCase):
predt = booster.predict(d)
predt = predt.reshape(100 * 3, 1)
d.set_base_margin(predt)
ridxs = [1, 2, 3, 4, 5, 6]
d = d.slice(ridxs)
sliced_margin = d.get_float_info('base_margin')
sliced = d.slice(ridxs)
sliced_margin = sliced.get_float_info('base_margin')
assert sliced_margin.shape[0] == len(ridxs) * 3
eval_res_1 = {}
xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, d,
num_boost_round=2, evals=[(d, 'd')], evals_result=eval_res_1)
xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, sliced,
num_boost_round=2, evals=[(sliced, 'd')],
evals_result=eval_res_1)
eval_res_0 = eval_res_0['d']['merror']
eval_res_1 = eval_res_1['d']['merror']
for i in range(len(eval_res_0)):
assert abs(eval_res_0[i] - eval_res_1[i]) < 0.02
@@ -196,13 +205,33 @@ class TestDMatrix(unittest.TestCase):
dtrain.get_float_info('base_margin')
dtrain.get_uint_info('group_ptr')
def test_feature_weights(self):
    """Feature weights round-trip, can be emptied, and bad values raise."""
    n_rows, n_cols = 10, 50
    rng = np.random.RandomState(1994)
    # Weights are drawn before the data to keep the seeded sequence unchanged.
    fw = rng.uniform(size=n_cols)
    X = rng.randn(n_rows, n_cols)

    m = xgb.DMatrix(X)
    m.set_info(feature_weights=fw)
    np.testing.assert_allclose(fw, m.get_float_info('feature_weights'))

    # Setting an empty array must leave zero stored feature weights.
    m.set_info(feature_weights=np.empty((0, 0)))
    assert m.get_float_info('feature_weights').shape[0] == 0

    # uniform() draws from [0, 1), so after the shift every weight is
    # negative; set_info is expected to raise ValueError for them.
    fw -= 1
    with self.assertRaises(ValueError):
        m.set_info(feature_weights=fw)
def test_sparse_dmatrix_csr(self):
nrow = 100
ncol = 1000
x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
assert x.indices.max() < ncol - 1
x.data[:] = 1
dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
watchlist = [(dtrain, 'train')]
param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}
@@ -215,7 +244,7 @@ class TestDMatrix(unittest.TestCase):
x = rand(nrow, ncol, density=0.0005, format='csc', random_state=rng)
assert x.indices.max() < nrow - 1
x.data[:] = 1
dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
watchlist = [(dtrain, 'train')]
param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}

View File

@@ -1,3 +1,5 @@
import collections
import importlib.util
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBoostLabelEncoder
@@ -654,6 +656,7 @@ def test_validation_weights_xgbmodel():
eval_set=[(X_train, y_train), (X_test, y_test)],
sample_weight_eval_set=[weights_train])
def test_validation_weights_xgbclassifier():
from sklearn.datasets import make_hastie_10_2
@@ -920,6 +923,64 @@ def test_pandas_input():
np.array([0, 1]))
def run_feature_weights(increasing):
    """Train with weighted column sampling and fit a line to the split counts.

    A regressor is fitted on random data with ``colsample_bynode=0.5`` and
    per-feature weights, saved as JSON, and parsed with the demo
    ``json_parser.py``.  The number of splits made on each feature is then
    regressed against the feature index.

    Parameters
    ----------
    increasing : bool
        When true, feature ``i`` gets weight ``i``; otherwise ``kCols - i``.

    Returns
    -------
    numpy.ndarray
        Coefficients from ``np.polyfit(..., deg=1)`` of split count against
        feature index; element 0 is the slope.
    """
    with TemporaryDirectory() as tmpdir:
        kRows = 512
        kCols = 64
        colsample_bynode = 0.5
        reg = xgb.XGBRegressor(tree_method='hist',
                               colsample_bynode=colsample_bynode)
        X = rng.randn(kRows, kCols)
        y = rng.randn(kRows)

        # Linearly increasing (0 .. kCols-1) or decreasing (kCols .. 1) weights.
        indices = np.arange(kCols, dtype=np.float64)
        fw = indices if increasing else kCols - indices

        reg.fit(X, y, feature_weights=fw)
        model_path = os.path.join(tmpdir, 'model.json')
        reg.save_model(model_path)
        with open(model_path) as fd:
            model_json = json.load(fd)

        # Load the demo JSON parser straight from its file path.
        parser_path = os.path.join(tm.PROJECT_ROOT, 'demo', 'json-model',
                                   'json_parser.py')
        spec = importlib.util.spec_from_file_location("JsonParser",
                                                      parser_path)
        parser = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(parser)
        model = parser.Model(model_json)

        # Count, per feature index, how many non-leaf nodes split on it.
        splits = {}
        for tree in model.trees:
            for n in range(len(tree.nodes)):
                if tree.is_leaf(n):
                    continue
                feature = tree.split_index(n)
                splits[feature] = splits.get(feature, 0) + 1

        # Feature indices in ascending order, paired with their split counts.
        k, v = zip(*sorted(splits.items()))
        return np.polyfit(k, v, deg=1)
def test_feature_weights():
    """Split counts should trend with the feature weights in both directions."""
    slope_increasing = run_feature_weights(True)[0]
    slope_decreasing = run_feature_weights(False)[0]
    # Approximate test: the thresholds depend on the implementation of the
    # random number generator in the std library.
    assert slope_increasing > 0.08
    assert slope_decreasing < -0.08
class TestBoostFromPrediction(unittest.TestCase):
def run_boost_from_prediction(self, tree_method):
from sklearn.datasets import load_breast_cancer