Feature weights (#5962)

2020-08-18 19:55:41 +08:00
parent a418278064
commit 4d99c58a5f
25 changed files with 509 additions and 104 deletions
--- a/tests/python/test_demos.py
+++ b/tests/python/test_demos.py
@@ -1,12 +1,10 @@
 import os
 import subprocess
-import sys
 import pytest
 import testing as tm


-CURRENT_DIR = os.path.dirname(__file__)
-ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR))
+ROOT_DIR = tm.PROJECT_ROOT
 DEMO_DIR = os.path.join(ROOT_DIR, 'demo')
 PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, 'guide-python')

@@ -19,21 +17,27 @@ def test_basic_walkthrough():
    os.remove('dump.raw.txt')


+@pytest.mark.skipif(**tm.no_matplotlib())
 def test_custom_multiclass_objective():
    script = os.path.join(PYTHON_DEMO_DIR, 'custom_softmax.py')
    cmd = ['python', script, '--plot=0']
    subprocess.check_call(cmd)


+@pytest.mark.skipif(**tm.no_matplotlib())
 def test_custom_rmsle_objective():
-    major, minor = sys.version_info[:2]
-    if minor < 6:
-        pytest.skip('Skipping RMLSE test due to Python version being too low.')
    script = os.path.join(PYTHON_DEMO_DIR, 'custom_rmsle.py')
    cmd = ['python', script, '--plot=0']
    subprocess.check_call(cmd)


+@pytest.mark.skipif(**tm.no_matplotlib())
+def test_feature_weights_demo():
+    script = os.path.join(PYTHON_DEMO_DIR, 'feature_weights.py')
+    cmd = ['python', script, '--plot=0']
+    subprocess.check_call(cmd)
+
+
@pytest.mark.skipif(**tm.no_sklearn())
 def test_sklearn_demo():
    script = os.path.join(PYTHON_DEMO_DIR, 'sklearn_examples.py')
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -99,6 +99,11 @@ class TestDMatrix(unittest.TestCase):
        X = rng.randn(100, 100)
        y = rng.randint(low=0, high=3, size=100)
        d = xgb.DMatrix(X, y)
+        np.testing.assert_equal(d.get_label(), y.astype(np.float32))
+
+        fw = rng.uniform(size=100).astype(np.float32)
+        d.set_info(feature_weights=fw)
+
        eval_res_0 = {}
        booster = xgb.train(
            {'num_class': 3, 'objective': 'multi:softprob'}, d,
@@ -106,19 +111,23 @@ class TestDMatrix(unittest.TestCase):

        predt = booster.predict(d)
        predt = predt.reshape(100 * 3, 1)
+
        d.set_base_margin(predt)

        ridxs = [1, 2, 3, 4, 5, 6]
-        d = d.slice(ridxs)
-        sliced_margin = d.get_float_info('base_margin')
+        sliced = d.slice(ridxs)
+
+        sliced_margin = sliced.get_float_info('base_margin')
        assert sliced_margin.shape[0] == len(ridxs) * 3

        eval_res_1 = {}
-        xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, d,
-                  num_boost_round=2, evals=[(d, 'd')], evals_result=eval_res_1)
+        xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, sliced,
+                  num_boost_round=2, evals=[(sliced, 'd')],
+                  evals_result=eval_res_1)

        eval_res_0 = eval_res_0['d']['merror']
        eval_res_1 = eval_res_1['d']['merror']
+
        for i in range(len(eval_res_0)):
            assert abs(eval_res_0[i] - eval_res_1[i]) < 0.02

@@ -196,13 +205,33 @@ class TestDMatrix(unittest.TestCase):
        dtrain.get_float_info('base_margin')
        dtrain.get_uint_info('group_ptr')

+    def test_feature_weights(self):
+        kRows = 10
+        kCols = 50
+        rng = np.random.RandomState(1994)
+        fw = rng.uniform(size=kCols)
+        X = rng.randn(kRows, kCols)
+        m = xgb.DMatrix(X)
+        m.set_info(feature_weights=fw)
+        np.testing.assert_allclose(fw, m.get_float_info('feature_weights'))
+        # Setting an empty array leaves the stored feature weights empty.
+        m.set_info(feature_weights=np.empty((0, 0)))
+
+        assert m.get_float_info('feature_weights').shape[0] == 0
+
+        fw -= 1
+
+        def assign_weight():
+            m.set_info(feature_weights=fw)
+        self.assertRaises(ValueError, assign_weight)
+
    def test_sparse_dmatrix_csr(self):
        nrow = 100
        ncol = 1000
        x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
        assert x.indices.max() < ncol - 1
        x.data[:] = 1
-        dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
+        dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
        assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
        watchlist = [(dtrain, 'train')]
        param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}
@@ -215,7 +244,7 @@ class TestDMatrix(unittest.TestCase):
        x = rand(nrow, ncol, density=0.0005, format='csc', random_state=rng)
        assert x.indices.max() < nrow - 1
        x.data[:] = 1
-        dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
+        dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
        assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
        watchlist = [(dtrain, 'train')]
        param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -1,3 +1,5 @@
+import collections
+import importlib.util
 import numpy as np
 import xgboost as xgb
 from xgboost.sklearn import XGBoostLabelEncoder
@@ -654,6 +656,7 @@ def test_validation_weights_xgbmodel():
                eval_set=[(X_train, y_train), (X_test, y_test)],
                sample_weight_eval_set=[weights_train])

+
 def test_validation_weights_xgbclassifier():
    from sklearn.datasets import make_hastie_10_2

@@ -920,6 +923,64 @@ def test_pandas_input():
                               np.array([0, 1]))


+def run_feature_weights(increasing):
+    with TemporaryDirectory() as tmpdir:
+        kRows = 512
+        kCols = 64
+        colsample_bynode = 0.5
+        reg = xgb.XGBRegressor(tree_method='hist',
+                               colsample_bynode=colsample_bynode)
+        X = rng.randn(kRows, kCols)
+        y = rng.randn(kRows)
+        fw = np.ones(shape=(kCols,))
+        for i in range(kCols):
+            if increasing:
+                fw[i] *= float(i)
+            else:
+                fw[i] *= float(kCols - i)
+
+        reg.fit(X, y, feature_weights=fw)
+        model_path = os.path.join(tmpdir, 'model.json')
+        reg.save_model(model_path)
+        with open(model_path) as fd:
+            model = json.load(fd)
+
+        parser_path = os.path.join(tm.PROJECT_ROOT, 'demo', 'json-model',
+                                   'json_parser.py')
+        spec = importlib.util.spec_from_file_location("JsonParser",
+                                                      parser_path)
+        foo = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(foo)
+        model = foo.Model(model)
+        splits = {}
+        total_nodes = 0
+        for tree in model.trees:
+            n_nodes = len(tree.nodes)
+            total_nodes += n_nodes
+            for n in range(n_nodes):
+                if tree.is_leaf(n):
+                    continue
+                if splits.get(tree.split_index(n), None) is None:
+                    splits[tree.split_index(n)] = 1
+                else:
+                    splits[tree.split_index(n)] += 1
+
+        od = collections.OrderedDict(sorted(splits.items()))
+        tuples = [(k, v) for k, v in od.items()]
+        k, v = list(zip(*tuples))
+        w = np.polyfit(k, v, deg=1)
+        return w
+
+
+def test_feature_weights():
+    poly_increasing = run_feature_weights(True)
+    poly_decreasing = run_feature_weights(False)
+    # Approximate test: the thresholds depend on the implementation of the
+    # random number generator in the standard library.
+    assert poly_increasing[0] > 0.08
+    assert poly_decreasing[0] < -0.08
+
+
 class TestBoostFromPrediction(unittest.TestCase):
    def run_boost_from_prediction(self, tree_method):
        from sklearn.datasets import load_breast_cancer