Rewrite approx (#7214)

This PR rewrites the `approx` tree method on top of the `hist` codebase, improving performance and sharing code between the two methods.

The rewrite brings a number of benefits:
- Support for both `max_leaves` and `max_depth`.
- Support for `grow_policy`.
- Support for monotonic constraints.
- Support for feature weights.
- Simpler bin configuration via `max_bin`.
- Support for categorical data.
- Faster training on most datasets, often by a large factor.
- Support for the prediction cache.
- Significantly better external-memory performance.
- A unified code base between `approx` and `hist`.
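For illustration, here is a minimal Python sketch (not part of the diff; synthetic data and arbitrary parameter values) exercising a couple of the options that `tree_method="approx"` now honors:

```python
import numpy as np
import xgboost as xgb

# Synthetic regression data; any numeric matrix works here.
rng = np.random.default_rng(0)
X, y = rng.normal(size=(1024, 8)), rng.normal(size=1024)

booster = xgb.train(
    {
        "tree_method": "approx",
        "max_bin": 64,     # bin configuration, previously hist-only
        "max_leaves": 32,  # honored alongside max_depth after the rewrite
    },
    xgb.DMatrix(X, label=y),
    num_boost_round=8,
)
```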
Commit 001503186c (parent ed95e77752), authored by Jiaming Yuan on 2022-01-10 21:15:05 +08:00 and committed via GitHub. 22 changed files with 635 additions and 264 deletions.


@@ -35,7 +35,7 @@ TEST(GBTree, SelectTreeMethod) {
   gbtree.Configure(args);
   auto const& tparam = gbtree.GetTrainParam();
   gbtree.Configure({{"tree_method", "approx"}});
-  ASSERT_EQ(tparam.updater_seq, "grow_histmaker,prune");
+  ASSERT_EQ(tparam.updater_seq, "grow_histmaker");
   gbtree.Configure({{"tree_method", "exact"}});
   ASSERT_EQ(tparam.updater_seq, "grow_colmaker,prune");
   gbtree.Configure({{"tree_method", "hist"}});


@@ -72,5 +72,58 @@ TEST(Approx, Partitioner) {
     }
   }
 }
+
+TEST(Approx, PredictionCache) {
+  size_t n_samples = 2048, n_features = 13;
+  auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
+
+  {
+    omp_set_num_threads(1);
+    GenericParameter ctx;
+    ctx.InitAllowUnknown(Args{{"nthread", "8"}});
+    std::unique_ptr<TreeUpdater> approx{
+        TreeUpdater::Create("grow_histmaker", &ctx, ObjInfo{ObjInfo::kRegression})};
+    RegTree tree;
+    std::vector<RegTree *> trees{&tree};
+    auto gpair = GenerateRandomGradients(n_samples);
+    approx->Configure(Args{{"max_bin", "64"}});
+    approx->Update(&gpair, Xy.get(), trees);
+    HostDeviceVector<float> out_prediction_cached;
+    out_prediction_cached.Resize(n_samples);
+    auto cache = linalg::VectorView<float>{
+        out_prediction_cached.HostSpan(), {out_prediction_cached.Size()}, GenericParameter::kCpuId};
+    ASSERT_TRUE(approx->UpdatePredictionCache(Xy.get(), cache));
+  }
+
+  std::unique_ptr<Learner> learner{Learner::Create({Xy})};
+  learner->SetParam("tree_method", "approx");
+  learner->SetParam("nthread", "0");
+  learner->Configure();
+
+  for (size_t i = 0; i < 8; ++i) {
+    learner->UpdateOneIter(i, Xy);
+  }
+
+  HostDeviceVector<float> out_prediction_cached;
+  learner->Predict(Xy, false, &out_prediction_cached, 0, 0);
+
+  Json model{Object()};
+  learner->SaveModel(&model);
+
+  HostDeviceVector<float> out_prediction;
+  {
+    std::unique_ptr<Learner> learner{Learner::Create({Xy})};
+    learner->LoadModel(model);
+    learner->Predict(Xy, false, &out_prediction, 0, 0);
+  }
+
+  auto const h_predt_cached = out_prediction_cached.ConstHostSpan();
+  auto const h_predt = out_prediction.ConstHostSpan();
+  ASSERT_EQ(h_predt.size(), h_predt_cached.size());
+  for (size_t i = 0; i < h_predt.size(); ++i) {
+    ASSERT_NEAR(h_predt[i], h_predt_cached[i], kRtEps);
+  }
+}
 }  // namespace tree
 }  // namespace xgboost
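The C++ test above exercises the updater's `UpdatePredictionCache` directly. As a rough Python-level analogue (a sketch, not part of the diff; data and tolerance are arbitrary), predictions that may be served from the training cache must agree with predictions from a freshly loaded copy of the model:

```python
import os
import tempfile

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X, y = rng.normal(size=(2048, 13)), rng.normal(size=2048)
dtrain = xgb.DMatrix(X, label=y)
booster = xgb.train({"tree_method": "approx"}, dtrain, num_boost_round=8)
# dtrain was used for training, so this prediction may come from the cache.
cached = booster.predict(dtrain)

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, "model.json")
    booster.save_model(path)
    fresh = xgb.Booster(model_file=path)  # this booster has no cache for dtrain
    np.testing.assert_allclose(fresh.predict(dtrain), cached, rtol=1e-6)
```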


@@ -315,57 +315,6 @@ TEST(GpuHist, TestHistogramIndex) {
   TestHistogramIndexImpl();
 }
-
-// gamma is an alias of min_split_loss
-int32_t TestMinSplitLoss(DMatrix* dmat, float gamma, HostDeviceVector<GradientPair>* gpair) {
-  Args args {
-      {"max_depth", "1"},
-      {"max_leaves", "0"},
-      // Disable all other parameters.
-      {"colsample_bynode", "1"},
-      {"colsample_bylevel", "1"},
-      {"colsample_bytree", "1"},
-      {"min_child_weight", "0.01"},
-      {"reg_alpha", "0"},
-      {"reg_lambda", "0"},
-      {"max_delta_step", "0"},
-      // test gamma
-      {"gamma", std::to_string(gamma)}
-  };
-
-  tree::GPUHistMakerSpecialised<GradientPairPrecise> hist_maker{ObjInfo{ObjInfo::kRegression}};
-  GenericParameter generic_param(CreateEmptyGenericParam(0));
-  hist_maker.Configure(args, &generic_param);
-
-  RegTree tree;
-  hist_maker.Update(gpair, dmat, {&tree});
-  auto n_nodes = tree.NumExtraNodes();
-  return n_nodes;
-}
-
-TEST(GpuHist, MinSplitLoss) {
-  constexpr size_t kRows = 32;
-  constexpr size_t kCols = 16;
-  constexpr float kSparsity = 0.6;
-  auto dmat = RandomDataGenerator(kRows, kCols, kSparsity).Seed(3).GenerateDMatrix();
-  auto gpair = GenerateRandomGradients(kRows);
-
-  {
-    int32_t n_nodes = TestMinSplitLoss(dmat.get(), 0.01, &gpair);
-    // This is not strictly verified, meaning the number `2` is whatever GPU Hist returned
-    // when writing this test, and is only used to check that the larger gamma below does
-    // prevent the tree from growing.
-    ASSERT_EQ(n_nodes, 2);
-  }
-  {
-    int32_t n_nodes = TestMinSplitLoss(dmat.get(), 100.0, &gpair);
-    // No new nodes with gamma == 100.
-    ASSERT_EQ(n_nodes, static_cast<decltype(n_nodes)>(0));
-  }
-}
-
 void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
                 size_t gpu_page_size, RegTree* tree,
                 HostDeviceVector<bst_float>* preds, float subsample = 1.0f,


@@ -61,7 +61,7 @@ class TestGrowPolicy : public ::testing::Test {
   }
 };
 
-TEST_F(TestGrowPolicy, DISABLED_Approx) {
+TEST_F(TestGrowPolicy, Approx) {
   this->TestTreeGrowPolicy("approx", "depthwise");
   this->TestTreeGrowPolicy("approx", "lossguide");
 }
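With the test re-enabled, both grow policies are expected to work under approx. A sketch of the user-facing behavior (synthetic data; parameter values are arbitrary):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X, y = rng.normal(size=(512, 8)), rng.normal(size=512)
dtrain = xgb.DMatrix(X, label=y)

# Both policies should now train without error under approx: lossguide
# expands the highest-gain leaf first, depthwise expands level by level.
for policy in ("depthwise", "lossguide"):
    xgb.train(
        {"tree_method": "approx", "grow_policy": policy, "max_leaves": 16},
        dtrain,
        num_boost_round=4,
    )
```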


@@ -114,4 +114,70 @@ TEST_F(UpdaterEtaTest, Approx) { this->RunTest("grow_histmaker"); }
 #if defined(XGBOOST_USE_CUDA)
 TEST_F(UpdaterEtaTest, GpuHist) { this->RunTest("grow_gpu_hist"); }
 #endif  // defined(XGBOOST_USE_CUDA)
+
+class TestMinSplitLoss : public ::testing::Test {
+  std::shared_ptr<DMatrix> dmat_;
+  HostDeviceVector<GradientPair> gpair_;
+
+  void SetUp() override {
+    constexpr size_t kRows = 32;
+    constexpr size_t kCols = 16;
+    constexpr float kSparsity = 0.6;
+    dmat_ = RandomDataGenerator(kRows, kCols, kSparsity).Seed(3).GenerateDMatrix();
+    gpair_ = GenerateRandomGradients(kRows);
+  }
+
+  int32_t Update(std::string updater, float gamma) {
+    Args args{{"max_depth", "1"},
+              {"max_leaves", "0"},
+              // Disable all other parameters.
+              {"colsample_bynode", "1"},
+              {"colsample_bylevel", "1"},
+              {"colsample_bytree", "1"},
+              {"min_child_weight", "0.01"},
+              {"reg_alpha", "0"},
+              {"reg_lambda", "0"},
+              {"max_delta_step", "0"},
+              // test gamma
+              {"gamma", std::to_string(gamma)}};
+
+    GenericParameter generic_param(CreateEmptyGenericParam(0));
+    auto up = std::unique_ptr<TreeUpdater>{
+        TreeUpdater::Create(updater, &generic_param, ObjInfo{ObjInfo::kRegression})};
+    up->Configure(args);
+
+    RegTree tree;
+    up->Update(&gpair_, dmat_.get(), {&tree});
+    auto n_nodes = tree.NumExtraNodes();
+    return n_nodes;
+  }
+
+ public:
+  void RunTest(std::string updater) {
+    {
+      int32_t n_nodes = Update(updater, 0.01);
+      // This is not strictly verified, meaning the number `2` is whatever GPU Hist returned
+      // when writing this test, and is only used to check that the larger gamma below does
+      // prevent the tree from growing.
+      ASSERT_EQ(n_nodes, 2);
+    }
+    {
+      int32_t n_nodes = Update(updater, 100.0);
+      // No new nodes with gamma == 100.
+      ASSERT_EQ(n_nodes, static_cast<decltype(n_nodes)>(0));
+    }
+  }
+};
+
+/* The exact tree method requires a pruner as an additional updater, so it is not tested here. */
+TEST_F(TestMinSplitLoss, Approx) { this->RunTest("grow_histmaker"); }
+TEST_F(TestMinSplitLoss, Hist) { this->RunTest("grow_quantile_histmaker"); }
+#if defined(XGBOOST_USE_CUDA)
+TEST_F(TestMinSplitLoss, GpuHist) { this->RunTest("grow_gpu_hist"); }
+#endif  // defined(XGBOOST_USE_CUDA)
 }  // namespace xgboost
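Since `gamma` is an alias of `min_split_loss`, the behavior this test pins down can be reproduced from Python. A sketch (synthetic data; with only 32 rows the achievable loss reduction stays well below 100, so no split should clear the threshold):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(3)
X, y = rng.normal(size=(32, 16)), rng.normal(size=32)

booster = xgb.train(
    {"tree_method": "approx", "max_depth": 1, "gamma": 100.0},
    xgb.DMatrix(X, label=y),
    num_boost_round=1,
)
# With such a large gamma the tree should remain a single leaf; the text
# dump of a stump is just "0:leaf=...".
dump = booster.get_dump()[0]
assert dump.strip().startswith("0:leaf=")
```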


@@ -7,6 +7,8 @@ from hypothesis import given, strategies, assume, settings, note
 sys.path.append("tests/python")
 import testing as tm
+import test_updaters as test_up
 
 parameter_strategy = strategies.fixed_dictionaries({
     'max_depth': strategies.integers(0, 11),
@@ -32,6 +34,8 @@ def train_result(param, dmat, num_rounds):
 class TestGPUUpdaters:
+    cputest = test_up.TestTreeMethod()
+
     @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
     @settings(deadline=None)
     def test_gpu_hist(self, param, num_rounds, dataset):
@@ -41,51 +45,12 @@ class TestGPUUpdaters:
         note(result)
         assert tm.non_increasing(result["train"][dataset.metric])
 
-    def run_categorical_basic(self, rows, cols, rounds, cats):
-        onehot, label = tm.make_categorical(rows, cols, cats, True)
-        cat, _ = tm.make_categorical(rows, cols, cats, False)
-
-        by_etl_results = {}
-        by_builtin_results = {}
-
-        parameters = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}
-
-        m = xgb.DMatrix(onehot, label, enable_categorical=False)
-        xgb.train(
-            parameters,
-            m,
-            num_boost_round=rounds,
-            evals=[(m, "Train")],
-            evals_result=by_etl_results,
-        )
-
-        m = xgb.DMatrix(cat, label, enable_categorical=True)
-        xgb.train(
-            parameters,
-            m,
-            num_boost_round=rounds,
-            evals=[(m, "Train")],
-            evals_result=by_builtin_results,
-        )
-
-        # There are guidelines on how to specify tolerance by treating the output as a
-        # random variable. But here the tree construction is extremely sensitive to
-        # floating point errors: a 1e-5 error in a histogram bin can lead to an entirely
-        # different tree. So even though the test is quite lenient, hypothesis can still
-        # pick up falsifying examples from time to time.
-        np.testing.assert_allclose(
-            np.array(by_etl_results["Train"]["rmse"]),
-            np.array(by_builtin_results["Train"]["rmse"]),
-            rtol=1e-3,
-        )
-        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
-
     @given(strategies.integers(10, 400), strategies.integers(3, 8),
            strategies.integers(1, 2), strategies.integers(4, 7))
     @settings(deadline=None)
     @pytest.mark.skipif(**tm.no_pandas())
     def test_categorical(self, rows, cols, rounds, cats):
-        self.run_categorical_basic(rows, cols, rounds, cats)
+        self.cputest.run_categorical_basic(rows, cols, rounds, cats, "gpu_hist")
 
     def test_categorical_32_cat(self):
         '''32 hits the bound of integer bitset, so special test'''
@@ -93,7 +58,7 @@ class TestGPUUpdaters:
         cols = 10
         cats = 32
         rounds = 4
-        self.run_categorical_basic(rows, cols, rounds, cats)
+        self.cputest.run_categorical_basic(rows, cols, rounds, cats, "gpu_hist")
 
     def test_invalid_categorical(self):
         import cupy as cp


@@ -63,7 +63,6 @@ training_dset = xgb.DMatrix(x, label=y)
 
 class TestMonotoneConstraints:
     def test_monotone_constraints_for_exact_tree_method(self):
-
         # first check monotonicity for the 'exact' tree method
@@ -76,32 +75,23 @@ class TestMonotoneConstraints:
         )
         assert is_correctly_constrained(constrained_exact_method)
 
-    def test_monotone_constraints_for_depthwise_hist_tree_method(self):
-
-        # next check monotonicity for the 'hist' tree method
-        params_for_constrained_hist_method = {
-            'tree_method': 'hist', 'verbosity': 1,
-            'monotone_constraints': '(1, -1)'
-        }
-        constrained_hist_method = xgb.train(
-            params_for_constrained_hist_method, training_dset
-        )
-        assert is_correctly_constrained(constrained_hist_method)
-
-    def test_monotone_constraints_for_lossguide_hist_tree_method(self):
-
-        # next check monotonicity for the 'hist' tree method
-        params_for_constrained_hist_method = {
-            'tree_method': 'hist', 'verbosity': 1,
-            'grow_policy': 'lossguide',
-            'monotone_constraints': '(1, -1)'
-        }
-        constrained_hist_method = xgb.train(
-            params_for_constrained_hist_method, training_dset
-        )
-        assert is_correctly_constrained(constrained_hist_method)
+    @pytest.mark.parametrize(
+        "tree_method,policy",
+        [
+            ("hist", "depthwise"),
+            ("approx", "depthwise"),
+            ("hist", "lossguide"),
+            ("approx", "lossguide"),
+        ],
+    )
+    def test_monotone_constraints(self, tree_method: str, policy: str) -> None:
+        params_for_constrained = {
+            "tree_method": tree_method,
+            "grow_policy": policy,
+            "monotone_constraints": "(1, -1)",
+        }
+        constrained = xgb.train(params_for_constrained, training_dset)
+        assert is_correctly_constrained(constrained)
 
     @pytest.mark.parametrize('format', [dict, list])
     def test_monotone_constraints_feature_names(self, format):
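For reference, a sketch of the property that `is_correctly_constrained` enforces (this is not the test helper itself; data and round counts here are made up): with `monotone_constraints="(1, -1)"` the prediction must be non-decreasing in the first feature and non-increasing in the second, now for approx as well as hist:

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.uniform(size=(512, 2))
y = X[:, 0] - X[:, 1] + rng.normal(scale=0.1, size=512)

booster = xgb.train(
    {
        "tree_method": "approx",
        "grow_policy": "lossguide",
        "monotone_constraints": "(1, -1)",
    },
    xgb.DMatrix(X, label=y),
    num_boost_round=10,
)

# Sweep feature 0 while holding feature 1 fixed; the constrained model's
# predictions should be non-decreasing along the sweep.
grid = np.linspace(0.0, 1.0, 32)
sweep = np.column_stack([grid, np.full_like(grid, 0.5)])
preds = booster.predict(xgb.DMatrix(sweep))
assert np.all(np.diff(preds) >= 0)
```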


@@ -45,14 +45,20 @@ class TestTreeMethod:
         result = train_result(param, dataset.get_dmat(), num_rounds)
         assert tm.non_increasing(result['train'][dataset.metric])
 
-    @given(exact_parameter_strategy, strategies.integers(1, 20),
-           tm.dataset_strategy)
+    @given(
+        exact_parameter_strategy,
+        hist_parameter_strategy,
+        strategies.integers(1, 20),
+        tm.dataset_strategy,
+    )
     @settings(deadline=None)
-    def test_approx(self, param, num_rounds, dataset):
-        param['tree_method'] = 'approx'
+    def test_approx(self, param, hist_param, num_rounds, dataset):
+        param["tree_method"] = "approx"
         param = dataset.set_params(param)
+        param.update(hist_param)
         result = train_result(param, dataset.get_dmat(), num_rounds)
-        assert tm.non_increasing(result['train'][dataset.metric], 1e-3)
+        note(result)
+        assert tm.non_increasing(result["train"][dataset.metric])
 
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_pruner(self):
@@ -126,3 +132,53 @@ class TestTreeMethod:
         y = [1000000., 0., 0., 500000.]
         w = [0, 0, 1, 0]
         model.fit(X, y, sample_weight=w)
+
+    def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
+        onehot, label = tm.make_categorical(rows, cols, cats, True)
+        cat, _ = tm.make_categorical(rows, cols, cats, False)
+
+        by_etl_results = {}
+        by_builtin_results = {}
+
+        predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
+        # Use one-hot splits exclusively.
+        parameters = {
+            "tree_method": tree_method, "predictor": predictor, "max_cat_to_onehot": 9999
+        }
+
+        m = xgb.DMatrix(onehot, label, enable_categorical=False)
+        xgb.train(
+            parameters,
+            m,
+            num_boost_round=rounds,
+            evals=[(m, "Train")],
+            evals_result=by_etl_results,
+        )
+
+        m = xgb.DMatrix(cat, label, enable_categorical=True)
+        xgb.train(
+            parameters,
+            m,
+            num_boost_round=rounds,
+            evals=[(m, "Train")],
+            evals_result=by_builtin_results,
+        )
+
+        # There are guidelines on how to specify tolerance by treating the output as a
+        # random variable. But here the tree construction is extremely sensitive to
+        # floating point errors: a 1e-5 error in a histogram bin can lead to an entirely
+        # different tree. So even though the test is quite lenient, hypothesis can still
+        # pick up falsifying examples from time to time.
+        np.testing.assert_allclose(
+            np.array(by_etl_results["Train"]["rmse"]),
+            np.array(by_builtin_results["Train"]["rmse"]),
+            rtol=1e-3,
+        )
+        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
+
+    @given(strategies.integers(10, 400), strategies.integers(3, 8),
+           strategies.integers(1, 2), strategies.integers(4, 7))
+    @settings(deadline=None)
+    @pytest.mark.skipif(**tm.no_pandas())
+    def test_categorical(self, rows, cols, rounds, cats):
+        self.run_categorical_basic(rows, cols, rounds, cats, "approx")


@@ -1184,9 +1184,13 @@ class TestWithDask:
             for arg in rabit_args:
                 if arg.decode('utf-8').startswith('DMLC_TRACKER_PORT'):
                     port_env = arg.decode('utf-8')
+                if arg.decode("utf-8").startswith("DMLC_TRACKER_URI"):
+                    uri_env = arg.decode("utf-8")
 
             port = port_env.split('=')
             env = os.environ.copy()
             env[port[0]] = port[1]
+            uri = uri_env.split("=")
+            env["DMLC_TRACKER_URI"] = uri[1]
 
             return subprocess.run([str(exe), test], env=env, capture_output=True)
 
         with LocalCluster(n_workers=4) as cluster:
@@ -1210,11 +1214,13 @@ class TestWithDask:
     @pytest.mark.gtest
     def test_quantile_basic(self) -> None:
         self.run_quantile('DistributedBasic')
+        self.run_quantile('SortedDistributedBasic')
 
     @pytest.mark.skipif(**tm.no_dask())
     @pytest.mark.gtest
     def test_quantile(self) -> None:
         self.run_quantile('Distributed')
+        self.run_quantile('SortedDistributed')
 
     @pytest.mark.skipif(**tm.no_dask())
     @pytest.mark.gtest
@@ -1252,13 +1258,17 @@ class TestWithDask:
         for i in range(kCols):
             fw[i] *= float(i)
         fw = da.from_array(fw)
-        poly_increasing = run_feature_weights(X, y, fw, model=xgb.dask.DaskXGBRegressor)
+        poly_increasing = run_feature_weights(
+            X, y, fw, "approx", model=xgb.dask.DaskXGBRegressor
+        )
 
         fw = np.ones(shape=(kCols,))
         for i in range(kCols):
             fw[i] *= float(kCols - i)
         fw = da.from_array(fw)
-        poly_decreasing = run_feature_weights(X, y, fw, model=xgb.dask.DaskXGBRegressor)
+        poly_decreasing = run_feature_weights(
+            X, y, fw, "approx", model=xgb.dask.DaskXGBRegressor
+        )
 
         # Approximate test: the outcome depends on the random number generator
         # implementation in the standard library.


@@ -1031,10 +1031,10 @@ def test_pandas_input():
                             np.array([0, 1]))
 
 
-def run_feature_weights(X, y, fw, model=xgb.XGBRegressor):
+def run_feature_weights(X, y, fw, tree_method, model=xgb.XGBRegressor):
     with tempfile.TemporaryDirectory() as tmpdir:
         colsample_bynode = 0.5
-        reg = model(tree_method='hist', colsample_bynode=colsample_bynode)
+        reg = model(tree_method=tree_method, colsample_bynode=colsample_bynode)
 
         reg.fit(X, y, feature_weights=fw)
         model_path = os.path.join(tmpdir, 'model.json')
@@ -1069,7 +1069,8 @@ def run_feature_weights(X, y, fw, model=xgb.XGBRegressor):
     return w
 
 
-def test_feature_weights():
+@pytest.mark.parametrize("tree_method", ["approx", "hist"])
+def test_feature_weights(tree_method):
     kRows = 512
     kCols = 64
     X = rng.randn(kRows, kCols)
@@ -1078,12 +1079,12 @@ def test_feature_weights():
     fw = np.ones(shape=(kCols,))
     for i in range(kCols):
         fw[i] *= float(i)
-    poly_increasing = run_feature_weights(X, y, fw, xgb.XGBRegressor)
+    poly_increasing = run_feature_weights(X, y, fw, tree_method, xgb.XGBRegressor)
 
     fw = np.ones(shape=(kCols,))
     for i in range(kCols):
         fw[i] *= float(kCols - i)
-    poly_decreasing = run_feature_weights(X, y, fw, xgb.XGBRegressor)
+    poly_decreasing = run_feature_weights(X, y, fw, tree_method, xgb.XGBRegressor)
 
     # Approximate test: the outcome depends on the random number generator
     # implementation in the standard library.
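To close, a sketch of the feature-weights behavior the parametrized test now covers for both methods (synthetic data; the weights below up-weight later columns). `feature_weights` biases column sampling, so it only takes effect when a `colsample_*` parameter is below 1:

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(1994)
X, y = rng.normal(size=(512, 64)), rng.normal(size=512)

# Increasing weights: later columns should be sampled (and split on) more often.
fw = np.arange(1, 65, dtype=float)
reg = xgb.XGBRegressor(tree_method="approx", colsample_bynode=0.5)
reg.fit(X, y, feature_weights=fw)
```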