From 7b65698187acb5163a4972beaf54d9e6b822beed Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 13 Jan 2020 15:48:17 +0800
Subject: [PATCH] Enforce correct data shape. (#5191)

* Fix syncing DMatrix columns.
* Notes for tree method.
* Enable feature validation for all interfaces except for JVM.
* Better tests for boosting from predictions.
* Disable validation on JVM.
---
 doc/parameter.rst                            | 20 +++++---
 doc/tutorials/param_tuning.rst               |  5 ++
 include/xgboost/generic_parameters.h         |  6 ++-
 .../java/ml/dmlc/xgboost4j/java/Booster.java |  2 +-
 src/common/observer.h                        |  6 +++
 src/data/data.cc                             | 29 ++---------
 src/gbm/gbtree.cc                            |  1 -
 src/learner.cc                               | 19 +++++++
 src/tree/updater_colmaker.cc                 |  4 ++
 tests/cpp/test_learner.cc                    |  3 +-
 tests/python-gpu/test_gpu_pickling.py        |  1 -
 tests/python-gpu/test_gpu_prediction.py      | 12 +++--
 tests/python-gpu/test_gpu_with_sklearn.py    | 11 ++++-
 tests/python/test_with_sklearn.py            | 49 +++++++++++++------
 14 files changed, 108 insertions(+), 60 deletions(-)

diff --git a/doc/parameter.rst b/doc/parameter.rst
index d0ee4f20d..673d44c9f 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -112,18 +112,24 @@ Parameters for Tree Booster
   - The tree construction algorithm used in XGBoost. See description in the `reference paper `_.
   - XGBoost supports ``approx``, ``hist`` and ``gpu_hist`` for distributed training. Experimental support for external memory is available for ``approx`` and ``gpu_hist``.
-  - Choices: ``auto``, ``exact``, ``approx``, ``hist``, ``gpu_hist``
+
+  - Choices: ``auto``, ``exact``, ``approx``, ``hist``, ``gpu_hist``.  This is a
+    combination of commonly used updaters.  For other updaters like ``refresh``, set the
+    parameter ``updater`` directly.
   - ``auto``: Use heuristic to choose the fastest method.
-    - For small to medium dataset, exact greedy (``exact``) will be used.
-    - For very large dataset, approximate algorithm (``approx``) will be chosen.
-    - Because old behavior is always use exact greedy in single machine,
-      user will get a message when approximate algorithm is chosen to notify this choice.
+    - For small datasets, exact greedy (``exact``) will be used.
+    - For larger datasets, the approximate algorithm (``approx``) will be chosen.  It's
+      recommended to try ``hist`` and ``gpu_hist`` for higher performance with large
+      datasets.
+      ``gpu_hist`` has support for external memory.
-  - ``exact``: Exact greedy algorithm.
+    - Because the old behavior is to always use exact greedy on a single machine, the user
+      will get a message when the approximate algorithm is chosen, to notify them of this choice.
+  - ``exact``: Exact greedy algorithm.  Enumerates all split candidates.
   - ``approx``: Approximate greedy algorithm using quantile sketch and gradient histogram.
-  - ``hist``: Fast histogram optimized approximate greedy algorithm. It uses some performance improvements such as bins caching.
+  - ``hist``: Faster histogram optimized approximate greedy algorithm.
   - ``gpu_hist``: GPU implementation of ``hist`` algorithm.
 
 * ``sketch_eps`` [default=0.03]
diff --git a/doc/tutorials/param_tuning.rst b/doc/tutorials/param_tuning.rst
index 14775bb1c..cce145444 100644
--- a/doc/tutorials/param_tuning.rst
+++ b/doc/tutorials/param_tuning.rst
@@ -38,6 +38,11 @@ There are in general two ways that you can control overfitting in XGBoost:
   - This includes ``subsample`` and ``colsample_bytree``.
 
 - You can also reduce stepsize ``eta``. Remember to increase ``num_round`` when you do so.
 
+***************************
+Faster training performance
+***************************
+There's a parameter called ``tree_method``; set it to ``hist`` or ``gpu_hist`` for faster computation.
+
 *************************
 Handle Imbalanced Dataset
 *************************
diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h
index b01995a7a..cd5a49ffa 100644
--- a/include/xgboost/generic_parameters.h
+++ b/include/xgboost/generic_parameters.h
@@ -29,6 +29,7 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
   size_t gpu_page_size;
   bool enable_experimental_json_serialization {false};
   bool validate_parameters {false};
+  bool validate_features {true};
 
   void CheckDeprecated() {
     if (this->n_gpus != 0) {
@@ -73,7 +74,10 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
         "rabit checkpoints etc.).");
     DMLC_DECLARE_FIELD(validate_parameters)
         .set_default(false)
-        .describe("Enable to check whether parameters are used or not.");
+        .describe("Enable checking whether parameters are used or not.");
+    DMLC_DECLARE_FIELD(validate_features)
+        .set_default(false)
+        .describe("Enable validating input DMatrix.");
     DMLC_DECLARE_FIELD(n_gpus)
         .set_default(0)
         .set_range(0, 1)
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
index f14aba66d..8ebf781f1 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
@@ -49,7 +49,7 @@ public class Booster implements Serializable, KryoSerializable {
    */
   Booster(Map<String, Object> params, DMatrix[] cacheMats) throws XGBoostError {
     init(cacheMats);
-    setParam("seed", "0");
+    setParam("validate_features", "0");
     setParams(params);
   }
 
diff --git a/src/common/observer.h b/src/common/observer.h
index c548b780b..a47db2b82 100644
--- a/src/common/observer.h
+++ b/src/common/observer.h
@@ -71,6 +71,12 @@ class TrainingObserver {
     auto const& h_vec = vec.HostVector();
     this->Observe(h_vec, name);
   }
+  template <typename T>
+  void Observe(HostDeviceVector<T>* vec, std::string name) const {
+    if (XGBOOST_EXPECT(!observe_, true)) { return; }
+    this->Observe(*vec, name);
+  }
+
   /*\brief Observe objects with `XGBoostParamer' type. */
   template 
diff --git a/src/data/data.cc b/src/data/data.cc
--- a/src/data/data.cc
+++ b/src/data/data.cc
 DMatrix* DMatrix::Create(std::unique_ptr<DataSource<SparsePage>>&& source,
                          const std::string& cache_prefix) {
   if (cache_prefix.length() == 0) {
-    // FIXME(trivialfis): Currently distcol is broken so we here check for number of rows.
-    // If we bring back column split this check will break.
-    bool is_distributed { rabit::IsDistributed() };
-    if (is_distributed) {
-      auto world_size = rabit::GetWorldSize();
-      auto rank = rabit::GetRank();
-      std::vector ncols(world_size, 0);
-      ncols[rank] = source->info.num_col_;
-      rabit::Allreduce(ncols.data(), ncols.size());
-      auto max_cols = std::max_element(ncols.cbegin(), ncols.cend());
-      auto max_ind = std::distance(ncols.cbegin(), max_cols);
-      // FIXME(trivialfis): This is a hack, we should store a reference to global shape if possible.
-      if (source->info.num_col_ == 0 && source->info.num_row_ == 0) {
-        LOG(WARNING) << "DMatrix at rank: " << rank << " worker is empty.";
-        source->info.num_col_ = *max_cols;
-      }
-
-      // validate the number of columns across all workers.
-      for (size_t i = 0; i < ncols.size(); ++i) {
-        auto v = ncols[i];
-        CHECK(v == 0 || v == *max_cols)
-            << "DMatrix at rank: " << i << " worker "
-            << "has different number of columns than rank: " << max_ind << " worker. "
-            << "(" << v << " vs. " << *max_cols << ")";
-      }
-    }
+    // Data split mode is fixed to be row right now.
+    rabit::Allreduce(&source->info.num_col_, 1);
     return new data::SimpleDMatrix(std::move(source));
   } else {
 #if DMLC_ENABLE_STD_THREAD
@@ -336,6 +312,7 @@ template <typename AdapterT>
 DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
                          const std::string& cache_prefix, size_t page_size ) {
   if (cache_prefix.length() == 0) {
+    // Data split mode is fixed to be row right now.
     return new data::SimpleDMatrix(adapter, missing, nthread);
   } else {
 #if DMLC_ENABLE_STD_THREAD
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 5a972d28f..f1b5065ec 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -124,7 +124,6 @@ void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
     return;
   }
 
-  tparam_.updater_seq = "grow_histmaker,prune";
   if (rabit::IsDistributed()) {
     LOG(WARNING) << "Tree method is automatically selected to be 'approx' "
diff --git a/src/learner.cc b/src/learner.cc
index 3d2fdbe3e..7cf21bb5f 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -925,6 +925,25 @@ class LearnerImpl : public Learner {
           << "num rows: " << p_fmat->Info().num_row_ << "\n"
           << "Number of weights should be equal to number of groups in ranking task.";
     }
+
+    auto const row_based_split = [this]() {
+      return tparam_.dsplit == DataSplitMode::kRow ||
+             tparam_.dsplit == DataSplitMode::kAuto;
+    };
+    bool const valid_features =
+        !row_based_split() ||
+        (learner_model_param_.num_feature == p_fmat->Info().num_col_);
+    std::string const msg {
+      "Number of columns does not match number of features in booster."
+    };
+    if (generic_parameters_.validate_features) {
+      CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_) << msg;
+    } else if (!valid_features) {
+      // Remove this and make the equality check fatal once Spark can fix all failing tests.
+      LOG(WARNING) << msg << " "
+                   << "Columns: " << p_fmat->Info().num_col_ << " "
+                   << "Features: " << learner_model_param_.num_feature;
+    }
   }
 
   // model parameter
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index 62a63dec3..0fb9c067c 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -80,6 +80,10 @@ class ColMaker: public TreeUpdater {
   void Update(HostDeviceVector<GradientPair> *gpair,
               DMatrix* dmat,
               const std::vector<RegTree*> &trees) override {
+    if (rabit::IsDistributed()) {
+      LOG(FATAL) << "Updater `grow_colmaker` or `exact` tree method doesn't "
+                    "support distributed training.";
+    }
     // rescale learning rate according to size of trees
     float lr = param_.learning_rate;
     param_.learning_rate = lr / trees.size();
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 8200578f6..dddf2756c 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -91,7 +91,6 @@ TEST(Learner, CheckGroup) {
 }
 
 TEST(Learner, SLOW_CheckMultiBatch) {
-  using Arg = std::pair<std::string, std::string>;
   // Create sufficiently large data to make two row pages
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/big.libsvm";
@@ -107,7 +106,7 @@ TEST(Learner, SLOW_CheckMultiBatch) {
   dmat->Info().SetInfo("label", labels.data(), DataType::kFloat32, num_row);
   std::vector<std::shared_ptr<DMatrix>> mat{dmat};
   auto learner = std::unique_ptr<Learner>(Learner::Create(mat));
-  learner->SetParams({Arg{"objective", "binary:logistic"}, Arg{"verbosity", "3"}});
+  learner->SetParams(Args{{"objective", "binary:logistic"}});
   learner->UpdateOneIter(0, dmat.get());
 }
diff --git a/tests/python-gpu/test_gpu_pickling.py b/tests/python-gpu/test_gpu_pickling.py
index 969735fed..b8cc56203 100644
--- a/tests/python-gpu/test_gpu_pickling.py
+++ b/tests/python-gpu/test_gpu_pickling.py
@@ -6,7 +6,6 @@ import subprocess
 import os
 import json
 import pytest
-import copy
 
 import xgboost as xgb
 from xgboost import XGBClassifier
diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py
index 1bdc56d3b..fd092c5e7 100644
--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@@ -13,6 +13,10 @@ class TestGPUPredict(unittest.TestCase):
         np.random.seed(1)
         test_num_rows = [10, 1000, 5000]
         test_num_cols = [10, 50, 500]
+        # This test passes for tree_method=gpu_hist and tree_method=exact, but
+        # for `hist` and `approx` the floating point error accumulates faster
+        # and fails even when tol is set to 1e-4.  For `hist`, the mismatching
+        # rate with 5000 rows is 0.04.
         for num_rows in test_num_rows:
             for num_cols in test_num_cols:
                 dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
@@ -27,7 +31,7 @@ class TestGPUPredict(unittest.TestCase):
                     "objective": "binary:logistic",
                     "predictor": "gpu_predictor",
                     'eval_metric': 'auc',
-                    'verbosity': '3'
+                    'tree_method': 'gpu_hist'
                 }
                 bst = xgb.train(param, dtrain, iterations, evals=watchlist,
                                 evals_result=res)
@@ -43,11 +47,11 @@ class TestGPUPredict(unittest.TestCase):
                 cpu_pred_val = bst_cpu.predict(dval, output_margin=True)
 
                 np.testing.assert_allclose(cpu_pred_train, gpu_pred_train,
-                                           rtol=1e-3)
+                                           rtol=1e-6)
                 np.testing.assert_allclose(cpu_pred_val, gpu_pred_val,
-                                           rtol=1e-3)
+                                           rtol=1e-6)
                 np.testing.assert_allclose(cpu_pred_test, gpu_pred_test,
-                                           rtol=1e-3)
+                                           rtol=1e-6)
 
     def non_decreasing(self, L):
         return all((x - y) < 0.001 for x, y in zip(L, L[1:]))
diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py
index fd465b419..4de78178e 100644
--- a/tests/python-gpu/test_gpu_with_sklearn.py
+++ b/tests/python-gpu/test_gpu_with_sklearn.py
@@ -2,9 +2,11 @@ import xgboost as xgb
 import pytest
 import sys
 import numpy as np
+import unittest
 
 sys.path.append("tests/python")
-import testing as tm
+import testing as tm  # noqa
+import test_with_sklearn as twskl  # noqa
 
 pytestmark = pytest.mark.skipif(**tm.no_sklearn())
 
@@ -29,3 +31,10 @@ def test_gpu_binary_classification():
         err = sum(1 for i in range(len(preds))
                   if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
         assert err < 0.1
+
+
+class TestGPUBoostFromPrediction(unittest.TestCase):
+    cpu_test = twskl.TestBoostFromPrediction()
+
+    def test_boost_from_prediction_gpu_hist(self):
+        self.cpu_test.run_boost_from_prediction('gpu_hist')
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 33ccecc25..11f288384 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -5,6 +5,7 @@ import tempfile
 import os
 import shutil
 import pytest
+import unittest
 
 rng = np.random.RandomState(1994)
 
@@ -697,21 +698,37 @@ def test_XGBClassifier_resume():
     assert log_loss1 > log_loss2
 
 
-def test_boost_from_prediction():
-    from sklearn.datasets import load_breast_cancer
-    X, y = load_breast_cancer(return_X_y=True)
-    model_0 = xgb.XGBClassifier(
-        learning_rate=0.3, random_state=0, n_estimators=4)
-    model_0.fit(X=X, y=y)
-    margin = model_0.predict(X, output_margin=True)
+class TestBoostFromPrediction(unittest.TestCase):
+    def run_boost_from_prediction(self, tree_method):
+        from sklearn.datasets import load_breast_cancer
+        X, y = load_breast_cancer(return_X_y=True)
+        model_0 = xgb.XGBClassifier(
+            learning_rate=0.3, random_state=0, n_estimators=4,
+            tree_method=tree_method)
+        model_0.fit(X=X, y=y)
+        margin = model_0.predict(X, output_margin=True)
 
-    model_1 = xgb.XGBClassifier(
-        learning_rate=0.3, random_state=0, n_estimators=4)
-    model_1.fit(X=X, y=y, base_margin=margin)
-    predictions_1 = model_1.predict(X, base_margin=margin)
+        model_1 = xgb.XGBClassifier(
+            learning_rate=0.3, random_state=0, n_estimators=4,
+            tree_method=tree_method)
+        model_1.fit(X=X, y=y, base_margin=margin)
+        predictions_1 = model_1.predict(X, base_margin=margin)
 
-    cls_2 = xgb.XGBClassifier(
-        learning_rate=0.3, random_state=0, n_estimators=8)
-    cls_2.fit(X=X, y=y)
-    predictions_2 = cls_2.predict(X, base_margin=margin)
-    assert np.all(predictions_1 == predictions_2)
+        cls_2 = xgb.XGBClassifier(
+            learning_rate=0.3, random_state=0, n_estimators=8,
+            tree_method=tree_method)
+        cls_2.fit(X=X, y=y)
+        predictions_2 = cls_2.predict(X)
+        assert np.all(predictions_1 == predictions_2)
+
+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_boost_from_prediction_hist(self):
+        self.run_boost_from_prediction('hist')
+
+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_boost_from_prediction_approx(self):
+        self.run_boost_from_prediction('approx')
+
+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_boost_from_prediction_exact(self):
+        self.run_boost_from_prediction('exact')
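
Usage note (not part of the patch): the ``tree_method`` recommendation added to doc/tutorials/param_tuning.rst above amounts to a one-line change in user code. The sketch below only illustrates that setting with the public xgboost Python API; the synthetic data and parameter values are placeholders.

    import numpy as np
    import xgboost as xgb

    # Synthetic data, purely for illustration.
    X = np.random.randn(1000, 10)
    y = np.random.randint(2, size=1000)
    dtrain = xgb.DMatrix(X, label=y)

    params = {
        "objective": "binary:logistic",
        "tree_method": "hist",  # or "gpu_hist" when a CUDA device is available
    }
    bst = xgb.train(params, dtrain, num_boost_round=10)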
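
A second sketch, also not part of the patch: the boosting-from-prediction pattern exercised by the new TestBoostFromPrediction tests through the sklearn wrapper can be expressed with the native DMatrix API as well, by attaching the first model's margin via set_base_margin before continuing to boost. Parameter values are illustrative only.

    import xgboost as xgb
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True)
    params = {"objective": "binary:logistic", "learning_rate": 0.3, "tree_method": "hist"}

    # Stage 1: train 4 rounds and export the raw (untransformed) margin.
    dtrain = xgb.DMatrix(X, label=y)
    bst_0 = xgb.train(params, dtrain, num_boost_round=4)
    margin = bst_0.predict(dtrain, output_margin=True)

    # Stage 2: continue boosting from that margin for 4 more rounds.
    dtrain_2 = xgb.DMatrix(X, label=y)
    dtrain_2.set_base_margin(margin)
    bst_1 = xgb.train(params, dtrain_2, num_boost_round=4)

The new tests assert that this two-stage result matches training 8 rounds from scratch.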