diff --git a/NEWS.md b/NEWS.md index 6fc6a37a5..81afdbb5a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,9 @@ This file records the changes in xgboost library in reverse chronological order. - Enable registry pattern to allow optionally plugin of objective, metric, tree constructor, data loader. - Future plugin modules can be put into xgboost/plugin and register back to the library. - Remove most of the raw pointers to smart ptrs, for RAII safety. +* Add official option to approximate algorithm `tree_method` to parameter. + - Change default behavior to switch to prefer faster algorithm. + - User will get a message when approximate algorithm is chosen. * Change library name to libxgboost.so * Backward compatiblity - The binary buffer file is not backward compatible with previous version. diff --git a/doc/parameter.md b/doc/parameter.md index af3986bbf..32f772fcc 100644 --- a/doc/parameter.md +++ b/doc/parameter.md @@ -53,6 +53,24 @@ Parameters for Tree Booster - L2 regularization term on weights * alpha [default=0] - L1 regularization term on weights +* tree_method, string [default='auto'] + - The tree construction algorithm used in XGBoost (see description in the [reference paper](http://arxiv.org/abs/1603.02754)) + - Distributed and external memory version only support approximate algorithm. + - Choices: {'auto', 'exact', 'approx'} + - 'auto': Use heuristic to choose faster one. + - For small to medium dataset, exact greedy will be used. + - For very large-dataset, approximate algorithm will be chosen. + - Because the old behavior is to always use exact greedy on a single machine, + user will get a message when approximate algorithm is chosen to notify this choice. + - 'exact': Exact greedy algorithm. + - 'approx': Approximate greedy algorithm using sketching and histogram. +* sketch_eps, [default=0.03] + - This is only used for approximate greedy algorithm. + - This roughly translates into ```O(1 / sketch_eps)``` number of bins. 
+ Compared to directly selecting the number of bins, this comes with theoretical guarantee of sketch accuracy. + - Usually users do not have to tune this, + but consider setting it to a lower number for more accurate enumeration. + - range: (0, 1) Parameters for Linear Booster ----------------------------- diff --git a/src/learner.cc b/src/learner.cc index 0fd8e7af6..6a95e0bab 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -4,6 +4,7 @@ * \brief Implementation of learning algorithm. * \author Tianqi Chen */ +#include #include #include #include @@ -69,6 +70,8 @@ struct LearnerTrainParam bool seed_per_iteration; // data split mode, can be row, col, or none. int dsplit; + // tree construction method + int tree_method; // internal test flag std::string test_flag; // maximum buffered row value @@ -87,6 +90,11 @@ struct LearnerTrainParam .add_enum("col", 1) .add_enum("row", 2) .describe("Data split mode for distributed trainig. "); + DMLC_DECLARE_FIELD(tree_method).set_default(0) + .add_enum("auto", 0) + .add_enum("approx", 1) + .add_enum("exact", 2) + .describe("Choice of tree construction method."); DMLC_DECLARE_FIELD(test_flag).set_default("") .describe("Internal test flag"); DMLC_DECLARE_FIELD(prob_buffer_row).set_default(1.0f).set_range(0.0f, 1.0f) @@ -349,21 +357,42 @@ class LearnerImpl : public Learner { // check if p_train is ready to used by training. // if not, initialize the column access. 
inline void LazyInitDMatrix(DMatrix *p_train) { - if (p_train->HaveColAccess()) return; - int ncol = static_cast(p_train->info().num_col); - std::vector enabled(ncol, true); - // set max row per batch to limited value - // in distributed mode, use safe choice otherwise - size_t max_row_perbatch = tparam.max_row_perbatch; - if (tparam.test_flag == "block" || tparam.dsplit == 2) { - max_row_perbatch = std::min( - static_cast(32UL << 10UL), max_row_perbatch); + if (!p_train->HaveColAccess()) { + int ncol = static_cast(p_train->info().num_col); + std::vector enabled(ncol, true); + // set max row per batch to limited value + // in distributed mode, use safe choice otherwise + size_t max_row_perbatch = tparam.max_row_perbatch; + const size_t safe_max_row = static_cast(32UL << 10UL); + + if (tparam.tree_method == 0 && + p_train->info().num_row >= (4UL << 20UL)) { + LOG(CONSOLE) << "Tree method is automatically selected to be \'approx\'" + << " for faster speed." + << " to use old behavior(exact greedy algorithm on single machine)," + << " set tree_method to \'exact\'"; + max_row_perbatch = std::min(max_row_perbatch, safe_max_row); + } + + if (tparam.tree_method == 1) { + LOG(CONSOLE) << "Tree method is selected to be \'approx\'"; + max_row_perbatch = std::min(max_row_perbatch, safe_max_row); + } + + if (tparam.test_flag == "block" || tparam.dsplit == 2) { + max_row_perbatch = std::min(max_row_perbatch, safe_max_row); + } + // initialize column access + p_train->InitColAccess(enabled, + tparam.prob_buffer_row, + max_row_perbatch); } - // initialize column access - p_train->InitColAccess(enabled, - tparam.prob_buffer_row, - max_row_perbatch); + if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) { + if (tparam.tree_method == 2) { + LOG(CONSOLE) << "tree method is set to be 'exact'," + << " but currently we are only able to proceed with approximate algorithm"; + } cfg_["updater"] = "grow_histmaker,prune"; if (gbm_.get() != nullptr) { gbm_->Configure(cfg_.begin(), 
cfg_.end());