[METHOD], add tree method option to prefer faster algo
This commit is contained in:
parent
5fb09dc0ab
commit
a2714fe052
3
NEWS.md
3
NEWS.md
@ -12,6 +12,9 @@ This file records the changes in xgboost library in reverse chronological order.
|
|||||||
- Enable registry pattern to allow optionally plugin of objective, metric, tree constructor, data loader.
|
- Enable registry pattern to allow optionally plugin of objective, metric, tree constructor, data loader.
|
||||||
- Future plugin modules can be put into xgboost/plugin and register back to the library.
|
- Future plugin modules can be put into xgboost/plugin and register back to the library.
|
||||||
- Remove most of the raw pointers to smart ptrs, for RAII safety.
|
- Remove most of the raw pointers to smart ptrs, for RAII safety.
|
||||||
|
* Add an official `tree_method` parameter option to select the approximate algorithm.
|
||||||
|
- Change default behavior to switch to prefer faster algorithm.
|
||||||
|
- User will get a message when approximate algorithm is chosen.
|
||||||
* Change library name to libxgboost.so
|
* Change library name to libxgboost.so
|
||||||
* Backward compatibility
|
* Backward compatibility
|
||||||
- The binary buffer file is not backward compatible with previous version.
|
- The binary buffer file is not backward compatible with previous version.
|
||||||
|
|||||||
@ -53,6 +53,24 @@ Parameters for Tree Booster
|
|||||||
- L2 regularization term on weights
|
- L2 regularization term on weights
|
||||||
* alpha [default=0]
|
* alpha [default=0]
|
||||||
- L1 regularization term on weights
|
- L1 regularization term on weights
|
||||||
|
* tree_method, string [default='auto']
|
||||||
|
- The tree construction algorithm used in XGBoost (see description in the [reference paper](http://arxiv.org/abs/1603.02754))
|
||||||
|
- Distributed and external memory version only support approximate algorithm.
|
||||||
|
- Choices: {'auto', 'exact', 'approx'}
|
||||||
|
- 'auto': Use heuristic to choose the faster one.
|
||||||
|
- For small to medium dataset, exact greedy will be used.
|
||||||
|
- For very large datasets, the approximate algorithm will be chosen.
|
||||||
|
- Because the old behavior was to always use exact greedy on a single machine,
|
||||||
|
the user will get a message when the approximate algorithm is chosen, to notify them of this choice.
|
||||||
|
- 'exact': Exact greedy algorithm.
|
||||||
|
- 'approx': Approximate greedy algorithm using sketching and histogram.
|
||||||
|
* sketch_eps, [default=0.03]
|
||||||
|
- This is only used for approximate greedy algorithm.
|
||||||
|
- This roughly translates into ```O(1 / sketch_eps)``` number of bins.
|
||||||
|
Compared to directly selecting the number of bins, this comes with a theoretical guarantee on sketch accuracy.
|
||||||
|
- Usually the user does not have to tune this,
|
||||||
|
but consider setting it to a lower number for more accurate enumeration.
|
||||||
|
- range: (0, 1)
|
||||||
|
|
||||||
Parameters for Linear Booster
|
Parameters for Linear Booster
|
||||||
-----------------------------
|
-----------------------------
|
||||||
|
|||||||
@ -4,6 +4,7 @@
|
|||||||
* \brief Implementation of learning algorithm.
|
* \brief Implementation of learning algorithm.
|
||||||
* \author Tianqi Chen
|
* \author Tianqi Chen
|
||||||
*/
|
*/
|
||||||
|
#include <xgboost/logging.h>
|
||||||
#include <xgboost/learner.h>
|
#include <xgboost/learner.h>
|
||||||
#include <dmlc/io.h>
|
#include <dmlc/io.h>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
@ -69,6 +70,8 @@ struct LearnerTrainParam
|
|||||||
bool seed_per_iteration;
|
bool seed_per_iteration;
|
||||||
// data split mode, can be row, col, or none.
|
// data split mode, can be row, col, or none.
|
||||||
int dsplit;
|
int dsplit;
|
||||||
|
// tree construction method
|
||||||
|
int tree_method;
|
||||||
// internal test flag
|
// internal test flag
|
||||||
std::string test_flag;
|
std::string test_flag;
|
||||||
// maximum buffered row value
|
// maximum buffered row value
|
||||||
@ -87,6 +90,11 @@ struct LearnerTrainParam
|
|||||||
.add_enum("col", 1)
|
.add_enum("col", 1)
|
||||||
.add_enum("row", 2)
|
.add_enum("row", 2)
|
||||||
.describe("Data split mode for distributed training. ");
|
.describe("Data split mode for distributed training. ");
|
||||||
|
DMLC_DECLARE_FIELD(tree_method).set_default(0)
|
||||||
|
.add_enum("auto", 0)
|
||||||
|
.add_enum("approx", 1)
|
||||||
|
.add_enum("exact", 2)
|
||||||
|
.describe("Choice of tree construction method.");
|
||||||
DMLC_DECLARE_FIELD(test_flag).set_default("")
|
DMLC_DECLARE_FIELD(test_flag).set_default("")
|
||||||
.describe("Internal test flag");
|
.describe("Internal test flag");
|
||||||
DMLC_DECLARE_FIELD(prob_buffer_row).set_default(1.0f).set_range(0.0f, 1.0f)
|
DMLC_DECLARE_FIELD(prob_buffer_row).set_default(1.0f).set_range(0.0f, 1.0f)
|
||||||
@ -349,21 +357,42 @@ class LearnerImpl : public Learner {
|
|||||||
// check if p_train is ready to used by training.
|
// check if p_train is ready to used by training.
|
||||||
// if not, initialize the column access.
|
// if not, initialize the column access.
|
||||||
inline void LazyInitDMatrix(DMatrix *p_train) {
|
inline void LazyInitDMatrix(DMatrix *p_train) {
|
||||||
if (p_train->HaveColAccess()) return;
|
if (!p_train->HaveColAccess()) {
|
||||||
int ncol = static_cast<int>(p_train->info().num_col);
|
int ncol = static_cast<int>(p_train->info().num_col);
|
||||||
std::vector<bool> enabled(ncol, true);
|
std::vector<bool> enabled(ncol, true);
|
||||||
// set max row per batch to limited value
|
// set max row per batch to limited value
|
||||||
// in distributed mode, use safe choice otherwise
|
// in distributed mode, use safe choice otherwise
|
||||||
size_t max_row_perbatch = tparam.max_row_perbatch;
|
size_t max_row_perbatch = tparam.max_row_perbatch;
|
||||||
|
const size_t safe_max_row = static_cast<size_t>(32UL << 10UL);
|
||||||
|
|
||||||
|
if (tparam.tree_method == 0 &&
|
||||||
|
p_train->info().num_row >= (4UL << 20UL)) {
|
||||||
|
LOG(CONSOLE) << "Tree method is automatically selected to be \'approx\'"
|
||||||
|
<< " for faster speed."
|
||||||
|
<< " to use old behavior(exact greedy algorithm on single machine),"
|
||||||
|
<< " set tree_method to \'exact\'";
|
||||||
|
max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tparam.tree_method == 1) {
|
||||||
|
LOG(CONSOLE) << "Tree method is selected to be \'approx\'";
|
||||||
|
max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
|
||||||
|
}
|
||||||
|
|
||||||
if (tparam.test_flag == "block" || tparam.dsplit == 2) {
|
if (tparam.test_flag == "block" || tparam.dsplit == 2) {
|
||||||
max_row_perbatch = std::min(
|
max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
|
||||||
static_cast<size_t>(32UL << 10UL), max_row_perbatch);
|
|
||||||
}
|
}
|
||||||
// initialize column access
|
// initialize column access
|
||||||
p_train->InitColAccess(enabled,
|
p_train->InitColAccess(enabled,
|
||||||
tparam.prob_buffer_row,
|
tparam.prob_buffer_row,
|
||||||
max_row_perbatch);
|
max_row_perbatch);
|
||||||
|
}
|
||||||
|
|
||||||
if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) {
|
if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) {
|
||||||
|
if (tparam.tree_method == 2) {
|
||||||
|
LOG(CONSOLE) << "tree method is set to be 'exact',"
|
||||||
|
<< " but currently we are only able to proceed with approximate algorithm";
|
||||||
|
}
|
||||||
cfg_["updater"] = "grow_histmaker,prune";
|
cfg_["updater"] = "grow_histmaker,prune";
|
||||||
if (gbm_.get() != nullptr) {
|
if (gbm_.get() != nullptr) {
|
||||||
gbm_->Configure(cfg_.begin(), cfg_.end());
|
gbm_->Configure(cfg_.begin(), cfg_.end());
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user