[METHOD], add tree method option to prefer faster algo
This commit is contained in:
parent
5fb09dc0ab
commit
a2714fe052
3
NEWS.md
3
NEWS.md
@ -12,6 +12,9 @@ This file records the changes in xgboost library in reverse chronological order.
|
|||||||
- Enable registry pattern to allow optionally plugin of objective, metric, tree constructor, data loader.
|
- Enable registry pattern to allow optionally plugin of objective, metric, tree constructor, data loader.
|
||||||
- Future plugin modules can be put into xgboost/plugin and register back to the library.
|
- Future plugin modules can be put into xgboost/plugin and register back to the library.
|
||||||
- Remove most of the raw pointers to smart ptrs, for RAII safety.
|
- Remove most of the raw pointers to smart ptrs, for RAII safety.
|
||||||
|
* Add an official `tree_method` parameter option to select the approximate algorithm.
|
||||||
|
- Change default behavior to switch to prefer faster algorithm.
|
||||||
|
- User will get a message when approximate algorithm is chosen.
|
||||||
* Change library name to libxgboost.so
|
* Change library name to libxgboost.so
|
||||||
* Backward compatibility
|
* Backward compatibility
|
||||||
- The binary buffer file is not backward compatible with previous version.
|
- The binary buffer file is not backward compatible with previous version.
|
||||||
|
|||||||
@ -53,6 +53,24 @@ Parameters for Tree Booster
|
|||||||
- L2 regularization term on weights
|
- L2 regularization term on weights
|
||||||
* alpha [default=0]
|
* alpha [default=0]
|
||||||
- L1 regularization term on weights
|
- L1 regularization term on weights
|
||||||
|
* tree_method, string [default='auto']
|
||||||
|
- The tree construction algorithm used in XGBoost (see description in the [reference paper](http://arxiv.org/abs/1603.02754))
|
||||||
|
- Distributed and external memory version only support approximate algorithm.
|
||||||
|
- Choices: {'auto', 'exact', 'approx'}
|
||||||
|
- 'auto': Use heuristic to choose the faster one.
|
||||||
|
- For small to medium dataset, exact greedy will be used.
|
||||||
|
- For very large datasets, the approximate algorithm will be chosen.
|
||||||
|
- Because the old behavior was to always use exact greedy on a single machine,
|
||||||
|
the user will get a message when the approximate algorithm is chosen, to notify them of this choice.
|
||||||
|
- 'exact': Exact greedy algorithm.
|
||||||
|
- 'approx': Approximate greedy algorithm using sketching and histogram.
|
||||||
|
* sketch_eps, [default=0.03]
|
||||||
|
- This is only used for approximate greedy algorithm.
|
||||||
|
- This roughly translates into ```O(1 / sketch_eps)``` number of bins.
|
||||||
|
Compared to directly selecting the number of bins, this comes with a theoretical guarantee on sketch accuracy.
|
||||||
|
- Usually the user does not have to tune this,
|
||||||
|
but consider setting it to a lower number for more accurate enumeration.
|
||||||
|
- range: (0, 1)
|
||||||
|
|
||||||
Parameters for Linear Booster
|
Parameters for Linear Booster
|
||||||
-----------------------------
|
-----------------------------
|
||||||
|
|||||||
@ -4,6 +4,7 @@
|
|||||||
* \brief Implementation of learning algorithm.
|
* \brief Implementation of learning algorithm.
|
||||||
* \author Tianqi Chen
|
* \author Tianqi Chen
|
||||||
*/
|
*/
|
||||||
|
#include <xgboost/logging.h>
|
||||||
#include <xgboost/learner.h>
|
#include <xgboost/learner.h>
|
||||||
#include <dmlc/io.h>
|
#include <dmlc/io.h>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
@ -69,6 +70,8 @@ struct LearnerTrainParam
|
|||||||
bool seed_per_iteration;
|
bool seed_per_iteration;
|
||||||
// data split mode, can be row, col, or none.
|
// data split mode, can be row, col, or none.
|
||||||
int dsplit;
|
int dsplit;
|
||||||
|
// tree construction method
|
||||||
|
int tree_method;
|
||||||
// internal test flag
|
// internal test flag
|
||||||
std::string test_flag;
|
std::string test_flag;
|
||||||
// maximum buffered row value
|
// maximum buffered row value
|
||||||
@ -87,6 +90,11 @@ struct LearnerTrainParam
|
|||||||
.add_enum("col", 1)
|
.add_enum("col", 1)
|
||||||
.add_enum("row", 2)
|
.add_enum("row", 2)
|
||||||
.describe("Data split mode for distributed training. ");
|
.describe("Data split mode for distributed training. ");
|
||||||
|
DMLC_DECLARE_FIELD(tree_method).set_default(0)
|
||||||
|
.add_enum("auto", 0)
|
||||||
|
.add_enum("approx", 1)
|
||||||
|
.add_enum("exact", 2)
|
||||||
|
.describe("Choice of tree construction method.");
|
||||||
DMLC_DECLARE_FIELD(test_flag).set_default("")
|
DMLC_DECLARE_FIELD(test_flag).set_default("")
|
||||||
.describe("Internal test flag");
|
.describe("Internal test flag");
|
||||||
DMLC_DECLARE_FIELD(prob_buffer_row).set_default(1.0f).set_range(0.0f, 1.0f)
|
DMLC_DECLARE_FIELD(prob_buffer_row).set_default(1.0f).set_range(0.0f, 1.0f)
|
||||||
@ -349,21 +357,42 @@ class LearnerImpl : public Learner {
|
|||||||
// check if p_train is ready to used by training.
|
// check if p_train is ready to used by training.
|
||||||
// if not, initialize the column access.
|
// if not, initialize the column access.
|
||||||
inline void LazyInitDMatrix(DMatrix *p_train) {
|
inline void LazyInitDMatrix(DMatrix *p_train) {
|
||||||
if (p_train->HaveColAccess()) return;
|
if (!p_train->HaveColAccess()) {
|
||||||
int ncol = static_cast<int>(p_train->info().num_col);
|
int ncol = static_cast<int>(p_train->info().num_col);
|
||||||
std::vector<bool> enabled(ncol, true);
|
std::vector<bool> enabled(ncol, true);
|
||||||
// set max row per batch to limited value
|
// set max row per batch to limited value
|
||||||
// in distributed mode, use safe choice otherwise
|
// in distributed mode, use safe choice otherwise
|
||||||
size_t max_row_perbatch = tparam.max_row_perbatch;
|
size_t max_row_perbatch = tparam.max_row_perbatch;
|
||||||
|
const size_t safe_max_row = static_cast<size_t>(32UL << 10UL);
|
||||||
|
|
||||||
|
if (tparam.tree_method == 0 &&
|
||||||
|
p_train->info().num_row >= (4UL << 20UL)) {
|
||||||
|
LOG(CONSOLE) << "Tree method is automatically selected to be \'approx\'"
|
||||||
|
<< " for faster speed."
|
||||||
|
<< " to use old behavior(exact greedy algorithm on single machine),"
|
||||||
|
<< " set tree_method to \'exact\'";
|
||||||
|
max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tparam.tree_method == 1) {
|
||||||
|
LOG(CONSOLE) << "Tree method is selected to be \'approx\'";
|
||||||
|
max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
|
||||||
|
}
|
||||||
|
|
||||||
if (tparam.test_flag == "block" || tparam.dsplit == 2) {
|
if (tparam.test_flag == "block" || tparam.dsplit == 2) {
|
||||||
max_row_perbatch = std::min(
|
max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
|
||||||
static_cast<size_t>(32UL << 10UL), max_row_perbatch);
|
|
||||||
}
|
}
|
||||||
// initialize column access
|
// initialize column access
|
||||||
p_train->InitColAccess(enabled,
|
p_train->InitColAccess(enabled,
|
||||||
tparam.prob_buffer_row,
|
tparam.prob_buffer_row,
|
||||||
max_row_perbatch);
|
max_row_perbatch);
|
||||||
|
}
|
||||||
|
|
||||||
if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) {
|
if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) {
|
||||||
|
if (tparam.tree_method == 2) {
|
||||||
|
LOG(CONSOLE) << "tree method is set to be 'exact',"
|
||||||
|
<< " but currently we are only able to proceed with approximate algorithm";
|
||||||
|
}
|
||||||
cfg_["updater"] = "grow_histmaker,prune";
|
cfg_["updater"] = "grow_histmaker,prune";
|
||||||
if (gbm_.get() != nullptr) {
|
if (gbm_.get() != nullptr) {
|
||||||
gbm_->Configure(cfg_.begin(), cfg_.end());
|
gbm_->Configure(cfg_.begin(), cfg_.end());
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user