[Blocking] Fix #3840: Clean up logic for parsing tree_method parameter (#3849)

* Clean up logic for converting tree_method to updater sequence

* Use C++11 enum class for extra safety

Compiler will give warnings if switch statements don't handle all
possible values of C++11 enum class.

Also allow enum class to be used as DMLC parameter.

* Fix compiler error + lint

* Address reviewer comment

* Better docstring for DECLARE_FIELD_ENUM_CLASS

* Fix lint

* Add C++ test to see if tree_method is recognized

* Fix clang-tidy error

* Add test_learner.h to R package

* Update comments

* Fix lint error
This commit is contained in:
Philip Hyunsu Cho
2018-11-01 19:33:35 -07:00
committed by GitHub
parent 583c88bce7
commit ad68865d6b
6 changed files with 358 additions and 54 deletions

View File

@@ -19,14 +19,28 @@
#include "./common/host_device_vector.h"
#include "./common/io.h"
#include "./common/random.h"
#include "common/timer.h"
#include "./common/enum_class_param.h"
#include "./common/timer.h"
#include "../tests/cpp/test_learner.h"
namespace {
const char* kMaxDeltaStepDefaultValue = "0.7";
enum class TreeMethod : int {
kAuto = 0, kApprox = 1, kExact = 2, kHist = 3,
kGPUExact = 4, kGPUHist = 5
};
enum class DataSplitMode : int {
kAuto = 0, kCol = 1, kRow = 2
};
} // anonymous namespace
DECLARE_FIELD_ENUM_CLASS(TreeMethod);
DECLARE_FIELD_ENUM_CLASS(DataSplitMode);
namespace xgboost {
// implementation of base learner.
bool Learner::AllowLazyCheckPoint() const {
@@ -80,9 +94,9 @@ struct LearnerTrainParam : public dmlc::Parameter<LearnerTrainParam> {
// whether seed the PRNG each iteration
bool seed_per_iteration;
// data split mode, can be row, col, or none.
int dsplit;
DataSplitMode dsplit;
// tree construction method
int tree_method;
TreeMethod tree_method;
// internal test flag
std::string test_flag;
// number of threads to use if OpenMP is enabled
@@ -103,19 +117,19 @@ struct LearnerTrainParam : public dmlc::Parameter<LearnerTrainParam> {
"this option will be switched on automatically on distributed "
"mode.");
DMLC_DECLARE_FIELD(dsplit)
.set_default(0)
.add_enum("auto", 0)
.add_enum("col", 1)
.add_enum("row", 2)
.set_default(DataSplitMode::kAuto)
.add_enum("auto", DataSplitMode::kAuto)
.add_enum("col", DataSplitMode::kCol)
.add_enum("row", DataSplitMode::kRow)
.describe("Data split mode for distributed training.");
DMLC_DECLARE_FIELD(tree_method)
.set_default(0)
.add_enum("auto", 0)
.add_enum("approx", 1)
.add_enum("exact", 2)
.add_enum("hist", 3)
.add_enum("gpu_exact", 4)
.add_enum("gpu_hist", 5)
.set_default(TreeMethod::kAuto)
.add_enum("auto", TreeMethod::kAuto)
.add_enum("approx", TreeMethod::kApprox)
.add_enum("exact", TreeMethod::kExact)
.add_enum("hist", TreeMethod::kHist)
.add_enum("gpu_exact", TreeMethod::kGPUExact)
.add_enum("gpu_hist", TreeMethod::kGPUHist)
.describe("Choice of tree construction method.");
DMLC_DECLARE_FIELD(test_flag).set_default("").describe(
"Internal test flag");
@@ -138,7 +152,7 @@ DMLC_REGISTER_PARAMETER(LearnerTrainParam);
* \brief learner that performs gradient boosting for a specific objective
* function. It does training and prediction.
*/
class LearnerImpl : public Learner {
class LearnerImpl : public Learner, public LearnerTestHook {
public:
explicit LearnerImpl(std::vector<std::shared_ptr<DMatrix> > cache)
: cache_(std::move(cache)) {
@@ -154,37 +168,49 @@ class LearnerImpl : public Learner {
}
void ConfigureUpdaters() {
if (tparam_.tree_method == 0 || tparam_.tree_method == 1 ||
tparam_.tree_method == 2) {
if (cfg_.count("updater") == 0) {
if (tparam_.dsplit == 1) {
cfg_["updater"] = "distcol";
} else if (tparam_.dsplit == 2) {
cfg_["updater"] = "grow_histmaker,prune";
}
}
} else if (tparam_.tree_method == 3) {
/* histogram-based algorithm */
LOG(CONSOLE) << "Tree method is selected to be \'hist\', which uses a "
"single updater "
<< "grow_fast_histmaker.";
/* Choose updaters according to tree_method parameters */
if (cfg_.count("updater") > 0) {
LOG(CONSOLE) << "DANGER AHEAD: You have manually specified `updater` "
"parameter. The `tree_method` parameter will be ignored. "
"Incorrect sequence of updaters will produce undefined "
"behavior. For common uses, we recommend using "
"`tree_method` parameter instead.";
return;
}
switch (tparam_.tree_method) {
case TreeMethod::kAuto:
// Use heuristic to choose between 'exact' and 'approx'
// This choice is deferred to PerformTreeMethodHeuristic().
break;
case TreeMethod::kApprox:
cfg_["updater"] = "grow_histmaker,prune";
break;
case TreeMethod::kExact:
cfg_["updater"] = "grow_colmaker,prune";
break;
case TreeMethod::kHist:
LOG(CONSOLE) << "Tree method is selected to be 'hist', which uses a "
"single updater grow_fast_histmaker.";
cfg_["updater"] = "grow_fast_histmaker";
} else if (tparam_.tree_method == 4) {
break;
case TreeMethod::kGPUExact:
this->AssertGPUSupport();
if (cfg_.count("updater") == 0) {
cfg_["updater"] = "grow_gpu,prune";
}
cfg_["updater"] = "grow_gpu,prune";
if (cfg_.count("predictor") == 0) {
cfg_["predictor"] = "gpu_predictor";
}
} else if (tparam_.tree_method == 5) {
break;
case TreeMethod::kGPUHist:
this->AssertGPUSupport();
if (cfg_.count("updater") == 0) {
cfg_["updater"] = "grow_gpu_hist";
}
cfg_["updater"] = "grow_gpu_hist";
if (cfg_.count("predictor") == 0) {
cfg_["predictor"] = "gpu_predictor";
}
break;
default:
LOG(FATAL) << "Unknown tree_method ("
<< static_cast<int>(tparam_.tree_method) << ") detected";
}
}
@@ -214,8 +240,8 @@ class LearnerImpl : public Learner {
// add additional parameters
// These are cosntraints that need to be satisfied.
if (tparam_.dsplit == 0 && rabit::IsDistributed()) {
tparam_.dsplit = 2;
if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
tparam_.dsplit = DataSplitMode::kRow;
}
if (cfg_.count("num_class") != 0) {
@@ -376,7 +402,7 @@ class LearnerImpl : public Learner {
if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
}
this->LazyInitDMatrix(train);
this->PerformTreeMethodHeuristic(train);
monitor_.Start("PredictRaw");
this->PredictRaw(train, &preds_);
monitor_.Stop("PredictRaw");
@@ -393,7 +419,7 @@ class LearnerImpl : public Learner {
if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
}
this->LazyInitDMatrix(train);
this->PerformTreeMethodHeuristic(train);
gbm_->DoBoost(train, in_gpair);
monitor_.Stop("BoostOneIter");
}
@@ -412,7 +438,7 @@ class LearnerImpl : public Learner {
for (auto& ev : metrics_) {
os << '\t' << data_names[i] << '-' << ev->Name() << ':'
<< ev->Eval(preds_.ConstHostVector(), data_sets[i]->Info(),
tparam_.dsplit == 2);
tparam_.dsplit == DataSplitMode::kRow);
}
}
@@ -456,7 +482,7 @@ class LearnerImpl : public Learner {
obj_->EvalTransform(&preds_);
return std::make_pair(metric,
ev->Eval(preds_.ConstHostVector(), data->Info(),
tparam_.dsplit == 2));
tparam_.dsplit == DataSplitMode::kRow));
}
void Predict(DMatrix* data, bool output_margin,
@@ -479,21 +505,94 @@ class LearnerImpl : public Learner {
}
protected:
// check if p_train is ready to used by training.
// if not, initialize the column access.
inline void LazyInitDMatrix(DMatrix* p_train) {
if (tparam_.tree_method == 3 || tparam_.tree_method == 4 ||
tparam_.tree_method == 5 || name_gbm_ == "gblinear") {
// Revise `tree_method` and `updater` parameters after seeing the training
// data matrix
inline void PerformTreeMethodHeuristic(DMatrix* p_train) {
if (name_gbm_ != "gbtree" || cfg_.count("updater") > 0) {
// 1. This method is not applicable for non-tree learners
// 2. This method is disabled when `updater` parameter is explicitly
// set, since only experts are expected to do so.
return;
}
if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) {
if (tparam_.tree_method == 2) {
LOG(CONSOLE) << "tree method is set to be 'exact',"
<< " but currently we are only able to proceed with "
"approximate algorithm";
const TreeMethod current_tree_method = tparam_.tree_method;
if (rabit::IsDistributed()) {
/* Choose tree_method='approx' when distributed training is activated */
CHECK(tparam_.dsplit != DataSplitMode::kAuto)
<< "Precondition violated; dsplit cannot be 'auto' in distributed mode";
if (tparam_.dsplit == DataSplitMode::kCol) {
// 'distcol' updater hidden until it becomes functional again
// See discussion at https://github.com/dmlc/xgboost/issues/1832
LOG(FATAL) << "Column-wise data split is currently not supported.";
}
cfg_["updater"] = "grow_histmaker,prune";
switch (current_tree_method) {
case TreeMethod::kAuto:
LOG(CONSOLE) << "Tree method is automatically selected to be 'approx' "
"for distributed training.";
break;
case TreeMethod::kApprox:
// things are okay, do nothing
break;
case TreeMethod::kExact:
case TreeMethod::kHist:
LOG(CONSOLE) << "Tree method was set to be '"
<< (current_tree_method == TreeMethod::kExact ?
"exact" : "hist")
<< "', but only 'approx' is available for distributed "
"training. The `tree_method` parameter is now being "
"changed to 'approx'";
break;
case TreeMethod::kGPUExact:
case TreeMethod::kGPUHist:
LOG(FATAL) << "Distributed training is not available with GPU algoritms";
break;
default:
LOG(FATAL) << "Unknown tree_method ("
<< static_cast<int>(current_tree_method) << ") detected";
}
tparam_.tree_method = TreeMethod::kApprox;
} else if (!p_train->SingleColBlock()) {
/* Some tree methods are not available for external-memory DMatrix */
switch (current_tree_method) {
case TreeMethod::kAuto:
LOG(CONSOLE) << "Tree method is automatically set to 'approx' "
"since external-memory data matrix is used.";
break;
case TreeMethod::kApprox:
// things are okay, do nothing
break;
case TreeMethod::kExact:
LOG(CONSOLE) << "Tree method was set to be 'exact', "
"but currently we are only able to proceed with "
"approximate algorithm ('approx') because external-"
"memory data matrix is used.";
break;
case TreeMethod::kHist:
// things are okay, do nothing
break;
case TreeMethod::kGPUExact:
case TreeMethod::kGPUHist:
LOG(FATAL)
<< "External-memory data matrix is not available with GPU algorithms";
break;
default:
LOG(FATAL) << "Unknown tree_method ("
<< static_cast<int>(current_tree_method) << ") detected";
}
tparam_.tree_method = TreeMethod::kApprox;
} else if (p_train->Info().num_row_ >= (4UL << 20UL)
&& current_tree_method == TreeMethod::kAuto) {
/* Choose tree_method='approx' automatically for large data matrix */
LOG(CONSOLE) << "Tree method is automatically selected to be "
"'approx' for faster speed. To use old behavior "
"(exact greedy algorithm on single machine), "
"set tree_method to 'exact'.";
tparam_.tree_method = TreeMethod::kApprox;
}
/* If tree_method was changed, re-configure updaters and gradient boosters */
if (tparam_.tree_method != current_tree_method) {
ConfigureUpdaters();
if (gbm_ != nullptr) {
gbm_->Configure(cfg_.begin(), cfg_.end());
}
@@ -565,6 +664,11 @@ class LearnerImpl : public Learner {
std::vector<std::shared_ptr<DMatrix> > cache_;
common::Monitor monitor_;
// diagnostic method reserved for C++ test learner.SelectTreeMethod
std::string GetUpdaterSequence() const override {
return cfg_.at("updater");
}
};
Learner* Learner::Create(