Use hist as the default tree method. (#9320)

This commit is contained in:
Jiaming Yuan
2023-06-27 23:04:24 +08:00
committed by GitHub
parent bc267dd729
commit f4798718c7
10 changed files with 138 additions and 178 deletions

View File

@@ -39,7 +39,6 @@ namespace xgboost::gbm {
DMLC_REGISTRY_FILE_TAG(gbtree);
void GBTree::Configure(Args const& cfg) {
this->cfg_ = cfg;
std::string updater_seq = tparam_.updater_seq;
tparam_.UpdateAllowUnknown(cfg);
tree_param_.UpdateAllowUnknown(cfg);
@@ -78,10 +77,9 @@ void GBTree::Configure(Args const& cfg) {
monitor_.Init("GBTree");
specified_updater_ = std::any_of(cfg.cbegin(), cfg.cend(),
[](std::pair<std::string, std::string> const& arg) {
return arg.first == "updater";
});
specified_updater_ = std::any_of(
cfg.cbegin(), cfg.cend(),
[](std::pair<std::string, std::string> const& arg) { return arg.first == "updater"; });
if (specified_updater_ && !showed_updater_warning_) {
LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` "
@@ -93,12 +91,19 @@ void GBTree::Configure(Args const& cfg) {
showed_updater_warning_ = true;
}
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
}
LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
this->ConfigureUpdaters();
if (updater_seq != tparam_.updater_seq) {
updaters_.clear();
this->InitUpdater(cfg);
} else {
for (auto &up : updaters_) {
for (auto& up : updaters_) {
up->Configure(cfg);
}
}
@@ -106,66 +111,6 @@ void GBTree::Configure(Args const& cfg) {
configured_ = true;
}
// FIXME(trivialfis): This handles updaters. Because the choice of updaters depends on
// whether external memory is used and how large is dataset. We can remove the dependency
// on DMatrix once `hist` tree method can handle external memory so that we can make it
// default.
void GBTree::ConfigureWithKnownData(Args const& cfg, DMatrix* fmat) {
CHECK(this->configured_);
std::string updater_seq = tparam_.updater_seq;
CHECK(tparam_.GetInitialised());
tparam_.UpdateAllowUnknown(cfg);
this->PerformTreeMethodHeuristic(fmat);
this->ConfigureUpdaters();
// initialize the updaters only when needed.
if (updater_seq != tparam_.updater_seq) {
LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq;
this->updaters_.clear();
this->InitUpdater(cfg);
}
}
void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
if (specified_updater_) {
// This method is disabled when `updater` parameter is explicitly
// set, since only experts are expected to do so.
return;
}
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
}
// tparam_ is set before calling this function.
if (tparam_.tree_method != TreeMethod::kAuto) {
return;
}
if (collective::IsDistributed()) {
LOG(INFO) << "Tree method is automatically selected to be 'approx' "
"for distributed training.";
tparam_.tree_method = TreeMethod::kApprox;
} else if (!fmat->SingleColBlock()) {
LOG(INFO) << "Tree method is automatically set to 'approx' "
"since external-memory data matrix is used.";
tparam_.tree_method = TreeMethod::kApprox;
} else if (fmat->Info().num_row_ >= (4UL << 20UL)) {
/* Choose tree_method='approx' automatically for large data matrix */
LOG(INFO) << "Tree method is automatically selected to be "
"'approx' for faster speed. To use old behavior "
"(exact greedy algorithm on single machine), "
"set tree_method to 'exact'.";
tparam_.tree_method = TreeMethod::kApprox;
} else {
tparam_.tree_method = TreeMethod::kExact;
}
LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
}
void GBTree::ConfigureUpdaters() {
if (specified_updater_) {
return;
@@ -173,31 +118,25 @@ void GBTree::ConfigureUpdaters() {
// `updater` parameter was manually specified
/* Choose updaters according to tree_method parameters */
switch (tparam_.tree_method) {
case TreeMethod::kAuto:
// Use heuristic to choose between 'exact' and 'approx' This
// choice is carried out in PerformTreeMethodHeuristic() before
// calling this function.
case TreeMethod::kAuto: // Use hist as default in 2.0
case TreeMethod::kHist: {
tparam_.updater_seq = "grow_quantile_histmaker";
break;
}
case TreeMethod::kApprox:
tparam_.updater_seq = "grow_histmaker";
break;
case TreeMethod::kExact:
tparam_.updater_seq = "grow_colmaker,prune";
break;
case TreeMethod::kHist: {
LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
"grow_quantile_histmaker.";
tparam_.updater_seq = "grow_quantile_histmaker";
break;
}
case TreeMethod::kGPUHist: {
common::AssertGPUSupport();
tparam_.updater_seq = "grow_gpu_hist";
break;
}
default:
LOG(FATAL) << "Unknown tree_method ("
<< static_cast<int>(tparam_.tree_method) << ") detected";
LOG(FATAL) << "Unknown tree_method (" << static_cast<int>(tparam_.tree_method)
<< ") detected";
}
}
@@ -253,7 +192,6 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry* predt, ObjFunction const* obj) {
TreesOneIter new_trees;
bst_target_t const n_groups = model_.learner_model_param->OutputLength();
ConfigureWithKnownData(this->cfg_, p_fmat);
monitor_.Start("BoostNewTrees");
// Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let

View File

@@ -56,9 +56,7 @@ DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);
DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType);
namespace xgboost {
namespace gbm {
namespace xgboost::gbm {
/*! \brief training parameters */
struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
/*! \brief tree updater sequence */
@@ -192,12 +190,8 @@ class GBTree : public GradientBooster {
: GradientBooster{ctx}, model_(booster_config, ctx_) {}
void Configure(const Args& cfg) override;
// Revise `tree_method` and `updater` parameters after seeing the training
// data matrix, only useful when tree_method is auto.
void PerformTreeMethodHeuristic(DMatrix* fmat);
/*! \brief Map `tree_method` parameter to `updater` parameter */
void ConfigureUpdaters();
void ConfigureWithKnownData(Args const& cfg, DMatrix* fmat);
/**
* \brief Optionally update the leaf value.
@@ -222,11 +216,7 @@ class GBTree : public GradientBooster {
return tparam_;
}
void Load(dmlc::Stream* fi) override {
model_.Load(fi);
this->cfg_.clear();
}
void Load(dmlc::Stream* fi) override { model_.Load(fi); }
void Save(dmlc::Stream* fo) const override {
model_.Save(fo);
}
@@ -416,8 +406,6 @@ class GBTree : public GradientBooster {
bool showed_updater_warning_ {false};
bool specified_updater_ {false};
bool configured_ {false};
// configurations for tree
Args cfg_;
// the updaters that can be applied to each of tree
std::vector<std::unique_ptr<TreeUpdater>> updaters_;
// Predictors
@@ -431,7 +419,6 @@ class GBTree : public GradientBooster {
common::Monitor monitor_;
};
} // namespace gbm
} // namespace xgboost
} // namespace xgboost::gbm
#endif // XGBOOST_GBM_GBTREE_H_