Improve multi-threaded performance (#2104)

* Add UpdatePredictionCache() option to updaters

Some updaters (e.g. fast_hist) have enough information to quickly compute
prediction cache for the training data. Each updater may override the
UpdatePredictionCache() method to update the prediction cache. Note: this
trick does not apply to validation data.

* Respond to code review

* Disable some debug messages by default
* Document UpdatePredictionCache() interface
* Remove base_margin logic from UpdatePredictionCache() implementation
* Do not take pointer to cfg, as reference may get stale

* Improve multi-threaded performance

* Use columnwise accessor to accelerate ApplySplit() step,
  with support for a compressed representation
* Parallel sort for evaluation step
* Inline BuildHist() function
* Cache gradient pairs when building histograms in BuildHist()

* Add missing #if macro

* Respond to code review

* Use wrapper to enable parallel sort on Linux

* Fix C++ compatibility issues

* MSVC doesn't support unsigned in OpenMP loops
* gcc 4.6 doesn't support using keyword

* Fix lint issues

* Respond to code review

* Fix bug in ApplySplitSparseData()

* Attempting to read beyond the end of a sparse column
* Mishandling the case where an entire range of rows have missing values

* Fix training continuation bug

Disable UpdatePredictionCache() in the first iteration. This way, we can
accommodate the scenario where we build off of an existing (nonempty) ensemble.

* Add regression test for fast_hist

* Respond to code review

* Add back old version of ApplySplitSparseData
This commit is contained in:
Philip Cho
2017-03-25 10:35:01 -07:00
committed by Tianqi Chen
parent 332aea26a3
commit 14fba01b5a
14 changed files with 719 additions and 171 deletions

View File

@@ -6,6 +6,7 @@
*/
#include <xgboost/logging.h>
#include <xgboost/learner.h>
#include <dmlc/timer.h>
#include <dmlc/io.h>
#include <algorithm>
#include <vector>
@@ -83,6 +84,8 @@ struct LearnerTrainParam
// number of threads to use if OpenMP is enabled
// if equals 0, use system default
int nthread;
// flag to print out detailed breakdown of runtime
int debug_verbose;
// declare parameters
DMLC_DECLARE_PARAMETER(LearnerTrainParam) {
DMLC_DECLARE_FIELD(seed).set_default(0)
@@ -109,6 +112,10 @@ struct LearnerTrainParam
.describe("maximum row per batch.");
DMLC_DECLARE_FIELD(nthread).set_default(0)
.describe("Number of threads to use.");
DMLC_DECLARE_FIELD(debug_verbose)
.set_lower_bound(0)
.set_default(0)
.describe("flag to print out detailed breakdown of runtime");
}
};
@@ -170,28 +177,9 @@ class LearnerImpl : public Learner {
if (tparam.tree_method == 3) {
/* histogram-based algorithm */
if (cfg_.count("updater") == 0) {
LOG(CONSOLE) << "Tree method is selected to be \'hist\', "
<< "which uses histogram aggregation for faster training. "
<< "Using default sequence of updaters: grow_fast_histmaker,prune";
cfg_["updater"] = "grow_fast_histmaker,prune";
} else {
const std::string first_str = "grow_fast_histmaker";
if (first_str.length() <= cfg_["updater"].length()
&& std::equal(first_str.begin(), first_str.end(), cfg_["updater"].begin())) {
// updater sequence starts with "grow_fast_histmaker"
LOG(CONSOLE) << "Tree method is selected to be \'hist\', "
<< "which uses histogram aggregation for faster training. "
<< "Using custom sequence of updaters: " << cfg_["updater"];
} else {
// updater sequence does not start with "grow_fast_histmaker"
LOG(CONSOLE) << "Tree method is selected to be \'hist\', but the given "
<< "sequence of updaters is not compatible; "
<< "grow_fast_histmaker must run first. "
<< "Using default sequence of updaters: grow_fast_histmaker,prune";
cfg_["updater"] = "grow_fast_histmaker,prune";
}
}
LOG(CONSOLE) << "Tree method is selected to be \'hist\', which uses a single updater "
<< "grow_fast_histmaker.";
cfg_["updater"] = "grow_fast_histmaker";
} else if (cfg_.count("updater") == 0) {
if (tparam.dsplit == 1) {
cfg_["updater"] = "distcol";
@@ -333,6 +321,7 @@ class LearnerImpl : public Learner {
std::string EvalOneIter(int iter,
const std::vector<DMatrix*>& data_sets,
const std::vector<std::string>& data_names) override {
double tstart = dmlc::GetTime();
std::ostringstream os;
os << '[' << iter << ']'
<< std::setiosflags(std::ios::fixed);
@@ -347,6 +336,10 @@ class LearnerImpl : public Learner {
<< ev->Eval(preds_, data_sets[i]->info(), tparam.dsplit == 2);
}
}
if (tparam.debug_verbose > 0) {
LOG(INFO) << "EvalOneIter(): " << dmlc::GetTime() - tstart << " sec";
}
return os.str();
}