Dmatrix refactor stage 1 (#3301)

* Use sparse page as singular CSR matrix representation

* Simplify dmatrix methods

* Reduce statefullness of batch iterators

* BREAKING CHANGE: Remove prob_buffer_row parameter. Users are instead recommended to sample their dataset as a preprocessing step before using XGBoost.
This commit is contained in:
Rory Mitchell
2018-06-07 10:25:58 +12:00
committed by GitHub
parent 286dccb8e8
commit a96039141a
47 changed files with 650 additions and 1036 deletions

View File

@@ -80,8 +80,6 @@ struct LearnerTrainParam : public dmlc::Parameter<LearnerTrainParam> {
int tree_method;
// internal test flag
std::string test_flag;
// maximum buffered row value
float prob_buffer_row;
// maximum row per batch.
size_t max_row_perbatch;
// number of threads to use if OpenMP is enabled
@@ -116,10 +114,6 @@ struct LearnerTrainParam : public dmlc::Parameter<LearnerTrainParam> {
.describe("Choice of tree construction method.");
DMLC_DECLARE_FIELD(test_flag).set_default("").describe(
"Internal test flag");
DMLC_DECLARE_FIELD(prob_buffer_row)
.set_default(1.0f)
.set_range(0.0f, 1.0f)
.describe("Maximum buffered row portion");
DMLC_DECLARE_FIELD(max_row_perbatch)
.set_default(std::numeric_limits<size_t>::max())
.describe("maximum row per batch.");
@@ -163,9 +157,6 @@ class LearnerImpl : public Learner {
} else if (tparam_.dsplit == 2) {
cfg_["updater"] = "grow_histmaker,prune";
}
if (tparam_.prob_buffer_row != 1.0f) {
cfg_["updater"] = "grow_histmaker,refresh,prune";
}
}
} else if (tparam_.tree_method == 3) {
/* histogram-based algorithm */
@@ -496,7 +487,7 @@ class LearnerImpl : public Learner {
max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
}
// initialize column access
p_train->InitColAccess(enabled, tparam_.prob_buffer_row, max_row_perbatch, true);
p_train->InitColAccess(max_row_perbatch, true);
}
if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) {