DMatrix refactor stage 1 (#3301)
* Use sparse page as the single CSR matrix representation
* Simplify DMatrix methods
* Reduce statefulness of batch iterators
* BREAKING CHANGE: Remove the prob_buffer_row parameter. Users are instead advised to sample their dataset as a preprocessing step before using XGBoost.
This commit is contained in:
@@ -65,10 +65,10 @@ inline std::pair<double, double> GetGradient(int group_idx, int num_group, int f
|
||||
const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat) {
|
||||
double sum_grad = 0.0, sum_hess = 0.0;
|
||||
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator({static_cast<bst_uint>(fidx)});
|
||||
auto iter = p_fmat->ColIterator();
|
||||
while (iter->Next()) {
|
||||
const ColBatch &batch = iter->Value();
|
||||
ColBatch::Inst col = batch[0];
|
||||
auto batch = iter->Value();
|
||||
auto col = batch[fidx];
|
||||
const auto ndata = static_cast<bst_omp_uint>(col.length);
|
||||
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
||||
const bst_float v = col[j].fvalue;
|
||||
@@ -96,10 +96,10 @@ inline std::pair<double, double> GetGradientParallel(int group_idx, int num_grou
|
||||
const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat) {
|
||||
double sum_grad = 0.0, sum_hess = 0.0;
|
||||
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator({static_cast<bst_uint>(fidx)});
|
||||
auto iter = p_fmat->ColIterator();
|
||||
while (iter->Next()) {
|
||||
const ColBatch &batch = iter->Value();
|
||||
ColBatch::Inst col = batch[0];
|
||||
auto batch = iter->Value();
|
||||
auto col = batch[fidx];
|
||||
const auto ndata = static_cast<bst_omp_uint>(col.length);
|
||||
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
|
||||
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
||||
@@ -154,10 +154,10 @@ inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
|
||||
float dw, std::vector<GradientPair> *in_gpair,
|
||||
DMatrix *p_fmat) {
|
||||
if (dw == 0.0f) return;
|
||||
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator({static_cast<bst_uint>(fidx)});
|
||||
auto iter = p_fmat->ColIterator();
|
||||
while (iter->Next()) {
|
||||
const ColBatch &batch = iter->Value();
|
||||
ColBatch::Inst col = batch[0];
|
||||
auto batch = iter->Value();
|
||||
auto col = batch[fidx];
|
||||
// update grad value
|
||||
const auto num_row = static_cast<bst_omp_uint>(col.length);
|
||||
#pragma omp parallel for schedule(static)
|
||||
@@ -325,12 +325,12 @@ class GreedyFeatureSelector : public FeatureSelector {
|
||||
const bst_omp_uint nfeat = model.param.num_feature;
|
||||
// Calculate univariate gradient sums
|
||||
std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
|
||||
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
|
||||
auto iter = p_fmat->ColIterator();
|
||||
while (iter->Next()) {
|
||||
const ColBatch &batch = iter->Value();
|
||||
auto batch = iter->Value();
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nfeat; ++i) {
|
||||
const ColBatch::Inst col = batch[i];
|
||||
const auto col = batch[i];
|
||||
const bst_uint ndata = col.length;
|
||||
auto &sums = gpair_sums_[group_idx * nfeat + i];
|
||||
for (bst_uint j = 0u; j < ndata; ++j) {
|
||||
@@ -392,13 +392,13 @@ class ThriftyFeatureSelector : public FeatureSelector {
|
||||
}
|
||||
// Calculate univariate gradient sums
|
||||
std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
|
||||
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
|
||||
auto iter = p_fmat->ColIterator();
|
||||
while (iter->Next()) {
|
||||
const ColBatch &batch = iter->Value();
|
||||
auto batch = iter->Value();
|
||||
// column-parallel is usually faster than row-parallel
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nfeat; ++i) {
|
||||
const ColBatch::Inst col = batch[i];
|
||||
const auto col = batch[i];
|
||||
const bst_uint ndata = col.length;
|
||||
for (bst_uint gid = 0u; gid < ngroup; ++gid) {
|
||||
auto &sums = gpair_sums_[gid * nfeat + i];
|
||||
|
||||
Reference in New Issue
Block a user