// xgboost/src/data/simple_dmatrix.cc

/*!
 * Copyright 2014 by Contributors
 * \file simple_dmatrix.cc
 * \brief the input data structure for gradient boosting
 * \author Tianqi Chen
 */
#include <xgboost/data.h>
#include <limits>
#include <algorithm>
#include <vector>
#include "./simple_dmatrix.h"
#include "../common/random.h"
#include "../common/group_data.h"

namespace xgboost {
namespace data {

bool SimpleDMatrix::ColBatchIter::Next() {
  if (data_ptr_ >= cpages_.size()) return false;
  data_ptr_ += 1;
  SparsePage* pcol = cpages_[data_ptr_ - 1].get();
  batch_.size = col_index_.size();
  col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0));
  for (size_t i = 0; i < col_data_.size(); ++i) {
    const bst_uint ridx = col_index_[i];
    col_data_[i] = SparseBatch::Inst
        (dmlc::BeginPtr(pcol->data) + pcol->offset[ridx],
         static_cast<bst_uint>(pcol->offset[ridx + 1] - pcol->offset[ridx]));
  }
  batch_.col_index = dmlc::BeginPtr(col_index_);
  batch_.col_data = dmlc::BeginPtr(col_data_);
  return true;
}

/*!
 * \brief get a column iterator that visits every column 0 .. num_col - 1.
 * \return pointer to the internal column iterator, reset to its start.
 */
dmlc::DataIter<ColBatch>* SimpleDMatrix::ColIterator() {
  const size_t num_features = this->info().num_col;
  auto& selected = col_iter_.col_index_;
  selected.resize(num_features);
  // select all columns in their natural order
  for (size_t fid = 0; fid != num_features; ++fid) {
    selected[fid] = static_cast<bst_uint>(fid);
  }
  col_iter_.BeforeFirst();
  return &col_iter_;
}

/*!
 * \brief get a column iterator restricted to the requested feature set.
 * \param fset feature indices to visit; indices >= num_col are skipped.
 * \return pointer to the internal column iterator, reset to its start.
 */
dmlc::DataIter<ColBatch>* SimpleDMatrix::ColIterator(const std::vector<bst_uint>&fset) {
  const size_t num_features = this->info().num_col;
  auto& selected = col_iter_.col_index_;
  selected.clear();
  for (const bst_uint fid : fset) {
    // ignore features that are not present in this matrix
    if (fid >= num_features) continue;
    selected.push_back(fid);
  }
  col_iter_.BeforeFirst();
  return &col_iter_;
}

void SimpleDMatrix::InitColAccess(const std::vector<bool> &enabled,
                                  float pkeep,
                                  size_t max_row_perbatch) {
  if (this->HaveColAccess()) return;

  col_iter_.cpages_.clear();
  if (info().num_row < max_row_perbatch) {
    std::unique_ptr<SparsePage> page(new SparsePage());
    this->MakeOneBatch(enabled, pkeep, page.get());
    col_iter_.cpages_.push_back(std::move(page));
  } else {
    this->MakeManyBatch(enabled, pkeep, max_row_perbatch);
  }
  // setup col-size
  col_size_.resize(info().num_col);
  std::fill(col_size_.begin(), col_size_.end(), 0);
  for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) {
    SparsePage *pcol = col_iter_.cpages_[i].get();
    for (size_t j = 0; j < pcol->Size(); ++j) {
      col_size_[j] += pcol->offset[j + 1] - pcol->offset[j];
    }
  }
}

/*!
 * \brief internal function to make one column page from the row iterator.
 *
 *  Every row delivered by RowIterator() is kept with probability pkeep
 *  (always kept when pkeep == 1.0f). Ids of kept rows are appended to
 *  buffered_rowset_. Entries of kept rows whose feature is enabled are
 *  grouped by column into pcol, and each column is then sorted with
 *  SparseBatch::Entry::CmpValue.
 *
 * \param enabled per-feature flag; entries of features flagged false are skipped
 * \param pkeep probability of keeping a row
 * \param pcol output page; cleared before it is filled
 */
void SimpleDMatrix::MakeOneBatch(const std::vector<bool>& enabled,
                                 float pkeep,
                                 SparsePage *pcol) {
  // clear rowset
  buffered_rowset_.clear();
  // bit map: bmap[r] stays true while row r is kept by the subsampling
  int nthread;
  std::vector<bool> bmap;
  #pragma omp parallel
  {
    nthread = omp_get_num_threads();
  }

  pcol->Clear();
  common::ParallelGroupBuilder<SparseBatch::Entry>
      builder(&pcol->offset, &pcol->data);
  builder.InitBudget(info().num_col, nthread);
  // first pass: decide which rows are kept and count entries per column
  dmlc::DataIter<RowBatch>* iter = this->RowIterator();
  iter->BeforeFirst();
  while (iter->Next()) {
    const RowBatch& batch = iter->Value();
    bmap.resize(bmap.size() + batch.size, true);
    std::bernoulli_distribution coin_flip(pkeep);
    auto& rnd = common::GlobalRandom();

    long batch_size = static_cast<long>(batch.size); // NOLINT(*)
    for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
      if (pkeep == 1.0f || coin_flip(rnd)) {
        buffered_rowset_.push_back(ridx);
      } else {
        // bmap is looked up by global row id in both parallel passes,
        // so the dropped row must be recorded under ridx, not under the
        // batch-local index i.
        bmap[ridx] = false;
      }
    }
    #pragma omp parallel for schedule(static)
    for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
      int tid = omp_get_thread_num();
      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
      if (bmap[ridx]) {
        RowBatch::Inst inst = batch[i];
        for (bst_uint j = 0; j < inst.length; ++j) {
          if (enabled[inst[j].index]) {
            builder.AddBudget(inst[j].index, tid);
          }
        }
      }
    }
  }
  builder.InitStorage();

  // second pass: push exactly the entries that were budgeted above
  iter->BeforeFirst();
  while (iter->Next()) {
    const RowBatch& batch = iter->Value();
    #pragma omp parallel for schedule(static)
    for (long i = 0; i < static_cast<long>(batch.size); ++i) { // NOLINT(*)
      int tid = omp_get_thread_num();
      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
      if (bmap[ridx]) {
        RowBatch::Inst inst = batch[i];
        for (bst_uint j = 0; j < inst.length; ++j) {
          if (enabled[inst[j].index]) {
            builder.Push(inst[j].index,
                         SparseBatch::Entry(ridx, inst[j].fvalue),
                         tid);
          }
        }
      }
    }
  }

  CHECK_EQ(pcol->Size(), info().num_col);
  // sort columns
  bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
  #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
  for (bst_omp_uint i = 0; i < ncol; ++i) {
    if (pcol->offset[i] < pcol->offset[i + 1]) {
      std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i],
                dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1],
                SparseBatch::Entry::CmpValue);
    }
  }
}

void SimpleDMatrix::MakeManyBatch(const std::vector<bool>& enabled,
                                  float pkeep,
                                  size_t max_row_perbatch) {
  size_t btop = 0;
  std::bernoulli_distribution coin_flip(pkeep);
  auto& rnd = common::GlobalRandom();
  buffered_rowset_.clear();
  // internal temp cache
  SparsePage tmp; tmp.Clear();
  // start working
  dmlc::DataIter<RowBatch>* iter = this->RowIterator();
  iter->BeforeFirst();

  while (iter->Next()) {
    const RowBatch &batch = iter->Value();
    for (size_t i = 0; i < batch.size; ++i) {
      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
      if (pkeep == 1.0f || coin_flip(rnd)) {
        buffered_rowset_.push_back(ridx);
        tmp.Push(batch[i]);
      }
      if (tmp.Size() >= max_row_perbatch) {
        std::unique_ptr<SparsePage> page(new SparsePage());
        this->MakeColPage(tmp.GetRowBatch(0),
                          dmlc::BeginPtr(buffered_rowset_) + btop,
                          enabled, page.get());
        col_iter_.cpages_.push_back(std::move(page));
        btop = buffered_rowset_.size();
        tmp.Clear();
      }
    }
  }

  if (tmp.Size() != 0) {
    std::unique_ptr<SparsePage> page(new SparsePage());
    this->MakeColPage(tmp.GetRowBatch(0),
                      dmlc::BeginPtr(buffered_rowset_) + btop,
                      enabled, page.get());
    col_iter_.cpages_.push_back(std::move(page));
  }
}

/*!
 * \brief make a column page from a subset of row batches.
 *
 *  Entries whose feature is enabled are grouped by column into pcol, with
 *  the row index of each entry taken from ridx. Each column is then sorted
 *  with SparseBatch::Entry::CmpValue.
 *
 * \param batch rows to convert
 * \param ridx ridx[i] is the row id stored for the entries of batch[i]
 * \param enabled per-feature flag; entries of features flagged false are skipped
 * \param pcol output page; cleared before it is filled
 */
void SimpleDMatrix::MakeColPage(const RowBatch& batch,
                                const bst_uint* ridx,
                                const std::vector<bool>& enabled,
                                SparsePage* pcol) {
  int nthread;
  #pragma omp parallel
  {
    // cap the thread count at max(num_procs / 2 - 2, 1)
    nthread = omp_get_num_threads();
    int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1);
    if (nthread > max_nthread) {
      nthread = max_nthread;
    }
  }
  pcol->Clear();
  common::ParallelGroupBuilder<SparseBatch::Entry>
      builder(&pcol->offset, &pcol->data);
  builder.InitBudget(info().num_col, nthread);
  bst_omp_uint ndata = static_cast<bst_uint>(batch.size);
  // first pass: count the enabled entries of every column
  #pragma omp parallel for schedule(static) num_threads(nthread)
  for (bst_omp_uint i = 0; i < ndata; ++i) {
    int tid = omp_get_thread_num();
    RowBatch::Inst inst = batch[i];
    for (bst_uint j = 0; j < inst.length; ++j) {
      const SparseBatch::Entry &e = inst[j];
      if (enabled[e.index]) {
        builder.AddBudget(e.index, tid);
      }
    }
  }
  builder.InitStorage();
  // second pass: fill the storage; the filter must match the one used in
  // the counting pass, otherwise entries of disabled features would be
  // written into columns that were given no space
  #pragma omp parallel for schedule(static) num_threads(nthread)
  for (bst_omp_uint i = 0; i < ndata; ++i) {
    int tid = omp_get_thread_num();
    RowBatch::Inst inst = batch[i];
    for (bst_uint j = 0; j < inst.length; ++j) {
      const SparseBatch::Entry &e = inst[j];
      if (enabled[e.index]) {
        builder.Push(e.index,
                     SparseBatch::Entry(ridx[i], e.fvalue),
                     tid);
      }
    }
  }
  CHECK_EQ(pcol->Size(), info().num_col);
  // sort columns
  bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
  #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
  for (bst_omp_uint i = 0; i < ncol; ++i) {
    if (pcol->offset[i] < pcol->offset[i + 1]) {
      std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i],
                dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1],
                SparseBatch::Entry::CmpValue);
    }
  }
}

/*!
 * \brief check whether the column data is held in at most one page.
 * \return true when cpages_ contains zero or one page.
 */
bool SimpleDMatrix::SingleColBlock() const {
  const size_t num_pages = col_iter_.cpages_.size();
  return num_pages < 2;
}
}  // namespace data
}  // namespace xgboost