DMatrix refactor stage 1 (#3301)
* Use sparse page as singular CSR matrix representation * Simplify DMatrix methods * Reduce statefulness of batch iterators * BREAKING CHANGE: Remove prob_buffer_row parameter. Users are instead recommended to sample their dataset as a preprocessing step before using XGBoost.
This commit is contained in:
@@ -14,7 +14,7 @@ DMLC_REGISTRY_FILE_TAG(cpu_predictor);
|
||||
|
||||
class CPUPredictor : public Predictor {
|
||||
protected:
|
||||
static bst_float PredValue(const RowBatch::Inst& inst,
|
||||
static bst_float PredValue(const SparsePage::Inst& inst,
|
||||
const std::vector<std::unique_ptr<RegTree>>& trees,
|
||||
const std::vector<int>& tree_info, int bst_group,
|
||||
unsigned root_index, RegTree::FVec* p_feats,
|
||||
@@ -53,20 +53,20 @@ class CPUPredictor : public Predictor {
|
||||
<< "size_leaf_vector is enforced to 0 so far";
|
||||
CHECK_EQ(preds.size(), p_fmat->Info().num_row_ * num_group);
|
||||
// start collecting the prediction
|
||||
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
|
||||
auto iter = p_fmat->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const RowBatch& batch = iter->Value();
|
||||
const auto& batch = iter->Value();
|
||||
// parallel over local batch
|
||||
constexpr int kUnroll = 8;
|
||||
const auto nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
|
||||
const bst_omp_uint rest = nsize % kUnroll;
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nsize - rest; i += kUnroll) {
|
||||
const int tid = omp_get_thread_num();
|
||||
RegTree::FVec& feats = thread_temp[tid];
|
||||
int64_t ridx[kUnroll];
|
||||
RowBatch::Inst inst[kUnroll];
|
||||
SparsePage::Inst inst[kUnroll];
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
ridx[k] = static_cast<int64_t>(batch.base_rowid + i + k);
|
||||
}
|
||||
@@ -85,7 +85,7 @@ class CPUPredictor : public Predictor {
|
||||
for (bst_omp_uint i = nsize - rest; i < nsize; ++i) {
|
||||
RegTree::FVec& feats = thread_temp[0];
|
||||
const auto ridx = static_cast<int64_t>(batch.base_rowid + i);
|
||||
const RowBatch::Inst inst = batch[i];
|
||||
auto inst = batch[i];
|
||||
for (int gid = 0; gid < num_group; ++gid) {
|
||||
const size_t offset = ridx * num_group + gid;
|
||||
preds[offset] +=
|
||||
@@ -183,7 +183,7 @@ class CPUPredictor : public Predictor {
|
||||
}
|
||||
}
|
||||
|
||||
void PredictInstance(const SparseBatch::Inst& inst,
|
||||
void PredictInstance(const SparsePage::Inst& inst,
|
||||
std::vector<bst_float>* out_preds,
|
||||
const gbm::GBTreeModel& model, unsigned ntree_limit,
|
||||
unsigned root_index) override {
|
||||
@@ -218,12 +218,12 @@ class CPUPredictor : public Predictor {
|
||||
std::vector<bst_float>& preds = *out_preds;
|
||||
preds.resize(info.num_row_ * ntree_limit);
|
||||
// start collecting the prediction
|
||||
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
|
||||
auto iter = p_fmat->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const RowBatch& batch = iter->Value();
|
||||
auto batch = iter->Value();
|
||||
// parallel over local batch
|
||||
const auto nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||
const int tid = omp_get_thread_num();
|
||||
@@ -266,13 +266,13 @@ class CPUPredictor : public Predictor {
|
||||
model.trees[i]->FillNodeMeanValues();
|
||||
}
|
||||
// start collecting the contributions
|
||||
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
|
||||
auto iter = p_fmat->RowIterator();
|
||||
const std::vector<bst_float>& base_margin = info.base_margin_;
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const RowBatch& batch = iter->Value();
|
||||
auto batch = iter->Value();
|
||||
// parallel over local batch
|
||||
const auto nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||
auto row_idx = static_cast<size_t>(batch.base_rowid + i);
|
||||
|
||||
@@ -52,7 +52,7 @@ struct DeviceMatrix {
|
||||
DMatrix* p_mat; // Pointer to the original matrix on the host
|
||||
dh::BulkAllocator<dh::MemoryType::kDevice> ba;
|
||||
dh::DVec<size_t> row_ptr;
|
||||
dh::DVec<SparseBatch::Entry> data;
|
||||
dh::DVec<Entry> data;
|
||||
thrust::device_vector<float> predictions;
|
||||
|
||||
DeviceMatrix(DMatrix* dmat, int device_idx, bool silent) : p_mat(dmat) {
|
||||
@@ -66,17 +66,17 @@ struct DeviceMatrix {
|
||||
while (iter->Next()) {
|
||||
auto batch = iter->Value();
|
||||
// Copy row ptr
|
||||
thrust::copy(batch.ind_ptr, batch.ind_ptr + batch.size + 1,
|
||||
thrust::copy(batch.offset.data(), batch.offset.data() + batch.Size() + 1,
|
||||
row_ptr.tbegin() + batch.base_rowid);
|
||||
if (batch.base_rowid > 0) {
|
||||
auto begin_itr = row_ptr.tbegin() + batch.base_rowid;
|
||||
auto end_itr = begin_itr + batch.size + 1;
|
||||
auto end_itr = begin_itr + batch.Size() + 1;
|
||||
IncrementOffset(begin_itr, end_itr, batch.base_rowid);
|
||||
}
|
||||
// Copy data
|
||||
thrust::copy(batch.data_ptr, batch.data_ptr + batch.ind_ptr[batch.size],
|
||||
thrust::copy(batch.data.begin(), batch.data.end(),
|
||||
data.tbegin() + data_offset);
|
||||
data_offset += batch.ind_ptr[batch.size];
|
||||
data_offset += batch.data.size();
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -139,12 +139,12 @@ struct DevicePredictionNode {
|
||||
struct ElementLoader {
|
||||
bool use_shared;
|
||||
size_t* d_row_ptr;
|
||||
SparseBatch::Entry* d_data;
|
||||
Entry* d_data;
|
||||
int num_features;
|
||||
float* smem;
|
||||
|
||||
__device__ ElementLoader(bool use_shared, size_t* row_ptr,
|
||||
SparseBatch::Entry* entry, int num_features,
|
||||
Entry* entry, int num_features,
|
||||
float* smem, int num_rows)
|
||||
: use_shared(use_shared),
|
||||
d_row_ptr(row_ptr),
|
||||
@@ -161,7 +161,7 @@ struct ElementLoader {
|
||||
bst_uint elem_begin = d_row_ptr[global_idx];
|
||||
bst_uint elem_end = d_row_ptr[global_idx + 1];
|
||||
for (bst_uint elem_idx = elem_begin; elem_idx < elem_end; elem_idx++) {
|
||||
SparseBatch::Entry elem = d_data[elem_idx];
|
||||
Entry elem = d_data[elem_idx];
|
||||
smem[threadIdx.x * num_features + elem.index] = elem.fvalue;
|
||||
}
|
||||
}
|
||||
@@ -175,7 +175,7 @@ struct ElementLoader {
|
||||
// Binary search
|
||||
auto begin_ptr = d_data + d_row_ptr[ridx];
|
||||
auto end_ptr = d_data + d_row_ptr[ridx + 1];
|
||||
SparseBatch::Entry* previous_middle = nullptr;
|
||||
Entry* previous_middle = nullptr;
|
||||
while (end_ptr != begin_ptr) {
|
||||
auto middle = begin_ptr + (end_ptr - begin_ptr) / 2;
|
||||
if (middle == previous_middle) {
|
||||
@@ -221,7 +221,7 @@ template <int BLOCK_THREADS>
|
||||
__global__ void PredictKernel(const DevicePredictionNode* d_nodes,
|
||||
float* d_out_predictions, size_t* d_tree_segments,
|
||||
int* d_tree_group, size_t* d_row_ptr,
|
||||
SparseBatch::Entry* d_data, size_t tree_begin,
|
||||
Entry* d_data, size_t tree_begin,
|
||||
size_t tree_end, size_t num_features,
|
||||
size_t num_rows, bool use_shared, int num_group) {
|
||||
extern __shared__ float smem[];
|
||||
@@ -422,7 +422,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
}
|
||||
}
|
||||
|
||||
void PredictInstance(const SparseBatch::Inst& inst,
|
||||
void PredictInstance(const SparsePage::Inst& inst,
|
||||
std::vector<bst_float>* out_preds,
|
||||
const gbm::GBTreeModel& model, unsigned ntree_limit,
|
||||
unsigned root_index) override {
|
||||
|
||||
Reference in New Issue
Block a user