Partial rewrite EllpackPage (#5352)
parent 7a99f8f27f
commit 3ad4333b0e

@@ -181,13 +181,13 @@ class CompressedIterator {
   typedef value_type reference;  // NOLINT
 
  private:
-  CompressedByteT *buffer_;
+  const CompressedByteT *buffer_;
   size_t symbol_bits_;
   size_t offset_;
 
  public:
   CompressedIterator() : buffer_(nullptr), symbol_bits_(0), offset_(0) {}
-  CompressedIterator(CompressedByteT *buffer, size_t num_symbols)
+  CompressedIterator(const CompressedByteT *buffer, size_t num_symbols)
       : buffer_(buffer), offset_(0) {
     symbol_bits_ = detail::SymbolBits(num_symbols);
   }
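The change above only adds const: CompressedIterator reads symbols out of a bit-packed buffer and never writes to it, so it can now be constructed over read-only memory (for example the const device pointer of the HostDeviceVector introduced later in this patch). A minimal stand-alone sketch of the same idea in plain C++ — not the actual xgboost CompressedBufferWriter/CompressedIterator bit layout:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Pack each symbol into `bits` bits; the reader only needs a const pointer.
std::vector<uint8_t> Pack(const std::vector<uint32_t>& symbols, int bits) {
  std::vector<uint8_t> buf((symbols.size() * bits + 7) / 8, 0);
  for (size_t i = 0; i < symbols.size(); ++i) {
    for (int b = 0; b < bits; ++b) {
      size_t bit = i * bits + b;
      if ((symbols[i] >> b) & 1u) buf[bit / 8] |= 1u << (bit % 8);
    }
  }
  return buf;
}

// Read-only access through a const buffer, mirroring the const iterator above.
uint32_t Unpack(const uint8_t* buf, int bits, size_t i) {
  uint32_t v = 0;
  for (int b = 0; b < bits; ++b) {
    size_t bit = i * bits + b;
    v |= static_cast<uint32_t>((buf[bit / 8] >> (bit % 8)) & 1u) << b;
  }
  return v;
}

int main() {
  std::vector<uint32_t> syms{3, 0, 7, 5};
  auto buf = Pack(syms, 3);  // 3 bits per symbol, since every symbol < 8
  for (size_t i = 0; i < syms.size(); ++i) std::cout << Unpack(buf.data(), 3, i) << " ";
  std::cout << "\n";  // prints: 3 0 7 5
}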
@@ -31,7 +31,7 @@ namespace common {
 
 HistogramCuts::HistogramCuts() {
   monitor_.Init(__FUNCTION__);
-  cut_ptrs_.emplace_back(0);
+  cut_ptrs_.HostVector().emplace_back(0);
 }
 
 // Dispatch to specific builder.
@@ -52,7 +52,7 @@ void HistogramCuts::Build(DMatrix* dmat, uint32_t const max_num_bins) {
     DenseCuts cuts(this);
     cuts.Build(dmat, max_num_bins);
   }
-  LOG(INFO) << "Total number of hist bins: " << cut_ptrs_.back();
+  LOG(INFO) << "Total number of hist bins: " << cut_ptrs_.HostVector().back();
 }
 
 bool CutsBuilder::UseGroup(DMatrix* dmat) {
@@ -75,7 +75,10 @@ void SparseCuts::SingleThreadBuild(SparsePage const& page, MetaInfo const& info,
 
   // Data groups, used in ranking.
   std::vector<bst_uint> const& group_ptr = info.group_ptr_;
-  p_cuts_->min_vals_.resize(end_col - beg_col, 0);
+  auto &local_min_vals = p_cuts_->min_vals_.HostVector();
+  auto &local_cuts = p_cuts_->cut_values_.HostVector();
+  auto &local_ptrs = p_cuts_->cut_ptrs_.HostVector();
+  local_min_vals.resize(end_col - beg_col, 0);
 
   for (uint32_t col_id = beg_col; col_id < page.Size() && col_id < end_col; ++col_id) {
     // Using a local variable makes things easier, but at the cost of memory trashing.
@@ -85,7 +88,7 @@ void SparseCuts::SingleThreadBuild(SparsePage const& page, MetaInfo const& info,
                              max_num_bins);
     if (n_bins == 0) {
       // cut_ptrs_ is initialized with a zero, so there's always an element at the back
-      p_cuts_->cut_ptrs_.emplace_back(p_cuts_->cut_ptrs_.back());
+      local_ptrs.emplace_back(local_ptrs.back());
       continue;
     }
 
@@ -112,17 +115,17 @@ void SparseCuts::SingleThreadBuild(SparsePage const& page, MetaInfo const& info,
     // Can be use data[1] as the min values so that we don't need to
     // store another array?
     float mval = summary.data[0].value;
-    p_cuts_->min_vals_[col_id - beg_col] = mval - (fabs(mval) + 1e-5);
+    local_min_vals[col_id - beg_col] = mval - (fabs(mval) + 1e-5);
 
     this->AddCutPoint(summary, max_num_bins);
 
     bst_float cpt = (summary.size > 0) ?
                     summary.data[summary.size - 1].value :
-                    p_cuts_->min_vals_[col_id - beg_col];
+                    local_min_vals[col_id - beg_col];
     cpt += fabs(cpt) + 1e-5;
-    p_cuts_->cut_values_.emplace_back(cpt);
+    local_cuts.emplace_back(cpt);
 
-    p_cuts_->cut_ptrs_.emplace_back(p_cuts_->cut_values_.size());
+    local_ptrs.emplace_back(local_cuts.size());
   }
 }
 
@@ -196,33 +199,40 @@ void SparseCuts::Concat(
     std::vector<std::unique_ptr<SparseCuts>> const& cuts, uint32_t n_cols) {
   monitor_.Start(__FUNCTION__);
   uint32_t nthreads = omp_get_max_threads();
-  p_cuts_->min_vals_.resize(n_cols, std::numeric_limits<float>::max());
+  auto &local_min_vals = p_cuts_->min_vals_.HostVector();
+  auto &local_cuts = p_cuts_->cut_values_.HostVector();
+  auto &local_ptrs = p_cuts_->cut_ptrs_.HostVector();
+  local_min_vals.resize(n_cols, std::numeric_limits<float>::max());
   size_t min_vals_tail = 0;
 
   for (uint32_t t = 0; t < nthreads; ++t) {
+    auto& thread_min_vals = cuts[t]->p_cuts_->min_vals_.HostVector();
+    auto& thread_cuts = cuts[t]->p_cuts_->cut_values_.HostVector();
+    auto& thread_ptrs = cuts[t]->p_cuts_->cut_ptrs_.HostVector();
+
     // concat csc pointers.
-    size_t const old_ptr_size = p_cuts_->cut_ptrs_.size();
-    p_cuts_->cut_ptrs_.resize(
-        cuts[t]->p_cuts_->cut_ptrs_.size() + p_cuts_->cut_ptrs_.size() - 1);
-    size_t const new_icp_size = p_cuts_->cut_ptrs_.size();
-    auto tail = p_cuts_->cut_ptrs_[old_ptr_size-1];
+    size_t const old_ptr_size = local_ptrs.size();
+    local_ptrs.resize(
+        thread_ptrs.size() + local_ptrs.size() - 1);
+    size_t const new_icp_size = local_ptrs.size();
+    auto tail = local_ptrs[old_ptr_size-1];
     for (size_t j = old_ptr_size; j < new_icp_size; ++j) {
-      p_cuts_->cut_ptrs_[j] = tail + cuts[t]->p_cuts_->cut_ptrs_[j-old_ptr_size+1];
+      local_ptrs[j] = tail + thread_ptrs[j-old_ptr_size+1];
     }
     // concat csc values
-    size_t const old_iv_size = p_cuts_->cut_values_.size();
-    p_cuts_->cut_values_.resize(
-        cuts[t]->p_cuts_->cut_values_.size() + p_cuts_->cut_values_.size());
-    size_t const new_iv_size = p_cuts_->cut_values_.size();
+    size_t const old_iv_size = local_cuts.size();
+    local_cuts.resize(
+        thread_cuts.size() + local_cuts.size());
+    size_t const new_iv_size = local_cuts.size();
     for (size_t j = old_iv_size; j < new_iv_size; ++j) {
-      p_cuts_->cut_values_[j] = cuts[t]->p_cuts_->cut_values_[j-old_iv_size];
+      local_cuts[j] = thread_cuts[j-old_iv_size];
     }
     // merge min values
-    for (size_t j = 0; j < cuts[t]->p_cuts_->min_vals_.size(); ++j) {
-      p_cuts_->min_vals_.at(min_vals_tail + j) =
-          std::min(p_cuts_->min_vals_.at(min_vals_tail + j), cuts.at(t)->p_cuts_->min_vals_.at(j));
+    for (size_t j = 0; j < thread_min_vals.size(); ++j) {
+      local_min_vals.at(min_vals_tail + j) =
+          std::min(local_min_vals.at(min_vals_tail + j), thread_min_vals.at(j));
     }
-    min_vals_tail += cuts[t]->p_cuts_->min_vals_.size();
+    min_vals_tail += thread_min_vals.size();
   }
   monitor_.Stop(__FUNCTION__);
 }
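The loop above splices per-thread CSC structures into one: cut values are appended verbatim, while each thread's cut pointer array (which always starts with 0) has its leading zero dropped and the remaining offsets shifted by the current tail. A stand-alone sketch of that pointer merge with plain vectors:

#include <cassert>
#include <cstdint>
#include <vector>

// Append one CSC-style offset array onto another. Both start with 0; src's
// leading 0 is skipped and the rest is shifted by dst's current tail.
void ConcatPtrs(std::vector<uint32_t>* dst, const std::vector<uint32_t>& src) {
  uint32_t tail = dst->back();
  for (size_t j = 1; j < src.size(); ++j) {
    dst->push_back(tail + src[j]);
  }
}

int main() {
  std::vector<uint32_t> ptrs{0, 2, 5};         // two columns with 2 and 3 cuts
  std::vector<uint32_t> thread_ptrs{0, 4, 6};  // two more columns: 4 and 2 cuts
  ConcatPtrs(&ptrs, thread_ptrs);
  assert((ptrs == std::vector<uint32_t>{0, 2, 5, 9, 11}));
}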
@@ -323,27 +333,27 @@ void DenseCuts::Init
   // TODO(chenqin): rabit failure recovery assumes no boostrap onetime call after loadcheckpoint
   // we need to move this allreduce before loadcheckpoint call in future
   sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
-  p_cuts_->min_vals_.resize(sketchs.size());
+  p_cuts_->min_vals_.HostVector().resize(sketchs.size());
 
   for (size_t fid = 0; fid < summary_array.size(); ++fid) {
     WQSketch::SummaryContainer a;
     a.Reserve(max_num_bins + 1);
     a.SetPrune(summary_array[fid], max_num_bins + 1);
     const bst_float mval = a.data[0].value;
-    p_cuts_->min_vals_[fid] = mval - (fabs(mval) + 1e-5);
+    p_cuts_->min_vals_.HostVector()[fid] = mval - (fabs(mval) + 1e-5);
     AddCutPoint(a, max_num_bins);
     // push a value that is greater than anything
     const bst_float cpt
-      = (a.size > 0) ? a.data[a.size - 1].value : p_cuts_->min_vals_[fid];
+      = (a.size > 0) ? a.data[a.size - 1].value : p_cuts_->min_vals_.HostVector()[fid];
     // this must be bigger than last value in a scale
     const bst_float last = cpt + (fabs(cpt) + 1e-5);
-    p_cuts_->cut_values_.push_back(last);
+    p_cuts_->cut_values_.HostVector().push_back(last);
 
     // Ensure that every feature gets at least one quantile point
-    CHECK_LE(p_cuts_->cut_values_.size(), std::numeric_limits<uint32_t>::max());
-    auto cut_size = static_cast<uint32_t>(p_cuts_->cut_values_.size());
-    CHECK_GT(cut_size, p_cuts_->cut_ptrs_.back());
-    p_cuts_->cut_ptrs_.push_back(cut_size);
+    CHECK_LE(p_cuts_->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max());
+    auto cut_size = static_cast<uint32_t>(p_cuts_->cut_values_.HostVector().size());
+    CHECK_GT(cut_size, p_cuts_->cut_ptrs_.HostVector().back());
+    p_cuts_->cut_ptrs_.HostVector().push_back(cut_size);
   }
   monitor_.Stop(__func__);
 }
@@ -44,17 +44,35 @@ class HistogramCuts {
   using BinIdx = uint32_t;
   common::Monitor monitor_;
 
-  std::vector<bst_float> cut_values_;
-  std::vector<uint32_t> cut_ptrs_;
-  std::vector<float> min_vals_;  // storing minimum value in a sketch set.
 
  public:
+  HostDeviceVector<bst_float> cut_values_;
+  HostDeviceVector<uint32_t> cut_ptrs_;
+  HostDeviceVector<float> min_vals_;  // storing minimum value in a sketch set.
+
   HistogramCuts();
-  HistogramCuts(HistogramCuts const& that) = delete;
+  HistogramCuts(HistogramCuts const& that) {
+    cut_values_.Resize(that.cut_values_.Size());
+    cut_ptrs_.Resize(that.cut_ptrs_.Size());
+    min_vals_.Resize(that.min_vals_.Size());
+    cut_values_.Copy(that.cut_values_);
+    cut_ptrs_.Copy(that.cut_ptrs_);
+    min_vals_.Copy(that.min_vals_);
+  }
+
   HistogramCuts(HistogramCuts&& that) noexcept(true) {
     *this = std::forward<HistogramCuts&&>(that);
   }
-  HistogramCuts& operator=(HistogramCuts const& that) = delete;
+  HistogramCuts& operator=(HistogramCuts const& that) {
+    cut_values_.Resize(that.cut_values_.Size());
+    cut_ptrs_.Resize(that.cut_ptrs_.Size());
+    min_vals_.Resize(that.min_vals_.Size());
+    cut_values_.Copy(that.cut_values_);
+    cut_ptrs_.Copy(that.cut_ptrs_);
+    min_vals_.Copy(that.min_vals_);
+    return *this;
+  }
+
   HistogramCuts& operator=(HistogramCuts&& that) noexcept(true) {
     monitor_ = std::move(that.monitor_);
     cut_ptrs_ = std::move(that.cut_ptrs_);
@@ -67,28 +85,30 @@ class HistogramCuts {
   void Build(DMatrix* dmat, uint32_t const max_num_bins);
   /* \brief How many bins a feature has. */
   uint32_t FeatureBins(uint32_t feature) const {
-    return cut_ptrs_.at(feature+1) - cut_ptrs_[feature];
+    return cut_ptrs_.ConstHostVector().at(feature + 1) -
+           cut_ptrs_.ConstHostVector()[feature];
   }
 
   // Getters. Cuts should be of no use after building histogram indices, but currently
   // it's deeply linked with quantile_hist, gpu sketcher and gpu_hist. So we preserve
   // these for now.
-  std::vector<uint32_t> const& Ptrs() const { return cut_ptrs_; }
-  std::vector<float> const& Values() const { return cut_values_; }
-  std::vector<float> const& MinValues() const { return min_vals_; }
+  std::vector<uint32_t> const& Ptrs() const { return cut_ptrs_.ConstHostVector(); }
+  std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
+  std::vector<float> const& MinValues() const { return min_vals_.ConstHostVector(); }
 
-  size_t TotalBins() const { return cut_ptrs_.back(); }
+  size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
 
   // Return the index of a cut point that is strictly greater than the input
   // value, or the last available index if none exists
   BinIdx SearchBin(float value, uint32_t column_id) const {
-    auto beg = cut_ptrs_.at(column_id);
-    auto end = cut_ptrs_.at(column_id + 1);
-    auto it = std::upper_bound(cut_values_.cbegin() + beg, cut_values_.cbegin() + end, value);
-    if (it == cut_values_.cend()) {
-      it = cut_values_.cend() - 1;
+    auto beg = cut_ptrs_.ConstHostVector().at(column_id);
+    auto end = cut_ptrs_.ConstHostVector().at(column_id + 1);
+    const auto &values = cut_values_.ConstHostVector();
+    auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
+    if (it == values.cend()) {
+      it = values.cend() - 1;
     }
-    BinIdx idx = it - cut_values_.cbegin();
+    BinIdx idx = it - values.cbegin();
     return idx;
   }
 
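FeatureBins, the getters and SearchBin above now go through ConstHostVector(), but the binning logic itself is unchanged: cut_ptrs_ partitions cut_values_ per feature and the bin is the first cut strictly greater than the input value. A self-contained host-side sketch of that lookup, with plain std::vector standing in for the HostDeviceVector members:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// cut_ptrs partitions cut_values per feature; returns the index of the first
// cut strictly greater than `value`, clamped to the last available cut.
uint32_t SearchBin(const std::vector<uint32_t>& cut_ptrs,
                   const std::vector<float>& cut_values,
                   float value, uint32_t column_id) {
  auto beg = cut_ptrs.at(column_id);
  auto end = cut_ptrs.at(column_id + 1);
  auto it = std::upper_bound(cut_values.cbegin() + beg,
                             cut_values.cbegin() + end, value);
  if (it == cut_values.cend()) {
    it = cut_values.cend() - 1;
  }
  return static_cast<uint32_t>(it - cut_values.cbegin());
}

int main() {
  // One feature with cuts {0.5, 1.5, 2.5}, so cut_ptrs = {0, 3}.
  std::vector<uint32_t> ptrs{0, 3};
  std::vector<float> vals{0.5f, 1.5f, 2.5f};
  assert(SearchBin(ptrs, vals, 1.0f, 0) == 1);   // falls between 0.5 and 1.5
  assert(SearchBin(ptrs, vals, 99.0f, 0) == 2);  // clamped to the last cut
}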
@@ -133,8 +153,8 @@ class CutsBuilder {
     size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
     for (size_t i = 1; i < required_cuts; ++i) {
       bst_float cpt = summary.data[i].value;
-      if (i == 1 || cpt > p_cuts_->cut_values_.back()) {
-        p_cuts_->cut_values_.push_back(cpt);
+      if (i == 1 || cpt > p_cuts_->cut_values_.ConstHostVector().back()) {
+        p_cuts_->cut_values_.HostVector().push_back(cpt);
       }
     }
   }
@@ -371,6 +371,7 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<GradientPair>;
 template class HostDeviceVector<int32_t>;   // bst_node_t
+template class HostDeviceVector<uint8_t>;
 template class HostDeviceVector<Entry>;
 template class HostDeviceVector<uint64_t>;  // bst_row_t
 template class HostDeviceVector<uint32_t>;  // bst_feature_t
@@ -13,11 +13,24 @@ class EllpackPageImpl {};
 EllpackPage::EllpackPage() = default;
 
 EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param) {
-  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
 }
 
 EllpackPage::~EllpackPage() {
-  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
+}
+
+void EllpackPage::SetBaseRowId(size_t row_id) {
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
+}
+
+size_t EllpackPage::Size() const {
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
+  return 0;
 }
 
 }  // namespace xgboost
@@ -4,9 +4,9 @@
 
 #include <xgboost/data.h>
 
-#include "./ellpack_page.cuh"
 #include "../common/hist_util.h"
 #include "../common/random.h"
+#include "./ellpack_page.cuh"
 
 namespace xgboost {
 
@@ -17,13 +17,9 @@ EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param)
 
 EllpackPage::~EllpackPage() = default;
 
-size_t EllpackPage::Size() const {
-  return impl_->Size();
-}
+size_t EllpackPage::Size() const { return impl_->Size(); }
 
-void EllpackPage::SetBaseRowId(size_t row_id) {
-  impl_->SetBaseRowId(row_id);
-}
+void EllpackPage::SetBaseRowId(size_t row_id) { impl_->SetBaseRowId(row_id); }
 
 // Bin each input data entry, store the bin indices in compressed form.
 __global__ void CompressBinEllpackKernel(
@@ -65,16 +61,18 @@ __global__ void CompressBinEllpackKernel(
 }
 
 // Construct an ELLPACK matrix with the given number of empty rows.
-EllpackPageImpl::EllpackPageImpl(int device, EllpackInfo info, size_t n_rows) {
+EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
+                                 bool is_dense, size_t row_stride,
+                                 size_t n_rows)
+    : is_dense(is_dense),
+      cuts_(std::move(cuts)),
+      row_stride(row_stride),
+      n_rows(n_rows) {
   monitor_.Init("ellpack_page");
   dh::safe_cuda(cudaSetDevice(device));
 
-  matrix.info = info;
-  matrix.base_rowid = 0;
-  matrix.n_rows = n_rows;
-
   monitor_.StartCuda("InitCompressedData");
-  InitCompressedData(device, n_rows);
+  InitCompressedData(device);
   monitor_.StopCuda("InitCompressedData");
 }
 
@@ -93,33 +91,27 @@ size_t GetRowStride(DMatrix* dmat) {
 }
 
 // Construct an ELLPACK matrix in memory.
-EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) {
+EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
+    : is_dense(dmat->IsDense()) {
   monitor_.Init("ellpack_page");
   dh::safe_cuda(cudaSetDevice(param.gpu_id));
 
-  matrix.n_rows = dmat->Info().num_row_;
+  n_rows = dmat->Info().num_row_;
 
   monitor_.StartCuda("Quantiles");
   // Create the quantile sketches for the dmatrix and initialize HistogramCuts.
-  size_t row_stride = GetRowStride(dmat);
-  auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
+  row_stride = GetRowStride(dmat);
+  cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
                                    param.gpu_batch_nrows);
   monitor_.StopCuda("Quantiles");
 
-  monitor_.StartCuda("InitEllpackInfo");
-  InitInfo(param.gpu_id, dmat->IsDense(), row_stride, cuts);
-  monitor_.StopCuda("InitEllpackInfo");
-
   monitor_.StartCuda("InitCompressedData");
-  InitCompressedData(param.gpu_id, dmat->Info().num_row_);
+  InitCompressedData(param.gpu_id);
   monitor_.StopCuda("InitCompressedData");
 
   monitor_.StartCuda("BinningCompression");
-  DeviceHistogramBuilderState hist_builder_row_state(dmat->Info().num_row_);
   for (const auto& batch : dmat->GetBatches<SparsePage>()) {
-    hist_builder_row_state.BeginBatch(batch);
-    CreateHistIndices(param.gpu_id, batch, hist_builder_row_state.GetRowStateOnDevice());
-    hist_builder_row_state.EndBatch();
+    CreateHistIndices(param.gpu_id, batch);
   }
   monitor_.StopCuda("BinningCompression");
 }
@@ -133,23 +125,26 @@ struct CopyPage {
   size_t offset;
 
   CopyPage(EllpackPageImpl* dst, EllpackPageImpl* src, size_t offset)
-      : cbw{dst->matrix.info.NumSymbols()},
-        dst_data_d{dst->gidx_buffer.data()},
-        src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()},
+      : cbw{dst->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.DevicePointer()},
+        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
         offset(offset) {}
 
   __device__ void operator()(size_t element_id) {
-    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset);
+    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id],
+                          element_id + offset);
   }
 };
 
 // Copy the data from the given EllpackPage to the current page.
 size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
   monitor_.StartCuda("Copy");
-  size_t num_elements = page->matrix.n_rows * page->matrix.info.row_stride;
-  CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride);
-  CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols());
-  CHECK_GE(matrix.n_rows * matrix.info.row_stride, offset + num_elements);
+  size_t num_elements = page->n_rows * page->row_stride;
+  CHECK_EQ(row_stride, page->row_stride);
+  CHECK_EQ(NumSymbols(), page->NumSymbols());
+  CHECK_GE(n_rows * row_stride, offset + num_elements);
+  gidx_buffer.SetDevice(device);
+  page->gidx_buffer.SetDevice(device);
   dh::LaunchN(device, num_elements, CopyPage(this, page, offset));
   monitor_.StopCuda("Copy");
   return num_elements;
@@ -160,26 +155,29 @@ struct CompactPage {
   common::CompressedBufferWriter cbw;
   common::CompressedByteT* dst_data_d;
   common::CompressedIterator<uint32_t> src_iterator_d;
-  /*! \brief An array that maps the rows from the full DMatrix to the compacted page.
+  /*! \brief An array that maps the rows from the full DMatrix to the compacted
+   * page.
    *
-   * The total size is the number of rows in the original, uncompacted DMatrix. Elements are the
-   * row ids in the compacted page. Rows not needed are set to SIZE_MAX.
+   * The total size is the number of rows in the original, uncompacted DMatrix.
+   * Elements are the row ids in the compacted page. Rows not needed are set to
+   * SIZE_MAX.
    *
    * An example compacting 16 rows to 8 rows:
-   * [SIZE_MAX, 0, 1, SIZE_MAX, SIZE_MAX, 2, SIZE_MAX, 3, 4, 5, SIZE_MAX, 6, SIZE_MAX, 7, SIZE_MAX,
-   * SIZE_MAX]
+   * [SIZE_MAX, 0, 1, SIZE_MAX, SIZE_MAX, 2, SIZE_MAX, 3, 4, 5, SIZE_MAX, 6,
+   * SIZE_MAX, 7, SIZE_MAX, SIZE_MAX]
    */
   common::Span<size_t> row_indexes;
   size_t base_rowid;
   size_t row_stride;
 
-  CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src, common::Span<size_t> row_indexes)
-      : cbw{dst->matrix.info.NumSymbols()},
-        dst_data_d{dst->gidx_buffer.data()},
-        src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()},
+  CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src,
+              common::Span<size_t> row_indexes)
+      : cbw{dst->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.DevicePointer()},
+        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
         row_indexes(row_indexes),
-        base_rowid{src->matrix.base_rowid},
-        row_stride{src->matrix.info.row_stride} {}
+        base_rowid{src->base_rowid},
+        row_stride{src->row_stride} {}
 
   __device__ void operator()(size_t row_id) {
     size_t src_row = base_rowid + row_id;
@@ -188,100 +186,72 @@ struct CompactPage {
     size_t dst_offset = dst_row * row_stride;
     size_t src_offset = row_id * row_stride;
     for (size_t j = 0; j < row_stride; j++) {
-      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], dst_offset + j);
+      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j],
+                            dst_offset + j);
     }
   }
 };
 
 // Compacts the data from the given EllpackPage into the current page.
-void EllpackPageImpl::Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes) {
+void EllpackPageImpl::Compact(int device, EllpackPageImpl* page,
+                              common::Span<size_t> row_indexes) {
   monitor_.StartCuda("Compact");
-  CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride);
-  CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols());
-  CHECK_LE(page->matrix.base_rowid + page->matrix.n_rows, row_indexes.size());
-  dh::LaunchN(device, page->matrix.n_rows, CompactPage(this, page, row_indexes));
+  CHECK_EQ(row_stride, page->row_stride);
+  CHECK_EQ(NumSymbols(), page->NumSymbols());
+  CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
+  gidx_buffer.SetDevice(device);
+  page->gidx_buffer.SetDevice(device);
+  dh::LaunchN(device, page->n_rows, CompactPage(this, page, row_indexes));
   monitor_.StopCuda("Compact");
 }
 
-// Construct an EllpackInfo based on histogram cuts of features.
-EllpackInfo::EllpackInfo(int device,
-                         bool is_dense,
-                         size_t row_stride,
-                         const common::HistogramCuts& hmat,
-                         dh::BulkAllocator* ba)
-    : is_dense(is_dense), row_stride(row_stride), n_bins(hmat.Ptrs().back()) {
-
-  ba->Allocate(device,
-               &feature_segments, hmat.Ptrs().size(),
-               &gidx_fvalue_map, hmat.Values().size(),
-               &min_fvalue, hmat.MinValues().size());
-  dh::CopyVectorToDeviceSpan(gidx_fvalue_map, hmat.Values());
-  dh::CopyVectorToDeviceSpan(min_fvalue, hmat.MinValues());
-  dh::CopyVectorToDeviceSpan(feature_segments, hmat.Ptrs());
-}
-
-// Initialize the EllpackInfo for this page.
-void EllpackPageImpl::InitInfo(int device,
-                               bool is_dense,
-                               size_t row_stride,
-                               const common::HistogramCuts& hmat) {
-  matrix.info = EllpackInfo(device, is_dense, row_stride, hmat, &ba_);
-}
-
 // Initialize the buffer to stored compressed features.
-void EllpackPageImpl::InitCompressedData(int device, size_t num_rows) {
-  size_t num_symbols = matrix.info.NumSymbols();
+void EllpackPageImpl::InitCompressedData(int device) {
+  size_t num_symbols = NumSymbols();
 
   // Required buffer size for storing data matrix in ELLPack format.
-  size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
-      matrix.info.row_stride * num_rows, num_symbols);
-  ba_.Allocate(device, &gidx_buffer, compressed_size_bytes);
-  thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);
-  matrix.gidx_iter = common::CompressedIterator<uint32_t>(gidx_buffer.data(), num_symbols);
+  size_t compressed_size_bytes =
+      common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
+                                                           num_symbols);
+  gidx_buffer.SetDevice(device);
+  // Don't call fill unnecessarily
+  if (gidx_buffer.Size() == 0) {
+    gidx_buffer.Resize(compressed_size_bytes, 0);
+  } else {
+    gidx_buffer.Resize(compressed_size_bytes, 0);
+    thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);
+  }
 }
 
 // Compress a CSR page into ELLPACK.
 void EllpackPageImpl::CreateHistIndices(int device,
-                                        const SparsePage& row_batch,
-                                        const RowStateOnDevice& device_row_state) {
-  // Has any been allocated for me in this batch?
-  if (!device_row_state.rows_to_process_from_batch) return;
-
-  unsigned int null_gidx_value = matrix.info.n_bins;
-  size_t row_stride = matrix.info.row_stride;
+                                        const SparsePage& row_batch) {
+  if (row_batch.Size() == 0) return;
+  unsigned int null_gidx_value = NumSymbols() - 1;
 
   const auto& offset_vec = row_batch.offset.ConstHostVector();
 
   // bin and compress entries in batches of rows
-  size_t gpu_batch_nrows = std::min(
-      dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
-      static_cast<size_t>(device_row_state.rows_to_process_from_batch));
+  size_t gpu_batch_nrows =
+      std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
+               static_cast<size_t>(row_batch.Size()));
   const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
 
-  size_t gpu_nbatches = common::DivRoundUp(device_row_state.rows_to_process_from_batch,
-                                           gpu_batch_nrows);
+  size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows);
 
   for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
     size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
-    size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows;
-    if (batch_row_end > device_row_state.rows_to_process_from_batch) {
-      batch_row_end = device_row_state.rows_to_process_from_batch;
-    }
+    size_t batch_row_end =
+        std::min((gpu_batch + 1) * gpu_batch_nrows, row_batch.Size());
     size_t batch_nrows = batch_row_end - batch_row_begin;
 
-    const auto ent_cnt_begin =
-        offset_vec[device_row_state.row_offset_in_current_batch + batch_row_begin];
-    const auto ent_cnt_end =
-        offset_vec[device_row_state.row_offset_in_current_batch + batch_row_end];
+    const auto ent_cnt_begin = offset_vec[batch_row_begin];
+    const auto ent_cnt_end = offset_vec[batch_row_end];
 
     /*! \brief row offset in SparsePage (the input data). */
     dh::device_vector<size_t> row_ptrs(batch_nrows + 1);
-    thrust::copy(
-        offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_begin,
-        offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_end + 1,
-        row_ptrs.begin());
+    thrust::copy(offset_vec.data() + batch_row_begin,
+                 offset_vec.data() + batch_row_end + 1, row_ptrs.begin());
 
     // number of entries in this batch.
     size_t n_entries = ent_cnt_end - ent_cnt_begin;
@@ -289,97 +259,50 @@ void EllpackPageImpl::CreateHistIndices(int device,
     // copy data entries to device.
     dh::safe_cuda(cudaMemcpy(entries_d.data().get(),
                              data_vec.data() + ent_cnt_begin,
-                             n_entries * sizeof(Entry),
-                             cudaMemcpyDefault));
+                             n_entries * sizeof(Entry), cudaMemcpyDefault));
     const dim3 block3(32, 8, 1);  // 256 threads
     const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
-                     common::DivRoundUp(row_stride, block3.y),
-                     1);
+                     common::DivRoundUp(row_stride, block3.y), 1);
+    auto device_accessor = GetDeviceAccessor(device);
     dh::LaunchKernel {grid3, block3}(
-        CompressBinEllpackKernel,
-        common::CompressedBufferWriter(matrix.info.NumSymbols()),
-        gidx_buffer.data(),
-        row_ptrs.data().get(),
-        entries_d.data().get(),
-        matrix.info.gidx_fvalue_map.data(),
-        matrix.info.feature_segments.data(),
-        device_row_state.total_rows_processed + batch_row_begin,
-        batch_nrows,
-        row_stride,
+        CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()),
+        gidx_buffer.DevicePointer(), row_ptrs.data().get(),
+        entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
+        device_accessor.feature_segments.data(),
+        row_batch.base_rowid + batch_row_begin, batch_nrows, row_stride,
         null_gidx_value);
   }
 }
 
 // Return the number of rows contained in this page.
-size_t EllpackPageImpl::Size() const {
-  return matrix.n_rows;
-}
-
-// Clear the current page.
-void EllpackPageImpl::Clear() {
-  ba_.Clear();
-  gidx_buffer = {};
-  idx_buffer.clear();
-  sparse_page_.Clear();
-  matrix.base_rowid = 0;
-  matrix.n_rows = 0;
-  device_initialized_ = false;
-}
-
-// Push a CSR page to the current page.
-//
-// The CSR pages are accumulated in memory until they reach a certain size, then written out as
-// compressed ELLPACK.
-void EllpackPageImpl::Push(int device, const SparsePage& batch) {
-  sparse_page_.Push(batch);
-  matrix.n_rows += batch.Size();
-}
-
-// Compress the accumulated SparsePage.
-void EllpackPageImpl::CompressSparsePage(int device) {
-  monitor_.StartCuda("InitCompressedData");
-  InitCompressedData(device, matrix.n_rows);
-  monitor_.StopCuda("InitCompressedData");
-
-  monitor_.StartCuda("BinningCompression");
-  DeviceHistogramBuilderState hist_builder_row_state(matrix.n_rows);
-  hist_builder_row_state.BeginBatch(sparse_page_);
-  CreateHistIndices(device, sparse_page_, hist_builder_row_state.GetRowStateOnDevice());
-  hist_builder_row_state.EndBatch();
-  monitor_.StopCuda("BinningCompression");
-
-  monitor_.StartCuda("CopyDeviceToHost");
-  idx_buffer.resize(gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&idx_buffer, gidx_buffer);
-  ba_.Clear();
-  gidx_buffer = {};
-  monitor_.StopCuda("CopyDeviceToHost");
-}
+size_t EllpackPageImpl::Size() const { return n_rows; }
 
 // Return the memory cost for storing the compressed features.
-size_t EllpackPageImpl::MemCostBytes() const {
-  // Required buffer size for storing data matrix in ELLPack format.
-  size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
-      matrix.info.row_stride * matrix.n_rows, matrix.info.NumSymbols());
+size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
+                                     const common::HistogramCuts& cuts) {
+  // Required buffer size for storing data matrix in ELLPack format.
+  size_t compressed_size_bytes =
+      common::CompressedBufferWriter::CalculateBufferSize(row_stride * num_rows,
+                                                          cuts.TotalBins() + 1);
   return compressed_size_bytes;
 }
 
-// Copy the compressed features to GPU.
-void EllpackPageImpl::InitDevice(int device, EllpackInfo info) {
-  if (device_initialized_) return;
-
-  monitor_.StartCuda("CopyPageToDevice");
-  dh::safe_cuda(cudaSetDevice(device));
-
-  gidx_buffer = {};
-  ba_.Allocate(device, &gidx_buffer, idx_buffer.size());
-  dh::CopyVectorToDeviceSpan(gidx_buffer, idx_buffer);
-
-  matrix.info = info;
-  matrix.gidx_iter = common::CompressedIterator<uint32_t>(gidx_buffer.data(), info.n_bins + 1);
-
-  monitor_.StopCuda("CopyPageToDevice");
-
-  device_initialized_ = true;
+EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(int device) const {
+  gidx_buffer.SetDevice(device);
+  return EllpackDeviceAccessor(
+      device, cuts_, is_dense, row_stride, base_rowid, n_rows,
+      common::CompressedIterator<uint32_t>(gidx_buffer.ConstDevicePointer(),
+                                           NumSymbols()));
+}
+
+EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
+                                 const SparsePage& page, bool is_dense,
+                                 size_t row_stride)
+    : cuts_(std::move(cuts)),
+      is_dense(is_dense),
+      n_rows(page.Size()),
+      row_stride(row_stride) {
+  this->InitCompressedData(device);
+  this->CreateHistIndices(device, page);
 }
 }  // namespace xgboost
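MemCostBytes above sizes the compressed buffer from three quantities that fully determine an ELLPACK page: the number of rows, the row stride, and the number of symbols (total bins plus one reserved for "missing"). The exact padding added by CompressedBufferWriter::CalculateBufferSize is not shown in this diff, so the following is only a rough back-of-the-envelope estimate of the same arithmetic:

#include <cmath>
#include <cstddef>
#include <iostream>

// Approximate ELLPACK buffer size: every row stores `row_stride` symbols and
// each symbol needs ceil(log2(num_symbols)) bits. Alignment/padding that the
// real CalculateBufferSize adds is ignored here.
size_t ApproxEllpackBytes(size_t n_rows, size_t row_stride, size_t num_symbols) {
  size_t bits_per_symbol =
      static_cast<size_t>(std::ceil(std::log2(static_cast<double>(num_symbols))));
  size_t total_bits = n_rows * row_stride * bits_per_symbol;
  return (total_bits + 7) / 8;
}

int main() {
  // 1M rows, 50 features per row, 256 bins per feature, plus 1 missing symbol:
  // 12801 symbols -> 14 bits each -> roughly 87.5 MB.
  std::cout << ApproxEllpackBytes(1000000, 50, 256 * 50 + 1) << " bytes\n";
}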
@@ -40,71 +40,53 @@ __forceinline__ __device__ int BinarySearchRow(
   return -1;
 }
 
-/** \brief Meta information about the ELLPACK matrix. */
-struct EllpackInfo {
+/** \brief Struct for accessing and manipulating an ellpack matrix on the
+ * device. Does not own underlying memory and may be trivially copied into
+ * kernels.*/
+struct EllpackDeviceAccessor {
   /*! \brief Whether or not if the matrix is dense. */
   bool is_dense;
   /*! \brief Row length for ELLPack, equal to number of features. */
   size_t row_stride;
-  /*! \brief Total number of bins, also used as the null index value, . */
-  size_t n_bins;
-  /*! \brief Minimum value for each feature. Size equals to number of features. */
-  common::Span<bst_float> min_fvalue;
-  /*! \brief Histogram cut pointers. Size equals to (number of features + 1). */
-  common::Span<uint32_t> feature_segments;
-  /*! \brief Histogram cut values. Size equals to (bins per feature * number of features). */
-  common::Span<bst_float> gidx_fvalue_map;
-
-  EllpackInfo() = default;
-
-  /*!
-   * \brief Constructor.
-   *
-   * @param device The GPU device to use.
-   * @param is_dense Whether the matrix is dense.
-   * @param row_stride The number of features between starts of consecutive rows.
-   * @param hmat The histogram cuts of all the features.
-   * @param ba The BulkAllocator that owns the GPU memory.
-   */
-  explicit EllpackInfo(int device,
-                       bool is_dense,
-                       size_t row_stride,
-                       const common::HistogramCuts& hmat,
-                       dh::BulkAllocator* ba);
-
-  /*! \brief Return the total number of symbols (total number of bins plus 1 for not found). */
-  size_t NumSymbols() const {
-    return n_bins + 1;
-  }
-  size_t NumFeatures() const {
-    return min_fvalue.size();
-  }
-};
-
-/** \brief Struct for accessing and manipulating an ellpack matrix on the
- * device. Does not own underlying memory and may be trivially copied into
- * kernels.*/
-struct EllpackMatrix {
-  EllpackInfo info;
   size_t base_rowid{};
   size_t n_rows{};
   common::CompressedIterator<uint32_t> gidx_iter;
+  /*! \brief Minimum value for each feature. Size equals to number of features. */
+  common::Span<const bst_float> min_fvalue;
+  /*! \brief Histogram cut pointers. Size equals to (number of features + 1). */
+  common::Span<const uint32_t> feature_segments;
+  /*! \brief Histogram cut values. Size equals to (bins per feature * number of features). */
+  common::Span<const bst_float> gidx_fvalue_map;
+
+  EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts,
+                        bool is_dense, size_t row_stride, size_t base_rowid,
+                        size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter)
+      : is_dense(is_dense),
+        row_stride(row_stride),
+        base_rowid(base_rowid),
+        n_rows(n_rows) ,gidx_iter(gidx_iter){
+    cuts.cut_values_.SetDevice(device);
+    cuts.cut_ptrs_.SetDevice(device);
+    cuts.min_vals_.SetDevice(device);
+    gidx_fvalue_map = cuts.cut_values_.ConstDeviceSpan();
+    feature_segments = cuts.cut_ptrs_.ConstDeviceSpan();
+    min_fvalue = cuts.min_vals_.ConstDeviceSpan();
+  }
   // Get a matrix element, uses binary search for look up Return NaN if missing
   // Given a row index and a feature index, returns the corresponding cut value
   __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
     ridx -= base_rowid;
-    auto row_begin = info.row_stride * ridx;
-    auto row_end = row_begin + info.row_stride;
+    auto row_begin = row_stride * ridx;
+    auto row_end = row_begin + row_stride;
     auto gidx = -1;
-    if (info.is_dense) {
+    if (is_dense) {
       gidx = gidx_iter[row_begin + fidx];
     } else {
       gidx = BinarySearchRow(row_begin,
                              row_end,
                              gidx_iter,
-                             info.feature_segments[fidx],
-                             info.feature_segments[fidx + 1]);
+                             feature_segments[fidx],
+                             feature_segments[fidx + 1]);
     }
     return gidx;
   }
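GetBinIndex is the core lookup of the accessor: a dense page stores one symbol per feature, so the bin can be read directly at row_begin + fidx, while a sparse row stores only the bins that are present (padded with a null symbol) and the feature has to be located by searching for a bin that falls inside its cut-pointer range. The device-side BinarySearchRow itself is not shown in this hunk; the following is an equivalent host-side sketch over plain vectors, an illustration only:

#include <cstdint>
#include <vector>

// Host-side sketch of the accessor's lookup. `gidx` holds one row of symbols,
// sorted ascending for sparse rows (unused slots padded with the null value).
int32_t GetBinIndex(const std::vector<uint32_t>& gidx, size_t row_stride,
                    bool is_dense, const std::vector<uint32_t>& feature_segments,
                    size_t ridx, size_t fidx) {
  size_t row_begin = row_stride * ridx;
  size_t row_end = row_begin + row_stride;
  if (is_dense) {
    return static_cast<int32_t>(gidx[row_begin + fidx]);
  }
  uint32_t lo = feature_segments[fidx], hi = feature_segments[fidx + 1];
  size_t left = row_begin, right = row_end;
  while (left < right) {  // binary search for a bin in [lo, hi)
    size_t mid = (left + right) / 2;
    if (gidx[mid] < lo) {
      left = mid + 1;
    } else if (gidx[mid] >= hi) {
      right = mid;
    } else {
      return static_cast<int32_t>(gidx[mid]);
    }
  }
  return -1;  // feature missing in this row
}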
@@ -113,97 +95,27 @@ struct EllpackMatrix {
     if (gidx == -1) {
       return nan("");
     }
-    return info.gidx_fvalue_map[gidx];
+    return gidx_fvalue_map[gidx];
   }
 
   // Check if the row id is withing range of the current batch.
   __device__ bool IsInRange(size_t row_id) const {
     return row_id >= base_rowid && row_id < base_rowid + n_rows;
   }
+  /*! \brief Return the total number of symbols (total number of bins plus 1 for
+   * not found). */
+  size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
+
+  size_t NullValue() const { return gidx_fvalue_map.size(); }
+
+  XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
+
+  XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
 };
 
-// Instances of this type are created while creating the histogram bins for the
-// entire dataset across multiple sparse page batches. This keeps track of the number
-// of rows to process from a batch and the position from which to process on each device.
-struct RowStateOnDevice {
-  // Number of rows assigned to this device
-  size_t total_rows_assigned_to_device;
-  // Number of rows processed thus far
-  size_t total_rows_processed;
-  // Number of rows to process from the current sparse page batch
-  size_t rows_to_process_from_batch;
-  // Offset from the current sparse page batch to begin processing
-  size_t row_offset_in_current_batch;
-
-  explicit RowStateOnDevice(size_t total_rows)
-    : total_rows_assigned_to_device(total_rows), total_rows_processed(0),
-      rows_to_process_from_batch(0), row_offset_in_current_batch(0) {
-  }
-
-  explicit RowStateOnDevice(size_t total_rows, size_t batch_rows)
-    : total_rows_assigned_to_device(total_rows), total_rows_processed(0),
-      rows_to_process_from_batch(batch_rows), row_offset_in_current_batch(0) {
-  }
-
-  // Advance the row state by the number of rows processed
-  void Advance() {
-    total_rows_processed += rows_to_process_from_batch;
-    CHECK_LE(total_rows_processed, total_rows_assigned_to_device);
-    rows_to_process_from_batch = row_offset_in_current_batch = 0;
-  }
-};
-
-// An instance of this type is created which keeps track of total number of rows to process,
-// rows processed thus far, rows to process and the offset from the current sparse page batch
-// to begin processing on each device
-class DeviceHistogramBuilderState {
- public:
-  explicit DeviceHistogramBuilderState(size_t n_rows) : device_row_state_(n_rows) {}
-
-  const RowStateOnDevice& GetRowStateOnDevice() const {
-    return device_row_state_;
-  }
-
-  // This method is invoked at the beginning of each sparse page batch. This distributes
-  // the rows in the sparse page to the device.
-  // TODO(sriramch): Think of a way to utilize *all* the GPUs to build the compressed bins.
-  void BeginBatch(const SparsePage &batch) {
-    size_t rem_rows = batch.Size();
-    size_t row_offset_in_current_batch = 0;
-
-    // Do we have anymore left to process from this batch on this device?
-    if (device_row_state_.total_rows_assigned_to_device > device_row_state_.total_rows_processed) {
-      // There are still some rows that needs to be assigned to this device
-      device_row_state_.rows_to_process_from_batch =
-        std::min(
-          device_row_state_.total_rows_assigned_to_device - device_row_state_.total_rows_processed,
-          rem_rows);
-    } else {
-      // All rows have been assigned to this device
-      device_row_state_.rows_to_process_from_batch = 0;
-    }
-
-    device_row_state_.row_offset_in_current_batch = row_offset_in_current_batch;
-    row_offset_in_current_batch += device_row_state_.rows_to_process_from_batch;
-    rem_rows -= device_row_state_.rows_to_process_from_batch;
-  }
-
-  // This method is invoked after completion of each sparse page batch
-  void EndBatch() {
-    device_row_state_.Advance();
-  }
-
- private:
-  RowStateOnDevice device_row_state_{0};
-};
-
 class EllpackPageImpl {
  public:
-  EllpackMatrix matrix;
-  /*! \brief global index of histogram, which is stored in ELLPack format. */
-  common::Span<common::CompressedByteT> gidx_buffer;
-  std::vector<common::CompressedByteT> idx_buffer;
-
   /*!
    * \brief Default constructor.
    *
@@ -218,7 +130,12 @@ class EllpackPageImpl {
    * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
    * and the given number of rows.
    */
-  explicit EllpackPageImpl(int device, EllpackInfo info, size_t n_rows);
+  EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense,
+                  size_t row_stride, size_t n_rows);
+
+  EllpackPageImpl(int device, common::HistogramCuts cuts,
+                  const SparsePage& page,
+                  bool is_dense,size_t row_stride);
 
   /*!
    * \brief Constructor from an existing DMatrix.
@ -245,77 +162,53 @@ class EllpackPageImpl {
|
|||||||
*/
|
*/
|
||||||
void Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes);
|
void Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes);
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief Initialize the EllpackInfo contained in the EllpackMatrix.
|
|
||||||
*
|
|
||||||
* This is used in the in-memory case. The current page owns the BulkAllocator, which in turn owns
|
|
||||||
* the GPU memory used by the EllpackInfo.
|
|
||||||
*
|
|
||||||
* @param device The GPU device to use.
|
|
||||||
* @param is_dense Whether the matrix is dense.
|
|
||||||
* @param row_stride The number of features between starts of consecutive rows.
|
|
||||||
* @param hmat The histogram cuts of all the features.
|
|
||||||
*/
|
|
||||||
void InitInfo(int device, bool is_dense, size_t row_stride, const common::HistogramCuts& hmat);
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief Initialize the buffer to store compressed features.
|
|
||||||
*
|
|
||||||
* @param device The GPU device to use.
|
|
||||||
* @param num_rows The number of rows we are storing in the buffer.
|
|
||||||
*/
|
|
||||||
void InitCompressedData(int device, size_t num_rows);
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief Compress a single page of CSR data into ELLPACK.
|
|
||||||
*
|
|
||||||
* @param device The GPU device to use.
|
|
||||||
* @param row_batch The CSR page.
|
|
||||||
* @param device_row_state On-device data for maintaining state.
|
|
||||||
*/
|
|
||||||
void CreateHistIndices(int device,
|
|
||||||
const SparsePage& row_batch,
|
|
||||||
const RowStateOnDevice& device_row_state);
|
|
||||||
|
|
||||||
/*! \return Number of instances in the page. */
|
/*! \return Number of instances in the page. */
|
||||||
size_t Size() const;
|
size_t Size() const;
|
||||||
|
|
||||||
/*! \brief Set the base row id for this page. */
|
/*! \brief Set the base row id for this page. */
|
||||||
inline void SetBaseRowId(size_t row_id) {
|
void SetBaseRowId(size_t row_id) {
|
||||||
matrix.base_rowid = row_id;
|
base_rowid = row_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! \brief clear the page. */
|
|
||||||
void Clear();
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief Push a sparse page.
|
|
||||||
* \param batch The row page.
|
|
||||||
*/
|
|
||||||
void Push(int device, const SparsePage& batch);
|
|
||||||
|
|
||||||
/*! \return Estimation of memory cost of this page. */
|
/*! \return Estimation of memory cost of this page. */
|
||||||
size_t MemCostBytes() const;
|
static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief Copy the ELLPACK matrix to GPU.
|
|
||||||
*
|
|
||||||
* @param device The GPU device to use.
|
|
||||||
* @param info The EllpackInfo for the matrix.
|
|
||||||
*/
|
|
||||||
void InitDevice(int device, EllpackInfo info);
|
|
||||||
|
|
||||||
/*! \brief Compress the accumulated SparsePage into ELLPACK format.
|
/*! \brief Return the total number of symbols (total number of bins plus 1 for
|
||||||
*
|
* not found). */
|
||||||
* @param device The GPU device to use.
|
size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
|
||||||
*/
|
|
||||||
void CompressSparsePage(int device);
|
EllpackDeviceAccessor GetDeviceAccessor(int device) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
/*!
|
||||||
|
* \brief Compress a single page of CSR data into ELLPACK.
|
||||||
|
*
|
||||||
|
* @param device The GPU device to use.
|
||||||
|
* @param row_batch The CSR page.
|
||||||
|
*/
|
||||||
|
void CreateHistIndices(int device,
|
||||||
|
const SparsePage& row_batch
|
||||||
|
);
|
||||||
|
/*!
|
||||||
|
* \brief Initialize the buffer to store compressed features.
|
||||||
|
*/
|
||||||
|
void InitCompressedData(int device);
|
||||||
|
|
||||||
|
|
||||||
|
public:
|
||||||
|
/*! \brief Whether the matrix is dense. */
|
||||||
|
bool is_dense;
|
||||||
|
/*! \brief Row length for ELLPACK. */
|
||||||
|
size_t row_stride;
|
||||||
|
size_t base_rowid{0};
|
||||||
|
size_t n_rows{};
|
||||||
|
/*! \brief Global histogram index, stored in ELLPACK format. */
|
||||||
|
HostDeviceVector<common::CompressedByteT> gidx_buffer;
|
||||||
|
common::HistogramCuts cuts_;
|
||||||
private:
|
private:
|
||||||
common::Monitor monitor_;
|
common::Monitor monitor_;
|
||||||
dh::BulkAllocator ba_;
|
|
||||||
bool device_initialized_{false};
|
|
||||||
SparsePage sparse_page_{};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
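A minimal usage sketch of the reworked page layout above (not part of the diff; `dmat` is a placeholder and the includes are assumptions based on the files touched by this change). Metadata that used to live on EllpackMatrix/EllpackInfo is now read directly from the page, and device code gets a flattened accessor:

#include <xgboost/data.h>
#include "ellpack_page.cuh"

void InspectEllpackPages(xgboost::DMatrix* dmat) {
  // {gpu_id, max_bin, gpu_batch_nrows, gpu_page_size}, as used by the tests below.
  xgboost::BatchParam param{0, 256, 0, 0};
  for (auto& page : dmat->GetBatches<xgboost::EllpackPage>(param)) {
    auto* impl = page.Impl();
    size_t rows    = impl->n_rows;        // shape now lives on the page itself
    size_t stride  = impl->row_stride;
    bool   dense   = impl->is_dense;
    size_t symbols = impl->NumSymbols();  // cuts_.TotalBins() + 1 for the "not found" symbol
    auto   acc     = impl->GetDeviceAccessor(param.gpu_id);  // view handed to device kernels
    (void)rows; (void)stride; (void)dense; (void)symbols; (void)acc;
  }
}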
|
|||||||
@ -17,26 +17,35 @@ class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
|
|||||||
public:
|
public:
|
||||||
bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
|
bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
|
||||||
auto* impl = page->Impl();
|
auto* impl = page->Impl();
|
||||||
impl->Clear();
|
fi->Read(&impl->cuts_.cut_values_.HostVector());
|
||||||
if (!fi->Read(&impl->matrix.n_rows)) return false;
|
fi->Read(&impl->cuts_.cut_ptrs_.HostVector());
|
||||||
return fi->Read(&impl->idx_buffer);
|
fi->Read(&impl->cuts_.min_vals_.HostVector());
|
||||||
|
fi->Read(&impl->n_rows);
|
||||||
|
fi->Read(&impl->is_dense);
|
||||||
|
fi->Read(&impl->row_stride);
|
||||||
|
if (!fi->Read(&impl->gidx_buffer.HostVector())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Read(EllpackPage* page,
|
bool Read(EllpackPage* page,
|
||||||
dmlc::SeekStream* fi,
|
dmlc::SeekStream* fi,
|
||||||
const std::vector<bst_uint>& sorted_index_set) override {
|
const std::vector<bst_uint>& sorted_index_set) override {
|
||||||
auto* impl = page->Impl();
|
LOG(FATAL) << "Not implemented";
|
||||||
impl->Clear();
|
return false;
|
||||||
if (!fi->Read(&impl->matrix.n_rows)) return false;
|
|
||||||
return fi->Read(&page->Impl()->idx_buffer);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Write(const EllpackPage& page, dmlc::Stream* fo) override {
|
void Write(const EllpackPage& page, dmlc::Stream* fo) override {
|
||||||
auto* impl = page.Impl();
|
auto* impl = page.Impl();
|
||||||
fo->Write(impl->matrix.n_rows);
|
fo->Write(impl->cuts_.cut_values_.ConstHostVector());
|
||||||
auto buffer = impl->idx_buffer;
|
fo->Write(impl->cuts_.cut_ptrs_.ConstHostVector());
|
||||||
CHECK(!buffer.empty());
|
fo->Write(impl->cuts_.min_vals_.ConstHostVector());
|
||||||
fo->Write(buffer);
|
fo->Write(impl->n_rows);
|
||||||
|
fo->Write(impl->is_dense);
|
||||||
|
fo->Write(impl->row_stride);
|
||||||
|
CHECK(!impl->gidx_buffer.ConstHostVector().empty());
|
||||||
|
fo->Write(impl->gidx_buffer.HostVector());
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
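For reference, the on-disk record implied by the Write()/Read() pair above, one page per record; this is only a summary of the calls made there, not a separate format spec:

// cut_values   std::vector<float>                    quantile cut values, all features
// cut_ptrs     std::vector<uint32_t>                 per-feature offsets into cut_values
// min_vals     std::vector<float>                    per-feature minimum values
// n_rows       size_t
// is_dense     bool
// row_stride   size_t
// gidx_buffer  std::vector<common::CompressedByteT>  compressed ELLPACK bin indices
// Read() must consume the fields in exactly this order for a page to round-trip.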
|
|
||||||
|
|||||||
@ -2,45 +2,23 @@
|
|||||||
* Copyright 2019 XGBoost contributors
|
* Copyright 2019 XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#ifndef XGBOOST_USE_CUDA
|
#ifndef XGBOOST_USE_CUDA
|
||||||
|
#include <dmlc/base.h>
|
||||||
|
#if DMLC_ENABLE_STD_THREAD
|
||||||
|
|
||||||
#include <xgboost/data.h>
|
|
||||||
#include "ellpack_page_source.h"
|
#include "ellpack_page_source.h"
|
||||||
|
#include <xgboost/data.h>
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace data {
|
namespace data {
|
||||||
|
|
||||||
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
||||||
const std::string& cache_info,
|
const std::string& cache_info,
|
||||||
const BatchParam& param) noexcept(false) {
|
const BatchParam& param) noexcept(false) {
|
||||||
LOG(FATAL) << "Internal Error: "
|
LOG(FATAL)
|
||||||
|
<< "Internal Error: "
|
||||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
||||||
}
|
}
|
||||||
|
|
||||||
void EllpackPageSource::BeforeFirst() {
|
|
||||||
LOG(FATAL) << "Internal Error: "
|
|
||||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
|
||||||
}
|
|
||||||
|
|
||||||
bool EllpackPageSource::Next() {
|
|
||||||
LOG(FATAL) << "Internal Error: "
|
|
||||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
EllpackPage& EllpackPageSource::Value() {
|
|
||||||
LOG(FATAL) << "Internal Error: "
|
|
||||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
|
||||||
EllpackPage* page { nullptr };
|
|
||||||
return *page;
|
|
||||||
}
|
|
||||||
|
|
||||||
const EllpackPage& EllpackPageSource::Value() const {
|
|
||||||
LOG(FATAL) << "Internal Error: "
|
|
||||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
|
||||||
EllpackPage* page { nullptr };
|
|
||||||
return *page;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace data
|
} // namespace data
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
#endif // DMLC_ENABLE_STD_THREAD
|
||||||
#endif // XGBOOST_USE_CUDA
|
#endif // XGBOOST_USE_CUDA
|
||||||
|
|||||||
@ -3,73 +3,16 @@
|
|||||||
*/
|
*/
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "../common/hist_util.h"
|
#include "../common/hist_util.h"
|
||||||
|
|
||||||
|
#include "ellpack_page.cuh"
|
||||||
#include "ellpack_page_source.h"
|
#include "ellpack_page_source.h"
|
||||||
#include "sparse_page_source.h"
|
#include "sparse_page_source.h"
|
||||||
#include "ellpack_page.cuh"
|
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace data {
|
namespace data {
|
||||||
|
|
||||||
class EllpackPageSourceImpl : public DataSource<EllpackPage> {
|
|
||||||
public:
|
|
||||||
/*!
|
|
||||||
* \brief Create source from cache files the cache_prefix.
|
|
||||||
* \param cache_prefix The prefix of cache we want to solve.
|
|
||||||
*/
|
|
||||||
explicit EllpackPageSourceImpl(DMatrix* dmat,
|
|
||||||
const std::string& cache_info,
|
|
||||||
const BatchParam& param) noexcept(false);
|
|
||||||
|
|
||||||
/*! \brief destructor */
|
|
||||||
~EllpackPageSourceImpl() override = default;
|
|
||||||
|
|
||||||
void BeforeFirst() override;
|
|
||||||
bool Next() override;
|
|
||||||
EllpackPage& Value();
|
|
||||||
const EllpackPage& Value() const override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
/*! \brief Write Ellpack pages after accumulating them in memory. */
|
|
||||||
void WriteEllpackPages(DMatrix* dmat, const std::string& cache_info) const;
|
|
||||||
|
|
||||||
/*! \brief The page type string for ELLPACK. */
|
|
||||||
const std::string kPageType_{".ellpack.page"};
|
|
||||||
|
|
||||||
int device_{-1};
|
|
||||||
size_t page_size_{DMatrix::kPageSize};
|
|
||||||
common::Monitor monitor_;
|
|
||||||
dh::BulkAllocator ba_;
|
|
||||||
/*! \brief The EllpackInfo, with the underlying GPU memory shared by all pages. */
|
|
||||||
EllpackInfo ellpack_info_;
|
|
||||||
std::unique_ptr<ExternalMemoryPrefetcher<EllpackPage>> source_;
|
|
||||||
std::string cache_info_;
|
|
||||||
};
|
|
||||||
|
|
||||||
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
|
||||||
const std::string& cache_info,
|
|
||||||
const BatchParam& param) noexcept(false)
|
|
||||||
: impl_{new EllpackPageSourceImpl(dmat, cache_info, param)} {}
|
|
||||||
|
|
||||||
void EllpackPageSource::BeforeFirst() {
|
|
||||||
impl_->BeforeFirst();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool EllpackPageSource::Next() {
|
|
||||||
return impl_->Next();
|
|
||||||
}
|
|
||||||
|
|
||||||
EllpackPage& EllpackPageSource::Value() {
|
|
||||||
return impl_->Value();
|
|
||||||
}
|
|
||||||
|
|
||||||
const EllpackPage& EllpackPageSource::Value() const {
|
|
||||||
return impl_->Value();
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t GetRowStride(DMatrix* dmat) {
|
size_t GetRowStride(DMatrix* dmat) {
|
||||||
if (dmat->IsDense()) return dmat->Info().num_col_;
|
if (dmat->IsDense()) return dmat->Info().num_col_;
|
||||||
|
|
||||||
@ -86,17 +29,19 @@ size_t GetRowStride(DMatrix* dmat) {
|
|||||||
|
|
||||||
// Build the quantile sketch across the whole input data, then use the histogram cuts to compress
|
// Build the quantile sketch across the whole input data, then use the histogram cuts to compress
|
||||||
// each CSR page, and write the accumulated ELLPACK pages to disk.
|
// each CSR page, and write the accumulated ELLPACK pages to disk.
|
||||||
EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
|
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
||||||
const std::string& cache_info,
|
const std::string& cache_info,
|
||||||
const BatchParam& param) noexcept(false)
|
const BatchParam& param) noexcept(false) {
|
||||||
: device_(param.gpu_id), cache_info_(cache_info) {
|
cache_info_ = ParseCacheInfo(cache_info, kPageType_);
|
||||||
|
for (auto file : cache_info_.name_shards) {
|
||||||
|
CheckCacheFileExists(file);
|
||||||
|
}
|
||||||
if (param.gpu_page_size > 0) {
|
if (param.gpu_page_size > 0) {
|
||||||
page_size_ = param.gpu_page_size;
|
page_size_ = param.gpu_page_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
monitor_.Init("ellpack_page_source");
|
monitor_.Init("ellpack_page_source");
|
||||||
dh::safe_cuda(cudaSetDevice(device_));
|
dh::safe_cuda(cudaSetDevice(param.gpu_id));
|
||||||
|
|
||||||
monitor_.StartCuda("Quantiles");
|
monitor_.StartCuda("Quantiles");
|
||||||
size_t row_stride = GetRowStride(dmat);
|
size_t row_stride = GetRowStride(dmat);
|
||||||
@ -104,75 +49,52 @@ EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
|
|||||||
param.gpu_batch_nrows);
|
param.gpu_batch_nrows);
|
||||||
monitor_.StopCuda("Quantiles");
|
monitor_.StopCuda("Quantiles");
|
||||||
|
|
||||||
monitor_.StartCuda("CreateEllpackInfo");
|
|
||||||
ellpack_info_ = EllpackInfo(device_, dmat->IsDense(), row_stride, cuts, &ba_);
|
|
||||||
monitor_.StopCuda("CreateEllpackInfo");
|
|
||||||
|
|
||||||
monitor_.StartCuda("WriteEllpackPages");
|
monitor_.StartCuda("WriteEllpackPages");
|
||||||
WriteEllpackPages(dmat, cache_info);
|
WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride);
|
||||||
monitor_.StopCuda("WriteEllpackPages");
|
monitor_.StopCuda("WriteEllpackPages");
|
||||||
|
|
||||||
source_.reset(new ExternalMemoryPrefetcher<EllpackPage>(
|
external_prefetcher_.reset(
|
||||||
ParseCacheInfo(cache_info_, kPageType_)));
|
new ExternalMemoryPrefetcher<EllpackPage>(cache_info_));
|
||||||
}
|
|
||||||
|
|
||||||
void EllpackPageSourceImpl::BeforeFirst() {
|
|
||||||
source_.reset(new ExternalMemoryPrefetcher<EllpackPage>(
|
|
||||||
ParseCacheInfo(cache_info_, kPageType_)));
|
|
||||||
source_->BeforeFirst();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool EllpackPageSourceImpl::Next() {
|
|
||||||
return source_->Next();
|
|
||||||
}
|
|
||||||
|
|
||||||
EllpackPage& EllpackPageSourceImpl::Value() {
|
|
||||||
EllpackPage& page = source_->Value();
|
|
||||||
page.Impl()->InitDevice(device_, ellpack_info_);
|
|
||||||
return page;
|
|
||||||
}
|
|
||||||
|
|
||||||
const EllpackPage& EllpackPageSourceImpl::Value() const {
|
|
||||||
EllpackPage& page = source_->Value();
|
|
||||||
page.Impl()->InitDevice(device_, ellpack_info_);
|
|
||||||
return page;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compress each CSR page to ELLPACK, and write the accumulated pages to disk.
|
// Compress each CSR page to ELLPACK, and write the accumulated pages to disk.
|
||||||
void EllpackPageSourceImpl::WriteEllpackPages(DMatrix* dmat, const std::string& cache_info) const {
|
void EllpackPageSource::WriteEllpackPages(int device, DMatrix* dmat,
|
||||||
|
const common::HistogramCuts& cuts,
|
||||||
|
const std::string& cache_info,
|
||||||
|
size_t row_stride) const {
|
||||||
auto cinfo = ParseCacheInfo(cache_info, kPageType_);
|
auto cinfo = ParseCacheInfo(cache_info, kPageType_);
|
||||||
const size_t extra_buffer_capacity = 6;
|
const size_t extra_buffer_capacity = 6;
|
||||||
SparsePageWriter<EllpackPage> writer(
|
SparsePageWriter<EllpackPage> writer(cinfo.name_shards, cinfo.format_shards,
|
||||||
cinfo.name_shards, cinfo.format_shards, extra_buffer_capacity);
|
extra_buffer_capacity);
|
||||||
std::shared_ptr<EllpackPage> page;
|
std::shared_ptr<EllpackPage> page;
|
||||||
|
SparsePage temp_host_page;
|
||||||
writer.Alloc(&page);
|
writer.Alloc(&page);
|
||||||
auto* impl = page->Impl();
|
auto* impl = page->Impl();
|
||||||
impl->matrix.info = ellpack_info_;
|
|
||||||
impl->Clear();
|
|
||||||
|
|
||||||
const MetaInfo& info = dmat->Info();
|
|
||||||
size_t bytes_write = 0;
|
size_t bytes_write = 0;
|
||||||
double tstart = dmlc::GetTime();
|
double tstart = dmlc::GetTime();
|
||||||
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
|
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
|
||||||
impl->Push(device_, batch);
|
temp_host_page.Push(batch);
|
||||||
|
|
||||||
size_t mem_cost_bytes = impl->MemCostBytes();
|
size_t mem_cost_bytes =
|
||||||
|
EllpackPageImpl::MemCostBytes(temp_host_page.Size(), row_stride, cuts);
|
||||||
if (mem_cost_bytes >= page_size_) {
|
if (mem_cost_bytes >= page_size_) {
|
||||||
bytes_write += mem_cost_bytes;
|
bytes_write += mem_cost_bytes;
|
||||||
impl->CompressSparsePage(device_);
|
*impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
|
||||||
|
row_stride);
|
||||||
writer.PushWrite(std::move(page));
|
writer.PushWrite(std::move(page));
|
||||||
writer.Alloc(&page);
|
writer.Alloc(&page);
|
||||||
impl = page->Impl();
|
impl = page->Impl();
|
||||||
impl->matrix.info = ellpack_info_;
|
temp_host_page.Clear();
|
||||||
impl->Clear();
|
|
||||||
double tdiff = dmlc::GetTime() - tstart;
|
double tdiff = dmlc::GetTime() - tstart;
|
||||||
LOG(INFO) << "Writing " << kPageType_ << " to " << cache_info << " in "
|
LOG(INFO) << "Writing " << kPageType_ << " to " << cache_info << " in "
|
||||||
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||||
<< (bytes_write >> 20UL) << " written";
|
<< (bytes_write >> 20UL) << " written";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (impl->Size() != 0) {
|
if (temp_host_page.Size() != 0) {
|
||||||
impl->CompressSparsePage(device_);
|
*impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
|
||||||
|
row_stride);
|
||||||
writer.PushWrite(std::move(page));
|
writer.PushWrite(std::move(page));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
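The flush threshold used by WriteEllpackPages() above comes from BatchParam; a minimal sketch (illustrative only, `dmat` is a placeholder) of requesting smaller ELLPACK shards:

void IterateSmallShards(xgboost::DMatrix* dmat) {
  // gpu_page_size overrides the default DMatrix::kPageSize threshold.
  xgboost::BatchParam param{0, 256, 0, 1 << 20};  // flush a shard at roughly 1 MiB
  for (auto& page : dmat->GetBatches<xgboost::EllpackPage>(param)) {
    // Each page was written once MemCostBytes(rows_so_far, row_stride, cuts) reached
    // the threshold, or at the end of the input for the final partial shard.
    (void)page;
  }
}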
|
|||||||
@ -10,19 +10,17 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "../common/timer.h"
|
#include "../common/timer.h"
|
||||||
|
#include "../common/hist_util.h"
|
||||||
|
#include "sparse_page_source.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace data {
|
namespace data {
|
||||||
|
|
||||||
class EllpackPageSourceImpl;
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief External memory data source for ELLPACK format.
|
* \brief External memory data source for ELLPACK format.
|
||||||
*
|
*
|
||||||
* This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
|
|
||||||
* including CUDA-specific implementation details in the header.
|
|
||||||
*/
|
*/
|
||||||
class EllpackPageSource : public DataSource<EllpackPage> {
|
class EllpackPageSource {
|
||||||
public:
|
public:
|
||||||
/*!
|
/*!
|
||||||
* \brief Create a source from cache files with the given cache prefix.
|
* \brief Create a source from cache files with the given cache prefix.
|
||||||
@ -32,19 +30,33 @@ class EllpackPageSource : public DataSource<EllpackPage> {
|
|||||||
const std::string& cache_info,
|
const std::string& cache_info,
|
||||||
const BatchParam& param) noexcept(false);
|
const BatchParam& param) noexcept(false);
|
||||||
|
|
||||||
/*! \brief destructor */
|
BatchSet<EllpackPage> GetBatchSet() {
|
||||||
~EllpackPageSource() override = default;
|
auto begin_iter = BatchIterator<EllpackPage>(
|
||||||
|
new SparseBatchIteratorImpl<ExternalMemoryPrefetcher<EllpackPage>,
|
||||||
|
EllpackPage>(external_prefetcher_.get()));
|
||||||
|
return BatchSet<EllpackPage>(begin_iter);
|
||||||
|
}
|
||||||
|
|
||||||
void BeforeFirst() override;
|
~EllpackPageSource() {
|
||||||
bool Next() override;
|
external_prefetcher_.reset();
|
||||||
EllpackPage& Value();
|
for (auto file : cache_info_.name_shards) {
|
||||||
const EllpackPage& Value() const override;
|
TryDeleteCacheFile(file);
|
||||||
|
}
|
||||||
const EllpackPageSourceImpl* Impl() const { return impl_.get(); }
|
}
|
||||||
EllpackPageSourceImpl* Impl() { return impl_.get(); }
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::shared_ptr<EllpackPageSourceImpl> impl_;
|
void WriteEllpackPages(int device, DMatrix* dmat,
|
||||||
|
const common::HistogramCuts& cuts,
|
||||||
|
const std::string& cache_info,
|
||||||
|
size_t row_stride) const;
|
||||||
|
|
||||||
|
/*! \brief The page type string for ELLPACK. */
|
||||||
|
const std::string kPageType_{".ellpack.page"};
|
||||||
|
|
||||||
|
size_t page_size_{DMatrix::kPageSize};
|
||||||
|
common::Monitor monitor_;
|
||||||
|
std::unique_ptr<ExternalMemoryPrefetcher<EllpackPage>> external_prefetcher_;
|
||||||
|
CacheInfo cache_info_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace data
|
} // namespace data
|
||||||
|
|||||||
@ -51,11 +51,7 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& par
|
|||||||
ellpack_source_.reset(new EllpackPageSource(this, cache_info_, param));
|
ellpack_source_.reset(new EllpackPageSource(this, cache_info_, param));
|
||||||
batch_param_ = param;
|
batch_param_ = param;
|
||||||
}
|
}
|
||||||
ellpack_source_->BeforeFirst();
|
return ellpack_source_->GetBatchSet();
|
||||||
ellpack_source_->Next();
|
|
||||||
auto begin_iter = BatchIterator<EllpackPage>(
|
|
||||||
new SparseBatchIteratorImpl<EllpackPageSource, EllpackPage>(ellpack_source_.get()));
|
|
||||||
return BatchSet<EllpackPage>(begin_iter);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace data
|
} // namespace data
|
||||||
|
|||||||
@ -97,9 +97,11 @@ struct SparsePageLoader {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct EllpackLoader {
|
struct EllpackLoader {
|
||||||
EllpackMatrix const& matrix;
|
EllpackDeviceAccessor const& matrix;
|
||||||
XGBOOST_DEVICE EllpackLoader(EllpackMatrix const& m, bool use_shared, bst_feature_t num_features,
|
XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool use_shared,
|
||||||
bst_row_t num_rows, size_t entry_start) : matrix{m} {}
|
bst_feature_t num_features, bst_row_t num_rows,
|
||||||
|
size_t entry_start)
|
||||||
|
: matrix{m} {}
|
||||||
__device__ __forceinline__ float GetFvalue(int ridx, int fidx) const {
|
__device__ __forceinline__ float GetFvalue(int ridx, int fidx) const {
|
||||||
auto gidx = matrix.GetBinIndex(ridx, fidx);
|
auto gidx = matrix.GetBinIndex(ridx, fidx);
|
||||||
if (gidx == -1) {
|
if (gidx == -1) {
|
||||||
@ -107,10 +109,10 @@ struct EllpackLoader {
|
|||||||
}
|
}
|
||||||
// The gradient index needs to be shifted by one as min values are not included in the
|
// The gradient index needs to be shifted by one as min values are not included in the
|
||||||
// cuts.
|
// cuts.
|
||||||
if (gidx == matrix.info.feature_segments[fidx]) {
|
if (gidx == matrix.feature_segments[fidx]) {
|
||||||
return matrix.info.min_fvalue[fidx];
|
return matrix.min_fvalue[fidx];
|
||||||
}
|
}
|
||||||
return matrix.info.gidx_fvalue_map[gidx - 1];
|
return matrix.gidx_fvalue_map[gidx - 1];
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
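A worked example of the lookup in GetFvalue() above (the numbers are hypothetical):

// Suppose feature 2 owns bins [feature_segments[2], feature_segments[3]) == [10, 13),
// the global cut array holds gidx_fvalue_map[10..12] == {0.5f, 1.5f, 2.5f}, and
// min_fvalue[2] == 0.1f.
//   gidx == 10 -> first bin of the feature -> return min_fvalue[2] == 0.1f
//   gidx == 11 -> return gidx_fvalue_map[10] == 0.5f
//   gidx == 12 -> return gidx_fvalue_map[11] == 1.5f
// The "- 1" shift exists because the per-feature minimum is not stored in the cuts, so a
// bin's representative value is the cut just below it.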
|
|
||||||
@ -217,7 +219,7 @@ class GPUPredictor : public xgboost::Predictor {
|
|||||||
this->tree_begin_, this->tree_end_, num_features, num_rows,
|
this->tree_begin_, this->tree_end_, num_features, num_rows,
|
||||||
entry_start, use_shared, this->num_group_);
|
entry_start, use_shared, this->num_group_);
|
||||||
}
|
}
|
||||||
void PredictInternal(EllpackMatrix const& batch, HostDeviceVector<bst_float>* out_preds,
|
void PredictInternal(EllpackDeviceAccessor const& batch, HostDeviceVector<bst_float>* out_preds,
|
||||||
size_t batch_offset) {
|
size_t batch_offset) {
|
||||||
const uint32_t BLOCK_THREADS = 256;
|
const uint32_t BLOCK_THREADS = 256;
|
||||||
size_t num_rows = batch.n_rows;
|
size_t num_rows = batch.n_rows;
|
||||||
@ -226,11 +228,11 @@ class GPUPredictor : public xgboost::Predictor {
|
|||||||
bool use_shared = false;
|
bool use_shared = false;
|
||||||
size_t entry_start = 0;
|
size_t entry_start = 0;
|
||||||
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS} (
|
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS} (
|
||||||
PredictKernel<EllpackLoader, EllpackMatrix>,
|
PredictKernel<EllpackLoader, EllpackDeviceAccessor>,
|
||||||
batch,
|
batch,
|
||||||
dh::ToSpan(nodes_), out_preds->DeviceSpan().subspan(batch_offset),
|
dh::ToSpan(nodes_), out_preds->DeviceSpan().subspan(batch_offset),
|
||||||
dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_),
|
dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_),
|
||||||
this->tree_begin_, this->tree_end_, batch.info.NumFeatures(), num_rows,
|
this->tree_begin_, this->tree_end_, batch.NumFeatures(), num_rows,
|
||||||
entry_start, use_shared, this->num_group_);
|
entry_start, use_shared, this->num_group_);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -269,8 +271,10 @@ class GPUPredictor : public xgboost::Predictor {
|
|||||||
if (dmat->PageExists<EllpackPage>()) {
|
if (dmat->PageExists<EllpackPage>()) {
|
||||||
size_t batch_offset = 0;
|
size_t batch_offset = 0;
|
||||||
for (auto const& page : dmat->GetBatches<EllpackPage>()) {
|
for (auto const& page : dmat->GetBatches<EllpackPage>()) {
|
||||||
this->PredictInternal(page.Impl()->matrix, out_preds, batch_offset);
|
this->PredictInternal(
|
||||||
batch_offset += page.Impl()->matrix.n_rows;
|
page.Impl()->GetDeviceAccessor(generic_param_->gpu_id), out_preds,
|
||||||
|
batch_offset);
|
||||||
|
batch_offset += page.Impl()->n_rows;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
size_t batch_offset = 0;
|
size_t batch_offset = 0;
|
||||||
|
|||||||
@ -153,7 +153,8 @@ ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl* page,
|
|||||||
size_t n_rows,
|
size_t n_rows,
|
||||||
const BatchParam& batch_param)
|
const BatchParam& batch_param)
|
||||||
: batch_param_(batch_param),
|
: batch_param_(batch_param),
|
||||||
page_(new EllpackPageImpl(batch_param.gpu_id, page->matrix.info, n_rows)) {}
|
page_(new EllpackPageImpl(batch_param.gpu_id, page->cuts_, page->is_dense,
|
||||||
|
page->row_stride, n_rows)) {}
|
||||||
|
|
||||||
GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair> gpair,
|
GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair> gpair,
|
||||||
DMatrix* dmat) {
|
DMatrix* dmat) {
|
||||||
@ -217,9 +218,9 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span<GradientP
|
|||||||
|
|
||||||
// Create a new ELLPACK page with empty rows.
|
// Create a new ELLPACK page with empty rows.
|
||||||
page_.reset(); // Release the device memory first before reallocating
|
page_.reset(); // Release the device memory first before reallocating
|
||||||
page_.reset(new EllpackPageImpl(batch_param_.gpu_id,
|
page_.reset(new EllpackPageImpl(
|
||||||
original_page_->matrix.info,
|
batch_param_.gpu_id, original_page_->cuts_, original_page_->is_dense,
|
||||||
sample_rows));
|
original_page_->row_stride, sample_rows));
|
||||||
|
|
||||||
// Compact the ELLPACK pages into the single sample page.
|
// Compact the ELLPACK pages into the single sample page.
|
||||||
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
||||||
@ -298,9 +299,9 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span<Gra
|
|||||||
|
|
||||||
// Create a new ELLPACK page with empty rows.
|
// Create a new ELLPACK page with empty rows.
|
||||||
page_.reset(); // Release the device memory first before reallocating
|
page_.reset(); // Release the device memory first before reallocating
|
||||||
page_.reset(new EllpackPageImpl(batch_param_.gpu_id,
|
page_.reset(new EllpackPageImpl(batch_param_.gpu_id, original_page_->cuts_,
|
||||||
original_page_->matrix.info,
|
original_page_->is_dense,
|
||||||
sample_rows));
|
original_page_->row_stride, sample_rows));
|
||||||
|
|
||||||
// Compact the ELLPACK pages into the single sample page.
|
// Compact the ELLPACK pages into the single sample page.
|
||||||
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
||||||
@ -319,7 +320,7 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page,
|
|||||||
monitor_.Init("gradient_based_sampler");
|
monitor_.Init("gradient_based_sampler");
|
||||||
|
|
||||||
bool is_sampling = subsample < 1.0;
|
bool is_sampling = subsample < 1.0;
|
||||||
bool is_external_memory = page->matrix.n_rows != n_rows;
|
bool is_external_memory = page->n_rows != n_rows;
|
||||||
|
|
||||||
if (is_sampling) {
|
if (is_sampling) {
|
||||||
switch (sampling_method) {
|
switch (sampling_method) {
|
||||||
|
|||||||
@ -101,7 +101,7 @@ template GradientPairPrecise CreateRoundingFactor(common::Span<GradientPair cons
|
|||||||
template GradientPair CreateRoundingFactor(common::Span<GradientPair const> gpair);
|
template GradientPair CreateRoundingFactor(common::Span<GradientPair const> gpair);
|
||||||
|
|
||||||
template <typename GradientSumT>
|
template <typename GradientSumT>
|
||||||
__global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
|
__global__ void SharedMemHistKernel(EllpackDeviceAccessor matrix,
|
||||||
common::Span<const RowPartitioner::RowIndexT> d_ridx,
|
common::Span<const RowPartitioner::RowIndexT> d_ridx,
|
||||||
GradientSumT* __restrict__ d_node_hist,
|
GradientSumT* __restrict__ d_node_hist,
|
||||||
const GradientPair* __restrict__ d_gpair,
|
const GradientPair* __restrict__ d_gpair,
|
||||||
@ -112,14 +112,14 @@ __global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
|
|||||||
extern __shared__ char smem[];
|
extern __shared__ char smem[];
|
||||||
GradientSumT* smem_arr = reinterpret_cast<GradientSumT*>(smem); // NOLINT
|
GradientSumT* smem_arr = reinterpret_cast<GradientSumT*>(smem); // NOLINT
|
||||||
if (use_shared_memory_histograms) {
|
if (use_shared_memory_histograms) {
|
||||||
dh::BlockFill(smem_arr, matrix.info.n_bins, GradientSumT());
|
dh::BlockFill(smem_arr, matrix.NumBins(), GradientSumT());
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
}
|
}
|
||||||
for (auto idx : dh::GridStrideRange(static_cast<size_t>(0), n_elements)) {
|
for (auto idx : dh::GridStrideRange(static_cast<size_t>(0), n_elements)) {
|
||||||
int ridx = d_ridx[idx / matrix.info.row_stride];
|
int ridx = d_ridx[idx / matrix.row_stride];
|
||||||
int gidx =
|
int gidx =
|
||||||
matrix.gidx_iter[ridx * matrix.info.row_stride + idx % matrix.info.row_stride];
|
matrix.gidx_iter[ridx * matrix.row_stride + idx % matrix.row_stride];
|
||||||
if (gidx != matrix.info.n_bins) {
|
if (gidx != matrix.NumBins()) {
|
||||||
GradientSumT truncated {
|
GradientSumT truncated {
|
||||||
TruncateWithRoundingFactor<T>(rounding.GetGrad(), d_gpair[ridx].GetGrad()),
|
TruncateWithRoundingFactor<T>(rounding.GetGrad(), d_gpair[ridx].GetGrad()),
|
||||||
TruncateWithRoundingFactor<T>(rounding.GetHess(), d_gpair[ridx].GetHess()),
|
TruncateWithRoundingFactor<T>(rounding.GetHess(), d_gpair[ridx].GetHess()),
|
||||||
@ -135,7 +135,7 @@ __global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
|
|||||||
if (use_shared_memory_histograms) {
|
if (use_shared_memory_histograms) {
|
||||||
// Write shared memory back to global memory
|
// Write shared memory back to global memory
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
for (auto i : dh::BlockStrideRange(static_cast<size_t>(0), matrix.info.n_bins)) {
|
for (auto i : dh::BlockStrideRange(static_cast<size_t>(0), matrix.NumBins())) {
|
||||||
GradientSumT truncated {
|
GradientSumT truncated {
|
||||||
TruncateWithRoundingFactor<T>(rounding.GetGrad(), smem_arr[i].GetGrad()),
|
TruncateWithRoundingFactor<T>(rounding.GetGrad(), smem_arr[i].GetGrad()),
|
||||||
TruncateWithRoundingFactor<T>(rounding.GetHess(), smem_arr[i].GetHess()),
|
TruncateWithRoundingFactor<T>(rounding.GetHess(), smem_arr[i].GetHess()),
|
||||||
@ -146,16 +146,16 @@ __global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename GradientSumT>
|
template <typename GradientSumT>
|
||||||
void BuildGradientHistogram(EllpackMatrix const& matrix,
|
void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
|
||||||
common::Span<GradientPair const> gpair,
|
common::Span<GradientPair const> gpair,
|
||||||
common::Span<const uint32_t> d_ridx,
|
common::Span<const uint32_t> d_ridx,
|
||||||
common::Span<GradientSumT> histogram,
|
common::Span<GradientSumT> histogram,
|
||||||
GradientSumT rounding, bool shared) {
|
GradientSumT rounding, bool shared) {
|
||||||
const size_t smem_size =
|
const size_t smem_size =
|
||||||
shared
|
shared
|
||||||
? sizeof(GradientSumT) * matrix.info.n_bins
|
? sizeof(GradientSumT) * matrix.NumBins()
|
||||||
: 0;
|
: 0;
|
||||||
auto n_elements = d_ridx.size() * matrix.info.row_stride;
|
auto n_elements = d_ridx.size() * matrix.row_stride;
|
||||||
|
|
||||||
uint32_t items_per_thread = 8;
|
uint32_t items_per_thread = 8;
|
||||||
uint32_t block_threads = 256;
|
uint32_t block_threads = 256;
|
||||||
@ -168,14 +168,14 @@ void BuildGradientHistogram(EllpackMatrix const& matrix,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template void BuildGradientHistogram<GradientPair>(
|
template void BuildGradientHistogram<GradientPair>(
|
||||||
EllpackMatrix const& matrix,
|
EllpackDeviceAccessor const& matrix,
|
||||||
common::Span<GradientPair const> gpair,
|
common::Span<GradientPair const> gpair,
|
||||||
common::Span<const uint32_t> ridx,
|
common::Span<const uint32_t> ridx,
|
||||||
common::Span<GradientPair> histogram,
|
common::Span<GradientPair> histogram,
|
||||||
GradientPair rounding, bool shared);
|
GradientPair rounding, bool shared);
|
||||||
|
|
||||||
template void BuildGradientHistogram<GradientPairPrecise>(
|
template void BuildGradientHistogram<GradientPairPrecise>(
|
||||||
EllpackMatrix const& matrix,
|
EllpackDeviceAccessor const& matrix,
|
||||||
common::Span<GradientPair const> gpair,
|
common::Span<GradientPair const> gpair,
|
||||||
common::Span<const uint32_t> ridx,
|
common::Span<const uint32_t> ridx,
|
||||||
common::Span<GradientPairPrecise> histogram,
|
common::Span<GradientPairPrecise> histogram,
|
||||||
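A small worked example (illustrative values only) of the ELLPACK indexing used by the kernel above:

// With row_stride == 3, every row owns exactly three slots in the compressed buffer.
// For the flat element index idx == 7:
//   ridx = d_ridx[idx / row_stride] = d_ridx[2]   // third row of this batch
//   slot = idx % row_stride         = 1           // its second feature slot
// A slot whose gidx equals matrix.NumBins() is padding for a missing value and is skipped,
// so only real entries contribute to the shared- or global-memory histogram.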
|
|||||||
@ -18,7 +18,7 @@ DEV_INLINE T TruncateWithRoundingFactor(T const rounding_factor, float const x)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename GradientSumT>
|
template <typename GradientSumT>
|
||||||
void BuildGradientHistogram(EllpackMatrix const& matrix,
|
void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
|
||||||
common::Span<GradientPair const> gpair,
|
common::Span<GradientPair const> gpair,
|
||||||
common::Span<const uint32_t> ridx,
|
common::Span<const uint32_t> ridx,
|
||||||
common::Span<GradientSumT> histogram,
|
common::Span<GradientSumT> histogram,
|
||||||
|
|||||||
@ -180,15 +180,15 @@ template <int BLOCK_THREADS, typename ReduceT, typename ScanT,
|
|||||||
typename MaxReduceT, typename TempStorageT, typename GradientSumT>
|
typename MaxReduceT, typename TempStorageT, typename GradientSumT>
|
||||||
__device__ void EvaluateFeature(
|
__device__ void EvaluateFeature(
|
||||||
int fidx, common::Span<const GradientSumT> node_histogram,
|
int fidx, common::Span<const GradientSumT> node_histogram,
|
||||||
const xgboost::EllpackMatrix& matrix,
|
const EllpackDeviceAccessor& matrix,
|
||||||
DeviceSplitCandidate* best_split, // shared memory storing best split
|
DeviceSplitCandidate* best_split, // shared memory storing best split
|
||||||
const DeviceNodeStats& node, const GPUTrainingParam& param,
|
const DeviceNodeStats& node, const GPUTrainingParam& param,
|
||||||
TempStorageT* temp_storage, // temp memory for cub operations
|
TempStorageT* temp_storage, // temp memory for cub operations
|
||||||
int constraint, // monotonic_constraints
|
int constraint, // monotonic_constraints
|
||||||
const ValueConstraint& value_constraint) {
|
const ValueConstraint& value_constraint) {
|
||||||
// Use pointer from cut to indicate begin and end of bins for each feature.
|
// Use pointer from cut to indicate begin and end of bins for each feature.
|
||||||
uint32_t gidx_begin = matrix.info.feature_segments[fidx]; // beginning bin
|
uint32_t gidx_begin = matrix.feature_segments[fidx]; // beginning bin
|
||||||
uint32_t gidx_end = matrix.info.feature_segments[fidx + 1]; // end bin for i^th feature
|
uint32_t gidx_end = matrix.feature_segments[fidx + 1]; // end bin for i^th feature
|
||||||
|
|
||||||
// Sum histogram bins for current feature
|
// Sum histogram bins for current feature
|
||||||
GradientSumT const feature_sum = ReduceFeature<BLOCK_THREADS, ReduceT>(
|
GradientSumT const feature_sum = ReduceFeature<BLOCK_THREADS, ReduceT>(
|
||||||
@ -236,9 +236,9 @@ __device__ void EvaluateFeature(
|
|||||||
int split_gidx = (scan_begin + threadIdx.x) - 1;
|
int split_gidx = (scan_begin + threadIdx.x) - 1;
|
||||||
float fvalue;
|
float fvalue;
|
||||||
if (split_gidx < static_cast<int>(gidx_begin)) {
|
if (split_gidx < static_cast<int>(gidx_begin)) {
|
||||||
fvalue = matrix.info.min_fvalue[fidx];
|
fvalue = matrix.min_fvalue[fidx];
|
||||||
} else {
|
} else {
|
||||||
fvalue = matrix.info.gidx_fvalue_map[split_gidx];
|
fvalue = matrix.gidx_fvalue_map[split_gidx];
|
||||||
}
|
}
|
||||||
GradientSumT left = missing_left ? bin + missing : bin;
|
GradientSumT left = missing_left ? bin + missing : bin;
|
||||||
GradientSumT right = parent_sum - left;
|
GradientSumT right = parent_sum - left;
|
||||||
@ -254,7 +254,7 @@ __global__ void EvaluateSplitKernel(
|
|||||||
common::Span<const GradientSumT> node_histogram, // histogram for gradients
|
common::Span<const GradientSumT> node_histogram, // histogram for gradients
|
||||||
common::Span<const bst_feature_t> feature_set, // Selected features
|
common::Span<const bst_feature_t> feature_set, // Selected features
|
||||||
DeviceNodeStats node,
|
DeviceNodeStats node,
|
||||||
xgboost::EllpackMatrix matrix,
|
xgboost::EllpackDeviceAccessor matrix,
|
||||||
GPUTrainingParam gpu_param,
|
GPUTrainingParam gpu_param,
|
||||||
common::Span<DeviceSplitCandidate> split_candidates, // resulting split
|
common::Span<DeviceSplitCandidate> split_candidates, // resulting split
|
||||||
ValueConstraint value_constraint,
|
ValueConstraint value_constraint,
|
||||||
@ -601,7 +601,7 @@ struct GPUHistMakerDevice {
|
|||||||
uint32_t constexpr kBlockThreads = 256;
|
uint32_t constexpr kBlockThreads = 256;
|
||||||
dh::LaunchKernel {uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]} (
|
dh::LaunchKernel {uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]} (
|
||||||
EvaluateSplitKernel<kBlockThreads, GradientSumT>,
|
EvaluateSplitKernel<kBlockThreads, GradientSumT>,
|
||||||
hist.GetNodeHistogram(nidx), d_feature_set, node, page->matrix,
|
hist.GetNodeHistogram(nidx), d_feature_set, node, page->GetDeviceAccessor(device_id),
|
||||||
gpu_param, d_split_candidates, node_value_constraints[nidx],
|
gpu_param, d_split_candidates, node_value_constraints[nidx],
|
||||||
monotone_constraints);
|
monotone_constraints);
|
||||||
|
|
||||||
@ -625,9 +625,7 @@ struct GPUHistMakerDevice {
|
|||||||
hist.AllocateHistogram(nidx);
|
hist.AllocateHistogram(nidx);
|
||||||
auto d_node_hist = hist.GetNodeHistogram(nidx);
|
auto d_node_hist = hist.GetNodeHistogram(nidx);
|
||||||
auto d_ridx = row_partitioner->GetRows(nidx);
|
auto d_ridx = row_partitioner->GetRows(nidx);
|
||||||
auto d_gpair = gpair.data();
|
BuildGradientHistogram(page->GetDeviceAccessor(device_id), gpair, d_ridx, d_node_hist,
|
||||||
|
|
||||||
BuildGradientHistogram(page->matrix, gpair, d_ridx, d_node_hist,
|
|
||||||
histogram_rounding, use_shared_memory_histograms);
|
histogram_rounding, use_shared_memory_histograms);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -637,7 +635,7 @@ struct GPUHistMakerDevice {
|
|||||||
auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram);
|
auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram);
|
||||||
auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction);
|
auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction);
|
||||||
|
|
||||||
dh::LaunchN(device_id, page->matrix.info.n_bins, [=] __device__(size_t idx) {
|
dh::LaunchN(device_id, page->cuts_.TotalBins(), [=] __device__(size_t idx) {
|
||||||
d_node_hist_subtraction[idx] =
|
d_node_hist_subtraction[idx] =
|
||||||
d_node_hist_parent[idx] - d_node_hist_histogram[idx];
|
d_node_hist_parent[idx] - d_node_hist_histogram[idx];
|
||||||
});
|
});
|
||||||
@ -652,7 +650,7 @@ struct GPUHistMakerDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void UpdatePosition(int nidx, RegTree::Node split_node) {
|
void UpdatePosition(int nidx, RegTree::Node split_node) {
|
||||||
auto d_matrix = page->matrix;
|
auto d_matrix = page->GetDeviceAccessor(device_id);
|
||||||
|
|
||||||
row_partitioner->UpdatePosition(
|
row_partitioner->UpdatePosition(
|
||||||
nidx, split_node.LeftChild(), split_node.RightChild(),
|
nidx, split_node.LeftChild(), split_node.RightChild(),
|
||||||
@ -689,7 +687,7 @@ struct GPUHistMakerDevice {
|
|||||||
row_partitioner.reset(); // Release the device memory first before reallocating
|
row_partitioner.reset(); // Release the device memory first before reallocating
|
||||||
row_partitioner.reset(new RowPartitioner(device_id, p_fmat->Info().num_row_));
|
row_partitioner.reset(new RowPartitioner(device_id, p_fmat->Info().num_row_));
|
||||||
}
|
}
|
||||||
if (page->matrix.n_rows == p_fmat->Info().num_row_) {
|
if (page->n_rows == p_fmat->Info().num_row_) {
|
||||||
FinalisePositionInPage(page, d_nodes);
|
FinalisePositionInPage(page, d_nodes);
|
||||||
} else {
|
} else {
|
||||||
for (auto& batch : p_fmat->GetBatches<EllpackPage>(batch_param)) {
|
for (auto& batch : p_fmat->GetBatches<EllpackPage>(batch_param)) {
|
||||||
@ -699,7 +697,7 @@ struct GPUHistMakerDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void FinalisePositionInPage(EllpackPageImpl* page, const common::Span<RegTree::Node> d_nodes) {
|
void FinalisePositionInPage(EllpackPageImpl* page, const common::Span<RegTree::Node> d_nodes) {
|
||||||
auto d_matrix = page->matrix;
|
auto d_matrix = page->GetDeviceAccessor(device_id);
|
||||||
row_partitioner->FinalisePosition(
|
row_partitioner->FinalisePosition(
|
||||||
[=] __device__(size_t row_id, int position) {
|
[=] __device__(size_t row_id, int position) {
|
||||||
if (!d_matrix.IsInRange(row_id)) {
|
if (!d_matrix.IsInRange(row_id)) {
|
||||||
@ -765,7 +763,7 @@ struct GPUHistMakerDevice {
|
|||||||
reducer->AllReduceSum(
|
reducer->AllReduceSum(
|
||||||
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
||||||
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
||||||
page->matrix.info.n_bins * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
|
page->cuts_.TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
|
||||||
reducer->Synchronize();
|
reducer->Synchronize();
|
||||||
|
|
||||||
monitor.StopCuda("AllReduce");
|
monitor.StopCuda("AllReduce");
|
||||||
@ -954,14 +952,14 @@ inline void GPUHistMakerDevice<GradientSumT>::InitHistogram() {
|
|||||||
// check if we can use shared memory for building histograms
|
// check if we can use shared memory for building histograms
|
||||||
// (assuming we need at least 2 CTAs per SM to maintain decent latency
|
// (assuming we need at least 2 CTAs per SM to maintain decent latency
|
||||||
// hiding)
|
// hiding)
|
||||||
auto histogram_size = sizeof(GradientSumT) * page->matrix.info.n_bins;
|
auto histogram_size = sizeof(GradientSumT) * page->cuts_.TotalBins();
|
||||||
auto max_smem = dh::MaxSharedMemory(device_id);
|
auto max_smem = dh::MaxSharedMemory(device_id);
|
||||||
if (histogram_size <= max_smem) {
|
if (histogram_size <= max_smem) {
|
||||||
use_shared_memory_histograms = true;
|
use_shared_memory_histograms = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Init histogram
|
// Init histogram
|
||||||
hist.Init(device_id, page->matrix.info.n_bins);
|
hist.Init(device_id, page->cuts_.TotalBins());
|
||||||
}
|
}
|
||||||
|
|
||||||
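A rough worked example of the shared-memory check above (assumes GradientPairPrecise holds two 8-byte values and a 48 KB shared-memory budget; neither number comes from this diff):

//   10 features * 256 bins  -> histogram_size = 16 B * 2,560  =  40,960 B -> fits, use shared memory
//   100 features * 256 bins -> histogram_size = 16 B * 25,600 = 409,600 B -> too large, use global memory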
template <typename GradientSumT>
|
template <typename GradientSumT>
|
||||||
|
|||||||
@ -19,23 +19,19 @@ TEST(EllpackPage, EmptyDMatrix) {
|
|||||||
auto dmat = *CreateDMatrix(kNRows, kNCols, kSparsity);
|
auto dmat = *CreateDMatrix(kNRows, kNCols, kSparsity);
|
||||||
auto& page = *dmat->GetBatches<EllpackPage>({0, kMaxBin, kGpuBatchNRows}).begin();
|
auto& page = *dmat->GetBatches<EllpackPage>({0, kMaxBin, kGpuBatchNRows}).begin();
|
||||||
auto impl = page.Impl();
|
auto impl = page.Impl();
|
||||||
ASSERT_EQ(impl->matrix.info.feature_segments.size(), 1);
|
ASSERT_EQ(impl->row_stride, 0);
|
||||||
ASSERT_EQ(impl->matrix.info.min_fvalue.size(), 0);
|
ASSERT_EQ(impl->cuts_.TotalBins(), 0);
|
||||||
ASSERT_EQ(impl->matrix.info.gidx_fvalue_map.size(), 0);
|
ASSERT_EQ(impl->gidx_buffer.Size(), 4);
|
||||||
ASSERT_EQ(impl->matrix.info.row_stride, 0);
|
|
||||||
ASSERT_EQ(impl->matrix.info.n_bins, 0);
|
|
||||||
ASSERT_EQ(impl->gidx_buffer.size(), 4);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(EllpackPage, BuildGidxDense) {
|
TEST(EllpackPage, BuildGidxDense) {
|
||||||
int constexpr kNRows = 16, kNCols = 8;
|
int constexpr kNRows = 16, kNCols = 8;
|
||||||
auto page = BuildEllpackPage(kNRows, kNCols);
|
auto page = BuildEllpackPage(kNRows, kNCols);
|
||||||
|
|
||||||
std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.size());
|
std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
|
||||||
dh::CopyDeviceSpanToVector(&h_gidx_buffer, page->gidx_buffer);
|
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), page->NumSymbols());
|
||||||
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
|
||||||
|
|
||||||
ASSERT_EQ(page->matrix.info.row_stride, kNCols);
|
ASSERT_EQ(page->row_stride, kNCols);
|
||||||
|
|
||||||
std::vector<uint32_t> solution = {
|
std::vector<uint32_t> solution = {
|
||||||
0, 3, 8, 9, 14, 17, 20, 21,
|
0, 3, 8, 9, 14, 17, 20, 21,
|
||||||
@ -64,11 +60,10 @@ TEST(EllpackPage, BuildGidxSparse) {
|
|||||||
int constexpr kNRows = 16, kNCols = 8;
|
int constexpr kNRows = 16, kNCols = 8;
|
||||||
auto page = BuildEllpackPage(kNRows, kNCols, 0.9f);
|
auto page = BuildEllpackPage(kNRows, kNCols, 0.9f);
|
||||||
|
|
||||||
std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.size());
|
std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
|
||||||
dh::CopyDeviceSpanToVector(&h_gidx_buffer, page->gidx_buffer);
|
|
||||||
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
||||||
|
|
||||||
ASSERT_LE(page->matrix.info.row_stride, 3);
|
ASSERT_LE(page->row_stride, 3);
|
||||||
|
|
||||||
// row_stride = 3, 16 rows, 48 entries for ELLPACK
|
// row_stride = 3, 16 rows, 48 entries for ELLPACK
|
||||||
std::vector<uint32_t> solution = {
|
std::vector<uint32_t> solution = {
|
||||||
@ -76,16 +71,16 @@ TEST(EllpackPage, BuildGidxSparse) {
|
|||||||
24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24,
|
24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24,
|
||||||
24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24
|
24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24
|
||||||
};
|
};
|
||||||
for (size_t i = 0; i < kNRows * page->matrix.info.row_stride; ++i) {
|
for (size_t i = 0; i < kNRows * page->row_stride; ++i) {
|
||||||
ASSERT_EQ(solution[i], gidx[i]);
|
ASSERT_EQ(solution[i], gidx[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
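A note on the constants in the sparse test above (inferred from the expected values, not a behaviour change):

// The expected `solution` pads empty slots with 24 and the decoder is built with 25 symbols,
// matching NumSymbols() == TotalBins() + 1 for 24 bins in total: symbol 24 is the
// "not found" marker. The dense test already switched from the literal 25 to
// page->NumSymbols(); the same substitution would work here.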
struct ReadRowFunction {
|
struct ReadRowFunction {
|
||||||
EllpackMatrix matrix;
|
EllpackDeviceAccessor matrix;
|
||||||
int row;
|
int row;
|
||||||
bst_float* row_data_d;
|
bst_float* row_data_d;
|
||||||
ReadRowFunction(EllpackMatrix matrix, int row, bst_float* row_data_d)
|
ReadRowFunction(EllpackDeviceAccessor matrix, int row, bst_float* row_data_d)
|
||||||
: matrix(std::move(matrix)), row(row), row_data_d(row_data_d) {}
|
: matrix(std::move(matrix)), row(row), row_data_d(row_data_d) {}
|
||||||
|
|
||||||
__device__ void operator()(size_t col) {
|
__device__ void operator()(size_t col) {
|
||||||
@ -110,7 +105,8 @@ TEST(EllpackPage, Copy) {
|
|||||||
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||||
|
|
||||||
// Create an empty result page.
|
// Create an empty result page.
|
||||||
EllpackPageImpl result(0, page->matrix.info, kRows);
|
EllpackPageImpl result(0, page->cuts_, page->is_dense, page->row_stride,
|
||||||
|
kRows);
|
||||||
|
|
||||||
// Copy batch pages into the result page.
|
// Copy batch pages into the result page.
|
||||||
size_t offset = 0;
|
size_t offset = 0;
|
||||||
@ -126,13 +122,13 @@ TEST(EllpackPage, Copy) {
|
|||||||
std::vector<bst_float> row_result(kCols);
|
std::vector<bst_float> row_result(kCols);
|
||||||
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
|
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
|
||||||
auto impl = page.Impl();
|
auto impl = page.Impl();
|
||||||
EXPECT_EQ(impl->matrix.base_rowid, current_row);
|
EXPECT_EQ(impl->base_rowid, current_row);
|
||||||
|
|
||||||
for (size_t i = 0; i < impl->Size(); i++) {
|
for (size_t i = 0; i < impl->Size(); i++) {
|
||||||
dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
|
dh::LaunchN(0, kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
|
||||||
thrust::copy(row_d.begin(), row_d.end(), row.begin());
|
thrust::copy(row_d.begin(), row_d.end(), row.begin());
|
||||||
|
|
||||||
dh::LaunchN(0, kCols, ReadRowFunction(result.matrix, current_row, row_result_d.data().get()));
|
dh::LaunchN(0, kCols, ReadRowFunction(result.GetDeviceAccessor(0), current_row, row_result_d.data().get()));
|
||||||
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
|
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
|
||||||
|
|
||||||
EXPECT_EQ(row, row_result);
|
EXPECT_EQ(row, row_result);
|
||||||
@ -155,7 +151,8 @@ TEST(EllpackPage, Compact) {
|
|||||||
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||||
|
|
||||||
// Create an empty result page.
|
// Create an empty result page.
|
||||||
EllpackPageImpl result(0, page->matrix.info, kCompactedRows);
|
EllpackPageImpl result(0, page->cuts_, page->is_dense, page->row_stride,
|
||||||
|
kCompactedRows);
|
||||||
|
|
||||||
// Compact batch pages into the result page.
|
// Compact batch pages into the result page.
|
||||||
std::vector<size_t> row_indexes_h {
|
std::vector<size_t> row_indexes_h {
|
||||||
@ -174,7 +171,7 @@ TEST(EllpackPage, Compact) {
|
|||||||
std::vector<bst_float> row_result(kCols);
|
std::vector<bst_float> row_result(kCols);
|
||||||
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
|
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
|
||||||
auto impl = page.Impl();
|
auto impl = page.Impl();
|
||||||
EXPECT_EQ(impl->matrix.base_rowid, current_row);
|
EXPECT_EQ(impl->base_rowid, current_row);
|
||||||
|
|
||||||
for (size_t i = 0; i < impl->Size(); i++) {
|
for (size_t i = 0; i < impl->Size(); i++) {
|
||||||
size_t compacted_row = row_indexes_h[current_row];
|
size_t compacted_row = row_indexes_h[current_row];
|
||||||
@ -183,11 +180,12 @@ TEST(EllpackPage, Compact) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
|
dh::LaunchN(0, kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
|
||||||
|
dh::safe_cuda(cudaDeviceSynchronize());
|
||||||
thrust::copy(row_d.begin(), row_d.end(), row.begin());
|
thrust::copy(row_d.begin(), row_d.end(), row.begin());
|
||||||
|
|
||||||
dh::LaunchN(0, kCols,
|
dh::LaunchN(0, kCols,
|
||||||
ReadRowFunction(result.matrix, compacted_row, row_result_d.data().get()));
|
ReadRowFunction(result.GetDeviceAccessor(0), compacted_row, row_result_d.data().get()));
|
||||||
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
|
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
|
||||||
|
|
||||||
EXPECT_EQ(row, row_result);
|
EXPECT_EQ(row, row_result);
|
||||||
|
|||||||
@ -3,6 +3,7 @@
|
|||||||
#include <dmlc/filesystem.h>
|
#include <dmlc/filesystem.h>
|
||||||
#include "../helpers.h"
|
#include "../helpers.h"
|
||||||
#include "../../../src/common/compressed_iterator.h"
|
#include "../../../src/common/compressed_iterator.h"
|
||||||
|
#include "../../../src/data/ellpack_page.cuh"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
|
|
||||||
@ -58,31 +59,29 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
|
|||||||
|
|
||||||
BatchParam param{0, 2, 0, 0};
|
BatchParam param{0, 2, 0, 0};
|
||||||
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||||
EXPECT_EQ(impl->matrix.base_rowid, 0);
|
EXPECT_EQ(impl->base_rowid, 0);
|
||||||
EXPECT_EQ(impl->matrix.n_rows, kRows);
|
EXPECT_EQ(impl->n_rows, kRows);
|
||||||
EXPECT_FALSE(impl->matrix.info.is_dense);
|
EXPECT_FALSE(impl->is_dense);
|
||||||
EXPECT_EQ(impl->matrix.info.row_stride, 2);
|
EXPECT_EQ(impl->row_stride, 2);
|
||||||
EXPECT_EQ(impl->matrix.info.n_bins, 4);
|
EXPECT_EQ(impl->cuts_.TotalBins(), 4);
|
||||||
|
|
||||||
auto impl_ext = (*dmat_ext->GetBatches<EllpackPage>(param).begin()).Impl();
|
auto impl_ext = (*dmat_ext->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||||
EXPECT_EQ(impl_ext->matrix.base_rowid, 0);
|
EXPECT_EQ(impl_ext->base_rowid, 0);
|
||||||
EXPECT_EQ(impl_ext->matrix.n_rows, kRows);
|
EXPECT_EQ(impl_ext->n_rows, kRows);
|
||||||
EXPECT_FALSE(impl_ext->matrix.info.is_dense);
|
EXPECT_FALSE(impl_ext->is_dense);
|
||||||
EXPECT_EQ(impl_ext->matrix.info.row_stride, 2);
|
EXPECT_EQ(impl_ext->row_stride, 2);
|
||||||
EXPECT_EQ(impl_ext->matrix.info.n_bins, 4);
|
EXPECT_EQ(impl_ext->cuts_.TotalBins(), 4);
|
||||||
|
|
||||||
std::vector<common::CompressedByteT> buffer(impl->gidx_buffer.size());
|
std::vector<common::CompressedByteT> buffer(impl->gidx_buffer.HostVector());
|
||||||
std::vector<common::CompressedByteT> buffer_ext(impl_ext->gidx_buffer.size());
|
std::vector<common::CompressedByteT> buffer_ext(impl_ext->gidx_buffer.HostVector());
|
||||||
dh::CopyDeviceSpanToVector(&buffer, impl->gidx_buffer);
|
|
||||||
dh::CopyDeviceSpanToVector(&buffer_ext, impl_ext->gidx_buffer);
|
|
||||||
EXPECT_EQ(buffer, buffer_ext);
|
EXPECT_EQ(buffer, buffer_ext);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ReadRowFunction {
|
struct ReadRowFunction {
|
||||||
EllpackMatrix matrix;
|
EllpackDeviceAccessor matrix;
|
||||||
int row;
|
int row;
|
||||||
bst_float* row_data_d;
|
bst_float* row_data_d;
|
||||||
ReadRowFunction(EllpackMatrix matrix, int row, bst_float* row_data_d)
|
ReadRowFunction(EllpackDeviceAccessor matrix, int row, bst_float* row_data_d)
|
||||||
: matrix(std::move(matrix)), row(row), row_data_d(row_data_d) {}
|
: matrix(std::move(matrix)), row(row), row_data_d(row_data_d) {}
|
||||||
|
|
||||||
__device__ void operator()(size_t col) {
|
__device__ void operator()(size_t col) {
|
||||||
@ -110,8 +109,8 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
|
|||||||
|
|
||||||
BatchParam param{0, kMaxBins, 0, kPageSize};
|
BatchParam param{0, kMaxBins, 0, kPageSize};
|
||||||
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||||
EXPECT_EQ(impl->matrix.base_rowid, 0);
|
EXPECT_EQ(impl->base_rowid, 0);
|
||||||
EXPECT_EQ(impl->matrix.n_rows, kRows);
|
EXPECT_EQ(impl->n_rows, kRows);
|
||||||
|
|
||||||
size_t current_row = 0;
|
size_t current_row = 0;
|
||||||
thrust::device_vector<bst_float> row_d(kCols);
|
thrust::device_vector<bst_float> row_d(kCols);
|
||||||
@ -120,13 +119,13 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
|
|||||||
std::vector<bst_float> row_ext(kCols);
|
std::vector<bst_float> row_ext(kCols);
|
||||||
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
|
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
|
||||||
auto impl_ext = page.Impl();
|
auto impl_ext = page.Impl();
|
||||||
EXPECT_EQ(impl_ext->matrix.base_rowid, current_row);
|
EXPECT_EQ(impl_ext->base_rowid, current_row);
|
||||||
|
|
||||||
for (size_t i = 0; i < impl_ext->Size(); i++) {
|
for (size_t i = 0; i < impl_ext->Size(); i++) {
|
||||||
dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
|
dh::LaunchN(0, kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
|
||||||
thrust::copy(row_d.begin(), row_d.end(), row.begin());
|
thrust::copy(row_d.begin(), row_d.end(), row.begin());
|
||||||
|
|
||||||
dh::LaunchN(0, kCols, ReadRowFunction(impl_ext->matrix, current_row, row_ext_d.data().get()));
|
dh::LaunchN(0, kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(0), current_row, row_ext_d.data().get()));
|
||||||
thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin());
|
thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin());
|
||||||
|
|
||||||
EXPECT_EQ(row, row_ext);
|
EXPECT_EQ(row, row_ext);
|
||||||
@ -155,8 +154,8 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) {
|
|||||||
size_t current_row = 0;
|
size_t current_row = 0;
|
||||||
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
|
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
|
||||||
auto impl_ext = page.Impl();
|
auto impl_ext = page.Impl();
|
||||||
EXPECT_EQ(impl_ext->matrix.base_rowid, current_row);
|
EXPECT_EQ(impl_ext->base_rowid, current_row);
|
||||||
current_row += impl_ext->matrix.n_rows;
|
current_row += impl_ext->n_rows;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -244,13 +244,13 @@ class HistogramCutsWrapper : public common::HistogramCuts {
|
|||||||
public:
|
public:
|
||||||
using SuperT = common::HistogramCuts;
|
using SuperT = common::HistogramCuts;
|
||||||
void SetValues(std::vector<float> cuts) {
|
void SetValues(std::vector<float> cuts) {
|
||||||
SuperT::cut_values_ = std::move(cuts);
|
SuperT::cut_values_.HostVector() = std::move(cuts);
|
||||||
}
|
}
|
||||||
void SetPtrs(std::vector<uint32_t> ptrs) {
|
void SetPtrs(std::vector<uint32_t> ptrs) {
|
||||||
SuperT::cut_ptrs_ = std::move(ptrs);
|
SuperT::cut_ptrs_.HostVector() = std::move(ptrs);
|
||||||
}
|
}
|
||||||
void SetMins(std::vector<float> mins) {
|
void SetMins(std::vector<float> mins) {
|
||||||
SuperT::min_vals_ = std::move(mins);
|
SuperT::min_vals_.HostVector() = std::move(mins);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
} // anonymous namespace
|
} // anonymous namespace
|
||||||
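A minimal sketch (hypothetical cut values) of hand-building cuts with the wrapper above, now that the setters assign through HostVector():

HistogramCutsWrapper cuts;
cuts.SetPtrs({0, 2, 4});                     // two features, two bins each
cuts.SetValues({0.5f, 1.5f, 10.0f, 20.0f});  // cuts for feature 0, then feature 1
cuts.SetMins({0.0f, 5.0f});                  // per-feature minimum values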
@ -279,10 +279,8 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
|
|||||||
row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
|
row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto page = std::unique_ptr<EllpackPageImpl>(new EllpackPageImpl(dmat->get(), {0, 256, 0}));
|
auto page = std::unique_ptr<EllpackPageImpl>(
|
||||||
page->InitInfo(0, (*dmat)->IsDense(), row_stride, cmat);
|
new EllpackPageImpl(0, cmat, batch, (*dmat)->IsDense(), row_stride));
|
||||||
page->InitCompressedData(0, n_rows);
|
|
||||||
page->CreateHistIndices(0, batch, RowStateOnDevice(batch.Size(), batch.Size()));
|
|
||||||
|
|
||||||
delete dmat;
|
delete dmat;
|
||||||
|
|
||||||
|
|||||||
@ -3,6 +3,7 @@
 #include "../../../../src/data/ellpack_page.cuh"
 #include "../../../../src/tree/gpu_hist/gradient_based_sampler.cuh"
 #include "../../helpers.h"
+#include "dmlc/filesystem.h"

 namespace xgboost {
 namespace tree {

@ -29,7 +30,7 @@ void VerifySampling(size_t page_size,
 BatchParam param{0, 256, 0, page_size};
 auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
 if (page_size != 0) {
-EXPECT_NE(page->matrix.n_rows, kRows);
+EXPECT_NE(page->n_rows, kRows);
 }

 GradientBasedSampler sampler(page, kRows, param, subsample, sampling_method);

@ -37,11 +38,11 @@ void VerifySampling(size_t page_size,

 if (fixed_size_sampling) {
 EXPECT_EQ(sample.sample_rows, kRows);
-EXPECT_EQ(sample.page->matrix.n_rows, kRows);
+EXPECT_EQ(sample.page->n_rows, kRows);
 EXPECT_EQ(sample.gpair.size(), kRows);
 } else {
-EXPECT_NEAR(sample.sample_rows, sample_rows, kRows * 0.016f);
-EXPECT_NEAR(sample.page->matrix.n_rows, sample_rows, kRows * 0.016f);
+EXPECT_NEAR(sample.sample_rows, sample_rows, kRows * 0.016);
+EXPECT_NEAR(sample.page->n_rows, sample_rows, kRows * 0.016f);
 EXPECT_NEAR(sample.gpair.size(), sample_rows, kRows * 0.016f);
 }

@ -83,7 +84,7 @@ TEST(GradientBasedSampler, NoSampling_ExternalMemory) {

 BatchParam param{0, 256, 0, kPageSize};
 auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
-EXPECT_NE(page->matrix.n_rows, kRows);
+EXPECT_NE(page->n_rows, kRows);

 GradientBasedSampler sampler(page, kRows, param, kSubsample, TrainParam::kUniform);
 auto sample = sampler.Sample(gpair.DeviceSpan(), dmat.get());

@ -91,21 +92,19 @@
 EXPECT_EQ(sample.sample_rows, kRows);
 EXPECT_EQ(sample.gpair.size(), gpair.Size());
 EXPECT_EQ(sample.gpair.data(), gpair.DevicePointer());
-EXPECT_EQ(sampled_page->matrix.n_rows, kRows);
+EXPECT_EQ(sampled_page->n_rows, kRows);

-std::vector<common::CompressedByteT> buffer(sampled_page->gidx_buffer.size());
-dh::CopyDeviceSpanToVector(&buffer, sampled_page->gidx_buffer);
+std::vector<common::CompressedByteT> buffer(sampled_page->gidx_buffer.HostVector());
 common::CompressedIterator<common::CompressedByteT>
-ci(buffer.data(), sampled_page->matrix.info.NumSymbols());
+ci(buffer.data(), sampled_page->NumSymbols());

 size_t offset = 0;
 for (auto& batch : dmat->GetBatches<EllpackPage>(param)) {
 auto page = batch.Impl();
-std::vector<common::CompressedByteT> page_buffer(page->gidx_buffer.size());
-dh::CopyDeviceSpanToVector(&page_buffer, page->gidx_buffer);
+std::vector<common::CompressedByteT> page_buffer(page->gidx_buffer.HostVector());
 common::CompressedIterator<common::CompressedByteT>
-page_ci(page_buffer.data(), page->matrix.info.NumSymbols());
-size_t num_elements = page->matrix.n_rows * page->matrix.info.row_stride;
+page_ci(page_buffer.data(), page->NumSymbols());
+size_t num_elements = page->n_rows * page->row_stride;
 for (size_t i = 0; i < num_elements; i++) {
 EXPECT_EQ(ci[i + offset], page_ci[i]);
 }

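The comparison above walks the compressed gidx buffers with common::CompressedIterator, which decodes fixed-width symbols (bin indices) from a packed byte buffer. A small host-side round trip, assuming the companion CompressedBufferWriter keeps its CalculateBufferSize/Write interface; the include path and values are illustrative only:

#include <vector>
#include "../../src/common/compressed_iterator.h"  // illustrative path

void CompressedRoundTrip() {
  const size_t kNumSymbols = 256;  // number of distinct bin ids (NumSymbols() above)
  std::vector<int> input = {0, 7, 255, 42};
  std::vector<xgboost::common::CompressedByteT> buffer(
      xgboost::common::CompressedBufferWriter::CalculateBufferSize(input.size(), kNumSymbols));
  xgboost::common::CompressedBufferWriter writer(kNumSymbols);
  writer.Write(buffer.data(), input.begin(), input.end());
  xgboost::common::CompressedIterator<int> it(buffer.data(), kNumSymbols);
  // it[i] decodes the i-th symbol, the same way ci[i + offset] and page_ci[i]
  // are compared element by element in the test above.
}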
@ -27,7 +27,7 @@ void TestDeterminsticHistogram() {
 gpair.SetDevice(0);

 auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
-BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
+BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
 d_histogram, rounding, true);

 for (size_t i = 0; i < kRounds; ++i) {

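These tests drive the histogram kernel through the call shape shown above and then check that repeated builds are bit-identical. A commented sketch of that determinism check, reusing the names from the hunk (d_new_histogram is a stand-in for the second output buffer):

// Build a reference histogram, then rebuild with the same inputs and rounding factor.
auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
                       d_histogram, rounding, true);      // reference run
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
                       d_new_histogram, rounding, true);  // repeat run
// With a fixed rounding factor the two histograms must match exactly, which is
// what the per-bin comparison loop in the following hunk verifies.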
@ -35,7 +35,7 @@ void TestDeterminsticHistogram() {
 auto d_histogram = dh::ToSpan(new_histogram);

 auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
-BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
+BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
 d_histogram, rounding, true);

 for (size_t j = 0; j < new_histogram.size(); ++j) {

@ -50,7 +50,7 @@ void TestDeterminsticHistogram() {
 auto gpair = GenerateRandomGradients(kRows, kLower, kUpper);
 gpair.SetDevice(0);
 dh::device_vector<Gradient> baseline(kBins * kCols);
-BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
+BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
 dh::ToSpan(baseline), rounding, true);
 for (size_t i = 0; i < baseline.size(); ++i) {
 EXPECT_NEAR(((Gradient)baseline[i]).GetGrad(), ((Gradient)histogram[i]).GetGrad(),

@ -97,12 +97,8 @@ void TestBuildHist(bool use_shared_memory_histograms) {
 }
 gpair.SetDevice(0);

-thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.size());
+thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.HostVector());

-common::CompressedByteT* d_gidx_buffer_ptr = page->gidx_buffer.data();
-dh::safe_cuda(cudaMemcpy(h_gidx_buffer.data(), d_gidx_buffer_ptr,
-sizeof(common::CompressedByteT) * page->gidx_buffer.size(),
-cudaMemcpyDeviceToHost));

 maker.row_partitioner.reset(new RowPartitioner(0, kNRows));
 maker.hist.AllocateHistogram(0);

@ -196,15 +192,10 @@ TEST(GpuHist, EvaluateSplits) {
 auto cmat = GetHostCutMatrix();

 // Copy cut matrix to device.
-maker.ba.Allocate(0,
-&(page->matrix.info.feature_segments), cmat.Ptrs().size(),
-&(page->matrix.info.min_fvalue), cmat.MinValues().size(),
-&(page->matrix.info.gidx_fvalue_map), 24,
-&(maker.monotone_constraints), kNCols);
-dh::CopyVectorToDeviceSpan(page->matrix.info.feature_segments, cmat.Ptrs());
-dh::CopyVectorToDeviceSpan(page->matrix.info.gidx_fvalue_map, cmat.Values());
-dh::CopyVectorToDeviceSpan(maker.monotone_constraints, param.monotone_constraints);
-dh::CopyVectorToDeviceSpan(page->matrix.info.min_fvalue, cmat.MinValues());
+page->cuts_ = cmat;
+maker.ba.Allocate(0, &(maker.monotone_constraints), kNCols);
+dh::CopyVectorToDeviceSpan(maker.monotone_constraints,
+param.monotone_constraints);

 // Initialize GPUHistMakerDevice::hist
 maker.hist.Init(0, (max_bins - 1) * kNCols);

@ -274,15 +265,13 @@ void TestHistogramIndexImpl() {
 // Extract the device maker from the histogram makers and from that its compressed
 // histogram index
 const auto &maker = hist_maker.maker;
-std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.size());
-dh::CopyDeviceSpanToVector(&h_gidx_buffer, maker->page->gidx_buffer);
+std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.HostVector());

 const auto &maker_ext = hist_maker_ext.maker;
-std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.size());
-dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, maker_ext->page->gidx_buffer);
+std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector());

-ASSERT_EQ(maker->page->matrix.info.n_bins, maker_ext->page->matrix.info.n_bins);
-ASSERT_EQ(maker->page->gidx_buffer.size(), maker_ext->page->gidx_buffer.size());
+ASSERT_EQ(maker->page->cuts_.TotalBins(), maker_ext->page->cuts_.TotalBins());
+ASSERT_EQ(maker->page->gidx_buffer.Size(), maker_ext->page->gidx_buffer.Size());
 }