diff --git a/src/common/compressed_iterator.h b/src/common/compressed_iterator.h index 1c5543e43..64702f2a4 100644 --- a/src/common/compressed_iterator.h +++ b/src/common/compressed_iterator.h @@ -181,13 +181,13 @@ class CompressedIterator { typedef value_type reference; // NOLINT private: - CompressedByteT *buffer_; + const CompressedByteT *buffer_; size_t symbol_bits_; size_t offset_; public: CompressedIterator() : buffer_(nullptr), symbol_bits_(0), offset_(0) {} - CompressedIterator(CompressedByteT *buffer, size_t num_symbols) + CompressedIterator(const CompressedByteT *buffer, size_t num_symbols) : buffer_(buffer), offset_(0) { symbol_bits_ = detail::SymbolBits(num_symbols); } diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index b57ff5e4b..8fc829834 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -31,7 +31,7 @@ namespace common { HistogramCuts::HistogramCuts() { monitor_.Init(__FUNCTION__); - cut_ptrs_.emplace_back(0); + cut_ptrs_.HostVector().emplace_back(0); } // Dispatch to specific builder. @@ -52,7 +52,7 @@ void HistogramCuts::Build(DMatrix* dmat, uint32_t const max_num_bins) { DenseCuts cuts(this); cuts.Build(dmat, max_num_bins); } - LOG(INFO) << "Total number of hist bins: " << cut_ptrs_.back(); + LOG(INFO) << "Total number of hist bins: " << cut_ptrs_.HostVector().back(); } bool CutsBuilder::UseGroup(DMatrix* dmat) { @@ -75,7 +75,10 @@ void SparseCuts::SingleThreadBuild(SparsePage const& page, MetaInfo const& info, // Data groups, used in ranking. std::vector const& group_ptr = info.group_ptr_; - p_cuts_->min_vals_.resize(end_col - beg_col, 0); + auto &local_min_vals = p_cuts_->min_vals_.HostVector(); + auto &local_cuts = p_cuts_->cut_values_.HostVector(); + auto &local_ptrs = p_cuts_->cut_ptrs_.HostVector(); + local_min_vals.resize(end_col - beg_col, 0); for (uint32_t col_id = beg_col; col_id < page.Size() && col_id < end_col; ++col_id) { // Using a local variable makes things easier, but at the cost of memory trashing. @@ -85,7 +88,7 @@ void SparseCuts::SingleThreadBuild(SparsePage const& page, MetaInfo const& info, max_num_bins); if (n_bins == 0) { // cut_ptrs_ is initialized with a zero, so there's always an element at the back - p_cuts_->cut_ptrs_.emplace_back(p_cuts_->cut_ptrs_.back()); + local_ptrs.emplace_back(local_ptrs.back()); continue; } @@ -112,17 +115,17 @@ void SparseCuts::SingleThreadBuild(SparsePage const& page, MetaInfo const& info, // Can be use data[1] as the min values so that we don't need to // store another array? float mval = summary.data[0].value; - p_cuts_->min_vals_[col_id - beg_col] = mval - (fabs(mval) + 1e-5); + local_min_vals[col_id - beg_col] = mval - (fabs(mval) + 1e-5); this->AddCutPoint(summary, max_num_bins); bst_float cpt = (summary.size > 0) ? 
summary.data[summary.size - 1].value : - p_cuts_->min_vals_[col_id - beg_col]; + local_min_vals[col_id - beg_col]; cpt += fabs(cpt) + 1e-5; - p_cuts_->cut_values_.emplace_back(cpt); + local_cuts.emplace_back(cpt); - p_cuts_->cut_ptrs_.emplace_back(p_cuts_->cut_values_.size()); + local_ptrs.emplace_back(local_cuts.size()); } } @@ -196,33 +199,40 @@ void SparseCuts::Concat( std::vector> const& cuts, uint32_t n_cols) { monitor_.Start(__FUNCTION__); uint32_t nthreads = omp_get_max_threads(); - p_cuts_->min_vals_.resize(n_cols, std::numeric_limits::max()); + auto &local_min_vals = p_cuts_->min_vals_.HostVector(); + auto &local_cuts = p_cuts_->cut_values_.HostVector(); + auto &local_ptrs = p_cuts_->cut_ptrs_.HostVector(); + local_min_vals.resize(n_cols, std::numeric_limits::max()); size_t min_vals_tail = 0; for (uint32_t t = 0; t < nthreads; ++t) { + auto& thread_min_vals = cuts[t]->p_cuts_->min_vals_.HostVector(); + auto& thread_cuts = cuts[t]->p_cuts_->cut_values_.HostVector(); + auto& thread_ptrs = cuts[t]->p_cuts_->cut_ptrs_.HostVector(); + // concat csc pointers. - size_t const old_ptr_size = p_cuts_->cut_ptrs_.size(); - p_cuts_->cut_ptrs_.resize( - cuts[t]->p_cuts_->cut_ptrs_.size() + p_cuts_->cut_ptrs_.size() - 1); - size_t const new_icp_size = p_cuts_->cut_ptrs_.size(); - auto tail = p_cuts_->cut_ptrs_[old_ptr_size-1]; + size_t const old_ptr_size = local_ptrs.size(); + local_ptrs.resize( + thread_ptrs.size() + local_ptrs.size() - 1); + size_t const new_icp_size = local_ptrs.size(); + auto tail = local_ptrs[old_ptr_size-1]; for (size_t j = old_ptr_size; j < new_icp_size; ++j) { - p_cuts_->cut_ptrs_[j] = tail + cuts[t]->p_cuts_->cut_ptrs_[j-old_ptr_size+1]; + local_ptrs[j] = tail + thread_ptrs[j-old_ptr_size+1]; } // concat csc values - size_t const old_iv_size = p_cuts_->cut_values_.size(); - p_cuts_->cut_values_.resize( - cuts[t]->p_cuts_->cut_values_.size() + p_cuts_->cut_values_.size()); - size_t const new_iv_size = p_cuts_->cut_values_.size(); + size_t const old_iv_size = local_cuts.size(); + local_cuts.resize( + thread_cuts.size() + local_cuts.size()); + size_t const new_iv_size = local_cuts.size(); for (size_t j = old_iv_size; j < new_iv_size; ++j) { - p_cuts_->cut_values_[j] = cuts[t]->p_cuts_->cut_values_[j-old_iv_size]; + local_cuts[j] = thread_cuts[j-old_iv_size]; } // merge min values - for (size_t j = 0; j < cuts[t]->p_cuts_->min_vals_.size(); ++j) { - p_cuts_->min_vals_.at(min_vals_tail + j) = - std::min(p_cuts_->min_vals_.at(min_vals_tail + j), cuts.at(t)->p_cuts_->min_vals_.at(j)); + for (size_t j = 0; j < thread_min_vals.size(); ++j) { + local_min_vals.at(min_vals_tail + j) = + std::min(local_min_vals.at(min_vals_tail + j), thread_min_vals.at(j)); } - min_vals_tail += cuts[t]->p_cuts_->min_vals_.size(); + min_vals_tail += thread_min_vals.size(); } monitor_.Stop(__FUNCTION__); } @@ -323,27 +333,27 @@ void DenseCuts::Init // TODO(chenqin): rabit failure recovery assumes no boostrap onetime call after loadcheckpoint // we need to move this allreduce before loadcheckpoint call in future sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size()); - p_cuts_->min_vals_.resize(sketchs.size()); + p_cuts_->min_vals_.HostVector().resize(sketchs.size()); for (size_t fid = 0; fid < summary_array.size(); ++fid) { WQSketch::SummaryContainer a; a.Reserve(max_num_bins + 1); a.SetPrune(summary_array[fid], max_num_bins + 1); const bst_float mval = a.data[0].value; - p_cuts_->min_vals_[fid] = mval - (fabs(mval) + 1e-5); + p_cuts_->min_vals_.HostVector()[fid] = mval - 
(fabs(mval) + 1e-5); AddCutPoint(a, max_num_bins); // push a value that is greater than anything const bst_float cpt - = (a.size > 0) ? a.data[a.size - 1].value : p_cuts_->min_vals_[fid]; + = (a.size > 0) ? a.data[a.size - 1].value : p_cuts_->min_vals_.HostVector()[fid]; // this must be bigger than last value in a scale const bst_float last = cpt + (fabs(cpt) + 1e-5); - p_cuts_->cut_values_.push_back(last); + p_cuts_->cut_values_.HostVector().push_back(last); // Ensure that every feature gets at least one quantile point - CHECK_LE(p_cuts_->cut_values_.size(), std::numeric_limits::max()); - auto cut_size = static_cast(p_cuts_->cut_values_.size()); - CHECK_GT(cut_size, p_cuts_->cut_ptrs_.back()); - p_cuts_->cut_ptrs_.push_back(cut_size); + CHECK_LE(p_cuts_->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max()); + auto cut_size = static_cast<uint32_t>(p_cuts_->cut_values_.HostVector().size()); + CHECK_GT(cut_size, p_cuts_->cut_ptrs_.HostVector().back()); + p_cuts_->cut_ptrs_.HostVector().push_back(cut_size); } monitor_.Stop(__func__); } diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 2c3cb3d4a..3a1dca0ab 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -44,17 +44,35 @@ class HistogramCuts { using BinIdx = uint32_t; common::Monitor monitor_; - std::vector cut_values_; - std::vector cut_ptrs_; - std::vector min_vals_; // storing minimum value in a sketch set. - public: + HostDeviceVector<float> cut_values_; + HostDeviceVector<uint32_t> cut_ptrs_; + HostDeviceVector<float> min_vals_; // storing minimum value in a sketch set. + HistogramCuts(); - HistogramCuts(HistogramCuts const& that) = delete; + HistogramCuts(HistogramCuts const& that) { + cut_values_.Resize(that.cut_values_.Size()); + cut_ptrs_.Resize(that.cut_ptrs_.Size()); + min_vals_.Resize(that.min_vals_.Size()); + cut_values_.Copy(that.cut_values_); + cut_ptrs_.Copy(that.cut_ptrs_); + min_vals_.Copy(that.min_vals_); + } + HistogramCuts(HistogramCuts&& that) noexcept(true) { *this = std::forward(that); } - HistogramCuts& operator=(HistogramCuts const& that) = delete; + + HistogramCuts& operator=(HistogramCuts const& that) { + cut_values_.Resize(that.cut_values_.Size()); + cut_ptrs_.Resize(that.cut_ptrs_.Size()); + min_vals_.Resize(that.min_vals_.Size()); + cut_values_.Copy(that.cut_values_); + cut_ptrs_.Copy(that.cut_ptrs_); + min_vals_.Copy(that.min_vals_); + return *this; + } + HistogramCuts& operator=(HistogramCuts&& that) noexcept(true) { monitor_ = std::move(that.monitor_); cut_ptrs_ = std::move(that.cut_ptrs_); @@ -67,28 +85,30 @@ class HistogramCuts { void Build(DMatrix* dmat, uint32_t const max_num_bins); /* \brief How many bins a feature has. */ uint32_t FeatureBins(uint32_t feature) const { - return cut_ptrs_.at(feature+1) - cut_ptrs_[feature]; + return cut_ptrs_.ConstHostVector().at(feature + 1) - + cut_ptrs_.ConstHostVector()[feature]; } // Getters. Cuts should be of no use after building histogram indices, but currently // it's deeply linked with quantile_hist, gpu sketcher and gpu_hist. So we preserve // these for now.
- std::vector const& Ptrs() const { return cut_ptrs_; } - std::vector const& Values() const { return cut_values_; } - std::vector const& MinValues() const { return min_vals_; } + std::vector const& Ptrs() const { return cut_ptrs_.ConstHostVector(); } + std::vector const& Values() const { return cut_values_.ConstHostVector(); } + std::vector const& MinValues() const { return min_vals_.ConstHostVector(); } - size_t TotalBins() const { return cut_ptrs_.back(); } + size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); } // Return the index of a cut point that is strictly greater than the input // value, or the last available index if none exists BinIdx SearchBin(float value, uint32_t column_id) const { - auto beg = cut_ptrs_.at(column_id); - auto end = cut_ptrs_.at(column_id + 1); - auto it = std::upper_bound(cut_values_.cbegin() + beg, cut_values_.cbegin() + end, value); - if (it == cut_values_.cend()) { - it = cut_values_.cend() - 1; + auto beg = cut_ptrs_.ConstHostVector().at(column_id); + auto end = cut_ptrs_.ConstHostVector().at(column_id + 1); + const auto &values = cut_values_.ConstHostVector(); + auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value); + if (it == values.cend()) { + it = values.cend() - 1; } - BinIdx idx = it - cut_values_.cbegin(); + BinIdx idx = it - values.cbegin(); return idx; } @@ -133,8 +153,8 @@ class CutsBuilder { size_t required_cuts = std::min(summary.size, static_cast(max_bin)); for (size_t i = 1; i < required_cuts; ++i) { bst_float cpt = summary.data[i].value; - if (i == 1 || cpt > p_cuts_->cut_values_.back()) { - p_cuts_->cut_values_.push_back(cpt); + if (i == 1 || cpt > p_cuts_->cut_values_.ConstHostVector().back()) { + p_cuts_->cut_values_.HostVector().push_back(cpt); } } } diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 6180b6312..3ffcf757e 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -371,6 +371,7 @@ void HostDeviceVector::Resize(size_t new_size, T v) { template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_node_t +template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_row_t template class HostDeviceVector; // bst_feature_t diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc index ed96007ba..6c7333b4a 100644 --- a/src/data/ellpack_page.cc +++ b/src/data/ellpack_page.cc @@ -13,11 +13,24 @@ class EllpackPageImpl {}; EllpackPage::EllpackPage() = default; EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param) { - LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required"; + LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but " + "EllpackPage is required"; } EllpackPage::~EllpackPage() { - LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required"; + LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but " + "EllpackPage is required"; +} + +void EllpackPage::SetBaseRowId(size_t row_id) { + LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but " + "EllpackPage is required"; +} + +size_t EllpackPage::Size() const { + LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but " + "EllpackPage is required"; + return 0; } } // namespace xgboost diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index ccff62bea..ae361a7d8 100644 --- a/src/data/ellpack_page.cu +++ 
b/src/data/ellpack_page.cu @@ -4,9 +4,9 @@ #include -#include "./ellpack_page.cuh" #include "../common/hist_util.h" #include "../common/random.h" +#include "./ellpack_page.cuh" namespace xgboost { @@ -17,13 +17,9 @@ EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param) EllpackPage::~EllpackPage() = default; -size_t EllpackPage::Size() const { - return impl_->Size(); -} +size_t EllpackPage::Size() const { return impl_->Size(); } -void EllpackPage::SetBaseRowId(size_t row_id) { - impl_->SetBaseRowId(row_id); -} +void EllpackPage::SetBaseRowId(size_t row_id) { impl_->SetBaseRowId(row_id); } // Bin each input data entry, store the bin indices in compressed form. __global__ void CompressBinEllpackKernel( @@ -65,16 +61,18 @@ __global__ void CompressBinEllpackKernel( } // Construct an ELLPACK matrix with the given number of empty rows. -EllpackPageImpl::EllpackPageImpl(int device, EllpackInfo info, size_t n_rows) { +EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, + bool is_dense, size_t row_stride, + size_t n_rows) + : is_dense(is_dense), + cuts_(std::move(cuts)), + row_stride(row_stride), + n_rows(n_rows) { monitor_.Init("ellpack_page"); dh::safe_cuda(cudaSetDevice(device)); - matrix.info = info; - matrix.base_rowid = 0; - matrix.n_rows = n_rows; - monitor_.StartCuda("InitCompressedData"); - InitCompressedData(device, n_rows); + InitCompressedData(device); monitor_.StopCuda("InitCompressedData"); } @@ -93,33 +91,27 @@ size_t GetRowStride(DMatrix* dmat) { } // Construct an ELLPACK matrix in memory. -EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) { +EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) + : is_dense(dmat->IsDense()) { monitor_.Init("ellpack_page"); dh::safe_cuda(cudaSetDevice(param.gpu_id)); - matrix.n_rows = dmat->Info().num_row_; + n_rows = dmat->Info().num_row_; monitor_.StartCuda("Quantiles"); // Create the quantile sketches for the dmatrix and initialize HistogramCuts. 
- size_t row_stride = GetRowStride(dmat); - auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin, + row_stride = GetRowStride(dmat); + cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin, param.gpu_batch_nrows); monitor_.StopCuda("Quantiles"); - monitor_.StartCuda("InitEllpackInfo"); - InitInfo(param.gpu_id, dmat->IsDense(), row_stride, cuts); - monitor_.StopCuda("InitEllpackInfo"); - monitor_.StartCuda("InitCompressedData"); - InitCompressedData(param.gpu_id, dmat->Info().num_row_); + InitCompressedData(param.gpu_id); monitor_.StopCuda("InitCompressedData"); monitor_.StartCuda("BinningCompression"); - DeviceHistogramBuilderState hist_builder_row_state(dmat->Info().num_row_); for (const auto& batch : dmat->GetBatches()) { - hist_builder_row_state.BeginBatch(batch); - CreateHistIndices(param.gpu_id, batch, hist_builder_row_state.GetRowStateOnDevice()); - hist_builder_row_state.EndBatch(); + CreateHistIndices(param.gpu_id, batch); } monitor_.StopCuda("BinningCompression"); } @@ -133,23 +125,26 @@ struct CopyPage { size_t offset; CopyPage(EllpackPageImpl* dst, EllpackPageImpl* src, size_t offset) - : cbw{dst->matrix.info.NumSymbols()}, - dst_data_d{dst->gidx_buffer.data()}, - src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()}, + : cbw{dst->NumSymbols()}, + dst_data_d{dst->gidx_buffer.DevicePointer()}, + src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()}, offset(offset) {} __device__ void operator()(size_t element_id) { - cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset); + cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], + element_id + offset); } }; // Copy the data from the given EllpackPage to the current page. size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) { monitor_.StartCuda("Copy"); - size_t num_elements = page->matrix.n_rows * page->matrix.info.row_stride; - CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride); - CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols()); - CHECK_GE(matrix.n_rows * matrix.info.row_stride, offset + num_elements); + size_t num_elements = page->n_rows * page->row_stride; + CHECK_EQ(row_stride, page->row_stride); + CHECK_EQ(NumSymbols(), page->NumSymbols()); + CHECK_GE(n_rows * row_stride, offset + num_elements); + gidx_buffer.SetDevice(device); + page->gidx_buffer.SetDevice(device); dh::LaunchN(device, num_elements, CopyPage(this, page, offset)); monitor_.StopCuda("Copy"); return num_elements; @@ -160,26 +155,29 @@ struct CompactPage { common::CompressedBufferWriter cbw; common::CompressedByteT* dst_data_d; common::CompressedIterator src_iterator_d; - /*! \brief An array that maps the rows from the full DMatrix to the compacted page. + /*! \brief An array that maps the rows from the full DMatrix to the compacted + * page. * - * The total size is the number of rows in the original, uncompacted DMatrix. Elements are the - * row ids in the compacted page. Rows not needed are set to SIZE_MAX. + * The total size is the number of rows in the original, uncompacted DMatrix. + * Elements are the row ids in the compacted page. Rows not needed are set to + * SIZE_MAX. 
* * An example compacting 16 rows to 8 rows: - * [SIZE_MAX, 0, 1, SIZE_MAX, SIZE_MAX, 2, SIZE_MAX, 3, 4, 5, SIZE_MAX, 6, SIZE_MAX, 7, SIZE_MAX, - * SIZE_MAX] + * [SIZE_MAX, 0, 1, SIZE_MAX, SIZE_MAX, 2, SIZE_MAX, 3, 4, 5, SIZE_MAX, 6, + * SIZE_MAX, 7, SIZE_MAX, SIZE_MAX] */ common::Span row_indexes; size_t base_rowid; size_t row_stride; - CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src, common::Span row_indexes) - : cbw{dst->matrix.info.NumSymbols()}, - dst_data_d{dst->gidx_buffer.data()}, - src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()}, + CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src, + common::Span row_indexes) + : cbw{dst->NumSymbols()}, + dst_data_d{dst->gidx_buffer.DevicePointer()}, + src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()}, row_indexes(row_indexes), - base_rowid{src->matrix.base_rowid}, - row_stride{src->matrix.info.row_stride} {} + base_rowid{src->base_rowid}, + row_stride{src->row_stride} {} __device__ void operator()(size_t row_id) { size_t src_row = base_rowid + row_id; @@ -188,100 +186,72 @@ struct CompactPage { size_t dst_offset = dst_row * row_stride; size_t src_offset = row_id * row_stride; for (size_t j = 0; j < row_stride; j++) { - cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], dst_offset + j); + cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], + dst_offset + j); } } }; // Compacts the data from the given EllpackPage into the current page. -void EllpackPageImpl::Compact(int device, EllpackPageImpl* page, common::Span row_indexes) { +void EllpackPageImpl::Compact(int device, EllpackPageImpl* page, + common::Span row_indexes) { monitor_.StartCuda("Compact"); - CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride); - CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols()); - CHECK_LE(page->matrix.base_rowid + page->matrix.n_rows, row_indexes.size()); - dh::LaunchN(device, page->matrix.n_rows, CompactPage(this, page, row_indexes)); + CHECK_EQ(row_stride, page->row_stride); + CHECK_EQ(NumSymbols(), page->NumSymbols()); + CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size()); + gidx_buffer.SetDevice(device); + page->gidx_buffer.SetDevice(device); + dh::LaunchN(device, page->n_rows, CompactPage(this, page, row_indexes)); monitor_.StopCuda("Compact"); } -// Construct an EllpackInfo based on histogram cuts of features. -EllpackInfo::EllpackInfo(int device, - bool is_dense, - size_t row_stride, - const common::HistogramCuts& hmat, - dh::BulkAllocator* ba) - : is_dense(is_dense), row_stride(row_stride), n_bins(hmat.Ptrs().back()) { - - ba->Allocate(device, - &feature_segments, hmat.Ptrs().size(), - &gidx_fvalue_map, hmat.Values().size(), - &min_fvalue, hmat.MinValues().size()); - dh::CopyVectorToDeviceSpan(gidx_fvalue_map, hmat.Values()); - dh::CopyVectorToDeviceSpan(min_fvalue, hmat.MinValues()); - dh::CopyVectorToDeviceSpan(feature_segments, hmat.Ptrs()); -} - -// Initialize the EllpackInfo for this page. -void EllpackPageImpl::InitInfo(int device, - bool is_dense, - size_t row_stride, - const common::HistogramCuts& hmat) { - matrix.info = EllpackInfo(device, is_dense, row_stride, hmat, &ba_); -} - // Initialize the buffer to stored compressed features. -void EllpackPageImpl::InitCompressedData(int device, size_t num_rows) { - size_t num_symbols = matrix.info.NumSymbols(); +void EllpackPageImpl::InitCompressedData(int device) { + size_t num_symbols = NumSymbols(); // Required buffer size for storing data matrix in ELLPack format. 
- size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize( - matrix.info.row_stride * num_rows, num_symbols); - ba_.Allocate(device, &gidx_buffer, compressed_size_bytes); - - thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0); - - matrix.gidx_iter = common::CompressedIterator(gidx_buffer.data(), num_symbols); + size_t compressed_size_bytes = + common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, + num_symbols); + gidx_buffer.SetDevice(device); + // Don't call fill unnecessarily + if (gidx_buffer.Size() == 0) { + gidx_buffer.Resize(compressed_size_bytes, 0); + } else { + gidx_buffer.Resize(compressed_size_bytes, 0); + thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0); + } } // Compress a CSR page into ELLPACK. void EllpackPageImpl::CreateHistIndices(int device, - const SparsePage& row_batch, - const RowStateOnDevice& device_row_state) { - // Has any been allocated for me in this batch? - if (!device_row_state.rows_to_process_from_batch) return; - - unsigned int null_gidx_value = matrix.info.n_bins; - size_t row_stride = matrix.info.row_stride; + const SparsePage& row_batch) { + if (row_batch.Size() == 0) return; + unsigned int null_gidx_value = NumSymbols() - 1; const auto& offset_vec = row_batch.offset.ConstHostVector(); // bin and compress entries in batches of rows - size_t gpu_batch_nrows = std::min( - dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)), - static_cast(device_row_state.rows_to_process_from_batch)); + size_t gpu_batch_nrows = + std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)), + static_cast(row_batch.Size())); const std::vector& data_vec = row_batch.data.ConstHostVector(); - size_t gpu_nbatches = common::DivRoundUp(device_row_state.rows_to_process_from_batch, - gpu_batch_nrows); + size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows); for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) { size_t batch_row_begin = gpu_batch * gpu_batch_nrows; - size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows; - if (batch_row_end > device_row_state.rows_to_process_from_batch) { - batch_row_end = device_row_state.rows_to_process_from_batch; - } + size_t batch_row_end = + std::min((gpu_batch + 1) * gpu_batch_nrows, row_batch.Size()); size_t batch_nrows = batch_row_end - batch_row_begin; - const auto ent_cnt_begin = - offset_vec[device_row_state.row_offset_in_current_batch + batch_row_begin]; - const auto ent_cnt_end = - offset_vec[device_row_state.row_offset_in_current_batch + batch_row_end]; + const auto ent_cnt_begin = offset_vec[batch_row_begin]; + const auto ent_cnt_end = offset_vec[batch_row_end]; /*! \brief row offset in SparsePage (the input data). */ dh::device_vector row_ptrs(batch_nrows + 1); - thrust::copy( - offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_begin, - offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_end + 1, - row_ptrs.begin()); + thrust::copy(offset_vec.data() + batch_row_begin, + offset_vec.data() + batch_row_end + 1, row_ptrs.begin()); // number of entries in this batch. size_t n_entries = ent_cnt_end - ent_cnt_begin; @@ -289,97 +259,50 @@ void EllpackPageImpl::CreateHistIndices(int device, // copy data entries to device. 
dh::safe_cuda(cudaMemcpy(entries_d.data().get(), data_vec.data() + ent_cnt_begin, - n_entries * sizeof(Entry), - cudaMemcpyDefault)); + n_entries * sizeof(Entry), cudaMemcpyDefault)); const dim3 block3(32, 8, 1); // 256 threads const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x), - common::DivRoundUp(row_stride, block3.y), - 1); - dh::LaunchKernel {grid3, block3} ( - CompressBinEllpackKernel, - common::CompressedBufferWriter(matrix.info.NumSymbols()), - gidx_buffer.data(), - row_ptrs.data().get(), - entries_d.data().get(), - matrix.info.gidx_fvalue_map.data(), - matrix.info.feature_segments.data(), - device_row_state.total_rows_processed + batch_row_begin, - batch_nrows, - row_stride, + common::DivRoundUp(row_stride, block3.y), 1); + auto device_accessor = GetDeviceAccessor(device); + dh::LaunchKernel {grid3, block3}( + CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()), + gidx_buffer.DevicePointer(), row_ptrs.data().get(), + entries_d.data().get(), device_accessor.gidx_fvalue_map.data(), + device_accessor.feature_segments.data(), + row_batch.base_rowid + batch_row_begin, batch_nrows, row_stride, null_gidx_value); } } // Return the number of rows contained in this page. -size_t EllpackPageImpl::Size() const { - return matrix.n_rows; -} - -// Clear the current page. -void EllpackPageImpl::Clear() { - ba_.Clear(); - gidx_buffer = {}; - idx_buffer.clear(); - sparse_page_.Clear(); - matrix.base_rowid = 0; - matrix.n_rows = 0; - device_initialized_ = false; -} - -// Push a CSR page to the current page. -// -// The CSR pages are accumulated in memory until they reach a certain size, then written out as -// compressed ELLPACK. -void EllpackPageImpl::Push(int device, const SparsePage& batch) { - sparse_page_.Push(batch); - matrix.n_rows += batch.Size(); -} - -// Compress the accumulated SparsePage. -void EllpackPageImpl::CompressSparsePage(int device) { - monitor_.StartCuda("InitCompressedData"); - InitCompressedData(device, matrix.n_rows); - monitor_.StopCuda("InitCompressedData"); - - monitor_.StartCuda("BinningCompression"); - DeviceHistogramBuilderState hist_builder_row_state(matrix.n_rows); - hist_builder_row_state.BeginBatch(sparse_page_); - CreateHistIndices(device, sparse_page_, hist_builder_row_state.GetRowStateOnDevice()); - hist_builder_row_state.EndBatch(); - monitor_.StopCuda("BinningCompression"); - - monitor_.StartCuda("CopyDeviceToHost"); - idx_buffer.resize(gidx_buffer.size()); - dh::CopyDeviceSpanToVector(&idx_buffer, gidx_buffer); - ba_.Clear(); - gidx_buffer = {}; - monitor_.StopCuda("CopyDeviceToHost"); -} +size_t EllpackPageImpl::Size() const { return n_rows; } // Return the memory cost for storing the compressed features. -size_t EllpackPageImpl::MemCostBytes() const { - // Required buffer size for storing data matrix in ELLPack format. - size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize( - matrix.info.row_stride * matrix.n_rows, matrix.info.NumSymbols()); +size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride, + const common::HistogramCuts& cuts) { + // Required buffer size for storing data matrix in ELLPack format. + size_t compressed_size_bytes = + common::CompressedBufferWriter::CalculateBufferSize(row_stride * num_rows, + cuts.TotalBins() + 1); return compressed_size_bytes; } -// Copy the compressed features to GPU.
-void EllpackPageImpl::InitDevice(int device, EllpackInfo info) { - if (device_initialized_) return; +EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(int device) const { + gidx_buffer.SetDevice(device); + return EllpackDeviceAccessor( + device, cuts_, is_dense, row_stride, base_rowid, n_rows, + common::CompressedIterator(gidx_buffer.ConstDevicePointer(), + NumSymbols())); +} - monitor_.StartCuda("CopyPageToDevice"); - dh::safe_cuda(cudaSetDevice(device)); - - gidx_buffer = {}; - ba_.Allocate(device, &gidx_buffer, idx_buffer.size()); - dh::CopyVectorToDeviceSpan(gidx_buffer, idx_buffer); - - matrix.info = info; - matrix.gidx_iter = common::CompressedIterator(gidx_buffer.data(), info.n_bins + 1); - - monitor_.StopCuda("CopyPageToDevice"); - - device_initialized_ = true; +EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, + const SparsePage& page, bool is_dense, + size_t row_stride) + : cuts_(std::move(cuts)), + is_dense(is_dense), + n_rows(page.Size()), + row_stride(row_stride) { + this->InitCompressedData(device); + this->CreateHistIndices(device, page); } } // namespace xgboost diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index 4d3c7a185..087f91fc9 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -40,71 +40,53 @@ __forceinline__ __device__ int BinarySearchRow( return -1; } -/** \brief Meta information about the ELLPACK matrix. */ -struct EllpackInfo { +/** \brief Struct for accessing and manipulating an ellpack matrix on the + * device. Does not own underlying memory and may be trivially copied into + * kernels.*/ +struct EllpackDeviceAccessor { /*! \brief Whether or not if the matrix is dense. */ bool is_dense; /*! \brief Row length for ELLPack, equal to number of features. */ size_t row_stride; - /*! \brief Total number of bins, also used as the null index value, . */ - size_t n_bins; - /*! \brief Minimum value for each feature. Size equals to number of features. */ - common::Span min_fvalue; - /*! \brief Histogram cut pointers. Size equals to (number of features + 1). */ - common::Span feature_segments; - /*! \brief Histogram cut values. Size equals to (bins per feature * number of features). */ - common::Span gidx_fvalue_map; - - EllpackInfo() = default; - - /*! - * \brief Constructor. - * - * @param device The GPU device to use. - * @param is_dense Whether the matrix is dense. - * @param row_stride The number of features between starts of consecutive rows. - * @param hmat The histogram cuts of all the features. - * @param ba The BulkAllocator that owns the GPU memory. - */ - explicit EllpackInfo(int device, - bool is_dense, - size_t row_stride, - const common::HistogramCuts& hmat, - dh::BulkAllocator* ba); - - /*! \brief Return the total number of symbols (total number of bins plus 1 for not found). */ - size_t NumSymbols() const { - return n_bins + 1; - } - size_t NumFeatures() const { - return min_fvalue.size(); - } -}; - -/** \brief Struct for accessing and manipulating an ellpack matrix on the - * device. Does not own underlying memory and may be trivially copied into - * kernels.*/ -struct EllpackMatrix { - EllpackInfo info; size_t base_rowid{}; size_t n_rows{}; common::CompressedIterator gidx_iter; + /*! \brief Minimum value for each feature. Size equals to number of features. */ + common::Span min_fvalue; + /*! \brief Histogram cut pointers. Size equals to (number of features + 1). */ + common::Span feature_segments; + /*! \brief Histogram cut values. 
Size equals to (bins per feature * number of features). */ + common::Span gidx_fvalue_map; + EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts, + bool is_dense, size_t row_stride, size_t base_rowid, + size_t n_rows,common::CompressedIterator gidx_iter) + : is_dense(is_dense), + row_stride(row_stride), + base_rowid(base_rowid), + n_rows(n_rows) ,gidx_iter(gidx_iter){ + cuts.cut_values_.SetDevice(device); + cuts.cut_ptrs_.SetDevice(device); + cuts.min_vals_.SetDevice(device); + gidx_fvalue_map = cuts.cut_values_.ConstDeviceSpan(); + feature_segments = cuts.cut_ptrs_.ConstDeviceSpan(); + min_fvalue = cuts.min_vals_.ConstDeviceSpan(); + } // Get a matrix element, uses binary search for look up Return NaN if missing // Given a row index and a feature index, returns the corresponding cut value __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const { ridx -= base_rowid; - auto row_begin = info.row_stride * ridx; - auto row_end = row_begin + info.row_stride; + auto row_begin = row_stride * ridx; + auto row_end = row_begin + row_stride; auto gidx = -1; - if (info.is_dense) { + if (is_dense) { gidx = gidx_iter[row_begin + fidx]; } else { gidx = BinarySearchRow(row_begin, row_end, gidx_iter, - info.feature_segments[fidx], - info.feature_segments[fidx + 1]); + feature_segments[fidx], + feature_segments[fidx + 1]); } return gidx; } @@ -113,97 +95,27 @@ struct EllpackMatrix { if (gidx == -1) { return nan(""); } - return info.gidx_fvalue_map[gidx]; + return gidx_fvalue_map[gidx]; } // Check if the row id is withing range of the current batch. __device__ bool IsInRange(size_t row_id) const { return row_id >= base_rowid && row_id < base_rowid + n_rows; } + /*! \brief Return the total number of symbols (total number of bins plus 1 for + * not found). */ + size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; } + + size_t NullValue() const { return gidx_fvalue_map.size(); } + + XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); } + + XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); } }; -// Instances of this type are created while creating the histogram bins for the -// entire dataset across multiple sparse page batches. This keeps track of the number -// of rows to process from a batch and the position from which to process on each device. 
-struct RowStateOnDevice { - // Number of rows assigned to this device - size_t total_rows_assigned_to_device; - // Number of rows processed thus far - size_t total_rows_processed; - // Number of rows to process from the current sparse page batch - size_t rows_to_process_from_batch; - // Offset from the current sparse page batch to begin processing - size_t row_offset_in_current_batch; - - explicit RowStateOnDevice(size_t total_rows) - : total_rows_assigned_to_device(total_rows), total_rows_processed(0), - rows_to_process_from_batch(0), row_offset_in_current_batch(0) { - } - - explicit RowStateOnDevice(size_t total_rows, size_t batch_rows) - : total_rows_assigned_to_device(total_rows), total_rows_processed(0), - rows_to_process_from_batch(batch_rows), row_offset_in_current_batch(0) { - } - - // Advance the row state by the number of rows processed - void Advance() { - total_rows_processed += rows_to_process_from_batch; - CHECK_LE(total_rows_processed, total_rows_assigned_to_device); - rows_to_process_from_batch = row_offset_in_current_batch = 0; - } -}; - -// An instance of this type is created which keeps track of total number of rows to process, -// rows processed thus far, rows to process and the offset from the current sparse page batch -// to begin processing on each device -class DeviceHistogramBuilderState { - public: - explicit DeviceHistogramBuilderState(size_t n_rows) : device_row_state_(n_rows) {} - - const RowStateOnDevice& GetRowStateOnDevice() const { - return device_row_state_; - } - - // This method is invoked at the beginning of each sparse page batch. This distributes - // the rows in the sparse page to the device. - // TODO(sriramch): Think of a way to utilize *all* the GPUs to build the compressed bins. - void BeginBatch(const SparsePage &batch) { - size_t rem_rows = batch.Size(); - size_t row_offset_in_current_batch = 0; - - // Do we have anymore left to process from this batch on this device? - if (device_row_state_.total_rows_assigned_to_device > device_row_state_.total_rows_processed) { - // There are still some rows that needs to be assigned to this device - device_row_state_.rows_to_process_from_batch = - std::min( - device_row_state_.total_rows_assigned_to_device - device_row_state_.total_rows_processed, - rem_rows); - } else { - // All rows have been assigned to this device - device_row_state_.rows_to_process_from_batch = 0; - } - - device_row_state_.row_offset_in_current_batch = row_offset_in_current_batch; - row_offset_in_current_batch += device_row_state_.rows_to_process_from_batch; - rem_rows -= device_row_state_.rows_to_process_from_batch; - } - - // This method is invoked after completion of each sparse page batch - void EndBatch() { - device_row_state_.Advance(); - } - - private: - RowStateOnDevice device_row_state_{0}; -}; class EllpackPageImpl { public: - EllpackMatrix matrix; - /*! \brief global index of histogram, which is stored in ELLPack format. */ - common::Span gidx_buffer; - std::vector idx_buffer; - /*! * \brief Default constructor. * @@ -218,7 +130,12 @@ class EllpackPageImpl { * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo * and the given number of rows. */ - explicit EllpackPageImpl(int device, EllpackInfo info, size_t n_rows); + EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense, + size_t row_stride, size_t n_rows); + + EllpackPageImpl(int device, common::HistogramCuts cuts, + const SparsePage& page, + bool is_dense,size_t row_stride); /*! 
* \brief Constructor from an existing DMatrix. @@ -245,77 +162,53 @@ class EllpackPageImpl { */ void Compact(int device, EllpackPageImpl* page, common::Span row_indexes); - /*! - * \brief Initialize the EllpackInfo contained in the EllpackMatrix. - * - * This is used in the in-memory case. The current page owns the BulkAllocator, which in turn owns - * the GPU memory used by the EllpackInfo. - * - * @param device The GPU device to use. - * @param is_dense Whether the matrix is dense. - * @param row_stride The number of features between starts of consecutive rows. - * @param hmat The histogram cuts of all the features. - */ - void InitInfo(int device, bool is_dense, size_t row_stride, const common::HistogramCuts& hmat); - - /*! - * \brief Initialize the buffer to store compressed features. - * - * @param device The GPU device to use. - * @param num_rows The number of rows we are storing in the buffer. - */ - void InitCompressedData(int device, size_t num_rows); - - /*! - * \brief Compress a single page of CSR data into ELLPACK. - * - * @param device The GPU device to use. - * @param row_batch The CSR page. - * @param device_row_state On-device data for maintaining state. - */ - void CreateHistIndices(int device, - const SparsePage& row_batch, - const RowStateOnDevice& device_row_state); /*! \return Number of instances in the page. */ size_t Size() const; /*! \brief Set the base row id for this page. */ - inline void SetBaseRowId(size_t row_id) { - matrix.base_rowid = row_id; + void SetBaseRowId(size_t row_id) { + base_rowid = row_id; } - /*! \brief clear the page. */ - void Clear(); - - /*! - * \brief Push a sparse page. - * \param batch The row page. - */ - void Push(int device, const SparsePage& batch); - /*! \return Estimation of memory cost of this page. */ - size_t MemCostBytes() const; + static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ; - /*! - * \brief Copy the ELLPACK matrix to GPU. - * - * @param device The GPU device to use. - * @param info The EllpackInfo for the matrix. - */ - void InitDevice(int device, EllpackInfo info); - /*! \brief Compress the accumulated SparsePage into ELLPACK format. - * - * @param device The GPU device to use. - */ - void CompressSparsePage(int device); + /*! \brief Return the total number of symbols (total number of bins plus 1 for + * not found). */ + size_t NumSymbols() const { return cuts_.TotalBins() + 1; } + + EllpackDeviceAccessor GetDeviceAccessor(int device) const; private: + /*! + * \brief Compress a single page of CSR data into ELLPACK. + * + * @param device The GPU device to use. + * @param row_batch The CSR page. + */ + void CreateHistIndices(int device, + const SparsePage& row_batch + ); + /*! + * \brief Initialize the buffer to store compressed features. + */ + void InitCompressedData(int device); + + +public: + /*! \brief Whether or not if the matrix is dense. */ + bool is_dense; + /*! \brief Row length for ELLPack. */ + size_t row_stride; + size_t base_rowid{0}; + size_t n_rows{}; + /*! \brief global index of histogram, which is stored in ELLPack format. 
*/ + HostDeviceVector gidx_buffer; + common::HistogramCuts cuts_; +private: common::Monitor monitor_; - dh::BulkAllocator ba_; - bool device_initialized_{false}; - SparsePage sparse_page_{}; }; } // namespace xgboost diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu index b46e35c96..147d8fb4d 100644 --- a/src/data/ellpack_page_raw_format.cu +++ b/src/data/ellpack_page_raw_format.cu @@ -17,26 +17,35 @@ class EllpackPageRawFormat : public SparsePageFormat { public: bool Read(EllpackPage* page, dmlc::SeekStream* fi) override { auto* impl = page->Impl(); - impl->Clear(); - if (!fi->Read(&impl->matrix.n_rows)) return false; - return fi->Read(&impl->idx_buffer); + fi->Read(&impl->cuts_.cut_values_.HostVector()); + fi->Read(&impl->cuts_.cut_ptrs_.HostVector()); + fi->Read(&impl->cuts_.min_vals_.HostVector()); + fi->Read(&impl->n_rows); + fi->Read(&impl->is_dense); + fi->Read(&impl->row_stride); + if (!fi->Read(&impl->gidx_buffer.HostVector())) { + return false; + } + return true; } bool Read(EllpackPage* page, dmlc::SeekStream* fi, const std::vector& sorted_index_set) override { - auto* impl = page->Impl(); - impl->Clear(); - if (!fi->Read(&impl->matrix.n_rows)) return false; - return fi->Read(&page->Impl()->idx_buffer); + LOG(FATAL) << "Not implemented"; + return false; } void Write(const EllpackPage& page, dmlc::Stream* fo) override { auto* impl = page.Impl(); - fo->Write(impl->matrix.n_rows); - auto buffer = impl->idx_buffer; - CHECK(!buffer.empty()); - fo->Write(buffer); + fo->Write(impl->cuts_.cut_values_.ConstHostVector()); + fo->Write(impl->cuts_.cut_ptrs_.ConstHostVector()); + fo->Write(impl->cuts_.min_vals_.ConstHostVector()); + fo->Write(impl->n_rows); + fo->Write(impl->is_dense); + fo->Write(impl->row_stride); + CHECK(!impl->gidx_buffer.ConstHostVector().empty()); + fo->Write(impl->gidx_buffer.HostVector()); } }; diff --git a/src/data/ellpack_page_source.cc b/src/data/ellpack_page_source.cc index 4f5453630..2838dc400 100644 --- a/src/data/ellpack_page_source.cc +++ b/src/data/ellpack_page_source.cc @@ -2,45 +2,23 @@ * Copyright 2019 XGBoost contributors */ #ifndef XGBOOST_USE_CUDA +#include +#if DMLC_ENABLE_STD_THREAD -#include #include "ellpack_page_source.h" +#include namespace xgboost { namespace data { EllpackPageSource::EllpackPageSource(DMatrix* dmat, const std::string& cache_info, const BatchParam& param) noexcept(false) { - LOG(FATAL) << "Internal Error: " - "XGBoost is not compiled with CUDA but EllpackPageSource is required"; -} - -void EllpackPageSource::BeforeFirst() { - LOG(FATAL) << "Internal Error: " - "XGBoost is not compiled with CUDA but EllpackPageSource is required"; -} - -bool EllpackPageSource::Next() { - LOG(FATAL) << "Internal Error: " - "XGBoost is not compiled with CUDA but EllpackPageSource is required"; - return false; -} - -EllpackPage& EllpackPageSource::Value() { - LOG(FATAL) << "Internal Error: " - "XGBoost is not compiled with CUDA but EllpackPageSource is required"; - EllpackPage* page { nullptr }; - return *page; -} - -const EllpackPage& EllpackPageSource::Value() const { - LOG(FATAL) << "Internal Error: " - "XGBoost is not compiled with CUDA but EllpackPageSource is required"; - EllpackPage* page { nullptr }; - return *page; + LOG(FATAL) + << "Internal Error: " + "XGBoost is not compiled with CUDA but EllpackPageSource is required"; } } // namespace data } // namespace xgboost - +#endif // DMLC_ENABLE_STD_THREAD #endif // XGBOOST_USE_CUDA diff --git a/src/data/ellpack_page_source.cu 
b/src/data/ellpack_page_source.cu index f1befaf2a..663cfd4cf 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -3,73 +3,16 @@ */ #include #include -#include #include "../common/hist_util.h" +#include "ellpack_page.cuh" #include "ellpack_page_source.h" #include "sparse_page_source.h" -#include "ellpack_page.cuh" namespace xgboost { namespace data { -class EllpackPageSourceImpl : public DataSource { - public: - /*! - * \brief Create source from cache files the cache_prefix. - * \param cache_prefix The prefix of cache we want to solve. - */ - explicit EllpackPageSourceImpl(DMatrix* dmat, - const std::string& cache_info, - const BatchParam& param) noexcept(false); - - /*! \brief destructor */ - ~EllpackPageSourceImpl() override = default; - - void BeforeFirst() override; - bool Next() override; - EllpackPage& Value(); - const EllpackPage& Value() const override; - - private: - /*! \brief Write Ellpack pages after accumulating them in memory. */ - void WriteEllpackPages(DMatrix* dmat, const std::string& cache_info) const; - - /*! \brief The page type string for ELLPACK. */ - const std::string kPageType_{".ellpack.page"}; - - int device_{-1}; - size_t page_size_{DMatrix::kPageSize}; - common::Monitor monitor_; - dh::BulkAllocator ba_; - /*! \brief The EllpackInfo, with the underlying GPU memory shared by all pages. */ - EllpackInfo ellpack_info_; - std::unique_ptr> source_; - std::string cache_info_; -}; - -EllpackPageSource::EllpackPageSource(DMatrix* dmat, - const std::string& cache_info, - const BatchParam& param) noexcept(false) - : impl_{new EllpackPageSourceImpl(dmat, cache_info, param)} {} - -void EllpackPageSource::BeforeFirst() { - impl_->BeforeFirst(); -} - -bool EllpackPageSource::Next() { - return impl_->Next(); -} - -EllpackPage& EllpackPageSource::Value() { - return impl_->Value(); -} - -const EllpackPage& EllpackPageSource::Value() const { - return impl_->Value(); -} - size_t GetRowStride(DMatrix* dmat) { if (dmat->IsDense()) return dmat->Info().num_col_; @@ -86,17 +29,19 @@ size_t GetRowStride(DMatrix* dmat) { // Build the quantile sketch across the whole input data, then use the histogram cuts to compress // each CSR page, and write the accumulated ELLPACK pages to disk. 
-EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat, - const std::string& cache_info, - const BatchParam& param) noexcept(false) - : device_(param.gpu_id), cache_info_(cache_info) { - +EllpackPageSource::EllpackPageSource(DMatrix* dmat, + const std::string& cache_info, + const BatchParam& param) noexcept(false) { + cache_info_ = ParseCacheInfo(cache_info, kPageType_); + for (auto file : cache_info_.name_shards) { + CheckCacheFileExists(file); + } if (param.gpu_page_size > 0) { page_size_ = param.gpu_page_size; } monitor_.Init("ellpack_page_source"); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(param.gpu_id)); monitor_.StartCuda("Quantiles"); size_t row_stride = GetRowStride(dmat); @@ -104,75 +49,52 @@ EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat, param.gpu_batch_nrows); monitor_.StopCuda("Quantiles"); - monitor_.StartCuda("CreateEllpackInfo"); - ellpack_info_ = EllpackInfo(device_, dmat->IsDense(), row_stride, cuts, &ba_); - monitor_.StopCuda("CreateEllpackInfo"); - monitor_.StartCuda("WriteEllpackPages"); - WriteEllpackPages(dmat, cache_info); + WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride); monitor_.StopCuda("WriteEllpackPages"); - source_.reset(new ExternalMemoryPrefetcher( - ParseCacheInfo(cache_info_, kPageType_))); -} - -void EllpackPageSourceImpl::BeforeFirst() { - source_.reset(new ExternalMemoryPrefetcher( - ParseCacheInfo(cache_info_, kPageType_))); - source_->BeforeFirst(); -} - -bool EllpackPageSourceImpl::Next() { - return source_->Next(); -} - -EllpackPage& EllpackPageSourceImpl::Value() { - EllpackPage& page = source_->Value(); - page.Impl()->InitDevice(device_, ellpack_info_); - return page; -} - -const EllpackPage& EllpackPageSourceImpl::Value() const { - EllpackPage& page = source_->Value(); - page.Impl()->InitDevice(device_, ellpack_info_); - return page; + external_prefetcher_.reset( + new ExternalMemoryPrefetcher(cache_info_)); } // Compress each CSR page to ELLPACK, and write the accumulated pages to disk. 
-void EllpackPageSourceImpl::WriteEllpackPages(DMatrix* dmat, const std::string& cache_info) const { +void EllpackPageSource::WriteEllpackPages(int device, DMatrix* dmat, + const common::HistogramCuts& cuts, + const std::string& cache_info, + size_t row_stride) const { auto cinfo = ParseCacheInfo(cache_info, kPageType_); const size_t extra_buffer_capacity = 6; - SparsePageWriter writer( - cinfo.name_shards, cinfo.format_shards, extra_buffer_capacity); + SparsePageWriter writer(cinfo.name_shards, cinfo.format_shards, + extra_buffer_capacity); std::shared_ptr page; + SparsePage temp_host_page; writer.Alloc(&page); auto* impl = page->Impl(); - impl->matrix.info = ellpack_info_; - impl->Clear(); - const MetaInfo& info = dmat->Info(); size_t bytes_write = 0; double tstart = dmlc::GetTime(); for (const auto& batch : dmat->GetBatches()) { - impl->Push(device_, batch); + temp_host_page.Push(batch); - size_t mem_cost_bytes = impl->MemCostBytes(); + size_t mem_cost_bytes = + EllpackPageImpl::MemCostBytes(temp_host_page.Size(), row_stride, cuts); if (mem_cost_bytes >= page_size_) { bytes_write += mem_cost_bytes; - impl->CompressSparsePage(device_); + *impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(), + row_stride); writer.PushWrite(std::move(page)); writer.Alloc(&page); impl = page->Impl(); - impl->matrix.info = ellpack_info_; - impl->Clear(); + temp_host_page.Clear(); double tdiff = dmlc::GetTime() - tstart; LOG(INFO) << "Writing " << kPageType_ << " to " << cache_info << " in " << ((bytes_write >> 20UL) / tdiff) << " MB/s, " << (bytes_write >> 20UL) << " written"; } } - if (impl->Size() != 0) { - impl->CompressSparsePage(device_); + if (temp_host_page.Size() != 0) { + *impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(), + row_stride); writer.PushWrite(std::move(page)); } } diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 7f8e65358..a1ce587c2 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -10,19 +10,17 @@ #include #include "../common/timer.h" +#include "../common/hist_util.h" +#include "sparse_page_source.h" namespace xgboost { namespace data { -class EllpackPageSourceImpl; - /*! * \brief External memory data source for ELLPACK format. * - * This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid - * including CUDA-specific implementation details in the header. */ -class EllpackPageSource : public DataSource { +class EllpackPageSource { public: /*! * \brief Create source from cache files the cache_prefix. @@ -32,19 +30,33 @@ class EllpackPageSource : public DataSource { const std::string& cache_info, const BatchParam& param) noexcept(false); - /*! \brief destructor */ - ~EllpackPageSource() override = default; + BatchSet GetBatchSet() { + auto begin_iter = BatchIterator( + new SparseBatchIteratorImpl, + EllpackPage>(external_prefetcher_.get())); + return BatchSet(begin_iter); + } - void BeforeFirst() override; - bool Next() override; - EllpackPage& Value(); - const EllpackPage& Value() const override; - - const EllpackPageSourceImpl* Impl() const { return impl_.get(); } - EllpackPageSourceImpl* Impl() { return impl_.get(); } + ~EllpackPageSource() { + external_prefetcher_.reset(); + for (auto file : cache_info_.name_shards) { + TryDeleteCacheFile(file); + } + } private: - std::shared_ptr impl_; + void WriteEllpackPages(int device, DMatrix* dmat, + const common::HistogramCuts& cuts, + const std::string& cache_info, + size_t row_stride) const; + + /*! 
\brief The page type string for ELLPACK. */ + const std::string kPageType_{".ellpack.page"}; + + size_t page_size_{DMatrix::kPageSize}; + common::Monitor monitor_; + std::unique_ptr> external_prefetcher_; + CacheInfo cache_info_; }; } // namespace data diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc index 27abd8c31..4d903f87d 100644 --- a/src/data/sparse_page_dmatrix.cc +++ b/src/data/sparse_page_dmatrix.cc @@ -51,11 +51,7 @@ BatchSet SparsePageDMatrix::GetEllpackBatches(const BatchParam& par ellpack_source_.reset(new EllpackPageSource(this, cache_info_, param)); batch_param_ = param; } - ellpack_source_->BeforeFirst(); - ellpack_source_->Next(); - auto begin_iter = BatchIterator( - new SparseBatchIteratorImpl(ellpack_source_.get())); - return BatchSet(begin_iter); + return ellpack_source_->GetBatchSet(); } } // namespace data diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 5bbda52af..165e99186 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -97,9 +97,11 @@ struct SparsePageLoader { }; struct EllpackLoader { - EllpackMatrix const& matrix; - XGBOOST_DEVICE EllpackLoader(EllpackMatrix const& m, bool use_shared, bst_feature_t num_features, - bst_row_t num_rows, size_t entry_start) : matrix{m} {} + EllpackDeviceAccessor const& matrix; + XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool use_shared, + bst_feature_t num_features, bst_row_t num_rows, + size_t entry_start) + : matrix{m} {} __device__ __forceinline__ float GetFvalue(int ridx, int fidx) const { auto gidx = matrix.GetBinIndex(ridx, fidx); if (gidx == -1) { @@ -107,10 +109,10 @@ struct EllpackLoader { } // The gradient index needs to be shifted by one as min values are not included in the // cuts. 
- if (gidx == matrix.info.feature_segments[fidx]) { - return matrix.info.min_fvalue[fidx]; + if (gidx == matrix.feature_segments[fidx]) { + return matrix.min_fvalue[fidx]; } - return matrix.info.gidx_fvalue_map[gidx - 1]; + return matrix.gidx_fvalue_map[gidx - 1]; } }; @@ -217,7 +219,7 @@ class GPUPredictor : public xgboost::Predictor { this->tree_begin_, this->tree_end_, num_features, num_rows, entry_start, use_shared, this->num_group_); } - void PredictInternal(EllpackMatrix const& batch, HostDeviceVector* out_preds, + void PredictInternal(EllpackDeviceAccessor const& batch, HostDeviceVector* out_preds, size_t batch_offset) { const uint32_t BLOCK_THREADS = 256; size_t num_rows = batch.n_rows; @@ -226,11 +228,11 @@ class GPUPredictor : public xgboost::Predictor { bool use_shared = false; size_t entry_start = 0; dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS} ( - PredictKernel, + PredictKernel, batch, dh::ToSpan(nodes_), out_preds->DeviceSpan().subspan(batch_offset), dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_), - this->tree_begin_, this->tree_end_, batch.info.NumFeatures(), num_rows, + this->tree_begin_, this->tree_end_, batch.NumFeatures(), num_rows, entry_start, use_shared, this->num_group_); } @@ -269,8 +271,10 @@ class GPUPredictor : public xgboost::Predictor { if (dmat->PageExists()) { size_t batch_offset = 0; for (auto const& page : dmat->GetBatches()) { - this->PredictInternal(page.Impl()->matrix, out_preds, batch_offset); - batch_offset += page.Impl()->matrix.n_rows; + this->PredictInternal( + page.Impl()->GetDeviceAccessor(generic_param_->gpu_id), out_preds, + batch_offset); + batch_offset += page.Impl()->n_rows; } } else { size_t batch_offset = 0; diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index f294855b5..888e39931 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -153,7 +153,8 @@ ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl* page, size_t n_rows, const BatchParam& batch_param) : batch_param_(batch_param), - page_(new EllpackPageImpl(batch_param.gpu_id, page->matrix.info, n_rows)) {} + page_(new EllpackPageImpl(batch_param.gpu_id, page->cuts_, page->is_dense, + page->row_stride, n_rows)) {} GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span gpair, DMatrix* dmat) { @@ -217,9 +218,9 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Spanmatrix.info, - sample_rows)); + page_.reset(new EllpackPageImpl( + batch_param_.gpu_id, original_page_->cuts_, original_page_->is_dense, + original_page_->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); @@ -298,9 +299,9 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Spanmatrix.info, - sample_rows)); + page_.reset(new EllpackPageImpl(batch_param_.gpu_id, original_page_->cuts_, + original_page_->is_dense, + original_page_->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. 
   thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
@@ -319,7 +320,7 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page,
   monitor_.Init("gradient_based_sampler");
 
   bool is_sampling = subsample < 1.0;
-  bool is_external_memory = page->matrix.n_rows != n_rows;
+  bool is_external_memory = page->n_rows != n_rows;
 
   if (is_sampling) {
     switch (sampling_method) {
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index 88ebfec61..0035fb214 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -101,7 +101,7 @@
 template GradientPairPrecise CreateRoundingFactor(common::Span<GradientPair> gpair);
 
 template <typename GradientSumT>
-__global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
+__global__ void SharedMemHistKernel(EllpackDeviceAccessor matrix,
                                     common::Span d_ridx,
                                     GradientSumT* __restrict__ d_node_hist,
                                     const GradientPair* __restrict__ d_gpair,
@@ -112,14 +112,14 @@ __global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
   extern __shared__ char smem[];
   GradientSumT* smem_arr = reinterpret_cast<GradientSumT*>(smem);  // NOLINT
   if (use_shared_memory_histograms) {
-    dh::BlockFill(smem_arr, matrix.info.n_bins, GradientSumT());
+    dh::BlockFill(smem_arr, matrix.NumBins(), GradientSumT());
     __syncthreads();
   }
   for (auto idx : dh::GridStrideRange(static_cast<size_t>(0), n_elements)) {
-    int ridx = d_ridx[idx / matrix.info.row_stride];
+    int ridx = d_ridx[idx / matrix.row_stride];
     int gidx =
-        matrix.gidx_iter[ridx * matrix.info.row_stride + idx % matrix.info.row_stride];
-    if (gidx != matrix.info.n_bins) {
+        matrix.gidx_iter[ridx * matrix.row_stride + idx % matrix.row_stride];
+    if (gidx != matrix.NumBins()) {
       GradientSumT truncated {
         TruncateWithRoundingFactor(rounding.GetGrad(), d_gpair[ridx].GetGrad()),
         TruncateWithRoundingFactor(rounding.GetHess(), d_gpair[ridx].GetHess()),
@@ -135,7 +135,7 @@ __global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
   if (use_shared_memory_histograms) {
     // Write shared memory back to global memory
     __syncthreads();
-    for (auto i : dh::BlockStrideRange(static_cast<size_t>(0), matrix.info.n_bins)) {
+    for (auto i : dh::BlockStrideRange(static_cast<size_t>(0), matrix.NumBins())) {
       GradientSumT truncated {
         TruncateWithRoundingFactor(rounding.GetGrad(), smem_arr[i].GetGrad()),
         TruncateWithRoundingFactor(rounding.GetHess(), smem_arr[i].GetHess()),
@@ -146,16 +146,16 @@ __global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
 }
 
 template <typename GradientSumT>
-void BuildGradientHistogram(EllpackMatrix const& matrix,
+void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
                             common::Span gpair,
                             common::Span d_ridx,
                             common::Span histogram,
                             GradientSumT rounding, bool shared) {
   const size_t smem_size = shared
-    ? sizeof(GradientSumT) * matrix.info.n_bins
+    ? sizeof(GradientSumT) * matrix.NumBins()
     : 0;
-  auto n_elements = d_ridx.size() * matrix.info.row_stride;
+  auto n_elements = d_ridx.size() * matrix.row_stride;
 
   uint32_t items_per_thread = 8;
   uint32_t block_threads = 256;
@@ -168,14 +168,14 @@ void BuildGradientHistogram(EllpackMatrix const& matrix,
 }
 
 template void BuildGradientHistogram<GradientPair>(
-    EllpackMatrix const& matrix,
+    EllpackDeviceAccessor const& matrix,
     common::Span gpair,
     common::Span ridx,
     common::Span histogram,
     GradientPair rounding, bool shared);
 template void BuildGradientHistogram<GradientPairPrecise>(
-    EllpackMatrix const& matrix,
+    EllpackDeviceAccessor const& matrix,
     common::Span gpair,
     common::Span ridx,
     common::Span histogram,
diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh
index 3cacae352..a7c923b61 100644
--- a/src/tree/gpu_hist/histogram.cuh
+++ b/src/tree/gpu_hist/histogram.cuh
@@ -18,7 +18,7 @@ DEV_INLINE T TruncateWithRoundingFactor(T const rounding_factor, float const x)
 }
 
 template <typename GradientSumT>
-void BuildGradientHistogram(EllpackMatrix const& matrix,
+void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
                             common::Span gpair,
                             common::Span ridx,
                             common::Span histogram,
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 4f02343dd..117a190a3 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -180,15 +180,15 @@ template
 __device__ void EvaluateFeature(
     int fidx, common::Span node_histogram,
-    const xgboost::EllpackMatrix& matrix,
+    const EllpackDeviceAccessor& matrix,
     DeviceSplitCandidate* best_split, // shared memory storing best split
     const DeviceNodeStats& node, const GPUTrainingParam& param,
     TempStorageT* temp_storage, // temp memory for cub operations
     int constraint, // monotonic_constraints
     const ValueConstraint& value_constraint) {
   // Use pointer from cut to indicate begin and end of bins for each feature.
-  uint32_t gidx_begin = matrix.info.feature_segments[fidx]; // begining bin
-  uint32_t gidx_end = matrix.info.feature_segments[fidx + 1]; // end bin for i^th feature
+  uint32_t gidx_begin = matrix.feature_segments[fidx]; // begining bin
+  uint32_t gidx_end = matrix.feature_segments[fidx + 1]; // end bin for i^th feature
 
   // Sum histogram bins for current feature
   GradientSumT const feature_sum = ReduceFeature(
@@ -236,9 +236,9 @@ __device__ void EvaluateFeature(
     int split_gidx = (scan_begin + threadIdx.x) - 1;
     float fvalue;
     if (split_gidx < static_cast<int>(gidx_begin)) {
-      fvalue = matrix.info.min_fvalue[fidx];
+      fvalue = matrix.min_fvalue[fidx];
     } else {
-      fvalue = matrix.info.gidx_fvalue_map[split_gidx];
+      fvalue = matrix.gidx_fvalue_map[split_gidx];
     }
     GradientSumT left = missing_left ? bin + missing : bin;
     GradientSumT right = parent_sum - left;
@@ -254,7 +254,7 @@ __global__ void EvaluateSplitKernel(
     common::Span node_histogram, // histogram for gradients
     common::Span feature_set, // Selected features
     DeviceNodeStats node,
-    xgboost::EllpackMatrix matrix,
+    xgboost::EllpackDeviceAccessor matrix,
     GPUTrainingParam gpu_param,
     common::Span split_candidates, // resulting split
     ValueConstraint value_constraint,
@@ -601,7 +601,7 @@ struct GPUHistMakerDevice {
     uint32_t constexpr kBlockThreads = 256;
     dh::LaunchKernel {uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]} (
         EvaluateSplitKernel,
-        hist.GetNodeHistogram(nidx), d_feature_set, node, page->matrix,
+        hist.GetNodeHistogram(nidx), d_feature_set, node, page->GetDeviceAccessor(device_id),
         gpu_param, d_split_candidates, node_value_constraints[nidx],
         monotone_constraints);
@@ -625,9 +625,7 @@ struct GPUHistMakerDevice {
     hist.AllocateHistogram(nidx);
     auto d_node_hist = hist.GetNodeHistogram(nidx);
     auto d_ridx = row_partitioner->GetRows(nidx);
-    auto d_gpair = gpair.data();
-
-    BuildGradientHistogram(page->matrix, gpair, d_ridx, d_node_hist,
+    BuildGradientHistogram(page->GetDeviceAccessor(device_id), gpair, d_ridx, d_node_hist,
                            histogram_rounding, use_shared_memory_histograms);
   }
@@ -637,7 +635,7 @@ struct GPUHistMakerDevice {
     auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram);
     auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction);
 
-    dh::LaunchN(device_id, page->matrix.info.n_bins, [=] __device__(size_t idx) {
+    dh::LaunchN(device_id, page->cuts_.TotalBins(), [=] __device__(size_t idx) {
      d_node_hist_subtraction[idx] =
          d_node_hist_parent[idx] - d_node_hist_histogram[idx];
     });
@@ -652,7 +650,7 @@ struct GPUHistMakerDevice {
   }
 
   void UpdatePosition(int nidx, RegTree::Node split_node) {
-    auto d_matrix = page->matrix;
+    auto d_matrix = page->GetDeviceAccessor(device_id);
 
     row_partitioner->UpdatePosition(
         nidx, split_node.LeftChild(), split_node.RightChild(),
@@ -689,7 +687,7 @@ struct GPUHistMakerDevice {
       row_partitioner.reset(); // Release the device memory first before reallocating
       row_partitioner.reset(new RowPartitioner(device_id, p_fmat->Info().num_row_));
     }
-    if (page->matrix.n_rows == p_fmat->Info().num_row_) {
+    if (page->n_rows == p_fmat->Info().num_row_) {
       FinalisePositionInPage(page, d_nodes);
     } else {
       for (auto& batch : p_fmat->GetBatches<EllpackPage>(batch_param)) {
@@ -699,7 +697,7 @@ struct GPUHistMakerDevice {
   }
 
   void FinalisePositionInPage(EllpackPageImpl* page, const common::Span d_nodes) {
-    auto d_matrix = page->matrix;
+    auto d_matrix = page->GetDeviceAccessor(device_id);
     row_partitioner->FinalisePosition(
         [=] __device__(size_t row_id, int position) {
           if (!d_matrix.IsInRange(row_id)) {
@@ -765,7 +763,7 @@ struct GPUHistMakerDevice {
     reducer->AllReduceSum(
         reinterpret_cast(d_node_hist),
         reinterpret_cast(d_node_hist),
-        page->matrix.info.n_bins * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
+        page->cuts_.TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
     reducer->Synchronize();
 
     monitor.StopCuda("AllReduce");
@@ -954,14 +952,14 @@ inline void GPUHistMakerDevice<GradientSumT>::InitHistogram() {
   // check if we can use shared memory for building histograms
   // (assuming atleast we need 2 CTAs per SM to maintain decent latency
   // hiding)
-  auto histogram_size = sizeof(GradientSumT) * page->matrix.info.n_bins;
+  auto histogram_size = sizeof(GradientSumT) * page->cuts_.TotalBins();
   auto max_smem = dh::MaxSharedMemory(device_id);
   if (histogram_size <= max_smem) {
     use_shared_memory_histograms = true;
   }
 
   // Init histogram
-  hist.Init(device_id, page->matrix.info.n_bins);
+  hist.Init(device_id, page->cuts_.TotalBins());
 }
 
 template
diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu
index b1c6f09eb..1bf5b7244 100644
--- a/tests/cpp/data/test_ellpack_page.cu
+++ b/tests/cpp/data/test_ellpack_page.cu
@@ -19,23 +19,19 @@ TEST(EllpackPage, EmptyDMatrix) {
   auto dmat = *CreateDMatrix(kNRows, kNCols, kSparsity);
   auto& page = *dmat->GetBatches<EllpackPage>({0, kMaxBin, kGpuBatchNRows}).begin();
   auto impl = page.Impl();
-  ASSERT_EQ(impl->matrix.info.feature_segments.size(), 1);
-  ASSERT_EQ(impl->matrix.info.min_fvalue.size(), 0);
-  ASSERT_EQ(impl->matrix.info.gidx_fvalue_map.size(), 0);
-  ASSERT_EQ(impl->matrix.info.row_stride, 0);
-  ASSERT_EQ(impl->matrix.info.n_bins, 0);
-  ASSERT_EQ(impl->gidx_buffer.size(), 4);
+  ASSERT_EQ(impl->row_stride, 0);
+  ASSERT_EQ(impl->cuts_.TotalBins(), 0);
+  ASSERT_EQ(impl->gidx_buffer.Size(), 4);
 }
 
 TEST(EllpackPage, BuildGidxDense) {
   int constexpr kNRows = 16, kNCols = 8;
   auto page = BuildEllpackPage(kNRows, kNCols);
 
-  std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&h_gidx_buffer, page->gidx_buffer);
-  common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
+  std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
+  common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), page->NumSymbols());
 
-  ASSERT_EQ(page->matrix.info.row_stride, kNCols);
+  ASSERT_EQ(page->row_stride, kNCols);
 
   std::vector solution = {
     0, 3, 8, 9, 14, 17, 20, 21,
@@ -64,11 +60,10 @@ TEST(EllpackPage, BuildGidxSparse) {
   int constexpr kNRows = 16, kNCols = 8;
   auto page = BuildEllpackPage(kNRows, kNCols, 0.9f);
 
-  std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&h_gidx_buffer, page->gidx_buffer);
+  std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
   common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
 
-  ASSERT_LE(page->matrix.info.row_stride, 3);
+  ASSERT_LE(page->row_stride, 3);
 
   // row_stride = 3, 16 rows, 48 entries for ELLPack
   std::vector solution = {
@@ -76,16 +71,16 @@ TEST(EllpackPage, BuildGidxSparse) {
     24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24,
     24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24
   };
-  for (size_t i = 0; i < kNRows * page->matrix.info.row_stride; ++i) {
+  for (size_t i = 0; i < kNRows * page->row_stride; ++i) {
     ASSERT_EQ(solution[i], gidx[i]);
   }
 }
 
 struct ReadRowFunction {
-  EllpackMatrix matrix;
+  EllpackDeviceAccessor matrix;
   int row;
   bst_float* row_data_d;
-  ReadRowFunction(EllpackMatrix matrix, int row, bst_float* row_data_d)
+  ReadRowFunction(EllpackDeviceAccessor matrix, int row, bst_float* row_data_d)
     : matrix(std::move(matrix)), row(row), row_data_d(row_data_d) {}
 
   __device__ void operator()(size_t col) {
@@ -110,7 +105,8 @@ TEST(EllpackPage, Copy) {
   auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
 
   // Create an empty result page.
-  EllpackPageImpl result(0, page->matrix.info, kRows);
+  EllpackPageImpl result(0, page->cuts_, page->is_dense, page->row_stride,
+                         kRows);
 
   // Copy batch pages into the result page.
   size_t offset = 0;
@@ -126,13 +122,13 @@ TEST(EllpackPage, Copy) {
   std::vector row_result(kCols);
 
   for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
     auto impl = page.Impl();
-    EXPECT_EQ(impl->matrix.base_rowid, current_row);
+    EXPECT_EQ(impl->base_rowid, current_row);
 
     for (size_t i = 0; i < impl->Size(); i++) {
-      dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
+      dh::LaunchN(0, kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
       thrust::copy(row_d.begin(), row_d.end(), row.begin());
 
-      dh::LaunchN(0, kCols, ReadRowFunction(result.matrix, current_row, row_result_d.data().get()));
+      dh::LaunchN(0, kCols, ReadRowFunction(result.GetDeviceAccessor(0), current_row, row_result_d.data().get()));
       thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
 
       EXPECT_EQ(row, row_result);
@@ -155,7 +151,8 @@ TEST(EllpackPage, Compact) {
   auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
 
   // Create an empty result page.
-  EllpackPageImpl result(0, page->matrix.info, kCompactedRows);
+  EllpackPageImpl result(0, page->cuts_, page->is_dense, page->row_stride,
+                         kCompactedRows);
 
   // Compact batch pages into the result page.
   std::vector row_indexes_h {
@@ -174,7 +171,7 @@ TEST(EllpackPage, Compact) {
   std::vector row_result(kCols);
 
   for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
     auto impl = page.Impl();
-    EXPECT_EQ(impl->matrix.base_rowid, current_row);
+    EXPECT_EQ(impl->base_rowid, current_row);
 
     for (size_t i = 0; i < impl->Size(); i++) {
       size_t compacted_row = row_indexes_h[current_row];
@@ -183,11 +180,12 @@ TEST(EllpackPage, Compact) {
         continue;
       }
 
-      dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
+      dh::LaunchN(0, kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
+      dh::safe_cuda (cudaDeviceSynchronize());
       thrust::copy(row_d.begin(), row_d.end(), row.begin());
 
       dh::LaunchN(0, kCols,
-                  ReadRowFunction(result.matrix, compacted_row, row_result_d.data().get()));
+                  ReadRowFunction(result.GetDeviceAccessor(0), compacted_row, row_result_d.data().get()));
       thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
 
       EXPECT_EQ(row, row_result);
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu
index 59b2df31a..f487ef8e9 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cu
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cu
@@ -3,6 +3,7 @@
 #include
 #include "../helpers.h"
 #include "../../../src/common/compressed_iterator.h"
+#include "../../../src/data/ellpack_page.cuh"
 
 namespace xgboost {
 
@@ -58,31 +59,29 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
   BatchParam param{0, 2, 0, 0};
   auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
-  EXPECT_EQ(impl->matrix.base_rowid, 0);
-  EXPECT_EQ(impl->matrix.n_rows, kRows);
-  EXPECT_FALSE(impl->matrix.info.is_dense);
-  EXPECT_EQ(impl->matrix.info.row_stride, 2);
-  EXPECT_EQ(impl->matrix.info.n_bins, 4);
+  EXPECT_EQ(impl->base_rowid, 0);
+  EXPECT_EQ(impl->n_rows, kRows);
+  EXPECT_FALSE(impl->is_dense);
+  EXPECT_EQ(impl->row_stride, 2);
+  EXPECT_EQ(impl->cuts_.TotalBins(), 4);
 
   auto impl_ext = (*dmat_ext->GetBatches<EllpackPage>(param).begin()).Impl();
-  EXPECT_EQ(impl_ext->matrix.base_rowid, 0);
-  EXPECT_EQ(impl_ext->matrix.n_rows, kRows);
-  EXPECT_FALSE(impl_ext->matrix.info.is_dense);
-  EXPECT_EQ(impl_ext->matrix.info.row_stride, 2);
-  EXPECT_EQ(impl_ext->matrix.info.n_bins, 4);
+  EXPECT_EQ(impl_ext->base_rowid, 0);
+  EXPECT_EQ(impl_ext->n_rows, kRows);
+  EXPECT_FALSE(impl_ext->is_dense);
+  EXPECT_EQ(impl_ext->row_stride, 2);
+  EXPECT_EQ(impl_ext->cuts_.TotalBins(), 4);
 
-  std::vector<common::CompressedByteT> buffer(impl->gidx_buffer.size());
-  std::vector<common::CompressedByteT> buffer_ext(impl_ext->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&buffer, impl->gidx_buffer);
-  dh::CopyDeviceSpanToVector(&buffer_ext, impl_ext->gidx_buffer);
+  std::vector<common::CompressedByteT> buffer(impl->gidx_buffer.HostVector());
+  std::vector<common::CompressedByteT> buffer_ext(impl_ext->gidx_buffer.HostVector());
   EXPECT_EQ(buffer, buffer_ext);
 }
 
 struct ReadRowFunction {
-  EllpackMatrix matrix;
+  EllpackDeviceAccessor matrix;
   int row;
   bst_float* row_data_d;
-  ReadRowFunction(EllpackMatrix matrix, int row, bst_float* row_data_d)
+  ReadRowFunction(EllpackDeviceAccessor matrix, int row, bst_float* row_data_d)
     : matrix(std::move(matrix)), row(row), row_data_d(row_data_d) {}
 
   __device__ void operator()(size_t col) {
@@ -110,8 +109,8 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
   BatchParam param{0, kMaxBins, 0, kPageSize};
   auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
-  EXPECT_EQ(impl->matrix.base_rowid, 0);
-  EXPECT_EQ(impl->matrix.n_rows, kRows);
+  EXPECT_EQ(impl->base_rowid, 0);
+  EXPECT_EQ(impl->n_rows, kRows);
 
   size_t current_row = 0;
   thrust::device_vector row_d(kCols);
@@ -120,13 +119,13 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
   std::vector row_ext(kCols);
 
   for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
     auto impl_ext = page.Impl();
-    EXPECT_EQ(impl_ext->matrix.base_rowid, current_row);
+    EXPECT_EQ(impl_ext->base_rowid, current_row);
 
     for (size_t i = 0; i < impl_ext->Size(); i++) {
-      dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
+      dh::LaunchN(0, kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
      thrust::copy(row_d.begin(), row_d.end(), row.begin());
 
-      dh::LaunchN(0, kCols, ReadRowFunction(impl_ext->matrix, current_row, row_ext_d.data().get()));
+      dh::LaunchN(0, kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(0), current_row, row_ext_d.data().get()));
       thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin());
 
       EXPECT_EQ(row, row_ext);
@@ -155,8 +154,8 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) {
   size_t current_row = 0;
   for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
     auto impl_ext = page.Impl();
-    EXPECT_EQ(impl_ext->matrix.base_rowid, current_row);
-    current_row += impl_ext->matrix.n_rows;
+    EXPECT_EQ(impl_ext->base_rowid, current_row);
+    current_row += impl_ext->n_rows;
   }
 }
 
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index 1ebebfc95..ac1bf6194 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -244,13 +244,13 @@ class HistogramCutsWrapper : public common::HistogramCuts {
  public:
   using SuperT = common::HistogramCuts;
   void SetValues(std::vector<float> cuts) {
-    SuperT::cut_values_ = std::move(cuts);
+    SuperT::cut_values_.HostVector() = std::move(cuts);
   }
   void SetPtrs(std::vector<uint32_t> ptrs) {
-    SuperT::cut_ptrs_ = std::move(ptrs);
+    SuperT::cut_ptrs_.HostVector() = std::move(ptrs);
   }
   void SetMins(std::vector<float> mins) {
-    SuperT::min_vals_ = std::move(mins);
+    SuperT::min_vals_.HostVector() = std::move(mins);
   }
 };
 } // anonymous namespace
@@ -279,10 +279,8 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
     row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
   }
 
-  auto page = std::unique_ptr<EllpackPageImpl>(new EllpackPageImpl(dmat->get(), {0, 256, 0}));
-  page->InitInfo(0, (*dmat)->IsDense(), row_stride, cmat);
-  page->InitCompressedData(0, n_rows);
-  page->CreateHistIndices(0, batch, RowStateOnDevice(batch.Size(), batch.Size()));
+  auto page = std::unique_ptr<EllpackPageImpl>(
+      new EllpackPageImpl(0, cmat, batch, (*dmat)->IsDense(), row_stride));
 
   delete dmat;
diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
index 68144bc6e..151bb2beb 100644
--- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
+++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
@@ -3,6 +3,7 @@
 #include "../../../../src/data/ellpack_page.cuh"
 #include "../../../../src/tree/gpu_hist/gradient_based_sampler.cuh"
 #include "../../helpers.h"
+#include "dmlc/filesystem.h"
 
 namespace xgboost {
 namespace tree {
@@ -29,7 +30,7 @@ void VerifySampling(size_t page_size,
   BatchParam param{0, 256, 0, page_size};
   auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
   if (page_size != 0) {
-    EXPECT_NE(page->matrix.n_rows, kRows);
+    EXPECT_NE(page->n_rows, kRows);
   }
 
   GradientBasedSampler sampler(page, kRows, param, subsample, sampling_method);
@@ -37,11 +38,11 @@ void VerifySampling(size_t page_size,
 
   if (fixed_size_sampling) {
     EXPECT_EQ(sample.sample_rows, kRows);
-    EXPECT_EQ(sample.page->matrix.n_rows, kRows);
+    EXPECT_EQ(sample.page->n_rows, kRows);
     EXPECT_EQ(sample.gpair.size(), kRows);
   } else {
-    EXPECT_NEAR(sample.sample_rows, sample_rows, kRows * 0.016f);
-    EXPECT_NEAR(sample.page->matrix.n_rows, sample_rows, kRows * 0.016f);
+    EXPECT_NEAR(sample.sample_rows, sample_rows, kRows * 0.016);
+    EXPECT_NEAR(sample.page->n_rows, sample_rows, kRows * 0.016f);
     EXPECT_NEAR(sample.gpair.size(), sample_rows, kRows * 0.016f);
   }
 
@@ -83,7 +84,7 @@ TEST(GradientBasedSampler, NoSampling_ExternalMemory) {
   BatchParam param{0, 256, 0, kPageSize};
   auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
-  EXPECT_NE(page->matrix.n_rows, kRows);
+  EXPECT_NE(page->n_rows, kRows);
 
   GradientBasedSampler sampler(page, kRows, param, kSubsample, TrainParam::kUniform);
   auto sample = sampler.Sample(gpair.DeviceSpan(), dmat.get());
@@ -91,21 +92,19 @@ TEST(GradientBasedSampler, NoSampling_ExternalMemory) {
   EXPECT_EQ(sample.sample_rows, kRows);
   EXPECT_EQ(sample.gpair.size(), gpair.Size());
   EXPECT_EQ(sample.gpair.data(), gpair.DevicePointer());
-  EXPECT_EQ(sampled_page->matrix.n_rows, kRows);
+  EXPECT_EQ(sampled_page->n_rows, kRows);
 
-  std::vector<common::CompressedByteT> buffer(sampled_page->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&buffer, sampled_page->gidx_buffer);
+  std::vector<common::CompressedByteT> buffer(sampled_page->gidx_buffer.HostVector());
   common::CompressedIterator
-      ci(buffer.data(), sampled_page->matrix.info.NumSymbols());
+      ci(buffer.data(), sampled_page->NumSymbols());
 
   size_t offset = 0;
   for (auto& batch : dmat->GetBatches<EllpackPage>(param)) {
     auto page = batch.Impl();
-    std::vector<common::CompressedByteT> page_buffer(page->gidx_buffer.size());
-    dh::CopyDeviceSpanToVector(&page_buffer, page->gidx_buffer);
+    std::vector<common::CompressedByteT> page_buffer(page->gidx_buffer.HostVector());
     common::CompressedIterator
-        page_ci(page_buffer.data(), page->matrix.info.NumSymbols());
-    size_t num_elements = page->matrix.n_rows * page->matrix.info.row_stride;
+        page_ci(page_buffer.data(), page->NumSymbols());
+    size_t num_elements = page->n_rows * page->row_stride;
     for (size_t i = 0; i < num_elements; i++) {
       EXPECT_EQ(ci[i + offset], page_ci[i]);
     }
diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu
index c57beb3c7..54531ecfd 100644
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -27,7 +27,7 @@ void TestDeterminsticHistogram() {
   gpair.SetDevice(0);
 
   auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
-  BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
+  BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
                          d_histogram, rounding, true);
 
   for (size_t i = 0; i < kRounds; ++i) {
@@ -35,7 +35,7 @@ void TestDeterminsticHistogram() {
     auto d_histogram = dh::ToSpan(new_histogram);
     auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
-    BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
+    BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
                            d_histogram, rounding, true);
 
     for (size_t j = 0; j < new_histogram.size(); ++j) {
@@ -50,7 +50,7 @@ void TestDeterminsticHistogram() {
     auto gpair = GenerateRandomGradients(kRows, kLower, kUpper);
     gpair.SetDevice(0);
     dh::device_vector baseline(kBins * kCols);
-    BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
+    BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
                            dh::ToSpan(baseline), rounding, true);
     for (size_t i = 0; i < baseline.size(); ++i) {
       EXPECT_NEAR(((Gradient)baseline[i]).GetGrad(), ((Gradient)histogram[i]).GetGrad(),
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 324241cab..b1990d69d 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -97,12 +97,8 @@ void TestBuildHist(bool use_shared_memory_histograms) {
   }
   gpair.SetDevice(0);
 
-  thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.size());
+  thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.HostVector());
 
-  common::CompressedByteT* d_gidx_buffer_ptr = page->gidx_buffer.data();
-  dh::safe_cuda(cudaMemcpy(h_gidx_buffer.data(), d_gidx_buffer_ptr,
-                           sizeof(common::CompressedByteT) * page->gidx_buffer.size(),
-                           cudaMemcpyDeviceToHost));
   maker.row_partitioner.reset(new RowPartitioner(0, kNRows));
 
   maker.hist.AllocateHistogram(0);
@@ -196,15 +192,10 @@ TEST(GpuHist, EvaluateSplits) {
   auto cmat = GetHostCutMatrix();
 
   // Copy cut matrix to device.
-  maker.ba.Allocate(0,
-                    &(page->matrix.info.feature_segments), cmat.Ptrs().size(),
-                    &(page->matrix.info.min_fvalue), cmat.MinValues().size(),
-                    &(page->matrix.info.gidx_fvalue_map), 24,
-                    &(maker.monotone_constraints), kNCols);
-  dh::CopyVectorToDeviceSpan(page->matrix.info.feature_segments, cmat.Ptrs());
-  dh::CopyVectorToDeviceSpan(page->matrix.info.gidx_fvalue_map, cmat.Values());
-  dh::CopyVectorToDeviceSpan(maker.monotone_constraints, param.monotone_constraints);
-  dh::CopyVectorToDeviceSpan(page->matrix.info.min_fvalue, cmat.MinValues());
+  page->cuts_ = cmat;
+  maker.ba.Allocate(0, &(maker.monotone_constraints), kNCols);
+  dh::CopyVectorToDeviceSpan(maker.monotone_constraints,
+                             param.monotone_constraints);
 
   // Initialize GPUHistMakerDevice::hist
   maker.hist.Init(0, (max_bins - 1) * kNCols);
@@ -274,15 +265,13 @@ void TestHistogramIndexImpl() {
   // Extract the device maker from the histogram makers and from that its compressed
   // histogram index
   const auto &maker = hist_maker.maker;
-  std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&h_gidx_buffer, maker->page->gidx_buffer);
+  std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.HostVector());
 
   const auto &maker_ext = hist_maker_ext.maker;
-  std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, maker_ext->page->gidx_buffer);
+  std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector());
 
-  ASSERT_EQ(maker->page->matrix.info.n_bins, maker_ext->page->matrix.info.n_bins);
-  ASSERT_EQ(maker->page->gidx_buffer.size(), maker_ext->page->gidx_buffer.size());
+  ASSERT_EQ(maker->page->cuts_.TotalBins(), maker_ext->page->cuts_.TotalBins());
+  ASSERT_EQ(maker->page->gidx_buffer.Size(), maker_ext->page->gidx_buffer.Size());
 }