/*! * Copyright (c) 2014 by Contributors * \file sparse_batch_page.h * content holder of sparse batch that can be saved to disk * the representation can be effectively * use in external memory computation * \author Tianqi Chen */ #ifndef XGBOOST_DATA_SPARSE_BATCH_PAGE_H_ #define XGBOOST_DATA_SPARSE_BATCH_PAGE_H_ #include #include #include #include #include #include #include #include #if DMLC_ENABLE_STD_THREAD #include #include #endif namespace xgboost { namespace data { /*! * \brief in-memory storage unit of sparse batch */ class SparsePage { public: /*! \brief Format of the sparse page. */ class Format; /*! \brief Writer to write the sparse page to files. */ class Writer; /*! \brief minimum index of all index, used as hint for compression. */ bst_uint min_index; /*! \brief offset of the segments */ std::vector offset; /*! \brief the data of the segments */ std::vector data; /*! \brief constructor */ SparsePage() { this->Clear(); } /*! \return number of instance in the page */ inline size_t Size() const { return offset.size() - 1; } /*! \return estimation of memory cost of this page */ inline size_t MemCostBytes(void) const { return offset.size() * sizeof(size_t) + data.size() * sizeof(SparseBatch::Entry); } /*! \brief clear the page */ inline void Clear(void) { min_index = 0; offset.clear(); offset.push_back(0); data.clear(); } /*! * \brief Push row batch into the page * \param batch the row batch */ inline void Push(const RowBatch &batch) { data.resize(offset.back() + batch.ind_ptr[batch.size]); std::memcpy(dmlc::BeginPtr(data) + offset.back(), batch.data_ptr + batch.ind_ptr[0], sizeof(SparseBatch::Entry) * batch.ind_ptr[batch.size]); size_t top = offset.back(); size_t begin = offset.size(); offset.resize(offset.size() + batch.size); for (size_t i = 0; i < batch.size; ++i) { offset[i + begin] = top + batch.ind_ptr[i + 1] - batch.ind_ptr[0]; } } /*! * \brief Push row block into the page. * \param batch the row batch. */ inline void Push(const dmlc::RowBlock& batch) { data.reserve(data.size() + batch.offset[batch.size] - batch.offset[0]); offset.reserve(offset.size() + batch.size); CHECK(batch.index != nullptr); for (size_t i = 0; i < batch.size; ++i) { offset.push_back(offset.back() + batch.offset[i + 1] - batch.offset[i]); } for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) { uint32_t index = batch.index[i]; bst_float fvalue = batch.value == nullptr ? 1.0f : batch.value[i]; data.push_back(SparseBatch::Entry(index, fvalue)); } CHECK_EQ(offset.back(), data.size()); } /*! * \brief Push a sparse page * \param batch the row page */ inline void Push(const SparsePage &batch) { size_t top = offset.back(); data.resize(top + batch.data.size()); std::memcpy(dmlc::BeginPtr(data) + top, dmlc::BeginPtr(batch.data), sizeof(SparseBatch::Entry) * batch.data.size()); size_t begin = offset.size(); offset.resize(begin + batch.Size()); for (size_t i = 0; i < batch.Size(); ++i) { offset[i + begin] = top + batch.offset[i + 1]; } } /*! * \brief Push one instance into page * \param row an instance row */ inline void Push(const SparseBatch::Inst &inst) { offset.push_back(offset.back() + inst.length); size_t begin = data.size(); data.resize(begin + inst.length); if (inst.length != 0) { std::memcpy(dmlc::BeginPtr(data) + begin, inst.data, sizeof(SparseBatch::Entry) * inst.length); } } /*! * \param base_rowid base_rowid of the data * \return row batch representation of the page */ inline RowBatch GetRowBatch(size_t base_rowid) const { RowBatch out; out.base_rowid = base_rowid; out.ind_ptr = dmlc::BeginPtr(offset); out.data_ptr = dmlc::BeginPtr(data); out.size = offset.size() - 1; return out; } }; /*! * \brief Format specification of SparsePage. */ class SparsePage::Format { public: /*! \brief virtual destructor */ virtual ~Format() {} /*! * \brief Load all the segments into page, advance fi to end of the block. * \param page The data to read page into. * \param fi the input stream of the file * \return true of the loading as successful, false if end of file was reached */ virtual bool Read(SparsePage* page, dmlc::SeekStream* fi) = 0; /*! * \brief read only the segments we are interested in, advance fi to end of the block. * \param page The page to load the data into. * \param fi the input stream of the file * \param sorted_index_set sorted index of segments we are interested in * \return true of the loading as successful, false if end of file was reached */ virtual bool Read(SparsePage* page, dmlc::SeekStream* fi, const std::vector& sorted_index_set) = 0; /*! * \brief save the data to fo, when a page was written. * \param fo output stream */ virtual void Write(const SparsePage& page, dmlc::Stream* fo) = 0; /*! * \brief Create sparse page of format. * \return The created format functors. */ static Format* Create(const std::string& name); /*! * \brief decide the format from cache prefix. * \return pair of row format, column format type of the cache prefix. */ static std::pair DecideFormat(const std::string& cache_prefix); }; #if DMLC_ENABLE_STD_THREAD /*! * \brief A threaded writer to write sparse batch page to sharded files. */ class SparsePage::Writer { public: /*! * \brief constructor * \param name_shards name of shard files. * \param format_shards format of each shard. * \param extra_buffer_capacity Extra buffer capacity before block. */ explicit Writer( const std::vector& name_shards, const std::vector& format_shards, size_t extra_buffer_capacity); /*! \brief destructor, will close the files automatically */ ~Writer(); /*! * \brief Push a write job to the writer. * This function won't block, * writing is done by another thread inside writer. * \param page The page to be wriiten */ void PushWrite(std::unique_ptr&& page); /*! * \brief Allocate a page to store results. * This function can block when the writer is too slow and buffer pages * have not yet been recycled. * \param out_page Used to store the allocated pages. */ void Alloc(std::unique_ptr* out_page); private: /*! \brief number of allocated pages */ size_t num_free_buffer_; /*! \brief clock_pointer */ size_t clock_ptr_; /*! \brief writer threads */ std::vector > workers_; /*! \brief recycler queue */ dmlc::ConcurrentBlockingQueue > qrecycle_; /*! \brief worker threads */ std::vector > > qworkers_; }; #endif // DMLC_ENABLE_STD_THREAD /*! * \brief Registry entry for sparse page format. */ struct SparsePageFormatReg : public dmlc::FunctionRegEntryBase > { }; /*! * \brief Macro to register sparse page format. * * \code * // example of registering a objective * XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw) * .describe("Raw binary data format.") * .set_body([]() { * return new RawFormat(); * }); * \endcode */ #define XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(Name) \ DMLC_REGISTRY_REGISTER(::xgboost::data::SparsePageFormatReg, SparsePageFormat, Name) } // namespace data } // namespace xgboost #endif // XGBOOST_DATA_SPARSE_BATCH_PAGE_H_