/*! * Copyright 2015 by Contributors * \file sparse_page_dmatrix.h * \brief External-memory version of DMatrix. * \author Tianqi Chen */ #ifndef XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_ #define XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_ #include #include #include #include #include #include #include #include "../common/common.h" #include "./sparse_page_writer.h" namespace xgboost { namespace data { class SparsePageDMatrix : public DMatrix { public: explicit SparsePageDMatrix(std::unique_ptr&& source, std::string cache_info) : source_(std::move(source)), cache_info_(std::move(cache_info)) { } MetaInfo& Info() override { return source_->info; } const MetaInfo& Info() const override { return source_->info; } dmlc::DataIter* RowIterator() override { auto iter = source_.get(); iter->BeforeFirst(); return iter; } bool HaveColAccess(bool sorted) const override { return col_iter_ != nullptr && col_iter_->sorted == sorted; } const RowSet& BufferedRowset() const override { return buffered_rowset_; } size_t GetColSize(size_t cidx) const override { return col_size_[cidx]; } float GetColDensity(size_t cidx) const override { size_t nmiss = buffered_rowset_.Size() - col_size_[cidx]; return 1.0f - (static_cast(nmiss)) / buffered_rowset_.Size(); } bool SingleColBlock() const override { return false; } dmlc::DataIter* ColIterator() override; void InitColAccess( size_t max_row_perbatch, bool sorted) override; /*! \brief page size 256 MB */ static const size_t kPageSize = 256UL << 20UL; /*! \brief Maximum number of rows per batch. */ static const size_t kMaxRowPerBatch = 64UL << 10UL; private: // declare the column batch iter. class ColPageIter : public dmlc::DataIter { public: explicit ColPageIter(std::vector >&& files); ~ColPageIter() override; void BeforeFirst() override; const SparsePage &Value() const override { return *page_; } bool Next() override; // initialize the column iterator with the specified index set. void Init(const std::vector& index_set); // If the column features are sorted bool sorted; private: // the temp page. SparsePage* page_; // internal clock ptr. size_t clock_ptr_; // data file pointer. std::vector > files_; // page format. std::vector > formats_; /*! \brief internal prefetcher. */ std::vector > > prefetchers_; // The index set to be loaded. std::vector index_set_; // The index set by the outsiders std::vector set_index_set_; // whether to load data dataset. bool set_load_all_, load_all_; }; /*! * \brief Try to initialize column data. * \return true if data already exists, false if they do not. */ bool TryInitColData(bool sorted); // source data pointer. std::unique_ptr source_; // the cache prefix std::string cache_info_; /*! \brief list of row index that are buffered */ RowSet buffered_rowset_; // count for column data std::vector col_size_; // internal column iter. std::unique_ptr col_iter_; }; } // namespace data } // namespace xgboost #endif // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_