Write ELLPACK pages to disk (#4879)

* add ellpack source
* add batch param
* extract function to parse cache info
* construct ellpack info separately
* push batch to ellpack page
* write ellpack page
* make sparse page source reusable
This commit is contained in:
Rong Ou
2019-10-22 20:44:32 -07:00
committed by Jiaming Yuan
parent 310fe60b35
commit 5b1715d97c
25 changed files with 935 additions and 408 deletions

View File

@@ -21,7 +21,10 @@
#endif // DMLC_ENABLE_STD_THREAD
namespace dmlc {
// Registry enablement for the sparse page format registrations. These macros
// must be invoked inside namespace dmlc.
// NOTE(review): the first line uses the non-templated SparsePageFormatReg while
// the following lines use one template instantiation per page type; presumably
// only one of the two forms is meant to be kept -- confirm against the header.
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg);
// One registry per page type: SparsePage, CSCPage, SortedCSCPage, EllpackPage.
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::CSCPage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SortedCSCPage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::EllpackPage>);
} // namespace dmlc
namespace xgboost {
@@ -329,31 +332,6 @@ DMatrix* DMatrix::Create(std::unique_ptr<DataSource<SparsePage>>&& source,
} // namespace xgboost
namespace xgboost {
/*!
 * \brief Build a page format object from its registered name.
 * \param name Key used to look the format up in the SparsePageFormatReg registry.
 * \return The object produced by the registry entry's factory body.
 *
 * A name with no registry entry is reported through LOG(FATAL).
 */
data::SparsePageFormat* data::SparsePageFormat::Create(const std::string& name) {
  using FormatRegistry = ::dmlc::Registry< ::xgboost::data::SparsePageFormatReg>;
  auto *entry = FormatRegistry::Get()->Find(name);
  if (entry == nullptr) {
    LOG(FATAL) << "Unknown format type " << name;
  }
  return (entry->body)();
}
/*!
 * \brief Extract a pair of format names from a cache prefix.
 * \param cache_prefix Cache prefix that may carry a ".fmt-" marker.
 * \return
 *   - ("raw", "raw") when the prefix contains no ".fmt-" marker;
 *   - (X, Y) when the text after the last ".fmt-" reads "X-Y", split at its
 *     last '-';
 *   - (X, X) when the text after the last ".fmt-" contains no '-'.
 */
std::pair<std::string, std::string>
data::SparsePageFormat::DecideFormat(const std::string& cache_prefix) {
  const std::string marker = ".fmt-";
  const size_t marker_pos = cache_prefix.rfind(marker);
  if (marker_pos == std::string::npos) {
    // No explicit format requested: both names fall back to "raw".
    const std::string fallback = "raw";
    return std::make_pair(fallback, fallback);
  }

  // Everything following the marker is the format specification.
  const std::string spec = cache_prefix.substr(marker_pos + marker.length());
  const size_t dash_pos = spec.rfind('-');
  if (dash_pos == std::string::npos) {
    // A single name is used for both elements of the pair.
    return std::make_pair(spec, spec);
  }
  return std::make_pair(spec.substr(0, dash_pos), spec.substr(dash_pos + 1));
}
SparsePage SparsePage::GetTranspose(int num_columns) const {
SparsePage transpose;
common::ParallelGroupBuilder<Entry> builder(&transpose.offset.HostVector(),
@@ -476,18 +454,6 @@ void SparsePage::PushCSC(const SparsePage &batch) {
self_offset = std::move(offset);
}
/*!
 * \brief Append one instance (row) to the end of this page.
 * \param inst Entries of the instance to append.
 *
 * A new offset equal to the previous last offset plus inst.size() is pushed,
 * and the host data vector grows by inst.size() entries that are filled with
 * a raw copy of the instance.
 */
void SparsePage::Push(const Inst &inst) {
  auto& entries = data.HostVector();
  auto& row_offsets = offset.HostVector();
  const auto num_new = inst.size();

  // Record where the appended row ends.
  row_offsets.push_back(row_offsets.back() + num_new);

  const size_t old_size = entries.size();
  entries.resize(old_size + num_new);
  if (num_new == 0) {
    // Nothing to copy for an empty instance; memcpy is skipped entirely.
    return;
  }
  std::memcpy(dmlc::BeginPtr(entries) + old_size, inst.data(),
              sizeof(Entry) * num_new);
}
namespace data {
// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format);