Fix external memory for get column batches. (#4622)
* Fix external memory for get column batches.

  This fixes two bugs:
  * Use PushCSC for get column batches.
  * Don't remove the created temporary directory before finishing the test.

* Check all pages.
This commit is contained in:
@@ -412,6 +412,18 @@ void SparsePage::PushCSC(const SparsePage &batch) {
|
||||
self_offset = std::move(offset);
|
||||
}
|
||||
|
||||
void SparsePage::Push(const Inst &inst) {
|
||||
auto& data_vec = data.HostVector();
|
||||
auto& offset_vec = offset.HostVector();
|
||||
offset_vec.push_back(offset_vec.back() + inst.size());
|
||||
size_t begin = data_vec.size();
|
||||
data_vec.resize(begin + inst.size());
|
||||
if (inst.size() != 0) {
|
||||
std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data(),
|
||||
sizeof(Entry) * inst.size());
|
||||
}
|
||||
}
|
||||
|
||||
namespace data {
|
||||
// List of files that will be force linked in static links.
|
||||
DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format);
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
// Used for single batch data.
|
||||
class SimpleDMatrix : public DMatrix {
|
||||
public:
|
||||
explicit SimpleDMatrix(std::unique_ptr<DataSource>&& source)
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
// Used for external memory.
|
||||
class SparsePageDMatrix : public DMatrix {
|
||||
public:
|
||||
explicit SparsePageDMatrix(std::unique_ptr<DataSource>&& source,
|
||||
|
||||
@@ -221,8 +221,8 @@ void SparsePageSource::CreateRowPage(dmlc::Parser<uint32_t>* src,
|
||||
CHECK(info.qids_.empty() || info.qids_.size() == info.num_row_);
|
||||
info.SaveBinary(fo.get());
|
||||
}
|
||||
LOG(CONSOLE) << "SparsePageSource::CreateRowPage Finished writing to "
|
||||
<< name_info;
|
||||
LOG(INFO) << "SparsePageSource::CreateRowPage Finished writing to "
|
||||
<< name_info;
|
||||
}
|
||||
|
||||
void SparsePageSource::CreatePageFromDMatrix(DMatrix* src,
|
||||
@@ -251,7 +251,7 @@ void SparsePageSource::CreatePageFromDMatrix(DMatrix* src,
|
||||
if (page_type == ".row.page") {
|
||||
page->Push(batch);
|
||||
} else if (page_type == ".col.page") {
|
||||
page->Push(batch.GetTranspose(src->Info().num_col_));
|
||||
page->PushCSC(batch.GetTranspose(src->Info().num_col_));
|
||||
} else if (page_type == ".sorted.col.page") {
|
||||
SparsePage tmp = batch.GetTranspose(src->Info().num_col_);
|
||||
page->PushCSC(tmp);
|
||||
@@ -266,9 +266,9 @@ void SparsePageSource::CreatePageFromDMatrix(DMatrix* src,
|
||||
writer.Alloc(&page);
|
||||
page->Clear();
|
||||
double tdiff = dmlc::GetTime() - tstart;
|
||||
LOG(CONSOLE) << "Writing to " << cache_info << " in "
|
||||
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||
<< (bytes_write >> 20UL) << " written";
|
||||
LOG(INFO) << "Writing to " << cache_info << " in "
|
||||
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||
<< (bytes_write >> 20UL) << " written";
|
||||
}
|
||||
}
|
||||
if (page->data.Size() != 0) {
|
||||
@@ -281,7 +281,7 @@ void SparsePageSource::CreatePageFromDMatrix(DMatrix* src,
|
||||
fo->Write(&tmagic, sizeof(tmagic));
|
||||
info.SaveBinary(fo.get());
|
||||
}
|
||||
LOG(CONSOLE) << "SparsePageSource: Finished writing to " << name_info;
|
||||
LOG(INFO) << "SparsePageSource: Finished writing to " << name_info;
|
||||
}
|
||||
|
||||
void SparsePageSource::CreateRowPage(DMatrix* src,
|
||||
|
||||
@@ -39,7 +39,7 @@ SparsePageWriter::SparsePageWriter(
|
||||
qrecycle_.Push(std::move(page));
|
||||
}
|
||||
fo.reset(nullptr);
|
||||
LOG(CONSOLE) << "SparsePage::Writer Finished writing to " << name_shard;
|
||||
LOG(INFO) << "SparsePage::Writer Finished writing to " << name_shard;
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user