Merge pull request #1218 from tqchen/master
[DATA] fix async data writing
This commit is contained in:
commit
587999755f
@ -256,41 +256,44 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
|||||||
name_shards.push_back(prefix + ".col.page");
|
name_shards.push_back(prefix + ".col.page");
|
||||||
format_shards.push_back(SparsePage::Format::DecideFormat(prefix).second);
|
format_shards.push_back(SparsePage::Format::DecideFormat(prefix).second);
|
||||||
}
|
}
|
||||||
SparsePage::Writer writer(name_shards, format_shards, 6);
|
|
||||||
std::unique_ptr<SparsePage> page;
|
|
||||||
writer.Alloc(&page); page->Clear();
|
|
||||||
|
|
||||||
double tstart = dmlc::GetTime();
|
{
|
||||||
size_t bytes_write = 0;
|
SparsePage::Writer writer(name_shards, format_shards, 6);
|
||||||
// print every 4 sec.
|
std::unique_ptr<SparsePage> page;
|
||||||
const double kStep = 4.0;
|
writer.Alloc(&page); page->Clear();
|
||||||
size_t tick_expected = kStep;
|
|
||||||
|
|
||||||
while (make_next_col(page.get())) {
|
double tstart = dmlc::GetTime();
|
||||||
for (size_t i = 0; i < page->Size(); ++i) {
|
size_t bytes_write = 0;
|
||||||
col_size_[i] += page->offset[i + 1] - page->offset[i];
|
// print every 4 sec.
|
||||||
}
|
const double kStep = 4.0;
|
||||||
|
size_t tick_expected = kStep;
|
||||||
bytes_write += page->MemCostBytes();
|
|
||||||
writer.PushWrite(std::move(page));
|
while (make_next_col(page.get())) {
|
||||||
writer.Alloc(&page);
|
for (size_t i = 0; i < page->Size(); ++i) {
|
||||||
page->Clear();
|
col_size_[i] += page->offset[i + 1] - page->offset[i];
|
||||||
|
}
|
||||||
double tdiff = dmlc::GetTime() - tstart;
|
|
||||||
if (tdiff >= tick_expected) {
|
bytes_write += page->MemCostBytes();
|
||||||
LOG(CONSOLE) << "Writing col.page file to " << cache_info_
|
writer.PushWrite(std::move(page));
|
||||||
<< " in " << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
writer.Alloc(&page);
|
||||||
<< (bytes_write >> 20UL) << " MB writen";
|
page->Clear();
|
||||||
tick_expected += kStep;
|
|
||||||
|
double tdiff = dmlc::GetTime() - tstart;
|
||||||
|
if (tdiff >= tick_expected) {
|
||||||
|
LOG(CONSOLE) << "Writing col.page file to " << cache_info_
|
||||||
|
<< " in " << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||||
|
<< (bytes_write >> 20UL) << " MB writen";
|
||||||
|
tick_expected += kStep;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
// save meta data
|
||||||
|
std::string col_meta_name = cache_shards[0] + ".col.meta";
|
||||||
|
std::unique_ptr<dmlc::Stream> fo(
|
||||||
|
dmlc::Stream::Create(col_meta_name.c_str(), "w"));
|
||||||
|
fo->Write(buffered_rowset_);
|
||||||
|
fo->Write(col_size_);
|
||||||
|
fo.reset(nullptr);
|
||||||
}
|
}
|
||||||
// save meta data
|
|
||||||
std::string col_meta_name = cache_shards[0] + ".col.meta";
|
|
||||||
std::unique_ptr<dmlc::Stream> fo(
|
|
||||||
dmlc::Stream::Create(col_meta_name.c_str(), "w"));
|
|
||||||
fo->Write(buffered_rowset_);
|
|
||||||
fo->Write(col_size_);
|
|
||||||
fo.reset(nullptr);
|
|
||||||
// initialize column data
|
// initialize column data
|
||||||
CHECK(TryInitColData());
|
CHECK(TryInitColData());
|
||||||
}
|
}
|
||||||
|
|||||||
@ -110,58 +110,60 @@ void SparsePageSource::Create(dmlc::Parser<uint32_t>* src,
|
|||||||
name_shards.push_back(prefix + ".row.page");
|
name_shards.push_back(prefix + ".row.page");
|
||||||
format_shards.push_back(SparsePage::Format::DecideFormat(prefix).first);
|
format_shards.push_back(SparsePage::Format::DecideFormat(prefix).first);
|
||||||
}
|
}
|
||||||
SparsePage::Writer writer(name_shards, format_shards, 6);
|
{
|
||||||
std::unique_ptr<SparsePage> page;
|
SparsePage::Writer writer(name_shards, format_shards, 6);
|
||||||
writer.Alloc(&page); page->Clear();
|
std::unique_ptr<SparsePage> page;
|
||||||
|
writer.Alloc(&page); page->Clear();
|
||||||
|
|
||||||
MetaInfo info;
|
MetaInfo info;
|
||||||
size_t bytes_write = 0;
|
size_t bytes_write = 0;
|
||||||
double tstart = dmlc::GetTime();
|
double tstart = dmlc::GetTime();
|
||||||
// print every 4 sec.
|
// print every 4 sec.
|
||||||
const double kStep = 4.0;
|
const double kStep = 4.0;
|
||||||
size_t tick_expected = kStep;
|
size_t tick_expected = kStep;
|
||||||
|
|
||||||
while (src->Next()) {
|
while (src->Next()) {
|
||||||
const dmlc::RowBlock<uint32_t>& batch = src->Value();
|
const dmlc::RowBlock<uint32_t>& batch = src->Value();
|
||||||
if (batch.label != nullptr) {
|
if (batch.label != nullptr) {
|
||||||
info.labels.insert(info.labels.end(), batch.label, batch.label + batch.size);
|
info.labels.insert(info.labels.end(), batch.label, batch.label + batch.size);
|
||||||
}
|
}
|
||||||
if (batch.weight != nullptr) {
|
if (batch.weight != nullptr) {
|
||||||
info.weights.insert(info.weights.end(), batch.weight, batch.weight + batch.size);
|
info.weights.insert(info.weights.end(), batch.weight, batch.weight + batch.size);
|
||||||
}
|
}
|
||||||
info.num_row += batch.size;
|
info.num_row += batch.size;
|
||||||
info.num_nonzero += batch.offset[batch.size] - batch.offset[0];
|
info.num_nonzero += batch.offset[batch.size] - batch.offset[0];
|
||||||
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
|
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
|
||||||
uint32_t index = batch.index[i];
|
uint32_t index = batch.index[i];
|
||||||
info.num_col = std::max(info.num_col,
|
info.num_col = std::max(info.num_col,
|
||||||
static_cast<uint64_t>(index + 1));
|
static_cast<uint64_t>(index + 1));
|
||||||
}
|
}
|
||||||
page->Push(batch);
|
page->Push(batch);
|
||||||
if (page->MemCostBytes() >= kPageSize) {
|
if (page->MemCostBytes() >= kPageSize) {
|
||||||
bytes_write += page->MemCostBytes();
|
bytes_write += page->MemCostBytes();
|
||||||
writer.PushWrite(std::move(page));
|
writer.PushWrite(std::move(page));
|
||||||
writer.Alloc(&page);
|
writer.Alloc(&page);
|
||||||
page->Clear();
|
page->Clear();
|
||||||
|
|
||||||
double tdiff = dmlc::GetTime() - tstart;
|
double tdiff = dmlc::GetTime() - tstart;
|
||||||
if (tdiff >= tick_expected) {
|
if (tdiff >= tick_expected) {
|
||||||
LOG(CONSOLE) << "Writing row.page to " << cache_info << " in "
|
LOG(CONSOLE) << "Writing row.page to " << cache_info << " in "
|
||||||
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||||
<< (bytes_write >> 20UL) << " written";
|
<< (bytes_write >> 20UL) << " written";
|
||||||
tick_expected += kStep;
|
tick_expected += kStep;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if (page->data.size() != 0) {
|
if (page->data.size() != 0) {
|
||||||
writer.PushWrite(std::move(page));
|
writer.PushWrite(std::move(page));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unique_ptr<dmlc::Stream> fo(
|
std::unique_ptr<dmlc::Stream> fo(
|
||||||
dmlc::Stream::Create(name_info.c_str(), "w"));
|
dmlc::Stream::Create(name_info.c_str(), "w"));
|
||||||
int tmagic = kMagic;
|
int tmagic = kMagic;
|
||||||
fo->Write(&tmagic, sizeof(tmagic));
|
fo->Write(&tmagic, sizeof(tmagic));
|
||||||
info.SaveBinary(fo.get());
|
info.SaveBinary(fo.get());
|
||||||
|
}
|
||||||
LOG(CONSOLE) << "SparsePageSource: Finished writing to " << name_info;
|
LOG(CONSOLE) << "SparsePageSource: Finished writing to " << name_info;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -176,38 +178,40 @@ void SparsePageSource::Create(DMatrix* src,
|
|||||||
name_shards.push_back(prefix + ".row.page");
|
name_shards.push_back(prefix + ".row.page");
|
||||||
format_shards.push_back(SparsePage::Format::DecideFormat(prefix).first);
|
format_shards.push_back(SparsePage::Format::DecideFormat(prefix).first);
|
||||||
}
|
}
|
||||||
SparsePage::Writer writer(name_shards, format_shards, 6);
|
{
|
||||||
std::unique_ptr<SparsePage> page;
|
SparsePage::Writer writer(name_shards, format_shards, 6);
|
||||||
writer.Alloc(&page); page->Clear();
|
std::unique_ptr<SparsePage> page;
|
||||||
|
writer.Alloc(&page); page->Clear();
|
||||||
|
|
||||||
MetaInfo info;
|
MetaInfo info;
|
||||||
size_t bytes_write = 0;
|
size_t bytes_write = 0;
|
||||||
double tstart = dmlc::GetTime();
|
double tstart = dmlc::GetTime();
|
||||||
dmlc::DataIter<RowBatch>* iter = src->RowIterator();
|
dmlc::DataIter<RowBatch>* iter = src->RowIterator();
|
||||||
|
|
||||||
while (iter->Next()) {
|
while (iter->Next()) {
|
||||||
page->Push(iter->Value());
|
page->Push(iter->Value());
|
||||||
if (page->MemCostBytes() >= kPageSize) {
|
if (page->MemCostBytes() >= kPageSize) {
|
||||||
bytes_write += page->MemCostBytes();
|
bytes_write += page->MemCostBytes();
|
||||||
writer.PushWrite(std::move(page));
|
writer.PushWrite(std::move(page));
|
||||||
writer.Alloc(&page);
|
writer.Alloc(&page);
|
||||||
page->Clear();
|
page->Clear();
|
||||||
double tdiff = dmlc::GetTime() - tstart;
|
double tdiff = dmlc::GetTime() - tstart;
|
||||||
LOG(CONSOLE) << "Writing to " << cache_info << " in "
|
LOG(CONSOLE) << "Writing to " << cache_info << " in "
|
||||||
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||||
<< (bytes_write >> 20UL) << " written";
|
<< (bytes_write >> 20UL) << " written";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if (page->data.size() != 0) {
|
if (page->data.size() != 0) {
|
||||||
writer.PushWrite(std::move(page));
|
writer.PushWrite(std::move(page));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unique_ptr<dmlc::Stream> fo(
|
std::unique_ptr<dmlc::Stream> fo(
|
||||||
dmlc::Stream::Create(name_info.c_str(), "w"));
|
dmlc::Stream::Create(name_info.c_str(), "w"));
|
||||||
int tmagic = kMagic;
|
int tmagic = kMagic;
|
||||||
fo->Write(&tmagic, sizeof(tmagic));
|
fo->Write(&tmagic, sizeof(tmagic));
|
||||||
info.SaveBinary(fo.get());
|
info.SaveBinary(fo.get());
|
||||||
|
}
|
||||||
LOG(CONSOLE) << "SparsePageSource: Finished writing to " << name_info;
|
LOG(CONSOLE) << "SparsePageSource: Finished writing to " << name_info;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -34,6 +34,7 @@ SparsePage::Writer::Writer(
|
|||||||
fo->Write(format_shard);
|
fo->Write(format_shard);
|
||||||
std::unique_ptr<SparsePage> page;
|
std::unique_ptr<SparsePage> page;
|
||||||
while (wqueue->Pop(&page)) {
|
while (wqueue->Pop(&page)) {
|
||||||
|
if (page.get() == nullptr) break;
|
||||||
fmt->Write(*page, fo.get());
|
fmt->Write(*page, fo.get());
|
||||||
qrecycle_.Push(std::move(page));
|
qrecycle_.Push(std::move(page));
|
||||||
}
|
}
|
||||||
@ -45,7 +46,9 @@ SparsePage::Writer::Writer(
|
|||||||
|
|
||||||
SparsePage::Writer::~Writer() {
|
SparsePage::Writer::~Writer() {
|
||||||
for (auto& queue : qworkers_) {
|
for (auto& queue : qworkers_) {
|
||||||
queue.SignalForKill();
|
// use nullptr to signal termination.
|
||||||
|
std::unique_ptr<SparsePage> sig(nullptr);
|
||||||
|
queue.Push(std::move(sig));
|
||||||
}
|
}
|
||||||
for (auto& thread : workers_) {
|
for (auto& thread : workers_) {
|
||||||
thread->join();
|
thread->join();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user