Merge pull request #1218 from tqchen/master

[DATA] fix async data writing
This commit is contained in:
Tianqi Chen 2016-05-21 19:40:41 -07:00
commit 587999755f
3 changed files with 114 additions and 104 deletions

View File

@ -256,41 +256,44 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
name_shards.push_back(prefix + ".col.page"); name_shards.push_back(prefix + ".col.page");
format_shards.push_back(SparsePage::Format::DecideFormat(prefix).second); format_shards.push_back(SparsePage::Format::DecideFormat(prefix).second);
} }
SparsePage::Writer writer(name_shards, format_shards, 6);
std::unique_ptr<SparsePage> page;
writer.Alloc(&page); page->Clear();
double tstart = dmlc::GetTime(); {
size_t bytes_write = 0; SparsePage::Writer writer(name_shards, format_shards, 6);
// print every 4 sec. std::unique_ptr<SparsePage> page;
const double kStep = 4.0; writer.Alloc(&page); page->Clear();
size_t tick_expected = kStep;
while (make_next_col(page.get())) { double tstart = dmlc::GetTime();
for (size_t i = 0; i < page->Size(); ++i) { size_t bytes_write = 0;
col_size_[i] += page->offset[i + 1] - page->offset[i]; // print every 4 sec.
} const double kStep = 4.0;
size_t tick_expected = kStep;
bytes_write += page->MemCostBytes();
writer.PushWrite(std::move(page)); while (make_next_col(page.get())) {
writer.Alloc(&page); for (size_t i = 0; i < page->Size(); ++i) {
page->Clear(); col_size_[i] += page->offset[i + 1] - page->offset[i];
}
double tdiff = dmlc::GetTime() - tstart;
if (tdiff >= tick_expected) { bytes_write += page->MemCostBytes();
LOG(CONSOLE) << "Writing col.page file to " << cache_info_ writer.PushWrite(std::move(page));
<< " in " << ((bytes_write >> 20UL) / tdiff) << " MB/s, " writer.Alloc(&page);
<< (bytes_write >> 20UL) << " MB writen"; page->Clear();
tick_expected += kStep;
double tdiff = dmlc::GetTime() - tstart;
if (tdiff >= tick_expected) {
LOG(CONSOLE) << "Writing col.page file to " << cache_info_
<< " in " << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
<< (bytes_write >> 20UL) << " MB writen";
tick_expected += kStep;
}
} }
// save meta data
std::string col_meta_name = cache_shards[0] + ".col.meta";
std::unique_ptr<dmlc::Stream> fo(
dmlc::Stream::Create(col_meta_name.c_str(), "w"));
fo->Write(buffered_rowset_);
fo->Write(col_size_);
fo.reset(nullptr);
} }
// save meta data
std::string col_meta_name = cache_shards[0] + ".col.meta";
std::unique_ptr<dmlc::Stream> fo(
dmlc::Stream::Create(col_meta_name.c_str(), "w"));
fo->Write(buffered_rowset_);
fo->Write(col_size_);
fo.reset(nullptr);
// initialize column data // initialize column data
CHECK(TryInitColData()); CHECK(TryInitColData());
} }

View File

@ -110,58 +110,60 @@ void SparsePageSource::Create(dmlc::Parser<uint32_t>* src,
name_shards.push_back(prefix + ".row.page"); name_shards.push_back(prefix + ".row.page");
format_shards.push_back(SparsePage::Format::DecideFormat(prefix).first); format_shards.push_back(SparsePage::Format::DecideFormat(prefix).first);
} }
SparsePage::Writer writer(name_shards, format_shards, 6); {
std::unique_ptr<SparsePage> page; SparsePage::Writer writer(name_shards, format_shards, 6);
writer.Alloc(&page); page->Clear(); std::unique_ptr<SparsePage> page;
writer.Alloc(&page); page->Clear();
MetaInfo info; MetaInfo info;
size_t bytes_write = 0; size_t bytes_write = 0;
double tstart = dmlc::GetTime(); double tstart = dmlc::GetTime();
// print every 4 sec. // print every 4 sec.
const double kStep = 4.0; const double kStep = 4.0;
size_t tick_expected = kStep; size_t tick_expected = kStep;
while (src->Next()) { while (src->Next()) {
const dmlc::RowBlock<uint32_t>& batch = src->Value(); const dmlc::RowBlock<uint32_t>& batch = src->Value();
if (batch.label != nullptr) { if (batch.label != nullptr) {
info.labels.insert(info.labels.end(), batch.label, batch.label + batch.size); info.labels.insert(info.labels.end(), batch.label, batch.label + batch.size);
} }
if (batch.weight != nullptr) { if (batch.weight != nullptr) {
info.weights.insert(info.weights.end(), batch.weight, batch.weight + batch.size); info.weights.insert(info.weights.end(), batch.weight, batch.weight + batch.size);
} }
info.num_row += batch.size; info.num_row += batch.size;
info.num_nonzero += batch.offset[batch.size] - batch.offset[0]; info.num_nonzero += batch.offset[batch.size] - batch.offset[0];
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) { for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
uint32_t index = batch.index[i]; uint32_t index = batch.index[i];
info.num_col = std::max(info.num_col, info.num_col = std::max(info.num_col,
static_cast<uint64_t>(index + 1)); static_cast<uint64_t>(index + 1));
} }
page->Push(batch); page->Push(batch);
if (page->MemCostBytes() >= kPageSize) { if (page->MemCostBytes() >= kPageSize) {
bytes_write += page->MemCostBytes(); bytes_write += page->MemCostBytes();
writer.PushWrite(std::move(page)); writer.PushWrite(std::move(page));
writer.Alloc(&page); writer.Alloc(&page);
page->Clear(); page->Clear();
double tdiff = dmlc::GetTime() - tstart; double tdiff = dmlc::GetTime() - tstart;
if (tdiff >= tick_expected) { if (tdiff >= tick_expected) {
LOG(CONSOLE) << "Writing row.page to " << cache_info << " in " LOG(CONSOLE) << "Writing row.page to " << cache_info << " in "
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, " << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
<< (bytes_write >> 20UL) << " written"; << (bytes_write >> 20UL) << " written";
tick_expected += kStep; tick_expected += kStep;
}
} }
} }
}
if (page->data.size() != 0) { if (page->data.size() != 0) {
writer.PushWrite(std::move(page)); writer.PushWrite(std::move(page));
} }
std::unique_ptr<dmlc::Stream> fo( std::unique_ptr<dmlc::Stream> fo(
dmlc::Stream::Create(name_info.c_str(), "w")); dmlc::Stream::Create(name_info.c_str(), "w"));
int tmagic = kMagic; int tmagic = kMagic;
fo->Write(&tmagic, sizeof(tmagic)); fo->Write(&tmagic, sizeof(tmagic));
info.SaveBinary(fo.get()); info.SaveBinary(fo.get());
}
LOG(CONSOLE) << "SparsePageSource: Finished writing to " << name_info; LOG(CONSOLE) << "SparsePageSource: Finished writing to " << name_info;
} }
@ -176,38 +178,40 @@ void SparsePageSource::Create(DMatrix* src,
name_shards.push_back(prefix + ".row.page"); name_shards.push_back(prefix + ".row.page");
format_shards.push_back(SparsePage::Format::DecideFormat(prefix).first); format_shards.push_back(SparsePage::Format::DecideFormat(prefix).first);
} }
SparsePage::Writer writer(name_shards, format_shards, 6); {
std::unique_ptr<SparsePage> page; SparsePage::Writer writer(name_shards, format_shards, 6);
writer.Alloc(&page); page->Clear(); std::unique_ptr<SparsePage> page;
writer.Alloc(&page); page->Clear();
MetaInfo info; MetaInfo info;
size_t bytes_write = 0; size_t bytes_write = 0;
double tstart = dmlc::GetTime(); double tstart = dmlc::GetTime();
dmlc::DataIter<RowBatch>* iter = src->RowIterator(); dmlc::DataIter<RowBatch>* iter = src->RowIterator();
while (iter->Next()) { while (iter->Next()) {
page->Push(iter->Value()); page->Push(iter->Value());
if (page->MemCostBytes() >= kPageSize) { if (page->MemCostBytes() >= kPageSize) {
bytes_write += page->MemCostBytes(); bytes_write += page->MemCostBytes();
writer.PushWrite(std::move(page)); writer.PushWrite(std::move(page));
writer.Alloc(&page); writer.Alloc(&page);
page->Clear(); page->Clear();
double tdiff = dmlc::GetTime() - tstart; double tdiff = dmlc::GetTime() - tstart;
LOG(CONSOLE) << "Writing to " << cache_info << " in " LOG(CONSOLE) << "Writing to " << cache_info << " in "
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, " << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
<< (bytes_write >> 20UL) << " written"; << (bytes_write >> 20UL) << " written";
}
} }
}
if (page->data.size() != 0) { if (page->data.size() != 0) {
writer.PushWrite(std::move(page)); writer.PushWrite(std::move(page));
} }
std::unique_ptr<dmlc::Stream> fo( std::unique_ptr<dmlc::Stream> fo(
dmlc::Stream::Create(name_info.c_str(), "w")); dmlc::Stream::Create(name_info.c_str(), "w"));
int tmagic = kMagic; int tmagic = kMagic;
fo->Write(&tmagic, sizeof(tmagic)); fo->Write(&tmagic, sizeof(tmagic));
info.SaveBinary(fo.get()); info.SaveBinary(fo.get());
}
LOG(CONSOLE) << "SparsePageSource: Finished writing to " << name_info; LOG(CONSOLE) << "SparsePageSource: Finished writing to " << name_info;
} }

View File

@ -34,6 +34,7 @@ SparsePage::Writer::Writer(
fo->Write(format_shard); fo->Write(format_shard);
std::unique_ptr<SparsePage> page; std::unique_ptr<SparsePage> page;
while (wqueue->Pop(&page)) { while (wqueue->Pop(&page)) {
if (page.get() == nullptr) break;
fmt->Write(*page, fo.get()); fmt->Write(*page, fo.get());
qrecycle_.Push(std::move(page)); qrecycle_.Push(std::move(page));
} }
@ -45,7 +46,9 @@ SparsePage::Writer::Writer(
SparsePage::Writer::~Writer() { SparsePage::Writer::~Writer() {
for (auto& queue : qworkers_) { for (auto& queue : qworkers_) {
queue.SignalForKill(); // use nullptr to signal termination.
std::unique_ptr<SparsePage> sig(nullptr);
queue.Push(std::move(sig));
} }
for (auto& thread : workers_) { for (auto& thread : workers_) {
thread->join(); thread->join();