[LZ4] enable 16 bit index
This commit is contained in:
@@ -247,12 +247,21 @@ SparsePage::Format* SparsePage::Format::Create(const std::string& name) {
|
||||
return (e->body)();
|
||||
}
|
||||
|
||||
std::string SparsePage::Format::DecideFormat(const std::string& cache_prefix) {
|
||||
std::pair<std::string, std::string>
|
||||
SparsePage::Format::DecideFormat(const std::string& cache_prefix) {
|
||||
size_t pos = cache_prefix.rfind(".fmt-");
|
||||
|
||||
if (pos != std::string::npos) {
|
||||
return cache_prefix.substr(pos + 5, cache_prefix.length());
|
||||
std::string fmt = cache_prefix.substr(pos + 5, cache_prefix.length());
|
||||
size_t cpos = fmt.rfind('-');
|
||||
if (cpos != std::string::npos) {
|
||||
return std::make_pair(fmt.substr(0, cpos), fmt.substr(cpos + 1, fmt.length()));
|
||||
} else {
|
||||
return std::make_pair(fmt, fmt);
|
||||
}
|
||||
} else {
|
||||
return "raw";
|
||||
std::string raw = "raw";
|
||||
return std::make_pair(raw, raw);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
@@ -25,7 +26,8 @@ class SparsePage {
|
||||
public:
|
||||
/*! \brief Format of the sparse page. */
|
||||
class Format;
|
||||
|
||||
/*! \brief minimum index among all indices in the page, used as a hint for compression. */
|
||||
bst_uint min_index;
|
||||
/*! \brief offset of the segments */
|
||||
std::vector<size_t> offset;
|
||||
/*! \brief the data of the segments */
|
||||
@@ -45,6 +47,7 @@ class SparsePage {
|
||||
}
|
||||
/*! \brief clear the page */
|
||||
inline void Clear(void) {
|
||||
min_index = 0;
|
||||
offset.clear();
|
||||
offset.push_back(0);
|
||||
data.clear();
|
||||
@@ -163,9 +166,9 @@ class SparsePage::Format {
|
||||
static Format* Create(const std::string& name);
|
||||
/*!
|
||||
* \brief decide the format from cache prefix.
|
||||
* \return format type of the cache prefix.
|
||||
* \return pair of (row page format, column page format) names decided from the cache prefix.
|
||||
*/
|
||||
static std::string DecideFormat(const std::string& cache_prefix);
|
||||
static std::pair<std::string, std::string> DecideFormat(const std::string& cache_prefix);
|
||||
};
|
||||
|
||||
/*!
|
||||
|
||||
@@ -136,6 +136,7 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
// make the sparse page.
|
||||
dmlc::ThreadedIter<SparsePage> cmaker;
|
||||
SparsePage tmp;
|
||||
size_t batch_ptr = 0, batch_top = 0;
|
||||
dmlc::DataIter<RowBatch>* iter = this->RowIterator();
|
||||
std::bernoulli_distribution coin_flip(pkeep);
|
||||
|
||||
@@ -151,13 +152,13 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
}
|
||||
SparsePage* pcol = *dptr;
|
||||
pcol->Clear();
|
||||
pcol->min_index = ridx[0];
|
||||
int nthread;
|
||||
#pragma omp parallel
|
||||
{
|
||||
nthread = omp_get_num_threads();
|
||||
nthread = std::max(nthread, std::max(omp_get_num_procs() / 2 - 1, 1));
|
||||
}
|
||||
pcol->Clear();
|
||||
common::ParallelGroupBuilder<SparseBatch::Entry>
|
||||
builder(&pcol->offset, &pcol->data);
|
||||
builder.InitBudget(info.num_col, nthread);
|
||||
@@ -199,21 +200,32 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
auto make_next_col = [&] (SparsePage** dptr) {
|
||||
tmp.Clear();
|
||||
size_t btop = buffered_rowset_.size();
|
||||
while (iter->Next()) {
|
||||
const RowBatch& batch = iter->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (pkeep == 1.0f || coin_flip(rnd)) {
|
||||
buffered_rowset_.push_back(ridx);
|
||||
tmp.Push(batch[i]);
|
||||
|
||||
while (true) {
|
||||
if (batch_ptr != batch_top) {
|
||||
const RowBatch& batch = iter->Value();
|
||||
CHECK_EQ(batch_top, batch.size);
|
||||
for (size_t i = batch_ptr; i < batch_top; ++i) {
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (pkeep == 1.0f || coin_flip(rnd)) {
|
||||
buffered_rowset_.push_back(ridx);
|
||||
tmp.Push(batch[i]);
|
||||
}
|
||||
|
||||
if (tmp.Size() >= max_row_perbatch ||
|
||||
tmp.MemCostBytes() >= kPageSize) {
|
||||
make_col_batch(tmp, dmlc::BeginPtr(buffered_rowset_) + btop, dptr);
|
||||
batch_ptr = i + 1;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
batch_ptr = batch_top;
|
||||
}
|
||||
if (tmp.MemCostBytes() >= kPageSize ||
|
||||
tmp.Size() >= max_row_perbatch) {
|
||||
make_col_batch(tmp, dmlc::BeginPtr(buffered_rowset_) + btop, dptr);
|
||||
return true;
|
||||
}
|
||||
if (!iter->Next()) break;
|
||||
batch_ptr = 0;
|
||||
batch_top = iter->Value().size;
|
||||
}
|
||||
|
||||
if (tmp.Size() != 0) {
|
||||
make_col_batch(tmp, dmlc::BeginPtr(buffered_rowset_) + btop, dptr);
|
||||
return true;
|
||||
@@ -227,12 +239,15 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
std::string col_data_name = cache_prefix_ + ".col.page";
|
||||
std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(col_data_name.c_str(), "w"));
|
||||
// find format.
|
||||
std::string name_format = SparsePage::Format::DecideFormat(cache_prefix_);
|
||||
std::string name_format = SparsePage::Format::DecideFormat(cache_prefix_).second;
|
||||
fo->Write(name_format);
|
||||
std::unique_ptr<SparsePage::Format> format(SparsePage::Format::Create(name_format));
|
||||
|
||||
double tstart = dmlc::GetTime();
|
||||
size_t bytes_write = 0;
|
||||
// print the write-progress message at most once every kStep (4) seconds.
|
||||
const double kStep = 4.0;
|
||||
size_t tick_expected = kStep;
|
||||
SparsePage* pcol = nullptr;
|
||||
|
||||
while (cmaker.Next(&pcol)) {
|
||||
@@ -243,9 +258,12 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
size_t spage = pcol->MemCostBytes();
|
||||
bytes_write += spage;
|
||||
double tdiff = dmlc::GetTime() - tstart;
|
||||
LOG(CONSOLE) << "Writing to " << col_data_name
|
||||
<< " in " << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||
<< (bytes_write >> 20UL) << " MB writen";
|
||||
if (tdiff >= tick_expected) {
|
||||
LOG(CONSOLE) << "Writing to " << col_data_name
|
||||
<< " in " << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||
<< (bytes_write >> 20UL) << " MB writen";
|
||||
tick_expected += kStep;
|
||||
}
|
||||
cmaker.Recycle(&pcol);
|
||||
}
|
||||
// save meta data
|
||||
|
||||
@@ -78,7 +78,7 @@ void SparsePageSource::Create(dmlc::Parser<uint32_t>* src,
|
||||
std::string name_info = cache_prefix;
|
||||
std::string name_row = cache_prefix + ".row.page";
|
||||
std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(name_row.c_str(), "w"));
|
||||
std::string name_format = SparsePage::Format::DecideFormat(cache_prefix);
|
||||
std::string name_format = SparsePage::Format::DecideFormat(cache_prefix).first;
|
||||
fo->Write(name_format);
|
||||
std::unique_ptr<SparsePage::Format> format(SparsePage::Format::Create(name_format));
|
||||
|
||||
@@ -86,6 +86,9 @@ void SparsePageSource::Create(dmlc::Parser<uint32_t>* src,
|
||||
SparsePage page;
|
||||
size_t bytes_write = 0;
|
||||
double tstart = dmlc::GetTime();
|
||||
// print the write-progress message at most once every kStep (4) seconds.
|
||||
const double kStep = 4.0;
|
||||
size_t tick_expected = kStep;
|
||||
|
||||
while (src->Next()) {
|
||||
const dmlc::RowBlock<uint32_t>& batch = src->Value();
|
||||
@@ -108,9 +111,12 @@ void SparsePageSource::Create(dmlc::Parser<uint32_t>* src,
|
||||
format->Write(page, fo.get());
|
||||
page.Clear();
|
||||
double tdiff = dmlc::GetTime() - tstart;
|
||||
LOG(CONSOLE) << "Writing to " << name_row << " in "
|
||||
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||
<< (bytes_write >> 20UL) << " written";
|
||||
if (tdiff >= tick_expected) {
|
||||
LOG(CONSOLE) << "Writing to " << name_row << " in "
|
||||
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||
<< (bytes_write >> 20UL) << " written";
|
||||
tick_expected += kStep;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -133,7 +139,7 @@ void SparsePageSource::Create(DMatrix* src,
|
||||
std::string name_row = cache_prefix + ".row.page";
|
||||
std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(name_row.c_str(), "w"));
|
||||
// find format.
|
||||
std::string name_format = SparsePage::Format::DecideFormat(cache_prefix);
|
||||
std::string name_format = SparsePage::Format::DecideFormat(cache_prefix).first;
|
||||
fo->Write(name_format);
|
||||
std::unique_ptr<SparsePage::Format> format(SparsePage::Format::Create(name_format));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user