Dmatrix refactor stage 1 (#3301)
* Use sparse page as singular CSR matrix representation * Simplify dmatrix methods * Reduce statefulness of batch iterators * BREAKING CHANGE: Remove prob_buffer_row parameter. Users are instead recommended to sample their dataset as a preprocessing step before using XGBoost.
This commit is contained in:
@@ -5,14 +5,14 @@
|
||||
*/
|
||||
#include <xgboost/data.h>
|
||||
#include <dmlc/registry.h>
|
||||
#include "./sparse_batch_page.h"
|
||||
#include "./sparse_page_writer.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
DMLC_REGISTRY_FILE_TAG(sparse_page_raw_format);
|
||||
|
||||
class SparsePageRawFormat : public SparsePage::Format {
|
||||
class SparsePageRawFormat : public SparsePageFormat {
|
||||
public:
|
||||
bool Read(SparsePage* page, dmlc::SeekStream* fi) override {
|
||||
if (!fi->Read(&(page->offset))) return false;
|
||||
@@ -20,8 +20,8 @@ class SparsePageRawFormat : public SparsePage::Format {
|
||||
page->data.resize(page->offset.back());
|
||||
if (page->data.size() != 0) {
|
||||
CHECK_EQ(fi->Read(dmlc::BeginPtr(page->data),
|
||||
(page->data).size() * sizeof(SparseBatch::Entry)),
|
||||
(page->data).size() * sizeof(SparseBatch::Entry))
|
||||
(page->data).size() * sizeof(Entry)),
|
||||
(page->data).size() * sizeof(Entry))
|
||||
<< "Invalid SparsePage file";
|
||||
}
|
||||
return true;
|
||||
@@ -47,7 +47,7 @@ class SparsePageRawFormat : public SparsePage::Format {
|
||||
bst_uint fid = sorted_index_set[i];
|
||||
if (disk_offset_[fid] != curr_offset) {
|
||||
CHECK_GT(disk_offset_[fid], curr_offset);
|
||||
fi->Seek(begin + disk_offset_[fid] * sizeof(SparseBatch::Entry));
|
||||
fi->Seek(begin + disk_offset_[fid] * sizeof(Entry));
|
||||
curr_offset = disk_offset_[fid];
|
||||
}
|
||||
size_t j, size_to_read = 0;
|
||||
@@ -61,8 +61,8 @@ class SparsePageRawFormat : public SparsePage::Format {
|
||||
|
||||
if (size_to_read != 0) {
|
||||
CHECK_EQ(fi->Read(dmlc::BeginPtr(page->data) + page->offset[i],
|
||||
size_to_read * sizeof(SparseBatch::Entry)),
|
||||
size_to_read * sizeof(SparseBatch::Entry))
|
||||
size_to_read * sizeof(Entry)),
|
||||
size_to_read * sizeof(Entry))
|
||||
<< "Invalid SparsePage file";
|
||||
curr_offset += size_to_read;
|
||||
}
|
||||
@@ -70,7 +70,7 @@ class SparsePageRawFormat : public SparsePage::Format {
|
||||
}
|
||||
// seek to end of record
|
||||
if (curr_offset != disk_offset_.back()) {
|
||||
fi->Seek(begin + disk_offset_.back() * sizeof(SparseBatch::Entry));
|
||||
fi->Seek(begin + disk_offset_.back() * sizeof(Entry));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -80,7 +80,7 @@ class SparsePageRawFormat : public SparsePage::Format {
|
||||
CHECK_EQ(page.offset.back(), page.data.size());
|
||||
fo->Write(page.offset);
|
||||
if (page.data.size() != 0) {
|
||||
fo->Write(dmlc::BeginPtr(page.data), page.data.size() * sizeof(SparseBatch::Entry));
|
||||
fo->Write(dmlc::BeginPtr(page.data), page.data.size() * sizeof(Entry));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user