Rewrite sparse dmatrix using callbacks. (#7092)

- Reduce dependency on dmlc parsers and provide an interface for users to load data by themselves.
- Remove use of threaded iterator and IO queue.
- Remove `page_size`.
- Make sure the number of pages in memory is bounded.
- Make sure the cache can not be violated.
- Provide an interface for internal algorithms to process data asynchronously.
This commit is contained in:
Jiaming Yuan
2021-07-16 12:33:31 +08:00
committed by GitHub
parent 2f524e9f41
commit bd1f3a38f0
51 changed files with 1445 additions and 1391 deletions

View File

@@ -125,12 +125,10 @@ TEST(DenseColumnWithMissing, Test) {
}
void TestGHistIndexMatrixCreation(size_t nthreads) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
/* This should create multiple sparse pages */
std::unique_ptr<DMatrix> dmat{ CreateSparsePageDMatrix(kEntries, kPageSize, filename) };
std::unique_ptr<DMatrix> dmat{ CreateSparsePageDMatrix(kEntries) };
omp_set_num_threads(nthreads);
GHistIndexMatrix gmat(dmat.get(), 256);
}

View File

@@ -83,7 +83,7 @@ inline std::shared_ptr<DMatrix> GetExternalMemoryDMatrixFromData(
}
fo.close();
return std::shared_ptr<DMatrix>(DMatrix::Load(
tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size));
tmp_file + "#" + tmp_file + ".cache", true, false, "auto"));
}
// Test that elements are approximately equally distributed among bins