Rewrite sparse dmatrix using callbacks. (#7092)

- Reduce dependency on dmlc parsers and provide an interface for users to load data by themselves.
- Remove use of threaded iterator and IO queue.
- Remove `page_size`.
- Make sure the number of pages in memory is bounded.
- Make sure the cache cannot be violated.
- Provide an interface for internal algorithms to process data asynchronously.
This commit is contained in:
Jiaming Yuan
2021-07-16 12:33:31 +08:00
committed by GitHub
parent 2f524e9f41
commit bd1f3a38f0
51 changed files with 1445 additions and 1391 deletions

View File

@@ -92,13 +92,10 @@ TEST(CpuPredictor, IterationRange) {
}
TEST(CpuPredictor, ExternalMemory) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, kPageSize, filename);
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
auto lparam = CreateEmptyGenericParam(GPUIDX);
std::unique_ptr<Predictor> cpu_predictor =

View File

@@ -102,13 +102,10 @@ TEST(GPUPredictor, ExternalMemoryTest) {
gbm::GBTreeModel model = CreateTestModel(&param, n_classes);
std::vector<std::unique_ptr<DMatrix>> dmats;
dmlc::TemporaryDirectory tmpdir;
std::string file0 = tmpdir.path + "/big_0.libsvm";
std::string file1 = tmpdir.path + "/big_1.libsvm";
std::string file2 = tmpdir.path + "/big_2.libsvm";
dmats.push_back(CreateSparsePageDMatrix(400, 64UL, file0));
dmats.push_back(CreateSparsePageDMatrix(800, 128UL, file1));
dmats.push_back(CreateSparsePageDMatrix(8000, 1024UL, file2));
dmats.push_back(CreateSparsePageDMatrix(400));
dmats.push_back(CreateSparsePageDMatrix(800));
dmats.push_back(CreateSparsePageDMatrix(8000));
for (const auto& dmat: dmats) {
dmat->Info().base_margin_.Resize(dmat->Info().num_row_ * n_classes, 0.5);