Deterministic data partitioning for external memory (#6317)

* Make external memory data partitioning deterministic.

* Change the meaning of `page_size` from bytes to number of rows.

* Design a data pool.

* Note for external memory.

* Enable unity build on Windows CI.

* Force garbage collection in tests.
This commit is contained in:
Jiaming Yuan
2020-11-11 06:11:06 +08:00
committed by GitHub
parent 9564886d9f
commit 43efadea2e
15 changed files with 334 additions and 88 deletions

View File

@@ -373,12 +373,8 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
batch_count++;
row_count += batch.Size();
}
#if defined(_OPENMP)
EXPECT_GE(batch_count, 2);
EXPECT_EQ(row_count, dmat->Info().num_row_);
#else
#warning "External memory doesn't work with Non-OpenMP build "
#endif // defined(_OPENMP)
return dmat;
}
@@ -495,6 +491,36 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(
return gbm;
}
/**
 * \brief Dump the content of a DMatrix into three flat CSR arrays.
 *
 * Every SparsePage batch of \p dmat is appended to one in-memory page, then the
 * row offsets, entry values and entry feature indices of that page are copied
 * into the output vectors.
 *
 * \param dmat      Matrix to read; its meta info supplies the output sizes.
 * \param p_data    Output: one value per entry, resized to `num_nonzero_`.
 * \param p_row_ptr Output: row offsets, resized to `num_row_ + 1`.
 * \param p_cids    Output: feature index per entry, same length as \p p_data.
 *
 * Size mismatches between the merged page and the meta info are reported
 * through CHECK_EQ (offsets) and ASSERT_EQ (entries).
 */
void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
                  std::vector<size_t> *p_row_ptr,
                  std::vector<bst_feature_t> *p_cids) {
  std::vector<float> &data = *p_data;
  std::vector<size_t> &row_ptr = *p_row_ptr;
  std::vector<bst_feature_t> &cids = *p_cids;

  // Size the outputs up front from the matrix meta info.
  auto const n_entries = dmat->Info().num_nonzero_;
  auto const n_rows = dmat->Info().num_row_;
  data.resize(n_entries);
  cids.resize(data.size());
  row_ptr.resize(n_rows + 1);

  // Stitch all batches together so the matrix can be read as a single page.
  SparsePage merged;
  for (auto const &chunk : dmat->GetBatches<SparsePage>()) {
    merged.Push(chunk);
  }
  const auto &in_offset = merged.offset.HostVector();
  const auto &in_data = merged.data.HostVector();

  // Row pointers.
  CHECK_EQ(in_offset.size(), row_ptr.size());
  std::copy(in_offset.cbegin(), in_offset.cend(), row_ptr.begin());

  // Entry values.
  ASSERT_EQ(in_data.size(), data.size());
  {
    auto out = data.begin();
    for (Entry const &entry : in_data) {
      *out = entry.fvalue;
      ++out;
    }
  }

  // Entry feature (column) indices.
  ASSERT_EQ(in_data.size(), cids.size());
  {
    auto out = cids.begin();
    for (Entry const &entry : in_data) {
      *out = entry.index;
      ++out;
    }
  }
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
using CUDAMemoryResource = rmm::mr::cuda_memory_resource;