[EM] Enable access to the number of batches. (#10691)
- Expose `NumBatches` in `DMatrix`.
- Small cleanup for removing legacy CUDA stream and ~force CUDA context initialization~.
- Purge old external memory data generation code.
This commit is contained in:
@@ -63,26 +63,27 @@ TEST(SparsePage, PushCSC) {
|
||||
}
|
||||
|
||||
TEST(SparsePage, PushCSCAfterTranspose) {
|
||||
size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
|
||||
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
|
||||
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
|
||||
bst_idx_t constexpr kRows = 1024, kCols = 21;
|
||||
|
||||
auto dmat =
|
||||
RandomDataGenerator{kRows, kCols, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true);
|
||||
const int ncols = dmat->Info().num_col_;
|
||||
SparsePage page; // Consolidated sparse page
|
||||
for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
|
||||
SparsePage page; // Consolidated sparse page
|
||||
for (const auto& batch : dmat->GetBatches<xgboost::SparsePage>()) {
|
||||
// Transpose each batch and push
|
||||
SparsePage tmp = batch.GetTranspose(ncols, AllThreadsForTest());
|
||||
page.PushCSC(tmp);
|
||||
}
|
||||
|
||||
// Make sure that the final sparse page has the right number of entries
|
||||
ASSERT_EQ(kEntries, page.data.Size());
|
||||
ASSERT_EQ(kRows * kCols, page.data.Size());
|
||||
|
||||
page.SortRows(AllThreadsForTest());
|
||||
auto v = page.GetView();
|
||||
for (size_t i = 0; i < v.Size(); ++i) {
|
||||
auto column = v[i];
|
||||
for (size_t j = 1; j < column.size(); ++j) {
|
||||
ASSERT_GE(column[j].fvalue, column[j-1].fvalue);
|
||||
ASSERT_GE(column[j].fvalue, column[j - 1].fvalue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -140,13 +140,11 @@ struct ReadRowFunction {
|
||||
TEST(EllpackPage, Copy) {
|
||||
constexpr size_t kRows = 1024;
|
||||
constexpr size_t kCols = 16;
|
||||
constexpr size_t kPageSize = 1024;
|
||||
|
||||
// Create a DMatrix with multiple batches.
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::unique_ptr<DMatrix>
|
||||
dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||
Context ctx{MakeCUDACtx(0)};
|
||||
auto dmat =
|
||||
RandomDataGenerator{kRows, kCols, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true);
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
|
||||
|
||||
@@ -187,14 +185,12 @@ TEST(EllpackPage, Copy) {
|
||||
TEST(EllpackPage, Compact) {
|
||||
constexpr size_t kRows = 16;
|
||||
constexpr size_t kCols = 2;
|
||||
constexpr size_t kPageSize = 1;
|
||||
constexpr size_t kCompactedRows = 8;
|
||||
|
||||
// Create a DMatrix with multiple batches.
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::unique_ptr<DMatrix> dmat(
|
||||
CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||
Context ctx{MakeCUDACtx(0)};
|
||||
auto dmat =
|
||||
RandomDataGenerator{kRows, kCols, 0.0f}.Batches(2).GenerateSparsePageDMatrix("temp", true);
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
|
||||
|
||||
|
||||
@@ -214,15 +214,15 @@ TEST(SparsePageDMatrix, MetaInfo) {
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, RowAccess) {
|
||||
std::unique_ptr<xgboost::DMatrix> dmat = xgboost::CreateSparsePageDMatrix(24);
|
||||
auto dmat = RandomDataGenerator{12, 6, 0.8f}.Batches(2).GenerateSparsePageDMatrix("temp", false);
|
||||
|
||||
// Test the data read into the first row
|
||||
auto &batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
|
||||
auto page = batch.GetView();
|
||||
auto first_row = page[0];
|
||||
ASSERT_EQ(first_row.size(), 3ul);
|
||||
EXPECT_EQ(first_row[2].index, 2u);
|
||||
EXPECT_NEAR(first_row[2].fvalue, 0.986566, 1e-4);
|
||||
ASSERT_EQ(first_row.size(), 1ul);
|
||||
EXPECT_EQ(first_row[0].index, 5u);
|
||||
EXPECT_NEAR(first_row[0].fvalue, 0.1805125, 1e-4);
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, ColAccess) {
|
||||
@@ -268,11 +268,10 @@ TEST(SparsePageDMatrix, ColAccess) {
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, ThreadSafetyException) {
|
||||
size_t constexpr kEntriesPerCol = 3;
|
||||
size_t constexpr kEntries = 64 * kEntriesPerCol * 2;
|
||||
Context ctx;
|
||||
|
||||
std::unique_ptr<xgboost::DMatrix> dmat = xgboost::CreateSparsePageDMatrix(kEntries);
|
||||
auto dmat =
|
||||
RandomDataGenerator{4096, 12, 0.0f}.Batches(8).GenerateSparsePageDMatrix("temp", true);
|
||||
|
||||
int threads = 1000;
|
||||
|
||||
@@ -304,10 +303,9 @@ TEST(SparsePageDMatrix, ThreadSafetyException) {
|
||||
|
||||
// Multi-batches access
|
||||
TEST(SparsePageDMatrix, ColAccessBatches) {
|
||||
size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
|
||||
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
|
||||
// Create multiple sparse pages
|
||||
std::unique_ptr<xgboost::DMatrix> dmat{xgboost::CreateSparsePageDMatrix(kEntries)};
|
||||
auto dmat =
|
||||
RandomDataGenerator{1024, 32, 0.4f}.Batches(3).GenerateSparsePageDMatrix("temp", true);
|
||||
ASSERT_EQ(dmat->Ctx()->Threads(), AllThreadsForTest());
|
||||
Context ctx;
|
||||
for (auto const &page : dmat->GetBatches<xgboost::CSCPage>(&ctx)) {
|
||||
|
||||
@@ -115,13 +115,10 @@ TEST(SparsePageDMatrix, EllpackSkipSparsePage) {
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, MultipleEllpackPages) {
|
||||
Context ctx{MakeCUDACtx(0)};
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string filename = tmpdir.path + "/big.libsvm";
|
||||
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
|
||||
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
|
||||
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, filename);
|
||||
auto dmat = RandomDataGenerator{1024, 2, 0.5f}.Batches(2).GenerateSparsePageDMatrix("temp", true);
|
||||
|
||||
// Loop over the batches and count the records
|
||||
std::int64_t batch_count = 0;
|
||||
@@ -135,15 +132,13 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
|
||||
EXPECT_EQ(row_count, dmat->Info().num_row_);
|
||||
|
||||
auto path =
|
||||
data::MakeId(filename,
|
||||
dynamic_cast<data::SparsePageDMatrix *>(dmat.get())) +
|
||||
".ellpack.page";
|
||||
data::MakeId("tmep", dynamic_cast<data::SparsePageDMatrix*>(dmat.get())) + ".ellpack.page";
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, RetainEllpackPage) {
|
||||
Context ctx{MakeCUDACtx(0)};
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
auto param = BatchParam{32, tree::TrainParam::DftSparseThreshold()};
|
||||
auto m = CreateSparsePageDMatrix(10000);
|
||||
auto m = RandomDataGenerator{2048, 4, 0.0f}.Batches(8).GenerateSparsePageDMatrix("temp", true);
|
||||
|
||||
auto batches = m->GetBatches<EllpackPage>(&ctx, param);
|
||||
auto begin = batches.begin();
|
||||
@@ -278,20 +273,19 @@ struct ReadRowFunction {
|
||||
};
|
||||
|
||||
TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
|
||||
constexpr size_t kRows = 6;
|
||||
constexpr size_t kRows = 16;
|
||||
constexpr size_t kCols = 2;
|
||||
constexpr int kMaxBins = 256;
|
||||
constexpr size_t kPageSize = 1;
|
||||
|
||||
// Create an in-memory DMatrix.
|
||||
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
|
||||
auto dmat =
|
||||
RandomDataGenerator{kRows, kCols, 0.0f}.Batches(1).GenerateSparsePageDMatrix("temp", true);
|
||||
|
||||
// Create a DMatrix with multiple batches.
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::unique_ptr<DMatrix>
|
||||
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||
auto dmat_ext =
|
||||
RandomDataGenerator{kRows, kCols, 0.0f}.Batches(2).GenerateSparsePageDMatrix("temp", true);
|
||||
|
||||
Context ctx{MakeCUDACtx(0)};
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
auto param = BatchParam{kMaxBins, tree::TrainParam::DftSparseThreshold()};
|
||||
auto impl = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
|
||||
EXPECT_EQ(impl->base_rowid, 0);
|
||||
@@ -325,17 +319,16 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) {
|
||||
constexpr size_t kRows = 1024;
|
||||
constexpr size_t kCols = 16;
|
||||
constexpr int kMaxBins = 256;
|
||||
constexpr size_t kPageSize = 4096;
|
||||
|
||||
// Create an in-memory DMatrix.
|
||||
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
|
||||
auto dmat =
|
||||
RandomDataGenerator{kRows, kCols, 0.0f}.Batches(1).GenerateSparsePageDMatrix("temp", true);
|
||||
|
||||
// Create a DMatrix with multiple batches.
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::unique_ptr<DMatrix>
|
||||
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||
auto dmat_ext =
|
||||
RandomDataGenerator{kRows, kCols, 0.0f}.Batches(8).GenerateSparsePageDMatrix("temp", true);
|
||||
|
||||
Context ctx{MakeCUDACtx(0)};
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
auto param = BatchParam{kMaxBins, tree::TrainParam::DftSparseThreshold()};
|
||||
|
||||
size_t current_row = 0;
|
||||
|
||||
Reference in New Issue
Block a user