merge 23Mar01

This commit is contained in:
amdsc21
2023-05-02 00:05:58 +02:00
258 changed files with 7471 additions and 5379 deletions

View File

@@ -1,17 +1,17 @@
/*!
* Copyright 2019-2020 XGBoost contributors
/**
* Copyright 2019-2023, XGBoost contributors
*/
#include <xgboost/base.h>
#include <utility>
#include "../helpers.h"
#include "../histogram_helpers.h"
#include "gtest/gtest.h"
#include "../../../src/common/categorical.h"
#include "../../../src/common/hist_util.h"
#include "../../../src/data/ellpack_page.cuh"
#include "../../../src/tree/param.h" // TrainParam
#include "../helpers.h"
#include "../histogram_helpers.h"
#include "gtest/gtest.h"
namespace xgboost {
@@ -19,7 +19,10 @@ TEST(EllpackPage, EmptyDMatrix) {
constexpr int kNRows = 0, kNCols = 0, kMaxBin = 256;
constexpr float kSparsity = 0;
auto dmat = RandomDataGenerator(kNRows, kNCols, kSparsity).GenerateDMatrix();
auto& page = *dmat->GetBatches<EllpackPage>({0, kMaxBin}).begin();
Context ctx{MakeCUDACtx(0)};
auto& page = *dmat->GetBatches<EllpackPage>(
&ctx, BatchParam{kMaxBin, tree::TrainParam::DftSparseThreshold()})
.begin();
auto impl = page.Impl();
ASSERT_EQ(impl->row_stride, 0);
ASSERT_EQ(impl->Cuts().TotalBins(), 0);
@@ -87,8 +90,9 @@ TEST(EllpackPage, FromCategoricalBasic) {
auto& h_ft = m->Info().feature_types.HostVector();
h_ft.resize(kCols, FeatureType::kCategorical);
BatchParam p{0, max_bins};
auto ellpack = EllpackPage(m.get(), p);
Context ctx{MakeCUDACtx(0)};
auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()};
auto ellpack = EllpackPage(&ctx, m.get(), p);
auto accessor = ellpack.Impl()->GetDeviceAccessor(0);
ASSERT_EQ(kCats, accessor.NumBins());
@@ -142,8 +146,9 @@ TEST(EllpackPage, Copy) {
dmlc::TemporaryDirectory tmpdir;
std::unique_ptr<DMatrix>
dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
BatchParam param{0, 256};
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
Context ctx{MakeCUDACtx(0)};
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
// Create an empty result page.
EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride,
@@ -151,7 +156,7 @@ TEST(EllpackPage, Copy) {
// Copy batch pages into the result page.
size_t offset = 0;
for (auto& batch : dmat->GetBatches<EllpackPage>(param)) {
for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
size_t num_elements = result.Copy(0, batch.Impl(), offset);
offset += num_elements;
}
@@ -161,7 +166,7 @@ TEST(EllpackPage, Copy) {
thrust::device_vector<bst_float> row_result_d(kCols);
std::vector<bst_float> row(kCols);
std::vector<bst_float> row_result(kCols);
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
for (auto& page : dmat->GetBatches<EllpackPage>(&ctx, param)) {
auto impl = page.Impl();
EXPECT_EQ(impl->base_rowid, current_row);
@@ -186,10 +191,11 @@ TEST(EllpackPage, Compact) {
// Create a DMatrix with multiple batches.
dmlc::TemporaryDirectory tmpdir;
std::unique_ptr<DMatrix>
dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
BatchParam param{0, 256};
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
std::unique_ptr<DMatrix> dmat(
CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
Context ctx{MakeCUDACtx(0)};
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
// Create an empty result page.
EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride,
@@ -201,7 +207,7 @@ TEST(EllpackPage, Compact) {
SIZE_MAX};
thrust::device_vector<size_t> row_indexes_d = row_indexes_h;
common::Span<size_t> row_indexes_span(row_indexes_d.data().get(), kRows);
for (auto& batch : dmat->GetBatches<EllpackPage>(param)) {
for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
result.Compact(0, batch.Impl(), row_indexes_span);
}
@@ -210,7 +216,7 @@ TEST(EllpackPage, Compact) {
thrust::device_vector<bst_float> row_result_d(kCols);
std::vector<bst_float> row(kCols);
std::vector<bst_float> row_result(kCols);
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
for (auto& page : dmat->GetBatches<EllpackPage>(&ctx, param)) {
auto impl = page.Impl();
ASSERT_EQ(impl->base_rowid, current_row);
@@ -249,15 +255,17 @@ class EllpackPageTest : public testing::TestWithParam<float> {
// device.
size_t n_samples{128}, n_features{13};
Context ctx;
ctx.gpu_id = 0;
Context gpu_ctx{MakeCUDACtx(0)};
auto Xy = RandomDataGenerator{n_samples, n_features, sparsity}.GenerateDMatrix(true);
std::unique_ptr<EllpackPageImpl> from_ghist;
ASSERT_TRUE(Xy->SingleColBlock());
for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(BatchParam{17, 0.6})) {
from_ghist.reset(new EllpackPageImpl{&ctx, page, {}});
for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{17, 0.6})) {
from_ghist.reset(new EllpackPageImpl{&gpu_ctx, page, {}});
}
for (auto const& page : Xy->GetBatches<EllpackPage>(BatchParam{0, 17})) {
for (auto const& page : Xy->GetBatches<EllpackPage>(
&gpu_ctx, BatchParam{17, tree::TrainParam::DftSparseThreshold()})) {
auto from_sparse_page = page.Impl();
ASSERT_EQ(from_sparse_page->is_dense, from_ghist->is_dense);
ASSERT_EQ(from_sparse_page->base_rowid, 0);

View File

@@ -1,17 +1,21 @@
/*!
* Copyright 2021 XGBoost contributors
/**
* Copyright 2021-2023, XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/data.h>
#include "../../../src/data/ellpack_page.cuh"
#include "../../../src/data/sparse_page_source.h"
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../../../src/tree/param.h" // TrainParam
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
namespace xgboost {
namespace data {
TEST(EllpackPageRawFormat, IO) {
Context ctx{MakeCUDACtx(0)};
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
std::unique_ptr<SparsePageFormat<EllpackPage>> format{CreatePageFormat<EllpackPage>("raw")};
auto m = RandomDataGenerator{100, 14, 0.5}.GenerateDMatrix();
@@ -20,7 +24,7 @@ TEST(EllpackPageRawFormat, IO) {
{
std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
for (auto const &ellpack : m->GetBatches<EllpackPage>({0, 256})) {
for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
format->Write(ellpack, fo.get());
}
}
@@ -29,7 +33,7 @@ TEST(EllpackPageRawFormat, IO) {
std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
format->Read(&page, fi.get());
for (auto const &ellpack : m->GetBatches<EllpackPage>({0, 256})) {
for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
auto loaded = page.Impl();
auto orig = ellpack.Impl();
ASSERT_EQ(loaded->Cuts().Ptrs(), orig->Cuts().Ptrs());

View File

@@ -29,16 +29,16 @@ TEST(FileIterator, Basic) {
{
auto zpath = tmpdir.path + "/0-based.svm";
CreateBigTestData(zpath, 3 * 64, true);
zpath += "?indexing_mode=0";
FileIterator iter{zpath, 0, 1, "libsvm"};
zpath += "?indexing_mode=0&format=libsvm";
FileIterator iter{zpath, 0, 1};
check_n_features(&iter);
}
{
auto opath = tmpdir.path + "/1-based.svm";
CreateBigTestData(opath, 3 * 64, false);
opath += "?indexing_mode=1";
FileIterator iter{opath, 0, 1, "libsvm"};
opath += "?indexing_mode=1&format=libsvm";
FileIterator iter{opath, 0, 1};
check_n_features(&iter);
}
}

View File

@@ -2,20 +2,38 @@
* Copyright 2021-2023 by XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/data.h>
#include <xgboost/data.h> // for BatchIterator, BatchSet, DMatrix, BatchParam
#include "../../../src/common/column_matrix.h"
#include "../../../src/common/io.h" // MemoryBufferStream
#include "../../../src/data/gradient_index.h"
#include "../helpers.h"
#include <algorithm> // for sort, unique
#include <cmath> // for isnan
#include <cstddef> // for size_t
#include <limits> // for numeric_limits
#include <memory> // for shared_ptr, __shared_ptr_access, unique_ptr
#include <string> // for string
#include <tuple> // for make_tuple, tie, tuple
#include <utility> // for move
#include <vector> // for vector
#include "../../../src/common/categorical.h" // for AsCat
#include "../../../src/common/column_matrix.h" // for ColumnMatrix
#include "../../../src/common/hist_util.h" // for Index, HistogramCuts, SketchOnDMatrix
#include "../../../src/common/io.h" // for MemoryBufferStream
#include "../../../src/data/adapter.h" // for SparsePageAdapterBatch
#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix
#include "../../../src/tree/param.h" // for TrainParam
#include "../helpers.h" // for CreateEmptyGenericParam, GenerateRandomCa...
#include "xgboost/base.h" // for bst_bin_t
#include "xgboost/context.h" // for Context
#include "xgboost/host_device_vector.h" // for HostDeviceVector
namespace xgboost {
namespace data {
TEST(GradientIndex, ExternalMemory) {
auto ctx = CreateEmptyGenericParam(Context::kCpuId);
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(10000);
std::vector<size_t> base_rowids;
std::vector<float> hessian(dmat->Info().num_row_, 1);
for (auto const &page : dmat->GetBatches<GHistIndexMatrix>({64, hessian, true})) {
for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, true})) {
base_rowids.push_back(page.base_rowid);
}
size_t i = 0;
@@ -24,9 +42,8 @@ TEST(GradientIndex, ExternalMemory) {
++i;
}
base_rowids.clear();
for (auto const &page : dmat->GetBatches<GHistIndexMatrix>({64, hessian, false})) {
for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, false})) {
base_rowids.push_back(page.base_rowid);
}
i = 0;
@@ -41,12 +58,13 @@ TEST(GradientIndex, FromCategoricalBasic) {
size_t max_bins = 8;
auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
auto m = GetDMatrixFromData(x, kRows, 1);
auto ctx = CreateEmptyGenericParam(Context::kCpuId);
auto &h_ft = m->Info().feature_types.HostVector();
h_ft.resize(kCols, FeatureType::kCategorical);
BatchParam p(max_bins, 0.8);
GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, AllThreadsForTest(), {});
GHistIndexMatrix gidx(&ctx, m.get(), max_bins, p.sparse_thresh, false, {});
auto x_copy = x;
std::sort(x_copy.begin(), x_copy.end());
@@ -80,11 +98,11 @@ TEST(GradientIndex, FromCategoricalLarge) {
BatchParam p{max_bins, 0.8};
{
GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, AllThreadsForTest(), {});
GHistIndexMatrix gidx{&ctx, m.get(), max_bins, p.sparse_thresh, false, {}};
ASSERT_TRUE(gidx.index.GetBinTypeSize() == common::kUint16BinsTypeSize);
}
{
for (auto const &page : m->GetBatches<GHistIndexMatrix>(p)) {
for (auto const &page : m->GetBatches<GHistIndexMatrix>(&ctx, p)) {
common::HistogramCuts cut = page.cut;
GHistIndexMatrix gidx{m->Info(), std::move(cut), max_bins};
ASSERT_EQ(gidx.MaxNumBinPerFeat(), kCats);
@@ -96,10 +114,11 @@ TEST(GradientIndex, PushBatch) {
size_t constexpr kRows = 64, kCols = 4;
bst_bin_t max_bins = 64;
float st = 0.5;
Context ctx;
auto test = [&](float sparisty) {
auto m = RandomDataGenerator{kRows, kCols, sparisty}.GenerateDMatrix(true);
auto cuts = common::SketchOnDMatrix(m.get(), max_bins, AllThreadsForTest(), false, {});
auto cuts = common::SketchOnDMatrix(&ctx, m.get(), max_bins, false, {});
common::HistogramCuts copy_cuts = cuts;
ASSERT_EQ(m->Info().num_row_, kRows);
@@ -112,7 +131,7 @@ TEST(GradientIndex, PushBatch) {
m->Info().num_row_);
gmat.PushAdapterBatchColumns(m->Ctx(), batch, std::numeric_limits<float>::quiet_NaN(), 0);
}
for (auto const &page : m->GetBatches<GHistIndexMatrix>(BatchParam{max_bins, st})) {
for (auto const &page : m->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{max_bins, st})) {
for (size_t i = 0; i < kRows; ++i) {
for (size_t j = 0; j < kCols; ++j) {
auto v0 = gmat.GetFvalue(i, j, false);
@@ -143,17 +162,19 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
// device.
size_t n_samples{128}, n_features{13};
Context ctx;
ctx.gpu_id = 0;
auto Xy = RandomDataGenerator{n_samples, n_features, 1 - density}.GenerateDMatrix(true);
std::unique_ptr<GHistIndexMatrix> from_ellpack;
ASSERT_TRUE(Xy->SingleColBlock());
bst_bin_t constexpr kBins{17};
auto p = BatchParam{kBins, threshold};
for (auto const &page : Xy->GetBatches<EllpackPage>(BatchParam{0, kBins})) {
Context gpu_ctx;
gpu_ctx.gpu_id = 0;
for (auto const &page : Xy->GetBatches<EllpackPage>(
&gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) {
from_ellpack.reset(new GHistIndexMatrix{&ctx, Xy->Info(), page, p});
}
for (auto const &from_sparse_page : Xy->GetBatches<GHistIndexMatrix>(p)) {
for (auto const &from_sparse_page : Xy->GetBatches<GHistIndexMatrix>(&ctx, p)) {
ASSERT_EQ(from_sparse_page.IsDense(), from_ellpack->IsDense());
ASSERT_EQ(from_sparse_page.base_rowid, 0);
ASSERT_EQ(from_sparse_page.base_rowid, from_ellpack->base_rowid);

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2021 XGBoost contributors
/**
* Copyright 2021-2023, XGBoost contributors
*/
#include <gtest/gtest.h>
@@ -11,6 +11,8 @@
namespace xgboost {
namespace data {
TEST(GHistIndexPageRawFormat, IO) {
Context ctx;
std::unique_ptr<SparsePageFormat<GHistIndexMatrix>> format{
CreatePageFormat<GHistIndexMatrix>("raw")};
auto m = RandomDataGenerator{100, 14, 0.5}.GenerateDMatrix();
@@ -20,7 +22,7 @@ TEST(GHistIndexPageRawFormat, IO) {
{
std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
for (auto const &index : m->GetBatches<GHistIndexMatrix>(batch)) {
for (auto const &index : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
format->Write(index, fo.get());
}
}
@@ -29,7 +31,7 @@ TEST(GHistIndexPageRawFormat, IO) {
std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
format->Read(&page, fi.get());
for (auto const &gidx : m->GetBatches<GHistIndexMatrix>(batch)) {
for (auto const &gidx : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
auto const &loaded = gidx;
ASSERT_EQ(loaded.cut.Ptrs(), page.cut.Ptrs());
ASSERT_EQ(loaded.cut.MinValues(), page.cut.MinValues());
@@ -43,5 +45,5 @@ TEST(GHistIndexPageRawFormat, IO) {
ASSERT_EQ(loaded.Transpose().GetTypeSize(), loaded.Transpose().GetTypeSize());
}
}
} // namespace data
} // namespace xgboost
} // namespace data
} // namespace xgboost

View File

@@ -15,8 +15,9 @@
namespace xgboost {
namespace data {
TEST(IterativeDMatrix, Ref) {
Context ctx;
TestRefDMatrix<GHistIndexMatrix, NumpyArrayIterForTest>(
[&](GHistIndexMatrix const& page) { return page.cut; });
&ctx, [&](GHistIndexMatrix const& page) { return page.cut; });
}
TEST(IterativeDMatrix, IsDense) {

View File

@@ -1,11 +1,12 @@
/*!
* Copyright 2020-2022 XGBoost contributors
/**
* Copyright 2020-2023, XGBoost contributors
*/
#include <gtest/gtest.h>
#include "../../../src/data/device_adapter.cuh"
#include "../../../src/data/ellpack_page.cuh"
#include "../../../src/data/iterative_dmatrix.h"
#include "../../../src/tree/param.h" // TrainParam
#include "../helpers.h"
#include "test_iterative_dmatrix.h"
@@ -13,15 +14,17 @@ namespace xgboost {
namespace data {
void TestEquivalent(float sparsity) {
Context ctx{MakeCUDACtx(0)};
CudaArrayIterForTest iter{sparsity};
IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next,
std::numeric_limits<float>::quiet_NaN(), 0, 256);
size_t offset = 0;
auto first = (*m.GetEllpackBatches({}).begin()).Impl();
std::size_t offset = 0;
auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl();
std::unique_ptr<EllpackPageImpl> page_concatenated {
new EllpackPageImpl(0, first->Cuts(), first->is_dense,
first->row_stride, 1000 * 100)};
for (auto& batch : m.GetBatches<EllpackPage>({})) {
for (auto& batch : m.GetBatches<EllpackPage>(&ctx, {})) {
auto page = batch.Impl();
size_t num_elements = page_concatenated->Copy(0, page, offset);
offset += num_elements;
@@ -34,8 +37,8 @@ void TestEquivalent(float sparsity) {
auto adapter = CupyAdapter(interface_str);
std::unique_ptr<DMatrix> dm{
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 0)};
BatchParam bp {0, 256};
for (auto& ellpack : dm->GetBatches<EllpackPage>(bp)) {
auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
for (auto& ellpack : dm->GetBatches<EllpackPage>(&ctx, bp)) {
auto from_data = ellpack.Impl()->GetDeviceAccessor(0);
std::vector<float> cuts_from_iter(from_iter.gidx_fvalue_map.size());
@@ -92,7 +95,8 @@ TEST(IterativeDeviceDMatrix, RowMajor) {
std::numeric_limits<float>::quiet_NaN(), 0, 256);
size_t n_batches = 0;
std::string interface_str = iter.AsArray();
for (auto& ellpack : m.GetBatches<EllpackPage>({})) {
Context ctx{MakeCUDACtx(0)};
for (auto& ellpack : m.GetBatches<EllpackPage>(&ctx, {})) {
n_batches ++;
auto impl = ellpack.Impl();
common::CompressedIterator<uint32_t> iterator(
@@ -140,7 +144,10 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next,
std::numeric_limits<float>::quiet_NaN(), 0, 256);
auto &ellpack = *m.GetBatches<EllpackPage>({0, 256}).begin();
auto ctx = MakeCUDACtx(0);
auto& ellpack =
*m.GetBatches<EllpackPage>(&ctx, BatchParam{256, tree::TrainParam::DftSparseThreshold()})
.begin();
auto impl = ellpack.Impl();
common::CompressedIterator<uint32_t> iterator(
impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
@@ -171,8 +178,9 @@ TEST(IterativeDeviceDMatrix, IsDense) {
}
TEST(IterativeDeviceDMatrix, Ref) {
Context ctx{MakeCUDACtx(0)};
TestRefDMatrix<EllpackPage, CudaArrayIterForTest>(
[](EllpackPage const& page) { return page.Impl()->Cuts(); });
&ctx, [](EllpackPage const& page) { return page.Impl()->Cuts(); });
}
} // namespace data
} // namespace xgboost

View File

@@ -1,8 +1,11 @@
/*!
* Copyright 2022 XGBoost contributors
/**
* Copyright 2022-2023, XGBoost contributors
*/
#pragma once
#include <memory> // std::make_shared
#include <xgboost/context.h> // for Context
#include <limits> // for numeric_limits
#include <memory> // for make_shared
#include "../../../src/data/iterative_dmatrix.h"
#include "../helpers.h"
@@ -10,7 +13,7 @@
namespace xgboost {
namespace data {
template <typename Page, typename Iter, typename Cuts>
void TestRefDMatrix(Cuts&& get_cuts) {
void TestRefDMatrix(Context const* ctx, Cuts&& get_cuts) {
int n_bins = 256;
Iter iter(0.3, 2048);
auto m = std::make_shared<IterativeDMatrix>(&iter, iter.Proxy(), nullptr, Reset, Next,
@@ -20,8 +23,8 @@ void TestRefDMatrix(Cuts&& get_cuts) {
auto m_1 = std::make_shared<IterativeDMatrix>(&iter_1, iter_1.Proxy(), m, Reset, Next,
std::numeric_limits<float>::quiet_NaN(), 0, n_bins);
for (auto const& page_0 : m->template GetBatches<Page>({})) {
for (auto const& page_1 : m_1->template GetBatches<Page>({})) {
for (auto const& page_0 : m->template GetBatches<Page>(ctx, {})) {
for (auto const& page_1 : m_1->template GetBatches<Page>(ctx, {})) {
auto const& cuts_0 = get_cuts(page_0);
auto const& cuts_1 = get_cuts(page_1);
ASSERT_EQ(cuts_0.Values(), cuts_1.Values());
@@ -32,8 +35,8 @@ void TestRefDMatrix(Cuts&& get_cuts) {
m_1 = std::make_shared<IterativeDMatrix>(&iter_1, iter_1.Proxy(), nullptr, Reset, Next,
std::numeric_limits<float>::quiet_NaN(), 0, n_bins);
for (auto const& page_0 : m->template GetBatches<Page>({})) {
for (auto const& page_1 : m_1->template GetBatches<Page>({})) {
for (auto const& page_0 : m->template GetBatches<Page>(ctx, {})) {
for (auto const& page_1 : m_1->template GetBatches<Page>(ctx, {})) {
auto const& cuts_0 = get_cuts(page_0);
auto const& cuts_1 = get_cuts(page_1);
ASSERT_NE(cuts_0.Values(), cuts_1.Values());
@@ -45,8 +48,8 @@ void TestRefDMatrix(Cuts&& get_cuts) {
auto dm = RandomDataGenerator(2048, Iter::Cols(), 0.5).GenerateDMatrix(true);
auto dqm = std::make_shared<IterativeDMatrix>(&iter_1, iter_1.Proxy(), dm, Reset, Next,
std::numeric_limits<float>::quiet_NaN(), 0, n_bins);
for (auto const& page_0 : dm->template GetBatches<Page>({})) {
for (auto const& page_1 : dqm->template GetBatches<Page>({})) {
for (auto const& page_0 : dm->template GetBatches<Page>(ctx, {})) {
for (auto const& page_1 : dqm->template GetBatches<Page>(ctx, {})) {
auto const& cuts_0 = get_cuts(page_0);
auto const& cuts_1 = get_cuts(page_1);
ASSERT_EQ(cuts_0.Values(), cuts_1.Values());

View File

@@ -157,8 +157,7 @@ TEST(MetaInfo, LoadQid) {
dmlc::TemporaryDirectory tempdir;
std::string tmp_file = tempdir.path + "/qid_test.libsvm";
{
std::unique_ptr<dmlc::Stream> fs(
dmlc::Stream::Create(tmp_file.c_str(), "w"));
std::unique_ptr<dmlc::Stream> fs(dmlc::Stream::Create(tmp_file.c_str(), "w"));
dmlc::ostream os(fs.get());
os << R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0
2 qid:1 1:0 2:0 3:1 4:0.1 5:1
@@ -175,7 +174,7 @@ TEST(MetaInfo, LoadQid) {
os.set_stream(nullptr);
}
std::unique_ptr<xgboost::DMatrix> dmat(
xgboost::DMatrix::Load(tmp_file, true, xgboost::DataSplitMode::kRow, "libsvm"));
xgboost::DMatrix::Load(tmp_file + "?format=libsvm", true, xgboost::DataSplitMode::kRow));
const xgboost::MetaInfo& info = dmat->Info();
const std::vector<xgboost::bst_uint> expected_group_ptr{0, 4, 8, 12};

View File

@@ -17,11 +17,15 @@
using namespace xgboost; // NOLINT
namespace {
/**
 * Build a data URI that carries an explicit libsvm format query.
 *
 * @param name Path of the data file.
 * @return `name` followed by "?format=libsvm".
 */
std::string UriSVM(std::string name) {
  // `name` is already a private copy (taken by value), so extend it in place.
  name += "?format=libsvm";
  return name;
}
} // namespace
TEST(SimpleDMatrix, MetaInfo) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
CreateSimpleTestData(tmp_file);
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file);
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
// Test the metadata that was parsed
EXPECT_EQ(dmat->Info().num_row_, 2);
@@ -37,7 +41,7 @@ TEST(SimpleDMatrix, RowAccess) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
CreateSimpleTestData(tmp_file);
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, false);
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file), false);
// Loop over the batches and count the records
int64_t row_count = 0;
@@ -57,16 +61,17 @@ TEST(SimpleDMatrix, RowAccess) {
}
TEST(SimpleDMatrix, ColAccessWithoutBatches) {
Context ctx;
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
CreateSimpleTestData(tmp_file);
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file);
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
ASSERT_TRUE(dmat->SingleColBlock());
// Loop over the batches and assert the data is as expected
int64_t num_col_batch = 0;
for (const auto &batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
for (const auto &batch : dmat->GetBatches<xgboost::SortedCSCPage>(&ctx)) {
num_col_batch += 1;
EXPECT_EQ(batch.Size(), dmat->Info().num_col_)
<< "Expected batch size = number of cells as #batches is 1.";
@@ -387,7 +392,7 @@ TEST(SimpleDMatrix, SaveLoadBinary) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
CreateSimpleTestData(tmp_file);
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file);
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
data::SimpleDMatrix *simple_dmat = dynamic_cast<data::SimpleDMatrix*>(dmat);
const std::string tmp_binfile = tempdir.path + "/csr_source.binary";

View File

@@ -16,14 +16,19 @@
#include "../helpers.h"
using namespace xgboost; // NOLINT
namespace {
/**
 * Build a data URI that carries an explicit libsvm format query and a cache
 * suffix.
 *
 * @param name  Path of the data file.
 * @param cache Cache prefix; ".cache" is appended to it.
 * @return `name` + "?format=libsvm" + "#" + `cache` + ".cache".
 */
std::string UriSVM(std::string name, std::string cache) {
  std::string uri{name};
  uri += "?format=libsvm";
  uri += "#";
  uri += cache;
  uri += ".cache";
  return uri;
}
} // namespace
template <typename Page>
void TestSparseDMatrixLoadFile() {
void TestSparseDMatrixLoadFile(Context const* ctx) {
dmlc::TemporaryDirectory tmpdir;
auto opath = tmpdir.path + "/1-based.svm";
CreateBigTestData(opath, 3 * 64, false);
opath += "?indexing_mode=1";
data::FileIterator iter{opath, 0, 1, "libsvm"};
opath += "?indexing_mode=1&format=libsvm";
data::FileIterator iter{opath, 0, 1};
auto n_threads = 0;
data::SparsePageDMatrix m{&iter,
iter.Proxy(),
@@ -43,7 +48,7 @@ void TestSparseDMatrixLoadFile() {
data::SimpleDMatrix simple{&adapter, std::numeric_limits<float>::quiet_NaN(),
1};
Page out;
for (auto const& page : m.GetBatches<Page>()) {
for (auto const &page : m.GetBatches<Page>(ctx)) {
if (std::is_same<Page, SparsePage>::value) {
out.Push(page);
} else {
@@ -53,7 +58,7 @@ void TestSparseDMatrixLoadFile() {
ASSERT_EQ(m.Info().num_col_, simple.Info().num_col_);
ASSERT_EQ(m.Info().num_row_, simple.Info().num_row_);
for (auto const& page : simple.GetBatches<Page>()) {
for (auto const& page : simple.GetBatches<Page>(ctx)) {
ASSERT_EQ(page.offset.HostVector(), out.offset.HostVector());
for (size_t i = 0; i < page.data.Size(); ++i) {
ASSERT_EQ(page.data.HostVector()[i].fvalue, out.data.HostVector()[i].fvalue);
@@ -62,16 +67,18 @@ void TestSparseDMatrixLoadFile() {
}
TEST(SparsePageDMatrix, LoadFile) {
TestSparseDMatrixLoadFile<SparsePage>();
TestSparseDMatrixLoadFile<CSCPage>();
TestSparseDMatrixLoadFile<SortedCSCPage>();
auto ctx = CreateEmptyGenericParam(Context::kCpuId);
TestSparseDMatrixLoadFile<SparsePage>(&ctx);
TestSparseDMatrixLoadFile<CSCPage>(&ctx);
TestSparseDMatrixLoadFile<SortedCSCPage>(&ctx);
}
// allow caller to retain pages so they can process multiple pages at the same time.
template <typename Page>
void TestRetainPage() {
auto m = CreateSparsePageDMatrix(10000);
auto batches = m->GetBatches<Page>();
auto ctx = CreateEmptyGenericParam(Context::kCpuId);
auto batches = m->GetBatches<Page>(&ctx);
auto begin = batches.begin();
auto end = batches.end();
@@ -95,7 +102,7 @@ void TestRetainPage() {
}
// make sure it's const and the caller can not modify the content of page.
for (auto& page : m->GetBatches<Page>()) {
for (auto &page : m->GetBatches<Page>({&ctx})) {
static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value);
}
}
@@ -112,15 +119,13 @@ TEST(SparsePageDMatrix, MetaInfo) {
size_t constexpr kEntries = 24;
CreateBigTestData(tmp_file, kEntries);
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", false);
std::unique_ptr<DMatrix> dmat{xgboost::DMatrix::Load(UriSVM(tmp_file, tmp_file), false)};
// Test the metadata that was parsed
EXPECT_EQ(dmat->Info().num_row_, 8ul);
EXPECT_EQ(dmat->Info().num_col_, 5ul);
EXPECT_EQ(dmat->Info().num_nonzero_, kEntries);
EXPECT_EQ(dmat->Info().labels.Size(), dmat->Info().num_row_);
delete dmat;
}
TEST(SparsePageDMatrix, RowAccess) {
@@ -139,11 +144,12 @@ TEST(SparsePageDMatrix, ColAccess) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
CreateSimpleTestData(tmp_file);
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache");
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file, tmp_file));
auto ctx = CreateEmptyGenericParam(Context::kCpuId);
// Loop over the batches and assert the data is as expected
size_t iter = 0;
for (auto const &col_batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
for (auto const &col_batch : dmat->GetBatches<xgboost::SortedCSCPage>(&ctx)) {
auto col_page = col_batch.GetView();
ASSERT_EQ(col_page.Size(), dmat->Info().num_col_);
if (iter == 1) {
@@ -161,7 +167,7 @@ TEST(SparsePageDMatrix, ColAccess) {
// Loop over the batches and assert the data is as expected
iter = 0;
for (auto const &col_batch : dmat->GetBatches<xgboost::CSCPage>()) {
for (auto const &col_batch : dmat->GetBatches<xgboost::CSCPage>(&ctx)) {
auto col_page = col_batch.GetView();
EXPECT_EQ(col_page.Size(), dmat->Info().num_col_);
if (iter == 0) {
@@ -179,9 +185,9 @@ TEST(SparsePageDMatrix, ColAccess) {
TEST(SparsePageDMatrix, ThreadSafetyException) {
size_t constexpr kEntriesPerCol = 3;
size_t constexpr kEntries = 64 * kEntriesPerCol * 2;
Context ctx;
std::unique_ptr<xgboost::DMatrix> dmat =
xgboost::CreateSparsePageDMatrix(kEntries);
std::unique_ptr<xgboost::DMatrix> dmat = xgboost::CreateSparsePageDMatrix(kEntries);
int threads = 1000;
@@ -218,7 +224,8 @@ TEST(SparsePageDMatrix, ColAccessBatches) {
// Create multiple sparse pages
std::unique_ptr<xgboost::DMatrix> dmat{xgboost::CreateSparsePageDMatrix(kEntries)};
ASSERT_EQ(dmat->Ctx()->Threads(), AllThreadsForTest());
for (auto const &page : dmat->GetBatches<xgboost::CSCPage>()) {
auto ctx = CreateEmptyGenericParam(Context::kCpuId);
for (auto const &page : dmat->GetBatches<xgboost::CSCPage>(&ctx)) {
ASSERT_EQ(dmat->Info().num_col_, page.Size());
}
}
@@ -231,7 +238,7 @@ auto TestSparsePageDMatrixDeterminism(int32_t threads) {
std::string filename = tempdir.path + "/simple.libsvm";
CreateBigTestData(filename, 1 << 16);
data::FileIterator iter(filename, 0, 1, "auto");
data::FileIterator iter(filename + "?format=libsvm", 0, 1);
std::unique_ptr<DMatrix> sparse{
new data::SparsePageDMatrix{&iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next,
std::numeric_limits<float>::quiet_NaN(), threads, filename}};

View File

@@ -1,23 +1,28 @@
/**
* Copyright 2019-2023 by XGBoost Contributors
*/
#include <xgboost/data.h> // for DMatrix
#include "../../../src/common/compressed_iterator.h"
#include "../../../src/data/ellpack_page.cuh"
#include "../../../src/data/sparse_page_dmatrix.h"
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../../../src/tree/param.h" // TrainParam
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
namespace xgboost {
TEST(SparsePageDMatrix, EllpackPage) {
Context ctx{MakeCUDACtx(0)};
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
CreateSimpleTestData(tmp_file);
DMatrix* dmat = DMatrix::Load(tmp_file + "#" + tmp_file + ".cache");
DMatrix* dmat = DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache");
// Loop over the batches and assert the data is as expected
size_t n = 0;
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
for (const auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
n += batch.Size();
}
EXPECT_EQ(n, dmat->Info().num_row_);
@@ -37,6 +42,8 @@ TEST(SparsePageDMatrix, EllpackPage) {
}
TEST(SparsePageDMatrix, MultipleEllpackPages) {
Context ctx{MakeCUDACtx(0)};
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
@@ -46,7 +53,7 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
// Loop over the batches and count the records
int64_t batch_count = 0;
int64_t row_count = 0;
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
for (const auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
EXPECT_LT(batch.Size(), dmat->Info().num_row_);
batch_count++;
row_count += batch.Size();
@@ -61,8 +68,11 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
}
TEST(SparsePageDMatrix, RetainEllpackPage) {
Context ctx{MakeCUDACtx(0)};
auto param = BatchParam{32, tree::TrainParam::DftSparseThreshold()};
auto m = CreateSparsePageDMatrix(10000);
auto batches = m->GetBatches<EllpackPage>({0, 32});
auto batches = m->GetBatches<EllpackPage>(&ctx, param);
auto begin = batches.begin();
auto end = batches.end();
@@ -87,7 +97,7 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
}
// make sure it's const and the caller can not modify the content of page.
for (auto& page : m->GetBatches<EllpackPage>({0, 32})) {
for (auto& page : m->GetBatches<EllpackPage>(&ctx, param)) {
static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value);
}
@@ -98,6 +108,7 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
}
TEST(SparsePageDMatrix, EllpackPageContent) {
auto ctx = CreateEmptyGenericParam(0);
constexpr size_t kRows = 6;
constexpr size_t kCols = 2;
constexpr size_t kPageSize = 1;
@@ -110,8 +121,8 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
std::unique_ptr<DMatrix>
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
BatchParam param{0, 2};
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
auto param = BatchParam{2, tree::TrainParam::DftSparseThreshold()};
auto impl = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
EXPECT_EQ(impl->base_rowid, 0);
EXPECT_EQ(impl->n_rows, kRows);
EXPECT_FALSE(impl->is_dense);
@@ -120,7 +131,7 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
std::unique_ptr<EllpackPageImpl> impl_ext;
size_t offset = 0;
for (auto& batch : dmat_ext->GetBatches<EllpackPage>(param)) {
for (auto& batch : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) {
if (!impl_ext) {
impl_ext.reset(new EllpackPageImpl(
batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(),
@@ -170,8 +181,9 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
std::unique_ptr<DMatrix>
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
BatchParam param{0, kMaxBins};
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
Context ctx{MakeCUDACtx(0)};
auto param = BatchParam{kMaxBins, tree::TrainParam::DftSparseThreshold()};
auto impl = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
EXPECT_EQ(impl->base_rowid, 0);
EXPECT_EQ(impl->n_rows, kRows);
@@ -180,7 +192,7 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
thrust::device_vector<bst_float> row_ext_d(kCols);
std::vector<bst_float> row(kCols);
std::vector<bst_float> row_ext(kCols);
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
for (auto& page : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) {
auto impl_ext = page.Impl();
EXPECT_EQ(impl_ext->base_rowid, current_row);
@@ -211,10 +223,11 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) {
std::unique_ptr<DMatrix>
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
BatchParam param{0, kMaxBins};
Context ctx{MakeCUDACtx(0)};
auto param = BatchParam{kMaxBins, tree::TrainParam::DftSparseThreshold()};
size_t current_row = 0;
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
for (auto& page : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) {
auto impl_ext = page.Impl();
EXPECT_EQ(impl_ext->base_rowid, current_row);
current_row += impl_ext->n_rows;

View File

@@ -1,17 +1,24 @@
/*!
* Copyright 2021 XGBoost contributors
/**
* Copyright 2021-2023, XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/data.h>
#include <xgboost/data.h> // for CSCPage, SortedCSCPage, SparsePage
#include "../../../src/data/sparse_page_source.h"
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
#include <memory> // for allocator, unique_ptr, __shared_ptr_ac...
#include <string> // for char_traits, operator+, basic_string
#include "../../../src/data/sparse_page_writer.h" // for CreatePageFormat
#include "../helpers.h" // for RandomDataGenerator
#include "dmlc/filesystem.h" // for TemporaryDirectory
#include "dmlc/io.h" // for SeekStream, Stream
#include "gtest/gtest_pred_impl.h" // for Test, AssertionResult, ASSERT_EQ, TEST
#include "xgboost/context.h" // for Context
namespace xgboost {
namespace data {
template <typename S> void TestSparsePageRawFormat() {
std::unique_ptr<SparsePageFormat<S>> format{CreatePageFormat<S>("raw")};
Context ctx;
auto m = RandomDataGenerator{100, 14, 0.5}.GenerateDMatrix();
ASSERT_TRUE(m->SingleColBlock());
@@ -21,7 +28,7 @@ template <typename S> void TestSparsePageRawFormat() {
{
// block code to flush the stream
std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
for (auto const &page : m->GetBatches<S>()) {
for (auto const &page : m->GetBatches<S>(&ctx)) {
orig.Push(page);
format->Write(page, fo.get());
}