Prepare external memory support for hist. (#7638)
This PR prepares the GHistIndexMatrix to host the column matrix which is used by the hist tree method by accepting sparse_threshold parameter. Some cleanups are made to ensure the correct batch param is being passed into DMatrix along with some additional tests for correctness of SimpleDMatrix.
This commit is contained in:
@@ -12,12 +12,14 @@ namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
TEST(DenseColumn, Test) {
|
||||
uint64_t max_num_bins[] = {static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
|
||||
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
|
||||
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2};
|
||||
for (size_t max_num_bin : max_num_bins) {
|
||||
int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
|
||||
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
|
||||
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
|
||||
for (int32_t max_num_bin : max_num_bins) {
|
||||
auto dmat = RandomDataGenerator(100, 10, 0.0).GenerateDMatrix();
|
||||
GHistIndexMatrix gmat(dmat.get(), max_num_bin, false, common::OmpGetNumThreads(0));
|
||||
auto sparse_thresh = 0.2;
|
||||
GHistIndexMatrix gmat{dmat.get(), max_num_bin, sparse_thresh, false,
|
||||
common::OmpGetNumThreads(0)};
|
||||
ColumnMatrix column_matrix;
|
||||
column_matrix.Init(gmat, 0.2, common::OmpGetNumThreads(0));
|
||||
|
||||
@@ -59,12 +61,12 @@ inline void CheckSparseColumn(const Column<BinIdxType>& col_input, const GHistIn
|
||||
}
|
||||
|
||||
TEST(SparseColumn, Test) {
|
||||
uint64_t max_num_bins[] = {static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
|
||||
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
|
||||
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2};
|
||||
for (size_t max_num_bin : max_num_bins) {
|
||||
int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
|
||||
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
|
||||
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
|
||||
for (int32_t max_num_bin : max_num_bins) {
|
||||
auto dmat = RandomDataGenerator(100, 1, 0.85).GenerateDMatrix();
|
||||
GHistIndexMatrix gmat(dmat.get(), max_num_bin, false, common::OmpGetNumThreads(0));
|
||||
GHistIndexMatrix gmat{dmat.get(), max_num_bin, 0.5f, false, common::OmpGetNumThreads(0)};
|
||||
ColumnMatrix column_matrix;
|
||||
column_matrix.Init(gmat, 0.5, common::OmpGetNumThreads(0));
|
||||
switch (column_matrix.GetTypeSize()) {
|
||||
@@ -99,12 +101,12 @@ inline void CheckColumWithMissingValue(const Column<BinIdxType>& col_input,
|
||||
}
|
||||
|
||||
TEST(DenseColumnWithMissing, Test) {
|
||||
uint64_t max_num_bins[] = { static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
|
||||
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
|
||||
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2 };
|
||||
for (size_t max_num_bin : max_num_bins) {
|
||||
int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
|
||||
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
|
||||
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
|
||||
for (int32_t max_num_bin : max_num_bins) {
|
||||
auto dmat = RandomDataGenerator(100, 1, 0.5).GenerateDMatrix();
|
||||
GHistIndexMatrix gmat(dmat.get(), max_num_bin, false, common::OmpGetNumThreads(0));
|
||||
GHistIndexMatrix gmat{dmat.get(), max_num_bin, 0.2, false, common::OmpGetNumThreads(0)};
|
||||
ColumnMatrix column_matrix;
|
||||
column_matrix.Init(gmat, 0.2, common::OmpGetNumThreads(0));
|
||||
switch (column_matrix.GetTypeSize()) {
|
||||
@@ -131,9 +133,8 @@ void TestGHistIndexMatrixCreation(size_t nthreads) {
|
||||
size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
|
||||
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
|
||||
/* This should create multiple sparse pages */
|
||||
std::unique_ptr<DMatrix> dmat{ CreateSparsePageDMatrix(kEntries) };
|
||||
omp_set_num_threads(nthreads);
|
||||
GHistIndexMatrix gmat(dmat.get(), 256, false, common::OmpGetNumThreads(0));
|
||||
std::unique_ptr<DMatrix> dmat{CreateSparsePageDMatrix(kEntries)};
|
||||
GHistIndexMatrix gmat(dmat.get(), 256, 0.5f, false, common::OmpGetNumThreads(nthreads));
|
||||
}
|
||||
|
||||
TEST(HistIndexCreationWithExternalMemory, Test) {
|
||||
|
||||
@@ -299,7 +299,7 @@ TEST(HistUtil, IndexBinBound) {
|
||||
for (auto max_bin : bin_sizes) {
|
||||
auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
|
||||
|
||||
GHistIndexMatrix hmat(p_fmat.get(), max_bin, false, common::OmpGetNumThreads(0));
|
||||
GHistIndexMatrix hmat(p_fmat.get(), max_bin, 0.5, false, common::OmpGetNumThreads(0));
|
||||
EXPECT_EQ(hmat.index.Size(), kRows*kCols);
|
||||
EXPECT_EQ(expected_bin_type_sizes[bin_id++], hmat.index.GetBinTypeSize());
|
||||
}
|
||||
@@ -322,7 +322,7 @@ TEST(HistUtil, IndexBinData) {
|
||||
|
||||
for (auto max_bin : kBinSizes) {
|
||||
auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
|
||||
GHistIndexMatrix hmat(p_fmat.get(), max_bin, false, common::OmpGetNumThreads(0));
|
||||
GHistIndexMatrix hmat(p_fmat.get(), max_bin, 0.5, false, common::OmpGetNumThreads(0));
|
||||
uint32_t* offsets = hmat.index.Offset();
|
||||
EXPECT_EQ(hmat.index.Size(), kRows*kCols);
|
||||
switch (max_bin) {
|
||||
|
||||
Reference in New Issue
Block a user