Implement iterative DMatrix for CPU. (#8116)

This commit is contained in:
Jiaming Yuan
2022-07-26 22:34:21 +08:00
committed by GitHub
parent 546de5efd2
commit 2c70751d1e
20 changed files with 636 additions and 190 deletions

View File

@@ -0,0 +1,36 @@
/*!
* Copyright 2022 XGBoost contributors
*/
#include "test_iterative_dmatrix.h"
#include <gtest/gtest.h>
#include "../../../src/data/gradient_index.h"
#include "../../../src/data/iterative_dmatrix.h"
#include "../helpers.h"
namespace xgboost {
namespace data {
// Run the shared reference-DMatrix check on the CPU path: pages are requested as
// GHistIndexMatrix and the cuts are read from each page's `cut` member.
TEST(IterativeDMatrix, Ref) {
  // Returns a copy of the page's cuts; the helper only reads from the result.
  auto extract_cuts = [](GHistIndexMatrix const& page) { return page.cut; };
  TestRefDMatrix<GHistIndexMatrix, NumpyArrayIterForTest>(extract_cuts);
}
// IsDense() is expected to be true only when the iterator is created with a sparsity
// of exactly 0, and false for any other sparsity.
TEST(IterativeDMatrix, IsDense) {
  int n_bins = 16;
  auto check_density = [n_bins](float sparsity) {
    auto const missing = std::numeric_limits<float>::quiet_NaN();
    NumpyArrayIterForTest iter(sparsity);
    // No reference DMatrix is supplied (nullptr).
    IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next, missing, 0, n_bins);

    bool const expect_dense = (sparsity == 0.0);
    if (!expect_dense) {
      ASSERT_FALSE(m.IsDense());
      return;
    }
    ASSERT_TRUE(m.IsDense());
  };

  check_density(0.0);
  check_density(0.1);
  check_density(1.0);
}
} // namespace data
} // namespace xgboost

View File

@@ -3,19 +3,19 @@
*/
#include <gtest/gtest.h>
#include "../helpers.h"
#include "../../../src/data/iterative_dmatrix.h"
#include "../../../src/data/ellpack_page.cuh"
#include "../../../src/data/device_adapter.cuh"
#include "../../../src/data/ellpack_page.cuh"
#include "../../../src/data/iterative_dmatrix.h"
#include "../helpers.h"
#include "test_iterative_dmatrix.h"
namespace xgboost {
namespace data {
void TestEquivalent(float sparsity) {
CudaArrayIterForTest iter{sparsity};
IterativeDMatrix m(
&iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
0, 256);
IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next,
std::numeric_limits<float>::quiet_NaN(), 0, 256);
size_t offset = 0;
auto first = (*m.GetEllpackBatches({}).begin()).Impl();
std::unique_ptr<EllpackPageImpl> page_concatenated {
@@ -88,9 +88,8 @@ TEST(IterativeDeviceDMatrix, Basic) {
TEST(IterativeDeviceDMatrix, RowMajor) {
CudaArrayIterForTest iter(0.0f);
IterativeDMatrix m(
&iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
0, 256);
IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next,
std::numeric_limits<float>::quiet_NaN(), 0, 256);
size_t n_batches = 0;
std::string interface_str = iter.AsArray();
for (auto& ellpack : m.GetBatches<EllpackPage>({})) {
@@ -139,9 +138,8 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
reinterpret_cast<float *>(get<Integer>(j_interface["data"][0])));
thrust::copy(h_data.cbegin(), h_data.cend(), ptr);
IterativeDMatrix m(
&iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
0, 256);
IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next,
std::numeric_limits<float>::quiet_NaN(), 0, 256);
auto &ellpack = *m.GetBatches<EllpackPage>({0, 256}).begin();
auto impl = ellpack.Impl();
common::CompressedIterator<uint32_t> iterator(
@@ -157,11 +155,10 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
TEST(IterativeDeviceDMatrix, IsDense) {
int num_bins = 16;
auto test = [num_bins] (float sparsity) {
auto test = [num_bins](float sparsity) {
CudaArrayIterForTest iter(sparsity);
IterativeDMatrix m(
&iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
0, 256);
IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next,
std::numeric_limits<float>::quiet_NaN(), 0, num_bins);
if (sparsity == 0.0) {
ASSERT_TRUE(m.IsDense());
} else {
@@ -170,6 +167,12 @@ TEST(IterativeDeviceDMatrix, IsDense) {
};
test(0.0);
test(0.1);
test(1.0);
}
// Run the shared reference-DMatrix check with EllpackPage batches fed by the CUDA array
// iterator; the cuts are obtained through the page implementation's Cuts() accessor.
TEST(IterativeDeviceDMatrix, Ref) {
  auto extract_cuts = [](EllpackPage const& page) { return page.Impl()->Cuts(); };
  TestRefDMatrix<EllpackPage, CudaArrayIterForTest>(extract_cuts);
}
} // namespace data
} // namespace xgboost

View File

@@ -0,0 +1,59 @@
/*!
* Copyright 2022 XGBoost contributors
*/
#pragma once
#include <memory> // std::make_shared
#include "../../../src/data/iterative_dmatrix.h"
#include "../helpers.h"
namespace xgboost {
namespace data {
/**
 * \brief Check how the reference argument of IterativeDMatrix affects the histogram cuts.
 *
 * Three scenarios are exercised in order:
 *  - a matrix built with another IterativeDMatrix as reference must report the same cut
 *    values, cut pointers and min values as that reference;
 *  - the same iterator consumed without a reference must report different cut values
 *    and cut pointers;
 *  - a matrix built with a DMatrix produced by RandomDataGenerator as reference must
 *    again report cuts equal to those of the reference.
 *
 * \tparam Page Batch type requested through GetBatches<Page>().
 * \tparam Iter Test data iterator type; must provide Proxy() and a static Cols().
 * \tparam Cuts Type of the callable passed as get_cuts.
 *
 * \param get_cuts Callable taking a `Page const&` and returning the cuts of that page.
 *                 The returned object must provide Values(), Ptrs() and MinValues().
 */
template <typename Page, typename Iter, typename Cuts>
void TestRefDMatrix(Cuts&& get_cuts) {
  int n_bins = 256;
  constexpr float kMissing = std::numeric_limits<float>::quiet_NaN();

  // Reference matrix, built without any reference of its own.
  Iter iter(0.3, 2048);
  auto m = std::make_shared<IterativeDMatrix>(&iter, iter.Proxy(), nullptr, Reset, Next,
                                              kMissing, 0, n_bins);

  // A second iterator with different construction arguments; `m` is supplied as the
  // reference, so the resulting cuts are expected to match those of `m`.
  Iter iter_1(0.8, 32, Iter::Cols(), 13);
  auto m_1 = std::make_shared<IterativeDMatrix>(&iter_1, iter_1.Proxy(), m, Reset, Next,
                                                kMissing, 0, n_bins);
  for (auto const& ref_page : m->template GetBatches<Page>({})) {
    for (auto const& page : m_1->template GetBatches<Page>({})) {
      auto const& expected = get_cuts(ref_page);
      auto const& actual = get_cuts(page);
      ASSERT_EQ(expected.Values(), actual.Values());
      ASSERT_EQ(expected.Ptrs(), actual.Ptrs());
      ASSERT_EQ(expected.MinValues(), actual.MinValues());
    }
  }

  // Rebuild from the same iterator without a reference: the cuts must now differ.
  m_1 = std::make_shared<IterativeDMatrix>(&iter_1, iter_1.Proxy(), nullptr, Reset, Next,
                                           kMissing, 0, n_bins);
  for (auto const& ref_page : m->template GetBatches<Page>({})) {
    for (auto const& page : m_1->template GetBatches<Page>({})) {
      auto const& expected = get_cuts(ref_page);
      auto const& actual = get_cuts(page);
      ASSERT_NE(expected.Values(), actual.Values());
      ASSERT_NE(expected.Ptrs(), actual.Ptrs());
    }
  }

  // Use DMatrix as ref
  auto dm = RandomDataGenerator(2048, Iter::Cols(), 0.5).GenerateDMatrix(true);
  auto dqm = std::make_shared<IterativeDMatrix>(&iter_1, iter_1.Proxy(), dm, Reset, Next,
                                                kMissing, 0, n_bins);
  for (auto const& ref_page : dm->template GetBatches<Page>({})) {
    for (auto const& page : dqm->template GetBatches<Page>({})) {
      auto const& expected = get_cuts(ref_page);
      auto const& actual = get_cuts(page);
      ASSERT_EQ(expected.Values(), actual.Values());
      ASSERT_EQ(expected.Ptrs(), actual.Ptrs());
      ASSERT_EQ(expected.MinValues(), actual.MinValues());
    }
  }
}
} // namespace data
} // namespace xgboost

View File

@@ -384,7 +384,7 @@ RandomDataGenerator::GenerateDMatrix(bool with_label, bool float_label,
std::shared_ptr<DMatrix> RandomDataGenerator::GenerateQuantileDMatrix() {
NumpyArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1};
auto m = std::make_shared<data::IterativeDMatrix>(
&iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(), 0, bins_);
&iter, iter.Proxy(), nullptr, Reset, Next, std::numeric_limits<float>::quiet_NaN(), 0, bins_);
return m;
}
@@ -569,7 +569,7 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(
auto& h_gpair = gpair.HostVector();
h_gpair.resize(kRows);
for (size_t i = 0; i < kRows; ++i) {
h_gpair[i] = {static_cast<float>(i), 1};
h_gpair[i] = GradientPair{static_cast<float>(i), 1};
}
PredictionCacheEntry predts;

View File

@@ -27,7 +27,7 @@ int CudaArrayIterForTest::Next() {
std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDeviceDMatrix() {
CudaArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1};
auto m = std::make_shared<data::IterativeDMatrix>(
&iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(), 0, bins_);
&iter, iter.Proxy(), nullptr, Reset, Next, std::numeric_limits<float>::quiet_NaN(), 0, bins_);
return m;
}
} // namespace xgboost

View File

@@ -245,6 +245,17 @@ void TestUpdatePredictionCache(bool use_subsampling) {
}
}
// Builds two matrices with identical generator settings (128 x 16, sparsity 0.0):
// a quantile DMatrix with 64 bins and a DMatrix created from an array-interface adapter.
// Both are handed to TestTrainingPrediction together with the "hist" tree method.
TEST(CPUPredictor, GHistIndex) {
  constexpr size_t kRows{128};
  constexpr size_t kCols{16};
  constexpr size_t kBins{64};

  auto p_hist = RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).GenerateQuantileDMatrix();

  // `storage` is the buffer handed to GenerateArrayInterface; it is declared before the
  // adapter and the matrix and therefore outlives both.
  HostDeviceVector<float> storage(kRows * kCols);
  auto array_interface = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&storage);
  data::ArrayAdapter adapter(array_interface.c_str());
  std::shared_ptr<DMatrix> p_full{
      DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};

  TestTrainingPrediction(kRows, kBins, "hist", p_full, p_hist);
}
// Invokes the TestCategoricalPrediction helper with the predictor name "cpu_predictor".
// NOTE(review): the helper is defined outside this hunk; what it verifies is not visible here.
TEST(CPUPredictor, CategoricalPrediction) {
TestCategoricalPrediction("cpu_predictor");
}