Support multiple batches in gpu_hist (#5014)

* Initial external memory training support for GPU Hist tree method.
This commit is contained in:
Rong Ou
2019-11-15 22:50:20 -08:00
committed by Jiaming Yuan
parent 97abcc7ee2
commit 0afcc55d98
15 changed files with 559 additions and 134 deletions

View File

@@ -2,10 +2,11 @@
#include <dmlc/filesystem.h>
#include "../helpers.h"
#include "../../../src/common/compressed_iterator.h"
namespace xgboost {
TEST(GPUSparsePageDMatrix, EllpackPage) {
TEST(SparsePageDMatrix, EllpackPage) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
CreateSimpleTestData(tmp_file);
@@ -23,4 +24,162 @@ TEST(GPUSparsePageDMatrix, EllpackPage) {
delete dmat;
}
TEST(SparsePageDMatrix, MultipleEllpackPages) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64, filename);
// Loop over the batches and count the records
int64_t batch_count = 0;
int64_t row_count = 0;
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256, 0, 7UL})) {
EXPECT_LT(batch.Size(), dmat->Info().num_row_);
batch_count++;
row_count += batch.Size();
}
EXPECT_GE(batch_count, 2);
EXPECT_EQ(row_count, dmat->Info().num_row_);
EXPECT_TRUE(FileExists(filename + ".cache.ellpack.page"));
}
TEST(SparsePageDMatrix, EllpackPageContent) {
constexpr size_t kRows = 6;
constexpr size_t kCols = 2;
constexpr size_t kPageSize = 1;
// Create an in-memory DMatrix.
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
// Create a DMatrix with multiple batches.
dmlc::TemporaryDirectory tmpdir;
std::unique_ptr<DMatrix>
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
BatchParam param{0, 2, 0, 0};
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
EXPECT_EQ(impl->matrix.base_rowid, 0);
EXPECT_EQ(impl->matrix.n_rows, kRows);
EXPECT_FALSE(impl->matrix.info.is_dense);
EXPECT_EQ(impl->matrix.info.row_stride, 2);
EXPECT_EQ(impl->matrix.info.n_bins, 4);
auto impl_ext = (*dmat_ext->GetBatches<EllpackPage>(param).begin()).Impl();
EXPECT_EQ(impl_ext->matrix.base_rowid, 0);
EXPECT_EQ(impl_ext->matrix.n_rows, kRows);
EXPECT_FALSE(impl_ext->matrix.info.is_dense);
EXPECT_EQ(impl_ext->matrix.info.row_stride, 2);
EXPECT_EQ(impl_ext->matrix.info.n_bins, 4);
std::vector<common::CompressedByteT> buffer(impl->gidx_buffer.size());
std::vector<common::CompressedByteT> buffer_ext(impl_ext->gidx_buffer.size());
dh::CopyDeviceSpanToVector(&buffer, impl->gidx_buffer);
dh::CopyDeviceSpanToVector(&buffer_ext, impl_ext->gidx_buffer);
EXPECT_EQ(buffer, buffer_ext);
}
struct ReadRowFunction {
EllpackMatrix matrix;
int row;
bst_float* row_data_d;
ReadRowFunction(EllpackMatrix matrix, int row, bst_float* row_data_d)
: matrix(std::move(matrix)), row(row), row_data_d(row_data_d) {}
__device__ void operator()(size_t col) {
auto value = matrix.GetElement(row, col);
if (isnan(value)) {
value = -1;
}
row_data_d[col] = value;
}
};
TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
constexpr size_t kRows = 6;
constexpr size_t kCols = 2;
constexpr int kMaxBins = 256;
constexpr size_t kPageSize = 1;
// Create an in-memory DMatrix.
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
// Create a DMatrix with multiple batches.
dmlc::TemporaryDirectory tmpdir;
std::unique_ptr<DMatrix>
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
BatchParam param{0, kMaxBins, 0, kPageSize};
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
EXPECT_EQ(impl->matrix.base_rowid, 0);
EXPECT_EQ(impl->matrix.n_rows, kRows);
size_t current_row = 0;
thrust::device_vector<bst_float> row_d(kCols);
thrust::device_vector<bst_float> row_ext_d(kCols);
std::vector<bst_float> row(kCols);
std::vector<bst_float> row_ext(kCols);
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
auto impl_ext = page.Impl();
EXPECT_EQ(impl_ext->matrix.base_rowid, current_row);
for (size_t i = 0; i < impl_ext->Size(); i++) {
dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
thrust::copy(row_d.begin(), row_d.end(), row.begin());
dh::LaunchN(0, kCols, ReadRowFunction(impl_ext->matrix, current_row, row_ext_d.data().get()));
thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin());
EXPECT_EQ(row, row_ext);
current_row++;
}
}
}
TEST(SparsePageDMatrix, EllpackPageMultipleLoops) {
constexpr size_t kRows = 1024;
constexpr size_t kCols = 16;
constexpr int kMaxBins = 256;
constexpr size_t kPageSize = 4096;
// Create an in-memory DMatrix.
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
// Create a DMatrix with multiple batches.
dmlc::TemporaryDirectory tmpdir;
std::unique_ptr<DMatrix>
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
BatchParam param{0, kMaxBins, 0, kPageSize};
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
size_t current_row = 0;
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
auto impl_ext = page.Impl();
EXPECT_EQ(impl_ext->matrix.base_rowid, current_row);
current_row += impl_ext->matrix.n_rows;
}
current_row = 0;
thrust::device_vector<bst_float> row_d(kCols);
thrust::device_vector<bst_float> row_ext_d(kCols);
std::vector<bst_float> row(kCols);
std::vector<bst_float> row_ext(kCols);
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
auto impl_ext = page.Impl();
EXPECT_EQ(impl_ext->matrix.base_rowid, current_row);
for (size_t i = 0; i < impl_ext->Size(); i++) {
dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
thrust::copy(row_d.begin(), row_d.end(), row.begin());
dh::LaunchN(0, kCols, ReadRowFunction(impl_ext->matrix, current_row, row_ext_d.data().get()));
thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin());
EXPECT_EQ(row, row_ext) << "for row " << current_row;
current_row++;
}
}
}
} // namespace xgboost

View File

@@ -217,17 +217,17 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
} else {
gen.reset(new std::mt19937(rdev()));
}
std::uniform_int_distribution<size_t> label(0, 1);
std::uniform_int_distribution<size_t> dis(1, n_cols);
for (size_t i = 0; i < n_rows; ++i) {
// Make sure that all cols are slotted in the first few rows; randomly distribute the
// rest
std::stringstream row_data;
fo << i;
size_t j = 0;
if (rem_cols > 0) {
for (; j < std::min(static_cast<size_t>(rem_cols), cols_per_row); ++j) {
row_data << " " << (col_idx+j) << ":" << (col_idx+j+1)*10;
row_data << label(*gen) << " " << (col_idx+j) << ":" << (col_idx+j+1)*10*i;
}
rem_cols -= cols_per_row;
} else {
@@ -235,7 +235,7 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
size_t ncols = dis(*gen);
for (; j < ncols; ++j) {
size_t fid = (col_idx+j) % n_cols;
row_data << " " << fid << ":" << (fid+1)*10;
row_data << label(*gen) << " " << fid << ":" << (fid+1)*10*i;
}
}
col_idx += j;

View File

@@ -56,22 +56,6 @@ TEST(GpuHist, DeviceHistogram) {
};
}
namespace {
class HistogramCutsWrapper : public common::HistogramCuts {
public:
using SuperT = common::HistogramCuts;
void SetValues(std::vector<float> cuts) {
SuperT::cut_values_ = cuts;
}
void SetPtrs(std::vector<uint32_t> ptrs) {
SuperT::cut_ptrs_ = ptrs;
}
void SetMins(std::vector<float> mins) {
SuperT::min_vals_ = mins;
}
};
} // anonymous namespace
std::vector<GradientPairPrecise> GetHostHistGpair() {
// 24 bins, 3 bins for each feature (column).
std::vector<GradientPairPrecise> hist_gpair = {
@@ -98,7 +82,8 @@ void TestBuildHist(bool use_shared_memory_histograms) {
};
param.Init(args);
auto page = BuildEllpackPage(kNRows, kNCols);
GPUHistMakerDevice<GradientSumT> maker(0, page.get(), kNRows, param, kNCols, kNCols);
BatchParam batch_param{};
GPUHistMakerDevice<GradientSumT> maker(0, page.get(), kNRows, param, kNCols, kNCols, batch_param);
maker.InitHistogram();
xgboost::SimpleLCG gen;
@@ -199,7 +184,9 @@ TEST(GpuHist, EvaluateSplits) {
// Initialize GPUHistMakerDevice
auto page = BuildEllpackPage(kNRows, kNCols);
GPUHistMakerDevice<GradientPairPrecise> maker(0, page.get(), kNRows, param, kNCols, kNCols);
BatchParam batch_param{};
GPUHistMakerDevice<GradientPairPrecise>
maker(0, page.get(), kNRows, param, kNCols, kNCols, batch_param);
// Initialize GPUHistMakerDevice::node_sum_gradients
maker.node_sum_gradients = {{6.4f, 12.8f}};
@@ -332,21 +319,25 @@ int32_t TestMinSplitLoss(DMatrix* dmat, float gamma, HostDeviceVector<GradientPa
return n_nodes;
}
TEST(GpuHist, MinSplitLoss) {
constexpr size_t kRows = 32;
constexpr size_t kCols = 16;
constexpr float kSparsity = 0.6;
auto dmat = CreateDMatrix(kRows, kCols, kSparsity, 3);
HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_rows) {
xgboost::SimpleLCG gen;
xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);
std::vector<GradientPair> h_gpair(kRows);
std::vector<GradientPair> h_gpair(n_rows);
for (auto &gpair : h_gpair) {
bst_float grad = dist(&gen);
bst_float hess = dist(&gen);
gpair = GradientPair(grad, hess);
}
HostDeviceVector<GradientPair> gpair(h_gpair);
return gpair;
}
TEST(GpuHist, MinSplitLoss) {
constexpr size_t kRows = 32;
constexpr size_t kCols = 16;
constexpr float kSparsity = 0.6;
auto dmat = CreateDMatrix(kRows, kCols, kSparsity, 3);
auto gpair = GenerateRandomGradients(kRows);
{
int32_t n_nodes = TestMinSplitLoss((*dmat).get(), 0.01, &gpair);
@@ -363,5 +354,75 @@ TEST(GpuHist, MinSplitLoss) {
delete dmat;
}
void UpdateTree(HostDeviceVector<GradientPair>* gpair,
DMatrix* dmat,
size_t gpu_page_size,
RegTree* tree,
HostDeviceVector<bst_float>* preds) {
constexpr size_t kMaxBin = 2;
if (gpu_page_size > 0) {
// Loop over the batches and count the records
int64_t batch_count = 0;
int64_t row_count = 0;
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, kMaxBin, 0, gpu_page_size})) {
EXPECT_LT(batch.Size(), dmat->Info().num_row_);
batch_count++;
row_count += batch.Size();
}
EXPECT_GE(batch_count, 2);
EXPECT_EQ(row_count, dmat->Info().num_row_);
}
Args args{
{"max_depth", "2"},
{"max_bin", std::to_string(kMaxBin)},
{"min_child_weight", "0.0"},
{"reg_alpha", "0"},
{"reg_lambda", "0"}
};
tree::GPUHistMakerSpecialised<GradientPairPrecise> hist_maker;
GenericParameter generic_param(CreateEmptyGenericParam(0));
generic_param.gpu_page_size = gpu_page_size;
hist_maker.Configure(args, &generic_param);
hist_maker.Update(gpair, dmat, {tree});
hist_maker.UpdatePredictionCache(dmat, preds);
}
TEST(GpuHist, ExternalMemory) {
constexpr size_t kRows = 6;
constexpr size_t kCols = 2;
constexpr size_t kPageSize = 1;
// Create an in-memory DMatrix.
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
// Create a DMatrix with multiple batches.
dmlc::TemporaryDirectory tmpdir;
std::unique_ptr<DMatrix>
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
auto gpair = GenerateRandomGradients(kRows);
// Build a tree using the in-memory DMatrix.
RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
UpdateTree(&gpair, dmat.get(), 0, &tree, &preds);
// Build another tree using multiple ELLPACK pages.
RegTree tree_ext;
HostDeviceVector<bst_float> preds_ext(kRows, 0.0, 0);
UpdateTree(&gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext);
// Make sure the predictions are the same.
auto preds_h = preds.ConstHostVector();
auto preds_ext_h = preds_ext.ConstHostVector();
for (int i = 0; i < kRows; i++) {
ASSERT_FLOAT_EQ(preds_h[i], preds_ext_h[i]);
}
}
} // namespace tree
} // namespace xgboost