Fix external memory for get column batches. (#4622)

* Fix external memory for get column batches.

This fixes two bugs:

* Use PushCSC for get column batches.
* Don't remove the created temporary directory before the test finishes.

* Check all pages.
This commit is contained in:
Jiaming Yuan
2019-06-30 09:56:49 +08:00
committed by GitHub
parent a30176907f
commit 45876bf41b
14 changed files with 90 additions and 46 deletions

View File

@@ -1,6 +1,9 @@
#include <dmlc/filesystem.h>
#include <gtest/gtest.h>
#include "../../../src/common/column_matrix.h"
#include "../helpers.h"
#include "gtest/gtest.h"
namespace xgboost {
namespace common {
@@ -51,10 +54,11 @@ TEST(DenseColumnWithMissing, Test) {
delete dmat;
}
void
TestGHistIndexMatrixCreation(size_t nthreads) {
void TestGHistIndexMatrixCreation(size_t nthreads) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
/* This should create multiple sparse pages */
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(1024, 1024);
std::unique_ptr<DMatrix> dmat{ CreateSparsePageDMatrix(1024, 1024, filename) };
omp_set_num_threads(nthreads);
GHistIndexMatrix gmat;
gmat.Init(dmat.get(), 256);

View File

@@ -1,7 +1,9 @@
#include <dmlc/filesystem.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <cmath>
#include "gtest/gtest.h"
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
@@ -22,10 +24,12 @@ void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) {
std::shared_ptr<xgboost::DMatrix> *dmat = nullptr;
size_t num_cols = 1;
dmlc::TemporaryDirectory tmpdir;
std::string file = tmpdir.path + "/big.libsvm";
if (use_external_memory) {
auto sp_dmat = CreateSparsePageDMatrix(nrows * 3, 128UL); // 3 entries/row
dmat = new std::shared_ptr<xgboost::DMatrix>(std::move(sp_dmat));
num_cols = 5;
auto sp_dmat = CreateSparsePageDMatrix(nrows * 3, 128UL, file); // 3 entries/row
dmat = new std::shared_ptr<xgboost::DMatrix>(std::move(sp_dmat));
num_cols = 5;
} else {
std::vector<float> test_data(nrows);
auto count_iter = thrust::make_counting_iterator(0);

View File

@@ -1,4 +1,5 @@
#include <gtest/gtest.h>
#include <dmlc/filesystem.h>
#include <vector>
#include "xgboost/data.h"
@@ -55,8 +56,11 @@ TEST(SparsePage, PushCSC) {
}
TEST(SparsePage, PushCSCAfterTranspose) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
const int n_entries = 9;
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(n_entries, 64UL);
std::unique_ptr<DMatrix> dmat =
CreateSparsePageDMatrix(n_entries, 64UL, filename);
const int ncols = dmat->Info().num_col_;
SparsePage page; // Consolidated sparse page
for (const auto &batch : dmat->GetRowBatches()) {
@@ -70,7 +74,7 @@ TEST(SparsePage, PushCSCAfterTranspose) {
// The feature value for a feature in each row should be identical, as that is
// how the dmatrix has been created
for (int i = 0; i < page.Size(); ++i) {
for (size_t i = 0; i < page.Size(); ++i) {
auto inst = page[i];
for (int j = 1; j < inst.size(); ++j) {
ASSERT_EQ(inst[0].fvalue, inst[j].fvalue);

View File

@@ -1,4 +1,5 @@
// Copyright by Contributors
#include <dmlc/filesystem.h>
#include <xgboost/data.h>
#include <dmlc/filesystem.h>
#include <cinttypes>
@@ -26,7 +27,10 @@ TEST(SparsePageDMatrix, MetaInfo) {
}
TEST(SparsePageDMatrix, RowAccess) {
std::unique_ptr<xgboost::DMatrix> dmat = xgboost::CreateSparsePageDMatrix(12, 64);
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
std::unique_ptr<xgboost::DMatrix> dmat =
xgboost::CreateSparsePageDMatrix(12, 64, filename);
// Test the data read into the first row
auto &batch = *dmat->GetRowBatches().begin();
@@ -67,3 +71,19 @@ TEST(SparsePageDMatrix, ColAccess) {
delete dmat;
}
// Multi-batch column access: every column page produced from a DMatrix backed
// by several sparse pages must expose one entry per feature column.
TEST(SparsePageDMatrix, ColAccessBatches) {
  // Scope guard that forces an OpenMP thread count and restores the previous
  // one on every exit path.  ASSERT_* returns from the test body on failure,
  // so a plain trailing omp_set_num_threads() call would be skipped and the
  // forced thread count would leak into the tests that run afterwards.
  struct OmpThreadsGuard {
    int saved;
    explicit OmpThreadsGuard(int n_threads) : saved{omp_get_max_threads()} {
      omp_set_num_threads(n_threads);
    }
    ~OmpThreadsGuard() { omp_set_num_threads(saved); }
    OmpThreadsGuard(const OmpThreadsGuard&) = delete;
    OmpThreadsGuard& operator=(const OmpThreadsGuard&) = delete;
  };

  // Declared first so it is destroyed last: the temporary directory has to
  // stay alive for as long as the DMatrix created from files inside it.
  dmlc::TemporaryDirectory tmpdir;
  std::string filename = tmpdir.path + "/big.libsvm";
  // Create multiple sparse pages
  std::unique_ptr<xgboost::DMatrix> dmat {
    xgboost::CreateSparsePageDMatrix(1024, 1024, filename)
  };

  // Run the column-batch iteration with 16 OpenMP threads; the guard puts the
  // original setting back when the test body exits, pass or fail.
  OmpThreadsGuard omp_guard{16};
  for (auto const& page : dmat->GetColumnBatches()) {
    ASSERT_EQ(dmat->Info().num_col_, page.Size());
  }
}

View File

@@ -1,11 +1,13 @@
/*!
* Copyright 2016-2018 XGBoost contributors
*/
#include "./helpers.h"
#include "xgboost/c_api.h"
#include <dmlc/filesystem.h>
#include <xgboost/logging.h>
#include <random>
#include <cinttypes>
#include <dmlc/filesystem.h>
#include "./helpers.h"
#include "xgboost/c_api.h"
#include "../../src/data/simple_csr_source.h"
bool FileExists(const std::string& filename) {
@@ -144,13 +146,12 @@ std::shared_ptr<xgboost::DMatrix>* CreateDMatrix(int rows, int columns,
return static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
}
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, size_t page_size) {
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
size_t n_entries, size_t page_size, std::string tmp_file) {
// Create sufficiently large data to make two row pages
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/big.libsvm";
CreateBigTestData(tmp_file, n_entries);
std::unique_ptr<DMatrix> dmat = std::unique_ptr<DMatrix>(DMatrix::Load(
tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size));
std::unique_ptr<DMatrix> dmat { DMatrix::Load(
tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size)};
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
// Loop over the batches and count the records

View File

@@ -163,7 +163,8 @@ class SimpleRealUniformDistribution {
std::shared_ptr<xgboost::DMatrix> *CreateDMatrix(int rows, int columns,
float sparsity, int seed = 0);
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, size_t page_size);
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
size_t n_entries, size_t page_size, std::string tmp_file);
/**
* \fn std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,

View File

@@ -56,7 +56,9 @@ TEST(cpu_predictor, Test) {
}
TEST(cpu_predictor, ExternalMemoryTest) {
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64);
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64, filename);
auto lparam = CreateEmptyGenericParam(0, 0);
std::unique_ptr<Predictor> cpu_predictor =
std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));

View File

@@ -97,7 +97,9 @@ TEST(gpu_predictor, ExternalMemoryTest) {
gbm::GBTreeModel model = CreateTestModel();
int n_col = 3;
model.param.num_feature = n_col;
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(32, 64);
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(32, 64, filename);
// Test predict batch
HostDeviceVector<float> out_predictions;
@@ -268,9 +270,13 @@ TEST(gpu_predictor, MGPU_ExternalMemoryTest) {
const int n_classes = 3;
model.param.num_output_group = n_classes;
std::vector<std::unique_ptr<DMatrix>> dmats;
dmats.push_back(CreateSparsePageDMatrix(9, 64UL));
dmats.push_back(CreateSparsePageDMatrix(128, 128UL));
dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL));
dmlc::TemporaryDirectory tmpdir;
std::string file0 = tmpdir.path + "/big_0.libsvm";
std::string file1 = tmpdir.path + "/big_1.libsvm";
std::string file2 = tmpdir.path + "/big_2.libsvm";
dmats.push_back(CreateSparsePageDMatrix(9, 64UL, file0));
dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
for (const auto& dmat: dmats) {
// Test predict batch