Fix external memory for get column batches. (#4622)

* Fix external memory for get column batches.

This fixes two bugs:

* Use PushCSC for get column batches.
* Don't remove the created temporary directory before finishing test.

* Check all pages.
This commit is contained in:
Jiaming Yuan
2019-06-30 09:56:49 +08:00
committed by GitHub
parent a30176907f
commit 45876bf41b
14 changed files with 90 additions and 46 deletions

View File

@@ -1,11 +1,13 @@
/*!
* Copyright 2016-2018 XGBoost contributors
*/
#include "./helpers.h"
#include "xgboost/c_api.h"
#include <dmlc/filesystem.h>
#include <xgboost/logging.h>
#include <random>
#include <cinttypes>
#include <dmlc/filesystem.h>
#include "./helpers.h"
#include "xgboost/c_api.h"
#include "../../src/data/simple_csr_source.h"
bool FileExists(const std::string& filename) {
@@ -144,13 +146,12 @@ std::shared_ptr<xgboost::DMatrix>* CreateDMatrix(int rows, int columns,
return static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
}
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, size_t page_size) {
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
size_t n_entries, size_t page_size, std::string tmp_file) {
// Create sufficiently large data to make two row pages
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/big.libsvm";
CreateBigTestData(tmp_file, n_entries);
std::unique_ptr<DMatrix> dmat = std::unique_ptr<DMatrix>(DMatrix::Load(
tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size));
std::unique_ptr<DMatrix> dmat { DMatrix::Load(
tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size)};
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
// Loop over the batches and count the records