Update dmlc-core and use data iter for GPU sampling tests. (#7398)
* Update dmlc-core. * New parquet parser in dmlc-core. * Use data iter for GPU sampling tests.
This commit is contained in:
parent
c968217ca8
commit
6ede12412c
@ -1 +1 @@
|
||||
Subproject commit f00e3ec7abc9f293a1b7061157b0a4e22a735cf5
|
||||
Subproject commit 97e9afa320731763c12e4e80182725465a572600
|
||||
@ -146,7 +146,7 @@ MakeCache(SparsePageDMatrix *ptr, std::string format, std::string prefix,
|
||||
auto it = cache_info.find(id);
|
||||
if (it == cache_info.cend()) {
|
||||
cache_info[id].reset(new Cache{false, name, format});
|
||||
LOG(INFO) << "Make cache:" << name << std::endl;
|
||||
LOG(INFO) << "Make cache:" << cache_info[id]->ShardName() << std::endl;
|
||||
}
|
||||
return id;
|
||||
}
|
||||
|
||||
@ -362,6 +362,32 @@ GetDMatrixFromData(const std::vector<float> &x, int num_rows, int num_columns){
|
||||
&adapter, std::numeric_limits<float>::quiet_NaN(), 1));
|
||||
}
|
||||
|
||||
/**
 * \brief Create a SparsePageDMatrix from a test data iterator and sanity-check it.
 *
 * Requires n_samples >= n_batches (enforced with CHECK_GE).  After construction
 * the helper expects the "<id>.row.page" cache file derived from \p prefix to
 * exist, then walks every SparsePage batch to confirm that at least
 * \p n_batches batches are produced and that the batch sizes sum to the row
 * count reported by the DMatrix meta info.
 *
 * \param n_samples  Total number of rows for all batches combined.
 * \param n_features Number of features.
 * \param n_batches  Number of batches handed to the test iterator.
 * \param prefix     Cache prefix forwarded to DMatrix::Create.
 *
 * \return The newly created DMatrix.
 */
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(bst_row_t n_samples, bst_feature_t n_features,
                                                 size_t n_batches, std::string prefix) {
  CHECK_GE(n_samples, n_batches);
  ArrayIterForTest iter(0, n_samples, n_features, n_batches);

  // NOTE(review): NaN is presumably the missing-value marker and
  // omp_get_max_threads() the thread count -- confirm against DMatrix::Create.
  std::unique_ptr<DMatrix> dmat{
      DMatrix::Create(static_cast<DataIterHandle>(&iter), iter.Proxy(), Reset, Next,
                      std::numeric_limits<float>::quiet_NaN(), omp_get_max_threads(), prefix)};

  auto row_page_path =
      data::MakeId(prefix, dynamic_cast<data::SparsePageDMatrix*>(dmat.get())) + ".row.page";
  EXPECT_TRUE(FileExists(row_page_path)) << row_page_path;

  // Loop over the batches and count the number of pages.  The counters are
  // unsigned so that the comparisons below against `n_batches` and `num_row_`
  // do not mix signed and unsigned operands.
  size_t batch_count = 0;
  size_t row_count = 0;
  for (const auto& batch : dmat->GetBatches<xgboost::SparsePage>()) {
    ++batch_count;
    row_count += batch.Size();
  }

  EXPECT_GE(batch_count, n_batches);
  EXPECT_EQ(row_count, dmat->Info().num_row_);
  return dmat;
}
|
||||
|
||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries,
|
||||
std::string prefix) {
|
||||
size_t n_columns = 3;
|
||||
|
||||
@ -302,11 +302,26 @@ GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) {
|
||||
std::shared_ptr<DMatrix> GetDMatrixFromData(const std::vector<float> &x,
|
||||
int num_rows, int num_columns);
|
||||
|
||||
/**
|
||||
* \brief Create a SparsePageDMatrix using a data iterator.
|
||||
*
|
||||
* \param n_samples Total number of rows for all batches combined.
|
||||
* \param n_features Number of features
|
||||
* \param n_batches Number of batches
|
||||
* \param prefix Cache prefix, can be used for specifying file path.
|
||||
*
|
||||
* \return A sparse page DMatrix containing at least n_batches batches.
|
||||
*/
|
||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(bst_row_t n_samples, bst_feature_t n_features,
|
||||
size_t n_batches, std::string prefix = "cache");
|
||||
|
||||
/**
|
||||
* Deprecated; do not use in new tests. Prefer the data-iterator overload above.
|
||||
*/
|
||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, std::string prefix = "cache");
|
||||
|
||||
/**
|
||||
* \fn std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
|
||||
* size_t page_size);
|
||||
* Deprecated, stop using it
|
||||
*
|
||||
* \brief Creates dmatrix with some records, each record containing random number of
|
||||
* features in [1, n_cols]
|
||||
|
||||
@ -22,8 +22,8 @@ void VerifySampling(size_t page_size,
|
||||
size_t sample_rows = kRows * subsample;
|
||||
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::unique_ptr<DMatrix> dmat(
|
||||
CreateSparsePageDMatrixWithRC(kRows, kCols, page_size, true, tmpdir));
|
||||
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrix(
|
||||
kRows, kCols, kRows / (page_size == 0 ? kRows : page_size), tmpdir.path + "/cache"));
|
||||
auto gpair = GenerateRandomGradients(kRows);
|
||||
GradientPair sum_gpair{};
|
||||
for (const auto& gp : gpair.ConstHostVector()) {
|
||||
@ -81,8 +81,8 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
|
||||
|
||||
// Create a DMatrix with multiple batches.
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::unique_ptr<DMatrix>
|
||||
dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||
std::unique_ptr<DMatrix> dmat(
|
||||
CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache"));
|
||||
auto gpair = GenerateRandomGradients(kRows);
|
||||
gpair.SetDevice(0);
|
||||
|
||||
|
||||
@ -468,13 +468,14 @@ TEST(GpuHist, ExternalMemory) {
|
||||
constexpr size_t kCols = 2;
|
||||
constexpr size_t kPageSize = 1024;
|
||||
|
||||
// Create an in-memory DMatrix.
|
||||
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
|
||||
// Create a DMatrix with multiple batches.
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::unique_ptr<DMatrix>
|
||||
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||
std::unique_ptr<DMatrix> dmat_ext(
|
||||
CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache"));
|
||||
|
||||
// Create a single batch DMatrix.
|
||||
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrix(kRows, kCols, 1, tmpdir.path + "/cache"));
|
||||
|
||||
auto gpair = GenerateRandomGradients(kRows);
|
||||
|
||||
@ -503,13 +504,14 @@ TEST(GpuHist, ExternalMemoryWithSampling) {
|
||||
const std::string kSamplingMethod = "gradient_based";
|
||||
common::GlobalRandom().seed(0);
|
||||
|
||||
// Create an in-memory DMatrix.
|
||||
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
|
||||
// Create a single batch DMatrix.
|
||||
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrix(kRows, kCols, 1, tmpdir.path + "/cache"));
|
||||
|
||||
// Create a DMatrix with multiple batches.
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::unique_ptr<DMatrix>
|
||||
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||
std::unique_ptr<DMatrix> dmat_ext(
|
||||
CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache"));
|
||||
|
||||
auto gpair = GenerateRandomGradients(kRows);
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user