Cleanup data generator. (#8094)

- Avoid duplicated definition of data shape.
- Explicitly define numpy iterator for CPU data.
This commit is contained in:
Jiaming Yuan 2022-07-20 13:48:52 +08:00 committed by GitHub
parent 5156be0f49
commit ef11b024e8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 59 additions and 48 deletions

View File

@ -27,8 +27,8 @@ void TestEquivalent(float sparsity) {
offset += num_elements; offset += num_elements;
} }
auto from_iter = page_concatenated->GetDeviceAccessor(0); auto from_iter = page_concatenated->GetDeviceAccessor(0);
ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::kCols); ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols());
ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::kRows); ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows());
std::string interface_str = iter.AsArray(); std::string interface_str = iter.AsArray();
auto adapter = CupyAdapter(interface_str); auto adapter = CupyAdapter(interface_str);
@ -98,8 +98,8 @@ TEST(IterativeDeviceDMatrix, RowMajor) {
auto impl = ellpack.Impl(); auto impl = ellpack.Impl();
common::CompressedIterator<uint32_t> iterator( common::CompressedIterator<uint32_t> iterator(
impl->gidx_buffer.HostVector().data(), impl->NumSymbols()); impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
auto cols = CudaArrayIterForTest::kCols; auto cols = CudaArrayIterForTest::Cols();
auto rows = CudaArrayIterForTest::kRows; auto rows = CudaArrayIterForTest::Rows();
auto j_interface = auto j_interface =
Json::Load({interface_str.c_str(), interface_str.size()}); Json::Load({interface_str.c_str(), interface_str.size()});

View File

@ -1,25 +1,27 @@
/*! /*!
* Copyright 2016-2022 by XGBoost contributors * Copyright 2016-2022 by XGBoost contributors
*/ */
#include "helpers.h"
#include <dmlc/filesystem.h> #include <dmlc/filesystem.h>
#include <xgboost/logging.h> #include <gtest/gtest.h>
#include <xgboost/objective.h>
#include <xgboost/metric.h>
#include <xgboost/learner.h>
#include <xgboost/gbm.h> #include <xgboost/gbm.h>
#include <xgboost/json.h> #include <xgboost/json.h>
#include <gtest/gtest.h> #include <xgboost/learner.h>
#include <xgboost/logging.h>
#include <xgboost/metric.h>
#include <xgboost/objective.h>
#include <algorithm> #include <algorithm>
#include <random>
#include <cinttypes> #include <cinttypes>
#include <random>
#include "helpers.h"
#include "xgboost/c_api.h"
#include "../../src/data/adapter.h" #include "../../src/data/adapter.h"
#include "../../src/data/iterative_dmatrix.h"
#include "../../src/data/simple_dmatrix.h" #include "../../src/data/simple_dmatrix.h"
#include "../../src/data/sparse_page_dmatrix.h" #include "../../src/data/sparse_page_dmatrix.h"
#include "../../src/gbm/gbtree_model.h" #include "../../src/gbm/gbtree_model.h"
#include "xgboost/c_api.h"
#include "xgboost/predictor.h" #include "xgboost/predictor.h"
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
@ -379,6 +381,30 @@ RandomDataGenerator::GenerateDMatrix(bool with_label, bool float_label,
return out; return out;
} }
/*!
 * \brief Create a data::IterativeDMatrix from a single batch of generated array data.
 *
 * A local NumpyArrayIterForTest is configured with this generator's sparsity_, rows_ and
 * cols_, and is passed to the IterativeDMatrix constructor together with the Reset/Next
 * callbacks and bins_.
 *
 * \return The newly created matrix as a std::shared_ptr<DMatrix>.
 */
std::shared_ptr<DMatrix> RandomDataGenerator::GenerateQuantileDMatrix() {
  // The whole data set is requested as exactly one batch.
  size_t constexpr kNumBatches = 1;
  NumpyArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, kNumBatches};

  // NOTE(review): NaN is presumably the missing-value marker and the literal 0 the thread
  // count -- confirm against the IterativeDMatrix constructor.
  auto const nan = std::numeric_limits<float>::quiet_NaN();

  // NOTE(review): `iter` is destroyed when this function returns; this relies on the matrix
  // not touching the iterator after construction -- confirm.
  return std::make_shared<data::IterativeDMatrix>(&iter, iter.Proxy(), Reset, Next, nan, 0,
                                                  bins_);
}
/*!
 * \brief Construct the iterator and eagerly generate all of its batches.
 *
 * All four arguments are forwarded unchanged to the ArrayIterForTest base constructor.
 *
 * \param sparsity Forwarded to the base class.
 * \param rows     Forwarded to the base class.
 * \param cols     Forwarded to the base class.
 * \param batches  Forwarded to the base class.
 */
NumpyArrayIterForTest::NumpyArrayIterForTest(float sparsity, size_t rows, size_t cols,
size_t batches)
: ArrayIterForTest{sparsity, rows, cols, batches} {
// Select the CPU device id on the generator before any data is produced.
rng_->Device(Context::kCpuId);
// Generate n_batches_ batches into data_; the result is unpacked into the per-batch
// strings (batches_) and the string kept in interface_.
std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
// Start iteration from the first batch.
this->Reset();
}
int NumpyArrayIterForTest::Next() {
if (iter_ == n_batches_) {
return 0;
}
XGProxyDMatrixSetDataDense(proxy_, batches_[iter_].c_str());
iter_++;
return 1;
}
std::shared_ptr<DMatrix> std::shared_ptr<DMatrix>
GetDMatrixFromData(const std::vector<float> &x, int num_rows, int num_columns){ GetDMatrixFromData(const std::vector<float> &x, int num_rows, int num_columns){
data::DenseAdapter adapter(x.data(), num_rows, num_columns); data::DenseAdapter adapter(x.data(), num_rows, num_columns);
@ -389,7 +415,7 @@ GetDMatrixFromData(const std::vector<float> &x, int num_rows, int num_columns){
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(bst_row_t n_samples, bst_feature_t n_features, std::unique_ptr<DMatrix> CreateSparsePageDMatrix(bst_row_t n_samples, bst_feature_t n_features,
size_t n_batches, std::string prefix) { size_t n_batches, std::string prefix) {
CHECK_GE(n_samples, n_batches); CHECK_GE(n_samples, n_batches);
ArrayIterForTest iter(0, n_samples, n_features, n_batches); NumpyArrayIterForTest iter(0, n_samples, n_features, n_batches);
std::unique_ptr<DMatrix> dmat{ std::unique_ptr<DMatrix> dmat{
DMatrix::Create(static_cast<DataIterHandle>(&iter), iter.Proxy(), Reset, Next, DMatrix::Create(static_cast<DataIterHandle>(&iter), iter.Proxy(), Reset, Next,
@ -416,7 +442,7 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries,
std::string prefix) { std::string prefix) {
size_t n_columns = 3; size_t n_columns = 3;
size_t n_rows = n_entries / n_columns; size_t n_rows = n_entries / n_columns;
ArrayIterForTest iter(0, n_rows, n_columns, 2); NumpyArrayIterForTest iter(0, n_rows, n_columns, 2);
std::unique_ptr<DMatrix> dmat{DMatrix::Create( std::unique_ptr<DMatrix> dmat{DMatrix::Create(
static_cast<DataIterHandle>(&iter), iter.Proxy(), Reset, Next, static_cast<DataIterHandle>(&iter), iter.Proxy(), Reset, Next,
@ -563,18 +589,6 @@ ArrayIterForTest::ArrayIterForTest(float sparsity, size_t rows, size_t cols,
ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); } ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); }
// Advance to the next batch. Returns 0 once iter_ has reached n_batches_; otherwise passes
// batches_[iter_] to the proxy through XGProxyDMatrixSetDataDense, increments iter_ and
// returns 1.
int ArrayIterForTest::Next() {
if (iter_ == n_batches_) {
return 0;
}
// NOTE(review): the status returned by XGProxyDMatrixSetDataDense is not inspected.
XGProxyDMatrixSetDataDense(proxy_, batches_[iter_].c_str());
iter_++;
return 1;
}
size_t constexpr ArrayIterForTest::kRows;
size_t constexpr ArrayIterForTest::kCols;
void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data, void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
std::vector<size_t> *p_row_ptr, std::vector<size_t> *p_row_ptr,
std::vector<bst_feature_t> *p_cids) { std::vector<bst_feature_t> *p_cids) {

View File

@ -15,10 +15,6 @@ CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows,
this->Reset(); this->Reset();
} }
size_t constexpr CudaArrayIterForTest::kRows;
size_t constexpr CudaArrayIterForTest::kCols;
size_t constexpr CudaArrayIterForTest::kBatches;
int CudaArrayIterForTest::Next() { int CudaArrayIterForTest::Next() {
if (iter_ == n_batches_) { if (iter_ == n_batches_) {
return 0; return 0;

View File

@ -298,6 +298,7 @@ class RandomDataGenerator {
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
std::shared_ptr<DMatrix> GenerateDeviceDMatrix(); std::shared_ptr<DMatrix> GenerateDeviceDMatrix();
#endif #endif
std::shared_ptr<DMatrix> GenerateQuantileDMatrix();
}; };
inline std::vector<float> inline std::vector<float>
@ -401,38 +402,38 @@ class ArrayIterForTest {
size_t n_batches_; size_t n_batches_;
public: public:
size_t static constexpr kRows { 1000 }; size_t static constexpr Rows() { return 1024; }
size_t static constexpr kBatches { 100 }; size_t static constexpr Batches() { return 100; }
size_t static constexpr kCols { 13 }; size_t static constexpr Cols() { return 13; }
std::string AsArray() const { public:
return interface_; std::string AsArray() const { return interface_; }
}
virtual int Next(); virtual int Next() = 0;
virtual void Reset() { virtual void Reset() { iter_ = 0; }
iter_ = 0;
}
size_t Iter() const { return iter_; } size_t Iter() const { return iter_; }
auto Proxy() -> decltype(proxy_) { return proxy_; } auto Proxy() -> decltype(proxy_) { return proxy_; }
explicit ArrayIterForTest(float sparsity, size_t rows = kRows, explicit ArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches);
size_t cols = kCols, size_t batches = kBatches);
virtual ~ArrayIterForTest(); virtual ~ArrayIterForTest();
}; };
class CudaArrayIterForTest : public ArrayIterForTest { class CudaArrayIterForTest : public ArrayIterForTest {
public: public:
size_t static constexpr kRows{1000}; explicit CudaArrayIterForTest(float sparsity, size_t rows = Rows(), size_t cols = Cols(),
size_t static constexpr kBatches{100}; size_t batches = Batches());
size_t static constexpr kCols{13};
explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows,
size_t cols = kCols, size_t batches = kBatches);
int Next() override; int Next() override;
~CudaArrayIterForTest() override = default; ~CudaArrayIterForTest() override = default;
}; };
/*!
 * \brief ArrayIterForTest specialization that provides its own Next() implementation.
 *
 * NOTE(review): judging by the name this is the CPU (numpy-style array interface)
 * counterpart of CudaArrayIterForTest -- confirm against the implementation file.
 */
class NumpyArrayIterForTest : public ArrayIterForTest {
public:
// Only sparsity is mandatory; the remaining arguments default to the values returned by
// the static Rows(), Cols() and Batches() helpers.
explicit NumpyArrayIterForTest(float sparsity, size_t rows = Rows(), size_t cols = Cols(),
size_t batches = Batches());
// Overrides the base-class Next().
int Next() override;
~NumpyArrayIterForTest() override = default;
};
void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data, void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
std::vector<size_t> *p_row_ptr, std::vector<size_t> *p_row_ptr,
std::vector<bst_feature_t> *p_cids); std::vector<bst_feature_t> *p_cids);