From bd9d57f579621f7189d4d98f884fd4003c3a5a6e Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 5 Jun 2020 09:53:56 +0800 Subject: [PATCH] Add helper for generating batches of data. (#5756) * Add helper for generating batches of data. * VC keyword clash. * Another clash. --- tests/cpp/helpers.cc | 53 +++++++++++++++++++++++++++++++++++++-- tests/cpp/helpers.h | 47 ++++++++++++++++++++++++++-------- tests/cpp/test_helpers.cc | 26 +++++++++++++++++++ 3 files changed, 113 insertions(+), 13 deletions(-) diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 951b8b66d..893891d13 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -156,10 +156,10 @@ SimpleLCG::StateType SimpleLCG::Max() const { } void RandomDataGenerator::GenerateDense(HostDeviceVector *out) const { - SimpleLCG lcg{seed_}; xgboost::SimpleRealUniformDistribution dist(lower_, upper_); CHECK(out); + SimpleLCG lcg{lcg_}; out->Resize(rows_ * cols_, 0); auto &h_data = out->HostVector(); float sparsity = sparsity_ * (upper_ - lower_) + lower_; @@ -202,7 +202,56 @@ std::string RandomDataGenerator::GenerateArrayInterface( return out; } +std::pair, std::string> +RandomDataGenerator::GenerateArrayInterfaceBatch( + HostDeviceVector *storage, size_t batches) const { + this->GenerateDense(storage); + std::vector result(batches); + std::vector objects; + size_t const rows_per_batch = rows_ / batches; + + auto make_interface = [storage, this](size_t offset, size_t rows) { + Json array_interface{Object()}; + array_interface["data"] = std::vector(2); + if (device_ >= 0) { + array_interface["data"][0] = + Integer(reinterpret_cast(storage->DevicePointer() + offset)); + } else { + array_interface["data"][0] = + Integer(reinterpret_cast(storage->HostPointer() + offset)); + } + + array_interface["data"][1] = Boolean(false); + + array_interface["shape"] = std::vector(2); + array_interface["shape"][0] = rows; + array_interface["shape"][1] = cols_; + + array_interface["typestr"] = String("> *data) const { 
@@ -225,8 +274,8 @@ void RandomDataGenerator::GenerateCSR( auto& h_value = value->HostVector(); auto& h_rptr = row_ptr->HostVector(); auto& h_cols = columns->HostVector(); + SimpleLCG lcg{lcg_}; - SimpleLCG lcg{seed_}; xgboost::SimpleRealUniformDistribution dist(lower_, upper_); float sparsity = sparsity_ * (upper_ - lower_) + lower_; diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 9f7c57527..7d5907718 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -97,7 +97,7 @@ bool IsNear(std::vector::const_iterator _beg1, class SimpleLCG { private: using StateType = int64_t; - static StateType constexpr default_init_ = 3; + static StateType constexpr kDefaultInit = 3; static StateType constexpr default_alpha_ = 61; static StateType constexpr max_value_ = ((StateType)1 << 32) - 1; @@ -105,11 +105,17 @@ class SimpleLCG { StateType const alpha_; StateType const mod_; - StateType const seed_; + StateType seed_; public: - SimpleLCG() : state_{default_init_}, + SimpleLCG() : state_{kDefaultInit}, alpha_{default_alpha_}, mod_{max_value_}, seed_{state_}{} + SimpleLCG(SimpleLCG const& that) = default; + SimpleLCG(SimpleLCG&& that) = default; + + void Seed(StateType seed) { + seed_ = seed; + } /*! * \brief Initialize SimpleLCG. * @@ -118,9 +124,9 @@ class SimpleLCG { * \param alpha multiplier * \param mod modulo */ - SimpleLCG(StateType state, - StateType alpha=default_alpha_, StateType mod=max_value_) - : state_{state == 0 ? default_init_ : state}, + explicit SimpleLCG(StateType state, + StateType alpha=default_alpha_, StateType mod=max_value_) + : state_{state == 0 ? kDefaultInit : state}, alpha_{alpha}, mod_{mod} , seed_{state} {} StateType operator()(); @@ -131,8 +137,8 @@ class SimpleLCG { template class SimpleRealUniformDistribution { private: - ResultT const lower; - ResultT const upper; + ResultT const lower_; + ResultT const upper_; /*! \brief Over-simplified version of std::generate_canonical. 
*/ template @@ -156,13 +162,13 @@ class SimpleRealUniformDistribution { public: SimpleRealUniformDistribution(ResultT l, ResultT u) : - lower{l}, upper{u} {} + lower_{l}, upper_{u} {} template ResultT operator()(GeneratorT* rng) const { ResultT tmp = GenerateCanonical::digits, GeneratorT>(rng); - return (tmp * (upper - lower)) + lower; + return (tmp * (upper_ - lower_)) + lower_; } }; @@ -177,6 +183,7 @@ class RandomDataGenerator { int32_t device_; int32_t seed_; + SimpleLCG lcg_; size_t bins_; @@ -186,7 +193,7 @@ class RandomDataGenerator { public: RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity) : rows_{rows}, cols_{cols}, sparsity_{sparsity}, lower_{0.0f}, upper_{1.0f}, - device_{-1}, seed_{0}, bins_{0} {} + device_{-1}, seed_{0}, lcg_{seed_}, bins_{0} {} RandomDataGenerator &Lower(float v) { lower_ = v; @@ -202,6 +209,7 @@ class RandomDataGenerator { } RandomDataGenerator& Seed(int32_t s) { seed_ = s; + lcg_.Seed(seed_); return *this; } RandomDataGenerator& Bins(size_t b) { @@ -210,9 +218,26 @@ class RandomDataGenerator { } void GenerateDense(HostDeviceVector* out) const; + std::string GenerateArrayInterface(HostDeviceVector* storage) const; + + /*! + * \brief Generate batches of array interface stored in consecutive memory. + * + * \param storage The consecutive memory used to store the arrays. + * \param batches Number of batches. + * + * \return A vector storing JSON string representation of interface for each batch, and + * a single JSON string representing the consecutive memory as a whole + * (combining all the batches). 
+ */ + std::pair, std::string> + GenerateArrayInterfaceBatch(HostDeviceVector *storage, + size_t batches) const; + std::string GenerateColumnarArrayInterface( std::vector> *data) const; + void GenerateCSR(HostDeviceVector* value, HostDeviceVector* row_ptr, HostDeviceVector* columns) const; diff --git a/tests/cpp/test_helpers.cc b/tests/cpp/test_helpers.cc index 2c4ca6fed..79d8d2475 100644 --- a/tests/cpp/test_helpers.cc +++ b/tests/cpp/test_helpers.cc @@ -2,6 +2,7 @@ #include #include "helpers.h" +#include "../../src/data/array_interface.h" namespace xgboost { TEST(RandomDataGenerator, DMatrix) { @@ -41,4 +42,29 @@ TEST(RandomDataGenerator, DMatrix) { } } +TEST(RandomDataGenerator, GenerateArrayInterfaceBatch) { + size_t constexpr kRows { 937 }, kCols { 100 }, kBatches { 13 }; + float constexpr kSparsity { 0.4f }; + + HostDeviceVector storage; + std::string array; + std::vector batches; + std::tie(batches, array) = + RandomDataGenerator{kRows, kCols, kSparsity}.GenerateArrayInterfaceBatch( + &storage, kBatches); + CHECK_EQ(batches.size(), kBatches); + + size_t rows = 0; + for (auto const &interface_str : batches) { + Json j_interface = + Json::Load({interface_str.c_str(), interface_str.size()}); + ArrayInterfaceHandler::Validate(get(j_interface)); + CHECK_EQ(get(j_interface["shape"][1]), kCols); + rows += get(j_interface["shape"][0]); + } + CHECK_EQ(rows, kRows); + auto j_array = Json::Load({array.c_str(), array.size()}); + CHECK_EQ(get(j_array["shape"][0]), kRows); + CHECK_EQ(get(j_array["shape"][1]), kCols); +} } // namespace xgboost