Add helper for generating batches of data. (#5756)

* Add helper for generating batches of data.

* VC keyword clash.

* Another clash.
This commit is contained in:
Jiaming Yuan 2020-06-05 09:53:56 +08:00 committed by GitHub
parent 359023c0fa
commit bd9d57f579
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 113 additions and 13 deletions

View File

@ -156,10 +156,10 @@ SimpleLCG::StateType SimpleLCG::Max() const {
}
void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
SimpleLCG lcg{seed_};
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
CHECK(out);
SimpleLCG lcg{lcg_};
out->Resize(rows_ * cols_, 0);
auto &h_data = out->HostVector();
float sparsity = sparsity_ * (upper_ - lower_) + lower_;
@ -202,7 +202,56 @@ std::string RandomDataGenerator::GenerateArrayInterface(
return out;
}
/*!
 * \brief Fill `storage` with dense data and describe it as a sequence of row-wise
 *        batches, each expressed as an array interface JSON string.
 *
 * The first `batches - 1` batches hold `rows_ / batches` rows each; the last batch
 * holds whatever rows are left.  All batches point into `storage`, one after another.
 *
 * \param storage Buffer filled by GenerateDense; every returned interface refers to it.
 * \param batches Number of batches to produce, must be greater than 0.
 *
 * \return first: one JSON string per batch; second: a JSON string describing the whole
 *         buffer as one array with `rows_` rows.
 */
std::pair<std::vector<std::string>, std::string>
RandomDataGenerator::GenerateArrayInterfaceBatch(
    HostDeviceVector<float> *storage, size_t batches) const {
  // Zero batches would divide by zero below and wrap the `batches - 1` loop bound.
  CHECK_GT(batches, 0u);
  this->GenerateDense(storage);

  // Build the interface for `rows` rows starting `offset` elements into the storage.
  auto make_interface = [storage, this](size_t offset, size_t rows) {
    Json array_interface{Object()};
    array_interface["data"] = std::vector<Json>(2);
    if (device_ >= 0) {
      array_interface["data"][0] =
          Integer(reinterpret_cast<int64_t>(storage->DevicePointer() + offset));
    } else {
      array_interface["data"][0] =
          Integer(reinterpret_cast<int64_t>(storage->HostPointer() + offset));
    }
    // Second entry of "data" is the read-only flag in the array interface protocol.
    array_interface["data"][1] = Boolean(false);

    array_interface["shape"] = std::vector<Json>(2);
    array_interface["shape"][0] = rows;
    array_interface["shape"][1] = cols_;

    array_interface["typestr"] = String("<f4");
    array_interface["version"] = 1;
    return array_interface;
  };

  // Interface covering the whole buffer.
  auto j_interface = make_interface(0, rows_);

  std::vector<Json> objects;
  objects.reserve(batches);
  size_t const rows_per_batch = rows_ / batches;
  // Count rows rather than elements so the remainder needs no division by cols_,
  // which keeps the computation valid when there are no columns.
  size_t consumed_rows = 0;
  for (size_t i = 0; i + 1 < batches; ++i) {
    objects.emplace_back(make_interface(consumed_rows * cols_, rows_per_batch));
    consumed_rows += rows_per_batch;
  }
  CHECK_LE(consumed_rows, rows_);
  // The last batch takes every row that has not been handed out yet.
  objects.emplace_back(
      make_interface(consumed_rows * cols_, rows_ - consumed_rows));

  std::vector<std::string> result(batches);
  for (size_t i = 0; i < batches; ++i) {
    Json::Dump(objects[i], &result[i]);
  }

  std::string interface_str;
  Json::Dump(j_interface, &interface_str);
  return {std::move(result), std::move(interface_str)};
}
std::string RandomDataGenerator::GenerateColumnarArrayInterface(
std::vector<HostDeviceVector<float>> *data) const {
@ -225,8 +274,8 @@ void RandomDataGenerator::GenerateCSR(
auto& h_value = value->HostVector();
auto& h_rptr = row_ptr->HostVector();
auto& h_cols = columns->HostVector();
SimpleLCG lcg{lcg_};
SimpleLCG lcg{seed_};
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
float sparsity = sparsity_ * (upper_ - lower_) + lower_;

View File

@ -97,7 +97,7 @@ bool IsNear(std::vector<xgboost::bst_float>::const_iterator _beg1,
class SimpleLCG {
private:
using StateType = int64_t;
static StateType constexpr default_init_ = 3;
static StateType constexpr kDefaultInit = 3;
static StateType constexpr default_alpha_ = 61;
static StateType constexpr max_value_ = ((StateType)1 << 32) - 1;
@ -105,11 +105,17 @@ class SimpleLCG {
StateType const alpha_;
StateType const mod_;
StateType const seed_;
StateType seed_;
public:
SimpleLCG() : state_{default_init_},
SimpleLCG() : state_{kDefaultInit},
alpha_{default_alpha_}, mod_{max_value_}, seed_{state_}{}
SimpleLCG(SimpleLCG const& that) = default;
SimpleLCG(SimpleLCG&& that) = default;
void Seed(StateType seed) {
seed_ = seed;
}
/*!
* \brief Initialize SimpleLCG.
*
@ -118,9 +124,9 @@ class SimpleLCG {
* \param alpha multiplier
* \param mod modulo
*/
SimpleLCG(StateType state,
StateType alpha=default_alpha_, StateType mod=max_value_)
: state_{state == 0 ? default_init_ : state},
explicit SimpleLCG(StateType state,
StateType alpha=default_alpha_, StateType mod=max_value_)
: state_{state == 0 ? kDefaultInit : state},
alpha_{alpha}, mod_{mod} , seed_{state} {}
StateType operator()();
@ -131,8 +137,8 @@ class SimpleLCG {
template <typename ResultT>
class SimpleRealUniformDistribution {
private:
ResultT const lower;
ResultT const upper;
ResultT const lower_;
ResultT const upper_;
/*! \brief Over-simplified version of std::generate_canonical. */
template <size_t Bits, typename GeneratorT>
@ -156,13 +162,13 @@ class SimpleRealUniformDistribution {
public:
SimpleRealUniformDistribution(ResultT l, ResultT u) :
lower{l}, upper{u} {}
lower_{l}, upper_{u} {}
template <typename GeneratorT>
ResultT operator()(GeneratorT* rng) const {
ResultT tmp = GenerateCanonical<std::numeric_limits<ResultT>::digits,
GeneratorT>(rng);
return (tmp * (upper - lower)) + lower;
return (tmp * (upper_ - lower_)) + lower_;
}
};
@ -177,6 +183,7 @@ class RandomDataGenerator {
int32_t device_;
int32_t seed_;
SimpleLCG lcg_;
size_t bins_;
@ -186,7 +193,7 @@ class RandomDataGenerator {
public:
RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity)
: rows_{rows}, cols_{cols}, sparsity_{sparsity}, lower_{0.0f}, upper_{1.0f},
device_{-1}, seed_{0}, bins_{0} {}
device_{-1}, seed_{0}, lcg_{seed_}, bins_{0} {}
RandomDataGenerator &Lower(float v) {
lower_ = v;
@ -202,6 +209,7 @@ class RandomDataGenerator {
}
RandomDataGenerator& Seed(int32_t s) {
seed_ = s;
lcg_.Seed(seed_);
return *this;
}
RandomDataGenerator& Bins(size_t b) {
@ -210,9 +218,26 @@ class RandomDataGenerator {
}
void GenerateDense(HostDeviceVector<float>* out) const;
std::string GenerateArrayInterface(HostDeviceVector<float>* storage) const;
/*!
* \brief Generate batches of array interface stored in consecutive memory.
*
* \param storage The consecutive memory used to store the arrays.
* \param batches Number of batches.
*
* \return A vector storing JSON string representation of interface for each batch, and
* a single JSON string representing the consecutive memory as a whole
* (combining all the batches).
*/
std::pair<std::vector<std::string>, std::string>
GenerateArrayInterfaceBatch(HostDeviceVector<float> *storage,
size_t batches) const;
std::string GenerateColumnarArrayInterface(
std::vector<HostDeviceVector<float>> *data) const;
void GenerateCSR(HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
HostDeviceVector<bst_feature_t>* columns) const;

View File

@ -2,6 +2,7 @@
#include <algorithm>
#include "helpers.h"
#include "../../src/data/array_interface.h"
namespace xgboost {
TEST(RandomDataGenerator, DMatrix) {
@ -41,4 +42,29 @@ TEST(RandomDataGenerator, DMatrix) {
}
}
// Generates batched array interfaces and checks that every batch validates, has the
// expected column count, and that the batch row counts add up to the total; the
// combined interface must report the full shape.
TEST(RandomDataGenerator, GenerateArrayInterfaceBatch) {
  size_t constexpr kRows{937};
  size_t constexpr kCols{100};
  size_t constexpr kBatches{13};
  float constexpr kSparsity{0.4f};

  HostDeviceVector<float> storage;
  auto const generated =
      RandomDataGenerator{kRows, kCols, kSparsity}.GenerateArrayInterfaceBatch(
          &storage, kBatches);
  std::vector<std::string> const &batches = generated.first;
  std::string const &array = generated.second;
  CHECK_EQ(batches.size(), kBatches);

  // Accumulate the row count reported by each individual batch.
  size_t rows = 0;
  for (auto const &batch_str : batches) {
    Json j_batch = Json::Load({batch_str.c_str(), batch_str.size()});
    ArrayInterfaceHandler::Validate(get<Object const>(j_batch));
    rows += get<Integer>(j_batch["shape"][0]);
    CHECK_EQ(get<Integer>(j_batch["shape"][1]), kCols);
  }
  CHECK_EQ(rows, kRows);

  // The interface for the whole storage must describe all rows and columns.
  auto j_array = Json::Load({array.c_str(), array.size()});
  CHECK_EQ(get<Integer>(j_array["shape"][1]), kCols);
  CHECK_EQ(get<Integer>(j_array["shape"][0]), kRows);
}
} // namespace xgboost