Add helper for generating batches of data. (#5756)
* Add helper for generating batches of data. * VC keyword clash. * Another clash.
This commit is contained in:
parent
359023c0fa
commit
bd9d57f579
@ -156,10 +156,10 @@ SimpleLCG::StateType SimpleLCG::Max() const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
|
void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
|
||||||
SimpleLCG lcg{seed_};
|
|
||||||
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
|
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
|
||||||
CHECK(out);
|
CHECK(out);
|
||||||
|
|
||||||
|
SimpleLCG lcg{lcg_};
|
||||||
out->Resize(rows_ * cols_, 0);
|
out->Resize(rows_ * cols_, 0);
|
||||||
auto &h_data = out->HostVector();
|
auto &h_data = out->HostVector();
|
||||||
float sparsity = sparsity_ * (upper_ - lower_) + lower_;
|
float sparsity = sparsity_ * (upper_ - lower_) + lower_;
|
||||||
@ -202,7 +202,56 @@ std::string RandomDataGenerator::GenerateArrayInterface(
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::pair<std::vector<std::string>, std::string>
|
||||||
|
RandomDataGenerator::GenerateArrayInterfaceBatch(
|
||||||
|
HostDeviceVector<float> *storage, size_t batches) const {
|
||||||
|
this->GenerateDense(storage);
|
||||||
|
std::vector<std::string> result(batches);
|
||||||
|
std::vector<Json> objects;
|
||||||
|
|
||||||
|
size_t const rows_per_batch = rows_ / batches;
|
||||||
|
|
||||||
|
auto make_interface = [storage, this](size_t offset, size_t rows) {
|
||||||
|
Json array_interface{Object()};
|
||||||
|
array_interface["data"] = std::vector<Json>(2);
|
||||||
|
if (device_ >= 0) {
|
||||||
|
array_interface["data"][0] =
|
||||||
|
Integer(reinterpret_cast<int64_t>(storage->DevicePointer() + offset));
|
||||||
|
} else {
|
||||||
|
array_interface["data"][0] =
|
||||||
|
Integer(reinterpret_cast<int64_t>(storage->HostPointer() + offset));
|
||||||
|
}
|
||||||
|
|
||||||
|
array_interface["data"][1] = Boolean(false);
|
||||||
|
|
||||||
|
array_interface["shape"] = std::vector<Json>(2);
|
||||||
|
array_interface["shape"][0] = rows;
|
||||||
|
array_interface["shape"][1] = cols_;
|
||||||
|
|
||||||
|
array_interface["typestr"] = String("<f4");
|
||||||
|
array_interface["version"] = 1;
|
||||||
|
return array_interface;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto j_interface = make_interface(0, rows_);
|
||||||
|
size_t offset = 0;
|
||||||
|
for (size_t i = 0; i < batches - 1; ++i) {
|
||||||
|
objects.emplace_back(make_interface(offset, rows_per_batch));
|
||||||
|
offset += rows_per_batch * cols_;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t const remaining = rows_ - offset / cols_;
|
||||||
|
CHECK_LE(offset, rows_ * cols_);
|
||||||
|
objects.emplace_back(make_interface(offset, remaining));
|
||||||
|
|
||||||
|
for (size_t i = 0; i < batches; ++i) {
|
||||||
|
Json::Dump(objects[i], &result[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string interface_str;
|
||||||
|
Json::Dump(j_interface, &interface_str);
|
||||||
|
return {result, interface_str};
|
||||||
|
}
|
||||||
|
|
||||||
std::string RandomDataGenerator::GenerateColumnarArrayInterface(
|
std::string RandomDataGenerator::GenerateColumnarArrayInterface(
|
||||||
std::vector<HostDeviceVector<float>> *data) const {
|
std::vector<HostDeviceVector<float>> *data) const {
|
||||||
@ -225,8 +274,8 @@ void RandomDataGenerator::GenerateCSR(
|
|||||||
auto& h_value = value->HostVector();
|
auto& h_value = value->HostVector();
|
||||||
auto& h_rptr = row_ptr->HostVector();
|
auto& h_rptr = row_ptr->HostVector();
|
||||||
auto& h_cols = columns->HostVector();
|
auto& h_cols = columns->HostVector();
|
||||||
|
SimpleLCG lcg{lcg_};
|
||||||
|
|
||||||
SimpleLCG lcg{seed_};
|
|
||||||
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
|
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
|
||||||
float sparsity = sparsity_ * (upper_ - lower_) + lower_;
|
float sparsity = sparsity_ * (upper_ - lower_) + lower_;
|
||||||
|
|
||||||
|
|||||||
@ -97,7 +97,7 @@ bool IsNear(std::vector<xgboost::bst_float>::const_iterator _beg1,
|
|||||||
class SimpleLCG {
|
class SimpleLCG {
|
||||||
private:
|
private:
|
||||||
using StateType = int64_t;
|
using StateType = int64_t;
|
||||||
static StateType constexpr default_init_ = 3;
|
static StateType constexpr kDefaultInit = 3;
|
||||||
static StateType constexpr default_alpha_ = 61;
|
static StateType constexpr default_alpha_ = 61;
|
||||||
static StateType constexpr max_value_ = ((StateType)1 << 32) - 1;
|
static StateType constexpr max_value_ = ((StateType)1 << 32) - 1;
|
||||||
|
|
||||||
@ -105,11 +105,17 @@ class SimpleLCG {
|
|||||||
StateType const alpha_;
|
StateType const alpha_;
|
||||||
StateType const mod_;
|
StateType const mod_;
|
||||||
|
|
||||||
StateType const seed_;
|
StateType seed_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
SimpleLCG() : state_{default_init_},
|
SimpleLCG() : state_{kDefaultInit},
|
||||||
alpha_{default_alpha_}, mod_{max_value_}, seed_{state_}{}
|
alpha_{default_alpha_}, mod_{max_value_}, seed_{state_}{}
|
||||||
|
SimpleLCG(SimpleLCG const& that) = default;
|
||||||
|
SimpleLCG(SimpleLCG&& that) = default;
|
||||||
|
|
||||||
|
void Seed(StateType seed) {
|
||||||
|
seed_ = seed;
|
||||||
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief Initialize SimpleLCG.
|
* \brief Initialize SimpleLCG.
|
||||||
*
|
*
|
||||||
@ -118,9 +124,9 @@ class SimpleLCG {
|
|||||||
* \param alpha multiplier
|
* \param alpha multiplier
|
||||||
* \param mod modulo
|
* \param mod modulo
|
||||||
*/
|
*/
|
||||||
SimpleLCG(StateType state,
|
explicit SimpleLCG(StateType state,
|
||||||
StateType alpha=default_alpha_, StateType mod=max_value_)
|
StateType alpha=default_alpha_, StateType mod=max_value_)
|
||||||
: state_{state == 0 ? default_init_ : state},
|
: state_{state == 0 ? kDefaultInit : state},
|
||||||
alpha_{alpha}, mod_{mod} , seed_{state} {}
|
alpha_{alpha}, mod_{mod} , seed_{state} {}
|
||||||
|
|
||||||
StateType operator()();
|
StateType operator()();
|
||||||
@ -131,8 +137,8 @@ class SimpleLCG {
|
|||||||
template <typename ResultT>
|
template <typename ResultT>
|
||||||
class SimpleRealUniformDistribution {
|
class SimpleRealUniformDistribution {
|
||||||
private:
|
private:
|
||||||
ResultT const lower;
|
ResultT const lower_;
|
||||||
ResultT const upper;
|
ResultT const upper_;
|
||||||
|
|
||||||
/*! \brief Over-simplified version of std::generate_canonical. */
|
/*! \brief Over-simplified version of std::generate_canonical. */
|
||||||
template <size_t Bits, typename GeneratorT>
|
template <size_t Bits, typename GeneratorT>
|
||||||
@ -156,13 +162,13 @@ class SimpleRealUniformDistribution {
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
SimpleRealUniformDistribution(ResultT l, ResultT u) :
|
SimpleRealUniformDistribution(ResultT l, ResultT u) :
|
||||||
lower{l}, upper{u} {}
|
lower_{l}, upper_{u} {}
|
||||||
|
|
||||||
template <typename GeneratorT>
|
template <typename GeneratorT>
|
||||||
ResultT operator()(GeneratorT* rng) const {
|
ResultT operator()(GeneratorT* rng) const {
|
||||||
ResultT tmp = GenerateCanonical<std::numeric_limits<ResultT>::digits,
|
ResultT tmp = GenerateCanonical<std::numeric_limits<ResultT>::digits,
|
||||||
GeneratorT>(rng);
|
GeneratorT>(rng);
|
||||||
return (tmp * (upper - lower)) + lower;
|
return (tmp * (upper_ - lower_)) + lower_;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -177,6 +183,7 @@ class RandomDataGenerator {
|
|||||||
|
|
||||||
int32_t device_;
|
int32_t device_;
|
||||||
int32_t seed_;
|
int32_t seed_;
|
||||||
|
SimpleLCG lcg_;
|
||||||
|
|
||||||
size_t bins_;
|
size_t bins_;
|
||||||
|
|
||||||
@ -186,7 +193,7 @@ class RandomDataGenerator {
|
|||||||
public:
|
public:
|
||||||
RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity)
|
RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity)
|
||||||
: rows_{rows}, cols_{cols}, sparsity_{sparsity}, lower_{0.0f}, upper_{1.0f},
|
: rows_{rows}, cols_{cols}, sparsity_{sparsity}, lower_{0.0f}, upper_{1.0f},
|
||||||
device_{-1}, seed_{0}, bins_{0} {}
|
device_{-1}, seed_{0}, lcg_{seed_}, bins_{0} {}
|
||||||
|
|
||||||
RandomDataGenerator &Lower(float v) {
|
RandomDataGenerator &Lower(float v) {
|
||||||
lower_ = v;
|
lower_ = v;
|
||||||
@ -202,6 +209,7 @@ class RandomDataGenerator {
|
|||||||
}
|
}
|
||||||
RandomDataGenerator& Seed(int32_t s) {
|
RandomDataGenerator& Seed(int32_t s) {
|
||||||
seed_ = s;
|
seed_ = s;
|
||||||
|
lcg_.Seed(seed_);
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
RandomDataGenerator& Bins(size_t b) {
|
RandomDataGenerator& Bins(size_t b) {
|
||||||
@ -210,9 +218,26 @@ class RandomDataGenerator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void GenerateDense(HostDeviceVector<float>* out) const;
|
void GenerateDense(HostDeviceVector<float>* out) const;
|
||||||
|
|
||||||
std::string GenerateArrayInterface(HostDeviceVector<float>* storage) const;
|
std::string GenerateArrayInterface(HostDeviceVector<float>* storage) const;
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Generate batches of array interface stored in consecutive memory.
|
||||||
|
*
|
||||||
|
* \param storage The consecutive memory used to store the arrays.
|
||||||
|
* \param batches Number of batches.
|
||||||
|
*
|
||||||
|
* \return A vector storing JSON string representation of interface for each batch, and
|
||||||
|
* a single JSON string representing the consecutive memory as a whole
|
||||||
|
* (combining all the batches).
|
||||||
|
*/
|
||||||
|
std::pair<std::vector<std::string>, std::string>
|
||||||
|
GenerateArrayInterfaceBatch(HostDeviceVector<float> *storage,
|
||||||
|
size_t batches) const;
|
||||||
|
|
||||||
std::string GenerateColumnarArrayInterface(
|
std::string GenerateColumnarArrayInterface(
|
||||||
std::vector<HostDeviceVector<float>> *data) const;
|
std::vector<HostDeviceVector<float>> *data) const;
|
||||||
|
|
||||||
void GenerateCSR(HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
|
void GenerateCSR(HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
|
||||||
HostDeviceVector<bst_feature_t>* columns) const;
|
HostDeviceVector<bst_feature_t>* columns) const;
|
||||||
|
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
#include "helpers.h"
|
#include "helpers.h"
|
||||||
|
#include "../../src/data/array_interface.h"
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
|
|
||||||
TEST(RandomDataGenerator, DMatrix) {
|
TEST(RandomDataGenerator, DMatrix) {
|
||||||
@ -41,4 +42,29 @@ TEST(RandomDataGenerator, DMatrix) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Verifies that the per-batch array interfaces cover every row exactly once
// and that the whole-array interface reports the full matrix shape.
TEST(RandomDataGenerator, GenerateArrayInterfaceBatch) {
  size_t constexpr kRows{937};
  size_t constexpr kCols{100};
  size_t constexpr kBatches{13};
  float constexpr kSparsity{0.4f};

  HostDeviceVector<float> storage;
  auto const generated =
      RandomDataGenerator{kRows, kCols, kSparsity}.GenerateArrayInterfaceBatch(
          &storage, kBatches);
  std::vector<std::string> const &batches = generated.first;
  std::string const &array = generated.second;
  CHECK_EQ(batches.size(), kBatches);

  // kRows is not a multiple of kBatches, so the batches are not all the same
  // size; only the summed row count is checked.
  size_t rows = 0;
  for (auto const &batch_str : batches) {
    Json j_batch = Json::Load({batch_str.c_str(), batch_str.size()});
    ArrayInterfaceHandler::Validate(get<Object const>(j_batch));
    CHECK_EQ(get<Integer>(j_batch["shape"][1]), kCols);
    rows += get<Integer>(j_batch["shape"][0]);
  }
  CHECK_EQ(rows, kRows);

  // The second element of the pair describes the storage as one array.
  auto j_array = Json::Load({array.c_str(), array.size()});
  CHECK_EQ(get<Integer>(j_array["shape"][0]), kRows);
  CHECK_EQ(get<Integer>(j_array["shape"][1]), kCols);
}
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user