Add helper for generating batches of data. (#5756)

* Add helper for generating batches of data.

* VC keyword clash.

* Another clash.
This commit is contained in:
Jiaming Yuan
2020-06-05 09:53:56 +08:00
committed by GitHub
parent 359023c0fa
commit bd9d57f579
3 changed files with 113 additions and 13 deletions

View File

@@ -156,10 +156,10 @@ SimpleLCG::StateType SimpleLCG::Max() const {
}
void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
SimpleLCG lcg{seed_};
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
CHECK(out);
SimpleLCG lcg{lcg_};
out->Resize(rows_ * cols_, 0);
auto &h_data = out->HostVector();
float sparsity = sparsity_ * (upper_ - lower_) + lower_;
@@ -202,7 +202,56 @@ std::string RandomDataGenerator::GenerateArrayInterface(
return out;
}
std::pair<std::vector<std::string>, std::string>
RandomDataGenerator::GenerateArrayInterfaceBatch(
HostDeviceVector<float> *storage, size_t batches) const {
this->GenerateDense(storage);
std::vector<std::string> result(batches);
std::vector<Json> objects;
size_t const rows_per_batch = rows_ / batches;
auto make_interface = [storage, this](size_t offset, size_t rows) {
Json array_interface{Object()};
array_interface["data"] = std::vector<Json>(2);
if (device_ >= 0) {
array_interface["data"][0] =
Integer(reinterpret_cast<int64_t>(storage->DevicePointer() + offset));
} else {
array_interface["data"][0] =
Integer(reinterpret_cast<int64_t>(storage->HostPointer() + offset));
}
array_interface["data"][1] = Boolean(false);
array_interface["shape"] = std::vector<Json>(2);
array_interface["shape"][0] = rows;
array_interface["shape"][1] = cols_;
array_interface["typestr"] = String("<f4");
array_interface["version"] = 1;
return array_interface;
};
auto j_interface = make_interface(0, rows_);
size_t offset = 0;
for (size_t i = 0; i < batches - 1; ++i) {
objects.emplace_back(make_interface(offset, rows_per_batch));
offset += rows_per_batch * cols_;
}
size_t const remaining = rows_ - offset / cols_;
CHECK_LE(offset, rows_ * cols_);
objects.emplace_back(make_interface(offset, remaining));
for (size_t i = 0; i < batches; ++i) {
Json::Dump(objects[i], &result[i]);
}
std::string interface_str;
Json::Dump(j_interface, &interface_str);
return {result, interface_str};
}
std::string RandomDataGenerator::GenerateColumnarArrayInterface(
std::vector<HostDeviceVector<float>> *data) const {
@@ -225,8 +274,8 @@ void RandomDataGenerator::GenerateCSR(
auto& h_value = value->HostVector();
auto& h_rptr = row_ptr->HostVector();
auto& h_cols = columns->HostVector();
SimpleLCG lcg{lcg_};
SimpleLCG lcg{seed_};
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
float sparsity = sparsity_ * (upper_ - lower_) + lower_;