Refactor tests with data generator. (#5439)

This commit is contained in:
Jiaming Yuan
2020-03-27 06:44:44 +08:00
committed by GitHub
parent 7146b91d5a
commit 4942da64ae
26 changed files with 334 additions and 259 deletions

View File

@@ -16,7 +16,7 @@
#include "helpers.h"
#include "xgboost/c_api.h"
#include "../../src/data/adapter.h"
#include "../../src/gbm/gbtree_model.h"
#include "xgboost/predictor.h"
@@ -155,26 +155,112 @@ SimpleLCG::StateType SimpleLCG::Max() const {
return max_value_;
}
std::shared_ptr<xgboost::DMatrix>* CreateDMatrix(int rows, int columns,
float sparsity, int seed) {
const float missing_value = -1;
std::vector<float> test_data(rows * columns);
void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
SimpleLCG lcg{seed_};
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
CHECK(out);
xgboost::SimpleLCG gen(seed);
SimpleRealUniformDistribution<float> dis(0.0f, 1.0f);
for (auto &e : test_data) {
if (dis(&gen) < sparsity) {
e = missing_value;
out->Resize(rows_ * cols_, 0);
auto &h_data = out->HostVector();
float sparsity = sparsity_ * (upper_ - lower_) + lower_;
for (auto &v : h_data) {
auto g = dist(&lcg);
if (g < sparsity) {
v = std::numeric_limits<float>::quiet_NaN();
} else {
e = dis(&gen);
v = dist(&lcg);
}
}
if (device_ >= 0) {
out->SetDevice(device_);
out->DeviceSpan();
}
}
DMatrixHandle handle;
XGDMatrixCreateFromMat(test_data.data(), rows, columns, missing_value,
&handle);
return static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
/*!
 * \brief Generate dense random data and describe it as an array-interface
 *        JSON string.
 *
 * The data is produced by GenerateDense() into `storage`; `out` receives the
 * serialised JSON document with the keys "data", "shape", "typestr" and
 * "version".
 *
 * \param storage Buffer that owns the generated values.  It must outlive any
 *                consumer of the JSON string, because the string only carries
 *                the raw address of the buffer.
 * \param out     Destination for the serialised JSON.
 */
void RandomDataGenerator::GenerateArrayInterface(
    HostDeviceVector<float> *storage, std::string *out) const {
  CHECK(storage);
  CHECK(out);
  this->GenerateDense(storage);

  // GenerateDense() only transfers the values to a device when device_ >= 0.
  // For a host-only generator the data never leaves the host, so the address
  // that must be published is the host one.
  auto const *p_data =
      device_ >= 0 ? storage->DevicePointer() : storage->HostPointer();

  Json array_interface {Object()};
  // "data" is a (address, read-only flag) pair.
  array_interface["data"] = std::vector<Json>(2);
  array_interface["data"][0] = Integer(reinterpret_cast<int64_t>(p_data));
  array_interface["data"][1] = Boolean(false);
  // Two dimensional shape: (rows, columns).
  array_interface["shape"] = std::vector<Json>(2);
  array_interface["shape"][0] = rows_;
  array_interface["shape"][1] = cols_;
  // Little-endian 4-byte float, matching HostDeviceVector<float>.
  array_interface["typestr"] = String("<f4");
  array_interface["version"] = 1;
  Json::Dump(array_interface, out);
}
/*!
 * \brief Generate a random sparse matrix in CSR form.
 *
 * For every (row, column) cell one value is drawn to decide whether the cell
 * is present; when it is, a second value is drawn and stored as the entry.
 * Any previous content of the output vectors is discarded.
 *
 * \param value   Receives the stored entry values.
 * \param row_ptr Receives rows_ + 1 offsets into `value` / `columns`.
 * \param columns Receives the column index of each stored entry.
 */
void RandomDataGenerator::GenerateCSR(
    HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
    HostDeviceVector<bst_feature_t>* columns) const {
  CHECK(value);
  CHECK(row_ptr);
  CHECK(columns);

  auto& h_value = value->HostVector();
  auto& h_rptr = row_ptr->HostVector();
  auto& h_cols = columns->HostVector();
  // The loop below only appends, so stale content in a reused buffer would
  // corrupt the row offsets.  Always start from empty outputs.
  h_value.clear();
  h_rptr.clear();
  h_cols.clear();

  SimpleLCG lcg{seed_};
  xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
  // Map the sparsity ratio onto the [lower_, upper_] range of the generator so
  // it can be compared directly against a drawn value.
  float sparsity = sparsity_ * (upper_ - lower_) + lower_;

  h_rptr.emplace_back(0);
  for (size_t i = 0; i < rows_; ++i) {
    size_t rptr = h_rptr.back();
    for (size_t j = 0; j < cols_; ++j) {
      // First draw decides presence, second draw is the stored value.
      auto g = dist(&lcg);
      if (g >= sparsity) {
        g = dist(&lcg);
        h_value.emplace_back(g);
        rptr++;
        h_cols.emplace_back(j);
      }
    }
    h_rptr.emplace_back(rptr);
  }

  // Validate the CSR invariants before the buffers are handed to the device.
  CHECK_LE(h_value.size(), rows_ * cols_);
  CHECK_EQ(value->Size(), h_rptr.back());
  CHECK_EQ(columns->Size(), value->Size());

  if (device_ >= 0) {
    // Requesting the device span after setting the device pushes the data over.
    value->SetDevice(device_);
    value->DeviceSpan();
    row_ptr->SetDevice(device_);
    row_ptr->DeviceSpan();
    columns->SetDevice(device_);
    columns->DeviceSpan();
  }
}
std::shared_ptr<DMatrix>
RandomDataGenerator::GenerateDMatix(bool with_label, bool float_label,
size_t classes) const {
HostDeviceVector<float> data;
HostDeviceVector<bst_row_t> rptrs;
HostDeviceVector<bst_feature_t> columns;
this->GenerateCSR(&data, &rptrs, &columns);
data::CSRAdapter adapter(rptrs.HostPointer(), columns.HostPointer(),
data.HostPointer(), rows_, data.Size(), cols_);
std::shared_ptr<DMatrix> out{
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};
if (with_label) {
RandomDataGenerator gen(rows_, 1, 0);
if (!float_label) {
gen.Lower(0).Upper(classes).GenerateDense(&out->Info().labels_);
auto& h_labels = out->Info().labels_.HostVector();
for (auto& v : h_labels) {
v = static_cast<float>(static_cast<uint32_t>(v));
}
} else {
gen.GenerateDense(&out->Info().labels_);
}
}
return out;
}
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
@@ -290,8 +376,7 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(
std::unique_ptr<GradientBooster> gbm {
GradientBooster::Create(name, generic_param, learner_model_param)};
gbm->Configure(kwargs);
auto pp_dmat = CreateDMatrix(kRows, kCols, 0);
auto p_dmat = *pp_dmat;
auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatix();
std::vector<float> labels(kRows);
for (size_t i = 0; i < kRows; ++i) {
@@ -309,7 +394,6 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(
gbm->DoBoost(p_dmat.get(), &gpair, &predts);
delete pp_dmat;
return gbm;
}