xgboost/tests/cpp/data/test_simple_dmatrix.cu
Jiaming Yuan d33854af1b
[Breaking] Accept multi-dim meta info. (#7405)
This PR changes base_margin into a 3-dim array, with one of them being reserved for multi-target classification. Also, a breaking change is made for binary serialization due to extra dimension along with a fix for saving the feature weights. Lastly, it unifies the prediction initialization between CPU and GPU. After this PR, the meta info setter in Python will be based on array interface.
2021-11-18 23:02:54 +08:00

373 lines
11 KiB
Plaintext

// Copyright by Contributors
#include <dmlc/filesystem.h>
#include <xgboost/data.h>
#include "../../../src/data/simple_dmatrix.h"
#include <thrust/sequence.h>
#include "../../../src/data/device_adapter.cuh"
#include "../helpers.h"
#include "test_array_interface.h"
#include "../../../src/data/array_interface.h"
using namespace xgboost; // NOLINT
// Builds two device-backed columns (float64 and uint32, kRows entries each),
// feeds their JSON description through data::CudfAdapter and checks the shape
// and stored-entry count of the resulting SimpleDMatrix.
TEST(SimpleDMatrix, FromColumnarDenseBasic) {
  constexpr size_t kRows{16};
  constexpr float kNaN = std::numeric_limits<float>::quiet_NaN();

  // Device storage handed to the column generators.
  thrust::device_vector<double> d_f64_storage(kRows);
  thrust::device_vector<uint32_t> d_u32_storage(kRows);

  std::vector<Json> columns;
  columns.emplace_back(
      GenerateDenseColumn<double>("<f8", kRows, &d_f64_storage));
  columns.emplace_back(
      GenerateDenseColumn<uint32_t>("<u4", kRows, &d_u32_storage));

  // The adapter consumes the column descriptions as a serialised JSON string.
  Json column_arr{columns};
  std::string interface_str;
  Json::Dump(column_arr, &interface_str);

  data::CudfAdapter adapter(interface_str);
  // NaN is the missing-value marker; the trailing -1 is passed through as in
  // the other tests of this file (presumably the thread count - confirm).
  data::SimpleDMatrix dmat(&adapter, kNaN, -1);

  auto const& info = dmat.Info();
  EXPECT_EQ(info.num_col_, 2);
  EXPECT_EQ(info.num_row_, 16);
  EXPECT_EQ(info.num_nonzero_, 32);
}
/**
 * \brief Check the content and shape of a DMatrix built from dense columns.
 *
 * For every row i of every SparsePage batch, each stored entry must hold the
 * value i * 2 and the k-th stored entry must carry feature index k.  After the
 * content check the meta info must report the given shape.
 *
 * \param dmat   Matrix under test.
 * \param n_rows Expected number of rows.
 * \param n_cols Expected number of columns.
 */
void TestDenseColumn(DMatrix* dmat, size_t n_rows, size_t n_cols) {
  for (auto& batch : dmat->GetBatches<SparsePage>()) {
    auto page = batch.GetView();
    auto const n_batch_rows = batch.Size();
    for (auto ridx = 0ull; ridx < n_batch_rows; ++ridx) {
      auto row = page[ridx];
      auto const expected_value = ridx * 2;
      auto const n_entries = row.size();
      for (auto k = 0ull; k < n_entries; ++k) {
        auto const& entry = row[k];
        EXPECT_EQ(entry.fvalue, expected_value);
        EXPECT_EQ(entry.index, k);
      }
    }
  }

  // Shape is verified last, mirroring the order of the content checks above.
  auto const& info = dmat->Info();
  ASSERT_EQ(info.num_row_, n_rows);
  ASSERT_EQ(info.num_col_, n_cols);
}
// Constructs a SimpleDMatrix from two dense device columns (float32 and int32)
// three times: with NaN as the missing marker, with 4.0 as the missing marker,
// and with NaN as the marker after a NaN has been written into the data.
TEST(SimpleDMatrix, FromColumnarDense) {
  constexpr size_t kRows{16};
  constexpr size_t kCols{2};
  constexpr float kNaN = std::numeric_limits<float>::quiet_NaN();

  thrust::device_vector<float> d_f32_storage(kRows);
  thrust::device_vector<int32_t> d_i32_storage(kRows);

  std::vector<Json> columns;
  columns.emplace_back(
      GenerateDenseColumn<float>("<f4", kRows, &d_f32_storage));
  columns.emplace_back(
      GenerateDenseColumn<int32_t>("<i4", kRows, &d_i32_storage));

  Json column_arr{columns};
  std::string interface_str;
  Json::Dump(column_arr, &interface_str);

  // Case 1: nothing in the data matches the missing marker.
  {
    data::CudfAdapter adapter(interface_str);
    data::SimpleDMatrix dmat(&adapter, kNaN, -1);
    TestDenseColumn(&dmat, kRows, kCols);
  }

  // Case 2: 4.0 is declared missing, which is expected to drop two entries.
  {
    data::CudfAdapter adapter(interface_str);
    data::SimpleDMatrix dmat(&adapter, 4.0, -1);
    auto const& info = dmat.Info();
    ASSERT_EQ(info.num_row_, kRows);
    ASSERT_EQ(info.num_col_, kCols);
    ASSERT_EQ(info.num_nonzero_, kCols * kRows - 2);
  }

  // Case 3: NaN is the missing marker and one value has been replaced by NaN.
  {
    // Overwrites row 3 of the float column (removes 6.0).
    d_f32_storage[3] = kNaN;
    ASSERT_TRUE(std::isnan(d_f32_storage[3]));

    data::CudfAdapter adapter(interface_str);
    data::SimpleDMatrix dmat(&adapter, kNaN, -1);
    auto const& info = dmat.Info();
    ASSERT_EQ(info.num_nonzero_, kRows * kCols - 1);
    ASSERT_EQ(info.num_row_, kRows);
    ASSERT_EQ(info.num_col_, kCols);
  }
}
// Builds kCols float32 columns that all use the same validity-mask pattern
// with three cleared bits, feeds them through data::CudfAdapter and checks
// that three rows' worth of entries are absent from the SimpleDMatrix.
TEST(SimpleDMatrix, FromColumnarWithEmptyRows) {
  constexpr size_t kRows = 102;
  constexpr size_t kCols = 24;
  // Bytes allocated per validity mask: 16 * 8 = 128 bits, enough for kRows.
  constexpr size_t kMaskBytes = 16;
  RBitField8::value_type constexpr kUCOne = 1;
  // A mask byte with every bit set.
  auto constexpr kAllSet = static_cast<RBitField8::value_type>(~0);

  std::vector<Json> v_columns(kCols);
  std::vector<dh::device_vector<float>> columns_data(kCols);
  std::vector<dh::device_vector<RBitField8::value_type>> column_bitfields(
      kCols);

  for (size_t cidx = 0; cidx < kCols; ++cidx) {
    // Column values are 0, 1, ..., kRows - 1.
    auto& values = columns_data[cidx];
    values.resize(kRows);
    thrust::sequence(values.begin(), values.end(), 0);
    dh::safe_cuda(cudaDeviceSynchronize());
    dh::safe_cuda(cudaGetLastError());
    ASSERT_EQ(values.size(), kRows);

    // Start from an all-ones mask, then clear one bit in each of three bytes.
    auto& mask_storage = column_bitfields[cidx];
    mask_storage.resize(kMaskBytes, kAllSet);
    mask_storage[0] = ~(kUCOne << 2);  // byte 0, bit 2: the 3rd row is missing
    mask_storage[1] = ~(kUCOne << 3);  // byte 1, bit 3: the 12th row is missing
    size_t const last_ind = 12;
    // byte 12, bit 5: bit number 12 * 8 + 5 = 101 = kRows - 1
    mask_storage[last_ind] = ~(kUCOne << 5);

    // Describe the value buffer.
    auto& col = v_columns[cidx];
    col = Object();
    col["version"] = 3;
    col["typestr"] = String("<f4");
    std::vector<Json> j_shape{Json(Integer(static_cast<Integer::Int>(kRows)))};
    col["shape"] = Array(j_shape);
    auto p_d_data = raw_pointer_cast(values.data());
    std::vector<Json> j_data{
        Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
        Json(Boolean(false))};
    col["data"] = j_data;

    // Describe the validity mask.
    col["mask"] = Object();
    auto& j_mask = col["mask"];
    j_mask["version"] = 3;
    j_mask["typestr"] = String("|i1");
    j_mask["shape"] = Array(
        std::vector<Json>{Json(Integer(static_cast<Integer::Int>(kRows)))});
    j_mask["data"] = std::vector<Json>{
        Json(
            Integer(reinterpret_cast<Integer::Int>(mask_storage.data().get()))),
        Json(Boolean(false))};
  }

  Json column_arr{Array(v_columns)};
  std::string interface_str;
  Json::Dump(column_arr, &interface_str);

  data::CudfAdapter adapter(interface_str);
  data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
                           -1);

  // Every stored entry of row r must hold r, and the k-th stored entry of a
  // row must carry feature index k.
  for (auto& batch : dmat.GetBatches<SparsePage>()) {
    auto page = batch.GetView();
    auto const n_batch_rows = batch.Size();
    for (auto ridx = 0ull; ridx < n_batch_rows; ++ridx) {
      auto row = page[ridx];
      auto const n_entries = row.size();
      for (auto k = 0ull; k < n_entries; ++k) {
        EXPECT_EQ(row[k].fvalue, ridx);
        EXPECT_EQ(row[k].index, k);
      }
    }
  }

  auto const& info = dmat.Info();
  ASSERT_EQ(info.num_nonzero_, (kRows - 3) * kCols);
  ASSERT_EQ(info.num_row_, kRows);
  ASSERT_EQ(info.num_col_, kCols);
}
// Builds two float32 device columns, each with a validity mask that has a
// single cleared bit, and constructs a SimpleDMatrix from them three times:
// with NaN as the missing marker, with 2.0 as the missing marker, and with NaN
// as the marker after a NaN has been written into the data.
TEST(SimpleCSRSource, FromColumnarSparse) {
  constexpr size_t kRows = 32;
  constexpr size_t kCols = 2;
  RBitField8::value_type constexpr kUCOne = 1;
  std::vector<dh::device_vector<float>> columns_data(kCols);
  std::vector<dh::device_vector<RBitField8::value_type>> column_bitfields(kCols);

  {
    // column 0: all bits set, then entry 2 is flagged invalid.
    auto& mask = column_bitfields[0];
    mask.resize(8);
    for (size_t j = 0; j < mask.size(); ++j) {
      mask[j] = ~0;
    }
    // Byte 0 becomes 0b11111011 (bit 2 cleared).
    mask[0] = ~(kUCOne << 2);
  }
  {
    // column 1: all bits set, then entry 19 is flagged invalid.
    auto& mask = column_bitfields[1];
    mask.resize(8);
    for (size_t j = 0; j < mask.size(); ++j) {
      mask[j] = ~0;
    }
    // Bytes 0 and 1 stay all-ones; byte 2 becomes 0b11110111 (bit 3 cleared),
    // i.e. bit number 2 * 8 + 3 = 19.
    mask[2] = ~(kUCOne << 3);
  }

  // Column values are 0, 1, ..., kRows - 1.
  for (size_t c = 0; c < kCols; ++c) {
    columns_data[c].resize(kRows);
    thrust::sequence(columns_data[c].begin(), columns_data[c].end(), 0);
  }

  std::vector<Json> j_columns(kCols);
  for (size_t c = 0; c < kCols; ++c) {
    auto& column = j_columns[c];
    column = Object();
    // "version" and "typestr" are set exactly once; the original assigned
    // both of them twice with identical values.
    column["version"] = 3;
    column["typestr"] = String("<f4");
    auto p_d_data = raw_pointer_cast(columns_data[c].data());
    std::vector<Json> j_data {
      Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
      Json(Boolean(false))};
    column["data"] = j_data;
    std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
    column["shape"] = Array(j_shape);

    // Validity mask description.
    column["mask"] = Object();
    auto& j_mask = column["mask"];
    j_mask["version"] = 3;
    j_mask["data"] = std::vector<Json>{
      Json(Integer(reinterpret_cast<Integer::Int>(column_bitfields[c].data().get()))),
      Json(Boolean(false))};
    j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(kRows)))});
    j_mask["typestr"] = String("|i1");
  }

  Json column_arr {Array(j_columns)};
  std::string str;
  Json::Dump(column_arr, &str);

  {
    // NaN as missing marker: only the two masked entries are dropped.
    data::CudfAdapter adapter(str);
    data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(), -1);
    ASSERT_EQ(dmat.Info().num_row_, kRows);
    ASSERT_EQ(dmat.Info().num_col_, kCols);
    ASSERT_EQ(dmat.Info().num_nonzero_, (kRows*kCols)-2);
  }

  {
    // 2.0 as missing marker: no stored entry may hold that value.
    data::CudfAdapter adapter(str);
    data::SimpleDMatrix dmat(&adapter, 2.0, -1);
    for (auto& batch : dmat.GetBatches<SparsePage>()) {
      auto page = batch.GetView();
      for (auto i = 0ull; i < batch.Size(); i++) {
        auto inst = page[i];
        for (auto e : inst) {
          ASSERT_NE(e.fvalue, 2.0);
        }
      }
    }
  }

  {
    // NaN as missing marker, and the data itself now contains a NaN.
    data::CudfAdapter adapter(str);
    columns_data[0][4] = std::numeric_limits<float>::quiet_NaN();  // 0^th column 4^th row
    data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
                             -1);
    ASSERT_TRUE(std::isnan(columns_data[0][4]));
    // Two masked entries and one NaN are dropped.  Rows that remain:
    // 0^th column: 0, 1, 3, 5, 6, ..., kRows - 1   (2 masked, 4 is NaN)
    // 1^th column: 0, 1, ..., 18, 20, ..., kRows - 1   (19 masked)
    ASSERT_EQ(dmat.Info().num_nonzero_, kRows * kCols - 3);
  }
}
// Builds two device columns (float64 and uint32) with GenerateSparseColumn,
// feeds them through data::CudfAdapter and checks shape, stored-entry count
// and the value/index of every stored entry.
TEST(SimpleDMatrix, FromColumnarSparseBasic) {
  constexpr size_t kRows{16};
  constexpr float kNaN = std::numeric_limits<float>::quiet_NaN();

  thrust::device_vector<double> d_f64_storage(kRows);
  thrust::device_vector<uint32_t> d_u32_storage(kRows);

  std::vector<Json> columns;
  columns.emplace_back(
      GenerateSparseColumn<double>("<f8", kRows, &d_f64_storage));
  columns.emplace_back(
      GenerateSparseColumn<uint32_t>("<u4", kRows, &d_u32_storage));

  Json column_arr{columns};
  std::string interface_str;
  Json::Dump(column_arr, &interface_str);

  data::CudfAdapter adapter(interface_str);
  data::SimpleDMatrix dmat(&adapter, kNaN, -1);

  auto const& info = dmat.Info();
  EXPECT_EQ(info.num_col_, 2);
  EXPECT_EQ(info.num_row_, 16);
  EXPECT_EQ(info.num_nonzero_, 32);

  // Row r must store the value r * 2 in every entry, and the k-th stored entry
  // must carry feature index k.
  for (auto& batch : dmat.GetBatches<SparsePage>()) {
    auto page = batch.GetView();
    auto const n_batch_rows = batch.Size();
    for (auto ridx = 0ull; ridx < n_batch_rows; ++ridx) {
      auto row = page[ridx];
      auto const expected_value = ridx * 2;
      auto const n_entries = row.size();
      for (auto k = 0ull; k < n_entries; ++k) {
        EXPECT_EQ(row[k].fvalue, expected_value);
        EXPECT_EQ(row[k].index, k);
      }
    }
  }
}
// Builds a rows x cols float32 device array, feeds its array-interface JSON
// through data::CupyAdapter and checks shape, stored-entry count and the
// value/index of every stored entry.
TEST(SimpleDMatrix, FromCupy) {
  int rows = 50;
  int cols = 10;

  thrust::device_vector<float> d_storage(rows * cols);
  auto array_interface =
      Generate2dArrayInterface(rows, cols, "<f4", &d_storage);

  std::string interface_str;
  Json::Dump(array_interface, &interface_str);

  data::CupyAdapter adapter(interface_str);
  // -1 is the missing-value marker here; the trailing 1 is forwarded as in the
  // original call (presumably the thread count - confirm).
  data::SimpleDMatrix dmat(&adapter, -1, 1);

  auto const& info = dmat.Info();
  EXPECT_EQ(info.num_col_, cols);
  EXPECT_EQ(info.num_row_, rows);
  EXPECT_EQ(info.num_nonzero_, rows * cols);

  // Entry k of row r is expected to hold r * cols + k with feature index k.
  for (auto& batch : dmat.GetBatches<SparsePage>()) {
    auto page = batch.GetView();
    auto const n_batch_rows = batch.Size();
    for (auto ridx = 0ull; ridx < n_batch_rows; ++ridx) {
      auto row = page[ridx];
      auto const row_offset = ridx * cols;
      auto const n_entries = row.size();
      for (auto k = 0ull; k < n_entries; ++k) {
        EXPECT_EQ(row[k].fvalue, row_offset + k);
        EXPECT_EQ(row[k].index, k);
      }
    }
  }
}
// Builds a 2 x 2 float32 device array, overwrites two of its four elements
// with NaN, feeds it through data::CupyAdapter and checks that each row of the
// resulting SimpleDMatrix keeps exactly one entry.
TEST(SimpleDMatrix, FromCupySparse){
  int rows = 2;
  int cols = 2;
  thrust::device_vector< float> data(rows*cols);
  auto json_array_interface = Generate2dArrayInterface(rows, cols, "<f4", &data);
  // Flat elements 1 and 2 become NaN; elements 0 and 3 are left untouched.
  data[1] = std::numeric_limits<float>::quiet_NaN();
  data[2] = std::numeric_limits<float>::quiet_NaN();
  std::string str;
  Json::Dump(json_array_interface, &str);
  data::CupyAdapter adapter(str);
  data::SimpleDMatrix dmat(&adapter, -1, 1);
  EXPECT_EQ(dmat.Info().num_col_, cols);
  EXPECT_EQ(dmat.Info().num_row_, rows);
  EXPECT_EQ(dmat.Info().num_nonzero_, rows * cols - 2);

  // Only the first batch is inspected.
  auto& batch = *dmat.GetBatches<SparsePage>().begin();
  auto page = batch.GetView();
  // Bind each row once and use the bound rows in the assertions; the original
  // declared these two locals but never read them.
  auto inst0 = page[0];
  auto inst1 = page[1];
  EXPECT_EQ(inst0.size(), 1);
  EXPECT_EQ(inst1.size(), 1);
  // Row 0 keeps value 0.0 at feature 0; row 1 keeps value 3.0 at feature 1.
  EXPECT_EQ(inst0[0].fvalue, 0.0f);
  EXPECT_EQ(inst0[0].index, 0);
  EXPECT_EQ(inst1[0].fvalue, 3.0f);
  EXPECT_EQ(inst1[0].index, 1);
}