This PR changes base_margin into a 3-dimensional array, with one dimension reserved for multi-target classification. It also makes a breaking change to binary serialization because of the extra dimension, and includes a fix for saving the feature weights. Lastly, it unifies prediction initialization between CPU and GPU. After this PR, the meta info setter in Python will be based on the array interface.
373 lines
11 KiB
Plaintext
373 lines
11 KiB
Plaintext
// Copyright by Contributors
|
|
#include <dmlc/filesystem.h>
|
|
#include <xgboost/data.h>
|
|
#include "../../../src/data/simple_dmatrix.h"
|
|
|
|
#include <thrust/sequence.h>
|
|
#include "../../../src/data/device_adapter.cuh"
|
|
#include "../helpers.h"
|
|
#include "test_array_interface.h"
|
|
#include "../../../src/data/array_interface.h"
|
|
|
|
using namespace xgboost; // NOLINT
|
|
|
|
// Feeds two dense device columns (one "<f8", one "<u4") through CudfAdapter
// and checks the row/column/non-zero counts reported by the resulting
// SimpleDMatrix.
TEST(SimpleDMatrix, FromColumnarDenseBasic) {
  constexpr size_t kRows{16};
  std::vector<Json> columns;
  thrust::device_vector<double> d_data_0(kRows);
  thrust::device_vector<uint32_t> d_data_1(kRows);

  columns.emplace_back(GenerateDenseColumn<double>("<f8", kRows, &d_data_0));
  columns.emplace_back(GenerateDenseColumn<uint32_t>("<u4", kRows, &d_data_1));

  // Serialise the column descriptions; the adapter is constructed from the
  // JSON text.
  Json column_arr{columns};
  std::string interface_str;
  Json::Dump(column_arr, &interface_str);

  // NaN is used as the missing-value marker.
  float const nan_missing = std::numeric_limits<float>::quiet_NaN();

  data::CudfAdapter adapter(interface_str);
  data::SimpleDMatrix dmat(&adapter, nan_missing, -1);

  auto const& info = dmat.Info();
  EXPECT_EQ(info.num_col_, 2);
  EXPECT_EQ(info.num_row_, 16);
  EXPECT_EQ(info.num_nonzero_, 32);
}
|
|
|
|
// Shared checker for the dense columnar tests.
//
// For every row `r` of every SparsePage batch, each stored entry is expected
// to carry the value `r * 2` and a feature index equal to its position within
// the row. Afterwards the matrix shape must equal (n_rows, n_cols).
void TestDenseColumn(DMatrix* dmat, size_t n_rows, size_t n_cols) {
  for (auto& batch : dmat->GetBatches<SparsePage>()) {
    auto page = batch.GetView();
    for (auto row = 0ull; row < batch.Size(); ++row) {
      auto entries = page[row];
      auto position = 0ull;  // position of the entry inside the current row
      for (auto const& entry : entries) {
        EXPECT_EQ(entry.fvalue, row * 2);
        EXPECT_EQ(entry.index, position);
        ++position;
      }
    }
  }

  auto const& info = dmat->Info();
  ASSERT_EQ(info.num_row_, n_rows);
  ASSERT_EQ(info.num_col_, n_cols);
}
|
|
|
|
// Dense columnar input ("<f4" and "<i4" columns) checked under three
// settings of the `missing` argument:
//   1. NaN marker with no NaN in the data,
//   2. an explicit marker value of 4.0,
//   3. NaN marker after a NaN has been written into the first column.
TEST(SimpleDMatrix, FromColumnarDense) {
  constexpr size_t kRows{16};
  constexpr size_t kCols{2};
  std::vector<Json> columns;
  thrust::device_vector<float> d_data_0(kRows);
  thrust::device_vector<int32_t> d_data_1(kRows);
  columns.emplace_back(GenerateDenseColumn<float>("<f4", kRows, &d_data_0));
  columns.emplace_back(GenerateDenseColumn<int32_t>("<i4", kRows, &d_data_1));

  Json column_arr{columns};
  std::string interface_str;
  Json::Dump(column_arr, &interface_str);

  float const nan_missing = std::numeric_limits<float>::quiet_NaN();

  // Case 1: nothing is treated as missing.
  {
    data::CudfAdapter adapter(interface_str);
    data::SimpleDMatrix dmat(&adapter, nan_missing, -1);
    TestDenseColumn(&dmat, kRows, kCols);
  }

  // Case 2: 4.0 is the missing marker; the assertion below expects exactly
  // two entries to be dropped.
  {
    data::CudfAdapter adapter(interface_str);
    data::SimpleDMatrix dmat(&adapter, 4.0, -1);

    auto const& info = dmat.Info();
    ASSERT_EQ(info.num_row_, kRows);
    ASSERT_EQ(info.num_col_, kCols);
    ASSERT_EQ(info.num_nonzero_, kCols * kRows - 2);
  }

  // Case 3: NaN marker, with one NaN written into the first column so that a
  // single entry is expected to disappear.
  {
    d_data_0[3] = std::numeric_limits<float>::quiet_NaN();
    ASSERT_TRUE(std::isnan(d_data_0[3]));  // overwrites the former 6.0
    data::CudfAdapter adapter(interface_str);
    data::SimpleDMatrix dmat(&adapter, nan_missing, -1);

    auto const& info = dmat.Info();
    ASSERT_EQ(info.num_nonzero_, kRows * kCols - 1);
    ASSERT_EQ(info.num_row_, kRows);
    ASSERT_EQ(info.num_col_, kCols);
  }
}
|
|
|
|
// Every column shares the same validity mask with three cleared bits, so the
// same three rows are absent from all columns. The final assertions expect
// (kRows - 3) * kCols stored entries while the reported shape stays
// kRows x kCols.
TEST(SimpleDMatrix, FromColumnarWithEmptyRows) {
  constexpr size_t kRows = 102;
  constexpr size_t kCols = 24;

  std::vector<Json> v_columns(kCols);
  std::vector<dh::device_vector<float>> columns_data(kCols);
  std::vector<dh::device_vector<RBitField8::value_type>> column_bitfields(
      kCols);

  RBitField8::value_type constexpr kUCOne = 1;

  for (size_t c = 0; c < kCols; ++c) {
    auto& j_column = v_columns[c];
    j_column = Object();

    // Column values are 0, 1, 2, ..., kRows - 1.
    auto& values = columns_data[c];
    values.resize(kRows);
    thrust::sequence(values.begin(), values.end(), 0);
    dh::safe_cuda(cudaDeviceSynchronize());
    dh::safe_cuda(cudaGetLastError());

    ASSERT_EQ(values.size(), kRows);

    // Array-interface description of the value buffer.
    auto p_d_data = raw_pointer_cast(values.data());
    std::vector<Json> j_data{
        Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
        Json(Boolean(false))};
    j_column["data"] = j_data;
    std::vector<Json> j_shape{Json(Integer(static_cast<Integer::Int>(kRows)))};
    j_column["shape"] = Array(j_shape);
    j_column["version"] = 3;
    j_column["typestr"] = String("<f4");

    // Construct the mask object.
    j_column["mask"] = Object();
    auto& j_mask = j_column["mask"];
    j_mask["version"] = 3;

    auto& mask_bytes = column_bitfields[c];
    mask_bytes.resize(16);  // 16 bytes

    // Start from "every row valid" ...
    for (size_t b = 0; b < mask_bytes.size(); ++b) {
      mask_bytes[b] = ~0;
    }
    // ... then clear one bit in each of three bytes. Assuming bit k of byte b
    // describes row 8 * b + k (TODO confirm against the bit-field layout),
    // these are the zero-based rows 2, 11 and 101.
    size_t last_ind = 12;
    mask_bytes[0] = ~(kUCOne << 2);
    mask_bytes[1] = ~(kUCOne << 3);
    mask_bytes[last_ind] = ~(kUCOne << 5);

    j_mask["data"] = std::vector<Json>{
        Json(Integer(reinterpret_cast<Integer::Int>(mask_bytes.data().get()))),
        Json(Boolean(false))};
    j_mask["shape"] = Array(
        std::vector<Json>{Json(Integer(static_cast<Integer::Int>(kRows)))});
    j_mask["typestr"] = String("|i1");
  }

  Json column_arr{Array(v_columns)};
  std::string interface_str;
  Json::Dump(column_arr, &interface_str);

  float const nan_missing = std::numeric_limits<float>::quiet_NaN();
  data::CudfAdapter adapter(interface_str);
  data::SimpleDMatrix dmat(&adapter, nan_missing, -1);

  // Every stored entry must hold its own row index as value and its position
  // within the row as feature index.
  for (auto& batch : dmat.GetBatches<SparsePage>()) {
    auto page = batch.GetView();
    for (auto row = 0ull; row < batch.Size(); ++row) {
      auto entries = page[row];
      for (auto pos = 0ull; pos < entries.size(); ++pos) {
        EXPECT_EQ(entries[pos].fvalue, row);
        EXPECT_EQ(entries[pos].index, pos);
      }
    }
  }

  auto const& info = dmat.Info();
  ASSERT_EQ(info.num_nonzero_, (kRows - 3) * kCols);
  ASSERT_EQ(info.num_row_, kRows);
  ASSERT_EQ(info.num_col_, kCols);
}
|
|
|
|
// Two "<f4" columns holding 0, 1, ..., kRows - 1, each with a validity mask
// that clears exactly one bit. The matrix is then built three times:
//   1. NaN as the missing marker        -> the two masked entries are absent,
//   2. 2.0 as the missing marker        -> no stored entry may equal 2.0,
//   3. NaN marker plus one injected NaN -> three entries are absent.
TEST(SimpleCSRSource, FromColumnarSparse) {
  constexpr size_t kRows = 32;
  constexpr size_t kCols = 2;
  RBitField8::value_type constexpr kUCOne = 1;

  std::vector<dh::device_vector<float>> columns_data(kCols);
  std::vector<dh::device_vector<RBitField8::value_type>> column_bitfields(kCols);

  {
    // column 0
    auto& mask = column_bitfields[0];
    mask.resize(8);

    for (size_t j = 0; j < mask.size(); ++j) {
      mask[j] = ~0;
    }
    // Clear bit 2 of byte 0: entry 2 (zero-based) of the first column is
    // invalid.
    mask[0] = ~(kUCOne << 2);
  }
  {
    // column 1
    auto& mask = column_bitfields[1];
    mask.resize(8);

    for (size_t j = 0; j < mask.size(); ++j) {
      mask[j] = ~0;
    }
    // Clear bit 3 of byte 2: entry 19 (zero-based, 2 * 8 + 3) of the second
    // column is invalid.
    mask[2] = ~(kUCOne << 3);
  }

  for (size_t c = 0; c < kCols; ++c) {
    columns_data[c].resize(kRows);
    thrust::sequence(columns_data[c].begin(), columns_data[c].end(), 0);
  }

  std::vector<Json> j_columns(kCols);

  for (size_t c = 0; c < kCols; ++c) {
    auto& column = j_columns[c];
    column = Object();
    // "version" and "typestr" are set exactly once per column; the original
    // code assigned the same values a second time further down.
    column["version"] = 3;
    column["typestr"] = String("<f4");
    auto p_d_data = raw_pointer_cast(columns_data[c].data());
    std::vector<Json> j_data {
      Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
      Json(Boolean(false))};
    column["data"] = j_data;
    std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
    column["shape"] = Array(j_shape);

    // Validity mask description attached to the column.
    column["mask"] = Object();
    auto& j_mask = column["mask"];
    j_mask["version"] = 3;
    j_mask["data"] = std::vector<Json>{
      Json(Integer(reinterpret_cast<Integer::Int>(column_bitfields[c].data().get()))),
      Json(Boolean(false))};
    j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(kRows)))});
    j_mask["typestr"] = String("|i1");
  }

  Json column_arr {Array(j_columns)};

  std::string str;
  Json::Dump(column_arr, &str);

  {
    // NaN marker: only the two masked entries are missing.
    data::CudfAdapter adapter(str);
    data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(), -1);

    ASSERT_EQ(dmat.Info().num_row_, kRows);
    ASSERT_EQ(dmat.Info().num_nonzero_, (kRows*kCols)-2);
  }

  {
    // 2.0 marker: no stored entry may carry that value.
    data::CudfAdapter adapter(str);
    data::SimpleDMatrix dmat(&adapter, 2.0, -1);
    for (auto& batch : dmat.GetBatches<SparsePage>()) {
      auto page = batch.GetView();
      for (auto i = 0ull; i < batch.Size(); i++) {
        auto inst = page[i];
        for (auto e : inst) {
          ASSERT_NE(e.fvalue, 2.0);
        }
      }
    }
  }

  {
    // no missing value, but has NaN
    data::CudfAdapter adapter(str);
    columns_data[0][4] = std::numeric_limits<float>::quiet_NaN();  // 0^th column 4^th row
    data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
                             -1);
    ASSERT_TRUE(std::isnan(columns_data[0][4]));

    // Two masked entries and one NaN are dropped. Remaining rows per column:
    // 0^th column: 0, 1, 3, 5, 6, ..., kRows - 1   (2 masked, 4 is NaN)
    // 1^th column: 0, 1, 2, ..., 18, 20, ..., kRows - 1   (19 masked)
    ASSERT_EQ(dmat.Info().num_nonzero_, kRows * kCols - 3);
  }
}
|
|
|
|
|
|
// Same layout as the dense basic test, but the columns come from
// GenerateSparseColumn. Checks the reported shape and that every stored entry
// of row `r` has the value `r * 2` and its in-row position as feature index.
TEST(SimpleDMatrix, FromColumnarSparseBasic) {
  constexpr size_t kRows{16};
  std::vector<Json> columns;
  thrust::device_vector<double> d_data_0(kRows);
  thrust::device_vector<uint32_t> d_data_1(kRows);

  columns.emplace_back(GenerateSparseColumn<double>("<f8", kRows, &d_data_0));
  columns.emplace_back(GenerateSparseColumn<uint32_t>("<u4", kRows, &d_data_1));

  Json column_arr{columns};
  std::string interface_str;
  Json::Dump(column_arr, &interface_str);

  // NaN is used as the missing-value marker.
  float const nan_missing = std::numeric_limits<float>::quiet_NaN();
  data::CudfAdapter adapter(interface_str);
  data::SimpleDMatrix dmat(&adapter, nan_missing, -1);

  auto const& info = dmat.Info();
  EXPECT_EQ(info.num_col_, 2);
  EXPECT_EQ(info.num_row_, 16);
  EXPECT_EQ(info.num_nonzero_, 32);

  for (auto& batch : dmat.GetBatches<SparsePage>()) {
    auto page = batch.GetView();
    for (auto row = 0ull; row < batch.Size(); ++row) {
      auto entries = page[row];
      for (auto pos = 0ull; pos < entries.size(); ++pos) {
        EXPECT_EQ(entries[pos].fvalue, row * 2);
        EXPECT_EQ(entries[pos].index, pos);
      }
    }
  }
}
|
|
|
|
|
|
// Builds a 50 x 10 "<f4" 2-d array interface over a device buffer and loads
// it through CupyAdapter. The assertions expect a fully populated matrix in
// which the entry at (row, col) holds row * cols + col.
TEST(SimpleDMatrix, FromCupy) {
  int rows = 50;
  int cols = 10;
  thrust::device_vector<float> data(rows * cols);
  auto json_array_interface =
      Generate2dArrayInterface(rows, cols, "<f4", &data);

  std::string interface_str;
  Json::Dump(json_array_interface, &interface_str);

  data::CupyAdapter adapter(interface_str);
  data::SimpleDMatrix dmat(&adapter, -1, 1);

  auto const& info = dmat.Info();
  EXPECT_EQ(info.num_col_, cols);
  EXPECT_EQ(info.num_row_, rows);
  EXPECT_EQ(info.num_nonzero_, rows * cols);

  for (auto& batch : dmat.GetBatches<SparsePage>()) {
    auto page = batch.GetView();
    for (auto row = 0ull; row < batch.Size(); ++row) {
      auto entries = page[row];
      for (auto pos = 0ull; pos < entries.size(); ++pos) {
        EXPECT_EQ(entries[pos].fvalue, row * cols + pos);
        EXPECT_EQ(entries[pos].index, pos);
      }
    }
  }
}
|
|
|
|
// 2 x 2 "<f4" array in which the flat elements 1 and 2 are overwritten with
// NaN after the interface has been generated. The assertions expect exactly
// one surviving entry per row: (index 0, value 0.0) in row 0 and
// (index 1, value 3.0) in row 1.
TEST(SimpleDMatrix, FromCupySparse){
  int rows = 2;
  int cols = 2;
  thrust::device_vector< float> data(rows*cols);
  auto json_array_interface = Generate2dArrayInterface(rows, cols, "<f4", &data);
  data[1] = std::numeric_limits<float>::quiet_NaN();
  data[2] = std::numeric_limits<float>::quiet_NaN();
  std::string str;
  Json::Dump(json_array_interface, &str);
  data::CupyAdapter adapter(str);
  data::SimpleDMatrix dmat(&adapter, -1, 1);
  EXPECT_EQ(dmat.Info().num_col_, cols);
  EXPECT_EQ(dmat.Info().num_row_, rows);
  EXPECT_EQ(dmat.Info().num_nonzero_, rows * cols - 2);
  auto& batch = *dmat.GetBatches<SparsePage>().begin();
  auto page = batch.GetView();

  // The row views were previously declared but never used; the assertions
  // now go through them instead of re-indexing the page each time.
  auto inst0 = page[0];
  auto inst1 = page[1];
  EXPECT_EQ(inst0.size(), 1);
  EXPECT_EQ(inst1.size(), 1);
  EXPECT_EQ(inst0[0].fvalue, 0.0f);
  EXPECT_EQ(inst0[0].index, 0);
  EXPECT_EQ(inst1[0].fvalue, 3.0f);
  EXPECT_EQ(inst1[0].index, 1);
}
|