Complete cudf support. (#4850)

* Handle missing values.
* Accept all floating point and integer types.
* Move to cudf 9.0 API.
* Remove requirement on `null_count`.
* Support arbitrary column types.
This commit is contained in:
Jiaming Yuan
2019-09-16 23:52:00 -04:00
committed by GitHub
parent 125bcec62e
commit 5374f52531
17 changed files with 702 additions and 339 deletions

View File

@@ -7,11 +7,13 @@
#include "../../../src/common/device_helpers.cuh"
namespace xgboost {
TEST(MetaInfo, FromInterface) {
cudaSetDevice(0);
constexpr size_t kRows = 16;
thrust::device_vector<float> d_data(kRows);
template <typename T>
std::string PrepareData(std::string typestr, thrust::device_vector<T>* out) {
constexpr size_t kRows = 16;
out->resize(kRows);
auto& d_data = *out;
for (size_t i = 0; i < d_data.size(); ++i) {
d_data[i] = i * 2.0;
}
@@ -22,7 +24,7 @@ TEST(MetaInfo, FromInterface) {
column["shape"] = Array(j_shape);
column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(4)))});
column["version"] = Integer(static_cast<Integer::Int>(1));
column["typestr"] = String("<f4");
column["typestr"] = String(typestr);
auto p_d_data = dh::Raw(d_data);
std::vector<Json> j_data {
@@ -34,6 +36,15 @@ TEST(MetaInfo, FromInterface) {
Json::Dump(column, &ss);
std::string str = ss.str();
return str;
}
TEST(MetaInfo, FromInterface) {
cudaSetDevice(0);
thrust::device_vector<float> d_data;
std::string str = PrepareData<float>("<f4", &d_data);
MetaInfo info;
info.SetInfo("label", str.c_str());
@@ -53,5 +64,22 @@ TEST(MetaInfo, FromInterface) {
for (size_t i = 0; i < d_data.size(); ++i) {
ASSERT_EQ(h_base_margin[i], d_data[i]);
}
EXPECT_ANY_THROW({info.SetInfo("group", str.c_str());});
}
// Feeds a device column of unsigned 32-bit values to MetaInfo as "group" and
// checks that the resulting group pointer has one more entry than the input,
// with every entry equal to its predecessor plus the matching input value.
TEST(MetaInfo, Group) {
  cudaSetDevice(0);

  thrust::device_vector<uint32_t> d_data;
  auto const interface_str = PrepareData<uint32_t>("<u4", &d_data);

  MetaInfo info;
  info.SetInfo("group", interface_str.c_str());

  auto const& group_ptr = info.group_ptr_;
  ASSERT_EQ(group_ptr.size(), d_data.size() + 1);

  size_t const n_ptrs = group_ptr.size();
  for (size_t cur = 1; cur < n_ptrs; ++cur) {
    size_t const prev = cur - 1;
    // Each boundary is the previous boundary advanced by the group size.
    ASSERT_EQ(group_ptr[cur], d_data[prev] + group_ptr[prev]) << "i: " << cur;
  }
}
} // namespace xgboost

View File

@@ -8,17 +8,48 @@
#include "../../../src/common/bitfield.h"
#include "../../../src/common/device_helpers.cuh"
#include "../../../src/data/simple_csr_source.h"
#include "../../../src/data/columnar.h"
namespace xgboost {
TEST(SimpleCSRSource, FromColumnarDense) {
constexpr size_t kRows = 16;
// Builds an array-interface column description one field at a time and checks
// that ArrayInterfaceHandler::ExtractArray<float> raises dmlc::Error at every
// incomplete stage, then succeeds once a non-null device pointer is supplied.
TEST(ArrayInterfaceHandler, Error) {
  constexpr size_t kNumRows = 16;

  Json column { Object() };
  std::vector<Json> shape;
  shape.push_back(Json(Integer(static_cast<Integer::Int>(kNumRows))));
  column["shape"] = Array(shape);

  // A "data" entry whose pointer is null; used for the invalid-pointer stage.
  std::vector<Json> null_data;
  null_data.push_back(Json(Integer(reinterpret_cast<Integer::Int>(nullptr))));
  null_data.push_back(Json(Boolean(false)));

  // The reference stays bound to `column`, so later field insertions are
  // visible through it.
  auto const& column_obj = get<Object>(column);

  // Stage 1: "version" is absent.
  EXPECT_THROW(ArrayInterfaceHandler::ExtractArray<float>(column_obj), dmlc::Error);

  // Stage 2: "version" present, "data" is absent.
  column["version"] = Integer(static_cast<Integer::Int>(1));
  EXPECT_THROW(ArrayInterfaceHandler::ExtractArray<float>(column_obj), dmlc::Error);

  // Stage 3: "data" present, "typestr" is absent.
  column["data"] = null_data;
  EXPECT_THROW(ArrayInterfaceHandler::ExtractArray<float>(column_obj), dmlc::Error);

  // Stage 4: all fields present, but the data pointer is still null.
  column["typestr"] = String("<f4");
  EXPECT_THROW(ArrayInterfaceHandler::ExtractArray<float>(column_obj), dmlc::Error);

  // Stage 5: replace the null pointer with real device storage.
  thrust::device_vector<float> d_data(kNumRows);
  std::vector<Json> valid_data;
  valid_data.push_back(
      Json(Integer(reinterpret_cast<Integer::Int>(d_data.data().get()))));
  valid_data.push_back(Json(Boolean(false)));
  column["data"] = valid_data;
  EXPECT_NO_THROW(ArrayInterfaceHandler::ExtractArray<float>(column_obj));
}
template <typename T>
Json GenerateDenseColumn(std::string const& typestr, size_t kRows,
thrust::device_vector<T>* out_d_data) {
auto& d_data = *out_d_data;
Json column { Object() };
std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
column["shape"] = Array(j_shape);
column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(4)))});
thrust::device_vector<float> d_data(kRows);
d_data.resize(kRows);
for (size_t i = 0; i < d_data.size(); ++i) {
d_data[i] = i * 2.0;
}
@@ -26,39 +57,91 @@ TEST(SimpleCSRSource, FromColumnarDense) {
auto p_d_data = dh::Raw(d_data);
std::vector<Json> j_data {
Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
Json(Boolean(false))};
column["data"] = j_data;
column["version"] = Integer(static_cast<Integer::Int>(1));
column["typestr"] = String("<f4");
Json column_arr {Array{std::vector<Json>{column}}};
column["typestr"] = String(typestr);
return column;
}
TEST(SimpleCSRSource, FromColumnarDense) {
constexpr size_t kRows {16};
constexpr size_t kCols {2};
std::vector<Json> columns;
thrust::device_vector<float> d_data_0(kRows);
thrust::device_vector<int32_t> d_data_1(kRows);
columns.emplace_back(GenerateDenseColumn<float>("<f4", kRows, &d_data_0));
columns.emplace_back(GenerateDenseColumn<int32_t>("<i4", kRows, &d_data_1));
Json column_arr {columns};
std::stringstream ss;
Json::Dump(column_arr, &ss);
std::string str = ss.str();
std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
source->CopyFrom(str.c_str());
// no missing value
{
std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
source->CopyFrom(str.c_str(), false);
auto const& data = source->page_.data.HostVector();
auto const& offset = source->page_.offset.HostVector();
for (size_t i = 0; i < kRows; ++i) {
auto e = data[i];
ASSERT_NEAR(e.fvalue, i * 2.0, kRtEps);
ASSERT_EQ(e.index, 0); // feature 0
auto const& data = source->page_.data.HostVector();
auto const& offset = source->page_.offset.HostVector();
for (size_t i = 0; i < kRows; i++) {
auto const idx = i * kCols;
auto const e_0 = data.at(idx);
ASSERT_NEAR(e_0.fvalue, i * 2.0, kRtEps) << "idx: " << idx;
ASSERT_EQ(e_0.index, 0); // feature 0
auto e_1 = data.at(idx+1);
ASSERT_NEAR(e_1.fvalue, i * 2.0, kRtEps);
ASSERT_EQ(e_1.index, 1); // feature 1
}
ASSERT_EQ(offset.back(), kRows * kCols);
for (size_t i = 0; i < kRows + 1; ++i) {
ASSERT_EQ(offset[i], i * kCols);
}
ASSERT_EQ(source->info.num_row_, kRows);
ASSERT_EQ(source->info.num_col_, kCols);
}
ASSERT_EQ(offset.back(), 16);
for (size_t i = 0; i < kRows + 1; ++i) {
ASSERT_EQ(offset[i], i);
// with missing value specified
{
std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
source->CopyFrom(str.c_str(), true, 4.0);
auto const& data = source->page_.data.HostVector();
auto const& offset = source->page_.offset.HostVector();
ASSERT_EQ(data.size(), kRows * kCols - 2);
ASSERT_NEAR(data[4].fvalue, 6.0, kRtEps); // kCols * 2
ASSERT_EQ(offset.back(), 30);
for (size_t i = 3; i < kRows + 1; ++i) {
ASSERT_EQ(offset[i], (i - 1) * 2);
}
ASSERT_EQ(source->info.num_row_, kRows);
ASSERT_EQ(source->info.num_col_, kCols);
}
{
// no missing value, but has NaN
std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
d_data_0[3] = std::numeric_limits<float>::quiet_NaN();
ASSERT_TRUE(std::isnan(d_data_0[3])); // removes 6.0
source->CopyFrom(str.c_str(), false);
auto const& data = source->page_.data.HostVector();
auto const& offset = source->page_.offset.HostVector();
ASSERT_EQ(data.size(), kRows * kCols - 1);
ASSERT_NEAR(data[7].fvalue, 8.0, kRtEps);
ASSERT_EQ(source->info.num_row_, kRows);
ASSERT_EQ(source->info.num_col_, kCols);
}
}
TEST(SimpleCSRSource, FromColumnarWithEmptyRows) {
// In this test we construct a data storage similar to cudf
constexpr size_t kRows = 102;
constexpr size_t kCols = 24;
constexpr size_t kMissingRows = 3;
std::vector<Json> v_columns (kCols);
std::vector<dh::device_vector<float>> columns_data(kCols);
@@ -90,6 +173,7 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) {
// Construct the mask object.
col["mask"] = Object();
auto& j_mask = col["mask"];
j_mask["version"] = Integer(static_cast<Integer::Int>(1));
auto& mask_storage = column_bitfields[i];
mask_storage.resize(16); // 16 bytes
@@ -111,7 +195,6 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) {
Json(Boolean(false))};
j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(16)))});
j_mask["typestr"] = String("|i1");
j_mask["null_count"] = Json(Integer(static_cast<Integer::Int>(kMissingRows)));
}
Json column_arr {Array(v_columns)};
@@ -119,7 +202,7 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) {
Json::Dump(column_arr, &ss);
std::string str = ss.str();
std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
source->CopyFrom(str.c_str());
source->CopyFrom(str.c_str(), false);
auto const& data = source->page_.data.HostVector();
auto const& offset = source->page_.offset.HostVector();
@@ -131,6 +214,7 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) {
ASSERT_NEAR(data[j].fvalue, i - 1, kRtEps);
}
}
ASSERT_EQ(source->info.num_row_, kRows);
}
TEST(SimpleCSRSource, FromColumnarSparse) {
@@ -149,6 +233,8 @@ TEST(SimpleCSRSource, FromColumnarSparse) {
for (size_t j = 0; j < mask.size(); ++j) {
mask[j] = ~0;
}
// the 2^th entry of first column is invalid
// [0 0 0 0 0 1 0 0]
mask[0] = ~(kUCOne << 2);
}
{
@@ -159,6 +245,8 @@ TEST(SimpleCSRSource, FromColumnarSparse) {
for (size_t j = 0; j < mask.size(); ++j) {
mask[j] = ~0;
}
// the 19^th entry of second column is invalid
// [~0~], [~0~], [0 0 0 0 1 0 0 0]
mask[2] = ~(kUCOne << 3);
}
@@ -186,12 +274,12 @@ TEST(SimpleCSRSource, FromColumnarSparse) {
column["mask"] = Object();
auto& j_mask = column["mask"];
j_mask["version"] = Integer(static_cast<Integer::Int>(1));
j_mask["data"] = std::vector<Json>{
Json(Integer(reinterpret_cast<Integer::Int>(column_bitfields[c].data().get()))),
Json(Boolean(false))};
j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(8)))});
j_mask["typestr"] = String("|i1");
j_mask["null_count"] = Json(Integer(static_cast<Integer::Int>(1)));
}
Json column_arr {Array(j_columns)};
@@ -200,17 +288,64 @@ TEST(SimpleCSRSource, FromColumnarSparse) {
Json::Dump(column_arr, &ss);
std::string str = ss.str();
std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
source->CopyFrom(str.c_str());
{
std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
source->CopyFrom(str.c_str(), false);
auto const& data = source->page_.data.HostVector();
auto const& offset = source->page_.offset.HostVector();
auto const& data = source->page_.data.HostVector();
auto const& offset = source->page_.offset.HostVector();
ASSERT_EQ(offset.size(), kRows + 1);
ASSERT_EQ(data[4].index, 1);
ASSERT_EQ(data[4].fvalue, 2);
ASSERT_EQ(data[37].index, 0);
ASSERT_EQ(data[37].fvalue, 19);
ASSERT_EQ(offset.size(), kRows + 1);
ASSERT_EQ(data[4].index, 1);
ASSERT_EQ(data[4].fvalue, 2);
ASSERT_EQ(data[37].index, 0);
ASSERT_EQ(data[37].fvalue, 19);
}
{
// with missing value
std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
source->CopyFrom(str.c_str(), true, /*missing=*/2.0);
auto const& data = source->page_.data.HostVector();
ASSERT_NE(data[4].fvalue, 2.0);
}
{
// no missing value, but has NaN
std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
columns_data[0][4] = std::numeric_limits<float>::quiet_NaN(); // 0^th column 4^th row
ASSERT_TRUE(std::isnan(columns_data[0][4]));
source->CopyFrom(str.c_str(), false);
auto const& data = source->page_.data.HostVector();
auto const& offset = source->page_.offset.HostVector();
// Two invalid entries and one NaN, in CSC
// 0^th column: 0, 1, 3, 5, 6, ..., kRows
// 1^th column: 0, 1, 2, 3, ..., 18, 20, ..., kRows
// Turning it into CSR:
// | 0, 0 | 1, 1 | 2 | 3, 3 | 4 | ...
ASSERT_EQ(data.size(), kRows * kCols - 3);
ASSERT_EQ(data[4].index, 1); // from 1^th column
ASSERT_EQ(data[5].fvalue, 3.0);
ASSERT_EQ(data[7].index, 1); // from 1^th column
ASSERT_EQ(data[7].fvalue, 4.0);
ASSERT_EQ(data[offset[2]].fvalue, 2.0);
ASSERT_EQ(data[offset[4]].fvalue, 4.0);
}
{
// with NaN as missing value
// NaN is already set up by above test
std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
source->CopyFrom(str.c_str(), true,
/*missing=*/std::numeric_limits<float>::quiet_NaN());
auto const& data = source->page_.data.HostVector();
ASSERT_EQ(data.size(), kRows * kCols - 1);
ASSERT_EQ(data[8].fvalue, 4.0);
}
}
} // namespace xgboost